Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/cell/common.h3
-rw-r--r--src/gallium/drivers/cell/ppu/cell_clear.c6
-rw-r--r--src/gallium/drivers/cell/ppu/cell_context.h10
-rw-r--r--src/gallium/drivers/cell/ppu/cell_draw_arrays.c42
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fp.c256
-rw-r--r--src/gallium/drivers/cell/ppu/cell_pipe_state.c4
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_derived.c8
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_emit.c15
-rw-r--r--src/gallium/drivers/cell/ppu/cell_state_shader.c4
-rw-r--r--src/gallium/drivers/cell/ppu/cell_texture.c59
-rw-r--r--src/gallium/drivers/cell/spu/spu_command.c9
-rw-r--r--src/gallium/drivers/cell/spu/spu_exec.c66
-rw-r--r--src/gallium/drivers/cell/spu/spu_exec.h4
-rw-r--r--src/gallium/drivers/cell/spu/spu_funcs.c2
-rw-r--r--src/gallium/drivers/cell/spu/spu_main.h22
-rw-r--r--src/gallium/drivers/cell/spu/spu_render.c2
-rw-r--r--src/gallium/drivers/cell/spu/spu_util.c10
-rw-r--r--src/gallium/drivers/cell/spu/spu_vertex_fetch.c5
-rw-r--r--src/gallium/drivers/cell/spu/spu_vertex_shader.c15
-rw-r--r--src/gallium/drivers/failover/fo_context.c42
-rw-r--r--src/gallium/drivers/failover/fo_context.h8
-rw-r--r--src/gallium/drivers/failover/fo_state.c91
-rw-r--r--src/gallium/drivers/failover/fo_state_emit.c14
-rw-r--r--src/gallium/drivers/failover/fo_winsys.h3
-rw-r--r--src/gallium/drivers/i915/i915_context.c20
-rw-r--r--src/gallium/drivers/i915/i915_context.h3
-rw-r--r--src/gallium/drivers/i915/i915_fpc_translate.c151
-rw-r--r--src/gallium/drivers/i915/i915_state.c28
-rw-r--r--src/gallium/drivers/i915/i915_state_derived.c10
-rw-r--r--src/gallium/drivers/i915/i915_state_sampler.c2
-rw-r--r--src/gallium/drivers/i915/i915_surface.c24
-rw-r--r--src/gallium/drivers/i915/i915_texture.c197
-rw-r--r--src/gallium/drivers/i965/Makefile74
-rw-r--r--src/gallium/drivers/i965/SConscript77
-rw-r--r--src/gallium/drivers/i965/brw_batchbuffer.c202
-rw-r--r--src/gallium/drivers/i965/brw_batchbuffer.h148
-rw-r--r--src/gallium/drivers/i965/brw_cc.c111
-rw-r--r--src/gallium/drivers/i965/brw_clip.c224
-rw-r--r--src/gallium/drivers/i965/brw_clip.h199
-rw-r--r--src/gallium/drivers/i965/brw_clip_line.c271
-rw-r--r--src/gallium/drivers/i965/brw_clip_point.c48
-rw-r--r--src/gallium/drivers/i965/brw_clip_state.c209
-rw-r--r--src/gallium/drivers/i965/brw_clip_tri.c595
-rw-r--r--src/gallium/drivers/i965/brw_clip_unfilled.c497
-rw-r--r--src/gallium/drivers/i965/brw_clip_util.c388
-rw-r--r--src/gallium/drivers/i965/brw_context.c154
-rw-r--r--src/gallium/drivers/i965/brw_context.h858
-rw-r--r--src/gallium/drivers/i965/brw_curbe.c390
-rw-r--r--src/gallium/drivers/i965/brw_debug.h43
-rw-r--r--src/gallium/drivers/i965/brw_defines.h847
-rw-r--r--src/gallium/drivers/i965/brw_disasm.c922
-rw-r--r--src/gallium/drivers/i965/brw_disasm.h36
-rw-r--r--src/gallium/drivers/i965/brw_draw.c289
-rw-r--r--src/gallium/drivers/i965/brw_draw.h39
-rw-r--r--src/gallium/drivers/i965/brw_draw_upload.c542
-rw-r--r--src/gallium/drivers/i965/brw_eu.c262
-rw-r--r--src/gallium/drivers/i965/brw_eu.h992
-rw-r--r--src/gallium/drivers/i965/brw_eu_debug.c94
-rw-r--r--src/gallium/drivers/i965/brw_eu_emit.c1433
-rw-r--r--src/gallium/drivers/i965/brw_eu_util.c126
-rw-r--r--src/gallium/drivers/i965/brw_gs.c216
-rw-r--r--src/gallium/drivers/i965/brw_gs.h76
-rw-r--r--src/gallium/drivers/i965/brw_gs_emit.c181
-rw-r--r--src/gallium/drivers/i965/brw_gs_state.c169
-rw-r--r--src/gallium/drivers/i965/brw_misc_state.c513
-rw-r--r--src/gallium/drivers/i965/brw_pipe_blend.c208
-rw-r--r--src/gallium/drivers/i965/brw_pipe_clear.c218
-rw-r--r--src/gallium/drivers/i965/brw_pipe_depth.c172
-rw-r--r--src/gallium/drivers/i965/brw_pipe_fb.c84
-rw-r--r--src/gallium/drivers/i965/brw_pipe_flush.c83
-rw-r--r--src/gallium/drivers/i965/brw_pipe_misc.c54
-rw-r--r--src/gallium/drivers/i965/brw_pipe_query.c263
-rw-r--r--src/gallium/drivers/i965/brw_pipe_rast.c161
-rw-r--r--src/gallium/drivers/i965/brw_pipe_rast.h16
-rw-r--r--src/gallium/drivers/i965/brw_pipe_sampler.c231
-rw-r--r--src/gallium/drivers/i965/brw_pipe_shader.c303
-rw-r--r--src/gallium/drivers/i965/brw_pipe_vertex.c71
-rw-r--r--src/gallium/drivers/i965/brw_reg.h115
-rw-r--r--src/gallium/drivers/i965/brw_screen.c403
-rw-r--r--src/gallium/drivers/i965/brw_screen.h199
-rw-r--r--src/gallium/drivers/i965/brw_screen_buffers.c202
-rw-r--r--src/gallium/drivers/i965/brw_screen_surface.c262
-rw-r--r--src/gallium/drivers/i965/brw_screen_tex_layout.c414
-rw-r--r--src/gallium/drivers/i965/brw_screen_texture.c573
-rw-r--r--src/gallium/drivers/i965/brw_sf.c216
-rw-r--r--src/gallium/drivers/i965/brw_sf.h122
-rw-r--r--src/gallium/drivers/i965/brw_sf_emit.c765
-rw-r--r--src/gallium/drivers/i965/brw_sf_state.c333
-rw-r--r--src/gallium/drivers/i965/brw_state.h174
-rw-r--r--src/gallium/drivers/i965/brw_state_batch.c98
-rw-r--r--src/gallium/drivers/i965/brw_state_cache.c617
-rw-r--r--src/gallium/drivers/i965/brw_state_debug.c153
-rw-r--r--src/gallium/drivers/i965/brw_state_upload.c270
-rw-r--r--src/gallium/drivers/i965/brw_structs.h1576
-rw-r--r--src/gallium/drivers/i965/brw_structs_dump.c1247
-rw-r--r--src/gallium/drivers/i965/brw_structs_dump.h276
-rwxr-xr-xsrc/gallium/drivers/i965/brw_structs_dump.py291
-rw-r--r--src/gallium/drivers/i965/brw_swtnl.c95
-rw-r--r--src/gallium/drivers/i965/brw_types.h21
-rw-r--r--src/gallium/drivers/i965/brw_urb.c263
-rw-r--r--src/gallium/drivers/i965/brw_util.c38
-rw-r--r--src/gallium/drivers/i965/brw_util.h44
-rw-r--r--src/gallium/drivers/i965/brw_vs.c131
-rw-r--r--src/gallium/drivers/i965/brw_vs.h106
-rw-r--r--src/gallium/drivers/i965/brw_vs_emit.c1654
-rw-r--r--src/gallium/drivers/i965/brw_vs_state.c201
-rw-r--r--src/gallium/drivers/i965/brw_vs_surface_state.c232
-rw-r--r--src/gallium/drivers/i965/brw_winsys.h309
-rw-r--r--src/gallium/drivers/i965/brw_winsys_debug.c87
-rw-r--r--src/gallium/drivers/i965/brw_wm.c319
-rw-r--r--src/gallium/drivers/i965/brw_wm.h344
-rw-r--r--src/gallium/drivers/i965/brw_wm_constant_buffer.c165
-rw-r--r--src/gallium/drivers/i965/brw_wm_debug.c256
-rw-r--r--src/gallium/drivers/i965/brw_wm_emit.c1521
-rw-r--r--src/gallium/drivers/i965/brw_wm_fp.c1224
-rw-r--r--src/gallium/drivers/i965/brw_wm_glsl.c2032
-rw-r--r--src/gallium/drivers/i965/brw_wm_iz.c156
-rw-r--r--src/gallium/drivers/i965/brw_wm_pass0.c366
-rw-r--r--src/gallium/drivers/i965/brw_wm_pass1.c292
-rw-r--r--src/gallium/drivers/i965/brw_wm_pass2.c334
-rw-r--r--src/gallium/drivers/i965/brw_wm_sampler_state.c229
-rw-r--r--src/gallium/drivers/i965/brw_wm_state.c339
-rw-r--r--src/gallium/drivers/i965/brw_wm_surface_state.c294
-rw-r--r--src/gallium/drivers/i965/intel_decode.c1790
-rw-r--r--src/gallium/drivers/i965/intel_decode.h29
-rw-r--r--src/gallium/drivers/i965/intel_structs.h132
-rw-r--r--src/gallium/drivers/identity/id_context.c132
-rw-r--r--src/gallium/drivers/identity/id_objects.c39
-rw-r--r--src/gallium/drivers/identity/id_objects.h25
-rw-r--r--src/gallium/drivers/identity/id_public.h2
-rw-r--r--src/gallium/drivers/identity/id_screen.c33
-rw-r--r--src/gallium/drivers/llvmpipe/Makefile1
-rw-r--r--src/gallium/drivers/llvmpipe/README65
-rw-r--r--src/gallium/drivers/llvmpipe/SConscript29
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_arit.c11
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_debug.c13
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_interp.c4
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_misc.cpp14
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_sample.c10
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c3
-rw-r--r--src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c47
-rw-r--r--src/gallium/drivers/llvmpipe/lp_clear.c5
-rw-r--r--src/gallium/drivers/llvmpipe/lp_context.c53
-rw-r--r--src/gallium/drivers/llvmpipe/lp_context.h15
-rw-r--r--src/gallium/drivers/llvmpipe/lp_debug.h71
-rw-r--r--src/gallium/drivers/llvmpipe/lp_draw_arrays.c33
-rw-r--r--src/gallium/drivers/llvmpipe/lp_jit.c36
-rw-r--r--src/gallium/drivers/llvmpipe/lp_jit.h18
-rw-r--r--src/gallium/drivers/llvmpipe/lp_quad.h9
-rw-r--r--src/gallium/drivers/llvmpipe/lp_screen.c84
-rw-r--r--src/gallium/drivers/llvmpipe/lp_setup.c8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state.h22
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_blend.c18
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_derived.c37
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_fs.c173
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_rasterizer.c9
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_sampler.c63
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_surface.c9
-rw-r--r--src/gallium/drivers/llvmpipe/lp_state_vs.c13
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_blend.c22
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_conv.c6
-rw-r--r--src/gallium/drivers/llvmpipe/lp_test_format.c2
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tex_cache.c8
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tex_sample.h53
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tex_sample_c.c1712
-rw-r--r--src/gallium/drivers/llvmpipe/lp_texture.c76
-rw-r--r--src/gallium/drivers/llvmpipe/lp_tile_cache.c11
-rw-r--r--src/gallium/drivers/nouveau/nouveau_push.h93
-rw-r--r--src/gallium/drivers/nouveau/nouveau_screen.c26
-rw-r--r--src/gallium/drivers/nouveau/nouveau_screen.h3
-rw-r--r--src/gallium/drivers/nouveau/nouveau_stateobj.h287
-rw-r--r--src/gallium/drivers/nouveau/nouveau_winsys.h3
-rw-r--r--src/gallium/drivers/nv04/nv04_context.c50
-rw-r--r--src/gallium/drivers/nv04/nv04_context.h8
-rw-r--r--src/gallium/drivers/nv04/nv04_fragtex.c16
-rw-r--r--src/gallium/drivers/nv04/nv04_miptree.c29
-rw-r--r--src/gallium/drivers/nv04/nv04_prim_vbuf.c84
-rw-r--r--src/gallium/drivers/nv04/nv04_screen.c6
-rw-r--r--src/gallium/drivers/nv04/nv04_state.c76
-rw-r--r--src/gallium/drivers/nv04/nv04_state.h1
-rw-r--r--src/gallium/drivers/nv04/nv04_state_emit.c72
-rw-r--r--src/gallium/drivers/nv04/nv04_surface_2d.c84
-rw-r--r--src/gallium/drivers/nv04/nv04_surface_2d.h4
-rw-r--r--src/gallium/drivers/nv04/nv04_transfer.c37
-rw-r--r--src/gallium/drivers/nv04/nv04_vbo.c12
-rw-r--r--src/gallium/drivers/nv10/nv10_context.c373
-rw-r--r--src/gallium/drivers/nv10/nv10_context.h8
-rw-r--r--src/gallium/drivers/nv10/nv10_fragtex.c34
-rw-r--r--src/gallium/drivers/nv10/nv10_miptree.c26
-rw-r--r--src/gallium/drivers/nv10/nv10_prim_vbuf.c34
-rw-r--r--src/gallium/drivers/nv10/nv10_screen.c4
-rw-r--r--src/gallium/drivers/nv10/nv10_state.c16
-rw-r--r--src/gallium/drivers/nv10/nv10_state.h1
-rw-r--r--src/gallium/drivers/nv10/nv10_state_emit.c166
-rw-r--r--src/gallium/drivers/nv10/nv10_transfer.c37
-rw-r--r--src/gallium/drivers/nv10/nv10_vbo.c11
-rw-r--r--src/gallium/drivers/nv20/nv20_context.c536
-rw-r--r--src/gallium/drivers/nv20/nv20_context.h8
-rw-r--r--src/gallium/drivers/nv20/nv20_fragtex.c34
-rw-r--r--src/gallium/drivers/nv20/nv20_miptree.c58
-rw-r--r--src/gallium/drivers/nv20/nv20_prim_vbuf.c56
-rw-r--r--src/gallium/drivers/nv20/nv20_screen.c4
-rw-r--r--src/gallium/drivers/nv20/nv20_state.c16
-rw-r--r--src/gallium/drivers/nv20/nv20_state.h1
-rw-r--r--src/gallium/drivers/nv20/nv20_state_emit.c197
-rw-r--r--src/gallium/drivers/nv20/nv20_transfer.c39
-rw-r--r--src/gallium/drivers/nv20/nv20_vbo.c9
-rw-r--r--src/gallium/drivers/nv20/nv20_vertprog.c71
-rw-r--r--src/gallium/drivers/nv30/nv30_context.c24
-rw-r--r--src/gallium/drivers/nv30/nv30_context.h10
-rw-r--r--src/gallium/drivers/nv30/nv30_fragprog.c122
-rw-r--r--src/gallium/drivers/nv30/nv30_fragtex.c24
-rw-r--r--src/gallium/drivers/nv30/nv30_miptree.c62
-rw-r--r--src/gallium/drivers/nv30/nv30_query.c20
-rw-r--r--src/gallium/drivers/nv30/nv30_screen.c12
-rw-r--r--src/gallium/drivers/nv30/nv30_state.c27
-rw-r--r--src/gallium/drivers/nv30/nv30_state.h1
-rw-r--r--src/gallium/drivers/nv30/nv30_state_blend.c2
-rw-r--r--src/gallium/drivers/nv30/nv30_state_emit.c10
-rw-r--r--src/gallium/drivers/nv30/nv30_state_fb.c2
-rw-r--r--src/gallium/drivers/nv30/nv30_state_scissor.c2
-rw-r--r--src/gallium/drivers/nv30/nv30_state_stipple.c4
-rw-r--r--src/gallium/drivers/nv30/nv30_state_viewport.c2
-rw-r--r--src/gallium/drivers/nv30/nv30_transfer.c39
-rw-r--r--src/gallium/drivers/nv30/nv30_vbo.c133
-rw-r--r--src/gallium/drivers/nv30/nv30_vertprog.c91
-rw-r--r--src/gallium/drivers/nv40/nv40_context.c24
-rw-r--r--src/gallium/drivers/nv40/nv40_context.h12
-rw-r--r--src/gallium/drivers/nv40/nv40_draw.c73
-rw-r--r--src/gallium/drivers/nv40/nv40_fragprog.c132
-rw-r--r--src/gallium/drivers/nv40/nv40_fragtex.c10
-rw-r--r--src/gallium/drivers/nv40/nv40_miptree.c65
-rw-r--r--src/gallium/drivers/nv40/nv40_query.c20
-rw-r--r--src/gallium/drivers/nv40/nv40_screen.c10
-rw-r--r--src/gallium/drivers/nv40/nv40_state.c27
-rw-r--r--src/gallium/drivers/nv40/nv40_state.h1
-rw-r--r--src/gallium/drivers/nv40/nv40_state_blend.c2
-rw-r--r--src/gallium/drivers/nv40/nv40_state_emit.c22
-rw-r--r--src/gallium/drivers/nv40/nv40_state_fb.c2
-rw-r--r--src/gallium/drivers/nv40/nv40_state_scissor.c2
-rw-r--r--src/gallium/drivers/nv40/nv40_state_stipple.c4
-rw-r--r--src/gallium/drivers/nv40/nv40_state_viewport.c2
-rw-r--r--src/gallium/drivers/nv40/nv40_transfer.c39
-rw-r--r--src/gallium/drivers/nv40/nv40_vbo.c140
-rw-r--r--src/gallium/drivers/nv40/nv40_vertprog.c112
-rw-r--r--src/gallium/drivers/nv50/nv50_context.c54
-rw-r--r--src/gallium/drivers/nv50/nv50_context.h51
-rw-r--r--src/gallium/drivers/nv50/nv50_miptree.c78
-rw-r--r--src/gallium/drivers/nv50/nv50_program.c2608
-rw-r--r--src/gallium/drivers/nv50/nv50_program.h24
-rw-r--r--src/gallium/drivers/nv50/nv50_query.c46
-rw-r--r--src/gallium/drivers/nv50/nv50_screen.c159
-rw-r--r--src/gallium/drivers/nv50/nv50_screen.h8
-rw-r--r--src/gallium/drivers/nv50/nv50_state.c137
-rw-r--r--src/gallium/drivers/nv50/nv50_state_validate.c115
-rw-r--r--src/gallium/drivers/nv50/nv50_surface.c8
-rw-r--r--src/gallium/drivers/nv50/nv50_tex.c108
-rw-r--r--src/gallium/drivers/nv50/nv50_texture.h1
-rw-r--r--src/gallium/drivers/nv50/nv50_transfer.c107
-rw-r--r--src/gallium/drivers/nv50/nv50_vbo.c608
-rw-r--r--src/gallium/drivers/r300/Makefile6
-rw-r--r--src/gallium/drivers/r300/SConscript9
-rw-r--r--src/gallium/drivers/r300/r300_blit.c130
-rw-r--r--src/gallium/drivers/r300/r300_blit.h (renamed from src/gallium/drivers/r300/r300_clear.h)22
-rw-r--r--src/gallium/drivers/r300/r300_chipset.c10
-rw-r--r--src/gallium/drivers/r300/r300_chipset.h9
-rw-r--r--src/gallium/drivers/r300/r300_clear.c38
-rw-r--r--src/gallium/drivers/r300/r300_context.c104
-rw-r--r--src/gallium/drivers/r300/r300_context.h135
-rw-r--r--src/gallium/drivers/r300/r300_cs.h18
-rw-r--r--src/gallium/drivers/r300/r300_debug.c8
-rw-r--r--src/gallium/drivers/r300/r300_emit.c522
-rw-r--r--src/gallium/drivers/r300/r300_emit.h26
-rw-r--r--src/gallium/drivers/r300/r300_flush.c10
-rw-r--r--src/gallium/drivers/r300/r300_fs.c190
-rw-r--r--src/gallium/drivers/r300/r300_fs.h38
-rw-r--r--src/gallium/drivers/r300/r300_reg.h113
-rw-r--r--src/gallium/drivers/r300/r300_render.c306
-rw-r--r--src/gallium/drivers/r300/r300_render.h60
-rw-r--r--src/gallium/drivers/r300/r300_screen.c108
-rw-r--r--src/gallium/drivers/r300/r300_screen.h47
-rw-r--r--src/gallium/drivers/r300/r300_shader_semantics.h66
-rw-r--r--src/gallium/drivers/r300/r300_state.c467
-rw-r--r--src/gallium/drivers/r300/r300_state_derived.c688
-rw-r--r--src/gallium/drivers/r300/r300_state_inlines.h112
-rw-r--r--src/gallium/drivers/r300/r300_state_invariant.c36
-rw-r--r--src/gallium/drivers/r300/r300_state_invariant.h2
-rw-r--r--src/gallium/drivers/r300/r300_texture.c160
-rw-r--r--src/gallium/drivers/r300/r300_texture.h6
-rw-r--r--src/gallium/drivers/r300/r300_tgsi_to_rc.c38
-rw-r--r--src/gallium/drivers/r300/r300_vbo.c132
-rw-r--r--src/gallium/drivers/r300/r300_vbo.h36
-rw-r--r--src/gallium/drivers/r300/r300_vs.c349
-rw-r--r--src/gallium/drivers/r300/r300_vs.h20
-rw-r--r--src/gallium/drivers/r300/r300_winsys.h73
-rw-r--r--src/gallium/drivers/softpipe/Makefile3
-rw-r--r--src/gallium/drivers/softpipe/SConscript1
-rw-r--r--src/gallium/drivers/softpipe/sp_clear.c11
-rw-r--r--src/gallium/drivers/softpipe/sp_context.c57
-rw-r--r--src/gallium/drivers/softpipe/sp_context.h15
-rw-r--r--src/gallium/drivers/softpipe/sp_draw_arrays.c233
-rw-r--r--src/gallium/drivers/softpipe/sp_flush.c3
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_exec.c5
-rw-r--r--src/gallium/drivers/softpipe/sp_prim_vbuf.c1
-rw-r--r--src/gallium/drivers/softpipe/sp_quad_depth_test.c21
-rw-r--r--src/gallium/drivers/softpipe/sp_query.c26
-rw-r--r--src/gallium/drivers/softpipe/sp_query.h4
-rw-r--r--src/gallium/drivers/softpipe/sp_screen.c9
-rw-r--r--src/gallium/drivers/softpipe/sp_setup.c2
-rw-r--r--src/gallium/drivers/softpipe/sp_state.h51
-rw-r--r--src/gallium/drivers/softpipe/sp_state_blend.c5
-rw-r--r--src/gallium/drivers/softpipe/sp_state_derived.c23
-rw-r--r--src/gallium/drivers/softpipe/sp_state_fs.c80
-rw-r--r--src/gallium/drivers/softpipe/sp_state_rasterizer.c9
-rw-r--r--src/gallium/drivers/softpipe/sp_state_sampler.c69
-rw-r--r--src/gallium/drivers/softpipe/sp_state_surface.c9
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_sample.c253
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_sample.h7
-rw-r--r--src/gallium/drivers/softpipe/sp_tex_tile_cache.c7
-rw-r--r--src/gallium/drivers/softpipe/sp_texture.c77
-rw-r--r--src/gallium/drivers/softpipe/sp_tile_cache.c11
-rw-r--r--src/gallium/drivers/softpipe/sp_winsys.c245
-rw-r--r--src/gallium/drivers/softpipe/sp_winsys.h17
-rw-r--r--src/gallium/drivers/svga/svga_context.h9
-rw-r--r--src/gallium/drivers/svga/svga_pipe_clear.c6
-rw-r--r--src/gallium/drivers/svga/svga_pipe_constants.c4
-rw-r--r--src/gallium/drivers/svga/svga_pipe_draw.c28
-rw-r--r--src/gallium/drivers/svga/svga_pipe_sampler.c12
-rw-r--r--src/gallium/drivers/svga/svga_pipe_vertex.c13
-rw-r--r--src/gallium/drivers/svga/svga_pipe_vs.c2
-rw-r--r--src/gallium/drivers/svga/svga_screen_buffer.c3
-rw-r--r--src/gallium/drivers/svga/svga_screen_texture.c107
-rw-r--r--src/gallium/drivers/svga/svga_screen_texture.h3
-rw-r--r--src/gallium/drivers/svga/svga_state_constants.c4
-rw-r--r--src/gallium/drivers/svga/svga_state_fs.c9
-rw-r--r--src/gallium/drivers/svga/svga_state_need_swtnl.c8
-rw-r--r--src/gallium/drivers/svga/svga_state_vs.c7
-rw-r--r--src/gallium/drivers/svga/svga_swtnl_draw.c2
-rw-r--r--src/gallium/drivers/svga/svga_swtnl_state.c11
-rw-r--r--src/gallium/drivers/svga/svga_tgsi.h29
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_decl_sm20.c38
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_decl_sm30.c42
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_insn.c178
-rw-r--r--src/gallium/drivers/svga/svgadump/svga_dump.c1680
-rw-r--r--src/gallium/drivers/svga/svgadump/svga_dump.h3
-rwxr-xr-xsrc/gallium/drivers/svga/svgadump/svga_dump.py117
-rw-r--r--src/gallium/drivers/svga/svgadump/svga_shader_dump.c230
-rw-r--r--src/gallium/drivers/trace/README5
-rw-r--r--src/gallium/drivers/trace/tr_context.c208
-rw-r--r--src/gallium/drivers/trace/tr_context.h3
-rw-r--r--src/gallium/drivers/trace/tr_dump_state.c36
-rw-r--r--src/gallium/drivers/trace/tr_dump_state.h5
-rw-r--r--src/gallium/drivers/trace/tr_rbug.c20
-rw-r--r--src/gallium/drivers/trace/tr_screen.c4
354 files changed, 46072 insertions, 8657 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d5f5c7bbba8..aa29dcb3947 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -358,6 +358,7 @@ struct cell_spu_function_info
/** This is the object passed to spe_create_thread() */
+PIPE_ALIGN_TYPE(16,
struct cell_init_info
{
unsigned id;
@@ -370,7 +371,7 @@ struct cell_init_info
uint *buffer_status; /**< points at cell_context->buffer_status */
struct cell_spu_function_info *spu_functions;
-} ALIGN16_ATTRIB;
+});
#endif /* CELL_COMMON_H */
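
Note: the old ALIGN16_ATTRIB suffix attribute is replaced here by the PIPE_ALIGN_TYPE()/PIPE_ALIGN_VAR() wrappers, which place the alignment request where every supported compiler accepts it. A minimal sketch of how such portable alignment macros are commonly defined (an assumption for illustration; the real definitions live in Gallium's p_compiler.h):

   #include <stdint.h>
   #include <stdio.h>

   #if defined(__GNUC__)
   #  define PIPE_ALIGN_VAR(a)      __attribute__((aligned(a)))
   #  define PIPE_ALIGN_TYPE(a, t)  t __attribute__((aligned(a)))
   #elif defined(_MSC_VER)
   #  define PIPE_ALIGN_VAR(a)      __declspec(align(a))
   #  define PIPE_ALIGN_TYPE(a, t)  __declspec(align(a)) t
   #else
   #  error "compiler not covered by this sketch"
   #endif

   /* 16-byte aligned variable, e.g. a DMA-friendly staging buffer. */
   static PIPE_ALIGN_VAR(16) unsigned char staging[64];

   int main(void)
   {
      printf("staging is 16-byte aligned: %d\n",
             ((uintptr_t)staging % 16) == 0);
      return 0;
   }
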
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 79ad687ea94..3a3f968a492 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -59,9 +59,9 @@ cell_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
if (buffers & PIPE_CLEAR_COLOR) {
uint surfIndex = 0;
- uint clearValue;
+ union util_color uc;
- util_pack_color(rgba, cell->framebuffer.cbufs[0]->format, &clearValue);
+ util_pack_color(rgba, cell->framebuffer.cbufs[0]->format, &uc);
/* Build a CLEAR command and place it in the current batch buffer */
STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0);
@@ -70,7 +70,7 @@ cell_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
cell_batch_alloc16(cell, sizeof(*clr));
clr->opcode[0] = CELL_CMD_CLEAR_SURFACE;
clr->surface = surfIndex;
- clr->value = clearValue;
+ clr->value = uc.ui;
}
if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
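
Note: util_pack_color() now writes into a union util_color instead of a bare uint, and the caller reads the member it needs (uc.ui above becomes the 32-bit clear value). A standalone sketch of that union-based packing pattern, reduced to one hard-coded B8G8R8A8 case on a little-endian machine (an assumption; the real helper dispatches on the surface's pipe_format):

   #include <stdint.h>
   #include <stdio.h>

   union color_pack {
      uint8_t  ub[4];
      uint32_t ui;
   };

   static uint32_t pack_b8g8r8a8(const float rgba[4])
   {
      union color_pack uc;
      uc.ub[0] = (uint8_t)(rgba[2] * 255.0f + 0.5f);   /* B */
      uc.ub[1] = (uint8_t)(rgba[1] * 255.0f + 0.5f);   /* G */
      uc.ub[2] = (uint8_t)(rgba[0] * 255.0f + 0.5f);   /* R */
      uc.ub[3] = (uint8_t)(rgba[3] * 255.0f + 0.5f);   /* A */
      return uc.ui;                                    /* packed clear value */
   }

   int main(void)
   {
      const float red[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
      printf("0x%08x\n", pack_b8g8r8a8(red));
      return 0;
   }
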
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 5c3188e7f9d..e402ed29220 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -89,7 +89,7 @@ struct cell_buffer_node;
*/
struct cell_buffer_list
{
- struct cell_fence fence ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) struct cell_fence fence;
struct cell_buffer_node *head;
};
@@ -115,7 +115,7 @@ struct cell_context
struct pipe_blend_color blend_color;
struct pipe_clip_state clip;
- struct pipe_constant_buffer constants[2];
+ struct pipe_buffer *constants[2];
struct pipe_framebuffer_state framebuffer;
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
@@ -150,18 +150,18 @@ struct cell_context
/** Mapped constant buffers */
void *mapped_constants[PIPE_SHADER_TYPES];
- struct cell_spu_function_info spu_functions ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) struct cell_spu_function_info spu_functions;
uint num_cells, num_spus;
/** Buffers for command batches, vertex/index data */
uint buffer_size[CELL_NUM_BUFFERS];
- ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE];
int cur_batch; /**< which buffer is being filled w/ commands */
/** [4] to ensure 16-byte alignment for each status word */
- uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4];
/** Associated with each command/batch buffer is a list of pipe_buffers
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 644496db40c..0a4da8ecc85 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -51,17 +51,17 @@ cell_map_constant_buffers(struct cell_context *sp)
struct pipe_winsys *ws = sp->pipe.winsys;
uint i;
for (i = 0; i < 2; i++) {
- if (sp->constants[i].buffer && sp->constants[i].buffer->size) {
- sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
+ if (sp->constants[i] && sp->constants[i]->size) {
+ sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i],
PIPE_BUFFER_USAGE_CPU_READ);
cell_flush_buffer_range(sp, sp->mapped_constants[i],
- sp->constants[i].buffer->size);
+ sp->constants[i]->size);
}
}
- draw_set_mapped_constant_buffer(sp->draw,
+ draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX,
sp->mapped_constants[PIPE_SHADER_VERTEX],
- sp->constants[PIPE_SHADER_VERTEX].buffer->size);
+ sp->constants[PIPE_SHADER_VERTEX]->size);
}
static void
@@ -70,8 +70,8 @@ cell_unmap_constant_buffers(struct cell_context *sp)
struct pipe_winsys *ws = sp->pipe.winsys;
uint i;
for (i = 0; i < 2; i++) {
- if (sp->constants[i].buffer && sp->constants[i].buffer->size)
- ws->buffer_unmap(ws, sp->constants[i].buffer);
+ if (sp->constants[i] && sp->constants[i]->size)
+ ws->buffer_unmap(ws, sp->constants[i]);
sp->mapped_constants[i] = NULL;
}
}
@@ -85,7 +85,7 @@ cell_unmap_constant_buffers(struct cell_context *sp)
*
* XXX should the element buffer be specified/bound with a separate function?
*/
-static boolean
+static void
cell_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -145,47 +145,35 @@ cell_draw_range_elements(struct pipe_context *pipe,
/* Note: leave drawing surfaces mapped */
cell_unmap_constant_buffers(sp);
-
- return TRUE;
}
-static boolean
+static void
cell_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start, unsigned count)
{
- return cell_draw_range_elements( pipe, indexBuffer,
- indexSize,
- 0, 0xffffffff,
- mode, start, count );
+ cell_draw_range_elements( pipe, indexBuffer,
+ indexSize,
+ 0, 0xffffffff,
+ mode, start, count );
}
-static boolean
+static void
cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
unsigned start, unsigned count)
{
- return cell_draw_elements(pipe, NULL, 0, mode, start, count);
-}
-
-
-static void
-cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
-{
- struct cell_context *cell = cell_context(pipe);
- draw_set_edgeflags(cell->draw, edgeflags);
+ cell_draw_elements(pipe, NULL, 0, mode, start, count);
}
-
void
cell_init_draw_functions(struct cell_context *cell)
{
cell->pipe.draw_arrays = cell_draw_arrays;
cell->pipe.draw_elements = cell_draw_elements;
cell->pipe.draw_range_elements = cell_draw_range_elements;
- cell->pipe.set_edgeflags = cell_set_edgeflags;
}
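
Note: cell_map_constant_buffers()/cell_unmap_constant_buffers() above follow a map-before-draw, unmap-after-draw pattern for the vertex and fragment constant buffers. A minimal standalone sketch of that pattern (an assumption: stub buffer type and trivial map/unmap, not the real pipe_winsys interface):

   #include <stddef.h>

   struct buffer { size_t size; void *storage; };

   static void *buffer_map(struct buffer *buf)   { return buf ? buf->storage : NULL; }
   static void  buffer_unmap(struct buffer *buf) { (void)buf; /* nothing to do here */ }

   struct ctx {
      struct buffer *constants[2];        /* vertex, fragment */
      void *mapped_constants[2];
   };

   static void map_constant_buffers(struct ctx *c)
   {
      for (int i = 0; i < 2; i++)
         if (c->constants[i] && c->constants[i]->size)
            c->mapped_constants[i] = buffer_map(c->constants[i]);
   }

   static void unmap_constant_buffers(struct ctx *c)
   {
      for (int i = 0; i < 2; i++) {
         if (c->constants[i] && c->constants[i]->size)
            buffer_unmap(c->constants[i]);
         c->mapped_constants[i] = NULL;
      }
   }

The draw entry points above call the map helper before handing work to the draw module and the unmap helper once drawing is done.
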
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 19e3ab08440..1d8a11a4ac9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -237,8 +237,8 @@ is_register_src(struct codegen *gen, int channel,
if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
return FALSE;
}
- if (src->SrcRegister.File == TGSI_FILE_TEMPORARY ||
- src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+ if (src->Register.File == TGSI_FILE_TEMPORARY ||
+ src->Register.File == TGSI_FILE_IMMEDIATE) {
return TRUE;
}
return FALSE;
@@ -249,7 +249,7 @@ static boolean
is_memory_dst(struct codegen *gen, int channel,
const struct tgsi_full_dst_register *dst)
{
- if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+ if (dst->Register.File == TGSI_FILE_OUTPUT) {
return TRUE;
}
else {
@@ -279,15 +279,15 @@ get_src_reg(struct codegen *gen,
assert(swizzle <= TGSI_SWIZZLE_W);
{
- int index = src->SrcRegister.Index;
+ int index = src->Register.Index;
assert(swizzle < 4);
- if (src->SrcRegister.Indirect) {
+ if (src->Register.Indirect) {
/* XXX unfinished */
}
- switch (src->SrcRegister.File) {
+ switch (src->Register.File) {
case TGSI_FILE_TEMPORARY:
reg = gen->temp_regs[index][swizzle];
break;
@@ -374,12 +374,12 @@ get_dst_reg(struct codegen *gen,
{
int reg = -1;
- switch (dest->DstRegister.File) {
+ switch (dest->Register.File) {
case TGSI_FILE_TEMPORARY:
if (gen->if_nesting > 0 || gen->loop_nesting > 0)
reg = get_itemp(gen);
else
- reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ reg = gen->temp_regs[dest->Register.Index][channel];
break;
case TGSI_FILE_OUTPUT:
reg = get_itemp(gen);
@@ -419,10 +419,10 @@ store_dest_reg(struct codegen *gen,
}
#endif
- switch (dest->DstRegister.File) {
+ switch (dest->Register.File) {
case TGSI_FILE_TEMPORARY:
if (gen->if_nesting > 0 || gen->loop_nesting > 0) {
- int d_reg = gen->temp_regs[dest->DstRegister.Index][channel];
+ int d_reg = gen->temp_regs[dest->Register.Index][channel];
int exec_reg = get_exec_mask_reg(gen);
/* Mix d with new value according to exec mask:
* d[i] = mask_reg[i] ? value_reg : d_reg
@@ -437,7 +437,7 @@ store_dest_reg(struct codegen *gen,
case TGSI_FILE_OUTPUT:
{
/* offset is measured in quadwords, not bytes */
- int offset = dest->DstRegister.Index * 4 + channel;
+ int offset = dest->Register.Index * 4 + channel;
if (gen->if_nesting > 0 || gen->loop_nesting > 0) {
int exec_reg = get_exec_mask_reg(gen);
int curval_reg = get_itemp(gen);
@@ -544,7 +544,7 @@ emit_epilogue(struct codegen *gen)
#define FOR_EACH_ENABLED_CHANNEL(inst, ch) \
for (ch = 0; ch < 4; ch++) \
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch))
+ if (inst->Dst[0].Register.WriteMask & (1 << ch))
static boolean
@@ -552,7 +552,7 @@ emit_ARL(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch = 0, src_reg, addr_reg;
- src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ src_reg = get_src_reg(gen, ch, &inst->Src[0]);
addr_reg = get_address_reg(gen);
/* convert float to int */
@@ -570,19 +570,19 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, src_reg[4], dst_reg[4];
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ src_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ dst_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
- is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+ if (is_register_src(gen, ch, &inst->Src[0]) &&
+ is_memory_dst(gen, ch, &inst->Dst[0])) {
/* special-case: register to memory store */
- store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, src_reg[ch], ch, &inst->Dst[0]);
}
else {
spe_move(gen->f, dst_reg[ch], src_reg[ch]);
- store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, dst_reg[ch], ch, &inst->Dst[0]);
}
}
@@ -601,9 +601,9 @@ emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
/* Loop over Red/Green/Blue/Alpha channels, do the op, store results */
@@ -626,7 +626,7 @@ emit_binop(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
/* Free any intermediate temps we allocated */
@@ -645,16 +645,16 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
return TRUE;
@@ -671,10 +671,10 @@ emit_LRP(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* setup/get src/dst/temp regs */
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+ s3_reg[ch] = get_src_reg(gen, ch, &inst->Src[2]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
tmp_reg[ch] = get_itemp(gen);
}
@@ -687,7 +687,7 @@ emit_LRP(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
return TRUE;
@@ -704,8 +704,8 @@ emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, s1_reg[4], d_reg[4], tmp_reg[4];
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
tmp_reg[ch] = get_itemp(gen);
}
@@ -726,7 +726,7 @@ emit_RCP_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -747,8 +747,8 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
/* d = sign bit cleared in s1 */
@@ -757,7 +757,7 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -775,12 +775,12 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
int s2x_reg, s2y_reg, s2z_reg;
int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
- s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
- s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
- s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ s1x_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ s2x_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
+ s1y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ s2y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
+ s1z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+ s2z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
/* t0 = x0 * x1 */
spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
@@ -795,9 +795,9 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
spe_move(gen->f, d_reg, t0_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -815,14 +815,14 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
int s1x_reg, s1y_reg, s1z_reg, s1w_reg;
int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
- s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
- s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
- s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
- s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
- s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+ s0x_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ s1x_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
+ s0y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ s1y_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
+ s0z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+ s1z_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
+ s0w_reg = get_src_reg(gen, CHAN_W, &inst->Src[0]);
+ s1w_reg = get_src_reg(gen, CHAN_W, &inst->Src[1]);
/* t0 = x0 * x1 */
spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg);
@@ -840,9 +840,9 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
spe_move(gen->f, d_reg, t0_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -857,31 +857,31 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
/* XXX rewrite this function to look more like DP3/DP4 */
int ch;
- int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ int s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ int s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
int tmp_reg = get_itemp(gen);
/* t = x0 * x1 */
spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
- s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
/* t = y0 * y1 + t */
spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
- s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
/* t = z0 * z1 + t */
spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
- s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
+ s2_reg = get_src_reg(gen, CHAN_W, &inst->Src[1]);
/* t = w1 + t */
spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
spe_move(gen->f, d_reg, tmp_reg);
- store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, tmp_reg, ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -898,9 +898,9 @@ emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
int src_reg[3];
int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
- src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+ src_reg[0] = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
/* t0 = x * x */
spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]);
@@ -919,10 +919,10 @@ emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_fi(gen->f, t1_reg, t0_reg, t1_reg);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
/* dst = src[ch] * t1 */
spe_fm(gen->f, d_reg, src_reg[ch], t1_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -936,48 +936,48 @@ emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst)
static boolean
emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ int s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+ int s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
int tmp_reg = get_itemp(gen);
/* t = z0 * y1 */
spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
- s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
/* t = y0 * z1 - t */
spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) {
- store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]);
+ if (inst->Dst[0].Register.WriteMask & (1 << CHAN_X)) {
+ store_dest_reg(gen, tmp_reg, CHAN_X, &inst->Dst[0]);
}
- s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_Z, &inst->Src[1]);
/* t = x0 * z1 */
spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
- s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_Z, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
/* t = z0 * x1 - t */
spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) {
- store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]);
+ if (inst->Dst[0].Register.WriteMask & (1 << CHAN_Y)) {
+ store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->Dst[0]);
}
- s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_Y, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_X, &inst->Src[1]);
/* t = y0 * x1 */
spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
- s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
- s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+ s1_reg = get_src_reg(gen, CHAN_X, &inst->Src[0]);
+ s2_reg = get_src_reg(gen, CHAN_Y, &inst->Src[1]);
/* t = x0 * y1 - t */
spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
- if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) {
- store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]);
+ if (inst->Dst[0].Register.WriteMask & (1 << CHAN_Z)) {
+ store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->Dst[0]);
}
free_itemps(gen);
@@ -995,14 +995,14 @@ static boolean
emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
int ch, s1_reg[4], s2_reg[4], d_reg[4], one_reg;
- bool complement = FALSE;
+ boolean complement = FALSE;
one_reg = get_const_one_reg(gen);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ s2_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
@@ -1043,7 +1043,7 @@ emit_inequality(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -1060,10 +1060,10 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch;
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int s1_reg = get_src_reg(gen, ch, &inst->Src[0]);
+ int s2_reg = get_src_reg(gen, ch, &inst->Src[1]);
+ int s3_reg = get_src_reg(gen, ch, &inst->Src[2]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
int zero_reg = get_itemp(gen);
spe_zero(gen->f, zero_reg);
@@ -1072,7 +1072,7 @@ emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst)
spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg);
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
free_itemps(gen);
}
@@ -1090,8 +1090,8 @@ emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, s1_reg[4], d_reg[4];
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
/* Convert float to int */
@@ -1105,7 +1105,7 @@ emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -1129,8 +1129,8 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
one_reg = get_const_one_reg(gen);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
tmp_reg[ch] = get_itemp(gen);
}
@@ -1156,7 +1156,7 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -1177,8 +1177,8 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
one_reg = get_const_one_reg(gen);
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
tmp_reg[ch] = get_itemp(gen);
}
@@ -1210,7 +1210,7 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* store result */
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -1272,7 +1272,7 @@ emit_function_call(struct codegen *gen,
if (scalar) {
for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]);
+ s_regs[a] = get_src_reg(gen, CHAN_X, &inst->Src[a]);
}
/* we'll call the function, put the return value in this register,
* then replicate it across all write-enabled components in d_reg.
@@ -1287,11 +1287,11 @@ emit_function_call(struct codegen *gen,
if (!scalar) {
for (a = 0; a < num_args; a++) {
- s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
+ s_regs[a] = get_src_reg(gen, ch, &inst->Src[a]);
}
}
- d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
if (!scalar || !func_called) {
/* for a scalar function, we'll really only call the function once */
@@ -1336,7 +1336,7 @@ emit_function_call(struct codegen *gen,
spe_move(gen->f, d_reg, retval_reg);
}
- store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg, ch, &inst->Dst[0]);
free_itemps(gen);
}
@@ -1351,8 +1351,8 @@ emit_function_call(struct codegen *gen,
static boolean
emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
{
- const uint target = inst->InstructionExtTexture.Texture;
- const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ const uint target = inst->Texture.Texture;
+ const uint unit = inst->Src[1].Register.Index;
uint addr;
int ch;
int coord_regs[4], d_regs[4];
@@ -1373,14 +1373,14 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
return FALSE;
}
- assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+ assert(inst->Src[1].Register.File == TGSI_FILE_SAMPLER);
spe_comment(gen->f, -4, "CALL tex:");
/* get src/dst reg info */
for (ch = 0; ch < 4; ch++) {
- coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ coord_regs[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ d_regs[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
}
{
@@ -1425,7 +1425,7 @@ emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_regs[ch], ch, &inst->Dst[0]);
free_itemps(gen);
}
@@ -1452,7 +1452,7 @@ emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* get src regs */
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+ s_regs[ch] = get_src_reg(gen, ch, &inst->Src[0]);
}
/* test if any src regs are < 0 */
@@ -1500,9 +1500,9 @@ emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
- d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ s0_reg[ch] = get_src_reg(gen, ch, &inst->Src[0]);
+ s1_reg[ch] = get_src_reg(gen, ch, &inst->Src[1]);
+ d_reg[ch] = get_dst_reg(gen, ch, &inst->Dst[0]);
tmp_reg[ch] = get_itemp(gen);
}
@@ -1518,7 +1518,7 @@ emit_MIN_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
}
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+ store_dest_reg(gen, d_reg[ch], ch, &inst->Dst[0]);
}
free_itemps(gen);
@@ -1575,7 +1575,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
/* update conditional execution mask with the predicate register */
int tmp_reg = get_itemp(gen);
- int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]);
+ int s1_reg = get_src_reg(gen, channel, &inst->Src[0]);
/* tmp = (s1_reg == 0) */
spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
@@ -1699,8 +1699,8 @@ emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst,
int ch;
FOR_EACH_ENABLED_CHANNEL(inst, ch) {
- int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
- int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+ int s_reg = get_src_reg(gen, ch, &inst->Src[0]);
+ int d_reg = get_dst_reg(gen, ch, &inst->Dst[0]);
int t1_reg = get_itemp(gen);
int t2_reg = get_itemp(gen);
@@ -1909,8 +1909,8 @@ emit_declaration(struct cell_context *cell,
switch (decl->Declaration.File) {
case TGSI_FILE_TEMPORARY:
- for (i = decl->DeclarationRange.First;
- i <= decl->DeclarationRange.Last;
+ for (i = decl->Range.First;
+ i <= decl->Range.Last;
i++) {
assert(i < MAX_TEMPS);
for (ch = 0; ch < 4; ch++) {
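
Note: the FOR_EACH_ENABLED_CHANNEL macro above keys every per-channel loop off the destination write mask, which simply moved from inst->FullDstRegisters[0].DstRegister.WriteMask to inst->Dst[0].Register.WriteMask. A standalone sketch of that write-mask iteration (an assumption: plain stand-in values rather than the real tgsi_full_instruction):

   #include <stdio.h>

   enum { CHAN_X, CHAN_Y, CHAN_Z, CHAN_W };

   int main(void)
   {
      /* e.g. a "MOV dst.xw, src" instruction enables channels X and W */
      unsigned write_mask = (1u << CHAN_X) | (1u << CHAN_W);

      for (int ch = 0; ch < 4; ch++)
         if (write_mask & (1u << ch))
            printf("emit code for channel %d\n", ch);
      return 0;
   }
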
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index ccd0fef6e84..c18a5d0635e 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -383,10 +383,10 @@ cell_init_state_functions(struct cell_context *cell)
cell->pipe.delete_blend_state = cell_delete_blend_state;
cell->pipe.create_sampler_state = cell_create_sampler_state;
- cell->pipe.bind_sampler_states = cell_bind_sampler_states;
+ cell->pipe.bind_fragment_sampler_states = cell_bind_sampler_states;
cell->pipe.delete_sampler_state = cell_delete_sampler_state;
- cell->pipe.set_sampler_textures = cell_set_sampler_textures;
+ cell->pipe.set_fragment_sampler_textures = cell_set_sampler_textures;
cell->pipe.create_depth_stencil_alpha_state = cell_create_depth_stencil_alpha_state;
cell->pipe.bind_depth_stencil_alpha_state = cell_bind_depth_stencil_alpha_state;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index efc4f78364b..b723e794e71 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -66,7 +66,7 @@ calculate_vertex_layout( struct cell_context *cell )
vinfo->num_attribs = 0;
/* we always want to emit vertex pos */
- src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
+ src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
assert(src >= 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
@@ -82,14 +82,14 @@ calculate_vertex_layout( struct cell_context *cell )
break;
case TGSI_SEMANTIC_COLOR:
- src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR,
+ src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_COLOR,
fs->info.input_semantic_index[i]);
assert(src >= 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
break;
case TGSI_SEMANTIC_FOG:
- src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
+ src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
#if 1
if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */
src = 0;
@@ -100,7 +100,7 @@ calculate_vertex_layout( struct cell_context *cell )
case TGSI_SEMANTIC_GENERIC:
/* this includes texcoords and varying vars */
- src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC,
+ src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_GENERIC,
fs->info.input_semantic_index[i]);
assert(src >= 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
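
Note: draw_find_shader_output() (formerly draw_find_vs_output) looks up which vertex shader output slot carries a given TGSI semantic name/index pair. A sketch of that lookup (an assumption: a stub semantic table, not the real draw_context internals):

   #include <stdio.h>

   /* Stand-in semantic IDs; values are illustrative only. */
   enum { SEM_POSITION, SEM_COLOR, SEM_GENERIC };

   struct output { unsigned semantic_name, semantic_index; };

   static int find_output(const struct output *outs, unsigned count,
                          unsigned name, unsigned index)
   {
      for (unsigned i = 0; i < count; i++)
         if (outs[i].semantic_name == name && outs[i].semantic_index == index)
            return (int)i;      /* slot of the matching shader output */
      return -1;                /* not written by the vertex shader */
   }

   int main(void)
   {
      const struct output vs_outputs[] = {
         { SEM_POSITION, 0 }, { SEM_COLOR, 0 }, { SEM_GENERIC, 1 },
      };
      printf("color 0 lives in slot %d\n",
             find_output(vs_outputs, 3, SEM_COLOR, 0));
      return 0;
   }
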
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 9479c0898fd..f1e1dcb9eb0 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
#include "pipe/p_inlines.h"
#include "util/u_memory.h"
+#include "util/u_math.h"
#include "cell_context.h"
#include "cell_gen_fragment.h"
#include "cell_state.h"
@@ -239,12 +240,12 @@ cell_emit_state(struct cell_context *cell)
if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
const uint shader = PIPE_SHADER_FRAGMENT;
- const uint num_const = cell->constants[shader].buffer->size / sizeof(float);
+ const uint num_const = cell->constants[shader]->size / sizeof(float);
uint i, j;
float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
uint32_t *ibuf = (uint32_t *) buf;
const float *constants = pipe_buffer_map(cell->pipe.screen,
- cell->constants[shader].buffer,
+ cell->constants[shader],
PIPE_BUFFER_USAGE_CPU_READ);
ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
ibuf[4] = num_const;
@@ -252,7 +253,7 @@ cell_emit_state(struct cell_context *cell)
for (i = 0; i < num_const; i++) {
buf[j++] = constants[i];
}
- pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+ pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader]);
}
if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
@@ -299,9 +300,9 @@ cell_emit_state(struct cell_context *cell)
for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
texture->start[level] = (ct->mapped +
ct->level_offset[level]);
- texture->width[level] = ct->base.width[level];
- texture->height[level] = ct->base.height[level];
- texture->depth[level] = ct->base.depth[level];
+ texture->width[level] = u_minify(ct->base.width0, level);
+ texture->height[level] = u_minify(ct->base.height0, level);
+ texture->depth[level] = u_minify(ct->base.depth0, level);
}
texture->target = ct->base.target;
}
@@ -330,7 +331,7 @@ cell_emit_state(struct cell_context *cell)
const struct draw_context *const draw = cell->draw;
struct cell_shader_info info;
- info.num_outputs = draw_num_vs_outputs(draw);
+ info.num_outputs = draw_num_shader_outputs(draw);
info.declarations = (uintptr_t) draw->vs.machine.Declarations;
info.num_declarations = draw->vs.machine.NumDeclarations;
info.instructions = (uintptr_t) draw->vs.machine.Instructions;
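
Note: the per-level width[]/height[]/depth[] arrays are gone from pipe_texture; only the level-0 sizes (width0, height0, depth0) are stored and lower mip sizes are recomputed with u_minify(). A sketch of the usual minify helper, halving per level and clamping at 1 (an assumption mirroring what util/u_math.h provides):

   #include <stdio.h>

   static unsigned u_minify(unsigned value, unsigned level)
   {
      unsigned v = value >> level;
      return v ? v : 1;          /* never let a mip dimension reach zero */
   }

   int main(void)
   {
      unsigned width0 = 200;
      for (unsigned level = 0; level < 10; level++)
         printf("level %u: width %u\n", level, u_minify(width0, level));
      return 0;
   }
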
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 6568c784fec..1b09cf7f7d7 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -183,7 +183,7 @@ cell_delete_vs_state(struct pipe_context *pipe, void *vs)
static void
cell_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct cell_context *cell = cell_context(pipe);
@@ -193,7 +193,7 @@ cell_set_constant_buffer(struct pipe_context *pipe,
draw_flush(cell->draw);
/* note: reference counting */
- pipe_buffer_reference(&cell->constants[shader].buffer, buf->buffer);
+ pipe_buffer_reference(&cell->constants[shader], buf);
if (shader == PIPE_SHADER_VERTEX)
cell->dirty |= CELL_NEW_VS_CONSTANTS;
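
Note: set_constant_buffer now takes a plain pipe_buffer pointer, and pipe_buffer_reference() handles the reference counting when the stored pointer is swapped. A minimal sketch of that reference/assign/release pattern (an assumption: a plain int counter and free(), with no atomics or screen destroy hook as in the real helper):

   #include <stdlib.h>

   struct refbuf {
      int refcount;
      /* ...buffer storage... */
   };

   static void refbuf_reference(struct refbuf **dst, struct refbuf *src)
   {
      if (src)
         src->refcount++;                  /* take a ref on the new buffer */
      if (*dst && --(*dst)->refcount == 0)
         free(*dst);                       /* old buffer unreferenced: free it */
      *dst = src;
   }

   int main(void)
   {
      struct refbuf *bound = NULL;
      struct refbuf *created = calloc(1, sizeof *created);
      if (!created)
         return 1;
      created->refcount = 1;               /* creator's reference */

      refbuf_reference(&bound, created);   /* bind:    count 1 -> 2 */
      refbuf_reference(&bound, NULL);      /* unbind:  count 2 -> 1 */
      refbuf_reference(&created, NULL);    /* release: count 1 -> 0, freed */
      return 0;
   }
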
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index ae4c61efb3b..998944f77a3 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -35,6 +35,8 @@
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
#include "pipe/internal/p_winsys_screen.h"
+
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -49,9 +51,9 @@ cell_texture_layout(struct cell_texture *ct)
{
struct pipe_texture *pt = &ct->base;
unsigned level;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned depth = pt->depth[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned depth = pt->depth0;
ct->buffer_size = 0;
@@ -65,17 +67,11 @@ cell_texture_layout(struct cell_texture *ct)
w_tile = align(width, TILE_SIZE);
h_tile = align(height, TILE_SIZE);
- pt->width[level] = width;
- pt->height[level] = height;
- pt->depth[level] = depth;
- pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);
- pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);
-
- ct->stride[level] = pt->nblocksx[level] * pt->block.size;
+ ct->stride[level] = util_format_get_stride(pt->format, w_tile);
ct->level_offset[level] = ct->buffer_size;
- size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
+ size = ct->stride[level] * util_format_get_nblocksy(pt->format, h_tile);
if (pt->target == PIPE_TEXTURE_CUBE)
size *= 6;
else
@@ -83,9 +79,9 @@ cell_texture_layout(struct cell_texture *ct)
ct->buffer_size += size;
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
}
@@ -276,8 +272,8 @@ cell_get_tex_surface(struct pipe_screen *screen,
pipe_reference_init(&ps->reference, 1);
pipe_texture_reference(&ps->texture, pt);
ps->format = pt->format;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
+ ps->width = u_minify(pt->width0, level);
+ ps->height = u_minify(pt->height0, level);
ps->offset = ct->level_offset[level];
/* XXX may need to override usage flags (see sp_texture.c) */
ps->usage = usage;
@@ -286,10 +282,12 @@ cell_get_tex_surface(struct pipe_screen *screen,
ps->zslice = zslice;
if (pt->target == PIPE_TEXTURE_CUBE) {
- ps->offset += face * pt->nblocksy[level] * ct->stride[level];
+ unsigned h_tile = align(ps->height, TILE_SIZE);
+ ps->offset += face * util_format_get_nblocksy(ps->format, h_tile) * ct->stride[level];
}
else if (pt->target == PIPE_TEXTURE_3D) {
- ps->offset += zslice * pt->nblocksy[level] * ct->stride[level];
+ unsigned h_tile = align(ps->height, TILE_SIZE);
+ ps->offset += zslice * util_format_get_nblocksy(ps->format, h_tile) * ct->stride[level];
}
else {
assert(face == 0);
@@ -330,14 +328,10 @@ cell_get_tex_transfer(struct pipe_screen *screen,
if (ctrans) {
struct pipe_transfer *pt = &ctrans->base;
pipe_texture_reference(&pt->texture, texture);
- pt->format = texture->format;
- pt->block = texture->block;
pt->x = x;
pt->y = y;
pt->width = w;
pt->height = h;
- pt->nblocksx = texture->nblocksx[level];
- pt->nblocksy = texture->nblocksy[level];
pt->stride = ct->stride[level];
pt->usage = usage;
pt->face = face;
@@ -347,10 +341,12 @@ cell_get_tex_transfer(struct pipe_screen *screen,
ctrans->offset = ct->level_offset[level];
if (texture->target == PIPE_TEXTURE_CUBE) {
- ctrans->offset += face * pt->nblocksy * pt->stride;
+ unsigned h_tile = align(u_minify(texture->height0, level), TILE_SIZE);
+ ctrans->offset += face * util_format_get_nblocksy(texture->format, h_tile) * pt->stride;
}
else if (texture->target == PIPE_TEXTURE_3D) {
- ctrans->offset += zslice * pt->nblocksy * pt->stride;
+ unsigned h_tile = align(u_minify(texture->height0, level), TILE_SIZE);
+ ctrans->offset += zslice * util_format_get_nblocksy(texture->format, h_tile) * pt->stride;
}
else {
assert(face == 0);
@@ -386,8 +382,8 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
struct pipe_texture *pt = transfer->texture;
struct cell_texture *ct = cell_texture(pt);
const uint level = ctrans->base.level;
- const uint texWidth = pt->width[level];
- const uint texHeight = pt->height[level];
+ const uint texWidth = u_minify(pt->width0, level);
+ const uint texHeight = u_minify(pt->height0, level);
const uint stride = ct->stride[level];
unsigned size;
@@ -403,7 +399,8 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
* Create a buffer of ordinary memory for the linear texture.
* This is the memory that the user will read/write.
*/
- size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
+ size = util_format_get_stride(pt->format, align(texWidth, TILE_SIZE)) *
+ util_format_get_nblocksy(pt->format, align(texHeight, TILE_SIZE));
ctrans->map = align_malloc(size, 16);
if (!ctrans->map)
@@ -411,7 +408,7 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
if (transfer->usage & PIPE_TRANSFER_READ) {
/* need to untwiddle the texture to make a linear version */
- const uint bpp = pf_get_size(ct->base.format);
+ const uint bpp = util_format_get_blocksize(ct->base.format);
if (bpp == 4) {
const uint *src = (uint *) (ct->mapped + ctrans->offset);
uint *dst = ctrans->map;
@@ -440,8 +437,8 @@ cell_transfer_unmap(struct pipe_screen *screen,
struct pipe_texture *pt = transfer->texture;
struct cell_texture *ct = cell_texture(pt);
const uint level = ctrans->base.level;
- const uint texWidth = pt->width[level];
- const uint texHeight = pt->height[level];
+ const uint texWidth = u_minify(pt->width0, level);
+ const uint texHeight = u_minify(pt->height0, level);
const uint stride = ct->stride[level];
if (!ct->mapped) {
@@ -454,7 +451,7 @@ cell_transfer_unmap(struct pipe_screen *screen,
/* The user wrote new texture data into the mapped buffer.
* We need to convert the new linear data into the twiddled/tiled format.
*/
- const uint bpp = pf_get_size(ct->base.format);
+ const uint bpp = util_format_get_blocksize(ct->base.format);
if (bpp == 4) {
const uint *src = ctrans->map;
uint *dst = (uint *) (ct->mapped + ctrans->offset);
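With the block and nblocksx/nblocksy fields gone from pipe_texture and pipe_transfer, the sizes are recomputed on demand through util/u_format.h. For the uncompressed formats this driver tiles (block width and height of 1), the helpers used above reduce to the arithmetic below; this is a sketch only, and the real helpers also handle block-compressed formats:

/* Assuming util_format_get_blockwidth() == util_format_get_blockheight() == 1: */
static unsigned
sketch_get_stride(unsigned blocksize_bytes, unsigned width)
{
   return width * blocksize_bytes;   /* util_format_get_stride() */
}

static unsigned
sketch_get_nblocksy(unsigned height)
{
   return height;                    /* util_format_get_nblocksy() */
}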
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 5c0179d9546..55bd85bde2b 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -53,8 +53,7 @@ struct spu_vs_context draw;
/**
* Buffers containing dynamically generated SPU code:
*/
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
- ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS];
@@ -405,8 +404,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
case PIPE_TEX_FILTER_LINEAR:
spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear;
break;
- case PIPE_TEX_FILTER_ANISO:
- /* fall-through, for now */
case PIPE_TEX_FILTER_NEAREST:
spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest;
break;
@@ -418,8 +415,6 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
case PIPE_TEX_FILTER_LINEAR:
spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear;
break;
- case PIPE_TEX_FILTER_ANISO:
- /* fall-through, for now */
case PIPE_TEX_FILTER_NEAREST:
spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest;
break;
@@ -547,7 +542,7 @@ cmd_batch(uint opcode)
{
const uint buf = (opcode >> 8) & 0xff;
uint size = (opcode >> 16);
- qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) qword buffer[CELL_BUFFER_SIZE / 16];
const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
uint pos;
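ALIGN16_ATTRIB was a postfix attribute; PIPE_ALIGN_VAR(16) is its prefix replacement, which is why the attribute now leads each converted declaration in the hunks above and below. Roughly, and only under the assumption that the definition in p_compiler.h follows the usual compiler split:

#if defined(__GNUC__)
#define SKETCH_ALIGN_VAR(a)  __attribute__((aligned(a)))
#elif defined(_MSC_VER)
#define SKETCH_ALIGN_VAR(a)  __declspec(align(a))
#else
#define SKETCH_ALIGN_VAR(a)
#endif

/* usage mirrors the converted declarations: */
SKETCH_ALIGN_VAR(16) static unsigned char sketch_buffer[64];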
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 4c32b2d06d7..d2166a49016 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -108,10 +108,10 @@
for (CHAN = 0; CHAN < 4; CHAN++)
#define IS_CHANNEL_ENABLED(INST, CHAN)\
- ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+ ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
- ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+ ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
FOR_EACH_CHANNEL( CHAN )\
@@ -431,22 +431,22 @@ fetch_source(
index.i[0] =
index.i[1] =
index.i[2] =
- index.i[3] = reg->SrcRegister.Index;
+ index.i[3] = reg->Register.Index;
- if (reg->SrcRegister.Indirect) {
+ if (reg->Register.Indirect) {
union spu_exec_channel index2;
union spu_exec_channel indir_index;
index2.i[0] =
index2.i[1] =
index2.i[2] =
- index2.i[3] = reg->SrcRegisterInd.Index;
+ index2.i[3] = reg->Indirect.Index;
- swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+ swizzle = tgsi_util_get_src_register_swizzle(&reg->Indirect,
CHAN_X);
fetch_src_file_channel(
mach,
- reg->SrcRegisterInd.File,
+ reg->Indirect.File,
swizzle,
&index2,
&indir_index );
@@ -454,8 +454,8 @@ fetch_source(
index.q = si_a(index.q, indir_index.q);
}
- if( reg->SrcRegister.Dimension ) {
- switch( reg->SrcRegister.File ) {
+ if( reg->Register.Dimension ) {
+ switch( reg->Register.File ) {
case TGSI_FILE_INPUT:
index.q = si_mpyi(index.q, 17);
break;
@@ -466,24 +466,24 @@ fetch_source(
ASSERT( 0 );
}
- index.i[0] += reg->SrcRegisterDim.Index;
- index.i[1] += reg->SrcRegisterDim.Index;
- index.i[2] += reg->SrcRegisterDim.Index;
- index.i[3] += reg->SrcRegisterDim.Index;
+ index.i[0] += reg->Dimension.Index;
+ index.i[1] += reg->Dimension.Index;
+ index.i[2] += reg->Dimension.Index;
+ index.i[3] += reg->Dimension.Index;
- if (reg->SrcRegisterDim.Indirect) {
+ if (reg->Dimension.Indirect) {
union spu_exec_channel index2;
union spu_exec_channel indir_index;
index2.i[0] =
index2.i[1] =
index2.i[2] =
- index2.i[3] = reg->SrcRegisterDimInd.Index;
+ index2.i[3] = reg->DimIndirect.Index;
- swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+ swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
fetch_src_file_channel(
mach,
- reg->SrcRegisterDimInd.File,
+ reg->DimIndirect.File,
swizzle,
&index2,
&indir_index );
@@ -495,7 +495,7 @@ fetch_source(
swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
fetch_src_file_channel(
mach,
- reg->SrcRegister.File,
+ reg->Register.File,
swizzle,
&index,
chan );
@@ -517,7 +517,7 @@ fetch_source(
break;
}
- if (reg->SrcRegisterExtMod.Complement) {
+ if (reg->RegisterExtMod.Complement) {
chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
}
}
@@ -532,21 +532,21 @@ store_dest(
{
union spu_exec_channel *dst;
- switch( reg->DstRegister.File ) {
+ switch( reg->Register.File ) {
case TGSI_FILE_NULL:
return;
case TGSI_FILE_OUTPUT:
dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
- + reg->DstRegister.Index].xyzw[chan_index];
+ + reg->Register.Index].xyzw[chan_index];
break;
case TGSI_FILE_TEMPORARY:
- dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+ dst = &mach->Temps[reg->Register.Index].xyzw[chan_index];
break;
case TGSI_FILE_ADDRESS:
- dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+ dst = &mach->Addrs[reg->Register.Index].xyzw[chan_index];
break;
default:
@@ -583,10 +583,10 @@ store_dest(
}
#define FETCH(VAL,INDEX,CHAN)\
- fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+ fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
#define STORE(VAL,INDEX,CHAN)\
- store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+ store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
/**
@@ -612,7 +612,7 @@ exec_kil(struct spu_exec_machine *mach,
/* unswizzle channel */
swizzle = tgsi_util_get_full_src_register_swizzle (
- &inst->FullSrcRegisters[0],
+ &inst->Src[0],
chan_index);
/* check if the component has not been already tested */
@@ -677,7 +677,7 @@ exec_tex(struct spu_exec_machine *mach,
const struct tgsi_full_instruction *inst,
boolean biasLod, boolean projected)
{
- const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ const uint unit = inst->Src[1].Register.Index;
union spu_exec_channel r[8];
uint chan_index;
float lodBias;
@@ -833,8 +833,8 @@ exec_declaration(struct spu_exec_machine *mach,
unsigned first, last, mask;
interpolation_func interp;
- first = decl->DeclarationRange.First;
- last = decl->DeclarationRange.Last;
+ first = decl->Range.First;
+ last = decl->Range.Last;
mask = decl->Declaration.UsageMask;
switch( decl->Declaration.Interpolate ) {
@@ -1681,7 +1681,7 @@ exec_instruction(
}
break;
- case TGSI_OPCODE_SHR:
+ case TGSI_OPCODE_ISHR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
@@ -1839,10 +1839,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
/* execute declarations (interpolants) */
if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
for (i = 0; i < mach->NumDeclarations; i++) {
+ PIPE_ALIGN_VAR(16)
union {
struct tgsi_full_declaration decl;
qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
- } d ALIGN16_ATTRIB;
+ } d;
unsigned ea = (unsigned) (mach->Declarations + pc);
spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
@@ -1853,10 +1854,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
/* execute instructions, until pc is set to -1 */
while (pc != -1) {
+ PIPE_ALIGN_VAR(16)
union {
struct tgsi_full_instruction inst;
qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
- } i ALIGN16_ATTRIB;
+ } i;
unsigned ea = (unsigned) (mach->Instructions + pc);
spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
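The register-token renames in this file recur verbatim in spu_util.c and i915_fpc_translate.c below. Collected from the hunks in this patch, the mapping is:

/* old TGSI accessor                  new accessor
 * inst->FullSrcRegisters[i]      ->  inst->Src[i]
 * inst->FullDstRegisters[i]      ->  inst->Dst[i]
 * reg->SrcRegister, DstRegister  ->  reg->Register
 * reg->SrcRegisterInd            ->  reg->Indirect
 * reg->SrcRegisterDim            ->  reg->Dimension
 * reg->SrcRegisterDimInd         ->  reg->DimIndirect
 * decl->DeclarationRange         ->  decl->Range
 * inst->InstructionExtTexture    ->  inst->Texture
 */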
diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h
index 86056799405..0ca92af248d 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.h
+++ b/src/gallium/drivers/cell/spu/spu_exec.h
@@ -98,9 +98,9 @@ struct spu_exec_machine
* 4 internal temporaries
* 1 address
*/
+ PIPE_ALIGN_VAR(16)
struct spu_exec_vector Temps[TGSI_EXEC_NUM_TEMPS
- + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]
- ALIGN16_ATTRIB;
+ + TGSI_EXEC_NUM_TEMP_EXTRAS + 1];
struct spu_exec_vector *Addrs;
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index ff3d609d258..98919c43ffc 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -144,7 +144,7 @@ export_func(struct cell_spu_function_info *spu_functions,
void
return_function_info(void)
{
- struct cell_spu_function_info funcs ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) struct cell_spu_function_info funcs;
int tag = TAG_MISC;
ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 33767e7c51d..b18f4c22ef1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -93,6 +93,7 @@ typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
vector float *constants);
+PIPE_ALIGN_TYPE(16,
struct spu_framebuffer
{
void *color_start; /**< addr of color surface in main memory */
@@ -107,10 +108,11 @@ struct spu_framebuffer
uint zsize; /**< 0, 2 or 4 bytes per Z */
float zscale; /**< 65535.0, 2^24-1 or 2^32-1 */
-} ALIGN16_ATTRIB;
+});
/** per-texture level info */
+PIPE_ALIGN_TYPE(16,
struct spu_texture_level
{
void *start;
@@ -123,20 +125,22 @@ struct spu_texture_level
vector signed int mask_s, mask_t, mask_r;
/** texcoord clamp limits */
vector signed int max_s, max_t, max_r;
-} ALIGN16_ATTRIB;
+});
+PIPE_ALIGN_TYPE(16,
struct spu_texture
{
struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
uint max_level;
uint target; /**< PIPE_TEXTURE_x */
-} ALIGN16_ATTRIB;
+});
/**
* All SPU global/context state will be in a singleton object of this type:
*/
+PIPE_ALIGN_TYPE(16,
struct spu_global
{
/** One-time init/constant info */
@@ -155,8 +159,8 @@ struct spu_global
struct vertex_info vertex_info;
/** Current color and Z tiles */
- tile_t ctile ALIGN16_ATTRIB;
- tile_t ztile ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) tile_t ctile;
+ PIPE_ALIGN_VAR(16) tile_t ztile;
/** Read depth/stencil tiles? */
boolean read_depth_stencil;
@@ -165,8 +169,8 @@ struct spu_global
ubyte cur_ctile_status, cur_ztile_status;
/** Status of all tiles in framebuffer */
- ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
- ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
+ PIPE_ALIGN_VAR(16) ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE];
/** Current fragment ops machine code, at 8-byte boundary */
uint *fragment_ops_code;
@@ -175,7 +179,7 @@ struct spu_global
spu_fragment_ops_func fragment_ops[2];
/** Current fragment program machine code, at 8-byte boundary */
- uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
+ PIPE_ALIGN_VAR(8) uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
/** Current fragment ops function */
spu_fragment_program_func fragment_program;
@@ -187,7 +191,7 @@ struct spu_global
/** Fragment program constants */
vector float constants[4 * CELL_MAX_CONSTANTS];
-} ALIGN16_ATTRIB;
+});
extern struct spu_global spu;
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 5ffb7073abf..14987e3c3a2 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -169,7 +169,7 @@ void
cmd_render(const struct cell_command_render *render, uint *pos_incr)
{
/* we'll DMA into these buffers */
- ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) ubyte vertex_data[CELL_BUFFER_SIZE];
const uint vertex_size = render->vertex_size; /* in bytes */
/*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
uint index_bytes;
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index c2c32b22d5a..24057e29e36 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -33,7 +33,7 @@ tgsi_util_get_full_src_register_swizzle(
unsigned component )
{
return tgsi_util_get_src_register_swizzle(
- reg->SrcRegister,
+ reg->Register,
component );
}
@@ -45,10 +45,10 @@ tgsi_util_get_full_src_register_sign_mode(
{
unsigned sign_mode;
- if( reg->SrcRegisterExtMod.Absolute ) {
+ if( reg->RegisterExtMod.Absolute ) {
/* Consider only the post-abs negation. */
- if( reg->SrcRegisterExtMod.Negate ) {
+ if( reg->RegisterExtMod.Negate ) {
sign_mode = TGSI_UTIL_SIGN_SET;
}
else {
@@ -60,8 +60,8 @@ tgsi_util_get_full_src_register_sign_mode(
unsigned negate;
- negate = reg->SrcRegister.Negate;
- if( reg->SrcRegisterExtMod.Negate ) {
+ negate = reg->Register.Negate;
+ if( reg->RegisterExtMod.Negate ) {
negate = !negate;
}
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 03375d84a57..087963960df 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -43,7 +43,8 @@ typedef void (*spu_fetch_func)(qword *out, const qword *in,
const qword *shuffle_data);
-static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16) static const qword
+fetch_shuffle_data[5] = {
/* Shuffle used by CVT_64_FLOAT
*/
{
@@ -110,7 +111,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
unsigned idx;
const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
- qword in[2 * 4] ALIGN16_ATTRIB;
+ PIPE_ALIGN_VAR(16) qword in[2 * 4];
/* Fetch four attributes for four vertices.
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index fbe5b34d397..3e9804bf8ee 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -107,8 +107,8 @@ run_vertex_program(struct spu_vs_context *draw,
struct spu_exec_machine *machine = &draw->machine;
unsigned int j;
- ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS);
- ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS);
+ PIPE_ALIGN_VAR(16) struct spu_exec_vector inputs[PIPE_MAX_ATTRIBS];
+ PIPE_ALIGN_VAR(16) struct spu_exec_vector outputs[PIPE_MAX_ATTRIBS];
const float *scale = draw->viewport.scale;
const float *trans = draw->viewport.translate;
@@ -119,8 +119,8 @@ run_vertex_program(struct spu_vs_context *draw,
ASSERT_ALIGN16(draw->constants);
machine->Consts = (float (*)[4]) draw->constants;
- machine->Inputs = ALIGN16_ASSIGN(inputs);
- machine->Outputs = ALIGN16_ASSIGN(outputs);
+ machine->Inputs = inputs;
+ machine->Outputs = outputs;
spu_vertex_fetch( draw, machine, elts, count );
@@ -132,8 +132,9 @@ run_vertex_program(struct spu_vs_context *draw,
for (j = 0; j < count; j++) {
unsigned slot;
float x, y, z, w;
+ PIPE_ALIGN_VAR(16)
unsigned char buffer[sizeof(struct vertex_header)
- + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+ + MAX_VERTEX_SIZE];
struct vertex_header *const tmpOut =
(struct vertex_header *) buffer;
const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
@@ -186,8 +187,8 @@ run_vertex_program(struct spu_vs_context *draw,
}
-unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
- ALIGN16_ATTRIB;
+PIPE_ALIGN_VAR(16) unsigned char
+immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32];
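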
void
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
index 37184eac7b1..46e4338d98a 100644
--- a/src/gallium/drivers/failover/fo_context.c
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -44,11 +44,19 @@ static void failover_destroy( struct pipe_context *pipe )
}
+void failover_fail_over( struct failover_context *failover )
+{
+ failover->dirty = TRUE;
+ failover->mode = FO_SW;
+}
+
-static boolean failover_draw_elements( struct pipe_context *pipe,
- struct pipe_buffer *indexBuffer,
- unsigned indexSize,
- unsigned prim, unsigned start, unsigned count)
+static void failover_draw_elements( struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned prim,
+ unsigned start,
+ unsigned count)
{
struct failover_context *failover = failover_context( pipe );
@@ -62,24 +70,22 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
/* Try hardware:
*/
if (failover->mode == FO_HW) {
- if (!failover->hw->draw_elements( failover->hw,
- indexBuffer,
- indexSize,
- prim,
- start,
- count )) {
-
- failover->hw->flush( failover->hw, ~0, NULL );
- failover->mode = FO_SW;
- }
+ failover->hw->draw_elements( failover->hw,
+ indexBuffer,
+ indexSize,
+ prim,
+ start,
+ count );
}
/* Possibly try software:
*/
if (failover->mode == FO_SW) {
- if (failover->dirty)
+ if (failover->dirty) {
+ failover->hw->flush( failover->hw, ~0, NULL );
failover_state_emit( failover );
+ }
failover->sw->draw_elements( failover->sw,
indexBuffer,
@@ -94,15 +100,13 @@ static boolean failover_draw_elements( struct pipe_context *pipe,
*/
failover->sw->flush( failover->sw, ~0, NULL );
}
-
- return TRUE;
}
-static boolean failover_draw_arrays( struct pipe_context *pipe,
+static void failover_draw_arrays( struct pipe_context *pipe,
unsigned prim, unsigned start, unsigned count)
{
- return failover_draw_elements(pipe, NULL, 0, prim, start, count);
+ failover_draw_elements(pipe, NULL, 0, prim, start, count);
}
static unsigned int
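The hardware driver no longer reports a failed draw through a boolean return; instead it is expected to call the newly exported failover_fail_over() (declared in fo_winsys.h below), after which the software path in the draw functions above takes over. A condensed sketch of that entry point, with placeholder types standing in for the real failover_context:

enum sketch_mode { SKETCH_FO_HW, SKETCH_FO_SW };

struct sketch_failover {
   enum sketch_mode mode;
   int dirty;
};

/* Mirrors failover_fail_over(): mark state dirty so the software driver
 * receives a full state emit, then route subsequent drawing to it. */
static void
sketch_fail_over(struct sketch_failover *fo)
{
   fo->dirty = 1;
   fo->mode = SKETCH_FO_SW;
}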
diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h
index 9ba86ba8664..191a44c3dfc 100644
--- a/src/gallium/drivers/failover/fo_context.h
+++ b/src/gallium/drivers/failover/fo_context.h
@@ -72,6 +72,7 @@ struct failover_context {
*/
const struct fo_state *blend;
const struct fo_state *sampler[PIPE_MAX_SAMPLERS];
+ const struct fo_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
const struct fo_state *depth_stencil;
const struct fo_state *rasterizer;
const struct fo_state *fragment_shader;
@@ -83,6 +84,7 @@ struct failover_context {
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+ struct pipe_texture *vertex_textures[PIPE_MAX_VERTEX_SAMPLERS];
struct pipe_viewport_state viewport;
struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
struct pipe_vertex_element vertex_elements[PIPE_MAX_ATTRIBS];
@@ -92,11 +94,15 @@ struct failover_context {
void *sw_sampler_state[PIPE_MAX_SAMPLERS];
void *hw_sampler_state[PIPE_MAX_SAMPLERS];
+ void *sw_vertex_sampler_state[PIPE_MAX_VERTEX_SAMPLERS];
+ void *hw_vertex_sampler_state[PIPE_MAX_VERTEX_SAMPLERS];
unsigned dirty;
unsigned num_samplers;
+ unsigned num_vertex_samplers;
unsigned num_textures;
+ unsigned num_vertex_textures;
unsigned mode;
struct pipe_context *hw;
@@ -119,7 +125,7 @@ failover_context( struct pipe_context *pipe )
void
failover_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf);
+ struct pipe_buffer *buf);
#endif /* FO_CONTEXT_H */
diff --git a/src/gallium/drivers/failover/fo_state.c b/src/gallium/drivers/failover/fo_state.c
index c8eb9262994..d6ec4d13136 100644
--- a/src/gallium/drivers/failover/fo_state.c
+++ b/src/gallium/drivers/failover/fo_state.c
@@ -322,8 +322,9 @@ failover_create_sampler_state(struct pipe_context *pipe,
}
static void
-failover_bind_sampler_states(struct pipe_context *pipe,
- unsigned num, void **sampler)
+failover_bind_fragment_sampler_states(struct pipe_context *pipe,
+ unsigned num,
+ void **sampler)
{
struct failover_context *failover = failover_context(pipe);
struct fo_state *state = (struct fo_state*)sampler;
@@ -339,10 +340,40 @@ failover_bind_sampler_states(struct pipe_context *pipe,
}
failover->dirty |= FO_NEW_SAMPLER;
failover->num_samplers = num;
- failover->sw->bind_sampler_states(failover->sw, num,
- failover->sw_sampler_state);
- failover->hw->bind_sampler_states(failover->hw, num,
- failover->hw_sampler_state);
+ failover->sw->bind_fragment_sampler_states(failover->sw, num,
+ failover->sw_sampler_state);
+ failover->hw->bind_fragment_sampler_states(failover->hw, num,
+ failover->hw_sampler_state);
+}
+
+static void
+failover_bind_vertex_sampler_states(struct pipe_context *pipe,
+ unsigned num_samplers,
+ void **samplers)
+{
+ struct failover_context *failover = failover_context(pipe);
+ struct fo_state *state = (struct fo_state*)samplers;
+ uint i;
+
+ assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_samplers == failover->num_vertex_samplers &&
+ !memcmp(failover->vertex_samplers, samplers, num_samplers * sizeof(void *))) {
+ return;
+ }
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ failover->sw_vertex_sampler_state[i] = i < num_samplers ? state[i].sw_state : NULL;
+ failover->hw_vertex_sampler_state[i] = i < num_samplers ? state[i].hw_state : NULL;
+ }
+ failover->dirty |= FO_NEW_SAMPLER;
+ failover->num_vertex_samplers = num_samplers;
+ failover->sw->bind_vertex_sampler_states(failover->sw,
+ num_samplers,
+ failover->sw_vertex_sampler_state);
+ failover->hw->bind_vertex_sampler_states(failover->hw,
+ num_samplers,
+ failover->hw_vertex_sampler_state);
}
static void
@@ -360,9 +391,9 @@ failover_delete_sampler_state(struct pipe_context *pipe, void *sampler)
static void
-failover_set_sampler_textures(struct pipe_context *pipe,
- unsigned num,
- struct pipe_texture **texture)
+failover_set_fragment_sampler_textures(struct pipe_context *pipe,
+ unsigned num,
+ struct pipe_texture **texture)
{
struct failover_context *failover = failover_context(pipe);
uint i;
@@ -381,8 +412,38 @@ failover_set_sampler_textures(struct pipe_context *pipe,
NULL);
failover->dirty |= FO_NEW_TEXTURE;
failover->num_textures = num;
- failover->sw->set_sampler_textures( failover->sw, num, texture );
- failover->hw->set_sampler_textures( failover->hw, num, texture );
+ failover->sw->set_fragment_sampler_textures( failover->sw, num, texture );
+ failover->hw->set_fragment_sampler_textures( failover->hw, num, texture );
+}
+
+
+static void
+failover_set_vertex_sampler_textures(struct pipe_context *pipe,
+ unsigned num_textures,
+ struct pipe_texture **textures)
+{
+ struct failover_context *failover = failover_context(pipe);
+ uint i;
+
+ assert(num_textures <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_textures == failover->num_vertex_textures &&
+ !memcmp(failover->vertex_textures, textures, num_textures * sizeof(struct pipe_texture *))) {
+ return;
+ }
+ for (i = 0; i < num_textures; i++) {
+ pipe_texture_reference((struct pipe_texture **)&failover->vertex_textures[i],
+ textures[i]);
+ }
+ for (i = num_textures; i < failover->num_vertex_textures; i++) {
+ pipe_texture_reference((struct pipe_texture **)&failover->vertex_textures[i],
+ NULL);
+ }
+ failover->dirty |= FO_NEW_TEXTURE;
+ failover->num_vertex_textures = num_textures;
+ failover->sw->set_vertex_sampler_textures(failover->sw, num_textures, textures);
+ failover->hw->set_vertex_sampler_textures(failover->hw, num_textures, textures);
}
@@ -434,7 +495,7 @@ failover_set_vertex_elements(struct pipe_context *pipe,
void
failover_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct failover_context *failover = failover_context(pipe);
@@ -453,7 +514,8 @@ failover_init_state_functions( struct failover_context *failover )
failover->pipe.bind_blend_state = failover_bind_blend_state;
failover->pipe.delete_blend_state = failover_delete_blend_state;
failover->pipe.create_sampler_state = failover_create_sampler_state;
- failover->pipe.bind_sampler_states = failover_bind_sampler_states;
+ failover->pipe.bind_fragment_sampler_states = failover_bind_fragment_sampler_states;
+ failover->pipe.bind_vertex_sampler_states = failover_bind_vertex_sampler_states;
failover->pipe.delete_sampler_state = failover_delete_sampler_state;
failover->pipe.create_depth_stencil_alpha_state = failover_create_depth_stencil_state;
failover->pipe.bind_depth_stencil_alpha_state = failover_bind_depth_stencil_state;
@@ -473,7 +535,8 @@ failover_init_state_functions( struct failover_context *failover )
failover->pipe.set_framebuffer_state = failover_set_framebuffer_state;
failover->pipe.set_polygon_stipple = failover_set_polygon_stipple;
failover->pipe.set_scissor_state = failover_set_scissor_state;
- failover->pipe.set_sampler_textures = failover_set_sampler_textures;
+ failover->pipe.set_fragment_sampler_textures = failover_set_fragment_sampler_textures;
+ failover->pipe.set_vertex_sampler_textures = failover_set_vertex_sampler_textures;
failover->pipe.set_viewport_state = failover_set_viewport_state;
failover->pipe.set_vertex_buffers = failover_set_vertex_buffers;
failover->pipe.set_vertex_elements = failover_set_vertex_elements;
diff --git a/src/gallium/drivers/failover/fo_state_emit.c b/src/gallium/drivers/failover/fo_state_emit.c
index bd4fce9d209..a3341e33f80 100644
--- a/src/gallium/drivers/failover/fo_state_emit.c
+++ b/src/gallium/drivers/failover/fo_state_emit.c
@@ -92,13 +92,19 @@ failover_state_emit( struct failover_context *failover )
failover->sw->set_viewport_state( failover->sw, &failover->viewport );
if (failover->dirty & FO_NEW_SAMPLER) {
- failover->sw->bind_sampler_states( failover->sw, failover->num_samplers,
- failover->sw_sampler_state );
+ failover->sw->bind_fragment_sampler_states( failover->sw, failover->num_samplers,
+ failover->sw_sampler_state );
+ failover->sw->bind_vertex_sampler_states(failover->sw,
+ failover->num_vertex_samplers,
+ failover->sw_vertex_sampler_state);
}
if (failover->dirty & FO_NEW_TEXTURE) {
- failover->sw->set_sampler_textures( failover->sw, failover->num_textures,
- failover->texture );
+ failover->sw->set_fragment_sampler_textures( failover->sw, failover->num_textures,
+ failover->texture );
+ failover->sw->set_vertex_sampler_textures(failover->sw,
+ failover->num_vertex_textures,
+ failover->vertex_textures);
}
if (failover->dirty & FO_NEW_VERTEX_BUFFER) {
diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h
index a8ce997a1f6..533122b69da 100644
--- a/src/gallium/drivers/failover/fo_winsys.h
+++ b/src/gallium/drivers/failover/fo_winsys.h
@@ -36,10 +36,13 @@
struct pipe_context;
+struct failover_context;
struct pipe_context *failover_create( struct pipe_context *hw,
struct pipe_context *sw );
+void failover_fail_over( struct failover_context *failover );
+
#endif /* FO_WINSYS_H */
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 16ca6381191..4c5ff37ca84 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -42,7 +42,7 @@
*/
-static boolean
+static void
i915_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -81,7 +81,7 @@ i915_draw_range_elements(struct pipe_context *pipe,
}
- draw_set_mapped_constant_buffer(draw,
+ draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
i915->current.constants[PIPE_SHADER_VERTEX],
(i915->current.num_user_constants[PIPE_SHADER_VERTEX] *
4 * sizeof(float)));
@@ -103,27 +103,25 @@ i915_draw_range_elements(struct pipe_context *pipe,
pipe_buffer_unmap(pipe->screen, indexBuffer);
draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL);
}
-
- return TRUE;
}
-static boolean
+static void
i915_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count)
{
- return i915_draw_range_elements(pipe, indexBuffer,
- indexSize,
- 0, 0xffffffff,
- prim, start, count);
+ i915_draw_range_elements(pipe, indexBuffer,
+ indexSize,
+ 0, 0xffffffff,
+ prim, start, count);
}
-static boolean
+static void
i915_draw_arrays(struct pipe_context *pipe,
unsigned prim, unsigned start, unsigned count)
{
- return i915_draw_elements(pipe, NULL, 0, prim, start, count);
+ i915_draw_elements(pipe, NULL, 0, prim, start, count);
}
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 234b441ce6e..37cbd56036b 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -233,7 +233,8 @@ struct i915_context
struct pipe_blend_color blend_color;
struct pipe_clip_state clip;
- struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+ /* XXX unneeded */
+ struct pipe_buffer *constants[PIPE_SHADER_TYPES];
struct pipe_framebuffer_state framebuffer;
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 379d47e79a3..25c53210be8 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -143,12 +143,12 @@ static uint
src_vector(struct i915_fp_compile *p,
const struct tgsi_full_src_register *source)
{
- uint index = source->SrcRegister.Index;
+ uint index = source->Register.Index;
uint src = 0, sem_name, sem_ind;
- switch (source->SrcRegister.File) {
+ switch (source->Register.File) {
case TGSI_FILE_TEMPORARY:
- if (source->SrcRegister.Index >= I915_MAX_TEMPORARY) {
+ if (source->Register.Index >= I915_MAX_TEMPORARY) {
i915_program_error(p, "Exceeded max temporary reg");
return 0;
}
@@ -215,26 +215,25 @@ src_vector(struct i915_fp_compile *p,
}
src = swizzle(src,
- source->SrcRegister.SwizzleX,
- source->SrcRegister.SwizzleY,
- source->SrcRegister.SwizzleZ,
- source->SrcRegister.SwizzleW);
+ source->Register.SwizzleX,
+ source->Register.SwizzleY,
+ source->Register.SwizzleZ,
+ source->Register.SwizzleW);
/* There's both negate-all-components and per-component negation.
* Try to handle both here.
*/
{
- int n = source->SrcRegister.Negate;
+ int n = source->Register.Negate;
src = negate(src, n, n, n, n);
}
- /* no abs() or post-abs negation */
+ /* no abs() */
#if 0
/* XXX assertions disabled to allow arbfplight.c to run */
/* XXX enable these assertions, or fix things */
- assert(!source->SrcRegisterExtMod.Absolute);
- assert(!source->SrcRegisterExtMod.Negate);
+ assert(!source->Register.Absolute);
#endif
return src;
}
@@ -247,10 +246,10 @@ static uint
get_result_vector(struct i915_fp_compile *p,
const struct tgsi_full_dst_register *dest)
{
- switch (dest->DstRegister.File) {
+ switch (dest->Register.File) {
case TGSI_FILE_OUTPUT:
{
- uint sem_name = p->shader->info.output_semantic_name[dest->DstRegister.Index];
+ uint sem_name = p->shader->info.output_semantic_name[dest->Register.Index];
switch (sem_name) {
case TGSI_SEMANTIC_POSITION:
return UREG(REG_TYPE_OD, 0);
@@ -262,7 +261,7 @@ get_result_vector(struct i915_fp_compile *p,
}
}
case TGSI_FILE_TEMPORARY:
- return UREG(REG_TYPE_R, dest->DstRegister.Index);
+ return UREG(REG_TYPE_R, dest->Register.Index);
default:
i915_program_error(p, "Bad inst->DstReg.File");
return 0;
@@ -277,7 +276,7 @@ static uint
get_result_flags(const struct tgsi_full_instruction *inst)
{
const uint writeMask
- = inst->FullDstRegisters[0].DstRegister.WriteMask;
+ = inst->Dst[0].Register.WriteMask;
uint flags = 0x0;
if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE)
@@ -339,14 +338,14 @@ emit_tex(struct i915_fp_compile *p,
const struct tgsi_full_instruction *inst,
uint opcode)
{
- uint texture = inst->InstructionExtTexture.Texture;
- uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ uint texture = inst->Texture.Texture;
+ uint unit = inst->Src[1].Register.Index;
uint tex = translate_tex_src_target( p, texture );
uint sampler = i915_emit_decl(p, REG_TYPE_S, unit, tex);
- uint coord = src_vector( p, &inst->FullSrcRegisters[0]);
+ uint coord = src_vector( p, &inst->Src[0]);
i915_emit_texld( p,
- get_result_vector( p, &inst->FullDstRegisters[0] ),
+ get_result_vector( p, &inst->Dst[0] ),
get_result_flags( inst ),
sampler,
coord,
@@ -368,13 +367,13 @@ emit_simple_arith(struct i915_fp_compile *p,
assert(numArgs <= 3);
- arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->FullSrcRegisters[0] );
- arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->FullSrcRegisters[1] );
- arg3 = (numArgs < 3) ? 0 : src_vector( p, &inst->FullSrcRegisters[2] );
+ arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->Src[0] );
+ arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->Src[1] );
+ arg3 = (numArgs < 3) ? 0 : src_vector( p, &inst->Src[2] );
i915_emit_arith( p,
opcode,
- get_result_vector( p, &inst->FullDstRegisters[0]),
+ get_result_vector( p, &inst->Dst[0]),
get_result_flags( inst ), 0,
arg1,
arg2,
@@ -394,8 +393,8 @@ emit_simple_arith_swap2(struct i915_fp_compile *p,
/* transpose first two registers */
inst2 = *inst;
- inst2.FullSrcRegisters[0] = inst->FullSrcRegisters[1];
- inst2.FullSrcRegisters[1] = inst->FullSrcRegisters[0];
+ inst2.Src[0] = inst->Src[1];
+ inst2.Src[1] = inst->Src[0];
emit_simple_arith(p, &inst2, opcode, numArgs);
}
@@ -424,10 +423,10 @@ i915_translate_instruction(struct i915_fp_compile *p,
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
i915_emit_arith(p,
A0_MAX,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
src0, negate(src0, 1, 1, 1, 1), 0);
break;
@@ -437,17 +436,17 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_CMP:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
- src2 = src_vector(p, &inst->FullSrcRegisters[2]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
+ src2 = src_vector(p, &inst->Src[2]);
i915_emit_arith(p, A0_CMP,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst),
0, src0, src2, src1); /* NOTE: order of src2, src1 */
break;
case TGSI_OPCODE_COS:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
tmp = i915_get_utemp(p);
i915_emit_arith(p,
@@ -490,7 +489,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_DP4,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(tmp, ONE, Z, Y, X),
i915_emit_const4fv(p, cos_constants), 0);
@@ -505,19 +504,19 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_DPH:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
i915_emit_arith(p,
A0_DP4,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, X, Y, Z, ONE), src1, 0);
break;
case TGSI_OPCODE_DST:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
/* result[0] = 1 * 1;
* result[1] = a[1] * b[1];
@@ -526,7 +525,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
*/
i915_emit_arith(p,
A0_MUL,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, ONE, Y, Z, ONE),
swizzle(src1, ONE, Y, ONE, W), 0);
@@ -537,11 +536,11 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_EX2:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
i915_emit_arith(p,
A0_EXP,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, X, X, X, X), 0, 0);
break;
@@ -556,7 +555,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
case TGSI_OPCODE_KIL:
/* kill if src[0].x < 0 || src[0].y < 0 ... */
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
tmp = i915_get_utemp(p);
i915_emit_texld(p,
@@ -572,17 +571,17 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_LG2:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
i915_emit_arith(p,
A0_LOG,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, X, X, X, X), 0, 0);
break;
case TGSI_OPCODE_LIT:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
tmp = i915_get_utemp(p);
/* tmp = max( a.xyzw, a.00zw )
@@ -606,7 +605,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
swizzle(tmp, Y, Y, Y, Y), 0, 0);
i915_emit_arith(p, A0_CMP,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0),
swizzle(tmp, ONE, X, ZERO, ONE),
@@ -615,9 +614,9 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_LRP:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
- src2 = src_vector(p, &inst->FullSrcRegisters[2]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
+ src2 = src_vector(p, &inst->Src[2]);
flags = get_result_flags(inst);
tmp = i915_get_utemp(p);
@@ -632,7 +631,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2);
i915_emit_arith(p, A0_MAD,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp);
break;
@@ -645,8 +644,8 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_MIN:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
tmp = i915_get_utemp(p);
flags = get_result_flags(inst);
@@ -658,7 +657,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_MOV,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0);
break;
@@ -671,8 +670,8 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_POW:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
tmp = i915_get_utemp(p);
flags = get_result_flags(inst);
@@ -687,7 +686,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_EXP,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
flags, 0, swizzle(tmp, X, X, X, X), 0, 0);
break;
@@ -696,27 +695,27 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_RCP:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
i915_emit_arith(p,
A0_RCP,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, X, X, X, X), 0, 0);
break;
case TGSI_OPCODE_RSQ:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
i915_emit_arith(p,
A0_RSQ,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, X, X, X, X), 0, 0);
break;
case TGSI_OPCODE_SCS:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
tmp = i915_get_utemp(p);
/*
@@ -739,7 +738,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
swizzle(tmp, X, Y, X, Y),
swizzle(tmp, X, X, ONE, ONE), 0);
- writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+ writemask = inst->Dst[0].Register.WriteMask;
if (writemask & TGSI_WRITEMASK_Y) {
uint tmp1;
@@ -757,7 +756,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_DP4,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
A0_DEST_CHANNEL_Y, 0,
swizzle(tmp1, W, Z, Y, X),
i915_emit_const4fv(p, sin_constants), 0);
@@ -772,7 +771,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_DP4,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
A0_DEST_CHANNEL_X, 0,
swizzle(tmp, ONE, Z, Y, X),
i915_emit_const4fv(p, cos_constants), 0);
@@ -789,7 +788,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_SIN:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
+ src0 = src_vector(p, &inst->Src[0]);
tmp = i915_get_utemp(p);
i915_emit_arith(p,
@@ -832,7 +831,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_DP4,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(tmp, W, Z, Y, X),
i915_emit_const4fv(p, sin_constants), 0);
@@ -848,12 +847,12 @@ i915_translate_instruction(struct i915_fp_compile *p,
break;
case TGSI_OPCODE_SUB:
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
i915_emit_arith(p,
A0_ADD,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
src0, negate(src1, 1, 1, 1, 1), 0);
break;
@@ -877,8 +876,8 @@ i915_translate_instruction(struct i915_fp_compile *p,
* result.z = src0.x * src1.y - src0.y * src1.x;
* result.w = undef;
*/
- src0 = src_vector(p, &inst->FullSrcRegisters[0]);
- src1 = src_vector(p, &inst->FullSrcRegisters[1]);
+ src0 = src_vector(p, &inst->Src[0]);
+ src1 = src_vector(p, &inst->Src[1]);
tmp = i915_get_utemp(p);
i915_emit_arith(p,
@@ -889,7 +888,7 @@ i915_translate_instruction(struct i915_fp_compile *p,
i915_emit_arith(p,
A0_MAD,
- get_result_vector(p, &inst->FullDstRegisters[0]),
+ get_result_vector(p, &inst->Dst[0]),
get_result_flags(inst), 0,
swizzle(src0, Y, Z, X, ONE),
swizzle(src1, Z, X, Y, ONE),
@@ -929,8 +928,8 @@ i915_translate_instructions(struct i915_fp_compile *p,
if (parse.FullToken.FullDeclaration.Declaration.File
== TGSI_FILE_CONSTANT) {
uint i;
- for (i = parse.FullToken.FullDeclaration.DeclarationRange.First;
- i <= parse.FullToken.FullDeclaration.DeclarationRange.Last;
+ for (i = parse.FullToken.FullDeclaration.Range.First;
+ i <= parse.FullToken.FullDeclaration.Range.Last;
i++) {
assert(ifs->constant_flags[i] == 0x0);
ifs->constant_flags[i] = I915_CONSTFLAG_USER;
@@ -940,8 +939,8 @@ i915_translate_instructions(struct i915_fp_compile *p,
else if (parse.FullToken.FullDeclaration.Declaration.File
== TGSI_FILE_TEMPORARY) {
uint i;
- for (i = parse.FullToken.FullDeclaration.DeclarationRange.First;
- i <= parse.FullToken.FullDeclaration.DeclarationRange.Last;
+ for (i = parse.FullToken.FullDeclaration.Range.First;
+ i <= parse.FullToken.FullDeclaration.Range.Last;
i++) {
assert(i < I915_MAX_TEMPORARY);
/* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index 1c8cb91525b..06949c15f88 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -72,8 +72,6 @@ static unsigned translate_img_filter( unsigned filter )
return FILTER_NEAREST;
case PIPE_TEX_FILTER_LINEAR:
return FILTER_LINEAR;
- case PIPE_TEX_FILTER_ANISO:
- return FILTER_ANISOTROPIC;
default:
assert(0);
return FILTER_NEAREST;
@@ -219,6 +217,9 @@ i915_create_sampler_state(struct pipe_context *pipe,
minFilt = translate_img_filter( sampler->min_img_filter );
magFilt = translate_img_filter( sampler->mag_img_filter );
+ if (sampler->max_anisotropy > 1.0)
+ minFilt = magFilt = FILTER_ANISOTROPIC;
+
if (sampler->max_anisotropy > 2.0) {
cso->state[0] |= SS2_MAX_ANISO_4;
}
@@ -515,7 +516,7 @@ static void i915_delete_vs_state(struct pipe_context *pipe, void *shader)
static void i915_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct i915_context *i915 = i915_context(pipe);
struct pipe_screen *screen = pipe->screen;
@@ -535,13 +536,13 @@ static void i915_set_constant_buffer(struct pipe_context *pipe,
*/
if (buf) {
void *mapped;
- if (buf->buffer && buf->buffer->size &&
- (mapped = pipe_buffer_map(screen, buf->buffer,
+ if (buf->size &&
+ (mapped = pipe_buffer_map(screen, buf,
PIPE_BUFFER_USAGE_CPU_READ))) {
- memcpy(i915->current.constants[shader], mapped, buf->buffer->size);
- pipe_buffer_unmap(screen, buf->buffer);
+ memcpy(i915->current.constants[shader], mapped, buf->size);
+ pipe_buffer_unmap(screen, buf);
i915->current.num_user_constants[shader]
- = buf->buffer->size / (4 * sizeof(float));
+ = buf->size / (4 * sizeof(float));
}
else {
i915->current.num_user_constants[shader] = 0;
@@ -750,22 +751,15 @@ static void i915_set_vertex_elements(struct pipe_context *pipe,
}
-static void i915_set_edgeflags(struct pipe_context *pipe,
- const unsigned *bitfield)
-{
- /* TODO do something here */
-}
-
void
i915_init_state_functions( struct i915_context *i915 )
{
- i915->base.set_edgeflags = i915_set_edgeflags;
i915->base.create_blend_state = i915_create_blend_state;
i915->base.bind_blend_state = i915_bind_blend_state;
i915->base.delete_blend_state = i915_delete_blend_state;
i915->base.create_sampler_state = i915_create_sampler_state;
- i915->base.bind_sampler_states = i915_bind_sampler_states;
+ i915->base.bind_fragment_sampler_states = i915_bind_sampler_states;
i915->base.delete_sampler_state = i915_delete_sampler_state;
i915->base.create_depth_stencil_alpha_state = i915_create_depth_stencil_state;
@@ -789,7 +783,7 @@ i915_init_state_functions( struct i915_context *i915 )
i915->base.set_polygon_stipple = i915_set_polygon_stipple;
i915->base.set_scissor_state = i915_set_scissor_state;
- i915->base.set_sampler_textures = i915_set_sampler_textures;
+ i915->base.set_fragment_sampler_textures = i915_set_sampler_textures;
i915->base.set_viewport_state = i915_set_viewport_state;
i915->base.set_vertex_buffers = i915_set_vertex_buffers;
i915->base.set_vertex_elements = i915_set_vertex_elements;
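With PIPE_TEX_FILTER_ANISO gone from the filter enum, anisotropic filtering is now keyed off the sampler's max_anisotropy value, as the i915_create_sampler_state() hunk above shows. A hedged sketch of the resulting selection, using placeholder constants for the hardware FILTER_* values:

enum sketch_filter { SKETCH_NEAREST, SKETCH_LINEAR, SKETCH_ANISOTROPIC };

static enum sketch_filter
sketch_pick_filter(enum sketch_filter img_filter, float max_anisotropy)
{
   /* max_anisotropy > 1.0 overrides the plain min/mag image filter */
   return (max_anisotropy > 1.0f) ? SKETCH_ANISOTROPIC : img_filter;
}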
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 1373f9e0975..f5b0e9f011e 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -83,7 +83,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
/* pos */
- src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
+ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
if (needW) {
draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
vinfo.hwfmt[0] |= S4_VFMT_XYZW;
@@ -100,21 +100,21 @@ static void calculate_vertex_layout( struct i915_context *i915 )
/* primary color */
if (colors[0]) {
- src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
+ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src);
vinfo.hwfmt[0] |= S4_VFMT_COLOR;
}
/* secondary color */
if (colors[1]) {
- src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
+ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src);
vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
}
/* fog coord, not fog blend factor */
if (fog) {
- src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
+ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
}
@@ -124,7 +124,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
uint hwtc;
if (texCoords[i]) {
hwtc = TEXCOORDFMT_4D;
- src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_GENERIC, i);
+ src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, i);
draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
}
else {
diff --git a/src/gallium/drivers/i915/i915_state_sampler.c b/src/gallium/drivers/i915/i915_state_sampler.c
index 86a019d250d..e5c6d87215b 100644
--- a/src/gallium/drivers/i915/i915_state_sampler.c
+++ b/src/gallium/drivers/i915/i915_state_sampler.c
@@ -230,7 +230,7 @@ i915_update_texture(struct i915_context *i915,
{
const struct pipe_texture *pt = &tex->base;
uint format, pitch;
- const uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+ const uint width = pt->width0, height = pt->height0, depth = pt->depth0;
const uint num_levels = pt->last_level;
unsigned max_lod = num_levels * 4;
unsigned tiled = MS3_USE_FENCE_REGS;
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index 793613be7ba..13684aa59c5 100644
--- a/src/gallium/drivers/i915/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -28,6 +28,9 @@
#include "i915_context.h"
#include "i915_blit.h"
#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_format.h"
/* Assumes all values are within bounds -- no checking at this level -
@@ -42,17 +45,19 @@ i915_surface_copy(struct pipe_context *pipe,
{
struct i915_texture *dst_tex = (struct i915_texture *)dst->texture;
struct i915_texture *src_tex = (struct i915_texture *)src->texture;
+ struct pipe_texture *dpt = &dst_tex->base;
+ struct pipe_texture *spt = &src_tex->base;
assert( dst != src );
- assert( dst_tex->base.block.size == src_tex->base.block.size );
- assert( dst_tex->base.block.width == src_tex->base.block.height );
- assert( dst_tex->base.block.height == src_tex->base.block.height );
- assert( dst_tex->base.block.width == 1 );
- assert( dst_tex->base.block.height == 1 );
+ assert( util_format_get_blocksize(dpt->format) == util_format_get_blocksize(spt->format) );
+ assert( util_format_get_blockwidth(dpt->format) == util_format_get_blockwidth(spt->format) );
+ assert( util_format_get_blockheight(dpt->format) == util_format_get_blockheight(spt->format) );
+ assert( util_format_get_blockwidth(dpt->format) == 1 );
+ assert( util_format_get_blockheight(dpt->format) == 1 );
i915_copy_blit( i915_context(pipe),
FALSE,
- dst_tex->base.block.size,
+ util_format_get_blocksize(dpt->format),
(unsigned short) src_tex->stride, src_tex->buffer, src->offset,
(unsigned short) dst_tex->stride, dst_tex->buffer, dst->offset,
(short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height );
@@ -66,12 +71,13 @@ i915_surface_fill(struct pipe_context *pipe,
unsigned width, unsigned height, unsigned value)
{
struct i915_texture *tex = (struct i915_texture *)dst->texture;
+ struct pipe_texture *pt = &tex->base;
- assert(tex->base.block.width == 1);
- assert(tex->base.block.height == 1);
+ assert(util_format_get_blockwidth(pt->format) == 1);
+ assert(util_format_get_blockheight(pt->format) == 1);
i915_fill_blit( i915_context(pipe),
- tex->base.block.size,
+ util_format_get_blocksize(pt->format),
(unsigned short) tex->stride,
tex->buffer, dst->offset,
(short) dstx, (short) dsty,
diff --git a/src/gallium/drivers/i915/i915_texture.c b/src/gallium/drivers/i915/i915_texture.c
index 9c0de8fa5fe..441bc4f1930 100644
--- a/src/gallium/drivers/i915/i915_texture.c
+++ b/src/gallium/drivers/i915/i915_texture.c
@@ -34,6 +34,8 @@
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -72,6 +74,9 @@ static const int step_offsets[6][2] = {
{-1, 1}
};
+/* XXX really need twice the size if x is already pot?
+ Otherwise just use util_next_power_of_two?
+*/
static unsigned
power_of_two(unsigned x)
{
@@ -81,13 +86,6 @@ power_of_two(unsigned x)
return value;
}
-static unsigned
-round_up(unsigned n, unsigned multiple)
-{
- return (n + multiple - 1) & ~(multiple - 1);
-}
-
-
/*
* More advanced helper funcs
*/
@@ -99,17 +97,8 @@ i915_miptree_set_level_info(struct i915_texture *tex,
unsigned nr_images,
unsigned w, unsigned h, unsigned d)
{
- struct pipe_texture *pt = &tex->base;
-
assert(level < PIPE_MAX_TEXTURE_LEVELS);
- pt->width[level] = w;
- pt->height[level] = h;
- pt->depth[level] = d;
-
- pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w);
- pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h);
-
tex->nr_images[level] = nr_images;
/*
@@ -140,7 +129,7 @@ i915_miptree_set_image_offset(struct i915_texture *tex,
assert(img < tex->nr_images[level]);
- tex->image_offset[level][img] = y * tex->stride + x * tex->base.block.size;
+ tex->image_offset[level][img] = y * tex->stride + x * util_format_get_blocksize(tex->base.format);
/*
printf("%s level %d img %d pos %d,%d image_offset %x\n",
@@ -162,28 +151,28 @@ i915_scanout_layout(struct i915_texture *tex)
{
struct pipe_texture *pt = &tex->base;
- if (pt->last_level > 0 || pt->block.size != 4)
+ if (pt->last_level > 0 || util_format_get_blocksize(pt->format) != 4)
return FALSE;
i915_miptree_set_level_info(tex, 0, 1,
- tex->base.width[0],
- tex->base.height[0],
+ pt->width0,
+ pt->height0,
1);
i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
- if (tex->base.width[0] >= 240) {
- tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
- tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+ if (pt->width0 >= 240) {
+ tex->stride = power_of_two(util_format_get_stride(pt->format, pt->width0));
+ tex->total_nblocksy = align(util_format_get_nblocksy(pt->format, pt->height0), 8);
tex->hw_tiled = INTEL_TILE_X;
- } else if (tex->base.width[0] == 64 && tex->base.height[0] == 64) {
- tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
- tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+ } else if (pt->width0 == 64 && pt->height0 == 64) {
+ tex->stride = power_of_two(util_format_get_stride(pt->format, pt->width0));
+ tex->total_nblocksy = align(util_format_get_nblocksy(pt->format, pt->height0), 8);
} else {
return FALSE;
}
debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
- tex->base.width[0], tex->base.height[0], pt->block.size,
+ pt->width0, pt->height0, util_format_get_blocksize(pt->format),
tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
return TRUE;
@@ -197,25 +186,25 @@ i915_display_target_layout(struct i915_texture *tex)
{
struct pipe_texture *pt = &tex->base;
- if (pt->last_level > 0 || pt->block.size != 4)
+ if (pt->last_level > 0 || util_format_get_blocksize(pt->format) != 4)
return FALSE;
/* fallback to normal textures for small textures */
- if (tex->base.width[0] < 240)
+ if (pt->width0 < 240)
return FALSE;
i915_miptree_set_level_info(tex, 0, 1,
- tex->base.width[0],
- tex->base.height[0],
+ pt->width0,
+ pt->height0,
1);
i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
- tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
- tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+ tex->stride = power_of_two(util_format_get_stride(pt->format, pt->width0));
+ tex->total_nblocksy = align(util_format_get_nblocksy(pt->format, pt->height0), 8);
tex->hw_tiled = INTEL_TILE_X;
debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
- tex->base.width[0], tex->base.height[0], pt->block.size,
+ pt->width0, pt->height0, util_format_get_blocksize(pt->format),
tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
return TRUE;
@@ -226,36 +215,34 @@ i915_miptree_layout_2d(struct i915_texture *tex)
{
struct pipe_texture *pt = &tex->base;
unsigned level;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned nblocksx = pt->nblocksx[0];
- unsigned nblocksy = pt->nblocksy[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
/* used for scanouts that need special layouts */
- if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
if (i915_scanout_layout(tex))
return;
/* for shared buffers we use something very like scanout */
- if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
if (i915_display_target_layout(tex))
return;
- tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+ tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
tex->total_nblocksy = 0;
for (level = 0; level <= pt->last_level; level++) {
i915_miptree_set_level_info(tex, level, 1, width, height, 1);
i915_miptree_set_image_offset(tex, level, 0, 0, tex->total_nblocksy);
- nblocksy = round_up(MAX2(2, nblocksy), 2);
+ nblocksy = align(MAX2(2, nblocksy), 2);
tex->total_nblocksy += nblocksy;
- width = minify(width);
- height = minify(height);
- nblocksx = pf_get_nblocksx(&pt->block, width);
- nblocksy = pf_get_nblocksy(&pt->block, height);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ nblocksy = util_format_get_nblocksy(pt->format, height);
}
}
@@ -265,16 +252,15 @@ i915_miptree_layout_3d(struct i915_texture *tex)
struct pipe_texture *pt = &tex->base;
unsigned level;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned depth = pt->depth[0];
- unsigned nblocksx = pt->nblocksx[0];
- unsigned nblocksy = pt->nblocksy[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned depth = pt->depth0;
+ unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
unsigned stack_nblocksy = 0;
/* Calculate the size of a single slice.
*/
- tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+ tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
/* XXX: hardware expects/requires 9 levels at minimum.
*/
@@ -283,44 +269,41 @@ i915_miptree_layout_3d(struct i915_texture *tex)
stack_nblocksy += MAX2(2, nblocksy);
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
- nblocksx = pf_get_nblocksx(&pt->block, width);
- nblocksy = pf_get_nblocksy(&pt->block, height);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ nblocksy = util_format_get_nblocksy(pt->format, height);
}
/* Fixup depth image_offsets:
*/
- depth = pt->depth[0];
for (level = 0; level <= pt->last_level; level++) {
unsigned i;
for (i = 0; i < depth; i++)
i915_miptree_set_image_offset(tex, level, i, 0, i * stack_nblocksy);
- depth = minify(depth);
+ depth = u_minify(depth, 1);
}
/* Multiply slice size by texture depth for total size. It's
* remarkable how wasteful of memory the i915 texture layouts
* are. They are largely fixed in the i945.
*/
- tex->total_nblocksy = stack_nblocksy * pt->depth[0];
+ tex->total_nblocksy = stack_nblocksy * pt->depth0;
}
static void
i915_miptree_layout_cube(struct i915_texture *tex)
{
struct pipe_texture *pt = &tex->base;
- unsigned width = pt->width[0], height = pt->height[0];
- const unsigned nblocks = pt->nblocksx[0];
+ unsigned width = pt->width0, height = pt->height0;
+ const unsigned nblocks = util_format_get_nblocksx(pt->format, pt->width0);
unsigned level;
unsigned face;
assert(width == height); /* cubemap images are square */
/* double pitch for cube layouts */
- tex->stride = round_up(nblocks * pt->block.size * 2, 4);
+ tex->stride = align(nblocks * util_format_get_blocksize(pt->format) * 2, 4);
tex->total_nblocksy = nblocks * 4;
for (level = 0; level <= pt->last_level; level++) {
@@ -381,10 +364,10 @@ i945_miptree_layout_2d(struct i915_texture *tex)
unsigned level;
unsigned x = 0;
unsigned y = 0;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned nblocksx = pt->nblocksx[0];
- unsigned nblocksy = pt->nblocksy[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned nblocksx = util_format_get_nblocksx(pt->format, pt->width0);
+ unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
/* used for scanouts that need special layouts */
if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
@@ -396,7 +379,7 @@ i945_miptree_layout_2d(struct i915_texture *tex)
if (i915_display_target_layout(tex))
return;
- tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+ tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
/* May need to adjust pitch to accommodate the placement of
* the 2nd mipmap level. This occurs when the alignment
@@ -405,11 +388,11 @@ i945_miptree_layout_2d(struct i915_texture *tex)
*/
if (pt->last_level > 0) {
unsigned mip1_nblocksx
- = align(pf_get_nblocksx(&pt->block, minify(width)), align_x)
- + pf_get_nblocksx(&pt->block, minify(minify(width)));
+ = align(util_format_get_nblocksx(pt->format, u_minify(width, 1)), align_x)
+ + util_format_get_nblocksx(pt->format, u_minify(width, 2));
if (mip1_nblocksx > nblocksx)
- tex->stride = mip1_nblocksx * pt->block.size;
+ tex->stride = mip1_nblocksx * util_format_get_blocksize(pt->format);
}
/* Pitch must be a whole number of dwords
@@ -437,10 +420,10 @@ i945_miptree_layout_2d(struct i915_texture *tex)
y += nblocksy;
}
- width = minify(width);
- height = minify(height);
- nblocksx = pf_get_nblocksx(&pt->block, width);
- nblocksy = pf_get_nblocksy(&pt->block, height);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ nblocksx = util_format_get_nblocksx(pt->format, width);
+ nblocksy = util_format_get_nblocksy(pt->format, height);
}
}
@@ -448,20 +431,19 @@ static void
i945_miptree_layout_3d(struct i915_texture *tex)
{
struct pipe_texture *pt = &tex->base;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned depth = pt->depth[0];
- unsigned nblocksx = pt->nblocksx[0];
- unsigned nblocksy = pt->nblocksy[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned depth = pt->depth0;
+ unsigned nblocksy = util_format_get_nblocksy(pt->format, pt->height0);
unsigned pack_x_pitch, pack_x_nr;
unsigned pack_y_pitch;
unsigned level;
- tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
+ tex->stride = align(util_format_get_stride(pt->format, pt->width0), 4);
tex->total_nblocksy = 0;
- pack_y_pitch = MAX2(pt->nblocksy[0], 2);
- pack_x_pitch = tex->stride / pt->block.size;
+ pack_y_pitch = MAX2(nblocksy, 2);
+ pack_x_pitch = tex->stride / util_format_get_blocksize(pt->format);
pack_x_nr = 1;
for (level = 0; level <= pt->last_level; level++) {
@@ -486,18 +468,17 @@ i945_miptree_layout_3d(struct i915_texture *tex)
if (pack_x_pitch > 4) {
pack_x_pitch >>= 1;
pack_x_nr <<= 1;
- assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride);
+ assert(pack_x_pitch * pack_x_nr * util_format_get_blocksize(pt->format) <= tex->stride);
}
if (pack_y_pitch > 2) {
pack_y_pitch >>= 1;
}
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
- nblocksx = pf_get_nblocksx(&pt->block, width);
- nblocksy = pf_get_nblocksy(&pt->block, height);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
+ nblocksy = util_format_get_nblocksy(pt->format, height);
}
}
@@ -507,13 +488,13 @@ i945_miptree_layout_cube(struct i915_texture *tex)
struct pipe_texture *pt = &tex->base;
unsigned level;
- const unsigned nblocks = pt->nblocksx[0];
+ const unsigned nblocks = util_format_get_nblocksx(pt->format, pt->width0);
unsigned face;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
/*
- printf("%s %i, %i\n", __FUNCTION__, pt->width[0], pt->height[0]);
+ printf("%s %i, %i\n", __FUNCTION__, pt->width0, pt->height0);
*/
assert(width == height); /* cubemap images are square */
@@ -527,9 +508,9 @@ i945_miptree_layout_cube(struct i915_texture *tex)
* or the final row of 4x4, 2x2 and 1x1 faces below this.
*/
if (nblocks > 32)
- tex->stride = round_up(nblocks * pt->block.size * 2, 4);
+ tex->stride = align(nblocks * util_format_get_blocksize(pt->format) * 2, 4);
else
- tex->stride = 14 * 8 * pt->block.size;
+ tex->stride = 14 * 8 * util_format_get_blocksize(pt->format);
tex->total_nblocksy = nblocks * 4;
@@ -649,9 +630,6 @@ i915_texture_create(struct pipe_screen *screen,
pipe_reference_init(&tex->base.reference, 1);
tex->base.screen = screen;
- tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]);
- tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]);
-
if (is->is_i945) {
if (!i945_miptree_layout(tex))
goto fail;
@@ -665,7 +643,7 @@ i915_texture_create(struct pipe_screen *screen,
/* for scanouts and cursors, cursors aren't scanouts */
- if (templat->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY && templat->width[0] != 64)
+ if (templat->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY && templat->width0 != 64)
buf_usage = INTEL_NEW_SCANOUT;
else
buf_usage = INTEL_NEW_TEXTURE;
@@ -708,7 +686,7 @@ i915_texture_blanket(struct pipe_screen * screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -722,7 +700,7 @@ i915_texture_blanket(struct pipe_screen * screen,
tex->stride = stride[0];
- i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1);
+ i915_miptree_set_level_info(tex, 0, 1, base->width0, base->height0, 1);
i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
pipe_buffer_reference(&tex->buffer, buffer);
@@ -786,8 +764,8 @@ i915_get_tex_surface(struct pipe_screen *screen,
pipe_reference_init(&ps->reference, 1);
pipe_texture_reference(&ps->texture, pt);
ps->format = pt->format;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
+ ps->width = u_minify(pt->width0, level);
+ ps->height = u_minify(pt->height0, level);
ps->offset = offset;
ps->usage = flags;
}
@@ -833,14 +811,10 @@ i915_get_tex_transfer(struct pipe_screen *screen,
trans = CALLOC_STRUCT(i915_transfer);
if (trans) {
pipe_texture_reference(&trans->base.texture, texture);
- trans->base.format = trans->base.format;
trans->base.x = x;
trans->base.y = y;
trans->base.width = w;
trans->base.height = h;
- trans->base.block = texture->block;
- trans->base.nblocksx = texture->nblocksx[level];
- trans->base.nblocksy = texture->nblocksy[level];
trans->base.stride = tex->stride;
trans->offset = offset;
trans->base.usage = usage;
@@ -856,6 +830,7 @@ i915_transfer_map(struct pipe_screen *screen,
struct intel_winsys *iws = i915_screen(tex->base.screen)->iws;
char *map;
boolean write = FALSE;
+ enum pipe_format format = tex->base.format;
if (transfer->usage & PIPE_TRANSFER_WRITE)
write = TRUE;
@@ -865,8 +840,8 @@ i915_transfer_map(struct pipe_screen *screen,
return NULL;
return map + i915_transfer(transfer)->offset +
- transfer->y / transfer->block.height * transfer->stride +
- transfer->x / transfer->block.width * transfer->block.size;
+ transfer->y / util_format_get_blockheight(format) * transfer->stride +
+ transfer->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
}
static void
@@ -917,7 +892,7 @@ i915_texture_blanket_intel(struct pipe_screen *screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -931,7 +906,7 @@ i915_texture_blanket_intel(struct pipe_screen *screen,
tex->stride = stride;
- i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1);
+ i915_miptree_set_level_info(tex, 0, 1, base->width0, base->height0, 1);
i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
tex->buffer = buffer;
diff --git a/src/gallium/drivers/i965/Makefile b/src/gallium/drivers/i965/Makefile
new file mode 100644
index 00000000000..95fd3cd69bd
--- /dev/null
+++ b/src/gallium/drivers/i965/Makefile
@@ -0,0 +1,74 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = i965
+
+C_SOURCES = \
+ brw_cc.c \
+ brw_clip.c \
+ brw_clip_line.c \
+ brw_clip_point.c \
+ brw_clip_state.c \
+ brw_clip_tri.c \
+ brw_clip_unfilled.c \
+ brw_clip_util.c \
+ brw_context.c \
+ brw_curbe.c \
+ brw_disasm.c \
+ brw_draw.c \
+ brw_draw_upload.c \
+ brw_eu.c \
+ brw_eu_debug.c \
+ brw_eu_emit.c \
+ brw_eu_util.c \
+ brw_gs.c \
+ brw_gs_emit.c \
+ brw_gs_state.c \
+ brw_misc_state.c \
+ brw_pipe_blend.c \
+ brw_pipe_depth.c \
+ brw_pipe_fb.c \
+ brw_pipe_query.c \
+ brw_pipe_shader.c \
+ brw_pipe_flush.c \
+ brw_pipe_misc.c \
+ brw_pipe_sampler.c \
+ brw_pipe_vertex.c \
+ brw_pipe_clear.c \
+ brw_pipe_rast.c \
+ brw_sf.c \
+ brw_sf_emit.c \
+ brw_sf_state.c \
+ brw_state_batch.c \
+ brw_state_debug.c \
+ brw_state_cache.c \
+ brw_state_upload.c \
+ brw_structs_dump.c \
+ brw_swtnl.c \
+ brw_urb.c \
+ brw_util.c \
+ brw_vs.c \
+ brw_vs_emit.c \
+ brw_vs_state.c \
+ brw_vs_surface_state.c \
+ brw_wm.c \
+ brw_wm_debug.c \
+ brw_wm_emit.c \
+ brw_wm_fp.c \
+ brw_wm_iz.c \
+ brw_wm_pass0.c \
+ brw_wm_pass1.c \
+ brw_wm_pass2.c \
+ brw_wm_sampler_state.c \
+ brw_wm_state.c \
+ brw_wm_surface_state.c \
+ brw_screen.c \
+ brw_screen_buffers.c \
+ brw_screen_tex_layout.c \
+ brw_screen_texture.c \
+ brw_screen_surface.c \
+ brw_batchbuffer.c \
+ brw_winsys_debug.c \
+ intel_decode.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/i965/SConscript b/src/gallium/drivers/i965/SConscript
new file mode 100644
index 00000000000..9c2faaf4b49
--- /dev/null
+++ b/src/gallium/drivers/i965/SConscript
@@ -0,0 +1,77 @@
+Import('*')
+
+env = env.Clone()
+
+i965 = env.ConvenienceLibrary(
+ target = 'i965',
+ source = [
+ 'brw_batchbuffer.c',
+ 'brw_cc.c',
+ 'brw_clip.c',
+ 'brw_clip_line.c',
+ 'brw_clip_point.c',
+ 'brw_clip_state.c',
+ 'brw_clip_tri.c',
+ 'brw_clip_unfilled.c',
+ 'brw_clip_util.c',
+ 'brw_context.c',
+ 'brw_curbe.c',
+ 'brw_disasm.c',
+ 'brw_draw.c',
+ 'brw_draw_upload.c',
+ 'brw_eu.c',
+ 'brw_eu_debug.c',
+ 'brw_eu_emit.c',
+ 'brw_eu_util.c',
+ 'brw_gs.c',
+ 'brw_gs_emit.c',
+ 'brw_gs_state.c',
+ 'brw_misc_state.c',
+ 'brw_pipe_blend.c',
+ 'brw_pipe_clear.c',
+ 'brw_pipe_depth.c',
+ 'brw_pipe_fb.c',
+ 'brw_pipe_flush.c',
+ 'brw_pipe_misc.c',
+ 'brw_pipe_query.c',
+ 'brw_pipe_rast.c',
+ 'brw_pipe_sampler.c',
+ 'brw_pipe_shader.c',
+ 'brw_pipe_vertex.c',
+ 'brw_screen_buffers.c',
+ 'brw_screen.c',
+ 'brw_screen_surface.c',
+ 'brw_screen_tex_layout.c',
+ 'brw_screen_texture.c',
+ 'brw_structs_dump.c',
+ 'brw_sf.c',
+ 'brw_sf_emit.c',
+ 'brw_sf_state.c',
+ 'brw_state_batch.c',
+ 'brw_state_cache.c',
+# 'brw_state_debug.c',
+ 'brw_state_upload.c',
+ 'brw_swtnl.c',
+ 'brw_urb.c',
+ 'brw_util.c',
+ 'brw_vs.c',
+ 'brw_vs_emit.c',
+ 'brw_vs_state.c',
+ 'brw_vs_surface_state.c',
+ 'brw_wm.c',
+# 'brw_wm_constant_buffer.c',
+ 'brw_wm_debug.c',
+ 'brw_wm_emit.c',
+ 'brw_wm_fp.c',
+# 'brw_wm_glsl.c',
+ 'brw_wm_iz.c',
+ 'brw_wm_pass0.c',
+ 'brw_wm_pass1.c',
+ 'brw_wm_pass2.c',
+ 'brw_wm_sampler_state.c',
+ 'brw_wm_state.c',
+ 'brw_wm_surface_state.c',
+ 'intel_decode.c',
+ ])
+
+Export('i965')
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c
new file mode 100644
index 00000000000..003b1fd5bf0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.c
@@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_memory.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_reg.h"
+#include "brw_winsys.h"
+#include "brw_debug.h"
+#include "brw_structs.h"
+
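+/* When set, _brw_batchbuffer_flush() terminates every batch with an
+ * MI_FLUSH carrying the state-cache flush bit, ahead of the closing
+ * MI_BATCH_BUFFER_END.
+ */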
+#define ALWAYS_EMIT_MI_FLUSH 1
+
+enum pipe_error
+brw_batchbuffer_reset(struct brw_batchbuffer *batch)
+{
+ enum pipe_error ret;
+
+ ret = batch->sws->bo_alloc( batch->sws,
+ BRW_BUFFER_TYPE_BATCH,
+ BRW_BATCH_SIZE, 4096,
+ &batch->buf );
+ if (ret)
+ return ret;
+
+ batch->size = BRW_BATCH_SIZE;
+
+ /* With map_range semantics, the winsys can decide whether to
+ * inject a malloc'ed bounce buffer instead of mapping directly.
+ */
+ batch->map = batch->sws->bo_map(batch->buf,
+ BRW_DATA_BATCH_BUFFER,
+ 0, batch->size,
+ GL_TRUE,
+ GL_TRUE,
+ GL_TRUE);
+
+ batch->ptr = batch->map;
+ return PIPE_OK;
+}
+
+struct brw_batchbuffer *
+brw_batchbuffer_alloc(struct brw_winsys_screen *sws,
+ struct brw_chipset chipset)
+{
+ struct brw_batchbuffer *batch = CALLOC_STRUCT(brw_batchbuffer);
+
+ batch->sws = sws;
+ batch->chipset = chipset;
+ brw_batchbuffer_reset(batch);
+
+ return batch;
+}
+
+void
+brw_batchbuffer_free(struct brw_batchbuffer *batch)
+{
+ if (batch->map) {
+ batch->sws->bo_unmap(batch->buf);
+ batch->map = NULL;
+ }
+
+ bo_reference(&batch->buf, NULL);
+ FREE(batch);
+}
+
+
+void
+_brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+ const char *file,
+ int line)
+{
+ GLuint used = batch->ptr - batch->map;
+
+ if (used == 0)
+ return;
+
+ /* Post-swap throttling done by the state tracker.
+ */
+
+ if (BRW_DEBUG & DEBUG_BATCH)
+ debug_printf("%s:%d: Batchbuffer flush with %db used\n",
+ file, line, used);
+
+ if (ALWAYS_EMIT_MI_FLUSH) {
+ *(GLuint *) (batch->ptr) = MI_FLUSH | BRW_FLUSH_STATE_CACHE;
+ batch->ptr += 4;
+ used = batch->ptr - batch->map;
+ }
+
+ /* Pad so the batch size, including the MI_BATCH_BUFFER_END emitted
+ * below, is a multiple of 2 DWORDs.
+ */
+ if ((used & 4) == 0) {
+ *(GLuint *) (batch->ptr) = 0; /* noop */
+ batch->ptr += 4;
+ used = batch->ptr - batch->map;
+ }
+
+ /* Mark the end of the buffer.
+ */
+ *(GLuint *) (batch->ptr) = MI_BATCH_BUFFER_END;
+ batch->ptr += 4;
+ used = batch->ptr - batch->map;
+
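+ /* Flush the written range back through the winsys (covers the
+ * bounce-buffer map path), unmap, and submit the batch for execution.
+ */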
+ batch->sws->bo_flush_range(batch->buf, 0, used);
+ batch->sws->bo_unmap(batch->buf);
+ batch->map = NULL;
+ batch->ptr = NULL;
+
+ batch->sws->bo_exec(batch->buf, used );
+
+ if (BRW_DEBUG & DEBUG_SYNC) {
+ /* Abuse map/unmap to achieve wait-for-fence.
+ *
+ * XXX: hide this inside the winsys and export a fence
+ * interface.
+ */
+ debug_printf("waiting for idle\n");
+ batch->sws->bo_wait_idle(batch->buf);
+ }
+
+ /* Reset the buffer:
+ */
+ brw_batchbuffer_reset(batch);
+}
+
+
+/* The OUT_RELOC() macro ends up here, generating a relocation within
+ * the batch buffer.
+ */
+enum pipe_error
+brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+ struct brw_winsys_buffer *buffer,
+ enum brw_buffer_usage usage,
+ uint32_t delta)
+{
+ int ret;
+
+ if (batch->ptr - batch->map > batch->buf->size) {
+ debug_printf("bad relocation ptr %p map %p offset %d size %d\n",
+ batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
+
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+
+ ret = batch->sws->bo_emit_reloc(batch->buf,
+ usage,
+ delta,
+ batch->ptr - batch->map,
+ buffer);
+ if (ret != 0)
+ return ret;
+
+ /* bo_emit_reloc was responsible for writing a zero into the
+ * batchbuffer if necessary. Just need to update our pointer.
+ */
+ batch->ptr += 4;
+
+ return 0;
+}
+
+enum pipe_error
+brw_batchbuffer_data(struct brw_batchbuffer *batch,
+ const void *data, GLuint bytes,
+ enum cliprect_mode cliprect_mode)
+{
+ enum pipe_error ret;
+
+ assert((bytes & 3) == 0);
+
+ ret = brw_batchbuffer_require_space(batch, bytes);
+ if (ret)
+ return ret;
+
+ memcpy(batch->ptr, data, bytes);
+ batch->ptr += bytes;
+ return 0;
+}
diff --git a/src/gallium/drivers/i965/brw_batchbuffer.h b/src/gallium/drivers/i965/brw_batchbuffer.h
new file mode 100644
index 00000000000..6ca9f617f5e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_batchbuffer.h
@@ -0,0 +1,148 @@
+#ifndef BRW_BATCHBUFFER_H
+#define BRW_BATCHBUFFER_H
+
+#include "util/u_debug.h"
+
+#include "brw_types.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+
+#define BATCH_SZ 16384
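+/* Keep this many bytes free at the tail of each batch so the flush
+ * path always has room for its trailing MI_FLUSH, padding noop and
+ * MI_BATCH_BUFFER_END dwords (see brw_batchbuffer_space()).
+ */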
+#define BATCH_RESERVED 16
+
+/* All ignored:
+ */
+enum cliprect_mode {
+ IGNORE_CLIPRECTS,
+ LOOP_CLIPRECTS,
+ NO_LOOP_CLIPRECTS,
+ REFERENCES_CLIPRECTS
+};
+
+
+
+
+struct brw_batchbuffer {
+
+ struct brw_winsys_screen *sws;
+ struct brw_winsys_buffer *buf;
+ struct brw_chipset chipset;
+
+ /**
+ * Values exported to speed up writing the batchbuffer,
+ * instead of having to go through an accessor function for
+ * each dword written.
+ */
+ /*@{*/
+ uint8_t *map;
+ uint8_t *ptr;
+ size_t size;
+ struct {
+ uint8_t *end_ptr;
+ } emit;
+
+
+ size_t relocs;
+ size_t max_relocs;
+ /*@}*/
+};
+
+struct brw_batchbuffer *brw_batchbuffer_alloc( struct brw_winsys_screen *sws,
+ struct brw_chipset chipset );
+
+void brw_batchbuffer_free(struct brw_batchbuffer *batch);
+
+void _brw_batchbuffer_flush(struct brw_batchbuffer *batch,
+ const char *file, int line);
+
+
+enum pipe_error
+brw_batchbuffer_reset(struct brw_batchbuffer *batch);
+
+
+/* Unlike bmBufferData, this currently requires the buffer be mapped.
+ * Consider it a convenience function wrapping multiple
+ * intel_buffer_dword() calls.
+ */
+enum pipe_error brw_batchbuffer_data(struct brw_batchbuffer *batch,
+ const void *data, GLuint bytes,
+ enum cliprect_mode cliprect_mode);
+
+
+enum pipe_error brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch,
+ struct brw_winsys_buffer *buffer,
+ enum brw_buffer_usage usage,
+ uint32_t offset);
+
+/* Inline functions - might actually be better off with these
+ * non-inlined. Certainly better off switching all command packets to
+ * be passed as structs rather than dwords, but that's a little bit of
+ * work...
+ */
+static INLINE GLint
+brw_batchbuffer_space(struct brw_batchbuffer *batch)
+{
+ return (batch->size - BATCH_RESERVED) - (batch->ptr - batch->map);
+}
+
+
+static INLINE void
+brw_batchbuffer_emit_dword(struct brw_batchbuffer *batch, GLuint dword)
+{
+ assert(batch->map);
+ assert(brw_batchbuffer_space(batch) >= 4);
+ *(GLuint *) (batch->ptr) = dword;
+ batch->ptr += 4;
+}
+
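+/* Reserve space for sz more bytes; in DEBUG builds also record where
+ * the packet should end so ADVANCE_BATCH() can verify its length.
+ */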
+static INLINE enum pipe_error
+brw_batchbuffer_require_space(struct brw_batchbuffer *batch,
+ GLuint sz)
+{
+ assert(sz < batch->size - 8);
+ if (brw_batchbuffer_space(batch) < sz) {
+ assert(0);
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+#ifdef DEBUG
+ batch->emit.end_ptr = batch->ptr + sz;
+#endif
+ return 0;
+}
+
+/* Here are the crusty old macros, to be removed:
+ */
+#define BEGIN_BATCH(n, cliprect_mode) do { \
+ brw_batchbuffer_require_space(brw->batch, (n)*4); \
+ } while (0)
+
+#define OUT_BATCH(d) brw_batchbuffer_emit_dword(brw->batch, d)
+
+#define OUT_RELOC(buf, usage, delta) do { \
+ assert((unsigned) (delta) < buf->size); \
+ brw_batchbuffer_emit_reloc(brw->batch, buf, \
+ usage, delta); \
+ } while (0)
+
+#ifdef DEBUG
+#define ADVANCE_BATCH() do { \
+ unsigned int _n = brw->batch->ptr - brw->batch->emit.end_ptr; \
+ if (_n != 0) { \
+ debug_printf("%s: %d too many bytes emitted to batch\n", \
+ __FUNCTION__, _n); \
+ abort(); \
+ } \
+ brw->batch->emit.end_ptr = NULL; \
+ } while(0)
+#else
+#define ADVANCE_BATCH()
+#endif
+
+static INLINE void
+brw_batchbuffer_emit_mi_flush(struct brw_batchbuffer *batch)
+{
+ brw_batchbuffer_require_space(batch, 4);
+ brw_batchbuffer_emit_dword(batch, MI_FLUSH);
+}
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_cc.c b/src/gallium/drivers/i965/brw_cc.c
new file mode 100644
index 00000000000..3e070f5591a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_cc.c
@@ -0,0 +1,111 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+
+static enum pipe_error prepare_cc_vp( struct brw_context *brw )
+{
+ return brw_cache_data( &brw->cache,
+ BRW_CC_VP,
+ &brw->curr.ccv,
+ NULL, 0,
+ &brw->cc.reloc[CC_RELOC_VP].bo );
+}
+
+const struct brw_tracked_state brw_cc_vp = {
+ .dirty = {
+ .mesa = PIPE_NEW_VIEWPORT,
+ .brw = BRW_NEW_CONTEXT,
+ .cache = 0
+ },
+ .prepare = prepare_cc_vp
+};
+
+
+/* A long-winded way to OR two unsigned integers together:
+ */
+static INLINE struct brw_cc3
+combine_cc3( struct brw_cc3 a, struct brw_cc3 b )
+{
+ union { struct brw_cc3 cc3; unsigned i; } ca, cb;
+ ca.cc3 = a;
+ cb.cc3 = b;
+ ca.i |= cb.i;
+ return ca.cc3;
+}
+
+
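+/* Merge the current depth/stencil/alpha and blend atoms into a single
+ * hardware color calculator (CC) unit state record and hand it to the
+ * state cache.
+ */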
+static int prepare_cc_unit( struct brw_context *brw )
+{
+ brw->cc.cc.cc0 = brw->curr.zstencil->cc0;
+ brw->cc.cc.cc1 = brw->curr.zstencil->cc1;
+ brw->cc.cc.cc2 = brw->curr.zstencil->cc2;
+ brw->cc.cc.cc3 = combine_cc3( brw->curr.zstencil->cc3, brw->curr.blend->cc3 );
+
+ brw->cc.cc.cc5 = brw->curr.blend->cc5;
+ brw->cc.cc.cc6 = brw->curr.blend->cc6;
+ brw->cc.cc.cc7 = brw->curr.zstencil->cc7;
+
+ return brw_cache_data_sz(&brw->cache, BRW_CC_UNIT,
+ &brw->cc.cc, sizeof(brw->cc.cc),
+ brw->cc.reloc, 1,
+ &brw->cc.state_bo);
+}
+
+const struct brw_tracked_state brw_cc_unit = {
+ .dirty = {
+ .mesa = PIPE_NEW_DEPTH_STENCIL_ALPHA | PIPE_NEW_BLEND,
+ .brw = 0,
+ .cache = CACHE_NEW_CC_VP
+ },
+ .prepare = prepare_cc_unit,
+};
+
+
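+/* Set up the long-lived relocation describing the CC viewport pointer
+ * (the cc4 dword of the CC unit state); prepare_cc_vp() supplies the
+ * buffer it points at.
+ */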
+void brw_hw_cc_init( struct brw_context *brw )
+{
+ make_reloc(&brw->cc.reloc[0],
+ BRW_USAGE_STATE,
+ 0,
+ offsetof(struct brw_cc_unit_state, cc4),
+ NULL);
+}
+
+
+void brw_hw_cc_cleanup( struct brw_context *brw )
+{
+ bo_reference(&brw->cc.state_bo, NULL);
+ bo_reference(&brw->cc.reloc[0].bo, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
new file mode 100644
index 00000000000..d67a1a62633
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -0,0 +1,224 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_state.h"
+
+#include "util/u_math.h"
+
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "brw_pipe_rast.h"
+#include "brw_clip.h"
+
+
+#define FRONT_UNFILLED_BIT 0x1
+#define BACK_UNFILLED_BIT 0x2
+
+
+static enum pipe_error
+compile_clip_prog( struct brw_context *brw,
+ struct brw_clip_prog_key *key,
+ struct brw_winsys_buffer **bo_out )
+{
+ enum pipe_error ret;
+ struct brw_clip_compile c;
+ const GLuint *program;
+ GLuint program_size;
+ GLuint delta;
+
+ memset(&c, 0, sizeof(c));
+
+ /* Begin the compilation:
+ */
+ brw_init_compile(brw, &c.func);
+
+ c.func.single_program_flow = 1;
+
+ c.chipset = brw->chipset;
+ c.key = *key;
+ c.need_ff_sync = c.chipset.is_igdng;
+
+ /* Need to locate the two positions present in vertex + header.
+ * These are currently hardcoded:
+ */
+ c.header_position_offset = ATTR_SIZE;
+
+ if (c.chipset.is_igdng)
+ delta = 3 * REG_SIZE;
+ else
+ delta = REG_SIZE;
+
+ c.offset_hpos = delta + c.key.output_hpos * ATTR_SIZE;
+
+ if (c.key.output_color0 != BRW_OUTPUT_NOT_PRESENT)
+ c.offset_color0 = delta + c.key.output_color0 * ATTR_SIZE;
+
+ if (c.key.output_color1 != BRW_OUTPUT_NOT_PRESENT)
+ c.offset_color1 = delta + c.key.output_color1 * ATTR_SIZE;
+
+ if (c.key.output_bfc0 != BRW_OUTPUT_NOT_PRESENT)
+ c.offset_bfc0 = delta + c.key.output_bfc0 * ATTR_SIZE;
+
+ if (c.key.output_bfc1 != BRW_OUTPUT_NOT_PRESENT)
+ c.offset_bfc1 = delta + c.key.output_bfc1 * ATTR_SIZE;
+
+ if (c.key.output_edgeflag != BRW_OUTPUT_NOT_PRESENT)
+ c.offset_edgeflag = delta + c.key.output_edgeflag * ATTR_SIZE;
+
+ if (BRW_IS_IGDNG(brw))
+ c.nr_regs = (c.key.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */
+ else
+ c.nr_regs = (c.key.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */
+
+ c.nr_bytes = c.nr_regs * REG_SIZE;
+
+ c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
+
+ /* For some reason the thread is spawned with only 4 channels
+ * unmasked.
+ */
+ brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
+
+
+ /* Would ideally have the option of producing a program which could
+ * do all three:
+ */
+ switch (key->primitive) {
+ case PIPE_PRIM_TRIANGLES:
+ if (key->do_unfilled)
+ brw_emit_unfilled_clip( &c );
+ else
+ brw_emit_tri_clip( &c );
+ break;
+ case PIPE_PRIM_LINES:
+ brw_emit_line_clip( &c );
+ break;
+ case PIPE_PRIM_POINTS:
+ brw_emit_point_clip( &c );
+ break;
+ default:
+ assert(0);
+ return PIPE_ERROR_BAD_INPUT;
+ }
+
+
+
+ /* get the program
+ */
+ ret = brw_get_program(&c.func, &program, &program_size);
+ if (ret)
+ return ret;
+
+ /* Upload
+ */
+ ret = brw_upload_cache( &brw->cache,
+ BRW_CLIP_PROG,
+ &c.key, sizeof(c.key),
+ NULL, 0,
+ program, program_size,
+ &c.prog_data,
+ &brw->clip.prog_data,
+ bo_out );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+/* Calculate interpolants for triangle and line rasterization.
+ */
+static enum pipe_error
+upload_clip_prog(struct brw_context *brw)
+{
+ const struct brw_vertex_shader *vs = brw->curr.vertex_shader;
+ struct brw_clip_prog_key key;
+ enum pipe_error ret;
+
+ /* Populate the key, starting from the almost-complete version from
+ * the rast state.
+ */
+
+ /* PIPE_NEW_RAST */
+ key = brw->curr.rast->clip_key;
+
+ /* BRW_NEW_REDUCED_PRIMITIVE */
+ key.primitive = brw->reduced_primitive;
+
+ /* XXX: if edgeflag is moved to a proper TGSI vs output, can remove
+ * dependency on CACHE_NEW_VS_PROG
+ */
+ /* CACHE_NEW_VS_PROG */
+ key.nr_attrs = brw->vs.prog_data->nr_outputs;
+
+ /* PIPE_NEW_VS */
+ key.output_hpos = vs->output_hpos;
+ key.output_color0 = vs->output_color0;
+ key.output_color1 = vs->output_color1;
+ key.output_bfc0 = vs->output_bfc0;
+ key.output_bfc1 = vs->output_bfc1;
+ key.output_edgeflag = vs->output_edgeflag;
+
+ /* PIPE_NEW_CLIP */
+ key.nr_userclip = brw->curr.ucp.nr;
+
+ /* Already cached?
+ */
+ if (brw_search_cache(&brw->cache, BRW_CLIP_PROG,
+ &key, sizeof(key),
+ NULL, 0,
+ &brw->clip.prog_data,
+ &brw->clip.prog_bo))
+ return PIPE_OK;
+
+ /* Compile new program:
+ */
+ ret = compile_clip_prog( brw, &key, &brw->clip.prog_bo );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_clip_prog = {
+ .dirty = {
+ .mesa = (PIPE_NEW_RAST |
+ PIPE_NEW_CLIP),
+ .brw = (BRW_NEW_REDUCED_PRIMITIVE),
+ .cache = CACHE_NEW_VS_PROG
+ },
+ .prepare = upload_clip_prog
+};
diff --git a/src/gallium/drivers/i965/brw_clip.h b/src/gallium/drivers/i965/brw_clip.h
new file mode 100644
index 00000000000..80e3a11a370
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip.h
@@ -0,0 +1,199 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef BRW_CLIP_H
+#define BRW_CLIP_H
+
+#include "pipe/p_state.h"
+#include "brw_reg.h"
+#include "brw_eu.h"
+
+#define MAX_VERTS (3+6+6)
+
+/* Note that if unfilled primitives are being emitted, we have to fix
+ * up polygon offset and flatshading at this point:
+ */
+struct brw_clip_prog_key {
+ GLuint nr_attrs:6;
+ GLuint primitive:4;
+ GLuint nr_userclip:3;
+ GLuint do_flat_shading:1;
+ GLuint do_unfilled:1;
+ GLuint fill_cw:2; /* includes cull information */
+ GLuint fill_ccw:2; /* includes cull information */
+ GLuint offset_cw:1;
+ GLuint offset_ccw:1;
+ GLuint copy_bfc_cw:1;
+ GLuint copy_bfc_ccw:1;
+ GLuint clip_mode:3;
+ GLuint output_hpos:6; /* not always zero? */
+
+ GLuint output_color0:6;
+ GLuint output_color1:6;
+ GLuint output_bfc0:6;
+ GLuint output_bfc1:6;
+ GLuint output_edgeflag:6;
+ GLuint pad1:2;
+
+ GLfloat offset_factor;
+ GLfloat offset_units;
+};
+
+struct brw_clip_prog_data {
+ GLuint curb_read_length; /* user planes? */
+ GLuint clip_mode;
+ GLuint urb_read_length;
+ GLuint total_grf;
+};
+
+#define CLIP_LINE 0
+#define CLIP_POINT 1
+#define CLIP_FILL 2
+#define CLIP_CULL 3
+
+
+#define PRIM_MASK (0x1f)
+
+struct brw_clip_compile {
+ struct brw_compile func;
+ struct brw_clip_prog_key key;
+ struct brw_clip_prog_data prog_data;
+
+ struct {
+ struct brw_reg R0;
+ struct brw_reg vertex[MAX_VERTS];
+
+ struct brw_reg t;
+ struct brw_reg t0, t1;
+ struct brw_reg dp0, dp1;
+
+ struct brw_reg dpPrev;
+ struct brw_reg dp;
+ struct brw_reg loopcount;
+ struct brw_reg nr_verts;
+ struct brw_reg planemask;
+
+ struct brw_reg inlist;
+ struct brw_reg outlist;
+ struct brw_reg freelist;
+
+ struct brw_reg dir;
+ struct brw_reg tmp0, tmp1;
+ struct brw_reg offset;
+
+ struct brw_reg fixed_planes;
+ struct brw_reg plane_equation;
+
+ struct brw_reg ff_sync;
+ } reg;
+
+ /* 3 different ways of expressing vertex size, including
+ * key.nr_attrs.
+ */
+ GLuint nr_regs;
+ GLuint nr_bytes;
+
+ GLuint first_tmp;
+ GLuint last_tmp;
+
+ GLboolean need_direction;
+ struct brw_chipset chipset;
+
+ GLuint last_mrf;
+
+ GLuint header_position_offset;
+ GLboolean need_ff_sync;
+
+ GLuint nr_color_attrs;
+ GLuint offset_color0;
+ GLuint offset_color1;
+ GLuint offset_bfc0;
+ GLuint offset_bfc1;
+
+ GLuint offset_hpos;
+ GLuint offset_edgeflag;
+};
+
+#define ATTR_SIZE (4*4)
+
+/* Points are only culled, so no need for a clip routine, however it
+ * works out easier to have a dummy one.
+ */
+void brw_emit_unfilled_clip( struct brw_clip_compile *c );
+void brw_emit_tri_clip( struct brw_clip_compile *c );
+void brw_emit_line_clip( struct brw_clip_compile *c );
+void brw_emit_point_clip( struct brw_clip_compile *c );
+
+/* brw_clip_tri.c, for use by the unfilled clip routine:
+ */
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c );
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c );
+void brw_clip_tri( struct brw_clip_compile *c );
+void brw_clip_tri_emit_polygon( struct brw_clip_compile *c );
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
+ GLuint nr_verts );
+
+
+/* Utils:
+ */
+
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+ struct brw_indirect dest_ptr,
+ struct brw_indirect v0_ptr, /* from */
+ struct brw_indirect v1_ptr, /* to */
+ struct brw_reg t0,
+ GLboolean force_edgeflag );
+
+void brw_clip_init_planes( struct brw_clip_compile *c );
+
+void brw_clip_emit_vue(struct brw_clip_compile *c,
+ struct brw_indirect vert,
+ GLboolean allocate,
+ GLboolean eot,
+ GLuint header);
+
+void brw_clip_kill_thread(struct brw_clip_compile *c);
+
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c );
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c );
+
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+ GLuint to, GLuint from );
+
+void brw_clip_init_clipmask( struct brw_clip_compile *c );
+
+struct brw_reg get_tmp( struct brw_clip_compile *c );
+
+void brw_clip_project_position(struct brw_clip_compile *c,
+ struct brw_reg pos );
+void brw_clip_ff_sync(struct brw_clip_compile *c);
+void brw_clip_init_ff_sync(struct brw_clip_compile *c);
+#endif
diff --git a/src/gallium/drivers/i965/brw_clip_line.c b/src/gallium/drivers/i965/brw_clip_line.c
new file mode 100644
index 00000000000..54282d975ed
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_line.c
@@ -0,0 +1,271 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_debug.h"
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+
+static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
+{
+ GLuint i = 0,j;
+
+ /* Register usage is static, precompute here:
+ */
+ c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+ if (c->key.nr_userclip) {
+ c->reg.fixed_planes = brw_vec4_grf(i, 0);
+ i += (6 + c->key.nr_userclip + 1) / 2;
+
+ c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
+ }
+ else
+ c->prog_data.curb_read_length = 0;
+
+
+ /* Payload vertices plus space for more generated vertices:
+ */
+ for (j = 0; j < 4; j++) {
+ c->reg.vertex[j] = brw_vec4_grf(i, 0);
+ i += c->nr_regs;
+ }
+
+ c->reg.t = brw_vec1_grf(i, 0);
+ c->reg.t0 = brw_vec1_grf(i, 1);
+ c->reg.t1 = brw_vec1_grf(i, 2);
+ c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
+ c->reg.plane_equation = brw_vec4_grf(i, 4);
+ i++;
+
+ c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
+ c->reg.dp1 = brw_vec1_grf(i, 4);
+ i++;
+
+ if (!c->key.nr_userclip) {
+ c->reg.fixed_planes = brw_vec8_grf(i, 0);
+ i++;
+ }
+
+ if (c->need_ff_sync) {
+ c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+ i++;
+ }
+
+ c->first_tmp = i;
+ c->last_tmp = i;
+
+ c->prog_data.urb_read_length = c->nr_regs; /* ? */
+ c->prog_data.total_grf = i;
+}
+
+
+
+/* Line clipping, more or less following the following algorithm:
+ *
+ * for (p=0;p<MAX_PLANES;p++) {
+ * if (clipmask & (1 << p)) {
+ * GLfloat dp0 = DOTPROD( vtx0, plane[p] );
+ * GLfloat dp1 = DOTPROD( vtx1, plane[p] );
+ *
+ * if (IS_NEGATIVE(dp1)) {
+ * GLfloat t = dp1 / (dp1 - dp0);
+ * if (t > t1) t1 = t;
+ * } else {
+ * GLfloat t = dp0 / (dp0 - dp1);
+ * if (t > t0) t0 = t;
+ * }
+ *
+ * if (t0 + t1 >= 1.0)
+ * return;
+ * }
+ * }
+ *
+ * interp( ctx, newvtx0, vtx0, vtx1, t0 );
+ * interp( ctx, newvtx1, vtx1, vtx0, t1 );
+ *
+ */
+static void clip_and_emit_line( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_indirect vtx0 = brw_indirect(0, 0);
+ struct brw_indirect vtx1 = brw_indirect(1, 0);
+ struct brw_indirect newvtx0 = brw_indirect(2, 0);
+ struct brw_indirect newvtx1 = brw_indirect(3, 0);
+ struct brw_indirect plane_ptr = brw_indirect(4, 0);
+ struct brw_instruction *plane_loop;
+ struct brw_instruction *plane_active;
+ struct brw_instruction *is_negative;
+ struct brw_instruction *is_neg2 = NULL;
+ struct brw_instruction *not_culled;
+ struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
+
+ brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0]));
+ brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1]));
+ brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2]));
+ brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3]));
+ brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
+
+ /* Note: init t0, t1 together:
+ */
+ brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0));
+
+ brw_clip_init_planes(c);
+ brw_clip_init_clipmask(c);
+
+ /* -ve rhw workaround */
+ if (c->chipset.is_965) {
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+ brw_imm_ud(1<<20));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f));
+ }
+
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ plane_loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ /* if (planemask & 1)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, v1_null_ud, c->reg.planemask, brw_imm_ud(1));
+
+ plane_active = brw_IF(p, BRW_EXECUTE_1);
+ {
+ if (c->key.nr_userclip)
+ brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+ else
+ brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+
+ /* dp = DP4(vtx->position, plane)
+ */
+ brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset_hpos), c->reg.plane_equation);
+
+ /* if (IS_NEGATIVE(dp1))
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+ brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset_hpos), c->reg.plane_equation);
+ is_negative = brw_IF(p, BRW_EXECUTE_1);
+ {
+ /*
+ * Both can be negative on GM965/G965 due to the RHW workaround;
+ * if so, this object should be rejected.
+ */
+ if (c->chipset.is_965) {
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
+ is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_kill_thread(c);
+ }
+ brw_ENDIF(p, is_neg2);
+ }
+
+ brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
+ brw_math_invert(p, c->reg.t, c->reg.t);
+ brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
+
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
+ brw_MOV(p, c->reg.t1, c->reg.t);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+ is_negative = brw_ELSE(p, is_negative);
+ {
+ /* Coming back in. We know that both cannot be negative
+ * because the line would have been culled in that case.
+ */
+
+ /* If both are positive, do nothing */
+ /* Only on GM965/G965 */
+ if (c->chipset.is_965) {
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
+ is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+ }
+
+ {
+ brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
+ brw_math_invert(p, c->reg.t, c->reg.t);
+ brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
+
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
+ brw_MOV(p, c->reg.t0, c->reg.t);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+
+ if (c->chipset.is_965) {
+ brw_ENDIF(p, is_neg2);
+ }
+ }
+ brw_ENDIF(p, is_negative);
+ }
+ brw_ENDIF(p, plane_active);
+
+ /* plane_ptr++;
+ */
+ brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+ /* while (planemask>>=1) != 0
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+ }
+ brw_WHILE(p, plane_loop);
+
+ brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
+ not_culled = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE);
+ brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE);
+
+ brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+ brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+ }
+ brw_ENDIF(p, not_culled);
+ brw_clip_kill_thread(c);
+}
+
+
+
+void brw_emit_line_clip( struct brw_clip_compile *c )
+{
+ brw_clip_line_alloc_regs(c);
+ brw_clip_init_ff_sync(c);
+
+ if (c->key.do_flat_shading)
+ brw_clip_copy_colors(c, 0, 1);
+
+ clip_and_emit_line(c);
+}
diff --git a/src/gallium/drivers/i965/brw_clip_point.c b/src/gallium/drivers/i965/brw_clip_point.c
new file mode 100644
index 00000000000..e0a5330556d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_point.c
@@ -0,0 +1,48 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+/* Point clipping, nothing to do?
+ */
+void brw_emit_point_clip( struct brw_clip_compile *c )
+{
+ /* Send an empty message to kill the thread:
+ */
+ brw_clip_tri_alloc_regs(c, 0);
+ brw_clip_init_ff_sync(c);
+
+ brw_clip_kill_thread(c);
+}
diff --git a/src/gallium/drivers/i965/brw_clip_state.c b/src/gallium/drivers/i965/brw_clip_state.c
new file mode 100644
index 00000000000..5c3ccfd8d0d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_state.c
@@ -0,0 +1,209 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_clip.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+struct brw_clip_unit_key {
+ unsigned int total_grf;
+ unsigned int urb_entry_read_length;
+ unsigned int curb_entry_read_length;
+ unsigned int clip_mode;
+
+ unsigned int curbe_offset;
+
+ unsigned int nr_urb_entries, urb_size;
+
+ GLboolean depth_clamp;
+};
+
+static void
+clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
+{
+ memset(key, 0, sizeof(*key));
+
+ /* CACHE_NEW_CLIP_PROG */
+ key->total_grf = brw->clip.prog_data->total_grf;
+ key->urb_entry_read_length = brw->clip.prog_data->urb_read_length;
+ key->curb_entry_read_length = brw->clip.prog_data->curb_read_length;
+ key->clip_mode = brw->clip.prog_data->clip_mode;
+
+ /* BRW_NEW_CURBE_OFFSETS */
+ key->curbe_offset = brw->curbe.clip_start;
+
+ /* BRW_NEW_URB_FENCE */
+ key->nr_urb_entries = brw->urb.nr_clip_entries;
+ key->urb_size = brw->urb.vsize;
+
+ /* */
+ key->depth_clamp = 0; /* XXX: add this to gallium: ctx->Transform.DepthClamp; */
+}
+
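+/* Translate the key into a hardware brw_clip_unit_state packet and
+ * upload it through the state cache.  The clip kernel address is not
+ * part of the key; it is patched in via the relocation supplied by the
+ * caller.
+ */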
+static enum pipe_error
+clip_unit_create_from_key(struct brw_context *brw,
+ struct brw_clip_unit_key *key,
+ struct brw_winsys_reloc *reloc,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_clip_unit_state clip;
+ enum pipe_error ret;
+
+ memset(&clip, 0, sizeof(clip));
+
+ clip.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+ /* reloc */
+ clip.thread0.kernel_start_pointer = 0;
+
+ clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+ clip.thread1.single_program_flow = 1;
+
+ clip.thread3.urb_entry_read_length = key->urb_entry_read_length;
+ clip.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+ clip.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+ clip.thread3.dispatch_grf_start_reg = 1;
+ clip.thread3.urb_entry_read_offset = 0;
+
+ clip.thread4.nr_urb_entries = key->nr_urb_entries;
+ clip.thread4.urb_entry_allocation_size = key->urb_size - 1;
+ /* If we have enough clip URB entries to run two threads, do so.
+ */
+ if (key->nr_urb_entries >= 10) {
+ /* Half of the URB entries go to each thread, and it has to be an
+ * even number.
+ */
+ assert(key->nr_urb_entries % 2 == 0);
+
+ /* Although up to 16 concurrent Clip threads are allowed on IGDNG,
+ * only 2 threads can output VUEs at a time.
+ */
+ if (BRW_IS_IGDNG(brw))
+ clip.thread4.max_threads = 16 - 1;
+ else
+ clip.thread4.max_threads = 2 - 1;
+ } else {
+ assert(key->nr_urb_entries >= 5);
+ clip.thread4.max_threads = 1 - 1;
+ }
+
+ if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+ clip.thread4.max_threads = 0;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ clip.thread4.stats_enable = 1;
+
+ clip.clip5.userclip_enable_flags = 0x7f;
+ clip.clip5.userclip_must_clip = 1;
+ clip.clip5.guard_band_enable = 0;
+ if (!key->depth_clamp)
+ clip.clip5.viewport_z_clip_enable = 1;
+ clip.clip5.viewport_xy_clip_enable = 1;
+ clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
+ clip.clip5.api_mode = BRW_CLIP_API_OGL;
+ clip.clip5.clip_mode = key->clip_mode;
+
+ if (BRW_IS_G4X(brw))
+ clip.clip5.negative_w_clip_test = 1;
+
+ clip.clip6.clipper_viewport_state_ptr = 0;
+ clip.viewport_xmin = -1;
+ clip.viewport_xmax = 1;
+ clip.viewport_ymin = -1;
+ clip.viewport_ymax = 1;
+
+ ret = brw_upload_cache(&brw->cache, BRW_CLIP_UNIT,
+ key, sizeof(*key),
+ reloc, 1,
+ &clip, sizeof(clip),
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+static int upload_clip_unit( struct brw_context *brw )
+{
+ struct brw_clip_unit_key key;
+ struct brw_winsys_reloc reloc[1];
+ unsigned grf_reg_count;
+ enum pipe_error ret;
+
+ clip_unit_populate_key(brw, &key);
+
+ grf_reg_count = align(key.total_grf, 16) / 16 - 1;
+
+ /* clip program relocation
+ *
+ * XXX: these reloc structs are long lived and only need to be
+ * updated when the bound BO changes. Hopefully the stuff mixed in
+ * in the delta's is non-orthogonal.
+ */
+ assert(brw->clip.prog_bo);
+ make_reloc(&reloc[0],
+ BRW_USAGE_STATE,
+ grf_reg_count << 1,
+ offsetof(struct brw_clip_unit_state, thread0),
+ brw->clip.prog_bo);
+
+
+ if (brw_search_cache(&brw->cache, BRW_CLIP_UNIT,
+ &key, sizeof(key),
+ reloc, 1,
+ NULL,
+ &brw->clip.state_bo))
+ return PIPE_OK;
+
+ /* Create new:
+ */
+ ret = clip_unit_create_from_key(brw, &key,
+ reloc,
+ &brw->clip.state_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
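+/* State atom: re-validate the clip unit whenever the CURBE offsets,
+ * the URB fence or the compiled clip program change.
+ */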
+const struct brw_tracked_state brw_clip_unit = {
+ .dirty = {
+ .mesa = 0,
+ .brw = (BRW_NEW_CURBE_OFFSETS |
+ BRW_NEW_URB_FENCE),
+ .cache = CACHE_NEW_CLIP_PROG
+ },
+ .prepare = upload_clip_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_clip_tri.c b/src/gallium/drivers/i965/brw_clip_tri.c
new file mode 100644
index 00000000000..4cde7294ea0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_tri.c
@@ -0,0 +1,595 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+static void release_tmps( struct brw_clip_compile *c )
+{
+ c->last_tmp = c->first_tmp;
+}
+
+
+void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
+ GLuint nr_verts )
+{
+ GLuint i = 0,j;
+
+ /* Register usage is static, precompute here:
+ */
+ c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+ if (c->key.nr_userclip) {
+ c->reg.fixed_planes = brw_vec4_grf(i, 0);
+ i += (6 + c->key.nr_userclip + 1) / 2;
+
+ c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2;
+ }
+ else
+ c->prog_data.curb_read_length = 0;
+
+
+ /* Payload vertices plus space for more generated vertices:
+ */
+ for (j = 0; j < nr_verts; j++) {
+ c->reg.vertex[j] = brw_vec4_grf(i, 0);
+ i += c->nr_regs;
+ }
+
+ if (c->key.nr_attrs & 1) {
+ for (j = 0; j < 3; j++) {
+ GLuint delta = c->key.nr_attrs*16 + 32;
+
+ if (c->chipset.is_igdng)
+ delta = c->key.nr_attrs * 16 + 32 * 3;
+
+ brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
+ }
+ }
+
+ c->reg.t = brw_vec1_grf(i, 0);
+ c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_D);
+ c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD);
+ c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD);
+ c->reg.plane_equation = brw_vec4_grf(i, 4);
+ i++;
+
+ c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */
+ c->reg.dp = brw_vec1_grf(i, 4);
+ i++;
+
+ c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+ i++;
+
+ c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+ i++;
+
+ c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0);
+ i++;
+
+ if (!c->key.nr_userclip) {
+ c->reg.fixed_planes = brw_vec8_grf(i, 0);
+ i++;
+ }
+
+ if (c->key.do_unfilled) {
+ c->reg.dir = brw_vec4_grf(i, 0);
+ c->reg.offset = brw_vec4_grf(i, 4);
+ i++;
+ c->reg.tmp0 = brw_vec4_grf(i, 0);
+ c->reg.tmp1 = brw_vec4_grf(i, 4);
+ i++;
+ }
+
+ if (c->need_ff_sync) {
+ c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+ i++;
+ }
+
+ c->first_tmp = i;
+ c->last_tmp = i;
+
+ c->prog_data.urb_read_length = c->nr_regs; /* ? */
+ c->prog_data.total_grf = i;
+}
+
+
+
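+/* Load the inlist with the addresses of the three payload vertices.
+ * TRISTRIP_REVERSE primitives get vertices 0 and 1 swapped (and a
+ * negated direction) so that the winding stays consistent.
+ */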
+void brw_clip_tri_init_vertices( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+ struct brw_instruction *is_rev;
+
+   /* Initial list of indices for incoming vertices:
+ */
+ brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_EQ,
+ tmp0,
+ brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));
+
+ /* XXX: Is there an easier way to do this? Need to reverse every
+ * second tristrip element: Can ignore sometimes?
+ */
+ is_rev = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) );
+ brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) );
+ if (c->need_direction)
+ brw_MOV(p, c->reg.dir, brw_imm_f(-1));
+ }
+ is_rev = brw_ELSE(p, is_rev);
+ {
+ brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) );
+ brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) );
+ if (c->need_direction)
+ brw_MOV(p, c->reg.dir, brw_imm_f(1));
+ }
+ brw_ENDIF(p, is_rev);
+
+ brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) );
+ brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0));
+ brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3));
+}
+
+
+
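+/* Flat shading: replicate the provoking vertex's colors across the
+ * triangle, vertex 0 for polygons and vertex 2 for everything else.
+ */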
+void brw_clip_tri_flat_shade( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *is_poly;
+ struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+
+ brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_EQ,
+ tmp0,
+ brw_imm_ud(_3DPRIM_POLYGON));
+
+ is_poly = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_copy_colors(c, 1, 0);
+ brw_clip_copy_colors(c, 2, 0);
+ }
+ is_poly = brw_ELSE(p, is_poly);
+ {
+ brw_clip_copy_colors(c, 0, 2);
+ brw_clip_copy_colors(c, 1, 2);
+ }
+ brw_ENDIF(p, is_poly);
+}
+
+
+
+/* Use mesa's clipping algorithms, translated to GEN4 assembly.
+ */
+void brw_clip_tri( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_indirect vtx = brw_indirect(0, 0);
+ struct brw_indirect vtxPrev = brw_indirect(1, 0);
+ struct brw_indirect vtxOut = brw_indirect(2, 0);
+ struct brw_indirect plane_ptr = brw_indirect(3, 0);
+ struct brw_indirect inlist_ptr = brw_indirect(4, 0);
+ struct brw_indirect outlist_ptr = brw_indirect(5, 0);
+ struct brw_indirect freelist_ptr = brw_indirect(6, 0);
+ struct brw_instruction *plane_loop;
+ struct brw_instruction *plane_active;
+ struct brw_instruction *vertex_loop;
+ struct brw_instruction *next_test;
+ struct brw_instruction *prev_test;
+
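+   /* Sutherland-Hodgman style clipping: loop over the active planes in
+    * planemask; for each one, walk the current vertex list (inlist),
+    * writing surviving and newly interpolated vertices to outlist, then
+    * swap the lists.  The loop ends when no planes remain or fewer than
+    * three vertices survive.
+    */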
+ brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) );
+ brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c));
+ brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
+ brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+
+ brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) );
+
+ plane_loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ /* if (planemask & 1)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1));
+
+ plane_active = brw_IF(p, BRW_EXECUTE_1);
+ {
+ /* vtxOut = freelist_ptr++
+ */
+ brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) );
+ brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE));
+
+ if (c->key.nr_userclip)
+ brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0));
+ else
+ brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0));
+
+ brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+ brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0));
+
+ vertex_loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ /* vtx = *input_ptr;
+ */
+ brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0));
+
+ /* IS_NEGATIVE(prev) */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+ brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset_hpos), c->reg.plane_equation);
+ prev_test = brw_IF(p, BRW_EXECUTE_1);
+ {
+ /* IS_POSITIVE(next)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
+ brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
+ next_test = brw_IF(p, BRW_EXECUTE_1);
+ {
+
+ /* Coming back in.
+ */
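+	    /* t = dpPrev / (dpPrev - dp): parametric intersection of the
+	     * edge with the plane.
+	     */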
+ brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp));
+ brw_math_invert(p, c->reg.t, c->reg.t);
+ brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev);
+
+ /* If (vtxOut == 0) vtxOut = vtxPrev
+ */
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+ brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) );
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, GL_FALSE);
+
+ /* *outlist_ptr++ = vtxOut;
+ * nr_verts++;
+ * vtxOut = 0;
+ */
+ brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+ brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+ brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+ brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+ }
+ brw_ENDIF(p, next_test);
+
+ }
+ prev_test = brw_ELSE(p, prev_test);
+ {
+ /* *outlist_ptr++ = vtxPrev;
+ * nr_verts++;
+ */
+ brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev));
+ brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+ brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+
+ /* IS_NEGATIVE(next)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+ brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset_hpos), c->reg.plane_equation);
+ next_test = brw_IF(p, BRW_EXECUTE_1);
+ {
+ /* Going out of bounds. Avoid division by zero as we
+ * know dp != dpPrev from DIFFERENT_SIGNS, above.
+ */
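+	    /* t = dp / (dp - dpPrev): parametric intersection with the plane. */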
+ brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev));
+ brw_math_invert(p, c->reg.t, c->reg.t);
+ brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp);
+
+ /* If (vtxOut == 0) vtxOut = vtx
+ */
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) );
+ brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) );
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, GL_TRUE);
+
+ /* *outlist_ptr++ = vtxOut;
+ * nr_verts++;
+ * vtxOut = 0;
+ */
+ brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut));
+ brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short)));
+ brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1));
+ brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) );
+ }
+ brw_ENDIF(p, next_test);
+ }
+ brw_ENDIF(p, prev_test);
+
+ /* vtxPrev = vtx;
+ * inlist_ptr++;
+ */
+ brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx));
+ brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short)));
+
+ /* while (--loopcount != 0)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+ }
+ brw_WHILE(p, vertex_loop);
+
+ /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1]
+ * inlist = outlist
+ * inlist_ptr = &inlist[0]
+ * outlist_ptr = &outlist[0]
+ */
+ brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2));
+ brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0));
+ brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0));
+ brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist));
+ brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist));
+ }
+ brw_ENDIF(p, plane_active);
+
+ /* plane_ptr++;
+ */
+ brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c));
+
+ /* nr_verts >= 3
+ */
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_GE,
+ c->reg.nr_verts,
+ brw_imm_ud(3));
+
+ /* && (planemask>>=1) != 0
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1));
+ }
+ brw_WHILE(p, plane_loop);
+}
+
+
+
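+/* Emit the final vertex list as a triangle fan: the first VUE starts
+ * the primitive and the last one ends it.
+ */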
+void brw_clip_tri_emit_polygon(struct brw_clip_compile *c)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *loop, *if_insn;
+
+ /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--)
+ */
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+ brw_ADD(p,
+ c->reg.loopcount,
+ c->reg.nr_verts,
+ brw_imm_d(-2));
+
+ if_insn = brw_IF(p, BRW_EXECUTE_1);
+ {
+ struct brw_indirect v0 = brw_indirect(0, 0);
+ struct brw_indirect vptr = brw_indirect(1, 0);
+
+ brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist));
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+
+ brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START));
+
+ brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+
+ loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2));
+
+ brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2));
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0));
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+ }
+ brw_WHILE(p, loop);
+
+ brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END));
+ }
+ brw_ENDIF(p, if_insn);
+}
+
+static void do_clip_tri( struct brw_clip_compile *c )
+{
+ brw_clip_init_planes(c);
+
+ brw_clip_tri(c);
+}
+
+
+static void maybe_do_clip_tri( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *do_clip;
+
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
+ do_clip = brw_IF(p, BRW_EXECUTE_1);
+ {
+ do_clip_tri(c);
+ }
+ brw_ENDIF(p, do_clip);
+}
+
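+/* Clip test used for the negative-rhw workaround on original 965:
+ * trivially reject the triangle if all three vertices lie outside one
+ * of the six frustum planes, otherwise set planemask bits only for the
+ * planes the triangle actually straddles.
+ */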
+static void brw_clip_test( struct brw_clip_compile *c )
+{
+ struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+ struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+ struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+ struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+
+ struct brw_reg v0 = get_tmp(c);
+ struct brw_reg v1 = get_tmp(c);
+ struct brw_reg v2 = get_tmp(c);
+
+ struct brw_indirect vt0 = brw_indirect(0, 0);
+ struct brw_indirect vt1 = brw_indirect(1, 0);
+ struct brw_indirect vt2 = brw_indirect(2, 0);
+
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *is_outside;
+ struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
+
+ brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
+ brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
+ brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2]));
+ brw_MOV(p, v0, deref_4f(vt0, c->offset_hpos));
+ brw_MOV(p, v1, deref_4f(vt1, c->offset_hpos));
+ brw_MOV(p, v2, deref_4f(vt2, c->offset_hpos));
+ brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
+
+ /* test nearz, xmin, ymin plane */
+ /* clip.xyz < -clip.w */
+ brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* All vertices are outside of a plane, rejected */
+ brw_AND(p, t, t1, t2);
+ brw_AND(p, t, t, t3);
+ brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+ brw_OR(p, tmp0, tmp0, get_element(t, 2));
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+ is_outside = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_kill_thread(c);
+ }
+ brw_ENDIF(p, is_outside);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* some vertices are inside a plane and some are outside, so we need to clip */
+ brw_XOR(p, t, t1, t2);
+ brw_XOR(p, t1, t2, t3);
+ brw_OR(p, t, t, t1);
+ brw_AND(p, t, t, brw_imm_ud(0x1));
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 0), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 1), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 2), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* test farz, xmax, ymax plane */
+ /* clip.xyz > clip.w */
+ brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ /* All vertices are outside of a plane, rejected */
+ brw_AND(p, t, t1, t2);
+ brw_AND(p, t, t, t3);
+ brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+ brw_OR(p, tmp0, tmp0, get_element(t, 2));
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+ is_outside = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_kill_thread(c);
+ }
+ brw_ENDIF(p, is_outside);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+   /* some vertices are inside a plane and some are outside, so we need to clip */
+ brw_XOR(p, t, t1, t2);
+ brw_XOR(p, t1, t2, t3);
+ brw_OR(p, t, t, t1);
+ brw_AND(p, t, t, brw_imm_ud(0x1));
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 0), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 1), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+ get_element(t, 2), brw_imm_ud(0));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ release_tmps(c);
+}
+
+
+void brw_emit_tri_clip( struct brw_clip_compile *c )
+{
+ struct brw_instruction *neg_rhw;
+ struct brw_compile *p = &c->func;
+ brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
+ brw_clip_tri_init_vertices(c);
+ brw_clip_init_clipmask(c);
+ brw_clip_init_ff_sync(c);
+
+ /* if -ve rhw workaround bit is set,
+ do cliptest */
+ if (c->chipset.is_965) {
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
+ brw_imm_ud(1<<20));
+ neg_rhw = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_test(c);
+ }
+ brw_ENDIF(p, neg_rhw);
+ }
+ /* Can't push into do_clip_tri because with polygon (or quad)
+ * flatshading, need to apply the flatshade here because we don't
+ * respect the PV when converting to trifan for emit:
+ */
+ if (c->key.do_flat_shading)
+ brw_clip_tri_flat_shade(c);
+
+ if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
+ (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
+ do_clip_tri(c);
+ else
+ maybe_do_clip_tri(c);
+
+ brw_clip_tri_emit_polygon(c);
+
+ /* Send an empty message to kill the thread:
+ */
+ brw_clip_kill_thread(c);
+}
diff --git a/src/gallium/drivers/i965/brw_clip_unfilled.c b/src/gallium/drivers/i965/brw_clip_unfilled.c
new file mode 100644
index 00000000000..aec835b8cec
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_unfilled.c
@@ -0,0 +1,497 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+/* This is performed against the original triangle vertices, so no
+ * indirection is required:
+ */
+static void compute_tri_direction( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg e = c->reg.tmp0;
+ struct brw_reg f = c->reg.tmp1;
+ struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset_hpos);
+ struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset_hpos);
+ struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset_hpos);
+
+
+ struct brw_reg v0n = get_tmp(c);
+ struct brw_reg v1n = get_tmp(c);
+ struct brw_reg v2n = get_tmp(c);
+
+ /* Convert to NDC.
+ * NOTE: We can't modify the original vertex coordinates,
+ * as it may impact further operations.
+ * So, we have to keep normalized coordinates in temp registers.
+ *
+ * TBD-KC
+ * Try to optimize unnecessary MOV's.
+ */
+ brw_MOV(p, v0n, v0);
+ brw_MOV(p, v1n, v1);
+ brw_MOV(p, v2n, v2);
+
+ brw_clip_project_position(c, v0n);
+ brw_clip_project_position(c, v1n);
+ brw_clip_project_position(c, v2n);
+
+ /* Calculate the vectors of two edges of the triangle:
+ */
+ brw_ADD(p, e, v0n, negate(v2n));
+ brw_ADD(p, f, v1n, negate(v2n));
+
+ /* Take their crossproduct:
+ */
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3));
+ brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3));
+ brw_set_access_mode(p, BRW_ALIGN_1);
+
+ brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e));
+}
+
+
+static void cull_direction( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *ccw;
+ GLuint conditional;
+
+ assert (!(c->key.fill_ccw == CLIP_CULL &&
+ c->key.fill_cw == CLIP_CULL));
+
+ if (c->key.fill_ccw == CLIP_CULL)
+ conditional = BRW_CONDITIONAL_GE;
+ else
+ conditional = BRW_CONDITIONAL_L;
+
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ conditional,
+ get_element(c->reg.dir, 2),
+ brw_imm_f(0));
+
+ ccw = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_kill_thread(c);
+ }
+ brw_ENDIF(p, ccw);
+}
+
+
+
+static void copy_bfc( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *ccw;
+ GLuint conditional;
+
+ /* Do we have any colors to copy?
+ */
+ if ((c->offset_color0 == 0 || c->offset_bfc0 == 0) &&
+ (c->offset_color1 == 0 || c->offset_bfc1 == 0))
+ return;
+
+   /* In some weird degenerate cases we can end up testing the
+    * direction twice, once for culling and once for bfc copying.  Oh
+    * well, that's what you get for setting weird GL state.
+ */
+ if (c->key.copy_bfc_ccw)
+ conditional = BRW_CONDITIONAL_GE;
+ else
+ conditional = BRW_CONDITIONAL_L;
+
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ conditional,
+ get_element(c->reg.dir, 2),
+ brw_imm_f(0));
+
+ ccw = brw_IF(p, BRW_EXECUTE_1);
+ {
+ GLuint i;
+
+ for (i = 0; i < 3; i++) {
+ if (c->offset_color0 && c->offset_bfc0)
+ brw_MOV(p,
+ byte_offset(c->reg.vertex[i], c->offset_color0),
+ byte_offset(c->reg.vertex[i], c->offset_bfc0));
+
+ if (c->offset_color1 && c->offset_bfc1)
+ brw_MOV(p,
+		 byte_offset(c->reg.vertex[i], c->offset_color1),
+		 byte_offset(c->reg.vertex[i], c->offset_bfc1));
+ }
+ }
+ brw_ENDIF(p, ccw);
+}
+
+
+
+
+/*
+ GLfloat iz = 1.0 / dir.z;
+ GLfloat ac = dir.x * iz;
+ GLfloat bc = dir.y * iz;
+ offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE;
+ offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor;
+ offset *= MRD;
+*/
+static void compute_offset( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg off = c->reg.offset;
+ struct brw_reg dir = c->reg.dir;
+
+ brw_math_invert(p, get_element(off, 2), get_element(dir, 2));
+ brw_MUL(p, vec2(off), dir, get_element(off, 2));
+
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_GE,
+ brw_abs(get_element(off, 0)),
+ brw_abs(get_element(off, 1)));
+
+ brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1)));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor));
+ brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units));
+}
+
+
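+/* When the incoming primitive is a polygon, clear the edge flag of
+ * vertex 0 / vertex 2 unless the matching bit (1<<8 / 1<<9) is set in
+ * the R0.2 payload, presumably so that edges introduced by polygon
+ * decomposition are not drawn in unfilled mode.
+ */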
+static void merge_edgeflags( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *is_poly;
+ struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0);
+
+ brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK));
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_EQ,
+ tmp0,
+ brw_imm_ud(_3DPRIM_POLYGON));
+
+ /* Get away with using reg.vertex because we know that this is not
+ * a _3DPRIM_TRISTRIP_REVERSE:
+ */
+ is_poly = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
+ brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8));
+ brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset_edgeflag), brw_imm_f(0));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ);
+ brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9));
+ brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset_edgeflag), brw_imm_f(0));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+ brw_ENDIF(p, is_poly);
+}
+
+
+
+static void apply_one_offset( struct brw_clip_compile *c,
+ struct brw_indirect vert )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg z = deref_1f(vert, c->header_position_offset +
+ 2 * type_sz(BRW_REGISTER_TYPE_F));
+
+ brw_ADD(p, z, z, vec1(c->reg.offset));
+}
+
+
+
+/***********************************************************************
+ * Output clipped polygon as an unfilled primitive:
+ */
+static void emit_lines(struct brw_clip_compile *c,
+ GLboolean do_offset)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *loop;
+ struct brw_instruction *draw_edge;
+ struct brw_indirect v0 = brw_indirect(0, 0);
+ struct brw_indirect v1 = brw_indirect(1, 0);
+ struct brw_indirect v0ptr = brw_indirect(2, 0);
+ struct brw_indirect v1ptr = brw_indirect(3, 0);
+
+   /* Need a separate loop for offset:
+ */
+ if (do_offset) {
+ brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+ brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+
+ loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+ brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+
+ apply_one_offset(c, v0);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_G);
+ brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+ }
+ brw_WHILE(p, loop);
+ }
+
+ /* v1ptr = &inlist[nr_verts]
+ * *v1ptr = v0
+ */
+ brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+ brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+ brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
+ brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW));
+ brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0));
+
+ loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+ brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2));
+ brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+
+ /* draw edge if edgeflag != 0 */
+ brw_CMP(p,
+ vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+ deref_1f(v0, c->offset_edgeflag),
+ brw_imm_f(0));
+ draw_edge = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START);
+ brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END);
+ }
+ brw_ENDIF(p, draw_edge);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+ }
+ brw_WHILE(p, loop);
+}
+
+
+
+static void emit_points(struct brw_clip_compile *c,
+ GLboolean do_offset )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *loop;
+ struct brw_instruction *draw_point;
+
+ struct brw_indirect v0 = brw_indirect(0, 0);
+ struct brw_indirect v0ptr = brw_indirect(2, 0);
+
+ brw_MOV(p, c->reg.loopcount, c->reg.nr_verts);
+ brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist));
+
+ loop = brw_DO(p, BRW_EXECUTE_1);
+ {
+ brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0));
+ brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2));
+
+ /* draw if edgeflag != 0
+ */
+ brw_CMP(p,
+ vec1(brw_null_reg()), BRW_CONDITIONAL_NZ,
+ deref_1f(v0, c->offset_edgeflag),
+ brw_imm_f(0));
+ draw_point = brw_IF(p, BRW_EXECUTE_1);
+ {
+ if (do_offset)
+ apply_one_offset(c, v0);
+
+ brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END);
+ }
+ brw_ENDIF(p, draw_point);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1));
+ }
+ brw_WHILE(p, loop);
+}
+
+
+
+
+
+
+
+static void emit_primitives( struct brw_clip_compile *c,
+ GLuint mode,
+ GLboolean do_offset )
+{
+ switch (mode) {
+ case CLIP_FILL:
+ brw_clip_tri_emit_polygon(c);
+ break;
+
+ case CLIP_LINE:
+ emit_lines(c, do_offset);
+ break;
+
+ case CLIP_POINT:
+ emit_points(c, do_offset);
+ break;
+
+ case CLIP_CULL:
+ assert(0);
+ break;
+ }
+}
+
+
+
+static void emit_unfilled_primitives( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *ccw;
+
+ /* Direction culling has already been done.
+ */
+ if (c->key.fill_ccw != c->key.fill_cw &&
+ c->key.fill_ccw != CLIP_CULL &&
+ c->key.fill_cw != CLIP_CULL)
+ {
+ brw_CMP(p,
+ vec1(brw_null_reg()),
+ BRW_CONDITIONAL_GE,
+ get_element(c->reg.dir, 2),
+ brw_imm_f(0));
+
+ ccw = brw_IF(p, BRW_EXECUTE_1);
+ {
+ emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+ }
+ ccw = brw_ELSE(p, ccw);
+ {
+ emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+ }
+ brw_ENDIF(p, ccw);
+ }
+ else if (c->key.fill_cw != CLIP_CULL) {
+ emit_primitives(c, c->key.fill_cw, c->key.offset_cw);
+ }
+ else if (c->key.fill_ccw != CLIP_CULL) {
+ emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw);
+ }
+}
+
+
+
+
+static void check_nr_verts( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *if_insn;
+
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3));
+ if_insn = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_kill_thread(c);
+ }
+ brw_ENDIF(p, if_insn);
+}
+
+
+void brw_emit_unfilled_clip( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *do_clip;
+
+
+ c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) ||
+ (c->key.fill_ccw != c->key.fill_cw) ||
+ c->key.fill_ccw == CLIP_CULL ||
+ c->key.fill_cw == CLIP_CULL ||
+ c->key.copy_bfc_cw ||
+ c->key.copy_bfc_ccw);
+
+ brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
+ brw_clip_tri_init_vertices(c);
+ brw_clip_init_ff_sync(c);
+
+ assert(c->offset_edgeflag);
+
+ if (c->key.fill_ccw == CLIP_CULL &&
+ c->key.fill_cw == CLIP_CULL) {
+ brw_clip_kill_thread(c);
+ return;
+ }
+
+ merge_edgeflags(c);
+
+ /* Need to use the inlist indirection here:
+ */
+ if (c->need_direction)
+ compute_tri_direction(c);
+
+ if (c->key.fill_ccw == CLIP_CULL ||
+ c->key.fill_cw == CLIP_CULL)
+ cull_direction(c);
+
+ if (c->key.offset_ccw ||
+ c->key.offset_cw)
+ compute_offset(c);
+
+ if (c->key.copy_bfc_ccw ||
+ c->key.copy_bfc_cw)
+ copy_bfc(c);
+
+ /* Need to do this whether we clip or not:
+ */
+ if (c->key.do_flat_shading)
+ brw_clip_tri_flat_shade(c);
+
+ brw_clip_init_clipmask(c);
+ brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0));
+ do_clip = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_clip_init_planes(c);
+ brw_clip_tri(c);
+ check_nr_verts(c);
+ }
+ brw_ENDIF(p, do_clip);
+
+ emit_unfilled_primitives(c);
+ brw_clip_kill_thread(c);
+}
+
+
+
diff --git a/src/gallium/drivers/i965/brw_clip_util.c b/src/gallium/drivers/i965/brw_clip_util.c
new file mode 100644
index 00000000000..97a57103105
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_clip_util.c
@@ -0,0 +1,388 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_clip.h"
+
+
+
+
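+/* Trivial bump allocator for temporary GRF registers; total_grf tracks
+ * the high-water mark.
+ */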
+struct brw_reg get_tmp( struct brw_clip_compile *c )
+{
+ struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0);
+
+ if (++c->last_tmp > c->prog_data.total_grf)
+ c->prog_data.total_grf = c->last_tmp;
+
+ return tmp;
+}
+
+static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp )
+{
+ if (tmp.nr == c->last_tmp-1)
+ c->last_tmp--;
+}
+
+
+static struct brw_reg make_plane_ud(GLuint x, GLuint y, GLuint z, GLuint w)
+{
+ return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x);
+}
+
+
+void brw_clip_init_planes( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+
+ if (!c->key.nr_userclip) {
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1));
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1));
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1));
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1));
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1));
+ brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1));
+ }
+}
+
+
+
+#define W 3
+
+/* Project 'pos' into NDC (or back again), overwriting it with the result:
+ */
+void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos )
+{
+ struct brw_compile *p = &c->func;
+
+ /* calc rhw
+ */
+ brw_math_invert(p, get_element(pos, W), get_element(pos, W));
+
+ /* value.xyz *= value.rhw
+ */
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_MUL(p, brw_writemask(pos, BRW_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W));
+ brw_set_access_mode(p, BRW_ALIGN_1);
+}
+
+
+static void brw_clip_project_vertex( struct brw_clip_compile *c,
+ struct brw_indirect vert_addr )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = get_tmp(c);
+
+ /* Fixup position. Extract from the original vertex and re-project
+    * into NDC for the vertex header:
+ */
+ brw_MOV(p, tmp, deref_4f(vert_addr, c->offset_hpos));
+ brw_clip_project_position(c, tmp);
+ brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp);
+
+ release_tmp(c, tmp);
+}
+
+
+
+
+/* Interpolate between two vertices and write the result to the vertex
+ * pointed at by dest_ptr.
+ */
+void brw_clip_interp_vertex( struct brw_clip_compile *c,
+ struct brw_indirect dest_ptr,
+ struct brw_indirect v0_ptr, /* from */
+ struct brw_indirect v1_ptr, /* to */
+ struct brw_reg t0,
+ GLboolean force_edgeflag)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = get_tmp(c);
+ GLuint i;
+
+ /* Just copy the vertex header:
+ */
+ /*
+    * After the CLIP stage, only the first 256 bits of the VUE are read
+    * back on IGDNG, so the header doesn't need to change.
+ */
+ brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
+
+ /* Iterate over each attribute (could be done in pairs?)
+ */
+ for (i = 0; i < c->key.nr_attrs; i++) {
+ GLuint delta = i*16 + 32;
+
+ if (c->chipset.is_igdng)
+ delta = i * 16 + 32 * 3;
+
+ if (delta == c->offset_edgeflag) {
+ if (force_edgeflag)
+ brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
+ else
+ brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta));
+ }
+ else {
+ /* Interpolate:
+ *
+ * New = attr0 + t*attr1 - t*attr0
+ */
+ brw_MUL(p,
+ vec4(brw_null_reg()),
+ deref_4f(v1_ptr, delta),
+ t0);
+
+ brw_MAC(p,
+ tmp,
+ negate(deref_4f(v0_ptr, delta)),
+ t0);
+
+ brw_ADD(p,
+ deref_4f(dest_ptr, delta),
+ deref_4f(v0_ptr, delta),
+ tmp);
+ }
+ }
+
+ if (i & 1) {
+ GLuint delta = i*16 + 32;
+
+ if (c->chipset.is_igdng)
+ delta = i * 16 + 32 * 3;
+
+ brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
+ }
+
+ release_tmp(c, tmp);
+
+ /* Recreate the projected (NDC) coordinate in the new vertex
+ * header:
+ */
+ brw_clip_project_vertex(c, dest_ptr );
+}
+
+
+
+
+#define MAX_MRF 16
+
+void brw_clip_emit_vue(struct brw_clip_compile *c,
+ struct brw_indirect vert,
+ GLboolean allocate,
+ GLboolean eot,
+ GLuint header)
+{
+ struct brw_compile *p = &c->func;
+ GLuint start = c->last_mrf;
+
+ brw_clip_ff_sync(c);
+
+ assert(!(allocate && eot));
+
+ /* Cycle through mrf regs - probably futile as we have to wait for
+ * the allocation response anyway. Also, the order this function
+ * is invoked doesn't correspond to the order the instructions will
+ * be executed, so it won't have any effect in many cases.
+ */
+#if 0
+ if (start + c->nr_regs + 1 >= MAX_MRF)
+ start = 0;
+
+ c->last_mrf = start + c->nr_regs + 1;
+#endif
+
+ /* Copy the vertex from vertn into m1..mN+1:
+ */
+ brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs);
+
+ /* Overwrite PrimType and PrimStart in the message header, for
+ * each vertex in turn:
+ */
+ brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+
+   /* Send each vertex as a separate write to the urb.  This
+    * is different from the approach in brw_sf_emit.c, where
+    * subsequent writes are used to build up a single urb
+    * entry.  Each of these writes instantiates a separate
+ * urb entry - (I think... what about 'allocate'?)
+ */
+ brw_urb_WRITE(p,
+ allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+ start,
+ c->reg.R0,
+ allocate,
+ 1, /* used */
+ c->nr_regs + 1, /* msg length */
+ allocate ? 1 : 0, /* response_length */
+ eot, /* eot */
+ 1, /* writes_complete */
+ 0, /* urb offset */
+ BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+void brw_clip_kill_thread(struct brw_clip_compile *c)
+{
+ struct brw_compile *p = &c->func;
+
+ brw_clip_ff_sync(c);
+ /* Send an empty message to kill the thread and release any
+ * allocated urb entry:
+ */
+ brw_urb_WRITE(p,
+ retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+ 0,
+ c->reg.R0,
+ 0, /* allocate */
+ 0, /* used */
+ 1, /* msg len */
+ 0, /* response len */
+ 1, /* eot */
+ 1, /* writes complete */
+ 0,
+ BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+
+struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c )
+{
+ return brw_address(c->reg.fixed_planes);
+}
+
+
+struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c )
+{
+ if (c->key.nr_userclip) {
+ return brw_imm_uw(16);
+ }
+ else {
+ return brw_imm_uw(4);
+ }
+}
+
+
+/* If flatshading, distribute color from provoking vertex prior to
+ * clipping.
+ */
+void brw_clip_copy_colors( struct brw_clip_compile *c,
+ GLuint to, GLuint from )
+{
+ struct brw_compile *p = &c->func;
+
+ if (c->offset_color0)
+ brw_MOV(p,
+ byte_offset(c->reg.vertex[to], c->offset_color0),
+ byte_offset(c->reg.vertex[from], c->offset_color0));
+
+ if (c->offset_color1)
+ brw_MOV(p,
+ byte_offset(c->reg.vertex[to], c->offset_color1),
+ byte_offset(c->reg.vertex[from], c->offset_color1));
+
+ if (c->offset_bfc0)
+ brw_MOV(p,
+ byte_offset(c->reg.vertex[to], c->offset_bfc0),
+ byte_offset(c->reg.vertex[from], c->offset_bfc0));
+
+ if (c->offset_bfc1)
+ brw_MOV(p,
+ byte_offset(c->reg.vertex[to], c->offset_bfc1),
+ byte_offset(c->reg.vertex[from], c->offset_bfc1));
+}
+
+
+
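+/* Build the initial plane mask for this primitive from the clip
+ * outcodes delivered in the R0.2 payload word.
+ */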
+void brw_clip_init_clipmask( struct brw_clip_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg incoming = get_element_ud(c->reg.R0, 2);
+
+ /* Shift so that lowest outcode bit is rightmost:
+ */
+ brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26));
+
+ if (c->key.nr_userclip) {
+ struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD);
+
+ /* Rearrange userclip outcodes so that they come directly after
+ * the fixed plane bits.
+ */
+ brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14));
+ brw_SHR(p, tmp, tmp, brw_imm_ud(8));
+ brw_OR(p, c->reg.planemask, c->reg.planemask, tmp);
+
+ release_tmp(c, tmp);
+ }
+}
+
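+/* On chipsets that need it, emit the FF_SYNC message at most once per
+ * thread, immediately before the first URB write; reg.ff_sync records
+ * whether it has already been sent.
+ */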
+void brw_clip_ff_sync(struct brw_clip_compile *c)
+{
+ if (c->need_ff_sync) {
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *need_ff_sync;
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+ brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
+ need_ff_sync = brw_IF(p, BRW_EXECUTE_1);
+ {
+ brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
+ brw_ff_sync(p,
+ c->reg.R0,
+ 0,
+ c->reg.R0,
+ 1,
+ 1, /* used */
+ 1, /* msg length */
+ 1, /* response length */
+ 0, /* eot */
+		  1, /* write complete */
+ 0, /* urb offset */
+ BRW_URB_SWIZZLE_NONE);
+ }
+ brw_ENDIF(p, need_ff_sync);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+}
+
+void brw_clip_init_ff_sync(struct brw_clip_compile *c)
+{
+ if (c->need_ff_sync) {
+ struct brw_compile *p = &c->func;
+
+ brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_context.c b/src/gallium/drivers/i965/brw_context.c
new file mode 100644
index 00000000000..e67551882dc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_context.c
@@ -0,0 +1,154 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_context.h"
+#include "util/u_simple_list.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_draw.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_winsys.h"
+#include "brw_screen.h"
+
+
+static void brw_destroy_context( struct pipe_context *pipe )
+{
+ struct brw_context *brw = brw_context(pipe);
+ int i;
+
+ brw_context_flush( brw );
+ brw_batchbuffer_free( brw->batch );
+ brw_destroy_state(brw);
+
+ brw_draw_cleanup( brw );
+
+ brw_pipe_blend_cleanup( brw );
+ brw_pipe_depth_stencil_cleanup( brw );
+ brw_pipe_framebuffer_cleanup( brw );
+ brw_pipe_flush_cleanup( brw );
+ brw_pipe_misc_cleanup( brw );
+ brw_pipe_query_cleanup( brw );
+ brw_pipe_rast_cleanup( brw );
+ brw_pipe_sampler_cleanup( brw );
+ brw_pipe_shader_cleanup( brw );
+ brw_pipe_vertex_cleanup( brw );
+ brw_pipe_clear_cleanup( brw );
+
+ brw_hw_cc_cleanup( brw );
+
+
+ FREE(brw->wm.compile_data);
+
+ for (i = 0; i < brw->curr.fb.nr_cbufs; i++)
+ pipe_surface_reference(&brw->curr.fb.cbufs[i], NULL);
+ brw->curr.fb.nr_cbufs = 0;
+ pipe_surface_reference(&brw->curr.fb.zsbuf, NULL);
+
+ bo_reference(&brw->curbe.curbe_bo, NULL);
+ bo_reference(&brw->vs.prog_bo, NULL);
+ bo_reference(&brw->vs.state_bo, NULL);
+ bo_reference(&brw->vs.bind_bo, NULL);
+ bo_reference(&brw->gs.prog_bo, NULL);
+ bo_reference(&brw->gs.state_bo, NULL);
+ bo_reference(&brw->clip.prog_bo, NULL);
+ bo_reference(&brw->clip.state_bo, NULL);
+ bo_reference(&brw->clip.vp_bo, NULL);
+ bo_reference(&brw->sf.prog_bo, NULL);
+ bo_reference(&brw->sf.state_bo, NULL);
+ bo_reference(&brw->sf.vp_bo, NULL);
+
+ for (i = 0; i < Elements(brw->wm.sdc_bo); i++)
+ bo_reference(&brw->wm.sdc_bo[i], NULL);
+
+ bo_reference(&brw->wm.bind_bo, NULL);
+
+ for (i = 0; i < Elements(brw->wm.surf_bo); i++)
+ bo_reference(&brw->wm.surf_bo[i], NULL);
+
+ bo_reference(&brw->wm.sampler_bo, NULL);
+ bo_reference(&brw->wm.prog_bo, NULL);
+ bo_reference(&brw->wm.state_bo, NULL);
+}
+
+
+struct pipe_context *brw_create_context(struct pipe_screen *screen)
+{
+ struct brw_context *brw = (struct brw_context *) CALLOC_STRUCT(brw_context);
+
+ if (!brw) {
+ debug_printf("%s: failed to alloc context\n", __FUNCTION__);
+ return NULL;
+ }
+
+ brw->base.screen = screen;
+ brw->base.destroy = brw_destroy_context;
+ brw->sws = brw_screen(screen)->sws;
+ brw->chipset = brw_screen(screen)->chipset;
+
+ brw_pipe_blend_init( brw );
+ brw_pipe_depth_stencil_init( brw );
+ brw_pipe_framebuffer_init( brw );
+ brw_pipe_flush_init( brw );
+ brw_pipe_misc_init( brw );
+ brw_pipe_query_init( brw );
+ brw_pipe_rast_init( brw );
+ brw_pipe_sampler_init( brw );
+ brw_pipe_shader_init( brw );
+ brw_pipe_vertex_init( brw );
+ brw_pipe_clear_init( brw );
+
+ brw_hw_cc_init( brw );
+
+ brw_init_state( brw );
+ brw_draw_init( brw );
+
+ brw->state.dirty.mesa = ~0;
+ brw->state.dirty.brw = ~0;
+
+ brw->flags.always_emit_state = 0;
+
+ make_empty_list(&brw->query.active_head);
+
+ brw->batch = brw_batchbuffer_alloc( brw->sws, brw->chipset );
+ if (brw->batch == NULL)
+ goto fail;
+
+ return &brw->base;
+
+fail:
+ if (brw->batch)
+ brw_batchbuffer_free( brw->batch );
+ return NULL;
+}
+
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
new file mode 100644
index 00000000000..8c006bb95b2
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -0,0 +1,858 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRWCONTEXT_INC
+#define BRWCONTEXT_INC
+
+#include "brw_structs.h"
+#include "brw_winsys.h"
+#include "brw_reg.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
+
+
+/* Glossary:
+ *
+ * URB - uniform resource buffer. A mid-sized buffer which is
+ * partitioned between the fixed function units and used for passing
+ * values (vertices, primitives, constants) between them.
+ *
+ * CURBE - constant URB entry. An urb region (entry) used to hold
+ * constant values which the fixed function units can be instructed to
+ * preload into the GRF when spawning a thread.
+ *
+ * VUE - vertex URB entry. An urb entry holding a vertex and usually
+ * a vertex header. The header contains control information and
+ * things like primitive type, Begin/end flags and clip codes.
+ *
+ * PUE - primitive URB entry. An urb entry produced by the setup (SF)
+ * unit holding rasterization and interpolation parameters.
+ *
+ * GRF - general register file. One of several register files
+ * addressable by programmed threads. The inputs (r0, payload, curbe,
+ * urb) of the thread are preloaded to this area before the thread is
+ * spawned. The registers are individually 8 dwords wide and suitable
+ * for general usage. Registers holding thread input values are not
+ * special and may be overwritten.
+ *
+ * MRF - message register file. Threads communicate (and terminate)
+ * by sending messages. Message parameters are placed in contiguous
+ * MRF registers. All program output is via these messages. URB
+ * entries are populated by sending a message to the shared URB
+ * function containing the new data, together with a control word,
+ * often an unmodified copy of R0.
+ *
+ * R0 - GRF register 0. Typically holds control information used when
+ * sending messages to other threads.
+ *
+ * EU or GEN4 EU: The name of the programmable subsystem of the
+ * i965 hardware. Threads are executed by the EU, the registers
+ * described above are part of the EU architecture.
+ *
+ * Fixed function units:
+ *
+ * CS - Command streamer. Notional first unit, little software
+ * interaction. Holds the URB entries used for constant data, ie the
+ * CURBEs.
+ *
+ * VF/VS - Vertex Fetch / Vertex Shader. The fixed function part of
+ * this unit is responsible for pulling vertices out of vertex buffers
+ * in vram and injecting them into the processing pipe as VUEs. If
+ * enabled, it first passes them to a VS thread which is a good place
+ * for the driver to implement any active vertex shader.
+ *
+ * GS - Geometry Shader. This corresponds to a new DX10 concept. If
+ * enabled, incoming strips etc are passed to GS threads in individual
+ * line/triangle/point units.  The GS thread may perform arbitrary
+ * computation and emit whatever primitives with whatever vertices it
+ * chooses. This makes GS an excellent place to implement GL's
+ * unfilled polygon modes, though of course it is capable of much
+ * more. Additionally, GS is used to translate away primitives not
+ * handled by latter units, including Quads and Lineloops.
+ *
+ * CLIP - Clipper.  Mesa's clipping algorithms are imported to run on
+ * this unit. The fixed function part performs cliptesting against
+ * the 6 fixed clipplanes and makes decisions on whether or not the
+ * incoming primitive needs to be passed to a thread for clipping.
+ * User clip planes are handled via cooperation with the VS thread.
+ *
+ * SF - Strips/Fans or Setup: Triangles are prepared for
+ * rasterization. Interpolation coefficients are calculated.
+ * Flatshading and two-side lighting usually performed here.
+ *
+ * WM - Windower. Interpolation of vertex attributes performed here.
+ * Fragment shader implemented here. SIMD aspects of EU taken full
+ * advantage of, as pixels are processed in blocks of 16.
+ *
+ * CC - Color Calculator. No EU threads associated with this unit.
+ * Handles blending and (presumably) depth and stencil testing.
+ */
+
+#define BRW_MAX_CURBE (32*16)
+
+
+/* Need a value to say a particular vertex shader output isn't
+ * present. Limits us to 63 outputs currently.
+ */
+#define BRW_OUTPUT_NOT_PRESENT ((1<<6)-1)
+
+
+struct brw_context;
+
+struct brw_depth_stencil_state {
+ /* Precalculated hardware state:
+ */
+ struct brw_cc0 cc0;
+ struct brw_cc1 cc1;
+ struct brw_cc2 cc2;
+ struct brw_cc3 cc3;
+ struct brw_cc7 cc7;
+
+ unsigned iz_lookup;
+};
+
+
+struct brw_blend_state {
+ /* Precalculated hardware state:
+ */
+ struct brw_cc2 cc2;
+ struct brw_cc3 cc3;
+ struct brw_cc5 cc5;
+ struct brw_cc6 cc6;
+
+ struct brw_surf_ss0 ss0;
+};
+
+
+struct brw_rasterizer_state;
+
+struct brw_immediate_data {
+ unsigned nr;
+ float (*data)[4];
+};
+
+struct brw_vertex_shader {
+ const struct tgsi_token *tokens;
+ struct brw_winsys_buffer *const_buffer; /** Program constant buffer/surface */
+
+ struct tgsi_shader_info info;
+ struct brw_immediate_data immediates;
+
+ GLuint has_flow_control:1;
+ GLuint use_const_buffer:1;
+
+ /* Offsets of special vertex shader outputs required for clipping.
+ */
+ GLuint output_hpos:6; /* not always zero? */
+ GLuint output_color0:6;
+ GLuint output_color1:6;
+ GLuint output_bfc0:6;
+ GLuint output_bfc1:6;
+ GLuint output_edgeflag:6;
+
+ unsigned id;
+};
+
+struct brw_fs_signature {
+ GLuint nr_inputs;
+ struct {
+ GLuint interp:3; /* TGSI_INTERPOLATE_x */
+ GLuint semantic:5; /* TGSI_SEMANTIC_x */
+ GLuint semantic_index:24;
+ } input[PIPE_MAX_SHADER_INPUTS];
+};
+
+#define brw_fs_signature_size(s) (offsetof(struct brw_fs_signature, input) + \
+ ((s)->nr_inputs * sizeof (s)->input[0]))
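+
+/* Worked example (illustrative only): assuming the three bitfields of
+ * each input pack into a single 32-bit word and there is no trailing
+ * padding, a signature with two inputs occupies
+ *
+ *    offsetof(struct brw_fs_signature, input)      4 bytes (nr_inputs)
+ *  + 2 * sizeof (s)->input[0]                      8 bytes
+ *                                               = 12 bytes
+ *
+ * i.e. only the inputs actually present contribute to the size.
+ */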
+
+
+struct brw_fragment_shader {
+ const struct tgsi_token *tokens;
+ struct tgsi_shader_info info;
+
+ struct brw_fs_signature signature;
+ struct brw_immediate_data immediates;
+
+ unsigned iz_lookup;
+ /*unsigned wm_lookup;*/
+
+ unsigned uses_depth:1;
+ unsigned has_flow_control:1;
+
+ unsigned id;
+ struct brw_winsys_buffer *const_buffer; /** Program constant buffer/surface */
+ GLboolean use_const_buffer;
+};
+
+
+struct brw_sampler {
+ struct brw_ss0 ss0;
+ struct brw_ss1 ss1;
+ float border_color[4];
+ struct brw_ss3 ss3;
+};
+
+
+
+#define PIPE_NEW_DEPTH_STENCIL_ALPHA 0x1
+#define PIPE_NEW_RAST 0x2
+#define PIPE_NEW_BLEND 0x4
+#define PIPE_NEW_VIEWPORT 0x8
+#define PIPE_NEW_SAMPLERS 0x10
+#define PIPE_NEW_VERTEX_BUFFER 0x20
+#define PIPE_NEW_VERTEX_ELEMENT 0x40
+#define PIPE_NEW_FRAGMENT_SHADER 0x80
+#define PIPE_NEW_VERTEX_SHADER 0x100
+#define PIPE_NEW_FRAGMENT_CONSTANTS 0x200
+#define PIPE_NEW_VERTEX_CONSTANTS 0x400
+#define PIPE_NEW_CLIP 0x800
+#define PIPE_NEW_INDEX_BUFFER 0x1000
+#define PIPE_NEW_INDEX_RANGE 0x2000
+#define PIPE_NEW_BLEND_COLOR 0x4000
+#define PIPE_NEW_POLYGON_STIPPLE 0x8000
+#define PIPE_NEW_FRAMEBUFFER_DIMENSIONS 0x10000
+#define PIPE_NEW_DEPTH_BUFFER 0x20000
+#define PIPE_NEW_COLOR_BUFFERS 0x40000
+#define PIPE_NEW_QUERY 0x80000
+#define PIPE_NEW_SCISSOR 0x100000
+#define PIPE_NEW_BOUND_TEXTURES 0x200000
+#define PIPE_NEW_NR_CBUFS 0x400000
+#define PIPE_NEW_FRAGMENT_SIGNATURE 0x800000
+
+
+
+#define BRW_NEW_URB_FENCE 0x1
+#define BRW_NEW_FRAGMENT_PROGRAM 0x2
+#define BRW_NEW_VERTEX_PROGRAM 0x4
+#define BRW_NEW_INPUT_DIMENSIONS 0x8
+#define BRW_NEW_CURBE_OFFSETS 0x10
+#define BRW_NEW_REDUCED_PRIMITIVE 0x20
+#define BRW_NEW_PRIMITIVE 0x40
+#define BRW_NEW_CONTEXT 0x80
+#define BRW_NEW_WM_INPUT_DIMENSIONS 0x100
+#define BRW_NEW_PSP 0x800
+#define BRW_NEW_WM_SURFACES 0x1000
+#define BRW_NEW_xxx 0x2000 /* was FENCE */
+#define BRW_NEW_INDICES 0x4000
+
+/**
+ * Used for any batch entry with a relocated pointer that will be used
+ * by any 3D rendering. Need to re-emit these fresh in each
+ * batchbuffer as the referenced buffers may be relocated in the
+ * meantime.
+ */
+#define BRW_NEW_BATCH 0x10000
+#define BRW_NEW_NR_WM_SURFACES 0x40000
+#define BRW_NEW_NR_VS_SURFACES 0x80000
+#define BRW_NEW_INDEX_BUFFER 0x100000
+
+struct brw_state_flags {
+ /** State update flags signalled by mesa internals */
+ GLuint mesa;
+ /**
+ * State update flags signalled as the result of brw_tracked_state updates
+ */
+ GLuint brw;
+ /** State update flags signalled by brw_state_cache.c searches */
+ GLuint cache;
+};
+
+
+
+/* Data about a particular attempt to compile a program. Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs:
+ */
+struct brw_wm_prog_data {
+ GLuint curb_read_length;
+ GLuint urb_read_length;
+
+ GLuint first_curbe_grf;
+ GLuint total_grf;
+ GLuint total_scratch;
+
+ GLuint nr_params; /**< number of float params/constants */
+ GLboolean error;
+
+ /* Pointer to tracked values (only valid once
+ * _mesa_load_state_parameters has been called at runtime).
+ */
+ const GLfloat *param[BRW_MAX_CURBE];
+};
+
+struct brw_sf_prog_data {
+ GLuint urb_read_length;
+ GLuint total_grf;
+
+ /* Each vertex may have up to 12 attributes, 4 components each,
+ * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11
+ * rows.
+ *
+ * Actually we use 4 for each, so call it 12 rows.
+ */
+ GLuint urb_entry_size;
+};
+
+
+struct brw_clip_prog_data;
+
+struct brw_gs_prog_data {
+ GLuint urb_read_length;
+ GLuint total_grf;
+};
+
+struct brw_vs_prog_data {
+ GLuint curb_read_length;
+ GLuint urb_read_length;
+ GLuint total_grf;
+
+ GLuint nr_outputs;
+ GLuint nr_inputs;
+
+ GLuint nr_params; /**< number of TGSI_FILE_CONSTANT's */
+
+ GLboolean writes_psiz;
+
+ /* Used for calculating urb partitions:
+ */
+ GLuint urb_entry_size;
+};
+
+
+/* Size == 0 if output either not written, or always [0,0,0,1]
+ */
+struct brw_vs_ouput_sizes {
+ GLubyte output_size[PIPE_MAX_SHADER_OUTPUTS];
+};
+
+
+/** Number of texture sampler units */
+#define BRW_MAX_TEX_UNIT 16
+
+/** Max number of render targets in a shader */
+#define BRW_MAX_DRAW_BUFFERS 4
+
+/**
+ * Size of our surface binding table for the WM.
+ * This contains pointers to the drawing surfaces, the current texture
+ * objects and the fragment shader constant buffer (+1).
+ */
+#define BRW_WM_MAX_SURF (BRW_MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
+
+/**
+ * Helpers to convert drawing buffers, textures and constant buffers
+ * to surface binding table indexes, for WM.
+ */
+#define BTI_COLOR_BUF(d) (d)
+#define BTI_FRAGMENT_CONSTANTS (BRW_MAX_DRAW_BUFFERS)
+#define BTI_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 1 + (t))
+
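+/* With BRW_MAX_DRAW_BUFFERS == 4 and BRW_MAX_TEX_UNIT == 16 the WM
+ * binding table therefore looks like (illustrative only):
+ *
+ *   index 0..3    color buffers        BTI_COLOR_BUF(0..3)
+ *   index 4       fragment constants   BTI_FRAGMENT_CONSTANTS
+ *   index 5..20   textures             BTI_TEXTURE(0..15)
+ *
+ * which accounts for all BRW_WM_MAX_SURF (21) entries.
+ */
+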
+/**
+ * Size of surface binding table for the VS.
+ * Only one constant buffer for now.
+ */
+#define BRW_VS_MAX_SURF 1
+
+/**
+ * Only a VS constant buffer
+ */
+#define SURF_INDEX_VERT_CONST_BUFFER 0
+
+
+/* Bit of a hack to align these with the winsys buffer_data_type enum.
+ */
+enum brw_cache_id {
+ BRW_CC_VP = BRW_DATA_GS_CC_VP,
+ BRW_CC_UNIT = BRW_DATA_GS_CC_UNIT,
+ BRW_WM_PROG = BRW_DATA_GS_WM_PROG,
+ BRW_SAMPLER_DEFAULT_COLOR = BRW_DATA_GS_SAMPLER_DEFAULT_COLOR,
+ BRW_SAMPLER = BRW_DATA_GS_SAMPLER,
+ BRW_WM_UNIT = BRW_DATA_GS_WM_UNIT,
+ BRW_SF_PROG = BRW_DATA_GS_SF_PROG,
+ BRW_SF_VP = BRW_DATA_GS_SF_VP,
+ BRW_SF_UNIT = BRW_DATA_GS_SF_UNIT,
+ BRW_VS_UNIT = BRW_DATA_GS_VS_UNIT,
+ BRW_VS_PROG = BRW_DATA_GS_VS_PROG,
+ BRW_GS_UNIT = BRW_DATA_GS_GS_UNIT,
+ BRW_GS_PROG = BRW_DATA_GS_GS_PROG,
+ BRW_CLIP_VP = BRW_DATA_GS_CLIP_VP,
+ BRW_CLIP_UNIT = BRW_DATA_GS_CLIP_UNIT,
+ BRW_CLIP_PROG = BRW_DATA_GS_CLIP_PROG,
+ BRW_SS_SURFACE = BRW_DATA_SS_SURFACE,
+ BRW_SS_SURF_BIND = BRW_DATA_SS_SURF_BIND,
+
+ BRW_MAX_CACHE
+};
+
+struct brw_cache_item {
+ /**
+ * Effectively part of the key, cache_id identifies what kind of state
+ * buffer is involved, and also which brw->state.dirty.cache flag should
+ * be set when this cache item is chosen.
+ */
+ enum brw_cache_id cache_id;
+ /** 32-bit hash of the key data */
+ GLuint hash;
+ GLuint key_size; /* for variable-sized keys */
+ const void *key;
+ struct brw_winsys_reloc *relocs;
+ GLuint nr_relocs;
+
+ struct brw_winsys_buffer *bo;
+ GLuint data_size;
+
+ struct brw_cache_item *next;
+};
+
+
+
+struct brw_cache {
+ struct brw_context *brw;
+ struct brw_winsys_screen *sws;
+
+ struct brw_cache_item **items;
+ GLuint size, n_items;
+
+ enum brw_buffer_type buffer_type;
+
+ GLuint key_size[BRW_MAX_CACHE]; /* for fixed-size keys */
+ GLuint aux_size[BRW_MAX_CACHE];
+ char *name[BRW_MAX_CACHE];
+
+
+ /* Record of the last BOs chosen for each cache_id. Used to set
+ * brw->state.dirty.cache when a new cache item is chosen.
+ */
+ struct brw_winsys_buffer *last_bo[BRW_MAX_CACHE];
+};
+
+
+struct brw_tracked_state {
+ struct brw_state_flags dirty;
+ int (*prepare)( struct brw_context *brw );
+ int (*emit)( struct brw_context *brw );
+};
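+
+/* A minimal sketch (illustrative, not used here) of how an atom's
+ * dirty mask can be matched against the accumulated flags before its
+ * prepare()/emit() hooks run; the real scheduling code lives elsewhere
+ * in the driver:
+ *
+ *    static INLINE GLboolean
+ *    check_state(const struct brw_state_flags *a,
+ *                const struct brw_state_flags *b)
+ *    {
+ *       return ((a->mesa  & b->mesa) ||
+ *               (a->brw   & b->brw) ||
+ *               (a->cache & b->cache));
+ *    }
+ */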
+
+/* Flags for brw->state.cache.
+ */
+#define CACHE_NEW_CC_VP (1<<BRW_CC_VP)
+#define CACHE_NEW_CC_UNIT (1<<BRW_CC_UNIT)
+#define CACHE_NEW_WM_PROG (1<<BRW_WM_PROG)
+#define CACHE_NEW_SAMPLER_DEFAULT_COLOR (1<<BRW_SAMPLER_DEFAULT_COLOR)
+#define CACHE_NEW_SAMPLER (1<<BRW_SAMPLER)
+#define CACHE_NEW_WM_UNIT (1<<BRW_WM_UNIT)
+#define CACHE_NEW_SF_PROG (1<<BRW_SF_PROG)
+#define CACHE_NEW_SF_VP (1<<BRW_SF_VP)
+#define CACHE_NEW_SF_UNIT (1<<BRW_SF_UNIT)
+#define CACHE_NEW_VS_UNIT (1<<BRW_VS_UNIT)
+#define CACHE_NEW_VS_PROG (1<<BRW_VS_PROG)
+#define CACHE_NEW_GS_UNIT (1<<BRW_GS_UNIT)
+#define CACHE_NEW_GS_PROG (1<<BRW_GS_PROG)
+#define CACHE_NEW_CLIP_VP (1<<BRW_CLIP_VP)
+#define CACHE_NEW_CLIP_UNIT (1<<BRW_CLIP_UNIT)
+#define CACHE_NEW_CLIP_PROG (1<<BRW_CLIP_PROG)
+#define CACHE_NEW_SURFACE (1<<BRW_SS_SURFACE)
+#define CACHE_NEW_SURF_BIND (1<<BRW_SS_SURF_BIND)
+
+struct brw_cached_batch_item {
+ struct header *header;
+ GLuint sz;
+ struct brw_cached_batch_item *next;
+};
+
+
+
+/* Protect against a future where VERT_ATTRIB_MAX > 32. Wouldn't life
+ * be easier if C allowed arrays of packed elements?
+ */
+#define VS_INPUT_BITMASK_DWORDS ((PIPE_MAX_SHADER_INPUTS+31)/32)
+
+
+
+
+struct brw_vertex_info {
+ GLuint sizes[VS_INPUT_BITMASK_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
+};
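+
+/* The sizes[] array packs one 2-bit size per vertex attribute, sixteen
+ * attributes per dword.  A hypothetical accessor (illustrative only)
+ * would look like:
+ *
+ *    static INLINE GLuint
+ *    get_input_size(const struct brw_vertex_info *info, GLuint attr)
+ *    {
+ *       return (info->sizes[attr / 16] >> ((attr % 16) * 2)) & 0x3;
+ *    }
+ */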
+
+
+struct brw_query_object {
+ /** Doubly linked list of active query objects in the context. */
+ struct brw_query_object *prev, *next;
+
+ /** Last query BO associated with this query. */
+ struct brw_winsys_buffer *bo;
+ /** First index in bo with query data for this object. */
+ int first_index;
+ /** Last index in bo with query data for this object. */
+ int last_index;
+
+ /* Total count of pixels from previous BOs */
+ uint64_t result;
+};
+
+#define CC_RELOC_VP 0
+
+
+/**
+ * brw_context is derived from pipe_context
+ */
+struct brw_context
+{
+ struct pipe_context base;
+ struct brw_chipset chipset;
+
+ struct brw_winsys_screen *sws;
+
+ struct brw_batchbuffer *batch;
+
+ GLuint primitive;
+ GLuint reduced_primitive;
+
+ /* Active state from the state tracker:
+ */
+ struct {
+ struct brw_vertex_shader *vertex_shader;
+ struct brw_fragment_shader *fragment_shader;
+ const struct brw_blend_state *blend;
+ const struct brw_rasterizer_state *rast;
+ const struct brw_depth_stencil_state *zstencil;
+
+ const struct brw_sampler *sampler[PIPE_MAX_SAMPLERS];
+ unsigned num_samplers;
+
+ struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+ struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+ struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+ unsigned num_vertex_elements;
+ unsigned num_textures;
+ unsigned num_vertex_buffers;
+
+ struct pipe_scissor_state scissor;
+ struct pipe_viewport_state viewport;
+ struct pipe_framebuffer_state fb;
+ struct pipe_clip_state ucp;
+ struct pipe_buffer *vertex_constants;
+ struct pipe_buffer *fragment_constants;
+
+ struct brw_blend_constant_color bcc;
+ struct brw_polygon_stipple bps;
+ struct brw_cc_viewport ccv;
+
+ /**
+ * Index buffer for this draw_prims call.
+ *
+ * Updates are signaled by PIPE_NEW_INDEX_BUFFER.
+ */
+ struct pipe_buffer *index_buffer;
+ unsigned index_size;
+
+ /* Updates are signalled by PIPE_NEW_INDEX_RANGE:
+ */
+ unsigned min_index;
+ unsigned max_index;
+
+ } curr;
+
+ struct {
+ struct brw_state_flags dirty;
+
+ /**
+ * List of buffers accumulated in brw_validate_state to receive
+ * dri_bo_check_aperture treatment before exec, so we can know if we
+ * should flush the batch and try again before emitting primitives.
+ *
+ * This can be a fixed number as we only have a limited number of
+ * objects referenced from the batchbuffer in a primitive emit,
+ * consisting of the vertex buffers, pipelined state pointers,
+ * the CURBE, the depth buffer, and a query BO.
+ */
+ struct brw_winsys_buffer *validated_bos[PIPE_MAX_SHADER_INPUTS + 16];
+ int validated_bo_count;
+ } state;
+
+ struct brw_cache cache; /** non-surface items */
+ struct brw_cache surface_cache; /* surface items */
+ struct brw_cached_batch_item *cached_batch_items;
+
+ struct {
+ struct u_upload_mgr *upload_vertex;
+ struct u_upload_mgr *upload_index;
+
+ /* Information on uploaded vertex buffers:
+ */
+ struct {
+ unsigned stride; /* in bytes between successive vertices */
+ unsigned offset; /* in bytes, of first vertex in bo */
+ unsigned vertex_count; /* count of valid vertices which may be accessed */
+ struct brw_winsys_buffer *bo;
+ } vb[PIPE_MAX_ATTRIBS];
+
+ unsigned nr_vb; /* currently the same as curr.num_vertex_buffers */
+ } vb;
+
+ struct {
+ /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
+ struct brw_winsys_buffer *bo;
+ unsigned int offset;
+ unsigned int size;
+ /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
+ * avoid re-uploading the IB packet over and over if we're actually
+ * referencing the same index buffer.
+ */
+ unsigned int start_vertex_offset;
+ } ib;
+
+
+ /* BRW_NEW_URB_ALLOCATIONS:
+ */
+ struct {
+ GLuint vsize; /* vertex size plus header in urb registers */
+ GLuint csize; /* constant buffer size in urb registers */
+ GLuint sfsize; /* setup data size in urb registers */
+
+ GLboolean constrained;
+
+ GLuint nr_vs_entries;
+ GLuint nr_gs_entries;
+ GLuint nr_clip_entries;
+ GLuint nr_sf_entries;
+ GLuint nr_cs_entries;
+
+ GLuint vs_start;
+ GLuint gs_start;
+ GLuint clip_start;
+ GLuint sf_start;
+ GLuint cs_start;
+ } urb;
+
+
+ /* BRW_NEW_CURBE_OFFSETS:
+ */
+ struct {
+ GLuint wm_start; /**< pos of first wm const in CURBE buffer */
+ GLuint wm_size; /**< number of float[4] consts, multiple of 16 */
+ GLuint clip_start;
+ GLuint clip_size;
+ GLuint vs_start;
+ GLuint vs_size;
+ GLuint total_size;
+
+ struct brw_winsys_buffer *curbe_bo;
+ /** Offset within curbe_bo of space for current curbe entry */
+ GLuint curbe_offset;
+ /** Offset within curbe_bo of space for next curbe entry */
+ GLuint curbe_next_offset;
+
+ GLfloat *last_buf;
+ GLuint last_bufsz;
+ /**
+ * Whether we should create a new bo instead of reusing the old one
+ * (if we just dispatched the batch pointing at the old one).
+ */
+ GLboolean need_new_bo;
+ } curbe;
+
+ struct {
+ struct brw_vs_prog_data *prog_data;
+
+ struct brw_winsys_buffer *prog_bo;
+ struct brw_winsys_buffer *state_bo;
+
+ /** Binding table of pointers to surf_bo entries */
+ struct brw_winsys_buffer *bind_bo;
+ struct brw_winsys_buffer *surf_bo[BRW_VS_MAX_SURF];
+ GLuint nr_surfaces;
+ } vs;
+
+ struct {
+ struct brw_gs_prog_data *prog_data;
+
+ GLboolean prog_active;
+ struct brw_winsys_buffer *prog_bo;
+ struct brw_winsys_buffer *state_bo;
+ } gs;
+
+ struct {
+ struct brw_clip_prog_data *prog_data;
+
+ struct brw_winsys_buffer *prog_bo;
+ struct brw_winsys_buffer *state_bo;
+ struct brw_winsys_buffer *vp_bo;
+ } clip;
+
+
+ struct {
+ struct brw_sf_prog_data *prog_data;
+
+ struct brw_winsys_buffer *prog_bo;
+ struct brw_winsys_buffer *state_bo;
+ struct brw_winsys_buffer *vp_bo;
+ } sf;
+
+ struct {
+ struct brw_wm_prog_data *prog_data;
+ struct brw_wm_compile *compile_data;
+
+ /** Input sizes, calculated from active vertex program.
+ * One bit per fragment program input attribute.
+ */
+ /*GLbitfield input_size_masks[4];*/
+
+ /** Array of surface default colors (texture border color) */
+ struct brw_winsys_buffer *sdc_bo[BRW_MAX_TEX_UNIT];
+
+ GLuint render_surf;
+ GLuint nr_surfaces;
+
+ GLuint max_threads;
+ struct brw_winsys_buffer *scratch_bo;
+
+ GLuint sampler_count;
+ struct brw_winsys_buffer *sampler_bo;
+
+ /** Binding table of pointers to surf_bo entries */
+ struct brw_winsys_buffer *bind_bo;
+ struct brw_winsys_buffer *surf_bo[BRW_WM_MAX_SURF];
+
+ struct brw_winsys_buffer *prog_bo;
+ struct brw_winsys_buffer *state_bo;
+ } wm;
+
+
+ struct {
+ struct brw_winsys_buffer *state_bo;
+
+ struct brw_cc_unit_state cc;
+ struct brw_winsys_reloc reloc[1];
+ } cc;
+
+ struct {
+ struct brw_query_object active_head;
+ struct brw_winsys_buffer *bo;
+ int index;
+ GLboolean active;
+ int stats_wm;
+ } query;
+
+ struct {
+ unsigned always_emit_state:1;
+ unsigned always_flush_batch:1;
+ unsigned force_swtnl:1;
+ unsigned no_swtnl:1;
+ } flags;
+
+ /* Used to give every program string a unique id
+ */
+ GLuint program_id;
+};
+
+
+
+/*======================================================================
+ * brw_queryobj.c
+ */
+void brw_init_query(struct brw_context *brw);
+enum pipe_error brw_prepare_query_begin(struct brw_context *brw);
+void brw_emit_query_begin(struct brw_context *brw);
+void brw_emit_query_end(struct brw_context *brw);
+
+/*======================================================================
+ * brw_state_dump.c
+ */
+void brw_debug_batch(struct brw_context *intel);
+
+
+/*======================================================================
+ * brw_pipe_*.c
+ */
+void brw_pipe_blend_init( struct brw_context *brw );
+void brw_pipe_depth_stencil_init( struct brw_context *brw );
+void brw_pipe_framebuffer_init( struct brw_context *brw );
+void brw_pipe_flush_init( struct brw_context *brw );
+void brw_pipe_misc_init( struct brw_context *brw );
+void brw_pipe_query_init( struct brw_context *brw );
+void brw_pipe_rast_init( struct brw_context *brw );
+void brw_pipe_sampler_init( struct brw_context *brw );
+void brw_pipe_shader_init( struct brw_context *brw );
+void brw_pipe_vertex_init( struct brw_context *brw );
+void brw_pipe_clear_init( struct brw_context *brw );
+
+
+void brw_pipe_blend_cleanup( struct brw_context *brw );
+void brw_pipe_depth_stencil_cleanup( struct brw_context *brw );
+void brw_pipe_framebuffer_cleanup( struct brw_context *brw );
+void brw_pipe_flush_cleanup( struct brw_context *brw );
+void brw_pipe_misc_cleanup( struct brw_context *brw );
+void brw_pipe_query_cleanup( struct brw_context *brw );
+void brw_pipe_rast_cleanup( struct brw_context *brw );
+void brw_pipe_sampler_cleanup( struct brw_context *brw );
+void brw_pipe_shader_cleanup( struct brw_context *brw );
+void brw_pipe_vertex_cleanup( struct brw_context *brw );
+void brw_pipe_clear_cleanup( struct brw_context *brw );
+
+void brw_hw_cc_init( struct brw_context *brw );
+void brw_hw_cc_cleanup( struct brw_context *brw );
+
+
+
+void brw_context_flush( struct brw_context *brw );
+
+
+/* brw_urb.c
+ */
+int brw_upload_urb_fence(struct brw_context *brw);
+
+/* brw_curbe.c
+ */
+int brw_upload_cs_urb_state(struct brw_context *brw);
+
+
+/*======================================================================
+ * Inline conversion functions. These are better-typed than the
+ * macros used previously:
+ */
+static INLINE struct brw_context *
+brw_context( struct pipe_context *ctx )
+{
+ return (struct brw_context *)ctx;
+}
+
+
+#define BRW_IS_965(brw) ((brw)->chipset.is_965)
+#define BRW_IS_IGDNG(brw) ((brw)->chipset.is_igdng)
+#define BRW_IS_G4X(brw) ((brw)->chipset.is_g4x)
+
+
+#endif
+
diff --git a/src/gallium/drivers/i965/brw_curbe.c b/src/gallium/drivers/i965/brw_curbe.c
new file mode 100644
index 00000000000..3f031577d5a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_curbe.c
@@ -0,0 +1,390 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_state.h"
+#include "brw_util.h"
+#include "brw_debug.h"
+#include "brw_screen.h"
+
+
+/**
+ * Partition the CURBE between the various users of constant values.
+ * Note that vertex and fragment shaders can now fetch constants out
+ * of constant buffers. We no longer allocate a block of the GRF for
+ * constants. That greatly reduces the demand for space in the CURBE.
+ * Some of the comments within are dated...
+ */
+static int calculate_curbe_offsets( struct brw_context *brw )
+{
+ /* CACHE_NEW_WM_PROG */
+ const GLuint nr_fp_regs = brw->wm.prog_data->curb_read_length;
+
+ /* BRW_NEW_VERTEX_PROGRAM */
+ const GLuint nr_vp_regs = brw->vs.prog_data->curb_read_length;
+ GLuint nr_clip_regs = 0;
+ GLuint total_regs;
+
+ /* PIPE_NEW_CLIP */
+ if (brw->curr.ucp.nr) {
+ GLuint nr_planes = 6 + brw->curr.ucp.nr;
+ nr_clip_regs = (nr_planes * 4 + 15) / 16;
+ }
+
+
+ total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
+
+ /* When this is > 32, we want to use a true constant buffer to hold
+ * the extra constants.
+ */
+ assert(total_regs <= 32);
+
+ /* Lazy resize:
+ */
+ if (nr_fp_regs > brw->curbe.wm_size ||
+ nr_vp_regs > brw->curbe.vs_size ||
+ nr_clip_regs != brw->curbe.clip_size ||
+ (total_regs < brw->curbe.total_size / 4 &&
+ brw->curbe.total_size > 16)) {
+
+ GLuint reg = 0;
+
+ /* Calculate a new layout:
+ */
+ reg = 0;
+ brw->curbe.wm_start = reg;
+ brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
+ brw->curbe.clip_start = reg;
+ brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
+ brw->curbe.vs_start = reg;
+ brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
+ brw->curbe.total_size = reg;
+
+ if (BRW_DEBUG & DEBUG_CURBE)
+ debug_printf("curbe wm %d+%d clip %d+%d vs %d+%d\n",
+ brw->curbe.wm_start,
+ brw->curbe.wm_size,
+ brw->curbe.clip_start,
+ brw->curbe.clip_size,
+ brw->curbe.vs_start,
+ brw->curbe.vs_size );
+
+ brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS;
+ }
+
+ return 0;
+}
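+
+/* For example (illustrative only), with nr_fp_regs = 2, nr_clip_regs = 2
+ * and nr_vp_regs = 4 the layout computed above is:
+ *
+ *    wm_start   = 0   wm_size   = 2
+ *    clip_start = 2   clip_size = 2
+ *    vs_start   = 4   vs_size   = 4
+ *    total_size = 8 CURBE registers (each 16 floats, i.e. 64 bytes)
+ */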
+
+
+const struct brw_tracked_state brw_curbe_offsets = {
+ .dirty = {
+ .mesa = PIPE_NEW_CLIP,
+ .brw = BRW_NEW_VERTEX_PROGRAM,
+ .cache = CACHE_NEW_WM_PROG
+ },
+ .prepare = calculate_curbe_offsets
+};
+
+
+
+
+/* Define the number of curbes within CS's urb allocation. Multiple
+ * urb entries -> multiple curbes. These will be used by
+ * fixed-function hardware in a double-buffering scheme to avoid a
+ * pipeline stall each time the contents of the curbe are changed.
+ */
+int brw_upload_cs_urb_state(struct brw_context *brw)
+{
+ struct brw_cs_urb_state cs_urb;
+ memset(&cs_urb, 0, sizeof(cs_urb));
+
+ /* It appears that this is the state packet for the CS unit, i.e. the
+ * urb entries detailed here are housed in the CS range from the
+ * URB_FENCE command.
+ */
+ cs_urb.header.opcode = CMD_CS_URB_STATE;
+ cs_urb.header.length = sizeof(cs_urb)/4 - 2;
+
+ /* BRW_NEW_URB_FENCE */
+ cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
+ cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
+
+ assert(brw->urb.nr_cs_entries);
+ BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
+ return 0;
+}
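+
+/* For example (illustrative only): with brw->urb.nr_cs_entries == 2 and
+ * brw->urb.csize == 2, the packet programs two CURBE entries of two URB
+ * rows each (the hardware field holds the entry size minus one, hence
+ * the csize - 1 above).
+ */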
+
+static GLfloat fixed_plane[6][4] = {
+ { 0, 0, -1, 1 },
+ { 0, 0, 1, 1 },
+ { 0, -1, 0, 1 },
+ { 0, 1, 0, 1 },
+ {-1, 0, 0, 1 },
+ { 1, 0, 0, 1 }
+};
+
+/* Upload a new set of constants. There is too much variability for
+ * these to go through the cache mechanism, but they might benefit from
+ * a comparison against the currently uploaded set of constants.
+ */
+static enum pipe_error prepare_curbe_buffer(struct brw_context *brw)
+{
+ struct pipe_screen *screen = brw->base.screen;
+ const GLuint sz = brw->curbe.total_size;
+ const GLuint bufsz = sz * 16 * sizeof(GLfloat);
+ enum pipe_error ret;
+ GLfloat *buf;
+ GLuint i;
+
+ if (sz == 0) {
+ if (brw->curbe.last_buf) {
+ FREE(brw->curbe.last_buf);
+ brw->curbe.last_buf = NULL;
+ brw->curbe.last_bufsz = 0;
+ }
+ return 0;
+ }
+
+ /* Staging copy of the new constants; freed or stashed in last_buf below. */
+ buf = (GLfloat *) CALLOC(bufsz, 1);
+ if (!buf)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ /* fragment shader constants */
+ if (brw->curbe.wm_size) {
+ const struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+ GLuint offset = brw->curbe.wm_start * 16;
+ GLuint nr_immediate, nr_const;
+
+ nr_immediate = fs->immediates.nr;
+ if (nr_immediate) {
+ memcpy(&buf[offset],
+ fs->immediates.data,
+ nr_immediate * 4 * sizeof(float));
+
+ offset += nr_immediate * 4;
+ }
+
+ nr_const = fs->info.file_max[TGSI_FILE_CONSTANT] + 1;
+/* nr_const = brw->wm.prog_data->nr_params; */
+ if (nr_const) {
+ const GLfloat *value = screen->buffer_map( screen,
+ brw->curr.fragment_constants,
+ PIPE_BUFFER_USAGE_CPU_READ);
+
+ memcpy(&buf[offset], value,
+ nr_const * 4 * sizeof(float));
+
+ screen->buffer_unmap( screen,
+ brw->curr.fragment_constants );
+ }
+ }
+
+
+ /* The clip planes are actually delivered to both CLIP and VS units.
+ * VS uses them to calculate the outcode bitmasks.
+ */
+ if (brw->curbe.clip_size) {
+ GLuint offset = brw->curbe.clip_start * 16;
+ GLuint j;
+
+ /* If any planes are going this way, send them all this way:
+ */
+ for (i = 0; i < 6; i++) {
+ buf[offset + i * 4 + 0] = fixed_plane[i][0];
+ buf[offset + i * 4 + 1] = fixed_plane[i][1];
+ buf[offset + i * 4 + 2] = fixed_plane[i][2];
+ buf[offset + i * 4 + 3] = fixed_plane[i][3];
+ }
+
+ /* Clip planes:
+ */
+ assert(brw->curr.ucp.nr <= 6);
+ for (j = 0; j < brw->curr.ucp.nr; j++) {
+ buf[offset + i * 4 + 0] = brw->curr.ucp.ucp[j][0];
+ buf[offset + i * 4 + 1] = brw->curr.ucp.ucp[j][1];
+ buf[offset + i * 4 + 2] = brw->curr.ucp.ucp[j][2];
+ buf[offset + i * 4 + 3] = brw->curr.ucp.ucp[j][3];
+ i++;
+ }
+ }
+
+ /* vertex shader constants */
+ if (brw->curbe.vs_size) {
+ GLuint offset = brw->curbe.vs_start * 16;
+ const struct brw_vertex_shader *vs = brw->curr.vertex_shader;
+ GLuint nr_immediate, nr_const;
+
+ nr_immediate = vs->immediates.nr;
+ if (nr_immediate) {
+ memcpy(&buf[offset],
+ vs->immediates.data,
+ nr_immediate * 4 * sizeof(float));
+
+ offset += nr_immediate * 4;
+ }
+
+ nr_const = vs->info.file_max[TGSI_FILE_CONSTANT] + 1;
+ if (nr_const) {
+ /* XXX: note that constant buffers are currently *already* in
+ * buffer objects. If we want to keep on putting them into the
+ * curbe, it makes sense to treat constbufs specially with malloc.
+ */
+ const GLfloat *value = screen->buffer_map( screen,
+ brw->curr.vertex_constants,
+ PIPE_BUFFER_USAGE_CPU_READ);
+
+ /* XXX: what if user's constant buffer is too small?
+ */
+ memcpy(&buf[offset], value, nr_const * 4 * sizeof(float));
+
+ screen->buffer_unmap( screen, brw->curr.vertex_constants );
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_CURBE) {
+ for (i = 0; i < sz*16; i+=4)
+ debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+ buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
+
+ debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+ (void *)brw->curbe.last_buf, (void *)buf,
+ bufsz, brw->curbe.last_bufsz,
+ brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
+ }
+
+ if (brw->curbe.curbe_bo != NULL &&
+ brw->curbe.last_buf &&
+ bufsz == brw->curbe.last_bufsz &&
+ memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
+ /* constants have not changed */
+ FREE(buf);
+ }
+ else {
+ /* constants have changed */
+ FREE(brw->curbe.last_buf);
+
+ brw->curbe.last_buf = buf;
+ brw->curbe.last_bufsz = bufsz;
+
+ if (brw->curbe.curbe_bo != NULL &&
+ (brw->curbe.need_new_bo ||
+ brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size))
+ {
+ bo_reference(&brw->curbe.curbe_bo, NULL);
+ }
+
+ if (brw->curbe.curbe_bo == NULL) {
+ /* Allocate a single page for CURBE entries for this
+ * batchbuffer. They're generally around 64 bytes each. We will
+ * discard the curbe buffer after the batch is flushed to
+ * avoid synchronous updates.
+ */
+ ret = brw->sws->bo_alloc(brw->sws,
+ BRW_BUFFER_TYPE_CURBE,
+ 4096, 1 << 6,
+ &brw->curbe.curbe_bo);
+ if (ret)
+ return ret;
+
+ brw->curbe.curbe_next_offset = 0;
+ }
+
+ brw->curbe.curbe_offset = brw->curbe.curbe_next_offset;
+ brw->curbe.curbe_next_offset += bufsz;
+ brw->curbe.curbe_next_offset = align(brw->curbe.curbe_next_offset, 64);
+
+ /* Copy data to the buffer:
+ */
+ brw->sws->bo_subdata(brw->curbe.curbe_bo,
+ BRW_DATA_CONSTANT_BUFFER,
+ brw->curbe.curbe_offset,
+ bufsz,
+ buf,
+ NULL, 0);
+ }
+
+ brw_add_validated_bo(brw, brw->curbe.curbe_bo);
+
+ /* Because this provokes an action (i.e. copying the constants into
+ * the URB), it shouldn't be short-circuited if identical to the
+ * previous time - because e.g. the URB destination may have
+ * changed, or the URB contents may differ from last time.
+ *
+ * Note that the data referred to is actually copied internally,
+ * not just used in place according to passed pointer.
+ *
+ * It appears that the CS unit takes care of using each available
+ * URB entry (Const URB Entry == CURBE) in turn, and issuing
+ * flushes as necessary when doublebuffering of CURBEs isn't
+ * possible.
+ */
+
+ return 0;
+}
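+
+/* Sizing example (illustrative only): with total_size == 8 CURBE
+ * registers, bufsz == 8 * 16 * sizeof(GLfloat) == 512 bytes; successive
+ * uploads are packed into the 4096-byte bo at 64-byte-aligned offsets,
+ * so roughly eight such updates fit before a new bo is needed.
+ */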
+
+static enum pipe_error emit_curbe_buffer(struct brw_context *brw)
+{
+ GLuint sz = brw->curbe.total_size;
+
+ BEGIN_BATCH(2, IGNORE_CLIPRECTS);
+ if (sz == 0) {
+ OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
+ OUT_BATCH(0);
+ } else {
+ OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2));
+ OUT_RELOC(brw->curbe.curbe_bo,
+ BRW_USAGE_STATE,
+ (sz - 1) + brw->curbe.curbe_offset);
+ }
+ ADVANCE_BATCH();
+ return 0;
+}
+
+const struct brw_tracked_state brw_curbe_buffer = {
+ .dirty = {
+ .mesa = (PIPE_NEW_FRAGMENT_CONSTANTS |
+ PIPE_NEW_VERTEX_CONSTANTS |
+ PIPE_NEW_CLIP),
+ .brw = (BRW_NEW_FRAGMENT_PROGRAM |
+ BRW_NEW_VERTEX_PROGRAM |
+ BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
+ BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */
+ BRW_NEW_CURBE_OFFSETS |
+ BRW_NEW_BATCH),
+ .cache = (CACHE_NEW_WM_PROG)
+ },
+ .prepare = prepare_curbe_buffer,
+ .emit = emit_curbe_buffer,
+};
+
diff --git a/src/gallium/drivers/i965/brw_debug.h b/src/gallium/drivers/i965/brw_debug.h
new file mode 100644
index 00000000000..ae8e9254a68
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_debug.h
@@ -0,0 +1,43 @@
+#ifndef BRW_DEBUG_H
+#define BRW_DEBUG_H
+
+/* ================================================================
+ * Debugging:
+ */
+
+#define DEBUG_TEXTURE 0x1
+#define DEBUG_STATE 0x2
+#define DEBUG_IOCTL 0x4
+#define DEBUG_BLIT 0x8
+#define DEBUG_CURBE 0x10
+#define DEBUG_FALLBACKS 0x20
+#define DEBUG_VERBOSE 0x40
+#define DEBUG_BATCH 0x80
+#define DEBUG_PIXEL 0x100
+#define DEBUG_WINSYS 0x200
+#define DEBUG_MIN_URB 0x400
+#define DEBUG_DISASSEM 0x800
+#define DEBUG_unused3 0x1000
+#define DEBUG_SYNC 0x2000
+#define DEBUG_PRIMS 0x4000
+#define DEBUG_VERTS 0x8000
+#define DEBUG_unused4 0x10000
+#define DEBUG_DMA 0x20000
+#define DEBUG_SANITY 0x40000
+#define DEBUG_SLEEP 0x80000
+#define DEBUG_STATS 0x100000
+#define DEBUG_unused5 0x200000
+#define DEBUG_SINGLE_THREAD 0x400000
+#define DEBUG_WM 0x800000
+#define DEBUG_URB 0x1000000
+#define DEBUG_VS 0x2000000
+
+#ifdef DEBUG
+extern int BRW_DEBUG;
+#else
+#define BRW_DEBUG 0
+#endif
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_defines.h b/src/gallium/drivers/i965/brw_defines.h
new file mode 100644
index 00000000000..e201ce4d7ce
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_defines.h
@@ -0,0 +1,847 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_DEFINES_H
+#define BRW_DEFINES_H
+
+/* 3D state:
+ */
+#define _3DOP_3DSTATE_PIPELINED 0x0
+#define _3DOP_3DSTATE_NONPIPELINED 0x1
+#define _3DOP_3DCONTROL 0x2
+#define _3DOP_3DPRIMITIVE 0x3
+
+#define _3DSTATE_PIPELINED_POINTERS 0x00
+#define _3DSTATE_BINDING_TABLE_POINTERS 0x01
+#define _3DSTATE_VERTEX_BUFFERS 0x08
+#define _3DSTATE_VERTEX_ELEMENTS 0x09
+#define _3DSTATE_INDEX_BUFFER 0x0A
+#define _3DSTATE_VF_STATISTICS 0x0B
+#define _3DSTATE_DRAWING_RECTANGLE 0x00
+#define _3DSTATE_CONSTANT_COLOR 0x01
+#define _3DSTATE_SAMPLER_PALETTE_LOAD 0x02
+#define _3DSTATE_CHROMA_KEY 0x04
+#define _3DSTATE_DEPTH_BUFFER 0x05
+#define _3DSTATE_POLY_STIPPLE_OFFSET 0x06
+#define _3DSTATE_POLY_STIPPLE_PATTERN 0x07
+#define _3DSTATE_LINE_STIPPLE 0x08
+#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09
+#define _3DCONTROL 0x00
+
+#define PIPE_CONTROL_NOWRITE 0x00
+#define PIPE_CONTROL_WRITEIMMEDIATE 0x01
+#define PIPE_CONTROL_WRITEDEPTH 0x02
+#define PIPE_CONTROL_WRITETIMESTAMP 0x03
+
+#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00
+#define PIPE_CONTROL_GTTWRITE_GLOBAL 0x01
+
+#define _3DPRIM_POINTLIST 0x01
+#define _3DPRIM_LINELIST 0x02
+#define _3DPRIM_LINESTRIP 0x03
+#define _3DPRIM_TRILIST 0x04
+#define _3DPRIM_TRISTRIP 0x05
+#define _3DPRIM_TRIFAN 0x06
+#define _3DPRIM_QUADLIST 0x07
+#define _3DPRIM_QUADSTRIP 0x08
+#define _3DPRIM_LINELIST_ADJ 0x09
+#define _3DPRIM_LINESTRIP_ADJ 0x0A
+#define _3DPRIM_TRILIST_ADJ 0x0B
+#define _3DPRIM_TRISTRIP_ADJ 0x0C
+#define _3DPRIM_TRISTRIP_REVERSE 0x0D
+#define _3DPRIM_POLYGON 0x0E
+#define _3DPRIM_RECTLIST 0x0F
+#define _3DPRIM_LINELOOP 0x10
+#define _3DPRIM_POINTLIST_BF 0x11
+#define _3DPRIM_LINESTRIP_CONT 0x12
+#define _3DPRIM_LINESTRIP_BF 0x13
+#define _3DPRIM_LINESTRIP_CONT_BF 0x14
+#define _3DPRIM_TRIFAN_NOSTIPPLE 0x15
+
+#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0
+#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM 1
+
+#define BRW_ANISORATIO_2 0
+#define BRW_ANISORATIO_4 1
+#define BRW_ANISORATIO_6 2
+#define BRW_ANISORATIO_8 3
+#define BRW_ANISORATIO_10 4
+#define BRW_ANISORATIO_12 5
+#define BRW_ANISORATIO_14 6
+#define BRW_ANISORATIO_16 7
+
+#define BRW_BLENDFACTOR_ONE 0x1
+#define BRW_BLENDFACTOR_SRC_COLOR 0x2
+#define BRW_BLENDFACTOR_SRC_ALPHA 0x3
+#define BRW_BLENDFACTOR_DST_ALPHA 0x4
+#define BRW_BLENDFACTOR_DST_COLOR 0x5
+#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE 0x6
+#define BRW_BLENDFACTOR_CONST_COLOR 0x7
+#define BRW_BLENDFACTOR_CONST_ALPHA 0x8
+#define BRW_BLENDFACTOR_SRC1_COLOR 0x9
+#define BRW_BLENDFACTOR_SRC1_ALPHA 0x0A
+#define BRW_BLENDFACTOR_ZERO 0x11
+#define BRW_BLENDFACTOR_INV_SRC_COLOR 0x12
+#define BRW_BLENDFACTOR_INV_SRC_ALPHA 0x13
+#define BRW_BLENDFACTOR_INV_DST_ALPHA 0x14
+#define BRW_BLENDFACTOR_INV_DST_COLOR 0x15
+#define BRW_BLENDFACTOR_INV_CONST_COLOR 0x17
+#define BRW_BLENDFACTOR_INV_CONST_ALPHA 0x18
+#define BRW_BLENDFACTOR_INV_SRC1_COLOR 0x19
+#define BRW_BLENDFACTOR_INV_SRC1_ALPHA 0x1A
+
+#define BRW_BLENDFUNCTION_ADD 0
+#define BRW_BLENDFUNCTION_SUBTRACT 1
+#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT 2
+#define BRW_BLENDFUNCTION_MIN 3
+#define BRW_BLENDFUNCTION_MAX 4
+
+#define BRW_ALPHATEST_FORMAT_UNORM8 0
+#define BRW_ALPHATEST_FORMAT_FLOAT32 1
+
+#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH 0
+#define BRW_CHROMAKEY_REPLACE_BLACK 1
+
+#define BRW_CLIP_API_OGL 0
+#define BRW_CLIP_API_DX 1
+
+#define BRW_CLIPMODE_NORMAL 0
+#define BRW_CLIPMODE_CLIP_ALL 1
+#define BRW_CLIPMODE_CLIP_NON_REJECTED 2
+#define BRW_CLIPMODE_REJECT_ALL 3
+#define BRW_CLIPMODE_ACCEPT_ALL 4
+#define BRW_CLIPMODE_KERNEL_CLIP 5
+
+#define BRW_CLIP_NDCSPACE 0
+#define BRW_CLIP_SCREENSPACE 1
+
+#define BRW_COMPAREFUNCTION_ALWAYS 0
+#define BRW_COMPAREFUNCTION_NEVER 1
+#define BRW_COMPAREFUNCTION_LESS 2
+#define BRW_COMPAREFUNCTION_EQUAL 3
+#define BRW_COMPAREFUNCTION_LEQUAL 4
+#define BRW_COMPAREFUNCTION_GREATER 5
+#define BRW_COMPAREFUNCTION_NOTEQUAL 6
+#define BRW_COMPAREFUNCTION_GEQUAL 7
+
+#define BRW_COVERAGE_PIXELS_HALF 0
+#define BRW_COVERAGE_PIXELS_1 1
+#define BRW_COVERAGE_PIXELS_2 2
+#define BRW_COVERAGE_PIXELS_4 3
+
+#define BRW_CULLMODE_BOTH 0
+#define BRW_CULLMODE_NONE 1
+#define BRW_CULLMODE_FRONT 2
+#define BRW_CULLMODE_BACK 3
+
+#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM 0
+#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT 1
+
+#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT 0
+#define BRW_DEPTHFORMAT_D32_FLOAT 1
+#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT 2
+#define BRW_DEPTHFORMAT_D16_UNORM 5
+
+#define BRW_FLOATING_POINT_IEEE_754 0
+#define BRW_FLOATING_POINT_NON_IEEE_754 1
+
+#define BRW_FRONTWINDING_CW 0
+#define BRW_FRONTWINDING_CCW 1
+
+#define BRW_SPRITE_POINT_ENABLE 16
+
+#define BRW_INDEX_BYTE 0
+#define BRW_INDEX_WORD 1
+#define BRW_INDEX_DWORD 2
+
+#define BRW_LOGICOPFUNCTION_CLEAR 0
+#define BRW_LOGICOPFUNCTION_NOR 1
+#define BRW_LOGICOPFUNCTION_AND_INVERTED 2
+#define BRW_LOGICOPFUNCTION_COPY_INVERTED 3
+#define BRW_LOGICOPFUNCTION_AND_REVERSE 4
+#define BRW_LOGICOPFUNCTION_INVERT 5
+#define BRW_LOGICOPFUNCTION_XOR 6
+#define BRW_LOGICOPFUNCTION_NAND 7
+#define BRW_LOGICOPFUNCTION_AND 8
+#define BRW_LOGICOPFUNCTION_EQUIV 9
+#define BRW_LOGICOPFUNCTION_NOOP 10
+#define BRW_LOGICOPFUNCTION_OR_INVERTED 11
+#define BRW_LOGICOPFUNCTION_COPY 12
+#define BRW_LOGICOPFUNCTION_OR_REVERSE 13
+#define BRW_LOGICOPFUNCTION_OR 14
+#define BRW_LOGICOPFUNCTION_SET 15
+
+#define BRW_MAPFILTER_NEAREST 0x0
+#define BRW_MAPFILTER_LINEAR 0x1
+#define BRW_MAPFILTER_ANISOTROPIC 0x2
+
+#define BRW_MIPFILTER_NONE 0
+#define BRW_MIPFILTER_NEAREST 1
+#define BRW_MIPFILTER_LINEAR 3
+
+#define BRW_POLYGON_FRONT_FACING 0
+#define BRW_POLYGON_BACK_FACING 1
+
+#define BRW_PREFILTER_ALWAYS 0x0
+#define BRW_PREFILTER_NEVER 0x1
+#define BRW_PREFILTER_LESS 0x2
+#define BRW_PREFILTER_EQUAL 0x3
+#define BRW_PREFILTER_LEQUAL 0x4
+#define BRW_PREFILTER_GREATER 0x5
+#define BRW_PREFILTER_NOTEQUAL 0x6
+#define BRW_PREFILTER_GEQUAL 0x7
+
+#define BRW_PROVOKING_VERTEX_0 0
+#define BRW_PROVOKING_VERTEX_1 1
+#define BRW_PROVOKING_VERTEX_2 2
+
+#define BRW_RASTRULE_UPPER_LEFT 0
+#define BRW_RASTRULE_UPPER_RIGHT 1
+/* These are listed as "Reserved, but not seen as useful"
+ * in Intel documentation (page 212, "Point Rasterization Rule",
+ * section 7.4 "SF Pipeline State Summary", of document
+ * "Intel® 965 Express Chipset Family and Intel® G35 Express
+ * Chipset Graphics Controller Programmer's Reference Manual,
+ * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+ * available at
+ * http://intellinuxgraphics.org/documentation.html
+ * at the time of this writing).
+ *
+ * These appear to be supported on at least some
+ * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
+ * is useful when using OpenGL to render to a FBO
+ * (which has the pixel coordinate Y orientation inverted
+ * with respect to the normal OpenGL pixel coordinate system).
+ */
+#define BRW_RASTRULE_LOWER_LEFT 2
+#define BRW_RASTRULE_LOWER_RIGHT 3
+
+#define BRW_RENDERTARGET_CLAMPRANGE_UNORM 0
+#define BRW_RENDERTARGET_CLAMPRANGE_SNORM 1
+#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT 2
+
+#define BRW_STENCILOP_KEEP 0
+#define BRW_STENCILOP_ZERO 1
+#define BRW_STENCILOP_REPLACE 2
+#define BRW_STENCILOP_INCRSAT 3
+#define BRW_STENCILOP_DECRSAT 4
+#define BRW_STENCILOP_INCR 5
+#define BRW_STENCILOP_DECR 6
+#define BRW_STENCILOP_INVERT 7
+
+#define BRW_SURFACE_MIPMAPLAYOUT_BELOW 0
+#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT 1
+
+#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT 0x000
+#define BRW_SURFACEFORMAT_R32G32B32A32_SINT 0x001
+#define BRW_SURFACEFORMAT_R32G32B32A32_UINT 0x002
+#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM 0x003
+#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM 0x004
+#define BRW_SURFACEFORMAT_R64G64_FLOAT 0x005
+#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT 0x006
+#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED 0x007
+#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED 0x008
+#define BRW_SURFACEFORMAT_R32G32B32_FLOAT 0x040
+#define BRW_SURFACEFORMAT_R32G32B32_SINT 0x041
+#define BRW_SURFACEFORMAT_R32G32B32_UINT 0x042
+#define BRW_SURFACEFORMAT_R32G32B32_UNORM 0x043
+#define BRW_SURFACEFORMAT_R32G32B32_SNORM 0x044
+#define BRW_SURFACEFORMAT_R32G32B32_SSCALED 0x045
+#define BRW_SURFACEFORMAT_R32G32B32_USCALED 0x046
+#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM 0x080
+#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM 0x081
+#define BRW_SURFACEFORMAT_R16G16B16A16_SINT 0x082
+#define BRW_SURFACEFORMAT_R16G16B16A16_UINT 0x083
+#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT 0x084
+#define BRW_SURFACEFORMAT_R32G32_FLOAT 0x085
+#define BRW_SURFACEFORMAT_R32G32_SINT 0x086
+#define BRW_SURFACEFORMAT_R32G32_UINT 0x087
+#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS 0x088
+#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT 0x089
+#define BRW_SURFACEFORMAT_L32A32_FLOAT 0x08A
+#define BRW_SURFACEFORMAT_R32G32_UNORM 0x08B
+#define BRW_SURFACEFORMAT_R32G32_SNORM 0x08C
+#define BRW_SURFACEFORMAT_R64_FLOAT 0x08D
+#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM 0x08E
+#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT 0x08F
+#define BRW_SURFACEFORMAT_A32X32_FLOAT 0x090
+#define BRW_SURFACEFORMAT_L32X32_FLOAT 0x091
+#define BRW_SURFACEFORMAT_I32X32_FLOAT 0x092
+#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED 0x093
+#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED 0x094
+#define BRW_SURFACEFORMAT_R32G32_SSCALED 0x095
+#define BRW_SURFACEFORMAT_R32G32_USCALED 0x096
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0
+#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB 0x0C1
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM 0x0C2
+#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB 0x0C3
+#define BRW_SURFACEFORMAT_R10G10B10A2_UINT 0x0C4
+#define BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM 0x0C5
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM 0x0C7
+#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB 0x0C8
+#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM 0x0C9
+#define BRW_SURFACEFORMAT_R8G8B8A8_SINT 0x0CA
+#define BRW_SURFACEFORMAT_R8G8B8A8_UINT 0x0CB
+#define BRW_SURFACEFORMAT_R16G16_UNORM 0x0CC
+#define BRW_SURFACEFORMAT_R16G16_SNORM 0x0CD
+#define BRW_SURFACEFORMAT_R16G16_SINT 0x0CE
+#define BRW_SURFACEFORMAT_R16G16_UINT 0x0CF
+#define BRW_SURFACEFORMAT_R16G16_FLOAT 0x0D0
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM 0x0D1
+#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB 0x0D2
+#define BRW_SURFACEFORMAT_R11G11B10_FLOAT 0x0D3
+#define BRW_SURFACEFORMAT_R32_SINT 0x0D6
+#define BRW_SURFACEFORMAT_R32_UINT 0x0D7
+#define BRW_SURFACEFORMAT_R32_FLOAT 0x0D8
+#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS 0x0D9
+#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT 0x0DA
+#define BRW_SURFACEFORMAT_L16A16_UNORM 0x0DF
+#define BRW_SURFACEFORMAT_I24X8_UNORM 0x0E0
+#define BRW_SURFACEFORMAT_L24X8_UNORM 0x0E1
+#define BRW_SURFACEFORMAT_A24X8_UNORM 0x0E2
+#define BRW_SURFACEFORMAT_I32_FLOAT 0x0E3
+#define BRW_SURFACEFORMAT_L32_FLOAT 0x0E4
+#define BRW_SURFACEFORMAT_A32_FLOAT 0x0E5
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM 0x0E9
+#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB 0x0EA
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM 0x0EB
+#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB 0x0EC
+#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP 0x0ED
+#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM 0x0EE
+#define BRW_SURFACEFORMAT_L16A16_FLOAT 0x0F0
+#define BRW_SURFACEFORMAT_R32_UNORM 0x0F1
+#define BRW_SURFACEFORMAT_R32_SNORM 0x0F2
+#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED 0x0F3
+#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED 0x0F4
+#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED 0x0F5
+#define BRW_SURFACEFORMAT_R16G16_SSCALED 0x0F6
+#define BRW_SURFACEFORMAT_R16G16_USCALED 0x0F7
+#define BRW_SURFACEFORMAT_R32_SSCALED 0x0F8
+#define BRW_SURFACEFORMAT_R32_USCALED 0x0F9
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM 0x100
+#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB 0x101
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM 0x102
+#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB 0x103
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM 0x104
+#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB 0x105
+#define BRW_SURFACEFORMAT_R8G8_UNORM 0x106
+#define BRW_SURFACEFORMAT_R8G8_SNORM 0x107
+#define BRW_SURFACEFORMAT_R8G8_SINT 0x108
+#define BRW_SURFACEFORMAT_R8G8_UINT 0x109
+#define BRW_SURFACEFORMAT_R16_UNORM 0x10A
+#define BRW_SURFACEFORMAT_R16_SNORM 0x10B
+#define BRW_SURFACEFORMAT_R16_SINT 0x10C
+#define BRW_SURFACEFORMAT_R16_UINT 0x10D
+#define BRW_SURFACEFORMAT_R16_FLOAT 0x10E
+#define BRW_SURFACEFORMAT_I16_UNORM 0x111
+#define BRW_SURFACEFORMAT_L16_UNORM 0x112
+#define BRW_SURFACEFORMAT_A16_UNORM 0x113
+#define BRW_SURFACEFORMAT_L8A8_UNORM 0x114
+#define BRW_SURFACEFORMAT_I16_FLOAT 0x115
+#define BRW_SURFACEFORMAT_L16_FLOAT 0x116
+#define BRW_SURFACEFORMAT_A16_FLOAT 0x117
+#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB 0x118
+#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM 0x119
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM 0x11A
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB 0x11B
+#define BRW_SURFACEFORMAT_R8G8_SSCALED 0x11C
+#define BRW_SURFACEFORMAT_R8G8_USCALED 0x11D
+#define BRW_SURFACEFORMAT_R16_SSCALED 0x11E
+#define BRW_SURFACEFORMAT_R16_USCALED 0x11F
+#define BRW_SURFACEFORMAT_R8_UNORM 0x140
+#define BRW_SURFACEFORMAT_R8_SNORM 0x141
+#define BRW_SURFACEFORMAT_R8_SINT 0x142
+#define BRW_SURFACEFORMAT_R8_UINT 0x143
+#define BRW_SURFACEFORMAT_A8_UNORM 0x144
+#define BRW_SURFACEFORMAT_I8_UNORM 0x145
+#define BRW_SURFACEFORMAT_L8_UNORM 0x146
+#define BRW_SURFACEFORMAT_P4A4_UNORM 0x147
+#define BRW_SURFACEFORMAT_A4P4_UNORM 0x148
+#define BRW_SURFACEFORMAT_R8_SSCALED 0x149
+#define BRW_SURFACEFORMAT_R8_USCALED 0x14A
+#define BRW_SURFACEFORMAT_L8_UNORM_SRGB 0x14C
+#define BRW_SURFACEFORMAT_R1_UINT 0x181
+#define BRW_SURFACEFORMAT_YCRCB_NORMAL 0x182
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY 0x183
+#define BRW_SURFACEFORMAT_BC1_UNORM 0x186
+#define BRW_SURFACEFORMAT_BC2_UNORM 0x187
+#define BRW_SURFACEFORMAT_BC3_UNORM 0x188
+#define BRW_SURFACEFORMAT_BC4_UNORM 0x189
+#define BRW_SURFACEFORMAT_BC5_UNORM 0x18A
+#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB 0x18B
+#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB 0x18C
+#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB 0x18D
+#define BRW_SURFACEFORMAT_MONO8 0x18E
+#define BRW_SURFACEFORMAT_YCRCB_SWAPUV 0x18F
+#define BRW_SURFACEFORMAT_YCRCB_SWAPY 0x190
+#define BRW_SURFACEFORMAT_DXT1_RGB 0x191
+#define BRW_SURFACEFORMAT_FXT1 0x192
+#define BRW_SURFACEFORMAT_R8G8B8_UNORM 0x193
+#define BRW_SURFACEFORMAT_R8G8B8_SNORM 0x194
+#define BRW_SURFACEFORMAT_R8G8B8_SSCALED 0x195
+#define BRW_SURFACEFORMAT_R8G8B8_USCALED 0x196
+#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT 0x197
+#define BRW_SURFACEFORMAT_R64G64B64_FLOAT 0x198
+#define BRW_SURFACEFORMAT_BC4_SNORM 0x199
+#define BRW_SURFACEFORMAT_BC5_SNORM 0x19A
+#define BRW_SURFACEFORMAT_R16G16B16_UNORM 0x19C
+#define BRW_SURFACEFORMAT_R16G16B16_SNORM 0x19D
+#define BRW_SURFACEFORMAT_R16G16B16_SSCALED 0x19E
+#define BRW_SURFACEFORMAT_R16G16B16_USCALED 0x19F
+#define BRW_SURFACEFORMAT_INVALID 0xFFF
+
+#define BRW_SURFACERETURNFORMAT_FLOAT32 0
+#define BRW_SURFACERETURNFORMAT_S1 1
+
+#define BRW_SURFACE_1D 0
+#define BRW_SURFACE_2D 1
+#define BRW_SURFACE_3D 2
+#define BRW_SURFACE_CUBE 3
+#define BRW_SURFACE_BUFFER 4
+#define BRW_SURFACE_NULL 7
+
+#define BRW_TEXCOORDMODE_WRAP 0
+#define BRW_TEXCOORDMODE_MIRROR 1
+#define BRW_TEXCOORDMODE_CLAMP 2
+#define BRW_TEXCOORDMODE_CUBE 3
+#define BRW_TEXCOORDMODE_CLAMP_BORDER 4
+#define BRW_TEXCOORDMODE_MIRROR_ONCE 5
+
+#define BRW_THREAD_PRIORITY_NORMAL 0
+#define BRW_THREAD_PRIORITY_HIGH 1
+
+#define BRW_TILEWALK_XMAJOR 0
+#define BRW_TILEWALK_YMAJOR 1
+
+#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS 0
+#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS 1
+
+/* Execution Unit (EU) defines
+ */
+
+#define BRW_ALIGN_1 0
+#define BRW_ALIGN_16 1
+
+#define BRW_ADDRESS_DIRECT 0
+#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1
+
+#define BRW_CHANNEL_X 0
+#define BRW_CHANNEL_Y 1
+#define BRW_CHANNEL_Z 2
+#define BRW_CHANNEL_W 3
+
+#define BRW_COMPRESSION_NONE 0
+#define BRW_COMPRESSION_2NDHALF 1
+#define BRW_COMPRESSION_COMPRESSED 2
+
+#define BRW_CONDITIONAL_NONE 0
+#define BRW_CONDITIONAL_Z 1
+#define BRW_CONDITIONAL_NZ 2
+#define BRW_CONDITIONAL_EQ 1 /* Z */
+#define BRW_CONDITIONAL_NEQ 2 /* NZ */
+#define BRW_CONDITIONAL_G 3
+#define BRW_CONDITIONAL_GE 4
+#define BRW_CONDITIONAL_L 5
+#define BRW_CONDITIONAL_LE 6
+#define BRW_CONDITIONAL_R 7
+#define BRW_CONDITIONAL_O 8
+#define BRW_CONDITIONAL_U 9
+
+#define BRW_DEBUG_NONE 0
+#define BRW_DEBUG_BREAKPOINT 1
+
+#define BRW_DEPENDENCY_NORMAL 0
+#define BRW_DEPENDENCY_NOTCLEARED 1
+#define BRW_DEPENDENCY_NOTCHECKED 2
+#define BRW_DEPENDENCY_DISABLE 3
+
+#define BRW_EXECUTE_1 0
+#define BRW_EXECUTE_2 1
+#define BRW_EXECUTE_4 2
+#define BRW_EXECUTE_8 3
+#define BRW_EXECUTE_16 4
+#define BRW_EXECUTE_32 5
+
+#define BRW_HORIZONTAL_STRIDE_0 0
+#define BRW_HORIZONTAL_STRIDE_1 1
+#define BRW_HORIZONTAL_STRIDE_2 2
+#define BRW_HORIZONTAL_STRIDE_4 3
+
+#define BRW_INSTRUCTION_NORMAL 0
+#define BRW_INSTRUCTION_SATURATE 1
+
+#define BRW_MASK_ENABLE 0
+#define BRW_MASK_DISABLE 1
+
+#define BRW_OPCODE_MOV 1
+#define BRW_OPCODE_SEL 2
+#define BRW_OPCODE_NOT 4
+#define BRW_OPCODE_AND 5
+#define BRW_OPCODE_OR 6
+#define BRW_OPCODE_XOR 7
+#define BRW_OPCODE_SHR 8
+#define BRW_OPCODE_SHL 9
+#define BRW_OPCODE_RSR 10
+#define BRW_OPCODE_RSL 11
+#define BRW_OPCODE_ASR 12
+#define BRW_OPCODE_CMP 16
+#define BRW_OPCODE_CMPN 17
+#define BRW_OPCODE_JMPI 32
+#define BRW_OPCODE_IF 34
+#define BRW_OPCODE_IFF 35
+#define BRW_OPCODE_ELSE 36
+#define BRW_OPCODE_ENDIF 37
+#define BRW_OPCODE_DO 38
+#define BRW_OPCODE_WHILE 39
+#define BRW_OPCODE_BREAK 40
+#define BRW_OPCODE_CONTINUE 41
+#define BRW_OPCODE_HALT 42
+#define BRW_OPCODE_MSAVE 44
+#define BRW_OPCODE_MRESTORE 45
+#define BRW_OPCODE_PUSH 46
+#define BRW_OPCODE_POP 47
+#define BRW_OPCODE_WAIT 48
+#define BRW_OPCODE_SEND 49
+#define BRW_OPCODE_ADD 64
+#define BRW_OPCODE_MUL 65
+#define BRW_OPCODE_AVG 66
+#define BRW_OPCODE_FRC 67
+#define BRW_OPCODE_RNDU 68
+#define BRW_OPCODE_RNDD 69
+#define BRW_OPCODE_RNDE 70
+#define BRW_OPCODE_RNDZ 71
+#define BRW_OPCODE_MAC 72
+#define BRW_OPCODE_MACH 73
+#define BRW_OPCODE_LZD 74
+#define BRW_OPCODE_SAD2 80
+#define BRW_OPCODE_SADA2 81
+#define BRW_OPCODE_DP4 84
+#define BRW_OPCODE_DPH 85
+#define BRW_OPCODE_DP3 86
+#define BRW_OPCODE_DP2 87
+#define BRW_OPCODE_DPA2 88
+#define BRW_OPCODE_LINE 89
+#define BRW_OPCODE_NOP 126
+
+#define BRW_PREDICATE_NONE 0
+#define BRW_PREDICATE_NORMAL 1
+#define BRW_PREDICATE_ALIGN1_ANYV 2
+#define BRW_PREDICATE_ALIGN1_ALLV 3
+#define BRW_PREDICATE_ALIGN1_ANY2H 4
+#define BRW_PREDICATE_ALIGN1_ALL2H 5
+#define BRW_PREDICATE_ALIGN1_ANY4H 6
+#define BRW_PREDICATE_ALIGN1_ALL4H 7
+#define BRW_PREDICATE_ALIGN1_ANY8H 8
+#define BRW_PREDICATE_ALIGN1_ALL8H 9
+#define BRW_PREDICATE_ALIGN1_ANY16H 10
+#define BRW_PREDICATE_ALIGN1_ALL16H 11
+#define BRW_PREDICATE_ALIGN16_REPLICATE_X 2
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Y 3
+#define BRW_PREDICATE_ALIGN16_REPLICATE_Z 4
+#define BRW_PREDICATE_ALIGN16_REPLICATE_W 5
+#define BRW_PREDICATE_ALIGN16_ANY4H 6
+#define BRW_PREDICATE_ALIGN16_ALL4H 7
+
+#define BRW_ARCHITECTURE_REGISTER_FILE 0
+#define BRW_GENERAL_REGISTER_FILE 1
+#define BRW_MESSAGE_REGISTER_FILE 2
+#define BRW_IMMEDIATE_VALUE 3
+
+#define BRW_REGISTER_TYPE_UD 0
+#define BRW_REGISTER_TYPE_D 1
+#define BRW_REGISTER_TYPE_UW 2
+#define BRW_REGISTER_TYPE_W 3
+#define BRW_REGISTER_TYPE_UB 4
+#define BRW_REGISTER_TYPE_B 5
+#define BRW_REGISTER_TYPE_VF 5 /* packed float vector, immediates only? */
+#define BRW_REGISTER_TYPE_HF 6
+#define BRW_REGISTER_TYPE_V 6 /* packed int vector, immediates only, uword dest only */
+#define BRW_REGISTER_TYPE_F 7
+
+#define BRW_ARF_NULL 0x00
+#define BRW_ARF_ADDRESS 0x10
+#define BRW_ARF_ACCUMULATOR 0x20
+#define BRW_ARF_FLAG 0x30
+#define BRW_ARF_MASK 0x40
+#define BRW_ARF_MASK_STACK 0x50
+#define BRW_ARF_MASK_STACK_DEPTH 0x60
+#define BRW_ARF_STATE 0x70
+#define BRW_ARF_CONTROL 0x80
+#define BRW_ARF_NOTIFICATION_COUNT 0x90
+#define BRW_ARF_IP 0xA0
+
+#define BRW_AMASK 0
+#define BRW_IMASK 1
+#define BRW_LMASK 2
+#define BRW_CMASK 3
+
+
+
+#define BRW_THREAD_NORMAL 0
+#define BRW_THREAD_ATOMIC 1
+#define BRW_THREAD_SWITCH 2
+
+#define BRW_VERTICAL_STRIDE_0 0
+#define BRW_VERTICAL_STRIDE_1 1
+#define BRW_VERTICAL_STRIDE_2 2
+#define BRW_VERTICAL_STRIDE_4 3
+#define BRW_VERTICAL_STRIDE_8 4
+#define BRW_VERTICAL_STRIDE_16 5
+#define BRW_VERTICAL_STRIDE_32 6
+#define BRW_VERTICAL_STRIDE_64 7
+#define BRW_VERTICAL_STRIDE_128 8
+#define BRW_VERTICAL_STRIDE_256 9
+#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF
+
+#define BRW_WIDTH_1 0
+#define BRW_WIDTH_2 1
+#define BRW_WIDTH_4 2
+#define BRW_WIDTH_8 3
+#define BRW_WIDTH_16 4
+
+#define BRW_STATELESS_BUFFER_BOUNDARY_1K 0
+#define BRW_STATELESS_BUFFER_BOUNDARY_2K 1
+#define BRW_STATELESS_BUFFER_BOUNDARY_4K 2
+#define BRW_STATELESS_BUFFER_BOUNDARY_8K 3
+#define BRW_STATELESS_BUFFER_BOUNDARY_16K 4
+#define BRW_STATELESS_BUFFER_BOUNDARY_32K 5
+#define BRW_STATELESS_BUFFER_BOUNDARY_64K 6
+#define BRW_STATELESS_BUFFER_BOUNDARY_128K 7
+#define BRW_STATELESS_BUFFER_BOUNDARY_256K 8
+#define BRW_STATELESS_BUFFER_BOUNDARY_512K 9
+#define BRW_STATELESS_BUFFER_BOUNDARY_1M 10
+#define BRW_STATELESS_BUFFER_BOUNDARY_2M 11
+
+#define BRW_POLYGON_FACING_FRONT 0
+#define BRW_POLYGON_FACING_BACK 1
+
+#define BRW_MESSAGE_TARGET_NULL 0
+#define BRW_MESSAGE_TARGET_MATH 1
+#define BRW_MESSAGE_TARGET_SAMPLER 2
+#define BRW_MESSAGE_TARGET_GATEWAY 3
+#define BRW_MESSAGE_TARGET_DATAPORT_READ 4
+#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5
+#define BRW_MESSAGE_TARGET_URB 6
+#define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7
+
+#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0
+#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2
+#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0
+#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2
+#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO 2
+#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3
+#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG 0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG 0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG 0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG 1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG 1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG 1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG 2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG 2
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG 2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG 3
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG 3
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG 3
+
+/* for IGDNG only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2 0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8 1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16 2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64 3
+
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1
+#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2
+#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3
+#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4
+
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2
+
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3
+
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3
+
+#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0
+#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1
+#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2
+
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4
+
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE 2
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5
+#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7
+
+#define BRW_MATH_FUNCTION_INV 1
+#define BRW_MATH_FUNCTION_LOG 2
+#define BRW_MATH_FUNCTION_EXP 3
+#define BRW_MATH_FUNCTION_SQRT 4
+#define BRW_MATH_FUNCTION_RSQ 5
+#define BRW_MATH_FUNCTION_SIN 6 /* was 7 */
+#define BRW_MATH_FUNCTION_COS 7 /* was 8 */
+#define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 */
+#define BRW_MATH_FUNCTION_TAN 9
+#define BRW_MATH_FUNCTION_POW 10
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12
+#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13
+
+#define BRW_MATH_INTEGER_UNSIGNED 0
+#define BRW_MATH_INTEGER_SIGNED 1
+
+#define BRW_MATH_PRECISION_FULL 0
+#define BRW_MATH_PRECISION_PARTIAL 1
+
+#define BRW_MATH_SATURATE_NONE 0
+#define BRW_MATH_SATURATE_SATURATE 1
+
+#define BRW_MATH_DATA_VECTOR 0
+#define BRW_MATH_DATA_SCALAR 1
+
+#define BRW_URB_OPCODE_WRITE 0
+
+#define BRW_URB_SWIZZLE_NONE 0
+#define BRW_URB_SWIZZLE_INTERLEAVE 1
+#define BRW_URB_SWIZZLE_TRANSPOSE 2
+
+#define BRW_SCRATCH_SPACE_SIZE_1K 0
+#define BRW_SCRATCH_SPACE_SIZE_2K 1
+#define BRW_SCRATCH_SPACE_SIZE_4K 2
+#define BRW_SCRATCH_SPACE_SIZE_8K 3
+#define BRW_SCRATCH_SPACE_SIZE_16K 4
+#define BRW_SCRATCH_SPACE_SIZE_32K 5
+#define BRW_SCRATCH_SPACE_SIZE_64K 6
+#define BRW_SCRATCH_SPACE_SIZE_128K 7
+#define BRW_SCRATCH_SPACE_SIZE_256K 8
+#define BRW_SCRATCH_SPACE_SIZE_512K 9
+#define BRW_SCRATCH_SPACE_SIZE_1M 10
+#define BRW_SCRATCH_SPACE_SIZE_2M 11
+
+
+
+
+#define CMD_URB_FENCE 0x6000
+#define CMD_CS_URB_STATE 0x6001
+#define CMD_CONST_BUFFER 0x6002
+
+#define CMD_STATE_BASE_ADDRESS 0x6101
+#define CMD_STATE_INSN_POINTER 0x6102
+#define CMD_PIPELINE_SELECT_965 0x6104
+#define CMD_PIPELINE_SELECT_GM45 0x6904
+
+#define CMD_PIPELINED_STATE_POINTERS 0x7800
+#define CMD_BINDING_TABLE_PTRS 0x7801
+
+#define CMD_VERTEX_BUFFER 0x7808
+# define BRW_VB0_INDEX_SHIFT 27
+# define BRW_VB0_ACCESS_VERTEXDATA (0 << 26)
+# define BRW_VB0_ACCESS_INSTANCEDATA (1 << 26)
+# define BRW_VB0_PITCH_SHIFT 0
+
+#define CMD_VERTEX_ELEMENT 0x7809
+# define BRW_VE0_INDEX_SHIFT 27
+# define BRW_VE0_FORMAT_SHIFT 16
+# define BRW_VE0_VALID (1 << 26)
+# define BRW_VE0_SRC_OFFSET_SHIFT 0
+# define BRW_VE1_COMPONENT_NOSTORE 0
+# define BRW_VE1_COMPONENT_STORE_SRC 1
+# define BRW_VE1_COMPONENT_STORE_0 2
+# define BRW_VE1_COMPONENT_STORE_1_FLT 3
+# define BRW_VE1_COMPONENT_STORE_1_INT 4
+# define BRW_VE1_COMPONENT_STORE_VID 5
+# define BRW_VE1_COMPONENT_STORE_IID 6
+# define BRW_VE1_COMPONENT_STORE_PID 7
+# define BRW_VE1_COMPONENT_0_SHIFT 28
+# define BRW_VE1_COMPONENT_1_SHIFT 24
+# define BRW_VE1_COMPONENT_2_SHIFT 20
+# define BRW_VE1_COMPONENT_3_SHIFT 16
+# define BRW_VE1_DST_OFFSET_SHIFT 0
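+/* Example encoding (sketch): a valid float4 element fetching from vertex
+ * buffer 0 at source offset 0 sets dword 0 of the VERTEX_ELEMENT state to
+ *    (0 << BRW_VE0_INDEX_SHIFT) | BRW_VE0_VALID |
+ *    (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
+ *    (0 << BRW_VE0_SRC_OFFSET_SHIFT)
+ * as done by brw_emit_vertex_elements() in brw_draw_upload.c.
+ */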
+
+#define CMD_INDEX_BUFFER 0x780a
+#define CMD_VF_STATISTICS_965 0x780b
+#define CMD_VF_STATISTICS_GM45 0x680b
+
+#define CMD_DRAW_RECT 0x7900
+#define CMD_BLEND_CONSTANT_COLOR 0x7901
+#define CMD_CHROMA_KEY 0x7904
+#define CMD_DEPTH_BUFFER 0x7905
+#define CMD_POLY_STIPPLE_OFFSET 0x7906
+#define CMD_POLY_STIPPLE_PATTERN 0x7907
+#define CMD_LINE_STIPPLE_PATTERN 0x7908
+#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909
+#define CMD_AA_LINE_PARAMETERS 0x790a
+
+#define CMD_PIPE_CONTROL 0x7a00
+
+#define CMD_3D_PRIM 0x7b00
+
+#define CMD_MI_FLUSH 0x0200
+
+
+/* Various values from the R0 vertex header:
+ */
+#define R02_PRIM_END 0x1
+#define R02_PRIM_START 0x2
+
+#define URB_SIZES(brw) (BRW_IS_IGDNG(brw) ? 1024 : \
+ (BRW_IS_G4X(brw) ? 384 : 256)) /* 512 bit units */
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_disasm.c b/src/gallium/drivers/i965/brw_disasm.c
new file mode 100644
index 00000000000..65db27248b1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_disasm.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "brw_disasm.h"
+#include "brw_structs.h"
+#include "brw_reg.h"
+#include "brw_defines.h"
+
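+/* Per-opcode disassembly information: the printable mnemonic plus the number
+ * of source and destination operands to decode.  Opcodes without an entry
+ * have a NULL name and are reported as invalid by print_opcode() below.
+ */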
+struct {
+ char *name;
+ int nsrc;
+ int ndst;
+} opcode[128] = {
+ [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+ [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+
+ [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+ [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+ [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+ [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+ [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+ [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+ [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+ [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+ [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+char *conditional_modifier[16] = {
+ [BRW_CONDITIONAL_NONE] = "",
+ [BRW_CONDITIONAL_Z] = ".e",
+ [BRW_CONDITIONAL_NZ] = ".ne",
+ [BRW_CONDITIONAL_G] = ".g",
+ [BRW_CONDITIONAL_GE] = ".ge",
+ [BRW_CONDITIONAL_L] = ".l",
+ [BRW_CONDITIONAL_LE] = ".le",
+ [BRW_CONDITIONAL_R] = ".r",
+ [BRW_CONDITIONAL_O] = ".o",
+ [BRW_CONDITIONAL_U] = ".u",
+};
+
+char *negate[2] = {
+ [0] = "",
+ [1] = "-",
+};
+
+char *_abs[2] = {
+ [0] = "",
+ [1] = "(abs)",
+};
+
+char *vert_stride[16] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4",
+ [4] = "8",
+ [5] = "16",
+ [6] = "32",
+ [15] = "VxH",
+};
+
+char *width[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+};
+
+char *horiz_stride[4] = {
+ [0] = "0",
+ [1] = "1",
+ [2] = "2",
+ [3] = "4"
+};
+
+char *chan_sel[4] = {
+ [0] = "x",
+ [1] = "y",
+ [2] = "z",
+ [3] = "w",
+};
+
+char *dest_condmod[16] = {
+ [0] = NULL
+};
+
+char *debug_ctrl[2] = {
+ [0] = "",
+ [1] = ".breakpoint"
+};
+
+char *saturate[2] = {
+ [0] = "",
+ [1] = ".sat"
+};
+
+char *exec_size[8] = {
+ [0] = "1",
+ [1] = "2",
+ [2] = "4",
+ [3] = "8",
+ [4] = "16",
+ [5] = "32"
+};
+
+char *pred_inv[2] = {
+ [0] = "+",
+ [1] = "-"
+};
+
+char *pred_ctrl_align16[16] = {
+ [1] = "",
+ [2] = ".x",
+ [3] = ".y",
+ [4] = ".z",
+ [5] = ".w",
+ [6] = ".any4h",
+ [7] = ".all4h",
+};
+
+char *pred_ctrl_align1[16] = {
+ [1] = "",
+ [2] = ".anyv",
+ [3] = ".allv",
+ [4] = ".any2h",
+ [5] = ".all2h",
+ [6] = ".any4h",
+ [7] = ".all4h",
+ [8] = ".any8h",
+ [9] = ".all8h",
+ [10] = ".any16h",
+ [11] = ".all16h",
+};
+
+char *thread_ctrl[4] = {
+ [0] = "",
+ [2] = "switch"
+};
+
+char *compr_ctrl[4] = {
+ [0] = "",
+ [1] = "sechalf",
+ [2] = "compr",
+};
+
+char *dep_ctrl[4] = {
+ [0] = "",
+ [1] = "NoDDClr",
+ [2] = "NoDDChk",
+ [3] = "NoDDClr,NoDDChk",
+};
+
+char *mask_ctrl[4] = {
+ [0] = "",
+ [1] = "nomask",
+};
+
+char *access_mode[2] = {
+ [0] = "align1",
+ [1] = "align16",
+};
+
+char *reg_encoding[8] = {
+ [0] = "UD",
+ [1] = "D",
+ [2] = "UW",
+ [3] = "W",
+ [4] = "UB",
+ [5] = "B",
+ [7] = "F"
+};
+
+char *imm_encoding[8] = {
+ [0] = "UD",
+ [1] = "D",
+ [2] = "UW",
+ [3] = "W",
+ [5] = "VF",
+    [6] = "V",
+ [7] = "F"
+};
+
+char *reg_file[4] = {
+ [0] = "A",
+ [1] = "g",
+ [2] = "m",
+ [3] = "imm",
+};
+
+char *writemask[16] = {
+ [0x0] = ".",
+ [0x1] = ".x",
+ [0x2] = ".y",
+ [0x3] = ".xy",
+ [0x4] = ".z",
+ [0x5] = ".xz",
+ [0x6] = ".yz",
+ [0x7] = ".xyz",
+ [0x8] = ".w",
+ [0x9] = ".xw",
+ [0xa] = ".yw",
+ [0xb] = ".xyw",
+ [0xc] = ".zw",
+ [0xd] = ".xzw",
+ [0xe] = ".yzw",
+ [0xf] = "",
+};
+
+char *end_of_thread[2] = {
+ [0] = "",
+ [1] = "EOT"
+};
+
+char *target_function[16] = {
+ [BRW_MESSAGE_TARGET_NULL] = "null",
+ [BRW_MESSAGE_TARGET_MATH] = "math",
+ [BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
+ [BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
+ [BRW_MESSAGE_TARGET_DATAPORT_READ] = "read",
+ [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
+ [BRW_MESSAGE_TARGET_URB] = "urb",
+ [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
+};
+
+char *math_function[16] = {
+ [BRW_MATH_FUNCTION_INV] = "inv",
+ [BRW_MATH_FUNCTION_LOG] = "log",
+ [BRW_MATH_FUNCTION_EXP] = "exp",
+ [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+ [BRW_MATH_FUNCTION_RSQ] = "rsq",
+ [BRW_MATH_FUNCTION_SIN] = "sin",
+ [BRW_MATH_FUNCTION_COS] = "cos",
+ [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+ [BRW_MATH_FUNCTION_TAN] = "tan",
+ [BRW_MATH_FUNCTION_POW] = "pow",
+ [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
+    [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+};
+
+char *math_saturate[2] = {
+ [0] = "",
+ [1] = "sat"
+};
+
+char *math_signed[2] = {
+ [0] = "",
+ [1] = "signed"
+};
+
+char *math_scalar[2] = {
+ [0] = "",
+ [1] = "scalar"
+};
+
+char *math_precision[2] = {
+ [0] = "",
+ [1] = "partial_precision"
+};
+
+char *urb_swizzle[4] = {
+ [BRW_URB_SWIZZLE_NONE] = "",
+ [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+ [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
+};
+
+char *urb_allocate[2] = {
+ [0] = "",
+ [1] = "allocate"
+};
+
+char *urb_used[2] = {
+ [0] = "",
+ [1] = "used"
+};
+
+char *urb_complete[2] = {
+ [0] = "",
+ [1] = "complete"
+};
+
+char *sampler_target_format[4] = {
+ [0] = "F",
+ [2] = "UD",
+ [3] = "D"
+};
+
+
+static int column;
+
+static int string (FILE *file, char *string)
+{
+ fputs (string, file);
+ column += strlen (string);
+ return 0;
+}
+
+static int format (FILE *f, char *format, ...)
+{
+ char buf[1024];
+ va_list args;
+ va_start (args, format);
+
+    vsnprintf (buf, sizeof (buf) - 1, format, args);
+    va_end (args);
+    string (f, buf);
+ return 0;
+}
+
+static int newline (FILE *f)
+{
+ putc ('\n', f);
+ column = 0;
+ return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+ do
+ string (f, " ");
+ while (column < c);
+ return 0;
+}
+
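+/* Decode a bitfield value through one of the string tables above: print the
+ * matching mnemonic, preceded by a space when '*space' indicates one is
+ * pending, and return non-zero for values the table does not cover.
+ */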
+static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)
+{
+ if (!ctrl[id]) {
+ fprintf (file, "*** invalid %s value %d ",
+ name, id);
+ return 1;
+ }
+ if (ctrl[id][0])
+ {
+ if (space && *space)
+ string (file, " ");
+ string (file, ctrl[id]);
+ if (space)
+ *space = 1;
+ }
+ return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+ if (!opcode[id].name) {
+ format (file, "*** invalid opcode value %d ", id);
+ return 1;
+ }
+ string (file, opcode[id].name);
+ return 0;
+}
+
+static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
+{
+ int err = 0;
+ if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
+ switch (_reg_nr & 0xf0) {
+ case BRW_ARF_NULL:
+ string (file, "null");
+ return -1;
+ case BRW_ARF_ADDRESS:
+ format (file, "a%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_ACCUMULATOR:
+ format (file, "acc%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_MASK:
+ format (file, "mask%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_MASK_STACK:
+ format (file, "msd%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_STATE:
+ format (file, "sr%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_CONTROL:
+ format (file, "cr%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_NOTIFICATION_COUNT:
+ format (file, "n%d", _reg_nr & 0x0f);
+ break;
+ case BRW_ARF_IP:
+ string (file, "ip");
+ return -1;
+ break;
+ default:
+ format (file, "ARF%d", _reg_nr);
+ break;
+ }
+ } else {
+ err |= control (file, "src reg file", reg_file, _reg_file, NULL);
+ format (file, "%d", _reg_nr);
+ }
+ return err;
+}
+
+static int dest (FILE *file, const struct brw_instruction *inst)
+{
+ int err = 0;
+
+ if (inst->header.access_mode == BRW_ALIGN_1)
+ {
+ if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits1.da1.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
+ format (file, "<%d>", inst->bits1.da1.dest_horiz_stride);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+ }
+ else
+ {
+ string (file, "g[a0");
+ if (inst->bits1.ia1.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
+ if (inst->bits1.ia1.dest_indirect_offset)
+ format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+ string (file, "]");
+ format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+ }
+ }
+ else
+ {
+ if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+ if (err == -1)
+ return 0;
+ if (inst->bits1.da16.dest_subreg_nr)
+ format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
+ string (file, "<1>");
+ err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+ err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+ }
+ else
+ {
+ err = 1;
+ string (file, "Indirect align16 address mode not supported");
+ }
+ }
+
+ return 0;
+}
+
+static int src_align1_region (FILE *file,
+ GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
+{
+ int err = 0;
+ string (file, "<");
+ err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+ string (file, ",");
+ err |= control (file, "width", width, _width, NULL);
+ string (file, ",");
+ err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+ string (file, ">");
+ return err;
+}
+
+static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
+ GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
+ GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ err |= reg (file, _reg_file, reg_num);
+ if (err == -1)
+ return 0;
+ if (sub_reg_num)
+ format (file, ".%d", sub_reg_num);
+ src_align1_region (file, _vert_stride, _width, _horiz_stride);
+ err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+ return err;
+}
+
+static int src_ia1 (FILE *file,
+ GLuint type,
+ GLuint _reg_file,
+ GLint _addr_imm,
+ GLuint _addr_subreg_nr,
+ GLuint _negate,
+ GLuint __abs,
+ GLuint _addr_mode,
+ GLuint _horiz_stride,
+ GLuint _width,
+ GLuint _vert_stride)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ string (file, "g[a0");
+ if (_addr_subreg_nr)
+ format (file, ".%d", _addr_subreg_nr);
+ if (_addr_imm)
+ format (file, " %d", _addr_imm);
+ string (file, "]");
+ src_align1_region (file, _vert_stride, _width, _horiz_stride);
+ err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+ return err;
+}
+
+static int src_da16 (FILE *file,
+ GLuint _reg_type,
+ GLuint _reg_file,
+ GLuint _vert_stride,
+ GLuint _reg_nr,
+ GLuint _subreg_nr,
+ GLuint __abs,
+ GLuint _negate,
+ GLuint swz_x,
+ GLuint swz_y,
+ GLuint swz_z,
+ GLuint swz_w)
+{
+ int err = 0;
+ err |= control (file, "negate", negate, _negate, NULL);
+ err |= control (file, "abs", _abs, __abs, NULL);
+
+ err |= reg (file, _reg_file, _reg_nr);
+ if (err == -1)
+ return 0;
+ if (_subreg_nr)
+ format (file, ".%d", _subreg_nr);
+ string (file, "<");
+ err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+ string (file, ",1,1>");
+ err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+ /*
+ * Three kinds of swizzle display:
+ * identity - nothing printed
+ * 1->all - print the single channel
+ * 1->1 - print the mapping
+ */
+ if (swz_x == BRW_CHANNEL_X &&
+ swz_y == BRW_CHANNEL_Y &&
+ swz_z == BRW_CHANNEL_Z &&
+ swz_w == BRW_CHANNEL_W)
+ {
+ ;
+ }
+ else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ }
+ else
+ {
+ string (file, ".");
+ err |= control (file, "channel select", chan_sel, swz_x, NULL);
+ err |= control (file, "channel select", chan_sel, swz_y, NULL);
+ err |= control (file, "channel select", chan_sel, swz_z, NULL);
+ err |= control (file, "channel select", chan_sel, swz_w, NULL);
+ }
+ return err;
+}
+
+
+static int imm (FILE *file, GLuint type, const struct brw_instruction *inst)
+{
+ switch (type) {
+ case BRW_REGISTER_TYPE_UD:
+ format (file, "0x%08xUD", inst->bits3.ud);
+ break;
+ case BRW_REGISTER_TYPE_D:
+ format (file, "%dD", inst->bits3.d);
+ break;
+ case BRW_REGISTER_TYPE_UW:
+ format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+ break;
+ case BRW_REGISTER_TYPE_W:
+ format (file, "%dW", (int16_t) inst->bits3.d);
+ break;
+ case BRW_REGISTER_TYPE_UB:
+	format (file, "0x%02xUB", (uint8_t) inst->bits3.ud);
+ break;
+ case BRW_REGISTER_TYPE_VF:
+ format (file, "Vector Float");
+ break;
+ case BRW_REGISTER_TYPE_V:
+ format (file, "0x%08xV", inst->bits3.ud);
+ break;
+ case BRW_REGISTER_TYPE_F:
+ format (file, "%-gF", inst->bits3.f);
+ }
+ return 0;
+}
+
+static int src0 (FILE *file, const struct brw_instruction *inst)
+{
+ if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
+ return imm (file, inst->bits1.da1.src0_reg_type,
+ inst);
+ else if (inst->header.access_mode == BRW_ALIGN_1)
+ {
+ if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ return src_da1 (file,
+ inst->bits1.da1.src0_reg_type,
+ inst->bits1.da1.src0_reg_file,
+ inst->bits2.da1.src0_vert_stride,
+ inst->bits2.da1.src0_width,
+ inst->bits2.da1.src0_horiz_stride,
+ inst->bits2.da1.src0_reg_nr,
+ inst->bits2.da1.src0_subreg_nr,
+ inst->bits2.da1.src0_abs,
+ inst->bits2.da1.src0_negate);
+ }
+ else
+ {
+ return src_ia1 (file,
+ inst->bits1.ia1.src0_reg_type,
+ inst->bits1.ia1.src0_reg_file,
+ inst->bits2.ia1.src0_indirect_offset,
+ inst->bits2.ia1.src0_subreg_nr,
+ inst->bits2.ia1.src0_negate,
+ inst->bits2.ia1.src0_abs,
+ inst->bits2.ia1.src0_address_mode,
+ inst->bits2.ia1.src0_horiz_stride,
+ inst->bits2.ia1.src0_width,
+ inst->bits2.ia1.src0_vert_stride);
+ }
+ }
+ else
+ {
+ if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ return src_da16 (file,
+ inst->bits1.da16.src0_reg_type,
+ inst->bits1.da16.src0_reg_file,
+ inst->bits2.da16.src0_vert_stride,
+ inst->bits2.da16.src0_reg_nr,
+ inst->bits2.da16.src0_subreg_nr,
+ inst->bits2.da16.src0_abs,
+ inst->bits2.da16.src0_negate,
+ inst->bits2.da16.src0_swz_x,
+ inst->bits2.da16.src0_swz_y,
+ inst->bits2.da16.src0_swz_z,
+ inst->bits2.da16.src0_swz_w);
+ }
+ else
+ {
+ string (file, "Indirect align16 address mode not supported");
+ return 1;
+ }
+ }
+}
+
+static int src1 (FILE *file, const struct brw_instruction *inst)
+{
+ if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+ return imm (file, inst->bits1.da1.src1_reg_type,
+ inst);
+ else if (inst->header.access_mode == BRW_ALIGN_1)
+ {
+ if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ return src_da1 (file,
+ inst->bits1.da1.src1_reg_type,
+ inst->bits1.da1.src1_reg_file,
+ inst->bits3.da1.src1_vert_stride,
+ inst->bits3.da1.src1_width,
+ inst->bits3.da1.src1_horiz_stride,
+ inst->bits3.da1.src1_reg_nr,
+ inst->bits3.da1.src1_subreg_nr,
+ inst->bits3.da1.src1_abs,
+ inst->bits3.da1.src1_negate);
+ }
+ else
+ {
+ return src_ia1 (file,
+ inst->bits1.ia1.src1_reg_type,
+ inst->bits1.ia1.src1_reg_file,
+ inst->bits3.ia1.src1_indirect_offset,
+ inst->bits3.ia1.src1_subreg_nr,
+ inst->bits3.ia1.src1_negate,
+ inst->bits3.ia1.src1_abs,
+ inst->bits3.ia1.src1_address_mode,
+ inst->bits3.ia1.src1_horiz_stride,
+ inst->bits3.ia1.src1_width,
+ inst->bits3.ia1.src1_vert_stride);
+ }
+ }
+ else
+ {
+ if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT)
+ {
+ return src_da16 (file,
+ inst->bits1.da16.src1_reg_type,
+ inst->bits1.da16.src1_reg_file,
+ inst->bits3.da16.src1_vert_stride,
+ inst->bits3.da16.src1_reg_nr,
+ inst->bits3.da16.src1_subreg_nr,
+ inst->bits3.da16.src1_abs,
+ inst->bits3.da16.src1_negate,
+ inst->bits3.da16.src1_swz_x,
+ inst->bits3.da16.src1_swz_y,
+ inst->bits3.da16.src1_swz_z,
+ inst->bits3.da16.src1_swz_w);
+ }
+ else
+ {
+ string (file, "Indirect align16 address mode not supported");
+ return 1;
+ }
+ }
+}
+
+int brw_disasm_insn (FILE *file, const struct brw_instruction *inst)
+{
+ int err = 0;
+ int space = 0;
+
+ if (inst->header.predicate_control) {
+ string (file, "(");
+ err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+ string (file, "f0");
+ if (inst->bits2.da1.flag_reg_nr)
+ format (file, ".%d", inst->bits2.da1.flag_reg_nr);
+ if (inst->header.access_mode == BRW_ALIGN_1)
+ err |= control (file, "predicate control align1", pred_ctrl_align1,
+ inst->header.predicate_control, NULL);
+ else
+ err |= control (file, "predicate control align16", pred_ctrl_align16,
+ inst->header.predicate_control, NULL);
+ string (file, ") ");
+ }
+
+ err |= print_opcode (file, inst->header.opcode);
+ err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+ err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+ if (inst->header.opcode != BRW_OPCODE_SEND)
+ err |= control (file, "conditional modifier", conditional_modifier,
+ inst->header.destreg__conditionalmod, NULL);
+
+ if (inst->header.opcode != BRW_OPCODE_NOP) {
+ string (file, "(");
+ err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+ string (file, ")");
+ }
+
+ if (inst->header.opcode == BRW_OPCODE_SEND)
+ format (file, " %d", inst->header.destreg__conditionalmod);
+
+ if (opcode[inst->header.opcode].ndst > 0) {
+ pad (file, 16);
+ err |= dest (file, inst);
+ }
+ if (opcode[inst->header.opcode].nsrc > 0) {
+ pad (file, 32);
+ err |= src0 (file, inst);
+ }
+ if (opcode[inst->header.opcode].nsrc > 1) {
+ pad (file, 48);
+ err |= src1 (file, inst);
+ }
+
+ if (inst->header.opcode == BRW_OPCODE_SEND) {
+ newline (file);
+ pad (file, 16);
+ space = 0;
+ err |= control (file, "target function", target_function,
+ inst->bits3.generic.msg_target, &space);
+ switch (inst->bits3.generic.msg_target) {
+ case BRW_MESSAGE_TARGET_MATH:
+ err |= control (file, "math function", math_function,
+ inst->bits3.math.function, &space);
+ err |= control (file, "math saturate", math_saturate,
+ inst->bits3.math.saturate, &space);
+ err |= control (file, "math signed", math_signed,
+ inst->bits3.math.int_type, &space);
+ err |= control (file, "math scalar", math_scalar,
+ inst->bits3.math.data_type, &space);
+ err |= control (file, "math precision", math_precision,
+ inst->bits3.math.precision, &space);
+ break;
+ case BRW_MESSAGE_TARGET_SAMPLER:
+ format (file, " (%d, %d, ",
+ inst->bits3.sampler.binding_table_index,
+ inst->bits3.sampler.sampler);
+ err |= control (file, "sampler target format", sampler_target_format,
+ inst->bits3.sampler.return_format, NULL);
+ string (file, ")");
+ break;
+ case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
+ format (file, " (%d, %d, %d, %d)",
+ inst->bits3.dp_write.binding_table_index,
+ (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+ inst->bits3.dp_write.msg_control,
+ inst->bits3.dp_write.msg_type,
+ inst->bits3.dp_write.send_commit_msg);
+ break;
+ case BRW_MESSAGE_TARGET_URB:
+ format (file, " %d", inst->bits3.urb.offset);
+ space = 1;
+ err |= control (file, "urb swizzle", urb_swizzle,
+ inst->bits3.urb.swizzle_control, &space);
+ err |= control (file, "urb allocate", urb_allocate,
+ inst->bits3.urb.allocate, &space);
+ err |= control (file, "urb used", urb_used,
+ inst->bits3.urb.used, &space);
+ err |= control (file, "urb complete", urb_complete,
+ inst->bits3.urb.complete, &space);
+ break;
+ case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
+ break;
+ default:
+ format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+ break;
+ }
+ if (space)
+ string (file, " ");
+ format (file, "mlen %d",
+ inst->bits3.generic.msg_length);
+ format (file, " rlen %d",
+ inst->bits3.generic.response_length);
+ }
+ pad (file, 64);
+ if (inst->header.opcode != BRW_OPCODE_NOP) {
+ string (file, "{");
+ space = 1;
+ err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+ err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+ err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+ err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
+ err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+ if (inst->header.opcode == BRW_OPCODE_SEND)
+ err |= control (file, "end of thread", end_of_thread,
+ inst->bits3.generic.end_of_thread, &space);
+ if (space)
+ string (file, " ");
+ string (file, "}");
+ }
+ string (file, ";");
+ newline (file);
+ return err;
+}
+
+
+int brw_disasm (FILE *file,
+ const struct brw_instruction *inst,
+ unsigned count)
+{
+ int i, err;
+
+ for (i = 0; i < count; i++) {
+      err = brw_disasm_insn(file, &inst[i]);
+ if (err)
+ return err;
+ }
+
+ fprintf(file, "\n");
+ return 0;
+}
+
diff --git a/src/gallium/drivers/i965/brw_disasm.h b/src/gallium/drivers/i965/brw_disasm.h
new file mode 100644
index 00000000000..ba5b109c483
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_disasm.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission. The copyright holders make no representations
+ * about the suitability of this software for any purpose. It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#ifndef BRW_DISASM_H
+#define BRW_DISASM_H
+
+#include <stdio.h>
+
+struct brw_instruction;
+
+int brw_disasm_insn (FILE *file, const struct brw_instruction *inst);
+int brw_disasm (FILE *file,
+ const struct brw_instruction *inst,
+ unsigned count);
+
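+/* Typical usage (a sketch; 'store' and 'nr_insn' are assumed to be the
+ * brw_compile fields that hold the generated instructions):
+ *
+ *    struct brw_compile *p = ...;
+ *    brw_disasm(stderr, p->store, p->nr_insn);
+ */
+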
+#endif
+
diff --git a/src/gallium/drivers/i965/brw_draw.c b/src/gallium/drivers/i965/brw_draw.c
new file mode 100644
index 00000000000..ea8d39adaf0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw.c
@@ -0,0 +1,289 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_prim.h"
+#include "util/u_upload_mgr.h"
+
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_debug.h"
+#include "brw_screen.h"
+
+#include "brw_batchbuffer.h"
+
+
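+/* Indexed directly by PIPE_PRIM_x: maps gallium primitive types to hardware
+ * 3DPRIMITIVE topology codes.  This relies on the PIPE_PRIM_* enum being
+ * dense and in the order listed below.
+ */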
+static uint32_t prim_to_hw_prim[PIPE_PRIM_POLYGON+1] = {
+ _3DPRIM_POINTLIST,
+ _3DPRIM_LINELIST,
+ _3DPRIM_LINELOOP,
+ _3DPRIM_LINESTRIP,
+ _3DPRIM_TRILIST,
+ _3DPRIM_TRISTRIP,
+ _3DPRIM_TRIFAN,
+ _3DPRIM_QUADLIST,
+ _3DPRIM_QUADSTRIP,
+ _3DPRIM_POLYGON
+};
+
+
+
+/* When the primitive changes, set a state bit and re-validate.  This is
+ * not the nicest approach; it would be better to make all the programs
+ * immune to the active primitive (ie. cope with all possibilities), but
+ * that may not be realistic.
+ */
+static int brw_set_prim(struct brw_context *brw, unsigned prim )
+{
+
+ if (BRW_DEBUG & DEBUG_PRIMS)
+ debug_printf("PRIM: %s\n", u_prim_name(prim));
+
+ if (prim != brw->primitive) {
+ unsigned reduced_prim;
+
+ brw->primitive = prim;
+ brw->state.dirty.brw |= BRW_NEW_PRIMITIVE;
+
+ reduced_prim = u_reduced_prim(prim);
+ if (reduced_prim != brw->reduced_primitive) {
+ brw->reduced_primitive = reduced_prim;
+ brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE;
+ }
+ }
+
+ return prim_to_hw_prim[prim];
+}
+
+
+
+static int brw_emit_prim(struct brw_context *brw,
+ unsigned start,
+ unsigned count,
+ boolean indexed,
+ uint32_t hw_prim)
+{
+ struct brw_3d_primitive prim_packet;
+ int ret;
+
+ if (BRW_DEBUG & DEBUG_PRIMS)
+ debug_printf("%s start %d count %d indexed %d hw_prim %d\n",
+ __FUNCTION__, start, count, indexed, hw_prim);
+
+ prim_packet.header.opcode = CMD_3D_PRIM;
+ prim_packet.header.length = sizeof(prim_packet)/4 - 2;
+ prim_packet.header.pad = 0;
+ prim_packet.header.topology = hw_prim;
+ prim_packet.header.indexed = indexed;
+
+ prim_packet.verts_per_instance = count;
+ prim_packet.start_vert_location = start;
+ if (indexed)
+ prim_packet.start_vert_location += brw->ib.start_vertex_offset;
+ prim_packet.instance_count = 1;
+ prim_packet.start_instance_location = 0;
+ prim_packet.base_vert_location = 0; /* prim->basevertex; XXX: add this to gallium */
+
+
+ /* If we're set to always flush, do it before and after the primitive emit.
+ * We want to catch both missed flushes that hurt instruction/state cache
+ * and missed flushes of the render cache as it heads to other parts of
+    * the GPU besides the draw code.
+ */
+ if (0) {
+ BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+ OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
+ ADVANCE_BATCH();
+ }
+ if (prim_packet.verts_per_instance) {
+ ret = brw_batchbuffer_data( brw->batch, &prim_packet,
+ sizeof(prim_packet), LOOP_CLIPRECTS);
+ if (ret)
+ return ret;
+ }
+ if (0) {
+ BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+ OUT_BATCH((CMD_MI_FLUSH << 16) | BRW_FLUSH_STATE_CACHE);
+ ADVANCE_BATCH();
+ }
+
+ return 0;
+}
+
+
+/* May fail if out of video memory for texture or vbo upload, or on
+ * fallback conditions.
+ */
+static int
+try_draw_range_elements(struct brw_context *brw,
+ struct pipe_buffer *index_buffer,
+ unsigned hw_prim,
+ unsigned start, unsigned count)
+{
+ int ret;
+
+ ret = brw_validate_state(brw);
+ if (ret)
+ return ret;
+
+ /* Check that we can fit our state in with our existing batchbuffer, or
+ * flush otherwise.
+ */
+ ret = brw->sws->check_aperture_space(brw->sws,
+ brw->state.validated_bos,
+ brw->state.validated_bo_count);
+ if (ret)
+ return ret;
+
+ ret = brw_upload_state(brw);
+ if (ret)
+ return ret;
+
+ ret = brw_emit_prim(brw, start, count, index_buffer != NULL, hw_prim);
+ if (ret)
+ return ret;
+
+ if (brw->flags.always_flush_batch)
+ brw_context_flush( brw );
+
+ return 0;
+}
+
+
+static void
+brw_draw_range_elements(struct pipe_context *pipe,
+ struct pipe_buffer *index_buffer,
+ unsigned index_size,
+ unsigned min_index,
+ unsigned max_index,
+ unsigned mode, unsigned start, unsigned count)
+{
+ struct brw_context *brw = brw_context(pipe);
+ int ret;
+ uint32_t hw_prim;
+
+ hw_prim = brw_set_prim(brw, mode);
+
+ if (BRW_DEBUG & DEBUG_PRIMS)
+ debug_printf("PRIM: %s start %d count %d index_buffer %p\n",
+ u_prim_name(mode), start, count, (void *)index_buffer);
+
+ /* Potentially trigger upload of new index buffer.
+ *
+ * XXX: do we need to go through state validation to achieve this?
+ * Could just call upload code directly.
+ */
+ if (brw->curr.index_buffer != index_buffer ||
+ brw->curr.index_size != index_size) {
+ pipe_buffer_reference( &brw->curr.index_buffer, index_buffer );
+ brw->curr.index_size = index_size;
+ brw->state.dirty.mesa |= PIPE_NEW_INDEX_BUFFER;
+ }
+
+ /* XXX: do we really care?
+ */
+ if (brw->curr.min_index != min_index ||
+ brw->curr.max_index != max_index)
+ {
+ brw->curr.min_index = min_index;
+ brw->curr.max_index = max_index;
+ brw->state.dirty.mesa |= PIPE_NEW_INDEX_RANGE;
+ }
+
+
+ /* Make a first attempt at drawing:
+ */
+ ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
+
+ /* Otherwise, flush and retry:
+ */
+ if (ret != 0) {
+ brw_context_flush( brw );
+ ret = try_draw_range_elements(brw, index_buffer, hw_prim, start, count );
+ assert(ret == 0);
+ }
+}
+
+static void
+brw_draw_elements(struct pipe_context *pipe,
+ struct pipe_buffer *index_buffer,
+ unsigned index_size,
+ unsigned mode,
+ unsigned start, unsigned count)
+{
+ brw_draw_range_elements( pipe, index_buffer,
+ index_size,
+ 0, 0xffffffff,
+ mode,
+ start, count );
+}
+
+static void
+brw_draw_arrays(struct pipe_context *pipe, unsigned mode,
+ unsigned start, unsigned count)
+{
+ brw_draw_elements(pipe, NULL, 0, mode, start, count);
+}
+
+
+
+boolean brw_draw_init( struct brw_context *brw )
+{
+ /* Register our drawing function:
+ */
+ brw->base.draw_arrays = brw_draw_arrays;
+ brw->base.draw_elements = brw_draw_elements;
+ brw->base.draw_range_elements = brw_draw_range_elements;
+
+ /* Create helpers for uploading data in user buffers:
+ */
+ brw->vb.upload_vertex = u_upload_create( brw->base.screen,
+ 128 * 1024,
+ 64,
+ PIPE_BUFFER_USAGE_VERTEX );
+ if (brw->vb.upload_vertex == NULL)
+ return FALSE;
+
+ brw->vb.upload_index = u_upload_create( brw->base.screen,
+ 32 * 1024,
+ 64,
+ PIPE_BUFFER_USAGE_INDEX );
+ if (brw->vb.upload_index == NULL)
+ return FALSE;
+
+ return TRUE;
+}
+
+void brw_draw_cleanup( struct brw_context *brw )
+{
+ u_upload_destroy( brw->vb.upload_vertex );
+ u_upload_destroy( brw->vb.upload_index );
+
+ bo_reference(&brw->ib.bo, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_draw.h b/src/gallium/drivers/i965/brw_draw.h
new file mode 100644
index 00000000000..8dc5dbce622
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw.h
@@ -0,0 +1,39 @@
+ /**************************************************************************
+ *
+ * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_DRAW_H
+#define BRW_DRAW_H
+
+#include "brw_types.h"
+
+struct brw_context;
+
+boolean brw_draw_init( struct brw_context *brw );
+void brw_draw_cleanup( struct brw_context *brw );
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_draw_upload.c b/src/gallium/drivers/i965/brw_draw_upload.c
new file mode 100644
index 00000000000..a27da5f1c17
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_draw_upload.c
@@ -0,0 +1,542 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_context.h"
+
+#include "util/u_upload_mgr.h"
+#include "util/u_math.h"
+
+#include "brw_draw.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
+#include "brw_debug.h"
+
+
+
+
+static unsigned brw_translate_surface_format( unsigned id )
+{
+ switch (id) {
+ case PIPE_FORMAT_R64_FLOAT:
+ return BRW_SURFACEFORMAT_R64_FLOAT;
+ case PIPE_FORMAT_R64G64_FLOAT:
+ return BRW_SURFACEFORMAT_R64G64_FLOAT;
+ case PIPE_FORMAT_R64G64B64_FLOAT:
+ return BRW_SURFACEFORMAT_R64G64B64_FLOAT;
+ case PIPE_FORMAT_R64G64B64A64_FLOAT:
+ return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT;
+
+ case PIPE_FORMAT_R32_FLOAT:
+ return BRW_SURFACEFORMAT_R32_FLOAT;
+ case PIPE_FORMAT_R32G32_FLOAT:
+ return BRW_SURFACEFORMAT_R32G32_FLOAT;
+ case PIPE_FORMAT_R32G32B32_FLOAT:
+ return BRW_SURFACEFORMAT_R32G32B32_FLOAT;
+ case PIPE_FORMAT_R32G32B32A32_FLOAT:
+ return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+ case PIPE_FORMAT_R32_UNORM:
+ return BRW_SURFACEFORMAT_R32_UNORM;
+ case PIPE_FORMAT_R32G32_UNORM:
+ return BRW_SURFACEFORMAT_R32G32_UNORM;
+ case PIPE_FORMAT_R32G32B32_UNORM:
+ return BRW_SURFACEFORMAT_R32G32B32_UNORM;
+ case PIPE_FORMAT_R32G32B32A32_UNORM:
+ return BRW_SURFACEFORMAT_R32G32B32A32_UNORM;
+
+ case PIPE_FORMAT_R32_USCALED:
+ return BRW_SURFACEFORMAT_R32_USCALED;
+ case PIPE_FORMAT_R32G32_USCALED:
+ return BRW_SURFACEFORMAT_R32G32_USCALED;
+ case PIPE_FORMAT_R32G32B32_USCALED:
+ return BRW_SURFACEFORMAT_R32G32B32_USCALED;
+ case PIPE_FORMAT_R32G32B32A32_USCALED:
+ return BRW_SURFACEFORMAT_R32G32B32A32_USCALED;
+
+ case PIPE_FORMAT_R32_SNORM:
+ return BRW_SURFACEFORMAT_R32_SNORM;
+ case PIPE_FORMAT_R32G32_SNORM:
+ return BRW_SURFACEFORMAT_R32G32_SNORM;
+ case PIPE_FORMAT_R32G32B32_SNORM:
+ return BRW_SURFACEFORMAT_R32G32B32_SNORM;
+ case PIPE_FORMAT_R32G32B32A32_SNORM:
+ return BRW_SURFACEFORMAT_R32G32B32A32_SNORM;
+
+ case PIPE_FORMAT_R32_SSCALED:
+ return BRW_SURFACEFORMAT_R32_SSCALED;
+ case PIPE_FORMAT_R32G32_SSCALED:
+ return BRW_SURFACEFORMAT_R32G32_SSCALED;
+ case PIPE_FORMAT_R32G32B32_SSCALED:
+ return BRW_SURFACEFORMAT_R32G32B32_SSCALED;
+ case PIPE_FORMAT_R32G32B32A32_SSCALED:
+ return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED;
+
+ case PIPE_FORMAT_R16_UNORM:
+ return BRW_SURFACEFORMAT_R16_UNORM;
+ case PIPE_FORMAT_R16G16_UNORM:
+ return BRW_SURFACEFORMAT_R16G16_UNORM;
+ case PIPE_FORMAT_R16G16B16_UNORM:
+ return BRW_SURFACEFORMAT_R16G16B16_UNORM;
+ case PIPE_FORMAT_R16G16B16A16_UNORM:
+ return BRW_SURFACEFORMAT_R16G16B16A16_UNORM;
+
+ case PIPE_FORMAT_R16_USCALED:
+ return BRW_SURFACEFORMAT_R16_USCALED;
+ case PIPE_FORMAT_R16G16_USCALED:
+ return BRW_SURFACEFORMAT_R16G16_USCALED;
+ case PIPE_FORMAT_R16G16B16_USCALED:
+ return BRW_SURFACEFORMAT_R16G16B16_USCALED;
+ case PIPE_FORMAT_R16G16B16A16_USCALED:
+ return BRW_SURFACEFORMAT_R16G16B16A16_USCALED;
+
+ case PIPE_FORMAT_R16_SNORM:
+ return BRW_SURFACEFORMAT_R16_SNORM;
+ case PIPE_FORMAT_R16G16_SNORM:
+ return BRW_SURFACEFORMAT_R16G16_SNORM;
+ case PIPE_FORMAT_R16G16B16_SNORM:
+ return BRW_SURFACEFORMAT_R16G16B16_SNORM;
+ case PIPE_FORMAT_R16G16B16A16_SNORM:
+ return BRW_SURFACEFORMAT_R16G16B16A16_SNORM;
+
+ case PIPE_FORMAT_R16_SSCALED:
+ return BRW_SURFACEFORMAT_R16_SSCALED;
+ case PIPE_FORMAT_R16G16_SSCALED:
+ return BRW_SURFACEFORMAT_R16G16_SSCALED;
+ case PIPE_FORMAT_R16G16B16_SSCALED:
+ return BRW_SURFACEFORMAT_R16G16B16_SSCALED;
+ case PIPE_FORMAT_R16G16B16A16_SSCALED:
+ return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED;
+
+ case PIPE_FORMAT_R8_UNORM:
+ return BRW_SURFACEFORMAT_R8_UNORM;
+ case PIPE_FORMAT_R8G8_UNORM:
+ return BRW_SURFACEFORMAT_R8G8_UNORM;
+ case PIPE_FORMAT_R8G8B8_UNORM:
+ return BRW_SURFACEFORMAT_R8G8B8_UNORM;
+ case PIPE_FORMAT_R8G8B8A8_UNORM:
+ return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+
+ case PIPE_FORMAT_R8_USCALED:
+ return BRW_SURFACEFORMAT_R8_USCALED;
+ case PIPE_FORMAT_R8G8_USCALED:
+ return BRW_SURFACEFORMAT_R8G8_USCALED;
+ case PIPE_FORMAT_R8G8B8_USCALED:
+ return BRW_SURFACEFORMAT_R8G8B8_USCALED;
+ case PIPE_FORMAT_R8G8B8A8_USCALED:
+ return BRW_SURFACEFORMAT_R8G8B8A8_USCALED;
+
+ case PIPE_FORMAT_R8_SNORM:
+ return BRW_SURFACEFORMAT_R8_SNORM;
+ case PIPE_FORMAT_R8G8_SNORM:
+ return BRW_SURFACEFORMAT_R8G8_SNORM;
+ case PIPE_FORMAT_R8G8B8_SNORM:
+ return BRW_SURFACEFORMAT_R8G8B8_SNORM;
+ case PIPE_FORMAT_R8G8B8A8_SNORM:
+ return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+ case PIPE_FORMAT_R8_SSCALED:
+ return BRW_SURFACEFORMAT_R8_SSCALED;
+ case PIPE_FORMAT_R8G8_SSCALED:
+ return BRW_SURFACEFORMAT_R8G8_SSCALED;
+ case PIPE_FORMAT_R8G8B8_SSCALED:
+ return BRW_SURFACEFORMAT_R8G8B8_SSCALED;
+ case PIPE_FORMAT_R8G8B8A8_SSCALED:
+ return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED;
+
+ default:
+ assert(0);
+ return 0;
+ }
+}
+
+static unsigned get_index_type(int type)
+{
+ switch (type) {
+ case 1: return BRW_INDEX_BYTE;
+ case 2: return BRW_INDEX_WORD;
+ case 4: return BRW_INDEX_DWORD;
+ default: assert(0); return 0;
+ }
+}
+
+
+static int brw_prepare_vertices(struct brw_context *brw)
+{
+ unsigned int min_index = brw->curr.min_index;
+ unsigned int max_index = brw->curr.max_index;
+ GLuint i;
+ int ret;
+
+ if (BRW_DEBUG & DEBUG_VERTS)
+ debug_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
+
+
+ for (i = 0; i < brw->curr.num_vertex_buffers; i++) {
+ struct pipe_vertex_buffer *vb = &brw->curr.vertex_buffer[i];
+ struct brw_winsys_buffer *bo;
+ struct pipe_buffer *upload_buf = NULL;
+ unsigned offset;
+
+ if (BRW_DEBUG & DEBUG_VERTS)
+ debug_printf("%s vb[%d] user:%d offset:0x%x sz:0x%x stride:0x%x\n",
+ __FUNCTION__, i,
+ brw_buffer_is_user_buffer(vb->buffer),
+ vb->buffer_offset,
+ vb->buffer->size,
+ vb->stride);
+
+ if (brw_buffer_is_user_buffer(vb->buffer)) {
+
+ /* XXX: simplify this. Stop the state trackers from generating
+ * zero-stride buffers & have them use additional constants (or
+ * add support for >1 constant buffer) instead.
+ */
+ unsigned size = (vb->stride == 0 ?
+ vb->buffer->size - vb->buffer_offset :
+ MAX2(vb->buffer->size - vb->buffer_offset,
+ vb->stride * (max_index + 1 - min_index)));
+
+ ret = u_upload_buffer( brw->vb.upload_vertex,
+ vb->buffer_offset + min_index * vb->stride,
+ size,
+ vb->buffer,
+ &offset,
+ &upload_buf );
+ if (ret)
+ return ret;
+
+ bo = brw_buffer(upload_buf)->bo;
+
+ assert(offset + size <= bo->size);
+ }
+ else
+ {
+ offset = vb->buffer_offset;
+ bo = brw_buffer(vb->buffer)->bo;
+ }
+
+ assert(offset < bo->size);
+
+ /* Set up post-upload info about this vertex buffer:
+ */
+ brw->vb.vb[i].offset = offset;
+ brw->vb.vb[i].stride = vb->stride;
+ brw->vb.vb[i].vertex_count = (vb->stride == 0 ?
+ 1 :
+ (bo->size - offset) / vb->stride);
+
+ bo_reference( &brw->vb.vb[i].bo, bo );
+
+ /* Don't need to retain this reference. We have a reference on
+ * the underlying winsys buffer:
+ */
+ pipe_buffer_reference( &upload_buf, NULL );
+ }
+
+ brw->vb.nr_vb = i;
+ brw_prepare_query_begin(brw);
+
+ for (i = 0; i < brw->vb.nr_vb; i++) {
+ brw_add_validated_bo(brw, brw->vb.vb[i].bo);
+ }
+
+ return 0;
+}
+
+static int brw_emit_vertex_buffers( struct brw_context *brw )
+{
+ int i;
+
+ /* If the VS doesn't read any inputs (calculating vertex position from
+ * a state variable for some reason, for example), just bail.
+ *
+    * The stale VB state stays in place, but those buffers don't do anything
+    * unless a VE loads from them.
+ */
+ if (brw->vb.nr_vb == 0) {
+ if (BRW_DEBUG & DEBUG_VERTS)
+ debug_printf("%s: no active vertex buffers\n", __FUNCTION__);
+
+ return 0;
+ }
+
+ /* Emit VB state packets.
+ */
+ BEGIN_BATCH(1 + brw->vb.nr_vb * 4, IGNORE_CLIPRECTS);
+ OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
+ ((1 + brw->vb.nr_vb * 4) - 2));
+
+ for (i = 0; i < brw->vb.nr_vb; i++) {
+ OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
+ BRW_VB0_ACCESS_VERTEXDATA |
+ (brw->vb.vb[i].stride << BRW_VB0_PITCH_SHIFT));
+ OUT_RELOC(brw->vb.vb[i].bo,
+ BRW_USAGE_VERTEX,
+ brw->vb.vb[i].offset);
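+      /* On IGDNG the third dword holds the buffer end address; on earlier
+       * chipsets it holds the vertex count instead.
+       */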
+ if (BRW_IS_IGDNG(brw)) {
+ OUT_RELOC(brw->vb.vb[i].bo,
+ BRW_USAGE_VERTEX,
+ brw->vb.vb[i].bo->size - 1);
+ } else
+ OUT_BATCH(brw->vb.vb[i].stride ? brw->vb.vb[i].vertex_count : 0);
+ OUT_BATCH(0); /* Instance data step rate */
+ }
+ ADVANCE_BATCH();
+ return 0;
+}
+
+
+
+
+static int brw_emit_vertex_elements(struct brw_context *brw)
+{
+ GLuint nr = brw->curr.num_vertex_elements;
+ GLuint i;
+
+ brw_emit_query_begin(brw);
+
+ /* If the VS doesn't read any inputs (calculating vertex position from
+ * a state variable for some reason, for example), emit a single pad
+ * VERTEX_ELEMENT struct and bail.
+ *
+    * The stale VB state stays in place, but those buffers don't do anything
+    * unless a VE loads from them.
+ */
+ if (nr == 0) {
+ BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+ OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
+ OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
+ BRW_VE0_VALID |
+ (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
+ (0 << BRW_VE0_SRC_OFFSET_SHIFT));
+ OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
+ (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+ (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+ (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
+ ADVANCE_BATCH();
+ return 0;
+ }
+
+ /* Now emit vertex element (VEP) state packets.
+ *
+ */
+ BEGIN_BATCH(1 + nr * 2, IGNORE_CLIPRECTS);
+ OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + nr * 2) - 2));
+ for (i = 0; i < nr; i++) {
+ const struct pipe_vertex_element *input = &brw->curr.vertex_element[i];
+ uint32_t format = brw_translate_surface_format( input->src_format );
+ uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
+ uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
+ uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
+ uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
+
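+      /* Deliberate switch fall-through: components the element does not
+       * supply are filled with the default (0, 0, 0, 1).
+       */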
+ switch (input->nr_components) {
+ case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
+ case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
+ case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
+ case 3: comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
+ break;
+ }
+
+ OUT_BATCH((input->vertex_buffer_index << BRW_VE0_INDEX_SHIFT) |
+ BRW_VE0_VALID |
+ (format << BRW_VE0_FORMAT_SHIFT) |
+ (input->src_offset << BRW_VE0_SRC_OFFSET_SHIFT));
+
+ if (BRW_IS_IGDNG(brw))
+ OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+ (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+ (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+ (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
+ else
+ OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+ (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+ (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+ (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
+ ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
+ }
+ ADVANCE_BATCH();
+ return 0;
+}
+
+
+static int brw_emit_vertices( struct brw_context *brw )
+{
+ int ret;
+
+ ret = brw_emit_vertex_buffers( brw );
+ if (ret)
+ return ret;
+
+ ret = brw_emit_vertex_elements( brw );
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+
+const struct brw_tracked_state brw_vertices = {
+ .dirty = {
+ .mesa = (PIPE_NEW_INDEX_RANGE |
+ PIPE_NEW_VERTEX_BUFFER),
+ .brw = BRW_NEW_BATCH,
+ .cache = 0,
+ },
+ .prepare = brw_prepare_vertices,
+ .emit = brw_emit_vertices,
+};
+
+
+static int brw_prepare_indices(struct brw_context *brw)
+{
+ struct pipe_buffer *index_buffer = brw->curr.index_buffer;
+ struct pipe_buffer *upload_buf = NULL;
+ struct brw_winsys_buffer *bo = NULL;
+ GLuint offset;
+ GLuint index_size;
+ GLuint ib_size;
+ int ret;
+
+ if (index_buffer == NULL)
+ return 0;
+
+ if (BRW_DEBUG & DEBUG_VERTS)
+ debug_printf("%s: index_size:%d index_buffer->size:%d\n",
+ __FUNCTION__,
+ brw->curr.index_size,
+ brw->curr.index_buffer->size);
+
+ ib_size = index_buffer->size;
+ index_size = brw->curr.index_size;
+
+ /* Turn userbuffer into a proper hardware buffer?
+ */
+ if (brw_buffer_is_user_buffer(index_buffer)) {
+
+ ret = u_upload_buffer( brw->vb.upload_index,
+ 0,
+ ib_size,
+ index_buffer,
+ &offset,
+ &upload_buf );
+ if (ret)
+ return ret;
+
+ bo = brw_buffer(upload_buf)->bo;
+
+ /* XXX: annotate the userbuffer with the upload information so
+ * that successive calls don't get re-uploaded.
+ */
+ }
+ else {
+ bo = brw_buffer(index_buffer)->bo;
+ ib_size = bo->size;
+ offset = 0;
+ }
+
+ /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading the
+ * index buffer state when we're just moving the start index of our
+ * drawing.
+ *
+ * In gallium this will happen in the case where successive draw
+ * calls are made with (distinct?) userbuffers, but the upload_mgr
+ * places the data into a single winsys buffer.
+ *
+ * This statechange doesn't raise any state flags and is always
+ * just merged into the final draw packet:
+ */
+ if (1) {
+ assert((offset & (index_size - 1)) == 0);
+ brw->ib.start_vertex_offset = offset / index_size;
+ }
+
+ /* These statechanges trigger a new CMD_INDEX_BUFFER packet:
+ */
+ if (brw->ib.bo != bo ||
+ brw->ib.size != ib_size)
+ {
+ bo_reference(&brw->ib.bo, bo);
+ brw->ib.size = ib_size;
+ brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
+ }
+
+ pipe_buffer_reference( &upload_buf, NULL );
+ brw_add_validated_bo(brw, brw->ib.bo);
+ return 0;
+}
+
+const struct brw_tracked_state brw_indices = {
+ .dirty = {
+ .mesa = PIPE_NEW_INDEX_BUFFER,
+ .brw = 0,
+ .cache = 0,
+ },
+ .prepare = brw_prepare_indices,
+};
+
+static int brw_emit_index_buffer(struct brw_context *brw)
+{
+ /* Emit the indexbuffer packet:
+ */
+ if (brw->ib.bo)
+ {
+ struct brw_indexbuffer ib;
+
+ memset(&ib, 0, sizeof(ib));
+
+ ib.header.bits.opcode = CMD_INDEX_BUFFER;
+ ib.header.bits.length = sizeof(ib)/4 - 2;
+ ib.header.bits.index_format = get_index_type(brw->ib.size);
+ ib.header.bits.cut_index_enable = 0;
+
+ BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+ OUT_BATCH( ib.header.dword );
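+      /* The two relocations below give the inclusive start and end addresses
+       * of the index buffer.
+       */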
+ OUT_RELOC(brw->ib.bo,
+ BRW_USAGE_VERTEX,
+ brw->ib.offset);
+ OUT_RELOC(brw->ib.bo,
+ BRW_USAGE_VERTEX,
+ brw->ib.offset + brw->ib.size - 1);
+ OUT_BATCH( 0 );
+ ADVANCE_BATCH();
+ }
+
+ return 0;
+}
+
+const struct brw_tracked_state brw_index_buffer = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
+ .cache = 0,
+ },
+ .emit = brw_emit_index_buffer,
+};
diff --git a/src/gallium/drivers/i965/brw_eu.c b/src/gallium/drivers/i965/brw_eu.c
new file mode 100644
index 00000000000..a8fcb5f97eb
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu.c
@@ -0,0 +1,262 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+
+/* How does predicate control work when execution_size != 8? Do I
+ * need to test/set for 0xffff when execution_size is 16?
+ */
+void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value )
+{
+ p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+ if (value != 0xff) {
+ if (value != p->flag_value) {
+ brw_push_insn_state(p);
+ brw_MOV(p, brw_flag_reg(), brw_imm_uw(value));
+ p->flag_value = value;
+ brw_pop_insn_state(p);
+ }
+
+ p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+ }
+}
+
+void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
+{
+ p->current->header.predicate_control = pc;
+}
+
+void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
+{
+ p->current->header.destreg__conditionalmod = conditional;
+}
+
+void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
+{
+ p->current->header.access_mode = access_mode;
+}
+
+void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
+{
+ p->current->header.compression_control = compression_control;
+}
+
+void brw_set_mask_control( struct brw_compile *p, GLuint value )
+{
+ p->current->header.mask_control = value;
+}
+
+void brw_set_saturate( struct brw_compile *p, GLuint value )
+{
+ p->current->header.saturate = value;
+}
+
+void brw_push_insn_state( struct brw_compile *p )
+{
+ assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
+ memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
+ p->current++;
+}
+
+void brw_pop_insn_state( struct brw_compile *p )
+{
+ assert(p->current != p->stack);
+ p->current--;
+}
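+
+/* Typical push/pop usage (an illustrative sketch, not code from this
+ * driver): temporarily override the instruction-header defaults for a few
+ * emitted instructions, then restore them:
+ *
+ *    brw_push_insn_state(p);
+ *    brw_set_mask_control(p, BRW_MASK_DISABLE);
+ *    brw_MOV(p, dest, src);
+ *    brw_pop_insn_state(p);
+ *
+ * Here 'dest' and 'src' stand for whatever brw_reg values the caller has
+ * built; brw_set_predicate_control_flag_value() above uses the same
+ * pattern around its flag-register MOV.
+ */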
+
+
+/***********************************************************************
+ */
+void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
+{
+ p->brw = brw;
+ p->nr_insn = 0;
+ p->current = p->stack;
+ memset(p->current, 0, sizeof(p->current[0]));
+
+ /* Some defaults?
+ */
+ brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */
+ brw_set_saturate(p, 0);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+
+enum pipe_error brw_get_program( struct brw_compile *p,
+ const GLuint **data,
+ GLuint *sz )
+{
+ GLuint i;
+
+ for (i = 0; i < 8; i++)
+ brw_NOP(p);
+
+ /* Is the generated program malformed for some reason?
+ */
+ if (p->error)
+ return PIPE_ERROR_BAD_INPUT;
+
+ *sz = p->nr_insn * sizeof(struct brw_instruction);
+ *data = (const GLuint *)p->store;
+ return PIPE_OK;
+}
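+
+/* Overall compile lifecycle (hypothetical caller, shown for illustration
+ * only):
+ *
+ *    struct brw_compile c;
+ *    const GLuint *program;
+ *    GLuint program_size;
+ *
+ *    brw_init_compile(brw, &c);
+ *    ... emit instructions with brw_MOV(), brw_ADD(), etc. ...
+ *    if (brw_get_program(&c, &program, &program_size) != PIPE_OK)
+ *       ... reject the malformed program ...
+ *
+ * brw_get_program() appends the trailing NOPs itself, so callers emit only
+ * the payload instructions.
+ */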
+
+
+
+/**
+ * Subroutine calls require special attention.
+ * Mesa instructions may be expanded into multiple hardware instructions
+ * so the prog_instruction::BranchTarget field can't be used as an index
+ * into the hardware instructions.
+ *
+ * The BranchTarget field isn't needed, however. Mesa's GLSL compiler
+ * emits CAL and BGNSUB instructions with labels that can be used to map
+ * subroutine calls to actual subroutine code blocks.
+ *
+ * The structures and function here implement patching of CAL instructions
+ * so they jump to the right subroutine code...
+ */
+
+
+/**
+ * For each OPCODE_BGNSUB we create one of these.
+ */
+struct brw_eu_label
+{
+ GLuint label; /**< the label number */
+ GLuint position; /**< the position of the brw instruction for this label */
+ struct brw_eu_label *next; /**< next in linked list */
+};
+
+
+/**
+ * For each OPCODE_CAL we create one of these.
+ */
+struct brw_eu_call
+{
+ GLuint call_inst_pos; /**< location of the CAL instruction */
+ GLuint label;
+ struct brw_eu_call *next; /**< next in linked list */
+};
+
+
+/**
+ * Called for each OPCODE_BGNSUB.
+ */
+void
+brw_save_label(struct brw_compile *c, unsigned l, GLuint position)
+{
+ struct brw_eu_label *label = CALLOC_STRUCT(brw_eu_label);
+ label->label = l;
+ label->position = position;
+ label->next = c->first_label;
+ c->first_label = label;
+}
+
+
+/**
+ * Called for each OPCODE_CAL.
+ */
+void
+brw_save_call(struct brw_compile *c, GLuint label, GLuint call_pos)
+{
+ struct brw_eu_call *call = CALLOC_STRUCT(brw_eu_call);
+ call->call_inst_pos = call_pos;
+ call->label = label;
+ call->next = c->first_call;
+ c->first_call = call;
+}
+
+
+/**
+ * Lookup a label, return label's position/offset.
+ */
+static GLuint
+brw_lookup_label(struct brw_compile *c, unsigned l)
+{
+ const struct brw_eu_label *label;
+ for (label = c->first_label; label; label = label->next) {
+ if (l == label->label) {
+ return label->position;
+ }
+ }
+ abort(); /* should never happen */
+ return ~0;
+}
+
+
+/**
+ * When we're done generating code, this function is called to resolve
+ * subroutine calls.
+ */
+void
+brw_resolve_cals(struct brw_compile *c)
+{
+ const struct brw_eu_call *call;
+
+ for (call = c->first_call; call; call = call->next) {
+ const GLuint sub_loc = brw_lookup_label(c, call->label);
+ struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
+ struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
+ GLint offset = brw_sub_inst - brw_call_inst;
+
+ /* patch brw_inst1 to point to brw_inst2 */
+ brw_set_src1(brw_call_inst, brw_imm_d(offset * 16));
+ }
+
+ /* free linked list of calls */
+ {
+ struct brw_eu_call *call, *next;
+ for (call = c->first_call; call; call = next) {
+ next = call->next;
+ FREE(call);
+ }
+ c->first_call = NULL;
+ }
+
+ /* free linked list of labels */
+ {
+ struct brw_eu_label *label, *next;
+ for (label = c->first_label; label; label = next) {
+ next = label->next;
+ FREE(label);
+ }
+ c->first_label = NULL;
+ }
+}
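+
+/* How the label/call records above are used (an illustrative sketch;
+ * 'label_id' and 'call_pos' are placeholder names for values the
+ * translator tracks):
+ *
+ *    brw_save_label(&c, label_id, c.nr_insn);   at each OPCODE_BGNSUB
+ *    brw_save_call(&c, label_id, call_pos);     at each OPCODE_CAL
+ *    ...
+ *    brw_resolve_cals(&c);                      once all code is emitted
+ *
+ * brw_resolve_cals() then rewrites each recorded CAL's src1 with the
+ * relative offset to its subroutine, as shown above.
+ */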
diff --git a/src/gallium/drivers/i965/brw_eu.h b/src/gallium/drivers/i965/brw_eu.h
new file mode 100644
index 00000000000..af509b2e5f4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu.h
@@ -0,0 +1,992 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_EU_H
+#define BRW_EU_H
+
+#include "util/u_debug.h"
+#include "pipe/p_defines.h"
+
+#include "brw_structs.h"
+#include "brw_defines.h"
+
+#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6))
+#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3)
+
+#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1)
+
+#define BRW_WRITEMASK_NONE 0x00
+#define BRW_WRITEMASK_X 0x01
+#define BRW_WRITEMASK_Y 0x02
+#define BRW_WRITEMASK_XY 0x03
+#define BRW_WRITEMASK_Z 0x04
+#define BRW_WRITEMASK_XZ 0x05
+#define BRW_WRITEMASK_YZ 0x06
+#define BRW_WRITEMASK_XYZ 0x07
+#define BRW_WRITEMASK_W 0x08
+#define BRW_WRITEMASK_XW 0x09
+#define BRW_WRITEMASK_YW 0x0A
+#define BRW_WRITEMASK_XYW 0x0B
+#define BRW_WRITEMASK_ZW 0x0C
+#define BRW_WRITEMASK_XZW 0x0D
+#define BRW_WRITEMASK_YZW 0x0E
+#define BRW_WRITEMASK_XYZW 0x0F
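+
+/* Worked example of the swizzle encoding (values follow directly from the
+ * macros above):
+ *
+ *    BRW_SWIZZLE_XYZW == 0xE4                (bits 11 10 01 00)
+ *    BRW_SWIZZLE_XXXX == 0x00
+ *    BRW_GET_SWZ(BRW_SWIZZLE_XYXY, 3) == 1   (channel W reads from Y)
+ */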
+
+
+#define REG_SIZE (8*4)
+
+
+/* These aren't hardware structs, just something useful for us to pass around:
+ *
+ * Align1 operation has a lot of control over input ranges. Used in
+ * WM programs to implement shaders decomposed into "channel serial"
+ * or "structure of array" form:
+ */
+struct brw_reg
+{
+ GLuint type:4;
+ GLuint file:2;
+ GLuint nr:8;
+ GLuint subnr:5; /* :1 in align16 */
+ GLuint negate:1; /* source only */
+ GLuint abs:1; /* source only */
+ GLuint vstride:4; /* source only */
+ GLuint width:3; /* src only, align1 only */
+ GLuint hstride:2; /* align1 only */
+ GLuint address_mode:1; /* relative addressing, hopefully! */
+ GLuint pad0:1;
+
+ union {
+ struct {
+ GLuint swizzle:8; /* src only, align16 only */
+ GLuint writemask:4; /* dest only, align16 only */
+ GLint indirect_offset:10; /* relative addressing offset */
+ GLuint pad1:10; /* two dwords total */
+ } bits;
+
+ GLfloat f;
+ GLint d;
+ GLuint ud;
+ } dw1;
+};
+
+
+struct brw_indirect {
+ GLuint addr_subnr:4;
+ GLint addr_offset:10;
+ GLuint pad:18;
+};
+
+
+struct brw_eu_label;
+struct brw_eu_call;
+
+
+
+#define BRW_EU_MAX_INSN_STACK 5
+#define BRW_EU_MAX_INSN 10000
+
+struct brw_compile {
+ struct brw_instruction store[BRW_EU_MAX_INSN];
+ GLuint nr_insn;
+
+ /* Allow clients to push/pop instruction state:
+ */
+ struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+ struct brw_instruction *current;
+
+ GLuint flag_value;
+ GLboolean single_program_flow;
+ struct brw_context *brw;
+
+ struct brw_eu_label *first_label; /**< linked list of labels */
+ struct brw_eu_call *first_call; /**< linked list of CALs */
+
+ boolean error;
+};
+
+
+void
+brw_save_label(struct brw_compile *c, unsigned label, GLuint position);
+
+void
+brw_save_call(struct brw_compile *c, unsigned label, GLuint call_pos);
+
+void
+brw_resolve_cals(struct brw_compile *c);
+
+
+
+static INLINE int type_sz( GLuint type )
+{
+ switch( type ) {
+ case BRW_REGISTER_TYPE_UD:
+ case BRW_REGISTER_TYPE_D:
+ case BRW_REGISTER_TYPE_F:
+ return 4;
+ case BRW_REGISTER_TYPE_HF:
+ case BRW_REGISTER_TYPE_UW:
+ case BRW_REGISTER_TYPE_W:
+ return 2;
+ case BRW_REGISTER_TYPE_UB:
+ case BRW_REGISTER_TYPE_B:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/**
+ * Construct a brw_reg.
+ * \param file one of the BRW_x_REGISTER_FILE values
+ * \param nr register number/index
+ * \param subnr register sub number
+ * \param type one of BRW_REGISTER_TYPE_x
+ * \param vstride one of BRW_VERTICAL_STRIDE_x
+ * \param width one of BRW_WIDTH_x
+ * \param hstride one of BRW_HORIZONTAL_STRIDE_x
+ * \param swizzle one of BRW_SWIZZLE_x
+ * \param writemask BRW_WRITEMASK_X/Y/Z/W bitfield
+ */
+static INLINE struct brw_reg brw_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr,
+ GLuint type,
+ GLuint vstride,
+ GLuint width,
+ GLuint hstride,
+ GLuint swizzle,
+ GLuint writemask )
+{
+ struct brw_reg reg;
+ /* The register number limits depend on the register file rather than
+ * on the data type, so validate against 'file':
+ */
+ if (file == BRW_GENERAL_REGISTER_FILE)
+ assert(nr < BRW_MAX_GRF);
+ else if (file == BRW_MESSAGE_REGISTER_FILE)
+ assert(nr < BRW_MAX_MRF);
+ else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(nr <= BRW_ARF_IP);
+
+ reg.type = type;
+ reg.file = file;
+ reg.nr = nr;
+ reg.subnr = subnr * type_sz(type);
+ reg.negate = 0;
+ reg.abs = 0;
+ reg.vstride = vstride;
+ reg.width = width;
+ reg.hstride = hstride;
+ reg.address_mode = BRW_ADDRESS_DIRECT;
+ reg.pad0 = 0;
+
+ /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
+ * set swizzle and writemask to W, as the lower bits of subnr will
+ * be lost when converted to align16. This is probably too much to
+ * keep track of as you'd want it adjusted by suboffset(), etc.
+ * Perhaps fix up when converting to align16?
+ */
+ reg.dw1.bits.swizzle = swizzle;
+ reg.dw1.bits.writemask = writemask;
+ reg.dw1.bits.indirect_offset = 0;
+ reg.dw1.bits.pad1 = 0;
+ return reg;
+}
+
+/** Construct float[16] register */
+static INLINE struct brw_reg brw_vec16_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return brw_reg(file,
+ nr,
+ subnr,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_16,
+ BRW_WIDTH_16,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XYZW,
+ BRW_WRITEMASK_XYZW);
+}
+
+/** Construct float[8] register */
+static INLINE struct brw_reg brw_vec8_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return brw_reg(file,
+ nr,
+ subnr,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_8,
+ BRW_WIDTH_8,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XYZW,
+ BRW_WRITEMASK_XYZW);
+}
+
+/** Construct float[4] register */
+static INLINE struct brw_reg brw_vec4_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return brw_reg(file,
+ nr,
+ subnr,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XYZW,
+ BRW_WRITEMASK_XYZW);
+}
+
+/** Construct float[2] register */
+static INLINE struct brw_reg brw_vec2_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return brw_reg(file,
+ nr,
+ subnr,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_2,
+ BRW_WIDTH_2,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XYXY,
+ BRW_WRITEMASK_XY);
+}
+
+/** Construct float[1] register */
+static INLINE struct brw_reg brw_vec1_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return brw_reg(file,
+ nr,
+ subnr,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_0,
+ BRW_WIDTH_1,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XXXX,
+ BRW_WRITEMASK_X);
+}
+
+
+static INLINE struct brw_reg retype( struct brw_reg reg,
+ GLuint type )
+{
+ reg.type = type;
+ return reg;
+}
+
+static INLINE struct brw_reg suboffset( struct brw_reg reg,
+ GLuint delta )
+{
+ reg.subnr += delta * type_sz(reg.type);
+ return reg;
+}
+
+
+static INLINE struct brw_reg offset( struct brw_reg reg,
+ GLuint delta )
+{
+ reg.nr += delta;
+ return reg;
+}
+
+
+static INLINE struct brw_reg byte_offset( struct brw_reg reg,
+ GLuint bytes )
+{
+ GLuint newoffset = reg.nr * REG_SIZE + reg.subnr + bytes;
+ reg.nr = newoffset / REG_SIZE;
+ reg.subnr = newoffset % REG_SIZE;
+ return reg;
+}
+
+
+/** Construct unsigned word[16] register */
+static INLINE struct brw_reg brw_uw16_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
+}
+
+/** Construct unsigned word[8] register */
+static INLINE struct brw_reg brw_uw8_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
+}
+
+/** Construct unsigned word[1] register */
+static INLINE struct brw_reg brw_uw1_reg( GLuint file,
+ GLuint nr,
+ GLuint subnr )
+{
+ return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
+}
+
+static INLINE struct brw_reg brw_imm_reg( GLuint type )
+{
+ return brw_reg( BRW_IMMEDIATE_VALUE,
+ 0,
+ 0,
+ type,
+ BRW_VERTICAL_STRIDE_0,
+ BRW_WIDTH_1,
+ BRW_HORIZONTAL_STRIDE_0,
+ 0,
+ 0);
+}
+
+/** Construct float immediate register */
+static INLINE struct brw_reg brw_imm_f( GLfloat f )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
+ imm.dw1.f = f;
+ return imm;
+}
+
+/** Construct integer immediate register */
+static INLINE struct brw_reg brw_imm_d( GLint d )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D);
+ imm.dw1.d = d;
+ return imm;
+}
+
+/** Construct uint immediate register */
+static INLINE struct brw_reg brw_imm_ud( GLuint ud )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD);
+ imm.dw1.ud = ud;
+ return imm;
+}
+
+/** Construct ushort immediate register */
+static INLINE struct brw_reg brw_imm_uw( GLushort uw )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
+ imm.dw1.ud = uw | (uw << 16);
+ return imm;
+}
+
+/** Construct short immediate register */
+static INLINE struct brw_reg brw_imm_w( GLshort w )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
+ imm.dw1.d = w | (w << 16);
+ return imm;
+}
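+
+/* The two 16-bit constructors above duplicate the value into both halves
+ * of the 32-bit immediate dword, e.g. (worked from the code above):
+ *
+ *    brw_imm_uw(0x1234).dw1.ud == 0x12341234
+ */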
+
+/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
+ * numbers alias with _V and _VF below:
+ */
+
+/** Construct vector of eight signed half-byte values */
+static INLINE struct brw_reg brw_imm_v( GLuint v )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V);
+ imm.vstride = BRW_VERTICAL_STRIDE_0;
+ imm.width = BRW_WIDTH_8;
+ imm.hstride = BRW_HORIZONTAL_STRIDE_1;
+ imm.dw1.ud = v;
+ return imm;
+}
+
+/** Construct vector of four 8-bit float values */
+static INLINE struct brw_reg brw_imm_vf( GLuint v )
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+ imm.vstride = BRW_VERTICAL_STRIDE_0;
+ imm.width = BRW_WIDTH_4;
+ imm.hstride = BRW_HORIZONTAL_STRIDE_1;
+ imm.dw1.ud = v;
+ return imm;
+}
+
+#define VF_ZERO 0x0
+#define VF_ONE 0x30
+#define VF_NEG (1<<7)
+
+static INLINE struct brw_reg brw_imm_vf4( GLuint v0,
+ GLuint v1,
+ GLuint v2,
+ GLuint v3)
+{
+ struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+ imm.vstride = BRW_VERTICAL_STRIDE_0;
+ imm.width = BRW_WIDTH_4;
+ imm.hstride = BRW_HORIZONTAL_STRIDE_1;
+ imm.dw1.ud = ((v0 << 0) |
+ (v1 << 8) |
+ (v2 << 16) |
+ (v3 << 24));
+ return imm;
+}
+
+
+static INLINE struct brw_reg brw_address( struct brw_reg reg )
+{
+ return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr);
+}
+
+/** Construct float[1] general-purpose register */
+static INLINE struct brw_reg brw_vec1_grf( GLuint nr, GLuint subnr )
+{
+ return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[2] general-purpose register */
+static INLINE struct brw_reg brw_vec2_grf( GLuint nr, GLuint subnr )
+{
+ return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[4] general-purpose register */
+static INLINE struct brw_reg brw_vec4_grf( GLuint nr, GLuint subnr )
+{
+ return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+/** Construct float[8] general-purpose register */
+static INLINE struct brw_reg brw_vec8_grf( GLuint nr, GLuint subnr )
+{
+ return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+
+static INLINE struct brw_reg brw_uw8_grf( GLuint nr, GLuint subnr )
+{
+ return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+static INLINE struct brw_reg brw_uw16_grf( GLuint nr, GLuint subnr )
+{
+ return brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+}
+
+
+/** Construct null register (usually used for setting condition codes) */
+static INLINE struct brw_reg brw_null_reg( void )
+{
+ return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_NULL,
+ 0);
+}
+
+static INLINE struct brw_reg brw_address_reg( GLuint subnr )
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_ADDRESS,
+ subnr);
+}
+
+/* If/else instructions break in align16 mode if writemask & swizzle
+ * aren't xyzw. This goes against the convention for other scalar
+ * regs:
+ */
+static INLINE struct brw_reg brw_ip_reg( void )
+{
+ return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_IP,
+ 0,
+ BRW_REGISTER_TYPE_UD,
+ BRW_VERTICAL_STRIDE_4, /* ? */
+ BRW_WIDTH_1,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, /* NOTE! */
+ BRW_WRITEMASK_XYZW); /* NOTE! */
+}
+
+static INLINE struct brw_reg brw_acc_reg( void )
+{
+ return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_ACCUMULATOR,
+ 0);
+}
+
+
+static INLINE struct brw_reg brw_flag_reg( void )
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_FLAG,
+ 0);
+}
+
+
+static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
+{
+ return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_MASK,
+ subnr);
+}
+
+static INLINE struct brw_reg brw_message_reg( GLuint nr )
+{
+ assert(nr < BRW_MAX_MRF);
+ return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
+ nr,
+ 0);
+}
+
+
+
+
+/* This is almost always called with a numeric constant argument, so
+ * make things easy to evaluate at compile time:
+ */
+static INLINE GLuint cvt( GLuint val )
+{
+ switch (val) {
+ case 0: return 0;
+ case 1: return 1;
+ case 2: return 2;
+ case 4: return 3;
+ case 8: return 4;
+ case 16: return 5;
+ case 32: return 6;
+ }
+ return 0;
+}
+
+static INLINE struct brw_reg stride( struct brw_reg reg,
+ GLuint vstride,
+ GLuint width,
+ GLuint hstride )
+{
+ reg.vstride = cvt(vstride);
+ reg.width = cvt(width) - 1;
+ reg.hstride = cvt(hstride);
+ return reg;
+}
+
+
+static INLINE struct brw_reg vec16( struct brw_reg reg )
+{
+ return stride(reg, 16,16,1);
+}
+
+static INLINE struct brw_reg vec8( struct brw_reg reg )
+{
+ return stride(reg, 8,8,1);
+}
+
+static INLINE struct brw_reg vec4( struct brw_reg reg )
+{
+ return stride(reg, 4,4,1);
+}
+
+static INLINE struct brw_reg vec2( struct brw_reg reg )
+{
+ return stride(reg, 2,2,1);
+}
+
+static INLINE struct brw_reg vec1( struct brw_reg reg )
+{
+ return stride(reg, 0,1,0);
+}
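+
+/* For reference, the shorthands above expand through stride()/cvt(), e.g.:
+ *
+ *    vec8(reg) == stride(reg, 8, 8, 1)   encoded vstride=4, width=3, hstride=1
+ *    vec1(reg) == stride(reg, 0, 1, 0)   encoded vstride=0, width=0, hstride=0
+ *
+ * i.e. an <8;8,1> register region versus a <0;1,0> replicated scalar.
+ */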
+
+
+static INLINE struct brw_reg get_element( struct brw_reg reg, GLuint elt )
+{
+ return vec1(suboffset(reg, elt));
+}
+
+static INLINE struct brw_reg get_element_ud( struct brw_reg reg, GLuint elt )
+{
+ return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt));
+}
+
+
+static INLINE struct brw_reg brw_swizzle( struct brw_reg reg,
+ GLuint x,
+ GLuint y,
+ GLuint z,
+ GLuint w)
+{
+ reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x),
+ BRW_GET_SWZ(reg.dw1.bits.swizzle, y),
+ BRW_GET_SWZ(reg.dw1.bits.swizzle, z),
+ BRW_GET_SWZ(reg.dw1.bits.swizzle, w));
+ return reg;
+}
+
+
+static INLINE struct brw_reg brw_swizzle1( struct brw_reg reg,
+ GLuint x )
+{
+ return brw_swizzle(reg, x, x, x, x);
+}
+
+static INLINE struct brw_reg brw_writemask( struct brw_reg reg,
+ GLuint mask )
+{
+ reg.dw1.bits.writemask &= mask;
+ return reg;
+}
+
+static INLINE struct brw_reg brw_set_writemask( struct brw_reg reg,
+ GLuint mask )
+{
+ reg.dw1.bits.writemask = mask;
+ return reg;
+}
+
+static INLINE struct brw_reg negate( struct brw_reg reg )
+{
+ reg.negate ^= 1;
+ return reg;
+}
+
+static INLINE struct brw_reg brw_abs( struct brw_reg reg )
+{
+ reg.abs = 1;
+ return reg;
+}
+
+/***********************************************************************
+ */
+static INLINE struct brw_reg brw_vec4_indirect( GLuint subnr,
+ GLint offset )
+{
+ struct brw_reg reg = brw_vec4_grf(0, 0);
+ reg.subnr = subnr;
+ reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.dw1.bits.indirect_offset = offset;
+ return reg;
+}
+
+static INLINE struct brw_reg brw_vec1_indirect( GLuint subnr,
+ GLint offset )
+{
+ struct brw_reg reg = brw_vec1_grf(0, 0);
+ reg.subnr = subnr;
+ reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+ reg.dw1.bits.indirect_offset = offset;
+ return reg;
+}
+
+static INLINE struct brw_reg deref_4f(struct brw_indirect ptr, GLint offset)
+{
+ return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
+}
+
+static INLINE struct brw_reg deref_1f(struct brw_indirect ptr, GLint offset)
+{
+ return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset);
+}
+
+static INLINE struct brw_reg deref_4b(struct brw_indirect ptr, GLint offset)
+{
+ return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B);
+}
+
+static INLINE struct brw_reg deref_1uw(struct brw_indirect ptr, GLint offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW);
+}
+
+static INLINE struct brw_reg deref_1d(struct brw_indirect ptr, GLint offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_D);
+}
+
+static INLINE struct brw_reg deref_1ud(struct brw_indirect ptr, GLint offset)
+{
+ return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD);
+}
+
+static INLINE struct brw_reg get_addr_reg(struct brw_indirect ptr)
+{
+ return brw_address_reg(ptr.addr_subnr);
+}
+
+static INLINE struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, GLint offset )
+{
+ ptr.addr_offset += offset;
+ return ptr;
+}
+
+static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset )
+{
+ struct brw_indirect ptr;
+ ptr.addr_subnr = addr_subnr;
+ ptr.addr_offset = offset;
+ ptr.pad = 0;
+ return ptr;
+}
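+
+/* Illustrative use of the indirect helpers (hypothetical offsets):
+ *
+ *    struct brw_indirect ptr = brw_indirect(0, 0);   based on a0.0
+ *    get_addr_reg(ptr)                               the a0.0 register itself
+ *    deref_4f(ptr, 16)                               float[4] at [a0.0 + 16]
+ *
+ * The brw_copy_*_indirect() helpers declared below take a struct
+ * brw_indirect built this way.
+ */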
+
+/** Do two brw_regs refer to the same register? */
+static INLINE GLboolean
+brw_same_reg(struct brw_reg r1, struct brw_reg r2)
+{
+ return r1.file == r2.file && r1.nr == r2.nr;
+}
+
+static INLINE struct brw_instruction *current_insn( struct brw_compile *p)
+{
+ return &p->store[p->nr_insn];
+}
+
+void brw_pop_insn_state( struct brw_compile *p );
+void brw_push_insn_state( struct brw_compile *p );
+void brw_set_mask_control( struct brw_compile *p, GLuint value );
+void brw_set_saturate( struct brw_compile *p, GLuint value );
+void brw_set_access_mode( struct brw_compile *p, GLuint access_mode );
+void brw_set_compression_control( struct brw_compile *p, GLboolean control );
+void brw_set_predicate_control_flag_value( struct brw_compile *p, GLuint value );
+void brw_set_predicate_control( struct brw_compile *p, GLuint pc );
+void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional );
+
+void brw_init_compile( struct brw_context *, struct brw_compile *p );
+
+enum pipe_error brw_get_program( struct brw_compile *p,
+ const GLuint **program,
+ GLuint *sz );
+
+
+/* Helpers for regular instructions:
+ */
+#define ALU1(OP) \
+struct brw_instruction *brw_##OP(struct brw_compile *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0);
+
+#define ALU2(OP) \
+struct brw_instruction *brw_##OP(struct brw_compile *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0, \
+ struct brw_reg src1);
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(JMPI)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDZ)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+#undef ALU1
+#undef ALU2
+
+
+
+/* Helpers for SEND instruction:
+ */
+void brw_urb_WRITE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot,
+ GLboolean writes_complete,
+ GLuint offset,
+ GLuint swizzle);
+
+void brw_ff_sync(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot,
+ GLboolean writes_complete,
+ GLuint offset,
+ GLuint swizzle);
+
+void brw_fb_WRITE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLuint binding_table_index,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot);
+
+void brw_SAMPLE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLuint binding_table_index,
+ GLuint sampler,
+ GLuint writemask,
+ GLuint msg_type,
+ GLuint response_length,
+ GLuint msg_length,
+ GLboolean eot,
+ GLuint header_present,
+ GLuint simd_mode);
+
+void brw_math_16( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint function,
+ GLuint saturate,
+ GLuint msg_reg_nr,
+ struct brw_reg src,
+ GLuint precision );
+
+void brw_math( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint function,
+ GLuint saturate,
+ GLuint msg_reg_nr,
+ struct brw_reg src,
+ GLuint data_type,
+ GLuint precision );
+
+void brw_dp_READ_16( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint scratch_offset );
+
+void brw_dp_READ_4( struct brw_compile *p,
+ struct brw_reg dest,
+ GLboolean relAddr,
+ GLuint location,
+ GLuint bind_table_index );
+
+void brw_dp_READ_4_vs( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint oword,
+ GLboolean relAddr,
+ struct brw_reg addrReg,
+ GLuint location,
+ GLuint bind_table_index );
+
+void brw_dp_WRITE_16( struct brw_compile *p,
+ struct brw_reg src,
+ GLuint scratch_offset );
+
+/* If/else/endif. Works by manipulating the execution flags on each
+ * channel.
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p,
+ GLuint execute_size);
+
+struct brw_instruction *brw_ELSE(struct brw_compile *p,
+ struct brw_instruction *if_insn);
+
+void brw_ENDIF(struct brw_compile *p,
+ struct brw_instruction *if_or_else_insn);
+
+
+/* DO/WHILE loops:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p,
+ GLuint execute_size);
+
+struct brw_instruction *brw_WHILE(struct brw_compile *p,
+ struct brw_instruction *patch_insn);
+
+struct brw_instruction *brw_BREAK(struct brw_compile *p);
+struct brw_instruction *brw_CONT(struct brw_compile *p);
+/* Forward jumps:
+ */
+void brw_land_fwd_jump(struct brw_compile *p,
+ struct brw_instruction *jmp_insn);
+
+
+
+void brw_NOP(struct brw_compile *p);
+
+/* Special case: there is never a destination; the execution size will be
+ * taken from src0:
+ */
+void brw_CMP(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint conditional,
+ struct brw_reg src0,
+ struct brw_reg src1);
+
+void brw_print_reg( struct brw_reg reg );
+
+
+/***********************************************************************
+ * brw_eu_util.c:
+ */
+
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+ struct brw_indirect dst_ptr,
+ struct brw_indirect src_ptr,
+ GLuint count);
+
+void brw_copy_from_indirect(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_indirect ptr,
+ GLuint count);
+
+void brw_copy4(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ GLuint count);
+
+void brw_copy8(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ GLuint count);
+
+void brw_math_invert( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src);
+
+void brw_set_src1( struct brw_instruction *insn,
+ struct brw_reg reg );
+#endif
diff --git a/src/gallium/drivers/i965/brw_eu_debug.c b/src/gallium/drivers/i965/brw_eu_debug.c
new file mode 100644
index 00000000000..5989f5a04ee
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_debug.c
@@ -0,0 +1,94 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_debug.h"
+
+#include "brw_eu.h"
+
+void brw_print_reg( struct brw_reg hwreg )
+{
+ static const char *file[] = {
+ "arf",
+ "grf",
+ "msg",
+ "imm"
+ };
+
+ static const char *type[] = {
+ "ud",
+ "d",
+ "uw",
+ "w",
+ "ub",
+ "vf",
+ "hf",
+ "f"
+ };
+
+ debug_printf("%s%s",
+ hwreg.abs ? "abs/" : "",
+ hwreg.negate ? "-" : "");
+
+ if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+ hwreg.nr % 2 == 0 &&
+ hwreg.subnr == 0 &&
+ hwreg.vstride == BRW_VERTICAL_STRIDE_8 &&
+ hwreg.width == BRW_WIDTH_8 &&
+ hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
+ hwreg.type == BRW_REGISTER_TYPE_F) {
+ /* vector register */
+ debug_printf("vec%d", hwreg.nr);
+ }
+ else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
+ hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
+ hwreg.width == BRW_WIDTH_1 &&
+ hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
+ hwreg.type == BRW_REGISTER_TYPE_F) {
+ /* "scalar" register */
+ debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+ }
+ else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
+ debug_printf("imm %f", hwreg.dw1.f);
+ }
+ else {
+ debug_printf("%s%d.%d<%d;%d,%d>:%s",
+ file[hwreg.file],
+ hwreg.nr,
+ hwreg.subnr / type_sz(hwreg.type),
+ hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0,
+ 1<<hwreg.width,
+ hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0,
+ type[hwreg.type]);
+ }
+}
+
+
+
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
new file mode 100644
index 00000000000..00d8eaccbc4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -0,0 +1,1433 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+
+
+
+/***********************************************************************
+ * Internal helper for constructing instructions
+ */
+
+static void guess_execution_size( struct brw_instruction *insn,
+ struct brw_reg reg )
+{
+ if (reg.width == BRW_WIDTH_8 &&
+ insn->header.compression_control == BRW_COMPRESSION_COMPRESSED)
+ insn->header.execution_size = BRW_EXECUTE_16;
+ else
+ insn->header.execution_size = reg.width; /* note - definitions are compatible */
+}
+
+
+static void brw_set_dest( struct brw_instruction *insn,
+ struct brw_reg dest )
+{
+ /* Register numbers outside the architecture file are limited to 0..127: */
+ if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(dest.nr < 128);
+
+ insn->bits1.da1.dest_reg_file = dest.file;
+ insn->bits1.da1.dest_reg_type = dest.type;
+ insn->bits1.da1.dest_address_mode = dest.address_mode;
+
+ if (dest.address_mode == BRW_ADDRESS_DIRECT) {
+ insn->bits1.da1.dest_reg_nr = dest.nr;
+
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ insn->bits1.da1.dest_subreg_nr = dest.subnr;
+ if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+ dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+ insn->bits1.da1.dest_horiz_stride = dest.hstride;
+ }
+ else {
+ insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
+ insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
+ }
+ }
+ else {
+ insn->bits1.ia1.dest_subreg_nr = dest.subnr;
+
+ /* These are different sizes in align1 vs align16:
+ */
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+ if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+ dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+ insn->bits1.ia1.dest_horiz_stride = dest.hstride;
+ }
+ else {
+ insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
+ }
+ }
+
+ /* NEW: Set the execution size based on dest.width and
+ * insn->compression_control:
+ */
+ guess_execution_size(insn, dest);
+}
+
+static void brw_set_src0( struct brw_instruction *insn,
+ struct brw_reg reg )
+{
+ assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+ /* Register numbers outside the architecture file are limited to 0..127: */
+ if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+ assert(reg.nr < 128);
+
+ insn->bits1.da1.src0_reg_file = reg.file;
+ insn->bits1.da1.src0_reg_type = reg.type;
+ insn->bits2.da1.src0_abs = reg.abs;
+ insn->bits2.da1.src0_negate = reg.negate;
+ insn->bits2.da1.src0_address_mode = reg.address_mode;
+
+ if (reg.file == BRW_IMMEDIATE_VALUE) {
+ insn->bits3.ud = reg.dw1.ud;
+
+ /* Required to set some fields in src1 as well:
+ */
+ insn->bits1.da1.src1_reg_file = 0; /* arf */
+ insn->bits1.da1.src1_reg_type = reg.type;
+ }
+ else
+ {
+ if (reg.address_mode == BRW_ADDRESS_DIRECT) {
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ insn->bits2.da1.src0_subreg_nr = reg.subnr;
+ insn->bits2.da1.src0_reg_nr = reg.nr;
+ }
+ else {
+ insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+ insn->bits2.da16.src0_reg_nr = reg.nr;
+ }
+ }
+ else {
+ insn->bits2.ia1.src0_subreg_nr = reg.subnr;
+
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
+ }
+ else {
+ insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
+ }
+ }
+
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ if (reg.width == BRW_WIDTH_1 &&
+ insn->header.execution_size == BRW_EXECUTE_1) {
+ insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+ insn->bits2.da1.src0_width = BRW_WIDTH_1;
+ insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
+ }
+ else {
+ insn->bits2.da1.src0_horiz_stride = reg.hstride;
+ insn->bits2.da1.src0_width = reg.width;
+ insn->bits2.da1.src0_vert_stride = reg.vstride;
+ }
+ }
+ else {
+ insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+ insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+ insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+ insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+ /* This is an oddity arising from the fact that we use the same
+ * register descriptions for align_16 as for align_1:
+ */
+ if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+ insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
+ else
+ insn->bits2.da16.src0_vert_stride = reg.vstride;
+ }
+ }
+}
+
+
+void brw_set_src1( struct brw_instruction *insn,
+ struct brw_reg reg )
+{
+ assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+ assert(reg.nr < 128);
+
+ insn->bits1.da1.src1_reg_file = reg.file;
+ insn->bits1.da1.src1_reg_type = reg.type;
+ insn->bits3.da1.src1_abs = reg.abs;
+ insn->bits3.da1.src1_negate = reg.negate;
+
+ /* Only src1 can be immediate in two-argument instructions.
+ */
+ assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
+
+ if (reg.file == BRW_IMMEDIATE_VALUE) {
+ insn->bits3.ud = reg.dw1.ud;
+ }
+ else {
+ /* This is a hardware restriction, which may or may not be lifted
+ * in the future:
+ */
+ assert (reg.address_mode == BRW_ADDRESS_DIRECT);
+ /*assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
+
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ insn->bits3.da1.src1_subreg_nr = reg.subnr;
+ insn->bits3.da1.src1_reg_nr = reg.nr;
+ }
+ else {
+ insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+ insn->bits3.da16.src1_reg_nr = reg.nr;
+ }
+
+ if (insn->header.access_mode == BRW_ALIGN_1) {
+ if (reg.width == BRW_WIDTH_1 &&
+ insn->header.execution_size == BRW_EXECUTE_1) {
+ insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
+ insn->bits3.da1.src1_width = BRW_WIDTH_1;
+ insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
+ }
+ else {
+ insn->bits3.da1.src1_horiz_stride = reg.hstride;
+ insn->bits3.da1.src1_width = reg.width;
+ insn->bits3.da1.src1_vert_stride = reg.vstride;
+ }
+ }
+ else {
+ insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
+ insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
+ insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
+ insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
+
+ /* This is an oddity arising from the fact that we use the same
+ * register descriptions for align_16 as for align_1:
+ */
+ if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+ insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
+ else
+ insn->bits3.da16.src1_vert_stride = reg.vstride;
+ }
+ }
+}
+
+
+
+static void brw_set_math_message( struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLuint msg_length,
+ GLuint response_length,
+ GLuint function,
+ GLuint integer_type,
+ GLboolean low_precision,
+ GLboolean saturate,
+ GLuint dataType )
+{
+ brw_set_src1(insn, brw_imm_d(0));
+
+ if (BRW_IS_IGDNG(brw)) {
+ insn->bits3.math_igdng.function = function;
+ insn->bits3.math_igdng.int_type = integer_type;
+ insn->bits3.math_igdng.precision = low_precision;
+ insn->bits3.math_igdng.saturate = saturate;
+ insn->bits3.math_igdng.data_type = dataType;
+ insn->bits3.math_igdng.snapshot = 0;
+ insn->bits3.math_igdng.header_present = 0;
+ insn->bits3.math_igdng.response_length = response_length;
+ insn->bits3.math_igdng.msg_length = msg_length;
+ insn->bits3.math_igdng.end_of_thread = 0;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+ insn->bits2.send_igdng.end_of_thread = 0;
+ } else {
+ insn->bits3.math.function = function;
+ insn->bits3.math.int_type = integer_type;
+ insn->bits3.math.precision = low_precision;
+ insn->bits3.math.saturate = saturate;
+ insn->bits3.math.data_type = dataType;
+ insn->bits3.math.response_length = response_length;
+ insn->bits3.math.msg_length = msg_length;
+ insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+ insn->bits3.math.end_of_thread = 0;
+ }
+}
+
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean end_of_thread,
+ GLboolean complete,
+ GLuint offset,
+ GLuint swizzle_control )
+{
+ brw_set_src1(insn, brw_imm_d(0));
+
+ insn->bits3.urb_igdng.opcode = 1;
+ insn->bits3.urb_igdng.offset = offset;
+ insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+ insn->bits3.urb_igdng.allocate = allocate;
+ insn->bits3.urb_igdng.used = used;
+ insn->bits3.urb_igdng.complete = complete;
+ insn->bits3.urb_igdng.header_present = 1;
+ insn->bits3.urb_igdng.response_length = response_length;
+ insn->bits3.urb_igdng.msg_length = msg_length;
+ insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+ insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
+
+static void brw_set_urb_message( struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean end_of_thread,
+ GLboolean complete,
+ GLuint offset,
+ GLuint swizzle_control )
+{
+ brw_set_src1(insn, brw_imm_d(0));
+
+ if (BRW_IS_IGDNG(brw)) {
+ insn->bits3.urb_igdng.opcode = 0; /* ? */
+ insn->bits3.urb_igdng.offset = offset;
+ insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+ insn->bits3.urb_igdng.allocate = allocate;
+ insn->bits3.urb_igdng.used = used; /* ? */
+ insn->bits3.urb_igdng.complete = complete;
+ insn->bits3.urb_igdng.header_present = 1;
+ insn->bits3.urb_igdng.response_length = response_length;
+ insn->bits3.urb_igdng.msg_length = msg_length;
+ insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+ insn->bits2.send_igdng.end_of_thread = end_of_thread;
+ } else {
+ insn->bits3.urb.opcode = 0; /* ? */
+ insn->bits3.urb.offset = offset;
+ insn->bits3.urb.swizzle_control = swizzle_control;
+ insn->bits3.urb.allocate = allocate;
+ insn->bits3.urb.used = used; /* ? */
+ insn->bits3.urb.complete = complete;
+ insn->bits3.urb.response_length = response_length;
+ insn->bits3.urb.msg_length = msg_length;
+ insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+ insn->bits3.urb.end_of_thread = end_of_thread;
+ }
+}
+
+static void brw_set_dp_write_message( struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLuint binding_table_index,
+ GLuint msg_control,
+ GLuint msg_type,
+ GLuint msg_length,
+ GLuint pixel_scoreboard_clear,
+ GLuint response_length,
+ GLuint end_of_thread )
+{
+ brw_set_src1(insn, brw_imm_d(0));
+
+ if (BRW_IS_IGDNG(brw)) {
+ insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+ insn->bits3.dp_write_igdng.msg_control = msg_control;
+ insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+ insn->bits3.dp_write_igdng.msg_type = msg_type;
+ insn->bits3.dp_write_igdng.send_commit_msg = 0;
+ insn->bits3.dp_write_igdng.header_present = 1;
+ insn->bits3.dp_write_igdng.response_length = response_length;
+ insn->bits3.dp_write_igdng.msg_length = msg_length;
+ insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+ insn->bits2.send_igdng.end_of_thread = end_of_thread;
+ } else {
+ insn->bits3.dp_write.binding_table_index = binding_table_index;
+ insn->bits3.dp_write.msg_control = msg_control;
+ insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+ insn->bits3.dp_write.msg_type = msg_type;
+ insn->bits3.dp_write.send_commit_msg = 0;
+ insn->bits3.dp_write.response_length = response_length;
+ insn->bits3.dp_write.msg_length = msg_length;
+ insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+ insn->bits3.dp_write.end_of_thread = end_of_thread;
+ }
+}
+
+static void brw_set_dp_read_message( struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLuint binding_table_index,
+ GLuint msg_control,
+ GLuint msg_type,
+ GLuint target_cache,
+ GLuint msg_length,
+ GLuint response_length,
+ GLuint end_of_thread )
+{
+ brw_set_src1(insn, brw_imm_d(0));
+
+ if (BRW_IS_IGDNG(brw)) {
+ insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+ insn->bits3.dp_read_igdng.msg_control = msg_control;
+ insn->bits3.dp_read_igdng.msg_type = msg_type;
+ insn->bits3.dp_read_igdng.target_cache = target_cache;
+ insn->bits3.dp_read_igdng.header_present = 1;
+ insn->bits3.dp_read_igdng.response_length = response_length;
+ insn->bits3.dp_read_igdng.msg_length = msg_length;
+ insn->bits3.dp_read_igdng.pad1 = 0;
+ insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+ insn->bits2.send_igdng.end_of_thread = end_of_thread;
+ } else {
+ insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+ insn->bits3.dp_read.msg_control = msg_control; /*8:11*/
+ insn->bits3.dp_read.msg_type = msg_type; /*12:13*/
+ insn->bits3.dp_read.target_cache = target_cache; /*14:15*/
+ insn->bits3.dp_read.response_length = response_length; /*16:19*/
+ insn->bits3.dp_read.msg_length = msg_length; /*20:23*/
+ insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+ insn->bits3.dp_read.pad1 = 0; /*28:30*/
+ insn->bits3.dp_read.end_of_thread = end_of_thread; /*31*/
+ }
+}
+
+static void brw_set_sampler_message(struct brw_context *brw,
+ struct brw_instruction *insn,
+ GLuint binding_table_index,
+ GLuint sampler,
+ GLuint msg_type,
+ GLuint response_length,
+ GLuint msg_length,
+ GLboolean eot,
+ GLuint header_present,
+ GLuint simd_mode)
+{
+ assert(eot == 0);
+ brw_set_src1(insn, brw_imm_d(0));
+
+ if (BRW_IS_IGDNG(brw)) {
+ insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+ insn->bits3.sampler_igdng.sampler = sampler;
+ insn->bits3.sampler_igdng.msg_type = msg_type;
+ insn->bits3.sampler_igdng.simd_mode = simd_mode;
+ insn->bits3.sampler_igdng.header_present = header_present;
+ insn->bits3.sampler_igdng.response_length = response_length;
+ insn->bits3.sampler_igdng.msg_length = msg_length;
+ insn->bits3.sampler_igdng.end_of_thread = eot;
+ insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+ insn->bits2.send_igdng.end_of_thread = eot;
+ } else if (BRW_IS_G4X(brw)) {
+ insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
+ insn->bits3.sampler_g4x.sampler = sampler;
+ insn->bits3.sampler_g4x.msg_type = msg_type;
+ insn->bits3.sampler_g4x.response_length = response_length;
+ insn->bits3.sampler_g4x.msg_length = msg_length;
+ insn->bits3.sampler_g4x.end_of_thread = eot;
+ insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+ } else {
+ insn->bits3.sampler.binding_table_index = binding_table_index;
+ insn->bits3.sampler.sampler = sampler;
+ insn->bits3.sampler.msg_type = msg_type;
+ insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+ insn->bits3.sampler.response_length = response_length;
+ insn->bits3.sampler.msg_length = msg_length;
+ insn->bits3.sampler.end_of_thread = eot;
+ insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
+ }
+}
+
+
+
+static struct brw_instruction *next_insn( struct brw_compile *p,
+ GLuint opcode )
+{
+ struct brw_instruction *insn;
+
+ if (0 && (BRW_DEBUG & DEBUG_DISASSEM))
+ {
+ if (p->nr_insn)
+ brw_disasm_insn(stderr, &p->store[p->nr_insn-1]);
+ }
+
+ assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
+
+ insn = &p->store[p->nr_insn++];
+ memcpy(insn, p->current, sizeof(*insn));
+
+ /* Reset this one-shot flag:
+ */
+
+ if (p->current->header.destreg__conditionalmod) {
+ p->current->header.destreg__conditionalmod = 0;
+ p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+ }
+
+ insn->header.opcode = opcode;
+ return insn;
+}
+
+
+static struct brw_instruction *brw_alu1( struct brw_compile *p,
+ GLuint opcode,
+ struct brw_reg dest,
+ struct brw_reg src )
+{
+ struct brw_instruction *insn = next_insn(p, opcode);
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src);
+ return insn;
+}
+
+static struct brw_instruction *brw_alu2(struct brw_compile *p,
+ GLuint opcode,
+ struct brw_reg dest,
+ struct brw_reg src0,
+ struct brw_reg src1 )
+{
+ struct brw_instruction *insn = next_insn(p, opcode);
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_src1(insn, src1);
+ return insn;
+}
+
+
+/***********************************************************************
+ * Convenience routines.
+ */
+#define ALU1(OP) \
+struct brw_instruction *brw_##OP(struct brw_compile *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0) \
+{ \
+ return brw_alu1(p, BRW_OPCODE_##OP, dest, src0); \
+}
+
+#define ALU2(OP) \
+struct brw_instruction *brw_##OP(struct brw_compile *p, \
+ struct brw_reg dest, \
+ struct brw_reg src0, \
+ struct brw_reg src1) \
+{ \
+ return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
+}
+
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDZ)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+
+
+
+void brw_NOP(struct brw_compile *p)
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
+ brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+ brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+ brw_set_src1(insn, brw_imm_ud(0x0));
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+struct brw_instruction *brw_JMPI(struct brw_compile *p,
+ struct brw_reg dest,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
+
+ insn->header.execution_size = 1;
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.mask_control = BRW_MASK_DISABLE;
+
+ p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+ return insn;
+}
+
+/* The EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging with any flag value already on
+ * the stack). Within an if block, the flags at the top of the stack
+ * control execution on each channel of the unit, e.g. on each of the
+ * 16 pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by
+ * countdown of the instruction count patched in by our ELSE/ENDIF
+ * functions), the relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are
+ * popped off. If the stack is now empty, normal execution resumes.
+ *
+ * No attempt is made to deal with stack overflow (14 elements?).
+ */
+struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
+{
+ struct brw_instruction *insn;
+
+ if (p->single_program_flow) {
+ assert(execute_size == BRW_EXECUTE_1);
+
+ insn = next_insn(p, BRW_OPCODE_ADD);
+ insn->header.predicate_inverse = 1;
+ } else {
+ insn = next_insn(p, BRW_OPCODE_IF);
+ }
+
+ /* Override the defaults for this instruction:
+ */
+ brw_set_dest(insn, brw_ip_reg());
+ brw_set_src0(insn, brw_ip_reg());
+ brw_set_src1(insn, brw_imm_d(0x0));
+
+ insn->header.execution_size = execute_size;
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.predicate_control = BRW_PREDICATE_NORMAL;
+ insn->header.mask_control = BRW_MASK_ENABLE;
+ if (!p->single_program_flow)
+ insn->header.thread_control = BRW_THREAD_SWITCH;
+
+ p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+ return insn;
+}
+
+
+struct brw_instruction *brw_ELSE(struct brw_compile *p,
+ struct brw_instruction *if_insn)
+{
+ struct brw_instruction *insn;
+ GLuint br = 1;
+
+ if (BRW_IS_IGDNG(p->brw))
+ br = 2;
+
+ if (p->single_program_flow) {
+ insn = next_insn(p, BRW_OPCODE_ADD);
+ } else {
+ insn = next_insn(p, BRW_OPCODE_ELSE);
+ }
+
+ brw_set_dest(insn, brw_ip_reg());
+ brw_set_src0(insn, brw_ip_reg());
+ brw_set_src1(insn, brw_imm_d(0x0));
+
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = if_insn->header.execution_size;
+ insn->header.mask_control = BRW_MASK_ENABLE;
+ if (!p->single_program_flow)
+ insn->header.thread_control = BRW_THREAD_SWITCH;
+
+ /* Patch the if instruction to point at this instruction.
+ */
+ if (p->single_program_flow) {
+ assert(if_insn->header.opcode == BRW_OPCODE_ADD);
+
+ if_insn->bits3.ud = (insn - if_insn + 1) * 16;
+ } else {
+ assert(if_insn->header.opcode == BRW_OPCODE_IF);
+
+ if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+ if_insn->bits3.if_else.pop_count = 0;
+ if_insn->bits3.if_else.pad0 = 0;
+ }
+
+ return insn;
+}
+
+void brw_ENDIF(struct brw_compile *p,
+ struct brw_instruction *patch_insn)
+{
+ GLuint br = 1;
+
+ if (BRW_IS_IGDNG(p->brw))
+ br = 2;
+
+ if (p->single_program_flow) {
+ /* In single program flow mode, there's no need to execute an ENDIF,
+ * since we don't need to do any stack operations, and if we're executing
+ * currently, we want to just continue executing.
+ */
+ struct brw_instruction *next = &p->store[p->nr_insn];
+
+ assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
+
+ patch_insn->bits3.ud = (next - patch_insn) * 16;
+ } else {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
+
+ brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+ brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+ brw_set_src1(insn, brw_imm_d(0x0));
+
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = patch_insn->header.execution_size;
+ insn->header.mask_control = BRW_MASK_ENABLE;
+ insn->header.thread_control = BRW_THREAD_SWITCH;
+
+ assert(patch_insn->bits3.if_else.jump_count == 0);
+
+ /* Patch the if or else instructions to point at this or the next
+ * instruction respectively.
+ */
+ if (patch_insn->header.opcode == BRW_OPCODE_IF) {
+ /* Automagically turn it into an IFF:
+ */
+ patch_insn->header.opcode = BRW_OPCODE_IFF;
+ patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+ patch_insn->bits3.if_else.pop_count = 0;
+ patch_insn->bits3.if_else.pad0 = 0;
+ } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
+ patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
+ patch_insn->bits3.if_else.pop_count = 1;
+ patch_insn->bits3.if_else.pad0 = 0;
+ } else {
+ assert(0);
+ }
+
+ /* Also pop item off the stack in the endif instruction:
+ */
+ insn->bits3.if_else.jump_count = 0;
+ insn->bits3.if_else.pop_count = 1;
+ insn->bits3.if_else.pad0 = 0;
+ }
+}
+
+struct brw_instruction *brw_BREAK(struct brw_compile *p)
+{
+ struct brw_instruction *insn;
+ insn = next_insn(p, BRW_OPCODE_BREAK);
+ brw_set_dest(insn, brw_ip_reg());
+ brw_set_src0(insn, brw_ip_reg());
+ brw_set_src1(insn, brw_imm_d(0x0));
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = BRW_EXECUTE_8;
+ /* insn->header.mask_control = BRW_MASK_DISABLE; */
+ insn->bits3.if_else.pad0 = 0;
+ return insn;
+}
+
+struct brw_instruction *brw_CONT(struct brw_compile *p)
+{
+ struct brw_instruction *insn;
+ insn = next_insn(p, BRW_OPCODE_CONTINUE);
+ brw_set_dest(insn, brw_ip_reg());
+ brw_set_src0(insn, brw_ip_reg());
+ brw_set_src1(insn, brw_imm_d(0x0));
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = BRW_EXECUTE_8;
+ /* insn->header.mask_control = BRW_MASK_DISABLE; */
+ insn->bits3.if_else.pad0 = 0;
+ return insn;
+}
+
+/* DO/WHILE loop:
+ */
+struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
+{
+ if (p->single_program_flow) {
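+ /* No DO instruction is emitted in this mode; just return a pointer
+ * to where the loop body starts so brw_WHILE can branch back to it.
+ */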
+ return &p->store[p->nr_insn];
+ } else {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
+
+ /* Override the defaults for this instruction:
+ */
+ brw_set_dest(insn, brw_null_reg());
+ brw_set_src0(insn, brw_null_reg());
+ brw_set_src1(insn, brw_null_reg());
+
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.execution_size = execute_size;
+ insn->header.predicate_control = BRW_PREDICATE_NONE;
+ /* insn->header.mask_control = BRW_MASK_ENABLE; */
+ /* insn->header.mask_control = BRW_MASK_DISABLE; */
+
+ return insn;
+ }
+}
+
+
+
+struct brw_instruction *brw_WHILE(struct brw_compile *p,
+ struct brw_instruction *do_insn)
+{
+ struct brw_instruction *insn;
+ GLuint br = 1;
+
+ if (BRW_IS_IGDNG(p->brw))
+ br = 2;
+
+ if (p->single_program_flow)
+ insn = next_insn(p, BRW_OPCODE_ADD);
+ else
+ insn = next_insn(p, BRW_OPCODE_WHILE);
+
+ brw_set_dest(insn, brw_ip_reg());
+ brw_set_src0(insn, brw_ip_reg());
+ brw_set_src1(insn, brw_imm_d(0x0));
+
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+
+ if (p->single_program_flow) {
+ insn->header.execution_size = BRW_EXECUTE_1;
+
+ insn->bits3.d = (do_insn - insn) * 16;
+ } else {
+ insn->header.execution_size = do_insn->header.execution_size;
+
+ assert(do_insn->header.opcode == BRW_OPCODE_DO);
+ insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+ insn->bits3.if_else.pop_count = 0;
+ insn->bits3.if_else.pad0 = 0;
+ }
+
+/* insn->header.mask_control = BRW_MASK_ENABLE; */
+
+ /* insn->header.mask_control = BRW_MASK_DISABLE; */
+ p->current->header.predicate_control = BRW_PREDICATE_NONE;
+ return insn;
+}
+
+
+/* FORWARD JUMPS:
+ */
+void brw_land_fwd_jump(struct brw_compile *p,
+ struct brw_instruction *jmp_insn)
+{
+ struct brw_instruction *landing = &p->store[p->nr_insn];
+ GLuint jmpi = 1;
+
+ if (BRW_IS_IGDNG(p->brw))
+ jmpi = 2;
+
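+ /* The JMPI offset is relative to the instruction following the jump,
+ * hence the -1; on IGDNG the distance is again counted in 64-bit
+ * halves, so it is doubled.
+ */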
+ assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
+ assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
+
+ jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
+}
+
+
+
+/* To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register. It might be simpler
+ * just to use the flag reg for most WM tasks?
+ */
+void brw_CMP(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint conditional,
+ struct brw_reg src0,
+ struct brw_reg src1)
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
+
+ insn->header.destreg__conditionalmod = conditional;
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_src1(insn, src1);
+
+/* guess_execution_size(insn, src0); */
+
+
+ /* Make it so that future instructions will use the computed flag
+ * value until brw_set_predicate_control_flag_value() is called
+ * again.
+ */
+ if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+ dest.nr == 0) {
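+ /* A null destination means the CMP exists only to update the flag
+ * register, so predicate subsequent instructions on that flag.
+ */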
+ p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+ p->flag_value = 0xff;
+ }
+}
+
+
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/** Extended math function, float[8].
+ */
+void brw_math( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint function,
+ GLuint saturate,
+ GLuint msg_reg_nr,
+ struct brw_reg src,
+ GLuint data_type,
+ GLuint precision )
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+ GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+ GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+
+ /* Example code doesn't set predicate_control for send
+ * instructions.
+ */
+ insn->header.predicate_control = 0;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src);
+ brw_set_math_message(p->brw,
+ insn,
+ msg_length, response_length,
+ function,
+ BRW_MATH_INTEGER_UNSIGNED,
+ precision,
+ saturate,
+ data_type);
+}
+
+/**
+ * Extended math function, float[16].
+ * Use 2 send instructions.
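+ * The extended math unit only processes eight channels per SEND, so the
+ * upper half of the SIMD16 execution is issued as a second message with
+ * compression control set to 2NDHALF.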
+ */
+void brw_math_16( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint function,
+ GLuint saturate,
+ GLuint msg_reg_nr,
+ struct brw_reg src,
+ GLuint precision )
+{
+ struct brw_instruction *insn;
+ GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+ GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+
+ /* First instruction:
+ */
+ brw_push_insn_state(p);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src);
+ brw_set_math_message(p->brw,
+ insn,
+ msg_length, response_length,
+ function,
+ BRW_MATH_INTEGER_UNSIGNED,
+ precision,
+ saturate,
+ BRW_MATH_DATA_VECTOR);
+
+ /* Second instruction:
+ */
+ insn = next_insn(p, BRW_OPCODE_SEND);
+ insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
+ insn->header.destreg__conditionalmod = msg_reg_nr+1;
+
+ brw_set_dest(insn, offset(dest,1));
+ brw_set_src0(insn, src);
+ brw_set_math_message(p->brw,
+ insn,
+ msg_length, response_length,
+ function,
+ BRW_MATH_INTEGER_UNSIGNED,
+ precision,
+ saturate,
+ BRW_MATH_DATA_VECTOR);
+
+ brw_pop_insn_state(p);
+}
+
+
+/**
+ * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
+void brw_dp_WRITE_16( struct brw_compile *p,
+ struct brw_reg src,
+ GLuint scratch_offset )
+{
+ GLuint msg_reg_nr = 1;
+ {
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+ /* set message header global offset field (reg 0, element 2) */
+ brw_MOV(p,
+ retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+ brw_imm_d(scratch_offset));
+
+ brw_pop_insn_state(p);
+ }
+
+ {
+ GLuint msg_length = 3;
+ struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = 0; /* XXX */
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src);
+
+ brw_set_dp_write_message(p->brw,
+ insn,
+ 255, /* binding table index (255=stateless) */
+ BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
+ BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
+ msg_length,
+ 0, /* pixel scoreboard */
+ 0, /* response_length */
+ 0); /* eot */
+ }
+}
+
+
+/**
+ * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
+void brw_dp_READ_16( struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint scratch_offset )
+{
+ GLuint msg_reg_nr = 1;
+ {
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+ /* set message header global offset field (reg 0, element 2) */
+ brw_MOV(p,
+ retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
+ brw_imm_d(scratch_offset));
+
+ brw_pop_insn_state(p);
+ }
+
+ {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = 0; /* XXX */
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest); /* UW? */
+ brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
+
+ brw_set_dp_read_message(p->brw,
+ insn,
+ 255, /* binding table index (255=stateless) */
+ 3, /* msg_control (3 means 4 Owords) */
+ BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+ 1, /* target cache (render/scratch) */
+ 1, /* msg_length */
+ 2, /* response_length */
+ 0); /* eot */
+ }
+}
+
+
+/**
+ * Read a float[4] vector from the data port Data Cache (const buffer).
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ * If relAddr is true, we'll do an indirect fetch using the address register.
+ */
+void brw_dp_READ_4( struct brw_compile *p,
+ struct brw_reg dest,
+ GLboolean relAddr,
+ GLuint location,
+ GLuint bind_table_index )
+{
+ /* XXX: relAddr not implemented */
+ GLuint msg_reg_nr = 1;
+ {
+ struct brw_reg b;
+ brw_push_insn_state(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+ /* Setup MRF[1] with location/offset into const buffer */
+ b = brw_message_reg(msg_reg_nr);
+ b = retype(b, BRW_REGISTER_TYPE_UD);
+ /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+ * when the docs say only dword[2] should be set. Hmmm. But it works.
+ */
+ brw_MOV(p, b, brw_imm_ud(location));
+ brw_pop_insn_state(p);
+ }
+
+ {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = BRW_PREDICATE_NONE;
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+ insn->header.mask_control = BRW_MASK_DISABLE;
+
+ /* cast dest to a uword[8] vector */
+ dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, brw_null_reg());
+
+ brw_set_dp_read_message(p->brw,
+ insn,
+ bind_table_index,
+ 0, /* msg_control (0 means 1 Oword) */
+ BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+ 0, /* source cache = data cache */
+ 1, /* msg_length */
+ 1, /* response_length (1 Oword) */
+ 0); /* eot */
+ }
+}
+
+
+/**
+ * Read float[4] constant(s) from VS constant buffer.
+ * For relative addressing, two float[4] constants will be read into 'dest'.
+ * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */
+void brw_dp_READ_4_vs(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint oword,
+ GLboolean relAddr,
+ struct brw_reg addrReg,
+ GLuint location,
+ GLuint bind_table_index)
+{
+ GLuint msg_reg_nr = 1;
+
+ assert(oword < 2);
+ /*
+ printf("vs const read msg, location %u, msg_reg_nr %d\n",
+ location, msg_reg_nr);
+ */
+
+ /* Setup MRF[1] with location/offset into const buffer */
+ {
+ struct brw_reg b;
+
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+
+ /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+ * when the docs say only dword[2] should be set. Hmmm. But it works.
+ */
+ b = brw_message_reg(msg_reg_nr);
+ b = retype(b, BRW_REGISTER_TYPE_UD);
+ /*b = get_element_ud(b, 2);*/
+ if (relAddr) {
+ brw_ADD(p, b, addrReg, brw_imm_ud(location));
+ }
+ else {
+ brw_MOV(p, b, brw_imm_ud(location));
+ }
+
+ brw_pop_insn_state(p);
+ }
+
+ {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = BRW_PREDICATE_NONE;
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+ insn->header.mask_control = BRW_MASK_DISABLE;
+ /*insn->header.access_mode = BRW_ALIGN_16;*/
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, brw_null_reg());
+
+ brw_set_dp_read_message(p->brw,
+ insn,
+ bind_table_index,
+ oword, /* 0 = lower Oword, 1 = upper Oword */
+ BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+ 0, /* source cache = data cache */
+ 1, /* msg_length */
+ 1, /* response_length (1 Oword) */
+ 0); /* eot */
+ }
+}
+
+
+
+void brw_fb_WRITE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLuint binding_table_index,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot)
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = 0; /* XXX */
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_dp_write_message(p->brw,
+ insn,
+ binding_table_index,
+ BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
+ BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
+ msg_length,
+ 1, /* pixel scoreboard */
+ response_length,
+ eot);
+}
+
+
+/**
+ * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed. See volume 4, page 161 of docs.
+ */
+void brw_SAMPLE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLuint binding_table_index,
+ GLuint sampler,
+ GLuint writemask,
+ GLuint msg_type,
+ GLuint response_length,
+ GLuint msg_length,
+ GLboolean eot,
+ GLuint header_present,
+ GLuint simd_mode)
+{
+ GLboolean need_stall = 0;
+
+ if (writemask == 0) {
+ /*debug_printf("%s: zero writemask??\n", __FUNCTION__); */
+ return;
+ }
+
+ /* Hardware doesn't do destination dependency checking on send
+ * instructions properly. Add a workaround which generates the
+ * dependency by other means. In practice it seems like this bug
+ * only crops up for texture samples, and only where registers are
+ * written by the send and then written again later without being
+ * read in between. Luckily for us, we already track that
+ * information and use it to modify the writemask for the
+ * instruction, so that is a guide for whether a workaround is
+ * needed.
+ */
+ if (writemask != BRW_WRITEMASK_XYZW) {
+ GLuint dst_offset = 0;
+ GLuint i, newmask = 0, len = 0;
+
+ for (i = 0; i < 4; i++) {
+ if (writemask & (1<<i))
+ break;
+ dst_offset += 2;
+ }
+ for (; i < 4; i++) {
+ if (!(writemask & (1<<i)))
+ break;
+ newmask |= 1<<i;
+ len++;
+ }
+
+ if (newmask != writemask) {
+ need_stall = 1;
+ /* debug_printf("need stall %x %x\n", newmask , writemask); */
+ }
+ else {
+ struct brw_reg m1 = brw_message_reg(msg_reg_nr);
+
+ newmask = ~newmask & BRW_WRITEMASK_XYZW;
+
+ brw_push_insn_state(p);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+ brw_MOV(p, m1, brw_vec8_grf(0,0));
+ brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
+
+ brw_pop_insn_state(p);
+
+ src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
+ dest = offset(dest, dst_offset);
+ response_length = len * 2;
+ }
+ }
+
+ {
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ insn->header.predicate_control = 0; /* XXX */
+ insn->header.compression_control = BRW_COMPRESSION_NONE;
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_sampler_message(p->brw, insn,
+ binding_table_index,
+ sampler,
+ msg_type,
+ response_length,
+ msg_length,
+ eot,
+ header_present,
+ simd_mode);
+ }
+
+ if (need_stall) {
+ struct brw_reg reg = vec8(offset(dest, response_length-1));
+
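+ /* Dummy MOV that reads the last register written by the sample,
+ * forcing the hardware to wait for the send to complete before the
+ * register is overwritten (see the dependency workaround above).
+ */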
+ /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
+ */
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, reg, reg);
+ brw_pop_insn_state(p);
+ }
+
+}
+
+/* All these variables are pretty confusing - we might be better off
+ * using bitmasks and macros for this, in the old style. Or perhaps
+ * just having the caller instantiate the fields in dword3 itself.
+ */
+void brw_urb_WRITE(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot,
+ GLboolean writes_complete,
+ GLuint offset,
+ GLuint swizzle)
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ assert(msg_length < BRW_MAX_MRF);
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_src1(insn, brw_imm_d(0));
+
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_urb_message(p->brw,
+ insn,
+ allocate,
+ used,
+ msg_length,
+ response_length,
+ eot,
+ writes_complete,
+ offset,
+ swizzle);
+}
+
+void brw_ff_sync(struct brw_compile *p,
+ struct brw_reg dest,
+ GLuint msg_reg_nr,
+ struct brw_reg src0,
+ GLboolean allocate,
+ GLboolean used,
+ GLuint msg_length,
+ GLuint response_length,
+ GLboolean eot,
+ GLboolean writes_complete,
+ GLuint offset,
+ GLuint swizzle)
+{
+ struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+ assert(msg_length < 16);
+
+ brw_set_dest(insn, dest);
+ brw_set_src0(insn, src0);
+ brw_set_src1(insn, brw_imm_d(0));
+
+ insn->header.destreg__conditionalmod = msg_reg_nr;
+
+ brw_set_ff_sync_message(p->brw,
+ insn,
+ allocate,
+ used,
+ msg_length,
+ response_length,
+ eot,
+ writes_complete,
+ offset,
+ swizzle);
+}
diff --git a/src/gallium/drivers/i965/brw_eu_util.c b/src/gallium/drivers/i965/brw_eu_util.c
new file mode 100644
index 00000000000..5405cf17a4e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_eu_util.c
@@ -0,0 +1,126 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_eu.h"
+
+
+void brw_math_invert( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ brw_math( p,
+ dst,
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 0,
+ src,
+ BRW_MATH_PRECISION_FULL,
+ BRW_MATH_DATA_VECTOR );
+}
+
+
+
+void brw_copy4(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ GLuint count)
+{
+ GLuint i;
+
+ dst = vec4(dst);
+ src = vec4(src);
+
+ for (i = 0; i < count; i++)
+ {
+ GLuint delta = i*32;
+ brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
+ brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16));
+ }
+}
+
+
+void brw_copy8(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg src,
+ GLuint count)
+{
+ GLuint i;
+
+ dst = vec8(dst);
+ src = vec8(src);
+
+ for (i = 0; i < count; i++)
+ {
+ GLuint delta = i*32;
+ brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta));
+ }
+}
+
+
+void brw_copy_indirect_to_indirect(struct brw_compile *p,
+ struct brw_indirect dst_ptr,
+ struct brw_indirect src_ptr,
+ GLuint count)
+{
+ GLuint i;
+
+ for (i = 0; i < count; i++)
+ {
+ GLuint delta = i*32;
+ brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta));
+ brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16));
+ }
+}
+
+
+void brw_copy_from_indirect(struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_indirect ptr,
+ GLuint count)
+{
+ GLuint i;
+
+ dst = vec4(dst);
+
+ for (i = 0; i < count; i++)
+ {
+ GLuint delta = i*32;
+ brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta));
+ brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16));
+ }
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_gs.c b/src/gallium/drivers/i965/brw_gs.c
new file mode 100644
index 00000000000..921b201bae2
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs.c
@@ -0,0 +1,216 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "brw_gs.h"
+
+
+
+static enum pipe_error compile_gs_prog( struct brw_context *brw,
+ struct brw_gs_prog_key *key,
+ struct brw_winsys_buffer **bo_out )
+{
+ struct brw_gs_compile c;
+ enum pipe_error ret;
+ const GLuint *program;
+ GLuint program_size;
+
+ memset(&c, 0, sizeof(c));
+
+ c.key = *key;
+ c.need_ff_sync = BRW_IS_IGDNG(brw);
+ /* Need to locate the two positions present in vertex + header.
+ * These are currently hardcoded:
+ */
+ c.nr_attrs = c.key.nr_attrs;
+
+ if (BRW_IS_IGDNG(brw))
+ c.nr_regs = (c.nr_attrs + 1) / 2 + 3; /* are vertices packed, or reg-aligned? */
+ else
+ c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */
+
+ c.nr_bytes = c.nr_regs * REG_SIZE;
+
+
+ /* Begin the compilation:
+ */
+ brw_init_compile(brw, &c.func);
+
+ c.func.single_program_flow = 1;
+
+ /* For some reason the thread is spawned with only 4 channels
+ * unmasked.
+ */
+ brw_set_mask_control(&c.func, BRW_MASK_DISABLE);
+
+
+ /* Note that primitives which don't require a GS program have
+ * already been weeded out by this stage:
+ */
+ switch (key->primitive) {
+ case PIPE_PRIM_QUADS:
+ brw_gs_quads( &c );
+ break;
+ case PIPE_PRIM_QUAD_STRIP:
+ brw_gs_quad_strip( &c );
+ break;
+ case PIPE_PRIM_LINE_LOOP:
+ brw_gs_lines( &c );
+ break;
+ case PIPE_PRIM_LINES:
+ if (key->hint_gs_always)
+ brw_gs_lines( &c );
+ else {
+ return PIPE_OK;
+ }
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ if (key->hint_gs_always)
+ brw_gs_tris( &c );
+ else {
+ return PIPE_OK;
+ }
+ break;
+ case PIPE_PRIM_POINTS:
+ if (key->hint_gs_always)
+ brw_gs_points( &c );
+ else {
+ return PIPE_OK;
+ }
+ break;
+ default:
+ assert(0);
+ return PIPE_ERROR_BAD_INPUT;
+ }
+
+ /* get the program
+ */
+ ret = brw_get_program(&c.func, &program, &program_size);
+ if (ret)
+ return ret;
+
+ /* Upload
+ */
+ ret = brw_upload_cache( &brw->cache, BRW_GS_PROG,
+ &c.key, sizeof(c.key),
+ NULL, 0,
+ program, program_size,
+ &c.prog_data,
+ &brw->gs.prog_data,
+ bo_out );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
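+/* Map each PIPE_PRIM_x (used as the index) onto the primitive class the
+ * GS program is keyed on: strips, fans and polygons map to the
+ * corresponding list primitive.
+ */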
+static const unsigned gs_prim[PIPE_PRIM_MAX] = {
+ PIPE_PRIM_POINTS,
+ PIPE_PRIM_LINES,
+ PIPE_PRIM_LINE_LOOP,
+ PIPE_PRIM_LINES,
+ PIPE_PRIM_TRIANGLES,
+ PIPE_PRIM_TRIANGLES,
+ PIPE_PRIM_TRIANGLES,
+ PIPE_PRIM_QUADS,
+ PIPE_PRIM_QUAD_STRIP,
+ PIPE_PRIM_TRIANGLES
+};
+
+static void populate_key( struct brw_context *brw,
+ struct brw_gs_prog_key *key )
+{
+ const struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
+
+ memset(key, 0, sizeof(*key));
+
+ /* PIPE_NEW_FRAGMENT_SIGNATURE */
+ key->nr_attrs = sig->nr_inputs + 1;
+
+ /* BRW_NEW_PRIMITIVE */
+ key->primitive = gs_prim[brw->primitive];
+
+ key->hint_gs_always = 0; /* debug code? */
+
+ key->need_gs_prog = (key->hint_gs_always ||
+ brw->primitive == PIPE_PRIM_QUADS ||
+ brw->primitive == PIPE_PRIM_QUAD_STRIP ||
+ brw->primitive == PIPE_PRIM_LINE_LOOP);
+}
+
+/* Compile (or fetch from the cache) the GS program required by the
+ * current primitive type, if any.
+ */
+static int prepare_gs_prog(struct brw_context *brw)
+{
+ struct brw_gs_prog_key key;
+ enum pipe_error ret;
+
+ /* Populate the key:
+ */
+ populate_key(brw, &key);
+
+ if (brw->gs.prog_active != key.need_gs_prog) {
+ brw->state.dirty.cache |= CACHE_NEW_GS_PROG;
+ brw->gs.prog_active = key.need_gs_prog;
+ }
+
+ if (!brw->gs.prog_active)
+ return PIPE_OK;
+
+ if (brw_search_cache(&brw->cache, BRW_GS_PROG,
+ &key, sizeof(key),
+ NULL, 0,
+ &brw->gs.prog_data,
+ &brw->gs.prog_bo))
+ return PIPE_OK;
+
+ ret = compile_gs_prog( brw, &key, &brw->gs.prog_bo );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_gs_prog = {
+ .dirty = {
+ .mesa = PIPE_NEW_FRAGMENT_SIGNATURE,
+ .brw = BRW_NEW_PRIMITIVE,
+ .cache = 0,
+ },
+ .prepare = prepare_gs_prog
+};
diff --git a/src/gallium/drivers/i965/brw_gs.h b/src/gallium/drivers/i965/brw_gs.h
new file mode 100644
index 00000000000..6e616dcb875
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs.h
@@ -0,0 +1,76 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_GS_H
+#define BRW_GS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define MAX_GS_VERTS (4)
+
+struct brw_gs_prog_key {
+ GLuint nr_attrs:8;
+ GLuint primitive:4;
+ GLuint hint_gs_always:1;
+ GLuint need_gs_prog:1;
+ GLuint pad:18;
+};
+
+struct brw_gs_compile {
+ struct brw_compile func;
+ struct brw_gs_prog_key key;
+ struct brw_gs_prog_data prog_data;
+
+ struct {
+ struct brw_reg R0;
+ struct brw_reg vertex[MAX_GS_VERTS];
+ } reg;
+
+ /* 3 different ways of expressing vertex size:
+ */
+ GLuint nr_attrs;
+ GLuint nr_regs;
+ GLuint nr_bytes;
+ GLboolean need_ff_sync;
+};
+
+#define ATTR_SIZE (4*4)
+
+void brw_gs_quads( struct brw_gs_compile *c );
+void brw_gs_quad_strip( struct brw_gs_compile *c );
+void brw_gs_tris( struct brw_gs_compile *c );
+void brw_gs_lines( struct brw_gs_compile *c );
+void brw_gs_points( struct brw_gs_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_gs_emit.c b/src/gallium/drivers/i965/brw_gs_emit.c
new file mode 100644
index 00000000000..fd8e2accedd
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs_emit.c
@@ -0,0 +1,181 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_gs.h"
+
+static void brw_gs_alloc_regs( struct brw_gs_compile *c,
+ GLuint nr_verts )
+{
+ GLuint i = 0,j;
+
+ /* Register usage is static, precompute here:
+ */
+ c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+ /* Payload vertices plus space for more generated vertices:
+ */
+ for (j = 0; j < nr_verts; j++) {
+ c->reg.vertex[j] = brw_vec4_grf(i, 0);
+ i += c->nr_regs;
+ }
+
+ c->prog_data.urb_read_length = c->nr_regs;
+ c->prog_data.total_grf = i;
+}
+
+
+static void brw_gs_emit_vue(struct brw_gs_compile *c,
+ struct brw_reg vert,
+ GLboolean last,
+ GLuint header)
+{
+ struct brw_compile *p = &c->func;
+ GLboolean allocate = !last;
+
+ /* Overwrite PrimType and PrimStart in the message header, for
+ * each vertex in turn:
+ */
+ brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+ /* Copy the vertex from vertn into m1..mN+1:
+ */
+ brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
+
+ /* Send each vertex as a separate write to the urb. This is
+ * different to the concept in brw_sf_emit.c, where subsequent
+ * writes are used to build up a single urb entry. Each of these
+ * writes instantiates a separate urb entry, and a new one must be
+ * allocated each time.
+ */
+ brw_urb_WRITE(p,
+ allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+ 0,
+ c->reg.R0,
+ allocate,
+ 1, /* used */
+ c->nr_regs + 1, /* msg length */
+ allocate ? 1 : 0, /* response length */
+ allocate ? 0 : 1, /* eot */
+ 1, /* writes_complete */
+ 0, /* urb offset */
+ BRW_URB_SWIZZLE_NONE);
+}
+
+static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+{
+ struct brw_compile *p = &c->func;
+ brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
+ brw_ff_sync(p,
+ c->reg.R0,
+ 0,
+ c->reg.R0,
+ 1,
+ 1, /* used */
+ 1, /* msg length */
+ 1, /* response length */
+ 0, /* eot */
+ 1, /* write complete */
+ 0, /* urb offset */
+ BRW_URB_SWIZZLE_NONE);
+}
+
+
+void brw_gs_quads( struct brw_gs_compile *c )
+{
+ brw_gs_alloc_regs(c, 4);
+
+ /* Use polygons for correct edgeflag behaviour. Note that vertex 3
+ * is the PV for quads, but vertex 0 for polygons:
+ */
+ if (c->need_ff_sync)
+ brw_gs_ff_sync(c, 1);
+ brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+ brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+ brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2));
+ brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+}
+
+void brw_gs_quad_strip( struct brw_gs_compile *c )
+{
+ brw_gs_alloc_regs(c, 4);
+
+ if (c->need_ff_sync)
+ brw_gs_ff_sync(c, 1);
+ brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
+ brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
+ brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
+ brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END));
+}
+
+void brw_gs_tris( struct brw_gs_compile *c )
+{
+ brw_gs_alloc_regs(c, 3);
+
+ if (c->need_ff_sync)
+ brw_gs_ff_sync(c, 1);
+ brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
+ brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
+ brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
+}
+
+void brw_gs_lines( struct brw_gs_compile *c )
+{
+ brw_gs_alloc_regs(c, 2);
+
+ if (c->need_ff_sync)
+ brw_gs_ff_sync(c, 1);
+ brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
+ brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
+}
+
+void brw_gs_points( struct brw_gs_compile *c )
+{
+ brw_gs_alloc_regs(c, 1);
+
+ if (c->need_ff_sync)
+ brw_gs_ff_sync(c, 1);
+ brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
+}
+
+
+
+
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_gs_state.c b/src/gallium/drivers/i965/brw_gs_state.c
new file mode 100644
index 00000000000..b64ec286cea
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_gs_state.c
@@ -0,0 +1,169 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+struct brw_gs_unit_key {
+ unsigned int total_grf;
+ unsigned int urb_entry_read_length;
+
+ unsigned int curbe_offset;
+
+ unsigned int nr_urb_entries, urb_size;
+ GLboolean prog_active;
+};
+
+static void
+gs_unit_populate_key(struct brw_context *brw, struct brw_gs_unit_key *key)
+{
+ memset(key, 0, sizeof(*key));
+
+ /* CACHE_NEW_GS_PROG */
+ key->prog_active = brw->gs.prog_active;
+ if (key->prog_active) {
+ key->total_grf = brw->gs.prog_data->total_grf;
+ key->urb_entry_read_length = brw->gs.prog_data->urb_read_length;
+ } else {
+ key->total_grf = 1;
+ key->urb_entry_read_length = 1;
+ }
+
+ /* BRW_NEW_CURBE_OFFSETS */
+ key->curbe_offset = brw->curbe.clip_start;
+
+ /* BRW_NEW_URB_FENCE */
+ key->nr_urb_entries = brw->urb.nr_gs_entries;
+ key->urb_size = brw->urb.vsize;
+}
+
+static enum pipe_error
+gs_unit_create_from_key(struct brw_context *brw,
+ struct brw_gs_unit_key *key,
+ struct brw_winsys_reloc *reloc,
+ unsigned nr_reloc,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_gs_unit_state gs;
+ enum pipe_error ret;
+
+
+ memset(&gs, 0, sizeof(gs));
+
+ /* reloc: kernel_start_pointer is patched via a relocation against the
+ * GS program BO (set up in prepare_gs_unit); grf_reg_count shares the
+ * same dword.
+ */
+ gs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+ gs.thread0.kernel_start_pointer = 0;
+
+ gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+ gs.thread1.single_program_flow = 1;
+
+ gs.thread3.dispatch_grf_start_reg = 1;
+ gs.thread3.const_urb_entry_read_offset = 0;
+ gs.thread3.const_urb_entry_read_length = 0;
+ gs.thread3.urb_entry_read_offset = 0;
+ gs.thread3.urb_entry_read_length = key->urb_entry_read_length;
+
+ gs.thread4.nr_urb_entries = key->nr_urb_entries;
+ gs.thread4.urb_entry_allocation_size = key->urb_size - 1;
+
+ if (key->nr_urb_entries >= 8)
+ gs.thread4.max_threads = 1;
+ else
+ gs.thread4.max_threads = 0;
+
+ if (BRW_IS_IGDNG(brw))
+ gs.thread4.rendering_enable = 1;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ gs.thread4.stats_enable = 1;
+
+ ret = brw_upload_cache(&brw->cache, BRW_GS_UNIT,
+ key, sizeof(*key),
+ reloc, nr_reloc,
+ &gs, sizeof(gs),
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+static enum pipe_error prepare_gs_unit(struct brw_context *brw)
+{
+ struct brw_gs_unit_key key;
+ enum pipe_error ret;
+ struct brw_winsys_reloc reloc[1];
+ unsigned nr_reloc = 0;
+ unsigned grf_reg_count;
+
+ gs_unit_populate_key(brw, &key);
+
+ grf_reg_count = (align(key.total_grf, 16) / 16 - 1);
+
+ /* GS program relocation */
+ if (key.prog_active) {
+ make_reloc(&reloc[nr_reloc++],
+ BRW_USAGE_STATE,
+ grf_reg_count << 1,
+ offsetof(struct brw_gs_unit_state, thread0),
+ brw->gs.prog_bo);
+ }
+
+ if (brw_search_cache(&brw->cache, BRW_GS_UNIT,
+ &key, sizeof(key),
+ reloc, nr_reloc,
+ NULL,
+ &brw->gs.state_bo))
+ return PIPE_OK;
+
+ ret = gs_unit_create_from_key(brw, &key,
+ reloc, nr_reloc,
+ &brw->gs.state_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_gs_unit = {
+ .dirty = {
+ .mesa = 0,
+ .brw = (BRW_NEW_CURBE_OFFSETS |
+ BRW_NEW_URB_FENCE),
+ .cache = CACHE_NEW_GS_PROG
+ },
+ .prepare = prepare_gs_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_misc_state.c b/src/gallium/drivers/i965/brw_misc_state.c
new file mode 100644
index 00000000000..e4b24229db3
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_misc_state.c
@@ -0,0 +1,513 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+
+#include "brw_debug.h"
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_screen.h"
+#include "brw_pipe_rast.h"
+
+
+
+
+
+/***********************************************************************
+ * Blend color
+ */
+
+static int upload_blend_constant_color(struct brw_context *brw)
+{
+ BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bcc);
+ return 0;
+}
+
+
+const struct brw_tracked_state brw_blend_constant_color = {
+ .dirty = {
+ .mesa = PIPE_NEW_BLEND_COLOR,
+ .brw = 0,
+ .cache = 0
+ },
+ .emit = upload_blend_constant_color
+};
+
+/***********************************************************************
+ * Drawing rectangle - framebuffer dimensions
+ */
+static int upload_drawing_rect(struct brw_context *brw)
+{
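+ /* DW1 is the inclusive top-left corner (0,0), DW2 packs the inclusive
+ * bottom-right corner as (width-1, height-1), and DW3 is the
+ * drawing-rectangle origin.
+ */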
+ BEGIN_BATCH(4, NO_LOOP_CLIPRECTS);
+ OUT_BATCH(_3DSTATE_DRAWRECT_INFO_I965);
+ OUT_BATCH(0);
+ OUT_BATCH(((brw->curr.fb.width - 1) & 0xffff) |
+ ((brw->curr.fb.height - 1) << 16));
+ OUT_BATCH(0);
+ ADVANCE_BATCH();
+ return 0;
+}
+
+const struct brw_tracked_state brw_drawing_rect = {
+ .dirty = {
+ .mesa = PIPE_NEW_FRAMEBUFFER_DIMENSIONS,
+ .brw = 0,
+ .cache = 0
+ },
+ .emit = upload_drawing_rect
+};
+
+
+/***********************************************************************
+ * Binding table pointers
+ */
+
+static int prepare_binding_table_pointers(struct brw_context *brw)
+{
+ brw_add_validated_bo(brw, brw->vs.bind_bo);
+ brw_add_validated_bo(brw, brw->wm.bind_bo);
+ return 0;
+}
+
+/**
+ * Upload the binding table pointers, which point each stage's array of surface
+ * state pointers.
+ *
+ * The binding table pointers are relative to the surface state base address,
+ * which is 0.
+ */
+static int upload_binding_table_pointers(struct brw_context *brw)
+{
+ BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
+ if (brw->vs.bind_bo != NULL)
+ OUT_RELOC(brw->vs.bind_bo,
+ BRW_USAGE_SAMPLER,
+ 0); /* vs */
+ else
+ OUT_BATCH(0);
+ OUT_BATCH(0); /* gs */
+ OUT_BATCH(0); /* clip */
+ OUT_BATCH(0); /* sf */
+ OUT_RELOC(brw->wm.bind_bo,
+ BRW_USAGE_SAMPLER,
+ 0); /* wm/ps */
+ ADVANCE_BATCH();
+ return 0;
+}
+
+const struct brw_tracked_state brw_binding_table_pointers = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_BATCH,
+ .cache = CACHE_NEW_SURF_BIND,
+ },
+ .prepare = prepare_binding_table_pointers,
+ .emit = upload_binding_table_pointers,
+};
+
+
+/**********************************************************************
+ * Upload pointers to the per-stage state.
+ *
+ * The state pointers in this packet are all relative to the general state
+ * base address set by CMD_STATE_BASE_ADDRESS, which is 0.
+ */
+static int upload_pipelined_state_pointers(struct brw_context *brw )
+{
+ BEGIN_BATCH(7, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2));
+ OUT_RELOC(brw->vs.state_bo,
+ BRW_USAGE_STATE,
+ 0);
+ if (brw->gs.prog_active)
+ OUT_RELOC(brw->gs.state_bo,
+ BRW_USAGE_STATE,
+ 1);
+ else
+ OUT_BATCH(0);
+ OUT_RELOC(brw->clip.state_bo,
+ BRW_USAGE_STATE,
+ 1);
+ OUT_RELOC(brw->sf.state_bo,
+ BRW_USAGE_STATE,
+ 0);
+ OUT_RELOC(brw->wm.state_bo,
+ BRW_USAGE_STATE,
+ 0);
+ OUT_RELOC(brw->cc.state_bo,
+ BRW_USAGE_STATE,
+ 0);
+ ADVANCE_BATCH();
+
+ brw->state.dirty.brw |= BRW_NEW_PSP;
+ return 0;
+}
+
+
+static int prepare_psp_urb_cbs(struct brw_context *brw)
+{
+ brw_add_validated_bo(brw, brw->vs.state_bo);
+ brw_add_validated_bo(brw, brw->gs.state_bo);
+ brw_add_validated_bo(brw, brw->clip.state_bo);
+ brw_add_validated_bo(brw, brw->sf.state_bo);
+ brw_add_validated_bo(brw, brw->wm.state_bo);
+ brw_add_validated_bo(brw, brw->cc.state_bo);
+ return 0;
+}
+
+static int upload_psp_urb_cbs(struct brw_context *brw )
+{
+ int ret;
+
+ ret = upload_pipelined_state_pointers(brw);
+ if (ret)
+ return ret;
+
+ ret = brw_upload_urb_fence(brw);
+ if (ret)
+ return ret;
+
+ ret = brw_upload_cs_urb_state(brw);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+const struct brw_tracked_state brw_psp_urb_cbs = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH,
+ .cache = (CACHE_NEW_VS_UNIT |
+ CACHE_NEW_GS_UNIT |
+ CACHE_NEW_GS_PROG |
+ CACHE_NEW_CLIP_UNIT |
+ CACHE_NEW_SF_UNIT |
+ CACHE_NEW_WM_UNIT |
+ CACHE_NEW_CC_UNIT)
+ },
+ .prepare = prepare_psp_urb_cbs,
+ .emit = upload_psp_urb_cbs,
+};
+
+
+/***********************************************************************
+ * Depth buffer
+ */
+
+static int prepare_depthbuffer(struct brw_context *brw)
+{
+ struct pipe_surface *zsbuf = brw->curr.fb.zsbuf;
+
+ if (zsbuf)
+ brw_add_validated_bo(brw, brw_surface(zsbuf)->bo);
+
+ return 0;
+}
+
+static int emit_depthbuffer(struct brw_context *brw)
+{
+ struct pipe_surface *surface = brw->curr.fb.zsbuf;
+ unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
+
+ if (surface == NULL) {
+ BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+ OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) |
+ (BRW_SURFACE_NULL << 29));
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+
+ if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+ OUT_BATCH(0);
+
+ ADVANCE_BATCH();
+ } else {
+ struct brw_winsys_buffer *bo;
+ unsigned int format;
+ unsigned int pitch;
+ unsigned int cpp;
+
+ switch (surface->format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ format = BRW_DEPTHFORMAT_D16_UNORM;
+ cpp = 2;
+ break;
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT;
+ cpp = 4;
+ break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ format = BRW_DEPTHFORMAT_D32_FLOAT;
+ cpp = 4;
+ break;
+ default:
+ assert(0);
+ return PIPE_ERROR_BAD_INPUT;
+ }
+
+ bo = brw_surface(surface)->bo;
+ pitch = brw_surface(surface)->pitch;
+
+ BEGIN_BATCH(len, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
+ OUT_BATCH(((pitch * cpp) - 1) |
+ (format << 18) |
+ (BRW_TILEWALK_YMAJOR << 26) |
+ ((surface->layout != PIPE_SURFACE_LAYOUT_LINEAR) << 27) |
+ (BRW_SURFACE_2D << 29));
+ OUT_RELOC(bo,
+ BRW_USAGE_DEPTH_BUFFER,
+ surface->offset);
+ OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) |
+ ((pitch - 1) << 6) |
+ ((surface->height - 1) << 19));
+ OUT_BATCH(0);
+
+ if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+ OUT_BATCH(0);
+
+ ADVANCE_BATCH();
+ }
+
+ return 0;
+}
+
+const struct brw_tracked_state brw_depthbuffer = {
+ .dirty = {
+ .mesa = PIPE_NEW_DEPTH_BUFFER,
+ .brw = BRW_NEW_BATCH,
+ .cache = 0,
+ },
+ .prepare = prepare_depthbuffer,
+ .emit = emit_depthbuffer,
+};
+
+
+
+/***********************************************************************
+ * Polygon stipple packet
+ */
+
+static int upload_polygon_stipple(struct brw_context *brw)
+{
+ BRW_CACHED_BATCH_STRUCT(brw, &brw->curr.bps);
+ return 0;
+}
+
+const struct brw_tracked_state brw_polygon_stipple = {
+ .dirty = {
+ .mesa = PIPE_NEW_POLYGON_STIPPLE,
+ .brw = 0,
+ .cache = 0
+ },
+ .emit = upload_polygon_stipple
+};
+
+
+/***********************************************************************
+ * Line stipple packet
+ */
+
+static int upload_line_stipple(struct brw_context *brw)
+{
+ const struct brw_line_stipple *bls = &brw->curr.rast->bls;
+ if (bls->header.opcode) {
+ BRW_CACHED_BATCH_STRUCT(brw, bls);
+ }
+ return 0;
+}
+
+const struct brw_tracked_state brw_line_stipple = {
+ .dirty = {
+ .mesa = PIPE_NEW_RAST,
+ .brw = 0,
+ .cache = 0
+ },
+ .emit = upload_line_stipple
+};
+
+
+/***********************************************************************
+ * Misc invariant state packets
+ */
+
+static int upload_invarient_state( struct brw_context *brw )
+{
+ {
+ /* 0x61040000 Pipeline Select */
+ /* PipelineSelect : 0 */
+ struct brw_pipeline_select ps;
+
+ memset(&ps, 0, sizeof(ps));
+ if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+ ps.header.opcode = CMD_PIPELINE_SELECT_GM45;
+ else
+ ps.header.opcode = CMD_PIPELINE_SELECT_965;
+ ps.header.pipeline_select = 0;
+ BRW_BATCH_STRUCT(brw, &ps);
+ }
+
+ {
+ struct brw_global_depth_offset_clamp gdo;
+ memset(&gdo, 0, sizeof(gdo));
+
+ /* Disable depth offset clamping.
+ */
+ gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP;
+ gdo.header.length = sizeof(gdo)/4 - 2;
+ gdo.depth_offset_clamp = 0.0;
+
+ BRW_BATCH_STRUCT(brw, &gdo);
+ }
+
+
+ /* 0x61020000 State Instruction Pointer */
+ {
+ struct brw_system_instruction_pointer sip;
+ memset(&sip, 0, sizeof(sip));
+
+ sip.header.opcode = CMD_STATE_INSN_POINTER;
+ sip.header.length = 0;
+ sip.bits0.pad = 0;
+ sip.bits0.system_instruction_pointer = 0;
+ BRW_BATCH_STRUCT(brw, &sip);
+ }
+
+ /* VF Statistics */
+ {
+ struct brw_vf_statistics vfs;
+ memset(&vfs, 0, sizeof(vfs));
+
+ if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
+ vfs.opcode = CMD_VF_STATISTICS_GM45;
+ else
+ vfs.opcode = CMD_VF_STATISTICS_965;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ vfs.statistics_enable = 1;
+
+ BRW_BATCH_STRUCT(brw, &vfs);
+ }
+
+ if (!BRW_IS_965(brw))
+ {
+ struct brw_aa_line_parameters balp;
+
+ /* use legacy aa line coverage computation */
+ memset(&balp, 0, sizeof(balp));
+ balp.header.opcode = CMD_AA_LINE_PARAMETERS;
+ balp.header.length = sizeof(balp) / 4 - 2;
+
+ BRW_BATCH_STRUCT(brw, &balp);
+ }
+
+ {
+ struct brw_polygon_stipple_offset bpso;
+
+ /* This is invariant state in gallium:
+ */
+ memset(&bpso, 0, sizeof(bpso));
+ bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
+ bpso.header.length = sizeof(bpso)/4-2;
+ bpso.bits0.y_offset = 0;
+ bpso.bits0.x_offset = 0;
+
+ BRW_BATCH_STRUCT(brw, &bpso);
+ }
+
+ return 0;
+}
+
+const struct brw_tracked_state brw_invarient_state = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_CONTEXT,
+ .cache = 0
+ },
+ .emit = upload_invarient_state
+};
+
+
+/***********************************************************************
+ * State base address
+ */
+
+/**
+ * Define the base addresses which some state is referenced from.
+ *
+ * This allows us to avoid having to emit relocations in many places for
+ * cached state, and instead emit pointers inside of large, mostly-static
+ * state pools. This comes at the cost of extra memory and potentially more
+ * cache misses.
+ */
+static int upload_state_base_address( struct brw_context *brw )
+{
+ /* Output the structure (brw_state_base_address) directly to the
+ * batchbuffer, so we can emit relocations inline.
+ */
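+ /* Each address dword below is 1, i.e. a base address (or bound) of
+ * zero with the per-field modify-enable bit set.
+ */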
+ if (BRW_IS_IGDNG(brw)) {
+ BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
+ OUT_BATCH(1); /* General state base address */
+ OUT_BATCH(1); /* Surface state base address */
+ OUT_BATCH(1); /* Indirect object base address */
+ OUT_BATCH(1); /* Instruction base address */
+ OUT_BATCH(1); /* General state upper bound */
+ OUT_BATCH(1); /* Indirect object upper bound */
+ OUT_BATCH(1); /* Instruction access upper bound */
+ ADVANCE_BATCH();
+ } else {
+ BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+ OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+ OUT_BATCH(1); /* General state base address */
+ OUT_BATCH(1); /* Surface state base address */
+ OUT_BATCH(1); /* Indirect object base address */
+ OUT_BATCH(1); /* General state upper bound */
+ OUT_BATCH(1); /* Indirect object upper bound */
+ ADVANCE_BATCH();
+ }
+ return 0;
+}
+
+const struct brw_tracked_state brw_state_base_address = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_CONTEXT,
+ .cache = 0,
+ },
+ .emit = upload_state_base_address
+};
diff --git a/src/gallium/drivers/i965/brw_pipe_blend.c b/src/gallium/drivers/i965/brw_pipe_blend.c
new file mode 100644
index 00000000000..b759a910b63
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_blend.c
@@ -0,0 +1,208 @@
+
+#include "util/u_memory.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+static int translate_logicop(unsigned logicop)
+{
+ switch (logicop) {
+ case PIPE_LOGICOP_CLEAR:
+ return BRW_LOGICOPFUNCTION_CLEAR;
+ case PIPE_LOGICOP_AND:
+ return BRW_LOGICOPFUNCTION_AND;
+ case PIPE_LOGICOP_AND_REVERSE:
+ return BRW_LOGICOPFUNCTION_AND_REVERSE;
+ case PIPE_LOGICOP_COPY:
+ return BRW_LOGICOPFUNCTION_COPY;
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return BRW_LOGICOPFUNCTION_COPY_INVERTED;
+ case PIPE_LOGICOP_AND_INVERTED:
+ return BRW_LOGICOPFUNCTION_AND_INVERTED;
+ case PIPE_LOGICOP_NOOP:
+ return BRW_LOGICOPFUNCTION_NOOP;
+ case PIPE_LOGICOP_XOR:
+ return BRW_LOGICOPFUNCTION_XOR;
+ case PIPE_LOGICOP_OR:
+ return BRW_LOGICOPFUNCTION_OR;
+ case PIPE_LOGICOP_OR_INVERTED:
+ return BRW_LOGICOPFUNCTION_OR_INVERTED;
+ case PIPE_LOGICOP_NOR:
+ return BRW_LOGICOPFUNCTION_NOR;
+ case PIPE_LOGICOP_EQUIV:
+ return BRW_LOGICOPFUNCTION_EQUIV;
+ case PIPE_LOGICOP_INVERT:
+ return BRW_LOGICOPFUNCTION_INVERT;
+ case PIPE_LOGICOP_OR_REVERSE:
+ return BRW_LOGICOPFUNCTION_OR_REVERSE;
+ case PIPE_LOGICOP_NAND:
+ return BRW_LOGICOPFUNCTION_NAND;
+ case PIPE_LOGICOP_SET:
+ return BRW_LOGICOPFUNCTION_SET;
+ default:
+ assert(0);
+ return BRW_LOGICOPFUNCTION_SET;
+ }
+}
+
+
+static unsigned translate_blend_equation( unsigned mode )
+{
+ switch (mode) {
+ case PIPE_BLEND_ADD:
+ return BRW_BLENDFUNCTION_ADD;
+ case PIPE_BLEND_MIN:
+ return BRW_BLENDFUNCTION_MIN;
+ case PIPE_BLEND_MAX:
+ return BRW_BLENDFUNCTION_MAX;
+ case PIPE_BLEND_SUBTRACT:
+ return BRW_BLENDFUNCTION_SUBTRACT;
+ case PIPE_BLEND_REVERSE_SUBTRACT:
+ return BRW_BLENDFUNCTION_REVERSE_SUBTRACT;
+ default:
+ assert(0);
+ return BRW_BLENDFUNCTION_ADD;
+ }
+}
+
+static unsigned translate_blend_factor( unsigned factor )
+{
+ switch(factor) {
+ case PIPE_BLENDFACTOR_ZERO:
+ return BRW_BLENDFACTOR_ZERO;
+ case PIPE_BLENDFACTOR_SRC_ALPHA:
+ return BRW_BLENDFACTOR_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_ONE:
+ return BRW_BLENDFACTOR_ONE;
+ case PIPE_BLENDFACTOR_SRC_COLOR:
+ return BRW_BLENDFACTOR_SRC_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+ return BRW_BLENDFACTOR_INV_SRC_COLOR;
+ case PIPE_BLENDFACTOR_DST_COLOR:
+ return BRW_BLENDFACTOR_DST_COLOR;
+ case PIPE_BLENDFACTOR_INV_DST_COLOR:
+ return BRW_BLENDFACTOR_INV_DST_COLOR;
+ case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+ return BRW_BLENDFACTOR_INV_SRC_ALPHA;
+ case PIPE_BLENDFACTOR_DST_ALPHA:
+ return BRW_BLENDFACTOR_DST_ALPHA;
+ case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+ return BRW_BLENDFACTOR_INV_DST_ALPHA;
+ case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+ return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE;
+ case PIPE_BLENDFACTOR_CONST_COLOR:
+ return BRW_BLENDFACTOR_CONST_COLOR;
+ case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+ return BRW_BLENDFACTOR_INV_CONST_COLOR;
+ case PIPE_BLENDFACTOR_CONST_ALPHA:
+ return BRW_BLENDFACTOR_CONST_ALPHA;
+ case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+ return BRW_BLENDFACTOR_INV_CONST_ALPHA;
+ default:
+ assert(0);
+ return BRW_BLENDFACTOR_ZERO;
+ }
+}
+
+static void *brw_create_blend_state( struct pipe_context *pipe,
+ const struct pipe_blend_state *templ )
+{
+ struct brw_blend_state *blend = CALLOC_STRUCT(brw_blend_state);
+ if (blend == NULL)
+ return NULL;
+
+ if (templ->logicop_enable) {
+ blend->cc2.logicop_enable = 1;
+ blend->cc5.logicop_func = translate_logicop(templ->logicop_func);
+ }
+ else if (templ->blend_enable) {
+ blend->cc6.dest_blend_factor = translate_blend_factor(templ->rgb_dst_factor);
+ blend->cc6.src_blend_factor = translate_blend_factor(templ->rgb_src_factor);
+ blend->cc6.blend_function = translate_blend_equation(templ->rgb_func);
+
+ blend->cc5.ia_dest_blend_factor = translate_blend_factor(templ->alpha_dst_factor);
+ blend->cc5.ia_src_blend_factor = translate_blend_factor(templ->alpha_src_factor);
+ blend->cc5.ia_blend_function = translate_blend_equation(templ->alpha_func);
+
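+      /* Independent alpha blending is only needed when the alpha
+       * factors or function differ from the RGB ones.
+       */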
+ blend->cc3.blend_enable = 1;
+ blend->cc3.ia_blend_enable =
+ (blend->cc6.dest_blend_factor != blend->cc5.ia_dest_blend_factor ||
+ blend->cc6.src_blend_factor != blend->cc5.ia_src_blend_factor ||
+ blend->cc6.blend_function != blend->cc5.ia_blend_function);
+
+ /* Per-surface blend enables, currently just follow global
+ * state:
+ */
+ blend->ss0.color_blend = 1;
+ }
+
+ blend->cc5.dither_enable = templ->dither;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ blend->cc5.statistics_enable = 1;
+
+ /* Per-surface color mask -- just follow global state:
+ */
+ blend->ss0.writedisable_red = (templ->colormask & PIPE_MASK_R) ? 0 : 1;
+ blend->ss0.writedisable_green = (templ->colormask & PIPE_MASK_G) ? 0 : 1;
+ blend->ss0.writedisable_blue = (templ->colormask & PIPE_MASK_B) ? 0 : 1;
+ blend->ss0.writedisable_alpha = (templ->colormask & PIPE_MASK_A) ? 0 : 1;
+
+ return (void *)blend;
+}
+
+static void brw_bind_blend_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ brw->curr.blend = (const struct brw_blend_state *)cso;
+ brw->state.dirty.mesa |= PIPE_NEW_BLEND;
+}
+
+static void brw_delete_blend_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ assert((const void *)cso != (const void *)brw->curr.blend);
+ FREE(cso);
+}
+
+
+static void brw_set_blend_color(struct pipe_context *pipe,
+ const struct pipe_blend_color *blend_color)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_blend_constant_color *bcc = &brw->curr.bcc;
+
+ bcc->blend_constant_color[0] = blend_color->color[0];
+ bcc->blend_constant_color[1] = blend_color->color[1];
+ bcc->blend_constant_color[2] = blend_color->color[2];
+ bcc->blend_constant_color[3] = blend_color->color[3];
+
+ brw->state.dirty.mesa |= PIPE_NEW_BLEND_COLOR;
+}
+
+
+void brw_pipe_blend_init( struct brw_context *brw )
+{
+ brw->base.set_blend_color = brw_set_blend_color;
+ brw->base.create_blend_state = brw_create_blend_state;
+ brw->base.bind_blend_state = brw_bind_blend_state;
+ brw->base.delete_blend_state = brw_delete_blend_state;
+
+ {
+ struct brw_blend_constant_color *bcc = &brw->curr.bcc;
+
+ memset(bcc, 0, sizeof(*bcc));
+ bcc->header.opcode = CMD_BLEND_CONSTANT_COLOR;
+ bcc->header.length = sizeof(*bcc)/4-2;
+ }
+
+}
+
+void brw_pipe_blend_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_clear.c b/src/gallium/drivers/i965/brw_pipe_clear.c
new file mode 100644
index 00000000000..452e1e89f93
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_clear.c
@@ -0,0 +1,218 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_pack_color.h"
+
+#include "pipe/p_state.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_screen.h"
+#include "brw_context.h"
+
+#define MASK16 0xffff
+#define MASK24 0xffffff
+
+
+/**
+ * Use the 2D blitter to fill the given surface with 'value'.
+ *
+ * \param value  the packed clear value (color or depth/stencil) in the
+ *               surface's format
+ */
+static enum pipe_error
+try_clear( struct brw_context *brw,
+ struct brw_surface *surface,
+ unsigned value )
+{
+ uint32_t BR13, CMD;
+ int x1 = 0;
+ int y1 = 0;
+ int x2 = surface->base.width;
+ int y2 = surface->base.height;
+ int pitch = surface->pitch;
+ int cpp = surface->cpp;
+
+ if (x2 == 0 || y2 == 0)
+ return 0;
+
+ debug_printf("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+ __FUNCTION__,
+ (void *)surface->bo, pitch * cpp,
+ surface->base.offset,
+ x1, y1, x2 - x1, y2 - y1);
+
+ BR13 = 0xf0 << 16;
+ CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_RGB | XY_BLT_WRITE_ALPHA;
+
+ /* Setup the blit command */
+ if (cpp == 4) {
+ BR13 |= BR13_8888;
+ CMD |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
+ }
+ else {
+ assert(cpp == 2);
+ BR13 |= BR13_565;
+ }
+
+ /* XXX: nasty hack for clearing depth buffers
+ */
+ if (surface->tiling == BRW_TILING_Y) {
+ x2 = pitch;
+ }
+
+ if (surface->tiling == BRW_TILING_X) {
+ CMD |= XY_DST_TILED;
+ pitch /= 4;
+ }
+
+ BR13 |= (pitch * cpp);
+
+ BEGIN_BATCH(6, 0);
+ OUT_BATCH(CMD);
+ OUT_BATCH(BR13);
+ OUT_BATCH((y1 << 16) | x1);
+ OUT_BATCH((y2 << 16) | x2);
+ OUT_RELOC(surface->bo,
+ BRW_USAGE_BLIT_DEST,
+ surface->base.offset);
+ OUT_BATCH(value);
+ ADVANCE_BATCH();
+
+ return 0;
+}
+
+
+
+
+static void color_clear(struct brw_context *brw,
+ struct brw_surface *bsurface,
+ const float *rgba )
+{
+ enum pipe_error ret;
+ union util_color value;
+
+ util_pack_color( rgba, bsurface->base.format, &value );
+
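+   /* The blitter fill value is a full dword, so replicate 16bpp colors
+    * into both halves.
+    */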
+ if (bsurface->cpp == 2)
+ value.ui |= value.ui << 16;
+
+ ret = try_clear( brw, bsurface, value.ui );
+
+ if (ret != 0) {
+ brw_context_flush( brw );
+ ret = try_clear( brw, bsurface, value.ui );
+ assert( ret == 0 );
+ }
+}
+
+static void zstencil_clear(struct brw_context *brw,
+ struct brw_surface *bsurface,
+ double depth,
+ unsigned stencil )
+{
+ enum pipe_error ret;
+ unsigned value;
+
+ switch (bsurface->base.format) {
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ value = ((unsigned)(depth * MASK24) & MASK24);
+ break;
+ case PIPE_FORMAT_Z16_UNORM:
+ value = ((unsigned)(depth * MASK16) & MASK16);
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
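+   /* Merge stencil into the top byte for the 24-bit depth formats, or
+    * replicate the 16-bit depth value into both halves of the dword
+    * for Z16.
+    */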
+ switch (bsurface->base.format) {
+ case PIPE_FORMAT_X8Z24_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
+ value = value | (stencil << 24);
+ break;
+
+ case PIPE_FORMAT_Z16_UNORM:
+ value = value | (value << 16);
+ break;
+
+ default:
+ break;
+ }
+
+ ret = try_clear( brw, bsurface, value );
+
+ if (ret != 0) {
+ brw_context_flush( brw );
+ ret = try_clear( brw, bsurface, value );
+ assert( ret == 0 );
+ }
+}
+
+
+
+/**
+ * Clear the given surface to the specified value.
+ * No masking, no scissor (clear entire buffer).
+ */
+static void brw_clear(struct pipe_context *pipe,
+ unsigned buffers,
+ const float *rgba,
+ double depth,
+ unsigned stencil)
+{
+ struct brw_context *brw = brw_context( pipe );
+ int i;
+
+ if (buffers & PIPE_CLEAR_COLOR) {
+ for (i = 0; i < brw->curr.fb.nr_cbufs; i++) {
+ color_clear( brw,
+ brw_surface(brw->curr.fb.cbufs[i]),
+ rgba );
+ }
+ }
+
+ if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+ if (brw->curr.fb.zsbuf) {
+ zstencil_clear( brw,
+ brw_surface(brw->curr.fb.zsbuf),
+ depth, stencil );
+ }
+ }
+}
+
+
+void brw_pipe_clear_init( struct brw_context *brw )
+{
+ brw->base.clear = brw_clear;
+}
+
+
+void brw_pipe_clear_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_depth.c b/src/gallium/drivers/i965/brw_pipe_depth.c
new file mode 100644
index 00000000000..e010d76e0d3
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_depth.c
@@ -0,0 +1,172 @@
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+/* XXX: Fixme - include this to get IZ_ defines
+ */
+#include "brw_wm.h"
+
+static unsigned brw_translate_compare_func(unsigned func)
+{
+ switch (func) {
+ case PIPE_FUNC_NEVER:
+ return BRW_COMPAREFUNCTION_NEVER;
+ case PIPE_FUNC_LESS:
+ return BRW_COMPAREFUNCTION_LESS;
+ case PIPE_FUNC_LEQUAL:
+ return BRW_COMPAREFUNCTION_LEQUAL;
+ case PIPE_FUNC_GREATER:
+ return BRW_COMPAREFUNCTION_GREATER;
+ case PIPE_FUNC_GEQUAL:
+ return BRW_COMPAREFUNCTION_GEQUAL;
+ case PIPE_FUNC_NOTEQUAL:
+ return BRW_COMPAREFUNCTION_NOTEQUAL;
+ case PIPE_FUNC_EQUAL:
+ return BRW_COMPAREFUNCTION_EQUAL;
+ case PIPE_FUNC_ALWAYS:
+ return BRW_COMPAREFUNCTION_ALWAYS;
+ default:
+ assert(0);
+ return BRW_COMPAREFUNCTION_ALWAYS;
+ }
+}
+
+static unsigned translate_stencil_op(unsigned op)
+{
+ switch (op) {
+ case PIPE_STENCIL_OP_KEEP:
+ return BRW_STENCILOP_KEEP;
+ case PIPE_STENCIL_OP_ZERO:
+ return BRW_STENCILOP_ZERO;
+ case PIPE_STENCIL_OP_REPLACE:
+ return BRW_STENCILOP_REPLACE;
+ case PIPE_STENCIL_OP_INCR:
+ return BRW_STENCILOP_INCRSAT;
+ case PIPE_STENCIL_OP_DECR:
+ return BRW_STENCILOP_DECRSAT;
+ case PIPE_STENCIL_OP_INCR_WRAP:
+ return BRW_STENCILOP_INCR;
+ case PIPE_STENCIL_OP_DECR_WRAP:
+ return BRW_STENCILOP_DECR;
+ case PIPE_STENCIL_OP_INVERT:
+ return BRW_STENCILOP_INVERT;
+ default:
+ assert(0);
+ return BRW_STENCILOP_ZERO;
+ }
+}
+
+static void create_bcc_state( struct brw_depth_stencil_state *zstencil,
+ const struct pipe_depth_stencil_alpha_state *templ )
+{
+ if (templ->stencil[0].enabled) {
+ zstencil->cc0.stencil_enable = 1;
+ zstencil->cc0.stencil_func =
+ brw_translate_compare_func(templ->stencil[0].func);
+ zstencil->cc0.stencil_fail_op =
+ translate_stencil_op(templ->stencil[0].fail_op);
+ zstencil->cc0.stencil_pass_depth_fail_op =
+ translate_stencil_op(templ->stencil[0].zfail_op);
+ zstencil->cc0.stencil_pass_depth_pass_op =
+ translate_stencil_op(templ->stencil[0].zpass_op);
+ zstencil->cc1.stencil_ref = templ->stencil[0].ref_value;
+ zstencil->cc1.stencil_write_mask = templ->stencil[0].writemask;
+ zstencil->cc1.stencil_test_mask = templ->stencil[0].valuemask;
+
+ if (templ->stencil[1].enabled) {
+ zstencil->cc0.bf_stencil_enable = 1;
+ zstencil->cc0.bf_stencil_func =
+ brw_translate_compare_func(templ->stencil[1].func);
+ zstencil->cc0.bf_stencil_fail_op =
+ translate_stencil_op(templ->stencil[1].fail_op);
+ zstencil->cc0.bf_stencil_pass_depth_fail_op =
+ translate_stencil_op(templ->stencil[1].zfail_op);
+ zstencil->cc0.bf_stencil_pass_depth_pass_op =
+ translate_stencil_op(templ->stencil[1].zpass_op);
+ zstencil->cc1.bf_stencil_ref = templ->stencil[1].ref_value;
+ zstencil->cc2.bf_stencil_write_mask = templ->stencil[1].writemask;
+ zstencil->cc2.bf_stencil_test_mask = templ->stencil[1].valuemask;
+ }
+
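+      /* Enable stencil writes only if either face has a non-zero
+       * write mask.
+       */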
+ zstencil->cc0.stencil_write_enable = (zstencil->cc1.stencil_write_mask ||
+ zstencil->cc2.bf_stencil_write_mask);
+ }
+
+
+ if (templ->alpha.enabled) {
+ zstencil->cc3.alpha_test = 1;
+ zstencil->cc3.alpha_test_func = brw_translate_compare_func(templ->alpha.func);
+ zstencil->cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
+ zstencil->cc7.alpha_ref.ub[0] = float_to_ubyte(templ->alpha.ref_value);
+ }
+
+ if (templ->depth.enabled) {
+ zstencil->cc2.depth_test = 1;
+ zstencil->cc2.depth_test_function = brw_translate_compare_func(templ->depth.func);
+ zstencil->cc2.depth_write_enable = templ->depth.writemask;
+ }
+}
+
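+/* Pre-compute the WM IZ-lookup bits for this depth/stencil/alpha state.
+ */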
+static void create_wm_iz_state( struct brw_depth_stencil_state *zstencil )
+{
+ if (zstencil->cc3.alpha_test)
+ zstencil->iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+ if (zstencil->cc2.depth_test)
+ zstencil->iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
+
+ if (zstencil->cc2.depth_write_enable)
+ zstencil->iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
+
+ if (zstencil->cc0.stencil_enable)
+ zstencil->iz_lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
+
+ if (zstencil->cc0.stencil_write_enable)
+ zstencil->iz_lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
+
+}
+
+
+static void *
+brw_create_depth_stencil_state( struct pipe_context *pipe,
+ const struct pipe_depth_stencil_alpha_state *templ )
+{
+   struct brw_depth_stencil_state *zstencil = CALLOC_STRUCT(brw_depth_stencil_state);
+   if (zstencil == NULL)
+      return NULL;
+
+   create_bcc_state( zstencil, templ );
+ create_wm_iz_state( zstencil );
+
+ return (void *)zstencil;
+}
+
+
+static void brw_bind_depth_stencil_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ brw->curr.zstencil = (const struct brw_depth_stencil_state *)cso;
+ brw->state.dirty.mesa |= PIPE_NEW_DEPTH_STENCIL_ALPHA;
+}
+
+static void brw_delete_depth_stencil_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ assert((const void *)cso != (const void *)brw->curr.zstencil);
+ FREE(cso);
+}
+
+
+void brw_pipe_depth_stencil_init( struct brw_context *brw )
+{
+ brw->base.create_depth_stencil_alpha_state = brw_create_depth_stencil_state;
+ brw->base.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state;
+ brw->base.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state;
+}
+
+void brw_pipe_depth_stencil_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
new file mode 100644
index 00000000000..5d4e5025f97
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -0,0 +1,84 @@
+#include "util/u_math.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_debug.h"
+
+/**
+ * Bind new framebuffer state, flagging only the parts that changed.
+ */
+static void brw_set_framebuffer_state( struct pipe_context *pipe,
+ const struct pipe_framebuffer_state *fb )
+{
+ struct brw_context *brw = brw_context(pipe);
+ unsigned i;
+
+ /* Dimensions:
+ */
+ if (brw->curr.fb.width != fb->width ||
+ brw->curr.fb.height != fb->height) {
+ brw->curr.fb.width = fb->width;
+ brw->curr.fb.height = fb->height;
+ brw->state.dirty.mesa |= PIPE_NEW_FRAMEBUFFER_DIMENSIONS;
+ }
+
+ /* Z/Stencil
+ */
+ if (brw->curr.fb.zsbuf != fb->zsbuf) {
+ pipe_surface_reference(&brw->curr.fb.zsbuf, fb->zsbuf);
+ brw->state.dirty.mesa |= PIPE_NEW_DEPTH_BUFFER;
+ }
+
+ /* Color buffers:
+ */
+ for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+ if (brw->curr.fb.cbufs[i] != fb->cbufs[i]) {
+ brw->state.dirty.mesa |= PIPE_NEW_COLOR_BUFFERS;
+ pipe_surface_reference(&brw->curr.fb.cbufs[i], fb->cbufs[i]);
+ }
+ }
+
+ if (brw->curr.fb.nr_cbufs != fb->nr_cbufs) {
+ brw->curr.fb.nr_cbufs = MIN2(BRW_MAX_DRAW_BUFFERS, fb->nr_cbufs);
+ brw->state.dirty.mesa |= PIPE_NEW_NR_CBUFS;
+ }
+}
+
+
+static void brw_set_viewport_state( struct pipe_context *pipe,
+ const struct pipe_viewport_state *viewport )
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ brw->curr.viewport = *viewport;
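+
+   /* Derive the depth range by transforming clip-space z = -1 and z = +1
+    * through the viewport scale and translate.
+    */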
+ brw->curr.ccv.min_depth = viewport->scale[2] * -1.0 + viewport->translate[2];
+ brw->curr.ccv.max_depth = viewport->scale[2] * 1.0 + viewport->translate[2];
+
+ if (0)
+ debug_printf("%s depth range %f .. %f\n",
+ __FUNCTION__,
+ brw->curr.ccv.min_depth,
+ brw->curr.ccv.max_depth);
+
+ brw->state.dirty.mesa |= PIPE_NEW_VIEWPORT;
+}
+
+
+void brw_pipe_framebuffer_init( struct brw_context *brw )
+{
+ brw->base.set_framebuffer_state = brw_set_framebuffer_state;
+ brw->base.set_viewport_state = brw_set_viewport_state;
+}
+
+void brw_pipe_framebuffer_cleanup( struct brw_context *brw )
+{
+ struct pipe_framebuffer_state *fb = &brw->curr.fb;
+ int i;
+
+ for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+ pipe_surface_reference(&fb->cbufs[i], NULL);
+ }
+
+ pipe_surface_reference(&fb->zsbuf, NULL);
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_flush.c b/src/gallium/drivers/i965/brw_pipe_flush.c
new file mode 100644
index 00000000000..fdc4814b221
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_flush.c
@@ -0,0 +1,83 @@
+
+#include "util/u_upload_mgr.h"
+
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_batchbuffer.h"
+
+
+
+/* All batchbuffer flushes must go through this function.
+ */
+void brw_context_flush( struct brw_context *brw )
+{
+   /* Emit the ending PS_DEPTH_COUNT write for any active occlusion
+    * query before the batch is submitted.
+    */
+ brw_emit_query_end(brw);
+
+ /* Move to the end of the current upload buffer so that we'll force choosing
+ * a new buffer next time.
+ */
+ u_upload_flush( brw->vb.upload_vertex );
+ u_upload_flush( brw->vb.upload_index );
+
+ _brw_batchbuffer_flush( brw->batch, __FILE__, __LINE__ );
+
+ /* Mark all context state as needing to be re-emitted.
+ * This is probably not as severe as on 915, since almost all of our state
+ * is just in referenced buffers.
+ */
+ brw->state.dirty.brw |= BRW_NEW_CONTEXT;
+ brw->state.dirty.mesa |= ~0;
+ brw->state.dirty.brw |= ~0;
+ brw->state.dirty.cache |= ~0;
+
+ brw->curbe.need_new_bo = GL_TRUE;
+}
+
+static void
+brw_flush( struct pipe_context *pipe,
+ unsigned flags,
+ struct pipe_fence_handle **fence )
+{
+ brw_context_flush( brw_context( pipe ) );
+ if (fence)
+ *fence = NULL;
+}
+
+static unsigned brw_is_buffer_referenced(struct pipe_context *pipe,
+ struct pipe_buffer *buffer)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_screen *bscreen = brw_screen(brw->base.screen);
+
+ return brw_is_buffer_referenced_by_bo( bscreen,
+ buffer,
+ brw->batch->buf );
+}
+
+static unsigned brw_is_texture_referenced(struct pipe_context *pipe,
+ struct pipe_texture *texture,
+ unsigned face,
+ unsigned level)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_screen *bscreen = brw_screen(brw->base.screen);
+
+ return brw_is_texture_referenced_by_bo( bscreen,
+ texture, face, level,
+ brw->batch->buf );
+}
+
+void brw_pipe_flush_init( struct brw_context *brw )
+{
+ brw->base.flush = brw_flush;
+ brw->base.is_buffer_referenced = brw_is_buffer_referenced;
+ brw->base.is_texture_referenced = brw_is_texture_referenced;
+}
+
+
+void brw_pipe_flush_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_misc.c b/src/gallium/drivers/i965/brw_pipe_misc.c
new file mode 100644
index 00000000000..30359078079
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_misc.c
@@ -0,0 +1,54 @@
+
+#include "brw_context.h"
+#include "brw_structs.h"
+#include "brw_defines.h"
+
+static void brw_set_polygon_stipple( struct pipe_context *pipe,
+ const struct pipe_poly_stipple *stip )
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_polygon_stipple *bps = &brw->curr.bps;
+ GLuint i;
+
+ memset(bps, 0, sizeof *bps);
+ bps->header.opcode = CMD_POLY_STIPPLE_PATTERN;
+ bps->header.length = sizeof *bps/4-2;
+
+ for (i = 0; i < 32; i++)
+ bps->stipple[i] = stip->stipple[i]; /* don't invert */
+
+ brw->state.dirty.mesa |= PIPE_NEW_POLYGON_STIPPLE;
+}
+
+
+static void brw_set_scissor_state( struct pipe_context *pipe,
+ const struct pipe_scissor_state *scissor )
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ brw->curr.scissor = *scissor;
+ brw->state.dirty.mesa |= PIPE_NEW_SCISSOR;
+}
+
+
+static void brw_set_clip_state( struct pipe_context *pipe,
+ const struct pipe_clip_state *clip )
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ brw->curr.ucp = *clip;
+ brw->state.dirty.mesa |= PIPE_NEW_CLIP;
+}
+
+
+void brw_pipe_misc_init( struct brw_context *brw )
+{
+ brw->base.set_polygon_stipple = brw_set_polygon_stipple;
+ brw->base.set_scissor_state = brw_set_scissor_state;
+ brw->base.set_clip_state = brw_set_clip_state;
+}
+
+
+void brw_pipe_misc_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_query.c b/src/gallium/drivers/i965/brw_pipe_query.c
new file mode 100644
index 00000000000..2eb862635cc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_query.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file support for ARB_query_object
+ *
+ * ARB_query_object is implemented by using the PIPE_CONTROL command to stall
+ * execution on the completion of previous depth tests, and write the
+ * current PS_DEPTH_COUNT to a buffer object.
+ *
+ * We use before and after counts when drawing during a query so that
+ * we don't pick up other clients' query data in ours. To reduce overhead,
+ * a single BO is used to record the query data for all active queries at
+ * once. This also gives us a simple bound on how much batchbuffer space is
+ * required for handling queries, so that we can be sure that we won't
+ * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
+ */
+#include "util/u_simple_list.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_reg.h"
+
+/** Waits on the query object's BO and totals the results for this query */
+static boolean
+brw_query_get_result(struct pipe_context *pipe,
+ struct pipe_query *q,
+ boolean wait,
+ uint64_t *result)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_query_object *query = (struct brw_query_object *)q;
+
+ /* Map and count the pixels from the current query BO */
+ if (query->bo) {
+ int i;
+ uint64_t *map;
+
+ if (brw->sws->bo_is_busy(query->bo) && !wait)
+ return FALSE;
+
+ map = bo_map_read(brw->sws, query->bo);
+ if (map == NULL)
+ return FALSE;
+
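+      /* Each index in the BO holds a begin/end PS_DEPTH_COUNT pair;
+       * accumulate the deltas to get the samples-passed count.
+       */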
+ for (i = query->first_index; i <= query->last_index; i++) {
+ query->result += map[i * 2 + 1] - map[i * 2];
+ }
+
+ brw->sws->bo_unmap(query->bo);
+ bo_reference(&query->bo, NULL);
+ }
+
+ *result = query->result;
+ return TRUE;
+}
+
+static struct pipe_query *
+brw_query_create(struct pipe_context *pipe, unsigned type )
+{
+ struct brw_query_object *query;
+
+ switch (type) {
+ case PIPE_QUERY_OCCLUSION_COUNTER:
+ query = CALLOC_STRUCT( brw_query_object );
+ if (query == NULL)
+ return NULL;
+ return (struct pipe_query *)query;
+
+ default:
+ return NULL;
+ }
+}
+
+static void
+brw_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
+{
+ struct brw_query_object *query = (struct brw_query_object *)q;
+
+ bo_reference(&query->bo, NULL);
+ FREE(query);
+}
+
+static void
+brw_query_begin(struct pipe_context *pipe, struct pipe_query *q)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_query_object *query = (struct brw_query_object *)q;
+
+ /* Reset our driver's tracking of query state. */
+ bo_reference(&query->bo, NULL);
+ query->result = 0;
+ query->first_index = -1;
+ query->last_index = -1;
+
+ insert_at_head(&brw->query.active_head, query);
+ brw->query.stats_wm++;
+ brw->state.dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+static void
+brw_query_end(struct pipe_context *pipe, struct pipe_query *q)
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_query_object *query = (struct brw_query_object *)q;
+
+ /* Flush the batchbuffer in case it has writes to our query BO.
+ * Have later queries write to a new query BO so that further rendering
+ * doesn't delay the collection of our results.
+ */
+ if (query->bo) {
+ brw_emit_query_end(brw);
+ brw_context_flush( brw );
+
+ bo_reference(&brw->query.bo, NULL);
+ }
+
+ remove_from_list(query);
+ brw->query.stats_wm--;
+ brw->state.dirty.mesa |= PIPE_NEW_QUERY;
+}
+
+/***********************************************************************
+ * Internal functions and callbacks to implement queries
+ */
+
+/** Called to set up the query BO and account for its aperture space */
+enum pipe_error
+brw_prepare_query_begin(struct brw_context *brw)
+{
+ enum pipe_error ret;
+
+ /* Skip if we're not doing any queries. */
+ if (is_empty_list(&brw->query.active_head))
+ return PIPE_OK;
+
+ /* Get a new query BO if we're going to need it. */
+ if (brw->query.bo == NULL ||
+ brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
+
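+      /* A 4kB BO holds 512 uint64_t counters, i.e. 256 begin/end pairs. */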
+ ret = brw->sws->bo_alloc(brw->sws, BRW_BUFFER_TYPE_QUERY, 4096, 1,
+ &brw->query.bo);
+ if (ret)
+ return ret;
+
+ brw->query.index = 0;
+ }
+
+ brw_add_validated_bo(brw, brw->query.bo);
+
+ return PIPE_OK;
+}
+
+/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
+void
+brw_emit_query_begin(struct brw_context *brw)
+{
+ struct brw_query_object *query;
+
+ /* Skip if we're not doing any queries, or we've emitted the start. */
+ if (brw->query.active || is_empty_list(&brw->query.active_head))
+ return;
+
+ BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+ OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+ PIPE_CONTROL_DEPTH_STALL |
+ PIPE_CONTROL_WRITE_DEPTH_COUNT);
+ /* This object could be mapped cacheable, but we don't have an exposed
+ * mechanism to support that. Since it's going uncached, tell GEM that
+ * we're writing to it. The usual clflush should be all that's required
+ * to pick up the results.
+ */
+ OUT_RELOC(brw->query.bo,
+ BRW_USAGE_QUERY_RESULT,
+ PIPE_CONTROL_GLOBAL_GTT_WRITE |
+ ((brw->query.index * 2) * sizeof(uint64_t)));
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+ ADVANCE_BATCH();
+
+ foreach(query, &brw->query.active_head) {
+ if (query->bo != brw->query.bo) {
+ uint64_t tmp;
+
+         /* Propagate the results from this buffer to all of the
+          * active queries, as the bo is going away.
+          */
+ if (query->bo != NULL)
+ brw_query_get_result( &brw->base,
+ (struct pipe_query *)query,
+ FALSE,
+ &tmp );
+
+ bo_reference( &query->bo, brw->query.bo );
+ query->first_index = brw->query.index;
+ }
+ query->last_index = brw->query.index;
+ }
+ brw->query.active = GL_TRUE;
+}
+
+/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
+void
+brw_emit_query_end(struct brw_context *brw)
+{
+ if (!brw->query.active)
+ return;
+
+ BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+ OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+ PIPE_CONTROL_DEPTH_STALL |
+ PIPE_CONTROL_WRITE_DEPTH_COUNT);
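+   /* Write the ending PS_DEPTH_COUNT into the odd slot of the current
+    * counter pair.
+    */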
+ OUT_RELOC(brw->query.bo,
+ BRW_USAGE_QUERY_RESULT,
+ PIPE_CONTROL_GLOBAL_GTT_WRITE |
+ ((brw->query.index * 2 + 1) * sizeof(uint64_t)));
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+ ADVANCE_BATCH();
+
+ brw->query.active = GL_FALSE;
+ brw->query.index++;
+}
+
+void brw_pipe_query_init( struct brw_context *brw )
+{
+ brw->base.create_query = brw_query_create;
+ brw->base.destroy_query = brw_query_destroy;
+ brw->base.begin_query = brw_query_begin;
+ brw->base.end_query = brw_query_end;
+ brw->base.get_query_result = brw_query_get_result;
+}
+
+
+void brw_pipe_query_cleanup( struct brw_context *brw )
+{
+ /* Unreference brw->query.bo ??
+ */
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.c b/src/gallium/drivers/i965/brw_pipe_rast.c
new file mode 100644
index 00000000000..2117e91a9e4
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.c
@@ -0,0 +1,161 @@
+
+#include "util/u_memory.h"
+#include "pipe/p_defines.h"
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_pipe_rast.h"
+#include "brw_wm.h"
+
+
+static unsigned translate_fill( unsigned fill )
+{
+ switch (fill) {
+ case PIPE_POLYGON_MODE_FILL:
+ return CLIP_FILL;
+ case PIPE_POLYGON_MODE_LINE:
+ return CLIP_LINE;
+ case PIPE_POLYGON_MODE_POINT:
+ return CLIP_POINT;
+ default:
+ assert(0);
+ return CLIP_FILL;
+ }
+}
+
+
+/* Calculates the key for triangle-mode clipping. Non-triangle
+ * clipping keys use much less information and are computed on the
+ * fly.
+ */
+static void
+calculate_clip_key_rast( const struct brw_context *brw,
+ const struct pipe_rasterizer_state *templ,
+ const struct brw_rasterizer_state *rast,
+ struct brw_clip_prog_key *key)
+{
+ memset(key, 0, sizeof *key);
+
+ if (brw->chipset.is_igdng)
+ key->clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+ else
+ key->clip_mode = BRW_CLIPMODE_NORMAL;
+
+ key->do_flat_shading = templ->flatshade;
+
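+   /* Culling both windings discards every triangle, so have the clipper
+    * reject everything up front.
+    */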
+ if (templ->cull_mode == PIPE_WINDING_BOTH) {
+ key->clip_mode = BRW_CLIPMODE_REJECT_ALL;
+ return;
+ }
+
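+   /* Start with both faces culled, then un-cull whichever windings are
+    * not in the cull mask.
+    */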
+ key->fill_ccw = CLIP_CULL;
+ key->fill_cw = CLIP_CULL;
+
+ if (!(templ->cull_mode & PIPE_WINDING_CCW)) {
+ key->fill_ccw = translate_fill(templ->fill_ccw);
+ }
+
+ if (!(templ->cull_mode & PIPE_WINDING_CW)) {
+ key->fill_cw = translate_fill(templ->fill_cw);
+ }
+
+ if (key->fill_cw == CLIP_LINE ||
+ key->fill_ccw == CLIP_LINE ||
+ key->fill_cw == CLIP_POINT ||
+ key->fill_ccw == CLIP_POINT) {
+ key->do_unfilled = 1;
+ key->clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED;
+ }
+
+ key->offset_ccw = templ->offset_ccw;
+ key->offset_cw = templ->offset_cw;
+
+ if (templ->light_twoside && key->fill_cw != CLIP_CULL)
+ key->copy_bfc_cw = 1;
+
+ if (templ->light_twoside && key->fill_ccw != CLIP_CULL)
+ key->copy_bfc_ccw = 1;
+}
+
+
+static void
+calculate_line_stipple_rast( const struct pipe_rasterizer_state *templ,
+ struct brw_line_stipple *bls )
+{
+ GLfloat tmp = 1.0f / (templ->line_stipple_factor + 1);
+ GLint tmpi = tmp * (1<<13);
+
+ bls->header.opcode = CMD_LINE_STIPPLE_PATTERN;
+ bls->header.length = sizeof(*bls)/4 - 2;
+ bls->bits0.pattern = templ->line_stipple_pattern;
+ bls->bits1.repeat_count = templ->line_stipple_factor + 1;
+ bls->bits1.inverse_repeat_count = tmpi;
+}
+
+static void *brw_create_rasterizer_state( struct pipe_context *pipe,
+ const struct pipe_rasterizer_state *templ )
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_rasterizer_state *rast;
+
+ rast = CALLOC_STRUCT(brw_rasterizer_state);
+ if (rast == NULL)
+ return NULL;
+
+ rast->templ = *templ;
+
+ calculate_clip_key_rast( brw, templ, rast, &rast->clip_key );
+
+ if (templ->line_stipple_enable)
+ calculate_line_stipple_rast( templ, &rast->bls );
+
+   /* Calculate the lookup value for the WM IZ table.
+ */
+ if (templ->line_smooth) {
+ if (templ->fill_cw == PIPE_POLYGON_MODE_LINE &&
+ templ->fill_ccw == PIPE_POLYGON_MODE_LINE) {
+ rast->unfilled_aa_line = AA_ALWAYS;
+ }
+ else if (templ->fill_cw == PIPE_POLYGON_MODE_LINE ||
+ templ->fill_ccw == PIPE_POLYGON_MODE_LINE) {
+ rast->unfilled_aa_line = AA_SOMETIMES;
+ }
+ else {
+ rast->unfilled_aa_line = AA_NEVER;
+ }
+ }
+ else {
+ rast->unfilled_aa_line = AA_NEVER;
+ }
+
+ return (void *)rast;
+}
+
+
+static void brw_bind_rasterizer_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ brw->curr.rast = (const struct brw_rasterizer_state *)cso;
+ brw->state.dirty.mesa |= PIPE_NEW_RAST;
+}
+
+static void brw_delete_rasterizer_state(struct pipe_context *pipe,
+ void *cso)
+{
+ struct brw_context *brw = brw_context(pipe);
+ assert((const void *)cso != (const void *)brw->curr.rast);
+ FREE(cso);
+}
+
+
+
+void brw_pipe_rast_init( struct brw_context *brw )
+{
+ brw->base.create_rasterizer_state = brw_create_rasterizer_state;
+ brw->base.bind_rasterizer_state = brw_bind_rasterizer_state;
+ brw->base.delete_rasterizer_state = brw_delete_rasterizer_state;
+}
+
+void brw_pipe_rast_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_rast.h b/src/gallium/drivers/i965/brw_pipe_rast.h
new file mode 100644
index 00000000000..9354f01e18a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_rast.h
@@ -0,0 +1,16 @@
+#ifndef BRW_PIPE_RAST_H
+#define BRW_PIPE_RAST_H
+
+#include "brw_clip.h"
+
+struct brw_rasterizer_state {
+ struct pipe_rasterizer_state templ; /* for draw module */
+
+ /* Precalculated hardware state:
+ */
+ struct brw_clip_prog_key clip_key;
+ struct brw_line_stipple bls;
+ unsigned unfilled_aa_line;
+};
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_pipe_sampler.c b/src/gallium/drivers/i965/brw_pipe_sampler.c
new file mode 100644
index 00000000000..81712798a5d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_sampler.c
@@ -0,0 +1,231 @@
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+
+
+/* The brw (and related graphics cores) do not support GL_CLAMP. The
+ * Intel drivers for "other operating systems" implement GL_CLAMP as
+ * GL_CLAMP_TO_EDGE, so the same is done here.
+ */
+static GLuint translate_wrap_mode( unsigned wrap )
+{
+ switch( wrap ) {
+ case PIPE_TEX_WRAP_REPEAT:
+ return BRW_TEXCOORDMODE_WRAP;
+
+ case PIPE_TEX_WRAP_CLAMP:
+ case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+ return BRW_TEXCOORDMODE_CLAMP;
+
+ case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+ return BRW_TEXCOORDMODE_CLAMP_BORDER;
+
+ case PIPE_TEX_WRAP_MIRROR_REPEAT:
+ return BRW_TEXCOORDMODE_MIRROR;
+
+ case PIPE_TEX_WRAP_MIRROR_CLAMP:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+ case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+ return BRW_TEXCOORDMODE_MIRROR_ONCE;
+
+ default:
+ return BRW_TEXCOORDMODE_WRAP;
+ }
+}
+
+static GLuint translate_img_filter( unsigned filter )
+{
+ switch (filter) {
+ case PIPE_TEX_FILTER_NEAREST:
+ return BRW_MAPFILTER_NEAREST;
+ case PIPE_TEX_FILTER_LINEAR:
+ return BRW_MAPFILTER_LINEAR;
+ default:
+ assert(0);
+ return BRW_MAPFILTER_NEAREST;
+ }
+}
+
+static GLuint translate_mip_filter( unsigned filter )
+{
+ switch (filter) {
+ case PIPE_TEX_MIPFILTER_NONE:
+ return BRW_MIPFILTER_NONE;
+ case PIPE_TEX_MIPFILTER_NEAREST:
+ return BRW_MIPFILTER_NEAREST;
+ case PIPE_TEX_MIPFILTER_LINEAR:
+ return BRW_MIPFILTER_LINEAR;
+ default:
+ assert(0);
+ return BRW_MIPFILTER_NONE;
+ }
+}
+
+/* XXX: not sure why there are special translations for the shadow tex
+ * compare functions. In particular ALWAYS is translated to NEVER.
+ * Is this a hardware issue? Does i965 really suffer from this?
+ */
+static GLuint translate_shadow_compare_func( unsigned func )
+{
+ switch (func) {
+ case PIPE_FUNC_NEVER:
+ return BRW_COMPAREFUNCTION_ALWAYS;
+ case PIPE_FUNC_LESS:
+ return BRW_COMPAREFUNCTION_LEQUAL;
+ case PIPE_FUNC_LEQUAL:
+ return BRW_COMPAREFUNCTION_LESS;
+ case PIPE_FUNC_GREATER:
+ return BRW_COMPAREFUNCTION_GEQUAL;
+ case PIPE_FUNC_GEQUAL:
+ return BRW_COMPAREFUNCTION_GREATER;
+ case PIPE_FUNC_NOTEQUAL:
+ return BRW_COMPAREFUNCTION_EQUAL;
+ case PIPE_FUNC_EQUAL:
+ return BRW_COMPAREFUNCTION_NOTEQUAL;
+ case PIPE_FUNC_ALWAYS:
+ return BRW_COMPAREFUNCTION_NEVER;
+ default:
+ assert(0);
+ return BRW_COMPAREFUNCTION_NEVER;
+ }
+}
+
+
+
+
+static void *
+brw_create_sampler_state( struct pipe_context *pipe,
+ const struct pipe_sampler_state *template )
+{
+   struct brw_sampler *sampler = CALLOC_STRUCT(brw_sampler);
+   if (sampler == NULL)
+      return NULL;
+
+ sampler->ss0.min_filter = translate_img_filter( template->min_img_filter );
+ sampler->ss0.mag_filter = translate_img_filter( template->mag_img_filter );
+ sampler->ss0.mip_filter = translate_mip_filter( template->min_mip_filter );
+
+
+ /* XXX: anisotropy logic slightly changed:
+ */
+ if (template->max_anisotropy > 1.0) {
+ sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC;
+ sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
+
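+      /* Map the requested ratio onto the hardware's max_aniso field;
+       * ratios of 2 or less use the minimum setting.
+       */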
+ if (template->max_anisotropy > 2.0) {
+ sampler->ss3.max_aniso = MIN2((template->max_anisotropy - 2) / 2,
+ BRW_ANISORATIO_16);
+ }
+ }
+
+ sampler->ss1.r_wrap_mode = translate_wrap_mode(template->wrap_r);
+ sampler->ss1.s_wrap_mode = translate_wrap_mode(template->wrap_s);
+ sampler->ss1.t_wrap_mode = translate_wrap_mode(template->wrap_t);
+
+ /* Set LOD bias:
+ */
+ sampler->ss0.lod_bias =
+ util_signed_fixed(CLAMP(template->lod_bias, -16, 15), 6);
+
+
+ sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+ sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+ /* Set shadow function:
+ */
+ if (template->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+
+ /* Shadowing is "enabled" by emitting a particular sampler
+ * message (sample_c). So need to recompile WM program when
+ * shadow comparison is enabled on each/any texture unit.
+ */
+ sampler->ss0.shadow_function =
+ translate_shadow_compare_func(template->compare_func);
+ }
+
+ /* Set BaseMipLevel, MaxLOD, MinLOD:
+ */
+ sampler->ss0.base_level =
+ util_unsigned_fixed(0, 1);
+
+ sampler->ss1.max_lod =
+ util_unsigned_fixed(CLAMP(template->max_lod, 0, 13), 6);
+
+ sampler->ss1.min_lod =
+ util_unsigned_fixed(CLAMP(template->min_lod, 0, 13), 6);
+
+ return (void *)sampler;
+}
+
+static void brw_bind_sampler_state(struct pipe_context *pipe,
+ unsigned num, void **sampler)
+{
+ struct brw_context *brw = brw_context(pipe);
+ int i;
+
+ for (i = 0; i < num; i++)
+ brw->curr.sampler[i] = sampler[i];
+
+ for (i = num; i < brw->curr.num_samplers; i++)
+ brw->curr.sampler[i] = NULL;
+
+ brw->curr.num_samplers = num;
+ brw->state.dirty.mesa |= PIPE_NEW_SAMPLERS;
+}
+
+static void brw_delete_sampler_state(struct pipe_context *pipe,
+ void *cso)
+{
+ FREE(cso);
+}
+
+static void brw_set_sampler_textures(struct pipe_context *pipe,
+ unsigned num,
+ struct pipe_texture **texture)
+{
+ struct brw_context *brw = brw_context(pipe);
+ int i;
+
+ for (i = 0; i < num; i++)
+ pipe_texture_reference(&brw->curr.texture[i], texture[i]);
+
+ for (i = num; i < brw->curr.num_textures; i++)
+ pipe_texture_reference(&brw->curr.texture[i], NULL);
+
+ brw->curr.num_textures = num;
+ brw->state.dirty.mesa |= PIPE_NEW_BOUND_TEXTURES;
+}
+
+static void brw_set_vertex_sampler_textures(struct pipe_context *pipe,
+ unsigned num,
+ struct pipe_texture **texture)
+{
+}
+
+static void brw_bind_vertex_sampler_state(struct pipe_context *pipe,
+ unsigned num, void **sampler)
+{
+}
+
+
+void brw_pipe_sampler_init( struct brw_context *brw )
+{
+ brw->base.create_sampler_state = brw_create_sampler_state;
+ brw->base.delete_sampler_state = brw_delete_sampler_state;
+
+ brw->base.set_fragment_sampler_textures = brw_set_sampler_textures;
+ brw->base.bind_fragment_sampler_states = brw_bind_sampler_state;
+
+ brw->base.set_vertex_sampler_textures = brw_set_vertex_sampler_textures;
+ brw->base.bind_vertex_sampler_states = brw_bind_vertex_sampler_state;
+
+}
+void brw_pipe_sampler_cleanup( struct brw_context *brw )
+{
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
new file mode 100644
index 00000000000..e389587f3e1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -0,0 +1,303 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+#include "brw_context.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+
+
+/**
+ * Determine if the given shader uses complex features such as flow
+ * conditionals, loops, subroutines.
+ */
+static GLboolean has_flow_control(const struct tgsi_shader_info *info)
+{
+ return (info->opcode_count[TGSI_OPCODE_ARL] > 0 ||
+ info->opcode_count[TGSI_OPCODE_IF] > 0 ||
+ info->opcode_count[TGSI_OPCODE_ENDIF] > 0 || /* redundant - IF */
+ info->opcode_count[TGSI_OPCODE_CAL] > 0 ||
+ info->opcode_count[TGSI_OPCODE_BRK] > 0 || /* redundant - BGNLOOP */
+ info->opcode_count[TGSI_OPCODE_RET] > 0 || /* redundant - CAL */
+ info->opcode_count[TGSI_OPCODE_BGNLOOP] > 0);
+}
+
+
+static void scan_immediates(const struct tgsi_token *tokens,
+ const struct tgsi_shader_info *info,
+ struct brw_immediate_data *imm)
+{
+ struct tgsi_parse_context parse;
+ boolean done = FALSE;
+
+ imm->nr = 0;
+ imm->data = MALLOC(info->immediate_count * 4 * sizeof(float));
+
+ tgsi_parse_init( &parse, tokens );
+ while (!tgsi_parse_end_of_tokens( &parse ) && !done) {
+ tgsi_parse_token( &parse );
+
+ switch (parse.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ break;
+
+ case TGSI_TOKEN_TYPE_IMMEDIATE: {
+ static const float id[4] = {0,0,0,1};
+ const float *value = &parse.FullToken.FullImmediate.u[0].Float;
+ unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+ unsigned i;
+
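+         /* Copy the declared components, then pad any remaining ones
+          * with the (0,0,0,1) identity vector.
+          */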
+ for (i = 0; i < size; i++)
+ imm->data[imm->nr][i] = value[i];
+
+ for (; i < 4; i++)
+ imm->data[imm->nr][i] = id[i];
+
+ imm->nr++;
+ break;
+ }
+
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ done = 1;
+ break;
+ }
+ }
+}
+
+
+static void brw_bind_fs_state( struct pipe_context *pipe, void *prog )
+{
+ struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
+ struct brw_context *brw = brw_context(pipe);
+
+ if (brw->curr.fragment_shader == fs)
+ return;
+
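+   /* Flag a signature change only when the new shader's input layout
+    * actually differs from the currently bound one.
+    */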
+ if (brw->curr.fragment_shader == NULL ||
+ fs == NULL ||
+ memcmp(&brw->curr.fragment_shader->signature, &fs->signature,
+ brw_fs_signature_size(&fs->signature)) != 0) {
+ brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SIGNATURE;
+ }
+
+ brw->curr.fragment_shader = fs;
+ brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_SHADER;
+}
+
+static void brw_bind_vs_state( struct pipe_context *pipe, void *prog )
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ brw->curr.vertex_shader = (struct brw_vertex_shader *)prog;
+ brw->state.dirty.mesa |= PIPE_NEW_VERTEX_SHADER;
+}
+
+
+
+static void *brw_create_fs_state( struct pipe_context *pipe,
+ const struct pipe_shader_state *shader )
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_fragment_shader *fs;
+ int i;
+
+ fs = CALLOC_STRUCT(brw_fragment_shader);
+ if (fs == NULL)
+ return NULL;
+
+   /* Duplicate tokens, scan shader
+    */
+   fs->id = brw->program_id++;
+
+   fs->tokens = tgsi_dup_tokens(shader->tokens);
+   if (fs->tokens == NULL)
+      goto fail;
+
+   tgsi_scan_shader(fs->tokens, &fs->info);
+   scan_immediates(fs->tokens, &fs->info, &fs->immediates);
+
+   /* Must run after tgsi_scan_shader() so the opcode counts in fs->info
+    * are populated.
+    */
+   fs->has_flow_control = has_flow_control(&fs->info);
+
+ fs->signature.nr_inputs = fs->info.num_inputs;
+ for (i = 0; i < fs->info.num_inputs; i++) {
+ fs->signature.input[i].interp = fs->info.input_interpolate[i];
+ fs->signature.input[i].semantic = fs->info.input_semantic_name[i];
+ fs->signature.input[i].semantic_index = fs->info.input_semantic_index[i];
+ }
+
+ for (i = 0; i < fs->info.num_inputs; i++)
+ if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION)
+ fs->uses_depth = 1;
+
+ if (fs->info.uses_kill)
+ fs->iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
+
+ if (fs->info.writes_z)
+ fs->iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
+
+ return (void *)fs;
+
+fail:
+ FREE(fs);
+ return NULL;
+}
+
+
+static void *brw_create_vs_state( struct pipe_context *pipe,
+ const struct pipe_shader_state *shader )
+{
+ struct brw_context *brw = brw_context(pipe);
+ struct brw_vertex_shader *vs;
+ unsigned i;
+
+ vs = CALLOC_STRUCT(brw_vertex_shader);
+ if (vs == NULL)
+ return NULL;
+
+ /* Duplicate tokens, scan shader
+ */
+ vs->tokens = tgsi_dup_tokens(shader->tokens);
+ if (vs->tokens == NULL)
+ goto fail;
+
+ tgsi_scan_shader(vs->tokens, &vs->info);
+ scan_immediates(vs->tokens, &vs->info, &vs->immediates);
+
+ vs->id = brw->program_id++;
+ vs->has_flow_control = has_flow_control(&vs->info);
+
+ vs->output_hpos = BRW_OUTPUT_NOT_PRESENT;
+ vs->output_color0 = BRW_OUTPUT_NOT_PRESENT;
+ vs->output_color1 = BRW_OUTPUT_NOT_PRESENT;
+ vs->output_bfc0 = BRW_OUTPUT_NOT_PRESENT;
+ vs->output_bfc1 = BRW_OUTPUT_NOT_PRESENT;
+ vs->output_edgeflag = BRW_OUTPUT_NOT_PRESENT;
+
+ for (i = 0; i < vs->info.num_outputs; i++) {
+ int index = vs->info.output_semantic_index[i];
+ switch (vs->info.output_semantic_name[i]) {
+ case TGSI_SEMANTIC_POSITION:
+ vs->output_hpos = i;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ if (index == 0)
+ vs->output_color0 = i;
+ else
+ vs->output_color1 = i;
+ break;
+ case TGSI_SEMANTIC_BCOLOR:
+ if (index == 0)
+ vs->output_bfc0 = i;
+ else
+ vs->output_bfc1 = i;
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ vs->output_edgeflag = i;
+ break;
+ }
+ }
+
+
+ /* Done:
+ */
+ return (void *)vs;
+
+fail:
+ FREE(vs);
+ return NULL;
+}
+
+
+static void brw_delete_fs_state( struct pipe_context *pipe, void *prog )
+{
+ struct brw_fragment_shader *fs = (struct brw_fragment_shader *)prog;
+
+ bo_reference(&fs->const_buffer, NULL);
+ FREE( (void *)fs->tokens );
+ FREE( fs );
+}
+
+
+static void brw_delete_vs_state( struct pipe_context *pipe, void *prog )
+{
+   struct brw_vertex_shader *vs = (struct brw_vertex_shader *)prog;
+
+ /* Delete draw shader
+ */
+ FREE( (void *)vs->tokens );
+ FREE( vs );
+}
+
+
+static void brw_set_constant_buffer(struct pipe_context *pipe,
+ uint shader, uint index,
+ struct pipe_buffer *buf)
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ assert(index == 0);
+
+ if (shader == PIPE_SHADER_FRAGMENT) {
+ pipe_buffer_reference( &brw->curr.fragment_constants,
+ buf );
+
+ brw->state.dirty.mesa |= PIPE_NEW_FRAGMENT_CONSTANTS;
+ }
+ else {
+ pipe_buffer_reference( &brw->curr.vertex_constants,
+ buf );
+
+ brw->state.dirty.mesa |= PIPE_NEW_VERTEX_CONSTANTS;
+ }
+}
+
+
+void brw_pipe_shader_init( struct brw_context *brw )
+{
+ brw->base.set_constant_buffer = brw_set_constant_buffer;
+
+ brw->base.create_vs_state = brw_create_vs_state;
+ brw->base.bind_vs_state = brw_bind_vs_state;
+ brw->base.delete_vs_state = brw_delete_vs_state;
+
+ brw->base.create_fs_state = brw_create_fs_state;
+ brw->base.bind_fs_state = brw_bind_fs_state;
+ brw->base.delete_fs_state = brw_delete_fs_state;
+}
+
+void brw_pipe_shader_cleanup( struct brw_context *brw )
+{
+ pipe_buffer_reference( &brw->curr.fragment_constants, NULL );
+ pipe_buffer_reference( &brw->curr.vertex_constants, NULL );
+}
diff --git a/src/gallium/drivers/i965/brw_pipe_vertex.c b/src/gallium/drivers/i965/brw_pipe_vertex.c
new file mode 100644
index 00000000000..e3c48e31493
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_pipe_vertex.c
@@ -0,0 +1,71 @@
+#include "brw_context.h"
+
+
+static void brw_set_vertex_elements( struct pipe_context *pipe,
+ unsigned count,
+ const struct pipe_vertex_element *elements )
+{
+ struct brw_context *brw = brw_context(pipe);
+
+ memcpy(brw->curr.vertex_element, elements, count * sizeof(elements[0]));
+ brw->curr.num_vertex_elements = count;
+
+ brw->state.dirty.mesa |= PIPE_NEW_VERTEX_ELEMENT;
+}
+
+
+static void brw_set_vertex_buffers(struct pipe_context *pipe,
+ unsigned count,
+ const struct pipe_vertex_buffer *buffers)
+{
+ struct brw_context *brw = brw_context(pipe);
+ unsigned i;
+
+ /* Check for no change */
+ if (count == brw->curr.num_vertex_buffers &&
+ memcmp(brw->curr.vertex_buffer,
+ buffers,
+ count * sizeof buffers[0]) == 0)
+ return;
+
+ /* Adjust refcounts */
+ for (i = 0; i < count; i++)
+ pipe_buffer_reference(&brw->curr.vertex_buffer[i].buffer,
+ buffers[i].buffer);
+
+ for ( ; i < brw->curr.num_vertex_buffers; i++)
+ pipe_buffer_reference(&brw->curr.vertex_buffer[i].buffer,
+ NULL);
+
+ /* Copy remaining data */
+ memcpy(brw->curr.vertex_buffer, buffers, count * sizeof buffers[0]);
+ brw->curr.num_vertex_buffers = count;
+
+ brw->state.dirty.mesa |= PIPE_NEW_VERTEX_BUFFER;
+}
+
+
+void
+brw_pipe_vertex_init( struct brw_context *brw )
+{
+ brw->base.set_vertex_buffers = brw_set_vertex_buffers;
+ brw->base.set_vertex_elements = brw_set_vertex_elements;
+}
+
+
+void
+brw_pipe_vertex_cleanup( struct brw_context *brw )
+{
+
+ /* Release bound pipe vertex_buffers
+ */
+
+ /* Release some other stuff
+ */
+#if 0
+ for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+ bo_reference(&brw->vb.inputs[i].bo, NULL);
+ brw->vb.inputs[i].bo = NULL;
+ }
+#endif
+}
diff --git a/src/gallium/drivers/i965/brw_reg.h b/src/gallium/drivers/i965/brw_reg.h
new file mode 100644
index 00000000000..a63403b6afd
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_reg.h
@@ -0,0 +1,115 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#define CMD_MI (0x0 << 29)
+#define CMD_2D (0x2 << 29)
+#define CMD_3D (0x3 << 29)
+
+#define MI_NOOP (CMD_MI | 0)
+#define MI_BATCH_BUFFER_END (CMD_MI | 0xA << 23)
+#define MI_FLUSH (CMD_MI | (4 << 23))
+
+#define _3DSTATE_DRAWRECT_INFO_I965 (CMD_3D | (3 << 27) | (1 << 24) | 0x2)
+
+/** @{
+ *
+ * PIPE_CONTROL operation: a combination of MI_FLUSH and a register write,
+ * with additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL (CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE (0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE (1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT (2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL (1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH (1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH (1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE (1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE (0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2)
+
+/** @} */
+
+#define XY_SETUP_BLT_CMD (CMD_2D | (0x01 << 22) | 6)
+#define XY_COLOR_BLT_CMD (CMD_2D | (0x50 << 22) | 4)
+#define XY_SRC_COPY_BLT_CMD (CMD_2D | (0x53 << 22) | 6)
+
+/* BR00 */
+#define XY_BLT_WRITE_ALPHA (1 << 21)
+#define XY_BLT_WRITE_RGB (1 << 20)
+#define XY_SRC_TILED (1 << 15)
+#define XY_DST_TILED (1 << 11)
+
+/* BR13 */
+#define BR13_565 (0x1 << 24)
+#define BR13_8888 (0x3 << 24)
+
+#define FENCE_LINEAR 0
+#define FENCE_XMAJOR 1
+#define FENCE_YMAJOR 2
+
+
+
+/* PCI IDs
+ */
+#define PCI_CHIP_I965_G 0x29A2
+#define PCI_CHIP_I965_Q 0x2992
+#define PCI_CHIP_I965_G_1 0x2982
+#define PCI_CHIP_I946_GZ 0x2972
+#define PCI_CHIP_I965_GM 0x2A02
+#define PCI_CHIP_I965_GME 0x2A12
+
+#define PCI_CHIP_GM45_GM 0x2A42
+
+#define PCI_CHIP_IGD_E_G 0x2E02
+#define PCI_CHIP_Q45_G 0x2E12
+#define PCI_CHIP_G45_G 0x2E22
+#define PCI_CHIP_G41_G 0x2E32
+#define PCI_CHIP_B43_G 0x2E42
+
+#define PCI_CHIP_ILD_G 0x0042
+#define PCI_CHIP_ILM_G 0x0046
+
+struct brw_chipset {
+ unsigned pci_id:16;
+ unsigned is_965:1;
+ unsigned is_igdng:1;
+ unsigned is_g4x:1;
+ unsigned pad:13;
+};
+
+
+/* XXX: hacks
+ */
+#define VERT_RESULT_HPOS 0 /* not always true */
+#define VERT_RESULT_PSIZ 10000 /* disabled */
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_screen.c b/src/gallium/drivers/i965/brw_screen.c
new file mode 100644
index 00000000000..0ecacac9a3a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.c
@@ -0,0 +1,403 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+
+#include "brw_reg.h"
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_winsys.h"
+#include "brw_debug.h"
+
+#ifdef DEBUG
+static const struct debug_named_value debug_names[] = {
+ { "tex", DEBUG_TEXTURE},
+ { "state", DEBUG_STATE},
+ { "ioctl", DEBUG_IOCTL},
+ { "blit", DEBUG_BLIT},
+ { "curbe", DEBUG_CURBE},
+ { "fall", DEBUG_FALLBACKS},
+ { "verb", DEBUG_VERBOSE},
+ { "bat", DEBUG_BATCH},
+ { "pix", DEBUG_PIXEL},
+ { "wins", DEBUG_WINSYS},
+ { "min", DEBUG_MIN_URB},
+ { "dis", DEBUG_DISASSEM},
+ { "sync", DEBUG_SYNC},
+ { "prim", DEBUG_PRIMS },
+ { "vert", DEBUG_VERTS },
+ { "dma", DEBUG_DMA },
+ { "san", DEBUG_SANITY },
+ { "sleep", DEBUG_SLEEP },
+ { "stats", DEBUG_STATS },
+ { "sing", DEBUG_SINGLE_THREAD },
+ { "thre", DEBUG_SINGLE_THREAD },
+ { "wm", DEBUG_WM },
+ { "urb", DEBUG_URB },
+ { "vs", DEBUG_VS },
+ { NULL, 0 }
+};
+
+static const struct debug_named_value dump_names[] = {
+ { "asm", DUMP_ASM},
+ { "state", DUMP_STATE},
+ { "batch", DUMP_BATCH},
+ { NULL, 0 }
+};
+
+int BRW_DEBUG = 0;
+int BRW_DUMP = 0;
+
+#endif
+
+
+/*
+ * Probe functions
+ */
+
+
+static const char *
+brw_get_vendor(struct pipe_screen *screen)
+{
+ return "VMware, Inc.";
+}
+
+static const char *
+brw_get_name(struct pipe_screen *screen)
+{
+ static char buffer[128];
+ const char *chipset;
+
+ switch (brw_screen(screen)->chipset.pci_id) {
+ case PCI_CHIP_I965_G:
+ chipset = "I965_G";
+ break;
+ case PCI_CHIP_I965_Q:
+ chipset = "I965_Q";
+ break;
+ case PCI_CHIP_I965_G_1:
+ chipset = "I965_G_1";
+ break;
+ case PCI_CHIP_I946_GZ:
+ chipset = "I946_GZ";
+ break;
+ case PCI_CHIP_I965_GM:
+ chipset = "I965_GM";
+ break;
+ case PCI_CHIP_I965_GME:
+ chipset = "I965_GME";
+ break;
+ case PCI_CHIP_GM45_GM:
+ chipset = "GM45_GM";
+ break;
+ case PCI_CHIP_IGD_E_G:
+ chipset = "IGD_E_G";
+ break;
+ case PCI_CHIP_Q45_G:
+ chipset = "Q45_G";
+ break;
+ case PCI_CHIP_G45_G:
+ chipset = "G45_G";
+ break;
+ case PCI_CHIP_G41_G:
+ chipset = "G41_G";
+ break;
+ case PCI_CHIP_B43_G:
+ chipset = "B43_G";
+ break;
+ case PCI_CHIP_ILD_G:
+ chipset = "ILD_G";
+ break;
+ case PCI_CHIP_ILM_G:
+ chipset = "ILM_G";
+      break;
+   default:
+      chipset = "unknown";
+      break;
+   }
+
+ util_snprintf(buffer, sizeof(buffer), "i965 (chipset: %s)", chipset);
+ return buffer;
+}
+
+static int
+brw_get_param(struct pipe_screen *screen, int param)
+{
+ switch (param) {
+ case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+ return 8;
+ case PIPE_CAP_NPOT_TEXTURES:
+ return 1;
+ case PIPE_CAP_TWO_SIDED_STENCIL:
+ return 1;
+ case PIPE_CAP_GLSL:
+ return 0;
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ return 0;
+ case PIPE_CAP_POINT_SPRITE:
+ return 0;
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return 1;
+ case PIPE_CAP_OCCLUSION_QUERY:
+ return 0;
+ case PIPE_CAP_TEXTURE_SHADOW_MAP:
+ return 1;
+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+ return 11; /* max 1024x1024 */
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ return 8; /* max 128x128x128 */
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ return 11; /* max 1024x1024 */
+ default:
+ return 0;
+ }
+}
+
+static float
+brw_get_paramf(struct pipe_screen *screen, int param)
+{
+ switch (param) {
+ case PIPE_CAP_MAX_LINE_WIDTH:
+ /* fall-through */
+ case PIPE_CAP_MAX_LINE_WIDTH_AA:
+ return 7.5;
+
+ case PIPE_CAP_MAX_POINT_WIDTH:
+ /* fall-through */
+ case PIPE_CAP_MAX_POINT_WIDTH_AA:
+ return 255.0;
+
+ case PIPE_CAP_MAX_TEXTURE_ANISOTROPY:
+ return 4.0;
+
+ case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
+ return 16.0;
+
+ default:
+ return 0;
+ }
+}
+
+static boolean
+brw_is_format_supported(struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned tex_usage,
+ unsigned geom_flags)
+{
+ static const enum pipe_format tex_supported[] = {
+ PIPE_FORMAT_L8_UNORM,
+ PIPE_FORMAT_I8_UNORM,
+ PIPE_FORMAT_A8_UNORM,
+ PIPE_FORMAT_L16_UNORM,
+ /*PIPE_FORMAT_I16_UNORM,*/
+ /*PIPE_FORMAT_A16_UNORM,*/
+ PIPE_FORMAT_A8L8_UNORM,
+ PIPE_FORMAT_R5G6B5_UNORM,
+ PIPE_FORMAT_A1R5G5B5_UNORM,
+ PIPE_FORMAT_A4R4G4B4_UNORM,
+ PIPE_FORMAT_X8R8G8B8_UNORM,
+ PIPE_FORMAT_A8R8G8B8_UNORM,
+ /* video */
+ PIPE_FORMAT_YCBCR,
+ PIPE_FORMAT_YCBCR_REV,
+ /* compressed */
+ /*PIPE_FORMAT_FXT1_RGBA,*/
+ PIPE_FORMAT_DXT1_RGB,
+ PIPE_FORMAT_DXT1_RGBA,
+ PIPE_FORMAT_DXT3_RGBA,
+ PIPE_FORMAT_DXT5_RGBA,
+ /* sRGB */
+ PIPE_FORMAT_R8G8B8A8_SRGB,
+ PIPE_FORMAT_A8L8_SRGB,
+ PIPE_FORMAT_L8_SRGB,
+ PIPE_FORMAT_DXT1_SRGB,
+ /* depth */
+ PIPE_FORMAT_Z32_FLOAT,
+ PIPE_FORMAT_X8Z24_UNORM,
+ PIPE_FORMAT_S8Z24_UNORM,
+ PIPE_FORMAT_Z16_UNORM,
+ /* signed */
+ PIPE_FORMAT_R8G8_SNORM,
+ PIPE_FORMAT_R8G8B8A8_SNORM,
+ PIPE_FORMAT_NONE /* list terminator */
+ };
+ static const enum pipe_format render_supported[] = {
+ PIPE_FORMAT_X8R8G8B8_UNORM,
+ PIPE_FORMAT_A8R8G8B8_UNORM,
+ PIPE_FORMAT_R5G6B5_UNORM,
+ PIPE_FORMAT_NONE /* list terminator */
+ };
+ static const enum pipe_format depth_supported[] = {
+ PIPE_FORMAT_Z32_FLOAT,
+ PIPE_FORMAT_X8Z24_UNORM,
+ PIPE_FORMAT_S8Z24_UNORM,
+ PIPE_FORMAT_Z16_UNORM,
+ PIPE_FORMAT_NONE /* list terminator */
+ };
+ const enum pipe_format *list;
+ uint i;
+
+ if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL)
+ list = depth_supported;
+ else if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET)
+ list = render_supported;
+ else
+ list = tex_supported;
+
+ for (i = 0; list[i] != PIPE_FORMAT_NONE; i++) {
+ if (list[i] == format)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+
+/*
+ * Fence functions
+ */
+
+
+static void
+brw_fence_reference(struct pipe_screen *screen,
+ struct pipe_fence_handle **ptr,
+ struct pipe_fence_handle *fence)
+{
+}
+
+static int
+brw_fence_signalled(struct pipe_screen *screen,
+ struct pipe_fence_handle *fence,
+ unsigned flags)
+{
+ return 0; /* XXX shouldn't this be a boolean? */
+}
+
+static int
+brw_fence_finish(struct pipe_screen *screen,
+ struct pipe_fence_handle *fence,
+ unsigned flags)
+{
+ return 0;
+}
+
+
+/*
+ * Generic functions
+ */
+
+
+static void
+brw_destroy_screen(struct pipe_screen *screen)
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+
+ if (bscreen->sws)
+ bscreen->sws->destroy(bscreen->sws);
+
+ FREE(bscreen);
+}
+
+/**
+ * Create a new brw_screen object
+ */
+struct pipe_screen *
+brw_create_screen(struct brw_winsys_screen *sws, uint pci_id)
+{
+ struct brw_screen *bscreen;
+ struct brw_chipset chipset;
+
+#ifdef DEBUG
+ BRW_DEBUG = debug_get_flags_option("BRW_DEBUG", debug_names, 0);
+ BRW_DEBUG |= debug_get_flags_option("INTEL_DEBUG", debug_names, 0);
+ BRW_DEBUG |= DEBUG_STATS | DEBUG_MIN_URB | DEBUG_WM;
+
+ BRW_DUMP = debug_get_flags_option("BRW_DUMP", dump_names, 0);
+#endif
+
+ memset(&chipset, 0, sizeof chipset);
+
+ chipset.pci_id = pci_id;
+
+ switch (pci_id) {
+ case PCI_CHIP_I965_G:
+ case PCI_CHIP_I965_Q:
+ case PCI_CHIP_I965_G_1:
+ case PCI_CHIP_I946_GZ:
+ case PCI_CHIP_I965_GM:
+ case PCI_CHIP_I965_GME:
+ chipset.is_965 = TRUE;
+ break;
+
+ case PCI_CHIP_GM45_GM:
+ case PCI_CHIP_IGD_E_G:
+ case PCI_CHIP_Q45_G:
+ case PCI_CHIP_G45_G:
+ case PCI_CHIP_G41_G:
+ case PCI_CHIP_B43_G:
+ chipset.is_g4x = TRUE;
+ break;
+
+ case PCI_CHIP_ILD_G:
+ case PCI_CHIP_ILM_G:
+ chipset.is_igdng = TRUE;
+ break;
+
+ default:
+ debug_printf("%s: unknown pci id 0x%x, cannot create screen\n",
+ __FUNCTION__, pci_id);
+ return NULL;
+ }
+
+
+ bscreen = CALLOC_STRUCT(brw_screen);
+ if (!bscreen)
+ return NULL;
+
+ bscreen->chipset = chipset;
+ bscreen->sws = sws;
+ bscreen->base.winsys = NULL;
+ bscreen->base.destroy = brw_destroy_screen;
+ bscreen->base.get_name = brw_get_name;
+ bscreen->base.get_vendor = brw_get_vendor;
+ bscreen->base.get_param = brw_get_param;
+ bscreen->base.get_paramf = brw_get_paramf;
+ bscreen->base.is_format_supported = brw_is_format_supported;
+ bscreen->base.fence_reference = brw_fence_reference;
+ bscreen->base.fence_signalled = brw_fence_signalled;
+ bscreen->base.fence_finish = brw_fence_finish;
+
+ brw_screen_tex_init(bscreen);
+ brw_screen_tex_surface_init(bscreen);
+ brw_screen_buffer_init(bscreen);
+
+   bscreen->no_tiling = debug_get_option("BRW_NO_TILING", NULL) != NULL;
+
+
+ return &bscreen->base;
+}
diff --git a/src/gallium/drivers/i965/brw_screen.h b/src/gallium/drivers/i965/brw_screen.h
new file mode 100644
index 00000000000..7226d9228b7
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen.h
@@ -0,0 +1,199 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_SCREEN_H
+#define BRW_SCREEN_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+
+#include "brw_reg.h"
+#include "brw_structs.h"
+
+struct brw_winsys_screen;
+
+
+/**
+ * Subclass of pipe_screen
+ */
+struct brw_screen
+{
+ struct pipe_screen base;
+ struct brw_chipset chipset;
+ struct brw_winsys_screen *sws;
+ boolean no_tiling;
+};
+
+/**
+ * Subclass of pipe_transfer
+ */
+struct brw_transfer
+{
+ struct pipe_transfer base;
+
+ unsigned offset;
+};
+
+struct brw_buffer
+{
+ struct pipe_buffer base;
+
+ /* One of either bo or user_buffer will be non-null, depending on
+ * whether this is a hardware or user buffer.
+ */
+ struct brw_winsys_buffer *bo;
+ void *user_buffer;
+
+ /* Mapped pointer??
+ */
+ void *ptr;
+};
+
+
+union brw_surface_id {
+ struct {
+ unsigned face:3;
+ unsigned zslice:13;
+ unsigned level:16;
+ } bits;
+ unsigned value;
+};
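+
+/* Note: packing the face/zslice/level triple into one word lets
+ * brw_get_tex_surface match an existing view with a single integer
+ * compare on .value, rather than comparing three fields.
+ */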
+
+
+struct brw_surface
+{
+ struct pipe_surface base;
+
+ union brw_surface_id id;
+ unsigned cpp;
+ unsigned pitch;
+ unsigned draw_offset;
+ unsigned tiling;
+
+ struct brw_surface_state ss;
+ struct brw_winsys_buffer *bo;
+ struct brw_surface *next, *prev;
+};
+
+
+
+struct brw_texture
+{
+ struct pipe_texture base;
+ struct brw_winsys_buffer *bo;
+ struct brw_surface_state ss;
+
+ unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS];
+ unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS];
+ unsigned level_offset[PIPE_MAX_TEXTURE_LEVELS];
+
+ boolean compressed;
+ unsigned brw_target;
+ unsigned pitch;
+ unsigned tiling;
+ unsigned cpp;
+ unsigned total_height;
+
+ struct brw_surface views[2];
+};
+
+
+
+/*
+ * Cast wrappers
+ */
+static INLINE struct brw_screen *
+brw_screen(struct pipe_screen *pscreen)
+{
+ return (struct brw_screen *) pscreen;
+}
+
+static INLINE struct brw_transfer *
+brw_transfer(struct pipe_transfer *transfer)
+{
+ return (struct brw_transfer *)transfer;
+}
+
+static INLINE struct brw_surface *
+brw_surface(struct pipe_surface *surface)
+{
+ return (struct brw_surface *)surface;
+}
+
+static INLINE struct brw_buffer *
+brw_buffer(struct pipe_buffer *buffer)
+{
+ return (struct brw_buffer *)buffer;
+}
+
+static INLINE struct brw_texture *
+brw_texture(struct pipe_texture *texture)
+{
+ return (struct brw_texture *)texture;
+}
+
+
+/* Pipe buffer helpers
+ */
+static INLINE boolean
+brw_buffer_is_user_buffer( const struct pipe_buffer *buf )
+{
+ return ((const struct brw_buffer *)buf)->user_buffer != NULL;
+}
+
+unsigned
+brw_surface_pitch( const struct pipe_surface *surface );
+
+/***********************************************************************
+ * Internal functions
+ */
+GLboolean brw_texture_layout(struct brw_screen *brw_screen,
+ struct brw_texture *tex );
+
+void brw_update_texture( struct brw_screen *brw_screen,
+ struct brw_texture *tex );
+
+
+void brw_screen_tex_init( struct brw_screen *brw_screen );
+void brw_screen_tex_surface_init( struct brw_screen *brw_screen );
+
+void brw_screen_buffer_init(struct brw_screen *brw_screen);
+
+
+boolean brw_is_texture_referenced_by_bo( struct brw_screen *brw_screen,
+ struct pipe_texture *texture,
+ unsigned face,
+ unsigned level,
+ struct brw_winsys_buffer *bo );
+
+boolean brw_is_buffer_referenced_by_bo( struct brw_screen *brw_screen,
+ struct pipe_buffer *buffer,
+ struct brw_winsys_buffer *bo );
+
+
+
+#endif /* BRW_SCREEN_H */
diff --git a/src/gallium/drivers/i965/brw_screen_buffers.c b/src/gallium/drivers/i965/brw_screen_buffers.c
new file mode 100644
index 00000000000..d8141a3f5b9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_buffers.c
@@ -0,0 +1,202 @@
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "brw_screen.h"
+#include "brw_winsys.h"
+
+
+
+static void *
+brw_buffer_map_range( struct pipe_screen *screen,
+ struct pipe_buffer *buffer,
+ unsigned offset,
+ unsigned length,
+ unsigned usage )
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_winsys_screen *sws = bscreen->sws;
+ struct brw_buffer *buf = brw_buffer( buffer );
+
+ if (buf->user_buffer)
+ return buf->user_buffer;
+
+ return sws->bo_map( buf->bo,
+ BRW_DATA_OTHER,
+ offset,
+ length,
+ (usage & PIPE_BUFFER_USAGE_CPU_WRITE) ? TRUE : FALSE,
+ (usage & PIPE_BUFFER_USAGE_DISCARD) ? TRUE : FALSE,
+ (usage & PIPE_BUFFER_USAGE_FLUSH_EXPLICIT) ? TRUE : FALSE);
+}
+
+static void *
+brw_buffer_map( struct pipe_screen *screen,
+ struct pipe_buffer *buffer,
+ unsigned usage )
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_winsys_screen *sws = bscreen->sws;
+ struct brw_buffer *buf = brw_buffer( buffer );
+
+ if (buf->user_buffer)
+ return buf->user_buffer;
+
+ return sws->bo_map( buf->bo,
+ BRW_DATA_OTHER,
+ 0,
+ buf->base.size,
+ (usage & PIPE_BUFFER_USAGE_CPU_WRITE) ? TRUE : FALSE,
+ FALSE,
+ FALSE);
+}
+
+
+static void
+brw_buffer_flush_mapped_range( struct pipe_screen *screen,
+ struct pipe_buffer *buffer,
+ unsigned offset,
+ unsigned length )
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_winsys_screen *sws = bscreen->sws;
+ struct brw_buffer *buf = brw_buffer( buffer );
+
+ if (buf->user_buffer)
+ return;
+
+ sws->bo_flush_range( buf->bo,
+ offset,
+ length );
+}
+
+
+static void
+brw_buffer_unmap( struct pipe_screen *screen,
+ struct pipe_buffer *buffer )
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_winsys_screen *sws = bscreen->sws;
+ struct brw_buffer *buf = brw_buffer( buffer );
+
+ if (buf->bo)
+ sws->bo_unmap(buf->bo);
+}
+
+static void
+brw_buffer_destroy( struct pipe_buffer *buffer )
+{
+ struct brw_buffer *buf = brw_buffer( buffer );
+
+ assert(!p_atomic_read(&buffer->reference.count));
+
+ bo_reference(&buf->bo, NULL);
+ FREE(buf);
+}
+
+
+static struct pipe_buffer *
+brw_buffer_create(struct pipe_screen *screen,
+ unsigned alignment,
+ unsigned usage,
+ unsigned size)
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_winsys_screen *sws = bscreen->sws;
+ struct brw_buffer *buf;
+ unsigned buffer_type;
+ enum pipe_error ret;
+
+ buf = CALLOC_STRUCT(brw_buffer);
+ if (!buf)
+ return NULL;
+
+ pipe_reference_init(&buf->base.reference, 1);
+ buf->base.screen = screen;
+ buf->base.alignment = alignment;
+ buf->base.usage = usage;
+ buf->base.size = size;
+
+ switch (usage & (PIPE_BUFFER_USAGE_VERTEX |
+ PIPE_BUFFER_USAGE_INDEX |
+ PIPE_BUFFER_USAGE_PIXEL |
+ PIPE_BUFFER_USAGE_CONSTANT))
+ {
+ case PIPE_BUFFER_USAGE_VERTEX:
+ case PIPE_BUFFER_USAGE_INDEX:
+ case (PIPE_BUFFER_USAGE_VERTEX|PIPE_BUFFER_USAGE_INDEX):
+ buffer_type = BRW_BUFFER_TYPE_VERTEX;
+ break;
+
+ case PIPE_BUFFER_USAGE_PIXEL:
+ buffer_type = BRW_BUFFER_TYPE_PIXEL;
+ break;
+
+ case PIPE_BUFFER_USAGE_CONSTANT:
+ buffer_type = BRW_BUFFER_TYPE_SHADER_CONSTANTS;
+ break;
+
+ default:
+ buffer_type = BRW_BUFFER_TYPE_GENERIC;
+ break;
+ }
+
+ ret = sws->bo_alloc( sws, buffer_type,
+ size, alignment,
+ &buf->bo );
+   if (ret != PIPE_OK) {
+      FREE(buf);
+      return NULL;
+   }
+
+ return &buf->base;
+}
+
+
+static struct pipe_buffer *
+brw_user_buffer_create(struct pipe_screen *screen,
+ void *ptr,
+ unsigned bytes)
+{
+ struct brw_buffer *buf;
+
+ buf = CALLOC_STRUCT(brw_buffer);
+ if (!buf)
+ return NULL;
+
+ buf->user_buffer = ptr;
+
+ pipe_reference_init(&buf->base.reference, 1);
+ buf->base.screen = screen;
+ buf->base.alignment = 1;
+ buf->base.usage = 0;
+ buf->base.size = bytes;
+
+ return &buf->base;
+}
+
+
+boolean brw_is_buffer_referenced_by_bo( struct brw_screen *brw_screen,
+ struct pipe_buffer *buffer,
+ struct brw_winsys_buffer *bo )
+{
+ struct brw_buffer *buf = brw_buffer(buffer);
+ if (buf->bo == NULL)
+ return FALSE;
+
+ return brw_screen->sws->bo_references( bo, buf->bo );
+}
+
+
+void brw_screen_buffer_init(struct brw_screen *brw_screen)
+{
+ brw_screen->base.buffer_create = brw_buffer_create;
+ brw_screen->base.user_buffer_create = brw_user_buffer_create;
+ brw_screen->base.buffer_map = brw_buffer_map;
+ brw_screen->base.buffer_map_range = brw_buffer_map_range;
+ brw_screen->base.buffer_flush_mapped_range = brw_buffer_flush_mapped_range;
+ brw_screen->base.buffer_unmap = brw_buffer_unmap;
+ brw_screen->base.buffer_destroy = brw_buffer_destroy;
+}
diff --git a/src/gallium/drivers/i965/brw_screen_surface.c b/src/gallium/drivers/i965/brw_screen_surface.c
new file mode 100644
index 00000000000..e2b9954e596
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_surface.c
@@ -0,0 +1,262 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_math.h"
+
+#include "pipe/p_screen.h"
+#include "brw_screen.h"
+#include "brw_defines.h"
+#include "brw_winsys.h"
+
+enum {
+ BRW_VIEW_LINEAR,
+ BRW_VIEW_IN_PLACE
+};
+
+
+static boolean need_linear_view( struct brw_screen *brw_screen,
+ struct brw_texture *brw_texture,
+ union brw_surface_id id,
+ unsigned usage )
+{
+#if 0
+   /* XXX: what about IGDNG?
+ */
+ if (!BRW_IS_G4X(brw->brw_screen->pci_id))
+ {
+ struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+ struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+ /* The original gen4 hardware couldn't set up WM surfaces pointing
+ * at an offset within a tile, which can happen when rendering to
+ * anything but the base level of a texture or the +X face/0 depth.
+ * This was fixed with the 4 Series hardware.
+ *
+ * For these original chips, you would have to make the depth and
+ * color destination surfaces include information on the texture
+ * type, LOD, face, and various limits to use them as a destination.
+ *
+ * This is easy in Gallium as surfaces are all backed by
+ * textures, but there's also a nasty requirement that the depth
+ * and the color surfaces all be of the same LOD, which is
+ * harder to get around as we can't look at a surface in
+ * isolation and decide if it's legal.
+ *
+    * Instead, end up being pessimistic here and request a linear
+    * view whenever the image does not start on a tile boundary.
+    */
+      if (brw_tex->tiling != I915_TILING_NONE &&
+          (brw_tex_image_offset(brw_tex, face, level, zslice) & 4095)) {
+ if (BRW_DEBUG & DEBUG_VIEW)
+ debug_printf("%s: need surface view for non-aligned tex image\n",
+ __FUNCTION__);
+ return GL_TRUE;
+ }
+ }
+#endif
+
+ /* Tiled 3d textures don't have subsets that look like 2d surfaces:
+ */
+
+ /* Everything else should be fine to render to in-place:
+ */
+ return GL_FALSE;
+}
+
+/* Look at all texture views and figure out if any of them need to be
+ * back-copied into the texture for sampling
+ */
+void brw_update_texture( struct brw_screen *brw_screen,
+ struct brw_texture *tex )
+{
+ /* currently nothing to do */
+}
+
+
+/* Create a new surface with linear layout to serve as a render-target
+ * where it would be illegal (perhaps due to tiling constraints) to do
+ * this in-place.
+ *
+ * Currently not implemented; not sure if it's needed.
+ */
+static struct brw_surface *create_linear_view( struct brw_screen *brw_screen,
+ struct brw_texture *tex,
+ union brw_surface_id id,
+ unsigned usage )
+{
+ return NULL;
+}
+
+
+/* Create a pipe_surface that just points directly into the existing
+ * texture's storage.
+ */
+static struct brw_surface *create_in_place_view( struct brw_screen *brw_screen,
+ struct brw_texture *tex,
+ union brw_surface_id id,
+ unsigned usage )
+{
+ struct brw_surface *surface;
+
+ surface = CALLOC_STRUCT(brw_surface);
+ if (surface == NULL)
+ return NULL;
+
+ pipe_reference_init(&surface->base.reference, 1);
+
+ /* XXX: ignoring render-to-slice-of-3d-texture
+ */
+ assert(id.bits.zslice == 0);
+
+ surface->base.format = tex->base.format;
+ surface->base.width = u_minify(tex->base.width0, id.bits.level);
+ surface->base.height = u_minify(tex->base.height0, id.bits.level);
+ surface->base.offset = tex->image_offset[id.bits.level][id.bits.face];
+ surface->base.usage = usage;
+ surface->base.zslice = id.bits.zslice;
+ surface->base.face = id.bits.face;
+ surface->base.level = id.bits.level;
+ surface->id = id;
+ surface->cpp = tex->cpp;
+ surface->pitch = tex->pitch;
+ surface->tiling = tex->tiling;
+
+ bo_reference( &surface->bo, tex->bo );
+ pipe_texture_reference( &surface->base.texture, &tex->base );
+
+ surface->ss.ss0.surface_format = tex->ss.ss0.surface_format;
+ surface->ss.ss0.surface_type = BRW_SURFACE_2D;
+
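+   /* A sketch of the offset split below: the hardware wants base_addr on
+    * a 4kb tile boundary, so any remaining offset within the tile is
+    * carried in the ss5 x/y offset fields (present on g4x and later
+    * only), which are presumably in units of 4 pixels and 2 rows --
+    * hence the extra divides.  E.g. with X tiling (512-byte tile rows),
+    * 4 bytes/pixel and tile_offset == 1088: x_offset = (1088 % 512) / 4
+    * / 4 = 4 and y_offset = 1088 / 512 / 2 = 1.
+    */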
+ if (tex->tiling == BRW_TILING_NONE) {
+ surface->ss.ss1.base_addr = surface->base.offset;
+ } else {
+ uint32_t tile_offset = surface->base.offset % 4096;
+
+ surface->ss.ss1.base_addr = surface->base.offset - tile_offset;
+
+ if (brw_screen->chipset.is_g4x) {
+ if (tex->tiling == BRW_TILING_X) {
+ /* Note that the low bits of these fields are missing, so
+ * there's the possibility of getting in trouble.
+ */
+ surface->ss.ss5.x_offset = (tile_offset % 512) / tex->cpp / 4;
+ surface->ss.ss5.y_offset = tile_offset / 512 / 2;
+ } else {
+ surface->ss.ss5.x_offset = (tile_offset % 128) / tex->cpp / 4;
+ surface->ss.ss5.y_offset = tile_offset / 128 / 2;
+ }
+ }
+ else {
+ assert(tile_offset == 0);
+ }
+ }
+
+#if 0
+ if (region_bo != NULL)
+ surface->ss.ss1.base_addr += region_bo->offset; /* reloc */
+#endif
+
+ surface->ss.ss2.width = surface->base.width - 1;
+ surface->ss.ss2.height = surface->base.height - 1;
+ surface->ss.ss3.tiled_surface = tex->ss.ss3.tiled_surface;
+ surface->ss.ss3.tile_walk = tex->ss.ss3.tile_walk;
+ surface->ss.ss3.pitch = tex->ss.ss3.pitch;
+
+ return surface;
+}
+
+/* Get a surface which is view into a texture
+ */
+static struct pipe_surface *brw_get_tex_surface(struct pipe_screen *screen,
+ struct pipe_texture *pt,
+ unsigned face, unsigned level,
+ unsigned zslice,
+ unsigned usage )
+{
+ struct brw_texture *tex = brw_texture(pt);
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_surface *surface;
+ union brw_surface_id id;
+ int type;
+
+ id.bits.face = face;
+ id.bits.level = level;
+ id.bits.zslice = zslice;
+
+ if (need_linear_view(bscreen, tex, id, usage))
+ type = BRW_VIEW_LINEAR;
+ else
+ type = BRW_VIEW_IN_PLACE;
+
+
+ foreach (surface, &tex->views[type]) {
+ if (id.value == surface->id.value)
+ return &surface->base;
+ }
+
+ switch (type) {
+ case BRW_VIEW_LINEAR:
+ surface = create_linear_view( bscreen, tex, id, usage );
+ break;
+ case BRW_VIEW_IN_PLACE:
+ surface = create_in_place_view( bscreen, tex, id, usage );
+ break;
+ default:
+ return NULL;
+ }
+
+ insert_at_head( &tex->views[type], surface );
+ return &surface->base;
+}
+
+
+static void brw_tex_surface_destroy( struct pipe_surface *surf )
+{
+ struct brw_surface *surface = brw_surface(surf);
+
+ /* Unreference texture, shared buffer:
+ */
+ remove_from_list(surface);
+ bo_reference(&surface->bo, NULL);
+ pipe_texture_reference( &surface->base.texture, NULL );
+
+
+ FREE(surface);
+}
+
+
+void brw_screen_tex_surface_init( struct brw_screen *brw_screen )
+{
+ brw_screen->base.get_tex_surface = brw_get_tex_surface;
+ brw_screen->base.tex_surface_destroy = brw_tex_surface_destroy;
+}
diff --git a/src/gallium/drivers/i965/brw_screen_tex_layout.c b/src/gallium/drivers/i965/brw_screen_tex_layout.c
new file mode 100644
index 00000000000..894f4bea401
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_tex_layout.c
@@ -0,0 +1,414 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "pipe/p_format.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "brw_screen.h"
+#include "brw_debug.h"
+#include "brw_winsys.h"
+
+/* Code to layout images in a mipmap tree for i965.
+ */
+
+static int
+brw_tex_pitch_align (struct brw_texture *tex,
+ int pitch)
+{
+ if (!tex->compressed) {
+ int pitch_align;
+
+ switch (tex->tiling) {
+ case BRW_TILING_X:
+ pitch_align = 512;
+ break;
+ case BRW_TILING_Y:
+ pitch_align = 128;
+ break;
+ default:
+ /* XXX: Untiled pitch alignment of 64 bytes for now to allow
+ * render-to-texture to work in all cases. This should
+ * probably be replaced at some point by some scheme to only
+ * do this when really necessary, for example standalone
+ * render target views.
+ */
+ pitch_align = 64;
+ break;
+ }
+
+ pitch = align(pitch * tex->cpp, pitch_align);
+ pitch /= tex->cpp;
+ }
+
+ return pitch;
+}
+
+
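+/* Mip images are placed on multiples of an "alignment unit": a 4x4 texel
+ * block (one compression block) for the DXT formats handled below, and
+ * 4x2 texels otherwise.
+ */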
+static void
+brw_tex_alignment_unit(enum pipe_format pf,
+ GLuint *w, GLuint *h)
+{
+ switch (pf) {
+ case PIPE_FORMAT_DXT1_RGB:
+ case PIPE_FORMAT_DXT1_RGBA:
+ case PIPE_FORMAT_DXT3_RGBA:
+ case PIPE_FORMAT_DXT5_RGBA:
+ case PIPE_FORMAT_DXT1_SRGB:
+ case PIPE_FORMAT_DXT1_SRGBA:
+ case PIPE_FORMAT_DXT3_SRGBA:
+ case PIPE_FORMAT_DXT5_SRGBA:
+ *w = 4;
+ *h = 4;
+ break;
+
+ default:
+ *w = 4;
+ *h = 2;
+ break;
+ }
+}
+
+
+static void
+brw_tex_set_level_info(struct brw_texture *tex,
+ GLuint level,
+ GLuint nr_images,
+ GLuint x, GLuint y,
+ GLuint w, GLuint h, GLuint d)
+{
+
+ if (BRW_DEBUG & DEBUG_TEXTURE)
+ debug_printf("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+ level, w, h, d, x, y, tex->level_offset[level]);
+
+ assert(tex->image_offset[level] == NULL);
+ assert(nr_images >= 1);
+
+ tex->level_offset[level] = (x + y * tex->pitch) * tex->cpp;
+ tex->nr_images[level] = nr_images;
+
+ tex->image_offset[level] = MALLOC(nr_images * sizeof(GLuint));
+ tex->image_offset[level][0] = 0;
+}
+
+
+static void
+brw_tex_set_image_offset(struct brw_texture *tex,
+ GLuint level, GLuint img,
+ GLuint x, GLuint y,
+ GLuint offset)
+{
+ assert((x == 0 && y == 0) || img != 0 || level != 0);
+ assert(img < tex->nr_images[level]);
+
+ if (BRW_DEBUG & DEBUG_TEXTURE)
+ debug_printf("%s level %d img %d pos %d,%d image_offset %x\n",
+ __FUNCTION__, level, img, x, y,
+ tex->image_offset[level][img]);
+
+ tex->image_offset[level][img] = (x + y * tex->pitch) * tex->cpp + offset;
+}
+
+
+
+static void brw_layout_2d( struct brw_texture *tex )
+{
+ GLuint align_h = 2, align_w = 4;
+ GLuint level;
+ GLuint x = 0;
+ GLuint y = 0;
+ GLuint width = tex->base.width0;
+ GLuint height = tex->base.height0;
+
+ tex->pitch = tex->base.width0;
+ brw_tex_alignment_unit(tex->base.format, &align_w, &align_h);
+
+ if (tex->compressed) {
+ tex->pitch = align(tex->base.width0, align_w);
+ }
+
+   /* May need to adjust pitch to accommodate the placement of
+ * the 2nd mipmap. This occurs when the alignment
+ * constraints of mipmap placement push the right edge of the
+ * 2nd mipmap out past the width of its parent.
+ */
+ if (tex->base.last_level > 0) {
+ GLuint mip1_width;
+
+ if (tex->compressed) {
+ mip1_width = (align(u_minify(tex->base.width0, 1), align_w) +
+ align(u_minify(tex->base.width0, 2), align_w));
+ } else {
+ mip1_width = (align(u_minify(tex->base.width0, 1), align_w) +
+ u_minify(tex->base.width0, 2));
+ }
+
+ if (mip1_width > tex->pitch) {
+ tex->pitch = mip1_width;
+ }
+ }
+
+ /* Pitch must be a whole number of dwords, even though we
+ * express it in texels.
+ */
+ tex->pitch = brw_tex_pitch_align (tex, tex->pitch);
+ tex->total_height = 0;
+
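+   /* Rough picture of what the loop below produces (the "layout below"
+    * arrangement): level 0 at the top left, level 1 directly beneath it
+    * at x == 0, and levels 2..last stacked vertically in a column
+    * immediately to the right of level 1 (x steps right exactly once,
+    * after level 1).
+    */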
+ for ( level = 0 ; level <= tex->base.last_level ; level++ ) {
+ GLuint img_height;
+
+ brw_tex_set_level_info(tex, level, 1, x, y, width, height, 1);
+
+ if (tex->compressed)
+ img_height = MAX2(1, height/4);
+ else
+ img_height = align(height, align_h);
+
+
+ /* Because the images are packed better, the final offset
+ * might not be the maximal one:
+ */
+ tex->total_height = MAX2(tex->total_height, y + img_height);
+
+ /* Layout_below: step right after second mipmap.
+ */
+ if (level == 1) {
+ x += align(width, align_w);
+ }
+ else {
+ y += img_height;
+ }
+
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ }
+}
+
+
+static boolean
+brw_layout_cubemap_idgng( struct brw_texture *tex )
+{
+ GLuint align_h = 2, align_w = 4;
+ GLuint level;
+ GLuint x = 0;
+ GLuint y = 0;
+ GLuint width = tex->base.width0;
+ GLuint height = tex->base.height0;
+ GLuint qpitch = 0;
+ GLuint y_pitch = 0;
+
+ tex->pitch = tex->base.width0;
+ brw_tex_alignment_unit(tex->base.format, &align_w, &align_h);
+ y_pitch = align(height, align_h);
+
+ if (tex->compressed) {
+ tex->pitch = align(tex->base.width0, align_w);
+ }
+
+ if (tex->base.last_level != 0) {
+ GLuint mip1_width;
+
+ if (tex->compressed) {
+ mip1_width = (align(u_minify(tex->base.width0, 1), align_w) +
+ align(u_minify(tex->base.width0, 2), align_w));
+ } else {
+ mip1_width = (align(u_minify(tex->base.width0, 1), align_w) +
+ u_minify(tex->base.width0, 2));
+ }
+
+ if (mip1_width > tex->pitch) {
+ tex->pitch = mip1_width;
+ }
+ }
+
+ tex->pitch = brw_tex_pitch_align(tex, tex->pitch);
+
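+   /* The six faces are spaced a fixed "qpitch" apart: roughly the
+    * aligned height of level 0 plus that of level 1 plus 11 alignment
+    * units of padding, scaled by pitch * cpp to get bytes (and divided
+    * by 4 for block-compressed formats, which count rows of 4x4 blocks).
+    */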
+ if (tex->compressed) {
+ qpitch = ((y_pitch +
+ align(u_minify(y_pitch, 1), align_h) +
+ 11 * align_h) / 4) * tex->pitch * tex->cpp;
+
+ tex->total_height = ((y_pitch +
+ align(u_minify(y_pitch, 1), align_h) +
+ 11 * align_h) / 4) * 6;
+ } else {
+ qpitch = (y_pitch +
+ align(u_minify(y_pitch, 1), align_h) +
+ 11 * align_h) * tex->pitch * tex->cpp;
+
+ tex->total_height = (y_pitch +
+ align(u_minify(y_pitch, 1), align_h) +
+ 11 * align_h) * 6;
+ }
+
+ for (level = 0; level <= tex->base.last_level; level++) {
+ GLuint img_height;
+ GLuint nr_images = 6;
+ GLuint q = 0;
+
+ brw_tex_set_level_info(tex, level, nr_images, x, y, width, height, 1);
+
+ for (q = 0; q < nr_images; q++)
+ brw_tex_set_image_offset(tex, level, q, x, y, q * qpitch);
+
+ if (tex->compressed)
+ img_height = MAX2(1, height/4);
+ else
+ img_height = align(height, align_h);
+
+ if (level == 1) {
+ x += align(width, align_w);
+ }
+ else {
+ y += img_height;
+ }
+
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ }
+
+ return TRUE;
+}
+
+
+static boolean
+brw_layout_3d_cube( struct brw_texture *tex )
+{
+ GLuint width = tex->base.width0;
+ GLuint height = tex->base.height0;
+ GLuint depth = tex->base.depth0;
+ GLuint pack_x_pitch, pack_x_nr;
+ GLuint pack_y_pitch;
+ GLuint level;
+ GLuint align_h = 2;
+ GLuint align_w = 4;
+
+ tex->total_height = 0;
+ brw_tex_alignment_unit(tex->base.format, &align_w, &align_h);
+
+ if (tex->compressed) {
+ tex->pitch = align(width, align_w);
+ pack_y_pitch = (height + 3) / 4;
+ } else {
+ tex->pitch = brw_tex_pitch_align(tex, tex->base.width0);
+ pack_y_pitch = align(tex->base.height0, align_h);
+ }
+
+ pack_x_pitch = width;
+ pack_x_nr = 1;
+
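+   /* Each level packs its images in rows of pack_x_nr images, each
+    * pack_x_pitch texels wide.  As the levels shrink, pack_x_pitch is
+    * (roughly) halved and pack_x_nr doubled, so small mip levels pack
+    * several images side by side instead of using a full row each.
+    */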
+ for (level = 0 ; level <= tex->base.last_level ; level++) {
+ GLuint nr_images = tex->base.target == PIPE_TEXTURE_3D ? depth : 6;
+ GLint x = 0;
+ GLint y = 0;
+ GLint q, j;
+
+ brw_tex_set_level_info(tex, level, nr_images,
+ 0, tex->total_height,
+ width, height, depth);
+
+ for (q = 0; q < nr_images;) {
+ for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) {
+ brw_tex_set_image_offset(tex, level, q, x, y, 0);
+ x += pack_x_pitch;
+ }
+
+ x = 0;
+ y += pack_y_pitch;
+ }
+
+
+ tex->total_height += y;
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
+
+ if (tex->compressed) {
+ pack_y_pitch = (height + 3) / 4;
+
+ if (pack_x_pitch > align(width, align_w)) {
+ pack_x_pitch = align(width, align_w);
+ pack_x_nr <<= 1;
+ }
+ } else {
+ if (pack_x_pitch > 4) {
+ pack_x_pitch >>= 1;
+ pack_x_nr <<= 1;
+ assert(pack_x_pitch * pack_x_nr <= tex->pitch);
+ }
+
+ if (pack_y_pitch > 2) {
+ pack_y_pitch >>= 1;
+ pack_y_pitch = align(pack_y_pitch, align_h);
+ }
+ }
+ }
+
+ /* The 965's sampler lays cachelines out according to how accesses
+ * in the texture surfaces run, so they may be "vertical" through
+ * memory.  As a result, the docs ("Surface Padding Requirements:
+ * Sampling Engine Surfaces") say that two extra rows of padding are
+ * required.
+ */
+ if (tex->base.target == PIPE_TEXTURE_CUBE)
+ tex->total_height += 2;
+
+ return TRUE;
+}
+
+
+
+GLboolean brw_texture_layout(struct brw_screen *brw_screen,
+ struct brw_texture *tex )
+{
+ switch (tex->base.target) {
+ case PIPE_TEXTURE_CUBE:
+ if (brw_screen->chipset.is_igdng)
+ brw_layout_cubemap_idgng( tex );
+ else
+ brw_layout_3d_cube( tex );
+ break;
+
+ case PIPE_TEXTURE_3D:
+ brw_layout_3d_cube( tex );
+ break;
+
+ default:
+ brw_layout_2d( tex );
+ break;
+ }
+
+ if (BRW_DEBUG & DEBUG_TEXTURE)
+ debug_printf("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+ tex->pitch,
+ tex->total_height,
+ tex->cpp,
+ tex->pitch * tex->total_height * tex->cpp );
+
+ return GL_TRUE;
+}
diff --git a/src/gallium/drivers/i965/brw_screen_texture.c b/src/gallium/drivers/i965/brw_screen_texture.c
new file mode 100644
index 00000000000..feb9d5f7657
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_screen_texture.c
@@ -0,0 +1,573 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "util/u_format.h"
+
+#include "brw_screen.h"
+#include "brw_defines.h"
+#include "brw_structs.h"
+#include "brw_winsys.h"
+
+
+
+static GLuint translate_tex_target( unsigned target )
+{
+ switch (target) {
+ case PIPE_TEXTURE_1D:
+ return BRW_SURFACE_1D;
+
+ case PIPE_TEXTURE_2D:
+ return BRW_SURFACE_2D;
+
+ case PIPE_TEXTURE_3D:
+ return BRW_SURFACE_3D;
+
+ case PIPE_TEXTURE_CUBE:
+ return BRW_SURFACE_CUBE;
+
+ default:
+ assert(0);
+ return BRW_SURFACE_1D;
+ }
+}
+
+
+static GLuint translate_tex_format( enum pipe_format pf )
+{
+ switch( pf ) {
+ case PIPE_FORMAT_L8_UNORM:
+ return BRW_SURFACEFORMAT_L8_UNORM;
+
+ case PIPE_FORMAT_I8_UNORM:
+ return BRW_SURFACEFORMAT_I8_UNORM;
+
+ case PIPE_FORMAT_A8_UNORM:
+ return BRW_SURFACEFORMAT_A8_UNORM;
+
+ case PIPE_FORMAT_L16_UNORM:
+ return BRW_SURFACEFORMAT_L16_UNORM;
+
+ /* XXX: Add these to gallium
+ case PIPE_FORMAT_I16_UNORM:
+ return BRW_SURFACEFORMAT_I16_UNORM;
+
+ case PIPE_FORMAT_A16_UNORM:
+ return BRW_SURFACEFORMAT_A16_UNORM;
+ */
+
+ case PIPE_FORMAT_A8L8_UNORM:
+ return BRW_SURFACEFORMAT_L8A8_UNORM;
+
+ case PIPE_FORMAT_R5G6B5_UNORM:
+ return BRW_SURFACEFORMAT_B5G6R5_UNORM;
+
+ case PIPE_FORMAT_A1R5G5B5_UNORM:
+ return BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+
+ case PIPE_FORMAT_A4R4G4B4_UNORM:
+ return BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+
+ case PIPE_FORMAT_X8R8G8B8_UNORM:
+ return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
+
+ case PIPE_FORMAT_A8R8G8B8_UNORM:
+ return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+ /*
+ * Video formats
+ */
+
+ case PIPE_FORMAT_YCBCR_REV:
+ return BRW_SURFACEFORMAT_YCRCB_NORMAL;
+
+ case PIPE_FORMAT_YCBCR:
+ return BRW_SURFACEFORMAT_YCRCB_SWAPUVY;
+
+ /*
+ * Compressed formats.
+ */
+ /* XXX: Add FXT to gallium?
+ case PIPE_FORMAT_FXT1_RGBA:
+ return BRW_SURFACEFORMAT_FXT1;
+ */
+
+ case PIPE_FORMAT_DXT1_RGB:
+ return BRW_SURFACEFORMAT_DXT1_RGB;
+
+ case PIPE_FORMAT_DXT1_RGBA:
+ return BRW_SURFACEFORMAT_BC1_UNORM;
+
+ case PIPE_FORMAT_DXT3_RGBA:
+ return BRW_SURFACEFORMAT_BC2_UNORM;
+
+ case PIPE_FORMAT_DXT5_RGBA:
+ return BRW_SURFACEFORMAT_BC3_UNORM;
+
+ /*
+ * sRGB formats
+ */
+
+ case PIPE_FORMAT_R8G8B8A8_SRGB:
+ return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+
+ case PIPE_FORMAT_A8L8_SRGB:
+ return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
+
+ case PIPE_FORMAT_L8_SRGB:
+ return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
+
+ case PIPE_FORMAT_DXT1_SRGB:
+ return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
+
+ /*
+ * Depth formats
+ */
+
+ case PIPE_FORMAT_Z16_UNORM:
+ return BRW_SURFACEFORMAT_I16_UNORM;
+
+ case PIPE_FORMAT_S8Z24_UNORM:
+ case PIPE_FORMAT_X8Z24_UNORM:
+ return BRW_SURFACEFORMAT_I24X8_UNORM;
+
+ case PIPE_FORMAT_Z32_FLOAT:
+ return BRW_SURFACEFORMAT_I32_FLOAT;
+
+ /* XXX: presumably for bump mapping. Add this to mesa state
+ * tracker?
+ *
+ * XXX: Add flipped versions of these formats to Gallium.
+ */
+ case PIPE_FORMAT_R8G8_SNORM:
+ return BRW_SURFACEFORMAT_R8G8_SNORM;
+
+ case PIPE_FORMAT_R8G8B8A8_SNORM:
+ return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
+
+ default:
+ return BRW_SURFACEFORMAT_INVALID;
+ }
+}
+
+
+
+
+
+static struct pipe_texture *brw_texture_create( struct pipe_screen *screen,
+ const struct pipe_texture *templ )
+
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_texture *tex;
+ enum brw_buffer_type buffer_type;
+ enum pipe_error ret;
+
+ tex = CALLOC_STRUCT(brw_texture);
+ if (tex == NULL)
+ return NULL;
+
+ memcpy(&tex->base, templ, sizeof *templ);
+ pipe_reference_init(&tex->base.reference, 1);
+ tex->base.screen = screen;
+
+ /* XXX: compressed textures need special treatment here
+ */
+ tex->cpp = util_format_get_blocksize(tex->base.format);
+ tex->compressed = util_format_is_compressed(tex->base.format);
+
+ make_empty_list(&tex->views[0]);
+ make_empty_list(&tex->views[1]);
+
+ /* XXX: No tiling with compressed textures??
+ */
+ if (tex->compressed == 0 &&
+ !bscreen->no_tiling)
+ {
+ if (bscreen->chipset.is_965 &&
+ util_format_is_depth_or_stencil(templ->format))
+ tex->tiling = BRW_TILING_Y;
+ else
+ tex->tiling = BRW_TILING_X;
+ }
+ else {
+ tex->tiling = BRW_TILING_NONE;
+ }
+
+ if (!brw_texture_layout( bscreen, tex ))
+ goto fail;
+
+
+ if (templ->tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
+ PIPE_TEXTURE_USAGE_PRIMARY)) {
+ buffer_type = BRW_BUFFER_TYPE_SCANOUT;
+ }
+ else {
+ buffer_type = BRW_BUFFER_TYPE_TEXTURE;
+ }
+
+ ret = bscreen->sws->bo_alloc( bscreen->sws,
+ buffer_type,
+ tex->pitch * tex->total_height * tex->cpp,
+ 64,
+ &tex->bo );
+ if (ret)
+ goto fail;
+
+ tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+ tex->ss.ss0.surface_type = translate_tex_target(tex->base.target);
+ tex->ss.ss0.surface_format = translate_tex_format(tex->base.format);
+ assert(tex->ss.ss0.surface_format != BRW_SURFACEFORMAT_INVALID);
+
+ /* This is ok for all textures with channel width 8bit or less:
+ */
+/* tex->ss.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
+
+
+ /* XXX: what happens when tex->bo->offset changes???
+ */
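+   /* Note the hardware convention in the fields below: width, height,
+    * depth and pitch are all programmed as the value minus one.
+    */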
+ tex->ss.ss1.base_addr = 0; /* reloc */
+ tex->ss.ss2.mip_count = tex->base.last_level;
+ tex->ss.ss2.width = tex->base.width0 - 1;
+ tex->ss.ss2.height = tex->base.height0 - 1;
+
+ switch (tex->tiling) {
+ case BRW_TILING_NONE:
+ tex->ss.ss3.tiled_surface = 0;
+ tex->ss.ss3.tile_walk = 0;
+ break;
+ case BRW_TILING_X:
+ tex->ss.ss3.tiled_surface = 1;
+ tex->ss.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+ break;
+ case BRW_TILING_Y:
+ tex->ss.ss3.tiled_surface = 1;
+ tex->ss.ss3.tile_walk = BRW_TILEWALK_YMAJOR;
+ break;
+ }
+
+ tex->ss.ss3.pitch = (tex->pitch * tex->cpp) - 1;
+ tex->ss.ss3.depth = tex->base.depth0 - 1;
+
+ tex->ss.ss4.min_lod = 0;
+
+ if (tex->base.target == PIPE_TEXTURE_CUBE) {
+ tex->ss.ss0.cube_pos_x = 1;
+ tex->ss.ss0.cube_pos_y = 1;
+ tex->ss.ss0.cube_pos_z = 1;
+ tex->ss.ss0.cube_neg_x = 1;
+ tex->ss.ss0.cube_neg_y = 1;
+ tex->ss.ss0.cube_neg_z = 1;
+ }
+
+ return &tex->base;
+
+fail:
+ bo_reference(&tex->bo, NULL);
+ FREE(tex);
+ return NULL;
+}
+
+static struct pipe_texture *brw_texture_blanket(struct pipe_screen *screen,
+ const struct pipe_texture *templ,
+ const unsigned *stride,
+ struct pipe_buffer *buffer)
+{
+ return NULL;
+}
+
+static void brw_texture_destroy(struct pipe_texture *pt)
+{
+ struct brw_texture *tex = brw_texture(pt);
+ bo_reference(&tex->bo, NULL);
+ FREE(pt);
+}
+
+
+static boolean brw_is_format_supported( struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned tex_usage,
+ unsigned geom_flags )
+{
+ return translate_tex_format(format) != BRW_SURFACEFORMAT_INVALID;
+}
+
+
+boolean brw_is_texture_referenced_by_bo( struct brw_screen *brw_screen,
+ struct pipe_texture *texture,
+ unsigned face,
+ unsigned level,
+ struct brw_winsys_buffer *bo )
+{
+ struct brw_texture *tex = brw_texture(texture);
+ struct brw_surface *surf;
+ int i;
+
+ /* XXX: this is subject to false positives if the underlying
+ * texture BO is referenced, we can't tell whether the sub-region
+ * we care about participates in that.
+ */
+ if (brw_screen->sws->bo_references( bo, tex->bo ))
+ return TRUE;
+
+ /* Find any view on this texture for this face/level and see if it
+ * is referenced:
+ */
+ for (i = 0; i < 2; i++) {
+ foreach (surf, &tex->views[i]) {
+ if (surf->bo == tex->bo)
+ continue;
+
+ if (surf->id.bits.face != face ||
+ surf->id.bits.level != level)
+ continue;
+
+ if (brw_screen->sws->bo_references( bo, surf->bo))
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+
+/*
+ * Transfer functions
+ */
+
+static struct pipe_transfer*
+brw_get_tex_transfer(struct pipe_screen *screen,
+ struct pipe_texture *texture,
+ unsigned face, unsigned level, unsigned zslice,
+ enum pipe_transfer_usage usage, unsigned x, unsigned y,
+ unsigned w, unsigned h)
+{
+ struct brw_texture *tex = brw_texture(texture);
+ struct brw_transfer *trans;
+ unsigned offset; /* in bytes */
+
+ if (texture->target == PIPE_TEXTURE_CUBE) {
+ offset = tex->image_offset[level][face];
+ } else if (texture->target == PIPE_TEXTURE_3D) {
+ offset = tex->image_offset[level][zslice];
+ } else {
+ offset = tex->image_offset[level][0];
+ assert(face == 0);
+ assert(zslice == 0);
+ }
+
+   trans = CALLOC_STRUCT(brw_transfer);
+   if (trans == NULL)
+      return NULL;
+
+   pipe_texture_reference(&trans->base.texture, texture);
+   trans->base.x = x;
+   trans->base.y = y;
+   trans->base.width = w;
+   trans->base.height = h;
+   trans->base.stride = tex->pitch * tex->cpp;
+   trans->offset = offset;
+   trans->base.usage = usage;
+
+   return &trans->base;
+}
+
+static void *
+brw_transfer_map(struct pipe_screen *screen,
+ struct pipe_transfer *transfer)
+{
+ struct brw_texture *tex = brw_texture(transfer->texture);
+ struct brw_winsys_screen *sws = brw_screen(screen)->sws;
+ char *map;
+ unsigned usage = transfer->usage;
+
+ map = sws->bo_map(tex->bo,
+ BRW_DATA_OTHER,
+ 0,
+ tex->bo->size,
+ (usage & PIPE_TRANSFER_WRITE) ? TRUE : FALSE,
+                     FALSE,  /* no discard */
+                     FALSE); /* no explicit flush */
+
+ if (!map)
+ return NULL;
+
+ /* XXX: blocksize and compressed textures
+ */
+ return map + brw_transfer(transfer)->offset +
+ transfer->y /* / transfer->block.height */ * transfer->stride +
+ transfer->x /* / transfer->block.width */ * brw_texture(transfer->texture)->cpp;
+}
+
+static void
+brw_transfer_unmap(struct pipe_screen *screen,
+ struct pipe_transfer *transfer)
+{
+ struct brw_texture *tex = brw_texture(transfer->texture);
+ struct brw_winsys_screen *sws = brw_screen(screen)->sws;
+
+ sws->bo_unmap(tex->bo);
+}
+
+static void
+brw_tex_transfer_destroy(struct pipe_transfer *trans)
+{
+ pipe_texture_reference(&trans->texture, NULL);
+ FREE(trans);
+}
+
+
+/*
+ * Functions exported to the winsys
+ */
+
+boolean brw_texture_get_winsys_buffer(struct pipe_texture *texture,
+ struct brw_winsys_buffer **buffer,
+ unsigned *stride)
+{
+ struct brw_texture *tex = brw_texture(texture);
+
+ *buffer = tex->bo;
+ if (stride)
+ *stride = tex->pitch * tex->cpp;
+
+ return TRUE;
+}
+
+struct pipe_texture *
+brw_texture_blanket_winsys_buffer(struct pipe_screen *screen,
+ const struct pipe_texture *templ,
+ unsigned pitch,
+ unsigned tiling,
+ struct brw_winsys_buffer *buffer)
+{
+ struct brw_screen *bscreen = brw_screen(screen);
+ struct brw_texture *tex;
+
+ if (templ->target != PIPE_TEXTURE_2D ||
+ templ->last_level != 0 ||
+ templ->depth0 != 1)
+ return NULL;
+
+ if (util_format_is_compressed(templ->format))
+ return NULL;
+
+ tex = CALLOC_STRUCT(brw_texture);
+ if (!tex)
+ return NULL;
+
+ memcpy(&tex->base, templ, sizeof *templ);
+ pipe_reference_init(&tex->base.reference, 1);
+ tex->base.screen = screen;
+
+ /* XXX: cpp vs. blocksize
+ */
+ tex->cpp = util_format_get_blocksize(tex->base.format);
+ tex->tiling = tiling;
+
+ make_empty_list(&tex->views[0]);
+ make_empty_list(&tex->views[1]);
+
+ if (!brw_texture_layout(bscreen, tex))
+ goto fail;
+
+ /* XXX Maybe some more checks? */
+ if ((pitch / tex->cpp) < tex->pitch)
+ goto fail;
+
+ tex->pitch = pitch / tex->cpp;
+
+ tex->bo = buffer;
+
+ /* fix this warning */
+#if 0
+ if (tex->size > buffer->size)
+ goto fail;
+#endif
+
+ tex->ss.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+ tex->ss.ss0.surface_type = translate_tex_target(tex->base.target);
+ tex->ss.ss0.surface_format = translate_tex_format(tex->base.format);
+ assert(tex->ss.ss0.surface_format != BRW_SURFACEFORMAT_INVALID);
+
+ /* This is ok for all textures with channel width 8bit or less:
+ */
+/* tex->ss.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */
+
+
+ /* XXX: what happens when tex->bo->offset changes???
+ */
+ tex->ss.ss1.base_addr = 0; /* reloc */
+ tex->ss.ss2.mip_count = tex->base.last_level;
+ tex->ss.ss2.width = tex->base.width0 - 1;
+ tex->ss.ss2.height = tex->base.height0 - 1;
+
+ switch (tex->tiling) {
+ case BRW_TILING_NONE:
+ tex->ss.ss3.tiled_surface = 0;
+ tex->ss.ss3.tile_walk = 0;
+ break;
+ case BRW_TILING_X:
+ tex->ss.ss3.tiled_surface = 1;
+ tex->ss.ss3.tile_walk = BRW_TILEWALK_XMAJOR;
+ break;
+ case BRW_TILING_Y:
+ tex->ss.ss3.tiled_surface = 1;
+ tex->ss.ss3.tile_walk = BRW_TILEWALK_YMAJOR;
+ break;
+ }
+
+ tex->ss.ss3.pitch = (tex->pitch * tex->cpp) - 1;
+ tex->ss.ss3.depth = tex->base.depth0 - 1;
+
+ tex->ss.ss4.min_lod = 0;
+
+ return &tex->base;
+
+fail:
+ FREE(tex);
+ return NULL;
+}
+
+void brw_screen_tex_init( struct brw_screen *brw_screen )
+{
+ brw_screen->base.is_format_supported = brw_is_format_supported;
+ brw_screen->base.texture_create = brw_texture_create;
+ brw_screen->base.texture_destroy = brw_texture_destroy;
+ brw_screen->base.texture_blanket = brw_texture_blanket;
+ brw_screen->base.get_tex_transfer = brw_get_tex_transfer;
+ brw_screen->base.transfer_map = brw_transfer_map;
+ brw_screen->base.transfer_unmap = brw_transfer_unmap;
+ brw_screen->base.tex_transfer_destroy = brw_tex_transfer_destroy;
+}
diff --git a/src/gallium/drivers/i965/brw_sf.c b/src/gallium/drivers/i965/brw_sf.c
new file mode 100644
index 00000000000..e1986a9dbbd
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf.c
@@ -0,0 +1,216 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_state.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_sf.h"
+#include "brw_state.h"
+
+static enum pipe_error compile_sf_prog( struct brw_context *brw,
+ struct brw_sf_prog_key *key,
+ struct brw_winsys_buffer **bo_out )
+{
+ enum pipe_error ret;
+ struct brw_sf_compile c;
+ const GLuint *program;
+ GLuint program_size;
+
+ memset(&c, 0, sizeof(c));
+
+ /* Begin the compilation:
+ */
+ brw_init_compile(brw, &c.func);
+
+ c.key = *key;
+ c.nr_attrs = c.key.nr_attrs;
+ c.nr_attr_regs = (c.nr_attrs+1)/2;
+ c.nr_setup_attrs = c.key.nr_attrs;
+ c.nr_setup_regs = (c.nr_setup_attrs+1)/2;
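+   /* Attributes are packed two vec4s per 256-bit register, hence the
+    * (n + 1) / 2 register counts above.
+    */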
+
+ c.prog_data.urb_read_length = c.nr_attr_regs;
+ c.prog_data.urb_entry_size = c.nr_setup_regs * 2;
+
+ /* Special case when there are no attributes to setup.
+ *
+ * XXX: should be able to set nr_setup_attrs to nr_attrs-1 -- but
+ * breaks vp-tris.c
+ */
+ if (c.nr_attrs - 1 == 0) {
+ c.nr_verts = 0;
+ brw_emit_null_setup( &c );
+ }
+ else {
+ /* Which primitive? Or all three?
+ */
+ switch (key->primitive) {
+ case SF_TRIANGLES:
+ c.nr_verts = 3;
+ brw_emit_tri_setup( &c, GL_TRUE );
+ break;
+ case SF_LINES:
+ c.nr_verts = 2;
+ brw_emit_line_setup( &c, GL_TRUE );
+ break;
+ case SF_POINTS:
+ c.nr_verts = 1;
+ if (key->do_point_sprite)
+ brw_emit_point_sprite_setup( &c, GL_TRUE );
+ else
+ brw_emit_point_setup( &c, GL_TRUE );
+ break;
+ case SF_UNFILLED_TRIS:
+ c.nr_verts = 3;
+ brw_emit_anyprim_setup( &c );
+ break;
+ default:
+ assert(0);
+ return PIPE_ERROR_BAD_INPUT;
+ }
+ }
+
+ /* get the program
+ */
+ ret = brw_get_program(&c.func, &program, &program_size);
+ if (ret)
+ return ret;
+
+ /* Upload
+ */
+ ret = brw_upload_cache( &brw->cache, BRW_SF_PROG,
+ &c.key, sizeof(c.key),
+ NULL, 0,
+ program, program_size,
+ &c.prog_data,
+ &brw->sf.prog_data,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+/* Calculate interpolants for triangle and line rasterization.
+ */
+static enum pipe_error upload_sf_prog(struct brw_context *brw)
+{
+ const struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
+ struct brw_sf_prog_key key;
+ enum pipe_error ret;
+ unsigned i;
+
+ memset(&key, 0, sizeof(key));
+
+ /* Populate the key, noting state dependencies:
+ */
+
+ /* XXX: Add one to account for the position input.
+ */
+ /* PIPE_NEW_FRAGMENT_SIGNATURE */
+ key.nr_attrs = sig->nr_inputs + 1;
+
+
+ /* XXX: why is position required to be linear? why do we care
+ * about it at all?
+ */
+ key.linear_attrs = 1; /* position -- but why? */
+
+ for (i = 0; i < sig->nr_inputs; i++) {
+ switch (sig->input[i].interp) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ break;
+ case TGSI_INTERPOLATE_LINEAR:
+ key.linear_attrs |= 1 << (i+1);
+ break;
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ key.persp_attrs |= 1 << (i+1);
+ break;
+ }
+ }
+
+ /* BRW_NEW_REDUCED_PRIMITIVE */
+ switch (brw->reduced_primitive) {
+ case PIPE_PRIM_TRIANGLES:
+ /* PIPE_NEW_RAST
+ */
+ if (brw->curr.rast->templ.fill_cw != PIPE_POLYGON_MODE_FILL ||
+ brw->curr.rast->templ.fill_ccw != PIPE_POLYGON_MODE_FILL)
+ key.primitive = SF_UNFILLED_TRIS;
+ else
+ key.primitive = SF_TRIANGLES;
+ break;
+ case PIPE_PRIM_LINES:
+ key.primitive = SF_LINES;
+ break;
+ case PIPE_PRIM_POINTS:
+ key.primitive = SF_POINTS;
+ break;
+ }
+
+ key.do_point_sprite = brw->curr.rast->templ.point_sprite;
+ key.sprite_origin_lower_left = 0; /* XXX: ctx->Point.SpriteOrigin - fix rast state */
+ key.do_flat_shading = brw->curr.rast->templ.flatshade;
+ key.do_twoside_color = brw->curr.rast->templ.light_twoside;
+
+ if (key.do_twoside_color) {
+ key.frontface_ccw = (brw->curr.rast->templ.front_winding ==
+ PIPE_WINDING_CCW);
+ }
+
+ if (brw_search_cache(&brw->cache, BRW_SF_PROG,
+ &key, sizeof(key),
+ NULL, 0,
+ &brw->sf.prog_data,
+ &brw->sf.prog_bo))
+ return PIPE_OK;
+
+ ret = compile_sf_prog( brw, &key, &brw->sf.prog_bo );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_sf_prog = {
+ .dirty = {
+ .mesa = (PIPE_NEW_RAST | PIPE_NEW_FRAGMENT_SIGNATURE),
+ .brw = (BRW_NEW_REDUCED_PRIMITIVE),
+ .cache = 0
+ },
+ .prepare = upload_sf_prog
+};
+
diff --git a/src/gallium/drivers/i965/brw_sf.h b/src/gallium/drivers/i965/brw_sf.h
new file mode 100644
index 00000000000..a895c7d2f6a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf.h
@@ -0,0 +1,122 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_SF_H
+#define BRW_SF_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+
+#define SF_POINTS 0
+#define SF_LINES 1
+#define SF_TRIANGLES 2
+#define SF_UNFILLED_TRIS 3
+
+struct brw_sf_prog_key {
+
+ /* Bitmask of linear and perspective interpolated inputs, 0..nr
+ */
+ GLuint persp_attrs:32;
+ GLuint linear_attrs:32;
+ GLuint point_coord_replace_attrs:32;
+
+ GLuint nr_attrs:8;
+ GLuint primitive:2;
+ GLuint do_twoside_color:1;
+ GLuint do_flat_shading:1;
+ GLuint frontface_ccw:1;
+ GLuint do_point_sprite:1;
+ GLuint sprite_origin_lower_left:1;
+ GLuint pad:17;
+
+ GLuint attr_col0:8;
+ GLuint attr_col1:8;
+ GLuint attr_bfc0:8;
+ GLuint attr_bfc1:8;
+};
+
+struct brw_sf_point_tex {
+ GLboolean CoordReplace;
+};
+
+struct brw_sf_compile {
+ struct brw_compile func;
+ struct brw_sf_prog_key key;
+ struct brw_sf_prog_data prog_data;
+
+ struct brw_reg pv;
+ struct brw_reg det;
+ struct brw_reg dx0;
+ struct brw_reg dx2;
+ struct brw_reg dy0;
+ struct brw_reg dy2;
+
+   /* z and 1/w passed in separately:
+ */
+ struct brw_reg z[3];
+ struct brw_reg inv_w[3];
+
+ /* The vertices:
+ */
+ struct brw_reg vert[3];
+
+ /* Temporaries, allocated after last vertex reg.
+ */
+ struct brw_reg inv_det;
+ struct brw_reg a1_sub_a0;
+ struct brw_reg a2_sub_a0;
+ struct brw_reg tmp;
+
+ struct brw_reg m1Cx;
+ struct brw_reg m2Cy;
+ struct brw_reg m3C0;
+
+ GLuint nr_verts;
+ GLuint nr_attrs;
+ GLuint nr_attr_regs;
+ GLuint nr_setup_attrs;
+ GLuint nr_setup_regs;
+
+ GLuint point_coord_replace_mask;
+};
+
+
+void brw_emit_null_setup( struct brw_sf_compile *c );
+void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate );
+void brw_emit_anyprim_setup( struct brw_sf_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_sf_emit.c b/src/gallium/drivers/i965/brw_sf_emit.c
new file mode 100644
index 00000000000..3b85725e368
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf_emit.c
@@ -0,0 +1,765 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_batchbuffer.h"
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_sf.h"
+
+
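+/* Vertex attributes arrive packed two per GRF (four floats each); pick
+ * out the GRF holding this attribute and the half of it to use.
+ */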
+static struct brw_reg get_vert_attr(struct brw_sf_compile *c,
+ struct brw_reg vert,
+ GLuint attr)
+{
+ GLuint off = attr / 2;
+ GLuint sub = attr % 2;
+
+ return brw_vec4_grf(vert.nr + off, sub * 4);
+}
+
+
+/***********************************************************************
+ * Twoside lighting
+ */
+static void copy_bfc( struct brw_sf_compile *c,
+ struct brw_reg vert )
+{
+ struct brw_compile *p = &c->func;
+
+ if (c->key.attr_col0 && c->key.attr_bfc0)
+ brw_MOV(p,
+ get_vert_attr(c, vert, c->key.attr_col0),
+ get_vert_attr(c, vert, c->key.attr_bfc0));
+
+ if (c->key.attr_col1 && c->key.attr_bfc1)
+ brw_MOV(p,
+ get_vert_attr(c, vert, c->key.attr_col1),
+ get_vert_attr(c, vert, c->key.attr_bfc1));
+}
+
+
+static void do_twoside_color( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *if_insn;
+ GLuint backface_conditional = c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L;
+
+ /* Already done in clip program:
+ */
+ if (c->key.primitive == SF_UNFILLED_TRIS)
+ return;
+
+ /* XXX: What happens if BFC isn't present? This could only happen
+ * for user-supplied vertex programs, as t_vp_build.c always does
+ * the right thing.
+ */
+ if (!(c->key.attr_col0 && c->key.attr_bfc0) &&
+ !(c->key.attr_col1 && c->key.attr_bfc1))
+ return;
+
+ /* Need to use BRW_EXECUTE_4 and also do a 4-wide compare in order
+ * to get all channels active inside the IF. In the clipping code
+ * we run with NoMask, so it's not an option and we can use
+ * BRW_EXECUTE_1 for all comparisons.
+ */
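+ /* The sign of the fixed-function determinant encodes the triangle's
+ * winding; the copies below run only when it indicates a back-facing
+ * triangle.
+ */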
+ brw_push_insn_state(p);
+ brw_CMP(p, vec4(brw_null_reg()), backface_conditional, c->det, brw_imm_f(0));
+ if_insn = brw_IF(p, BRW_EXECUTE_4);
+ {
+ switch (c->nr_verts) {
+ case 3: copy_bfc(c, c->vert[2]);
+ case 2: copy_bfc(c, c->vert[1]);
+ case 1: copy_bfc(c, c->vert[0]);
+ }
+ }
+ brw_ENDIF(p, if_insn);
+ brw_pop_insn_state(p);
+}
+
+
+
+/***********************************************************************
+ * Flat shading
+ */
+
+#define VERT_RESULT_COLOR_BITS ((1<<VERT_RESULT_COL0) | \
+ (1<<VERT_RESULT_COL1))
+
+static void copy_colors( struct brw_sf_compile *c,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ struct brw_compile *p = &c->func;
+
+ if (c->key.attr_col0)
+ brw_MOV(p,
+ get_vert_attr(c, dst, c->key.attr_col0),
+ get_vert_attr(c, src, c->key.attr_col0));
+
+ if (c->key.attr_col1)
+ brw_MOV(p,
+ get_vert_attr(c, dst, c->key.attr_col1),
+ get_vert_attr(c, src, c->key.attr_col1));
+
+}
+
+
+
+/* Need to use a computed jump to copy flatshaded attributes as the
+ * vertices are ordered according to y-coordinate before reaching this
+ * point, so the PV could be anywhere.
+ */
+static void do_flatshade_triangle( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg ip = brw_ip_reg();
+ GLuint jmpi = 1;
+ GLuint nr = 0;
+
+ if (c->key.attr_col0)
+ nr++;
+
+ if (c->key.attr_col1)
+ nr++;
+
+ if (nr == 0)
+ return;
+
+ /* Already done in clip program:
+ */
+ if (c->key.primitive == SF_UNFILLED_TRIS)
+ return;
+
+ if (BRW_IS_IGDNG(p->brw))
+ jmpi = 2;
+
+ brw_push_insn_state(p);
+
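+ /* Scale the provoking-vertex index (0, 1 or 2) by the size of one copy
+ * block below (2*nr MOVs plus a trailing JMPI) and use it as a relative
+ * jump into the table; jmpi doubles the distances on IGDNG.
+ */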
+ brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
+ brw_JMPI(p, ip, ip, c->pv);
+
+ copy_colors(c, c->vert[1], c->vert[0]);
+ copy_colors(c, c->vert[2], c->vert[0]);
+ brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
+
+ copy_colors(c, c->vert[0], c->vert[1]);
+ copy_colors(c, c->vert[2], c->vert[1]);
+ brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
+
+ copy_colors(c, c->vert[0], c->vert[2]);
+ copy_colors(c, c->vert[1], c->vert[2]);
+
+ brw_pop_insn_state(p);
+}
+
+
+static void do_flatshade_line( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg ip = brw_ip_reg();
+ GLuint jmpi = 1;
+ GLuint nr = 0;
+
+ if (c->key.attr_col0)
+ nr++;
+
+ if (c->key.attr_col1)
+ nr++;
+
+ if (nr == 0)
+ return;
+
+ /* Already done in clip program:
+ */
+ if (c->key.primitive == SF_UNFILLED_TRIS)
+ return;
+
+ if (BRW_IS_IGDNG(p->brw))
+ jmpi = 2;
+
+ brw_push_insn_state(p);
+
+ brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
+ brw_JMPI(p, ip, ip, c->pv);
+ copy_colors(c, c->vert[1], c->vert[0]);
+
+ brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr));
+ copy_colors(c, c->vert[0], c->vert[1]);
+
+ brw_pop_insn_state(p);
+}
+
+
+
+/***********************************************************************
+ * Triangle setup.
+ */
+
+
+static void alloc_regs( struct brw_sf_compile *c )
+{
+ GLuint reg, i;
+
+ /* Values computed by fixed function unit:
+ */
+ c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
+ c->det = brw_vec1_grf(1, 2);
+ c->dx0 = brw_vec1_grf(1, 3);
+ c->dx2 = brw_vec1_grf(1, 4);
+ c->dy0 = brw_vec1_grf(1, 5);
+ c->dy2 = brw_vec1_grf(1, 6);
+
+ /* z and 1/w passed in separately:
+ */
+ c->z[0] = brw_vec1_grf(2, 0);
+ c->inv_w[0] = brw_vec1_grf(2, 1);
+ c->z[1] = brw_vec1_grf(2, 2);
+ c->inv_w[1] = brw_vec1_grf(2, 3);
+ c->z[2] = brw_vec1_grf(2, 4);
+ c->inv_w[2] = brw_vec1_grf(2, 5);
+
+ /* The vertices:
+ */
+ reg = 3;
+ for (i = 0; i < c->nr_verts; i++) {
+ c->vert[i] = brw_vec8_grf(reg, 0);
+ reg += c->nr_attr_regs;
+ }
+
+ /* Temporaries, allocated after last vertex reg.
+ */
+ c->inv_det = brw_vec1_grf(reg, 0); reg++;
+ c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++;
+ c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++;
+ c->tmp = brw_vec8_grf(reg, 0); reg++;
+
+ /* Note grf allocation:
+ */
+ c->prog_data.total_grf = reg;
+
+
+ /* Outputs of this program - interpolation coefficients for
+ * rasterization:
+ */
+ c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0);
+ c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0);
+ c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0);
+}
+
+
+static void copy_z_inv_w( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+ brw_push_insn_state(p);
+
+ /* Copy both scalars with a single MOV:
+ */
+ for (i = 0; i < c->nr_verts; i++)
+ brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i]));
+
+ brw_pop_insn_state(p);
+}
+
+
+static void invert_det( struct brw_sf_compile *c)
+{
+ /* Looks like we invert all 8 elements just to get 1/det in
+ * position 2 !?!
+ */
+ brw_math(&c->func,
+ c->inv_det,
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 0,
+ c->det,
+ BRW_MATH_DATA_SCALAR,
+ BRW_MATH_PRECISION_FULL);
+
+}
+
+
+/* Two attributes packed into a wide register. Figure out if either
+ * or both of them need linear/perspective interpolation. Constant
+ * regs are left as-is.
+ */
+static GLboolean calculate_masks( struct brw_sf_compile *c,
+ GLuint reg,
+ GLushort *pc,
+ GLushort *pc_persp,
+ GLushort *pc_linear)
+{
+ GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
+ GLuint persp_mask = c->key.persp_attrs;
+ GLuint linear_mask = (c->key.persp_attrs | c->key.linear_attrs);
+
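+ /* The masks are consumed by brw_set_predicate_control_flag_value():
+ * the low nibble enables the four channels of the first attribute in
+ * this register pair, the high nibble those of the second.
+ */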
+ *pc_persp = 0;
+ *pc_linear = 0;
+ *pc = 0xf;
+
+ if (persp_mask & (1 << (reg*2)))
+ *pc_persp = 0xf;
+
+ if (linear_mask & (1 << (reg*2)))
+ *pc_linear = 0xf;
+
+ /* Maybe only process one attribute on the final round:
+ */
+ if (reg*2+1 < c->nr_setup_attrs) {
+ *pc |= 0xf0;
+
+ if (persp_mask & (1 << (reg*2+1)))
+ *pc_persp |= 0xf0;
+
+ if (linear_mask & (1 << (reg*2+1)))
+ *pc_linear |= 0xf0;
+ }
+
+ return is_last_attr;
+}
+
+
+void brw_emit_null_setup( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+
+ /* m0 is implicitly copied from r0 in the send instruction:
+ */
+ brw_urb_WRITE(p,
+ brw_null_reg(),
+ 0,
+ brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
+ 0, /* allocate */
+ 1, /* used */
+ 1, /* msg len */
+ 0, /* response len */
+ 1, /* eot */
+ 1, /* writes complete */
+ 0, /* offset */
+ BRW_URB_SWIZZLE_TRANSPOSE);
+}
+
+void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+ c->nr_verts = 3;
+
+ if (allocate)
+ alloc_regs(c);
+
+ invert_det(c);
+ copy_z_inv_w(c);
+
+ if (c->key.do_twoside_color)
+ do_twoside_color(c);
+
+ if (c->key.do_flat_shading)
+ do_flatshade_triangle(c);
+
+
+ for (i = 0; i < c->nr_setup_regs; i++)
+ {
+ /* Pair of incoming attributes:
+ */
+ struct brw_reg a0 = offset(c->vert[0], i);
+ struct brw_reg a1 = offset(c->vert[1], i);
+ struct brw_reg a2 = offset(c->vert[2], i);
+ GLushort pc, pc_persp, pc_linear;
+ GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+ if (pc_persp)
+ {
+ brw_set_predicate_control_flag_value(p, pc_persp);
+ brw_MUL(p, a0, a0, c->inv_w[0]);
+ brw_MUL(p, a1, a1, c->inv_w[1]);
+ brw_MUL(p, a2, a2, c->inv_w[2]);
+ }
+
+
+ /* Calculate coefficients for interpolated values:
+ */
+ if (pc_linear)
+ {
+ brw_set_predicate_control_flag_value(p, pc_linear);
+
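+ /* Standard plane-equation setup from the attribute deltas:
+ *   Cx = ((a1-a0)*dy2 - (a2-a0)*dy0) / det
+ *   Cy = ((a2-a0)*dx0 - (a1-a0)*dx2) / det
+ */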
+ brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+ brw_ADD(p, c->a2_sub_a0, a2, negate(a0));
+
+ /* calculate dA/dx
+ */
+ brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2);
+ brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0));
+ brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+
+ /* calculate dA/dy
+ */
+ brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0);
+ brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2));
+ brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+ }
+
+ {
+ brw_set_predicate_control_flag_value(p, pc);
+ /* start point for interpolation
+ */
+ brw_MOV(p, c->m3C0, a0);
+
+ /* Copy m0..m3 to URB. m0 is implicitly copied from r0 in
+ * the send instruction:
+ */
+ brw_urb_WRITE(p,
+ brw_null_reg(),
+ 0,
+ brw_vec8_grf(0, 0), /* r0, will be copied to m0 */
+ 0, /* allocate */
+ 1, /* used */
+ 4, /* msg len */
+ 0, /* response len */
+ last, /* eot */
+ last, /* writes complete */
+ i*4, /* offset */
+ BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */
+ }
+ }
+}
+
+
+
+void brw_emit_line_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+
+ c->nr_verts = 2;
+
+ if (allocate)
+ alloc_regs(c);
+
+ invert_det(c);
+ copy_z_inv_w(c);
+
+ if (c->key.do_flat_shading)
+ do_flatshade_line(c);
+
+ for (i = 0; i < c->nr_setup_regs; i++)
+ {
+ /* Pair of incoming attributes:
+ */
+ struct brw_reg a0 = offset(c->vert[0], i);
+ struct brw_reg a1 = offset(c->vert[1], i);
+ GLushort pc, pc_persp, pc_linear;
+ GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+ if (pc_persp)
+ {
+ brw_set_predicate_control_flag_value(p, pc_persp);
+ brw_MUL(p, a0, a0, c->inv_w[0]);
+ brw_MUL(p, a1, a1, c->inv_w[1]);
+ }
+
+ /* Calculate coefficients for position, color:
+ */
+ if (pc_linear) {
+ brw_set_predicate_control_flag_value(p, pc_linear);
+
+ brw_ADD(p, c->a1_sub_a0, a1, negate(a0));
+
+ brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0);
+ brw_MUL(p, c->m1Cx, c->tmp, c->inv_det);
+
+ brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0);
+ brw_MUL(p, c->m2Cy, c->tmp, c->inv_det);
+ }
+
+ {
+ brw_set_predicate_control_flag_value(p, pc);
+
+ /* start point for interpolation
+ */
+ brw_MOV(p, c->m3C0, a0);
+
+ /* Copy m0..m3 to URB.
+ */
+ brw_urb_WRITE(p,
+ brw_null_reg(),
+ 0,
+ brw_vec8_grf(0, 0),
+ 0, /* allocate */
+ 1, /* used */
+ 4, /* msg len */
+ 0, /* response len */
+ last, /* eot */
+ last, /* writes complete */
+ i*4, /* urb destination offset */
+ BRW_URB_SWIZZLE_TRANSPOSE);
+ }
+ }
+}
+
+void brw_emit_point_sprite_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+ c->nr_verts = 1;
+
+ if (allocate)
+ alloc_regs(c);
+
+ copy_z_inv_w(c);
+
+ for (i = 0; i < c->nr_setup_regs; i++)
+ {
+ /* XXX: only seems to check point_coord_replace_attrs for every
+ * second attribute?!?
+ */
+ boolean coord_replace = !!(c->key.point_coord_replace_attrs & (1<<(2*i)));
+ struct brw_reg a0 = offset(c->vert[0], i);
+ GLushort pc, pc_persp, pc_linear;
+ GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+ if (pc_persp)
+ {
+ if (coord_replace) {
+ brw_set_predicate_control_flag_value(p, pc_persp);
+ brw_MUL(p, a0, a0, c->inv_w[0]);
+ }
+ }
+
+ if (coord_replace) {
+ /* Calculate 1.0/PointWidth */
+ brw_math(&c->func,
+ c->tmp,
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 0,
+ c->dx0,
+ BRW_MATH_DATA_SCALAR,
+ BRW_MATH_PRECISION_FULL);
+
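+ /* For the replaced coordinate, set up constant gradients of
+ * 1/pointwidth (times 1/w, like the other perspective attributes);
+ * a lower-left sprite origin flips the T gradient and starts T at 1
+ * instead of 0.
+ */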
+ if (c->key.sprite_origin_lower_left) {
+ brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+ brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+ brw_MUL(p, c->m2Cy, c->tmp, negate(c->inv_w[0]));
+ brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+ }
+ else {
+ brw_MUL(p, c->m1Cx, c->tmp, c->inv_w[0]);
+ brw_MOV(p, vec1(suboffset(c->m1Cx, 1)), brw_imm_f(0.0));
+ brw_MUL(p, c->m2Cy, c->tmp, c->inv_w[0]);
+ brw_MOV(p, vec1(suboffset(c->m2Cy, 0)), brw_imm_f(0.0));
+ }
+ }
+ else {
+ brw_MOV(p, c->m1Cx, brw_imm_ud(0));
+ brw_MOV(p, c->m2Cy, brw_imm_ud(0));
+ }
+
+ {
+ brw_set_predicate_control_flag_value(p, pc);
+ if (coord_replace) {
+ if (c->key.sprite_origin_lower_left) {
+ brw_MUL(p, c->m3C0, c->inv_w[0], brw_imm_f(1.0));
+ brw_MOV(p, vec1(suboffset(c->m3C0, 0)), brw_imm_f(0.0));
+ }
+ else {
+ brw_MOV(p, c->m3C0, brw_imm_f(0.0));
+ }
+ }
+ else {
+ brw_MOV(p, c->m3C0, a0); /* constant value */
+ }
+
+ /* Copy m0..m3 to URB.
+ */
+ brw_urb_WRITE(p,
+ brw_null_reg(),
+ 0,
+ brw_vec8_grf(0, 0),
+ 0, /* allocate */
+ 1, /* used */
+ 4, /* msg len */
+ 0, /* response len */
+ last, /* eot */
+ last, /* writes complete */
+ i*4, /* urb destination offset */
+ BRW_URB_SWIZZLE_TRANSPOSE);
+ }
+ }
+}
+
+/* Points setup - several simplifications as all attributes are
+ * constant across the face of the point (point sprites excluded!)
+ */
+void brw_emit_point_setup( struct brw_sf_compile *c, GLboolean allocate)
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+ c->nr_verts = 1;
+
+ if (allocate)
+ alloc_regs(c);
+
+ copy_z_inv_w(c);
+
+ brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */
+ brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */
+
+ for (i = 0; i < c->nr_setup_regs; i++)
+ {
+ struct brw_reg a0 = offset(c->vert[0], i);
+ GLushort pc, pc_persp, pc_linear;
+ GLboolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear);
+
+ if (pc_persp)
+ {
+ /* This seems odd as the values are all constant, but the
+ * fragment shader will be expecting it:
+ */
+ brw_set_predicate_control_flag_value(p, pc_persp);
+ brw_MUL(p, a0, a0, c->inv_w[0]);
+ }
+
+
+ /* The delta values are always zero; just send the starting
+ * coordinate. Again, this is to fit in with the interpolation
+ * code in the fragment shader.
+ */
+ {
+ brw_set_predicate_control_flag_value(p, pc);
+
+ brw_MOV(p, c->m3C0, a0); /* constant value */
+
+ /* Copy m0..m3 to URB.
+ */
+ brw_urb_WRITE(p,
+ brw_null_reg(),
+ 0,
+ brw_vec8_grf(0, 0),
+ 0, /* allocate */
+ 1, /* used */
+ 4, /* msg len */
+ 0, /* response len */
+ last, /* eot */
+ last, /* writes complete */
+ i*4, /* urb destination offset */
+ BRW_URB_SWIZZLE_TRANSPOSE);
+ }
+ }
+}
+
+void brw_emit_anyprim_setup( struct brw_sf_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg ip = brw_ip_reg();
+ struct brw_reg payload_prim = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0);
+ struct brw_reg payload_attr = get_element_ud(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), 0);
+ struct brw_reg primmask;
+ struct brw_instruction *jmp;
+ struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+ GLuint saveflag;
+
+ c->nr_verts = 3;
+ alloc_regs(c);
+
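+ /* Build a one-hot mask from the primitive type in the thread payload,
+ * test it against each primitive class in turn, and branch to the
+ * matching setup subroutine; plain points are handled by the
+ * fall-through at the end.
+ */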
+ primmask = retype(get_element(c->tmp, 0), BRW_REGISTER_TYPE_UD);
+
+ brw_MOV(p, primmask, brw_imm_ud(1));
+ brw_SHL(p, primmask, primmask, payload_prim);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+ brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_TRILIST) |
+ (1<<_3DPRIM_TRISTRIP) |
+ (1<<_3DPRIM_TRIFAN) |
+ (1<<_3DPRIM_TRISTRIP_REVERSE) |
+ (1<<_3DPRIM_POLYGON) |
+ (1<<_3DPRIM_RECTLIST) |
+ (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
+ jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+ {
+ saveflag = p->flag_value;
+ brw_push_insn_state(p);
+ brw_emit_tri_setup( c, GL_FALSE );
+ brw_pop_insn_state(p);
+ p->flag_value = saveflag;
+ /* note - thread killed in subroutine, so must
+ * restore the flag which is changed when building
+ * the subroutine. fix #13240
+ */
+ }
+ brw_land_fwd_jump(p, jmp);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+ brw_AND(p, v1_null_ud, primmask, brw_imm_ud((1<<_3DPRIM_LINELIST) |
+ (1<<_3DPRIM_LINESTRIP) |
+ (1<<_3DPRIM_LINELOOP) |
+ (1<<_3DPRIM_LINESTRIP_CONT) |
+ (1<<_3DPRIM_LINESTRIP_BF) |
+ (1<<_3DPRIM_LINESTRIP_CONT_BF)));
+ jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+ {
+ saveflag = p->flag_value;
+ brw_push_insn_state(p);
+ brw_emit_line_setup( c, GL_FALSE );
+ brw_pop_insn_state(p);
+ p->flag_value = saveflag;
+ /* note - thread killed in subroutine */
+ }
+ brw_land_fwd_jump(p, jmp);
+
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+ brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
+ jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+ {
+ saveflag = p->flag_value;
+ brw_push_insn_state(p);
+ brw_emit_point_sprite_setup( c, GL_FALSE );
+ brw_pop_insn_state(p);
+ p->flag_value = saveflag;
+ }
+ brw_land_fwd_jump(p, jmp);
+
+ brw_emit_point_setup( c, GL_FALSE );
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_sf_state.c b/src/gallium/drivers/i965/brw_sf_state.c
new file mode 100644
index 00000000000..25dc2b52e07
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_sf_state.c
@@ -0,0 +1,333 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+
+#include "pipe/p_state.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
+
+static enum pipe_error upload_sf_vp(struct brw_context *brw)
+{
+ const struct pipe_viewport_state *vp = &brw->curr.viewport;
+ const struct pipe_scissor_state *scissor = &brw->curr.scissor;
+ struct brw_sf_viewport sfv;
+ enum pipe_error ret;
+
+ memset(&sfv, 0, sizeof(sfv));
+
+ /* PIPE_NEW_VIEWPORT, PIPE_NEW_SCISSOR */
+
+ sfv.viewport.m00 = vp->scale[0];
+ sfv.viewport.m11 = vp->scale[1];
+ sfv.viewport.m22 = vp->scale[2];
+ sfv.viewport.m30 = vp->translate[0];
+ sfv.viewport.m31 = vp->translate[1];
+ sfv.viewport.m32 = vp->translate[2];
+
+ sfv.scissor.xmin = scissor->minx;
+ sfv.scissor.xmax = scissor->maxx - 1; /* ? */
+ sfv.scissor.ymin = scissor->miny;
+ sfv.scissor.ymax = scissor->maxy - 1; /* ? */
+
+ ret = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0,
+ &brw->sf.vp_bo );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_sf_vp = {
+ .dirty = {
+ .mesa = (PIPE_NEW_VIEWPORT |
+ PIPE_NEW_SCISSOR),
+ .brw = 0,
+ .cache = 0
+ },
+ .prepare = upload_sf_vp
+};
+
+struct brw_sf_unit_key {
+ unsigned int total_grf;
+ unsigned int urb_entry_read_length;
+ unsigned int nr_urb_entries, urb_size, sfsize;
+
+ unsigned scissor:1;
+ unsigned line_smooth:1;
+ unsigned point_sprite:1;
+ unsigned point_attenuated:1;
+ unsigned front_face:2;
+ unsigned cull_mode:2;
+ unsigned flatshade_first:1;
+ unsigned gl_rasterization_rules:1;
+ unsigned line_last_pixel_enable:1;
+ float line_width;
+ float point_size;
+};
+
+static void
+sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
+{
+ const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
+ memset(key, 0, sizeof(*key));
+
+ /* CACHE_NEW_SF_PROG */
+ key->total_grf = brw->sf.prog_data->total_grf;
+ key->urb_entry_read_length = brw->sf.prog_data->urb_read_length;
+
+ /* BRW_NEW_URB_FENCE */
+ key->nr_urb_entries = brw->urb.nr_sf_entries;
+ key->urb_size = brw->urb.vsize;
+ key->sfsize = brw->urb.sfsize;
+
+ /* PIPE_NEW_RAST */
+ key->scissor = rast->scissor;
+ key->front_face = rast->front_winding;
+ key->cull_mode = rast->cull_mode;
+ key->line_smooth = rast->line_smooth;
+ key->line_width = rast->line_width;
+ key->flatshade_first = rast->flatshade_first;
+ key->line_last_pixel_enable = rast->line_last_pixel;
+ key->gl_rasterization_rules = rast->gl_rasterization_rules;
+
+ key->point_sprite = rast->point_sprite;
+ key->point_attenuated = rast->point_size_per_vertex;
+
+ key->point_size = CLAMP(rast->point_size,
+ rast->point_size_min,
+ rast->point_size_max);
+}
+
+static enum pipe_error
+sf_unit_create_from_key(struct brw_context *brw,
+ struct brw_sf_unit_key *key,
+ struct brw_winsys_reloc *reloc,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_sf_unit_state sf;
+ enum pipe_error ret;
+ int chipset_max_threads;
+ memset(&sf, 0, sizeof(sf));
+
+ sf.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+ /* reloc */
+ sf.thread0.kernel_start_pointer = 0;
+
+ sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+ sf.thread3.dispatch_grf_start_reg = 3;
+
+ if (BRW_IS_IGDNG(brw))
+ sf.thread3.urb_entry_read_offset = 3;
+ else
+ sf.thread3.urb_entry_read_offset = 1;
+
+ sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
+
+ sf.thread4.nr_urb_entries = key->nr_urb_entries;
+ sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
+
+ /* Each SF thread produces 1 PUE, and there can be up to 24 (pre-IGDNG)
+ * or 48 (IGDNG) threads.
+ */
+ if (BRW_IS_IGDNG(brw))
+ chipset_max_threads = 48;
+ else
+ chipset_max_threads = 24;
+
+ sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
+
+ if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+ sf.thread4.max_threads = 0;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ sf.thread4.stats_enable = 1;
+
+ /* CACHE_NEW_SF_VP */
+ /* reloc */
+ sf.sf5.sf_viewport_state_offset = 0;
+
+ sf.sf5.viewport_transform = 1;
+
+ if (key->scissor)
+ sf.sf6.scissor = 1;
+
+ if (key->front_face == PIPE_WINDING_CCW)
+ sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+ else
+ sf.sf5.front_winding = BRW_FRONTWINDING_CW;
+
+ switch (key->cull_mode) {
+ case PIPE_WINDING_CCW:
+ case PIPE_WINDING_CW:
+ sf.sf6.cull_mode = (key->front_face == key->cull_mode ?
+ BRW_CULLMODE_FRONT :
+ BRW_CULLMODE_BACK);
+ break;
+ case PIPE_WINDING_BOTH:
+ sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
+ break;
+ case PIPE_WINDING_NONE:
+ sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+ break;
+ default:
+ assert(0);
+ sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+ break;
+ }
+
+ /* _NEW_LINE */
+ /* XXX use ctx->Const.Min/MaxLineWidth here */
+ sf.sf6.line_width = CLAMP(key->line_width, 1.0, 5.0) * (1<<1);
+
+ sf.sf6.line_endcap_aa_region_width = 1;
+ if (key->line_smooth)
+ sf.sf6.aa_enable = 1;
+ else if (sf.sf6.line_width <= 0x2)
+ sf.sf6.line_width = 0;
+
+ /* XXX: gl_rasterization_rules? something else?
+ */
+ sf.sf6.point_rast_rule = 1; /* BRW_RASTRULE_UPPER_RIGHT */
+
+ /* XXX clamp max depends on AA vs. non-AA */
+
+ /* _NEW_POINT */
+ sf.sf7.sprite_point = key->point_sprite;
+ sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
+ sf.sf7.use_point_size_state = !key->point_attenuated;
+ sf.sf7.aa_line_distance_mode = 0;
+
+ /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
+ */
+ if (!key->flatshade_first) {
+ sf.sf7.trifan_pv = 2;
+ sf.sf7.linestrip_pv = 1;
+ sf.sf7.tristrip_pv = 2;
+ } else {
+ sf.sf7.trifan_pv = 1;
+ sf.sf7.linestrip_pv = 0;
+ sf.sf7.tristrip_pv = 0;
+ }
+
+ sf.sf7.line_last_pixel_enable = key->line_last_pixel_enable;
+
+ /* Set bias for OpenGL rasterization rules:
+ */
+ if (key->gl_rasterization_rules) {
+ sf.sf6.dest_org_vbias = 0x8;
+ sf.sf6.dest_org_hbias = 0x8;
+ }
+ else {
+ sf.sf6.dest_org_vbias = 0x0;
+ sf.sf6.dest_org_hbias = 0x0;
+ }
+
+ ret = brw_upload_cache(&brw->cache, BRW_SF_UNIT,
+ key, sizeof(*key),
+ reloc, 2,
+ &sf, sizeof(sf),
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+
+ return PIPE_OK;
+}
+
+static enum pipe_error upload_sf_unit( struct brw_context *brw )
+{
+ struct brw_sf_unit_key key;
+ struct brw_winsys_reloc reloc[2];
+ unsigned total_grf;
+ unsigned viewport_transform;
+ unsigned front_winding;
+ enum pipe_error ret;
+
+ sf_unit_populate_key(brw, &key);
+
+ /* XXX: cut this crap and pre calculate the key:
+ */
+ total_grf = (align(key.total_grf, 16) / 16 - 1);
+ viewport_transform = 1;
+ front_winding = (key.front_face == PIPE_WINDING_CCW ?
+ BRW_FRONTWINDING_CCW :
+ BRW_FRONTWINDING_CW);
+
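+ /* The low bits of each relocation delta carry state that shares a
+ * dword with the pointer in the hardware packet: grf_reg_count in
+ * thread0, front_winding and viewport_transform in sf5.
+ */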
+ /* Emit SF program relocation */
+ make_reloc(&reloc[0],
+ BRW_USAGE_STATE,
+ total_grf << 1,
+ offsetof(struct brw_sf_unit_state, thread0),
+ brw->sf.prog_bo);
+
+ /* Emit SF viewport relocation */
+ make_reloc(&reloc[1],
+ BRW_USAGE_STATE,
+ front_winding | (viewport_transform << 1),
+ offsetof(struct brw_sf_unit_state, sf5),
+ brw->sf.vp_bo);
+
+
+ if (brw_search_cache(&brw->cache, BRW_SF_UNIT,
+ &key, sizeof(key),
+ reloc, 2,
+ NULL,
+ &brw->sf.state_bo))
+ return PIPE_OK;
+
+
+ ret = sf_unit_create_from_key(brw, &key,
+ reloc,
+ &brw->sf.state_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_sf_unit = {
+ .dirty = {
+ .mesa = (PIPE_NEW_RAST),
+ .brw = BRW_NEW_URB_FENCE,
+ .cache = (CACHE_NEW_SF_VP |
+ CACHE_NEW_SF_PROG)
+ },
+ .prepare = upload_sf_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_state.h b/src/gallium/drivers/i965/brw_state.h
new file mode 100644
index 00000000000..d2bbd0123d1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state.h
@@ -0,0 +1,174 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_STATE_H
+#define BRW_STATE_H
+
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+
+#include "brw_context.h"
+
+static INLINE void
+brw_add_validated_bo(struct brw_context *brw, struct brw_winsys_buffer *bo)
+{
+ assert(brw->state.validated_bo_count < Elements(brw->state.validated_bos));
+
+ if (bo != NULL) {
+ bo_reference( &brw->state.validated_bos[brw->state.validated_bo_count++],
+ bo );
+ }
+}
+
+const struct brw_tracked_state brw_blend_constant_color;
+const struct brw_tracked_state brw_cc_unit;
+const struct brw_tracked_state brw_cc_vp;
+const struct brw_tracked_state brw_clip_prog;
+const struct brw_tracked_state brw_clip_unit;
+const struct brw_tracked_state brw_curbe_buffer;
+const struct brw_tracked_state brw_curbe_offsets;
+const struct brw_tracked_state brw_invarient_state;
+const struct brw_tracked_state brw_gs_prog;
+const struct brw_tracked_state brw_gs_unit;
+const struct brw_tracked_state brw_line_stipple;
+const struct brw_tracked_state brw_aa_line_parameters;
+const struct brw_tracked_state brw_pipelined_state_pointers;
+const struct brw_tracked_state brw_binding_table_pointers;
+const struct brw_tracked_state brw_depthbuffer;
+const struct brw_tracked_state brw_polygon_stipple;
+const struct brw_tracked_state brw_program_parameters;
+const struct brw_tracked_state brw_recalculate_urb_fence;
+const struct brw_tracked_state brw_sf_prog;
+const struct brw_tracked_state brw_sf_unit;
+const struct brw_tracked_state brw_sf_vp;
+const struct brw_tracked_state brw_state_base_address;
+const struct brw_tracked_state brw_urb_fence;
+const struct brw_tracked_state brw_vertex_state;
+const struct brw_tracked_state brw_vs_surfaces;
+const struct brw_tracked_state brw_vs_prog;
+const struct brw_tracked_state brw_vs_unit;
+const struct brw_tracked_state brw_wm_input_sizes;
+const struct brw_tracked_state brw_wm_prog;
+const struct brw_tracked_state brw_wm_samplers;
+const struct brw_tracked_state brw_wm_constant_surface;
+const struct brw_tracked_state brw_wm_surfaces;
+const struct brw_tracked_state brw_wm_unit;
+
+const struct brw_tracked_state brw_psp_urb_cbs;
+
+const struct brw_tracked_state brw_pipe_control;
+
+const struct brw_tracked_state brw_drawing_rect;
+const struct brw_tracked_state brw_indices;
+const struct brw_tracked_state brw_vertices;
+const struct brw_tracked_state brw_index_buffer;
+
+
+/***********************************************************************
+ * brw_state.c
+ */
+int brw_validate_state(struct brw_context *brw);
+int brw_upload_state(struct brw_context *brw);
+void brw_init_state(struct brw_context *brw);
+void brw_destroy_state(struct brw_context *brw);
+
+/***********************************************************************
+ * brw_state_cache.c
+ */
+enum pipe_error brw_cache_data(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *data,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ struct brw_winsys_buffer **bo_out );
+
+enum pipe_error brw_cache_data_sz(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *data,
+ GLuint data_size,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ struct brw_winsys_buffer **bo_out);
+
+enum pipe_error brw_upload_cache( struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_sz,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ const void *data,
+ GLuint data_sz,
+ const void *aux,
+ void *aux_return ,
+ struct brw_winsys_buffer **bo_out);
+
+boolean brw_search_cache( struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_size,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ void *aux_return,
+ struct brw_winsys_buffer **bo_out);
+
+void brw_state_cache_check_size( struct brw_context *brw );
+
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
+void brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo);
+
+/***********************************************************************
+ * brw_state_batch.c
+ */
+#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->batch, (s), sizeof(*(s)), IGNORE_CLIPRECTS)
+#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
+
+GLboolean brw_cached_batch_struct( struct brw_context *brw,
+ const void *data,
+ GLuint sz );
+void brw_destroy_batch_cache( struct brw_context *brw );
+void brw_clear_batch_cache( struct brw_context *brw );
+
+/***********************************************************************
+ * brw_wm_surface_state.c
+ */
+
+/***********************************************************************
+ * brw_state_debug.c
+ */
+void brw_update_dirty_counts( unsigned mesa,
+ unsigned brw,
+ unsigned cache );
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_state_batch.c b/src/gallium/drivers/i965/brw_state_batch.c
new file mode 100644
index 00000000000..7d212e5c247
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_batch.c
@@ -0,0 +1,98 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+
+
+
+/* A facility similar to the data caching code above, which aims to
+ * prevent identical commands being issued repeatedly.
+ */
+GLboolean brw_cached_batch_struct( struct brw_context *brw,
+ const void *data,
+ GLuint sz )
+{
+ struct brw_cached_batch_item *item = brw->cached_batch_items;
+ struct header *newheader = (struct header *)data;
+
+ if (brw->flags.always_emit_state) {
+ brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
+ return GL_TRUE;
+ }
+
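+ /* Look for an existing item with the same opcode: if the stored copy
+ * matches byte-for-byte, skip the emit entirely; otherwise resize the
+ * stored copy if needed and re-emit.
+ */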
+ while (item) {
+ if (item->header->opcode == newheader->opcode) {
+ if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
+ return GL_FALSE;
+ if (item->sz != sz) {
+ FREE(item->header);
+ item->header = MALLOC(sz);
+ item->sz = sz;
+ }
+ goto emit;
+ }
+ item = item->next;
+ }
+
+ assert(!item);
+ item = CALLOC_STRUCT(brw_cached_batch_item);
+ item->header = MALLOC(sz);
+ item->sz = sz;
+ item->next = brw->cached_batch_items;
+ brw->cached_batch_items = item;
+
+ emit:
+ memcpy(item->header, newheader, sz);
+ brw_batchbuffer_data(brw->batch, data, sz, IGNORE_CLIPRECTS);
+ return GL_TRUE;
+}
+
+void brw_clear_batch_cache( struct brw_context *brw )
+{
+ struct brw_cached_batch_item *item = brw->cached_batch_items;
+
+ while (item) {
+ struct brw_cached_batch_item *next = item->next;
+ FREE((void *)item->header);
+ FREE(item);
+ item = next;
+ }
+
+ brw->cached_batch_items = NULL;
+}
+
+void brw_destroy_batch_cache( struct brw_context *brw )
+{
+ brw_clear_batch_cache(brw);
+}
diff --git a/src/gallium/drivers/i965/brw_state_cache.c b/src/gallium/drivers/i965/brw_state_cache.c
new file mode 100644
index 00000000000..16b643ceb28
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_cache.c
@@ -0,0 +1,617 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+/** @file brw_state_cache.c
+ *
+ * This file implements a simple static state cache for 965. The consumers
+ * can query the hash table of state using a cache_id, opaque key data,
+ * and list of buffers that will be used in relocations, and receive the
+ * corresponding state buffer object of state (plus associated auxiliary
+ * data) in return.
+ *
+ * The inner workings are a simple hash table based on a rotate-and-XOR
+ * hash of the key data.
+ * The cache_id and relocation target buffers associated with the state
+ * buffer are included as auxiliary key data, but are not part of the hash
+ * value (this should be fixed, but will likely be fixed instead by making
+ * consumers use structured keys).
+ *
+ * Replacement is not implemented. Instead, when the cache gets too big, at
+ * a safe point (unlock) we throw out all of the cache data and let it
+ * regenerate for the next rendering operation.
+ *
+ * The reloc structs need to be included as key data, otherwise the
+ * non-unique values stuffed in the offset in key data through
+ * brw_cache_data() may result in successful probe for state buffers
+ * even when the buffer being referenced doesn't match. The result would be
+ * that the same state cache entry is used twice for different buffers,
+ * only one of the two buffers referenced gets put into the offset, and the
+ * incorrect program is run for the other instance.
+ */
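+/* A minimal sketch of the intended consumer pattern (mirroring
+ * upload_sf_prog elsewhere in this series): probe the cache first and
+ * only build and upload new state on a miss.
+ *
+ *    if (brw_search_cache(&brw->cache, BRW_SF_PROG, &key, sizeof(key),
+ *                         NULL, 0, &brw->sf.prog_data, &brw->sf.prog_bo))
+ *       return PIPE_OK;
+ *
+ *    ret = compile_sf_prog(brw, &key, &brw->sf.prog_bo);
+ */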
+#include "util/u_memory.h"
+
+#include "brw_debug.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+
+/* XXX: Fixme - have to include these to get the sizes of the prog_key
+ * structs:
+ */
+#include "brw_wm.h"
+#include "brw_vs.h"
+#include "brw_clip.h"
+#include "brw_sf.h"
+#include "brw_gs.h"
+
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+ struct brw_winsys_reloc *relocs, GLuint nr_relocs)
+{
+ GLuint *ikey = (GLuint *)key;
+ GLuint hash = 0, i;
+
+ assert(key_size % 4 == 0);
+
+ /* I'm sure this can be improved on:
+ */
+ for (i = 0; i < key_size/4; i++) {
+ hash ^= ikey[i];
+ hash = (hash << 5) | (hash >> 27);
+ }
+
+ /* Include the BO pointers as key data as well */
+ ikey = (GLuint *)relocs;
+ key_size = nr_relocs * sizeof(struct brw_winsys_reloc);
+ for (i = 0; i < key_size/4; i++) {
+ hash ^= ikey[i];
+ hash = (hash << 5) | (hash >> 27);
+ }
+
+ return hash;
+}
+
+
+/**
+ * Marks a new buffer as being chosen for the given cache id.
+ */
+static void
+update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
+ struct brw_winsys_buffer *bo)
+{
+ if (bo == cache->last_bo[cache_id])
+ return; /* no change */
+
+ bo_reference( &cache->last_bo[cache_id], bo );
+
+ cache->brw->state.dirty.cache |= 1 << cache_id;
+}
+
+
+static struct brw_cache_item *
+search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+ GLuint hash, const void *key, GLuint key_size,
+ struct brw_winsys_reloc *relocs, GLuint nr_relocs)
+{
+ struct brw_cache_item *c;
+
+#if 0
+ int bucketcount = 0;
+
+ for (c = cache->items[hash % cache->size]; c; c = c->next)
+ bucketcount++;
+
+ debug_printf("bucket %d/%d = %d/%d items\n", hash % cache->size,
+ cache->size, bucketcount, cache->n_items);
+#endif
+
+ for (c = cache->items[hash % cache->size]; c; c = c->next) {
+ if (c->cache_id == cache_id &&
+ c->hash == hash &&
+ c->key_size == key_size &&
+ memcmp(c->key, key, key_size) == 0 &&
+ c->nr_relocs == nr_relocs &&
+ memcmp(c->relocs, relocs, nr_relocs * sizeof *relocs) == 0)
+ return c;
+ }
+
+ return NULL;
+}
+
+
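+/* Triple the number of hash buckets and redistribute the existing items.
+ */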
+static void
+rehash(struct brw_cache *cache)
+{
+ struct brw_cache_item **items;
+ struct brw_cache_item *c, *next;
+ GLuint size, i;
+
+ size = cache->size * 3;
+ items = (struct brw_cache_item**) CALLOC(size, sizeof(*items));
+
+ for (i = 0; i < cache->size; i++)
+ for (c = cache->items[i]; c; c = next) {
+ next = c->next;
+ c->next = items[c->hash % size];
+ items[c->hash % size] = c;
+ }
+
+ FREE(cache->items);
+ cache->items = items;
+ cache->size = size;
+}
+
+
+/**
+ * Looks up the buffer object matching cache_id and key. Returns TRUE and
+ * references it into *bo_out on a hit, FALSE if nothing matches.
+ */
+boolean
+brw_search_cache(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_size,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ void *aux_return,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_cache_item *item;
+ GLuint hash = hash_key(key, key_size, relocs, nr_relocs);
+
+ item = search_cache(cache, cache_id, hash, key, key_size,
+ relocs, nr_relocs);
+
+ if (item) {
+ if (aux_return)
+ *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+
+ update_cache_last(cache, cache_id, item->bo);
+ bo_reference(bo_out, item->bo);
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+
+enum pipe_error
+brw_upload_cache( struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *key,
+ GLuint key_size,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ const void *data,
+ GLuint data_size,
+ const void *aux,
+ void *aux_return,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
+ GLuint hash = hash_key(key, key_size, relocs, nr_relocs);
+ GLuint relocs_size = nr_relocs * sizeof relocs[0];
+ GLuint aux_size = cache->aux_size[cache_id];
+ enum pipe_error ret;
+ void *tmp;
+ int i;
+
+ /* Create the buffer object to contain the data. For now, use a
+ * single buffer type to describe all cached state atoms. Later,
+ * may want to take advantage of hardware distinctions between
+ * these various entities.
+ */
+ ret = cache->sws->bo_alloc(cache->sws,
+ cache->buffer_type,
+ data_size, 1 << 6,
+ bo_out);
+ if (ret)
+ return ret;
+
+
+ /* Set up the memory containing the key, aux_data, and relocs */
+ tmp = MALLOC(key_size + aux_size + relocs_size);
+
+ memcpy(tmp, key, key_size);
+ memcpy((char *)tmp + key_size, aux, cache->aux_size[cache_id]);
+ memcpy((char *)tmp + key_size + aux_size, relocs, relocs_size);
+ for (i = 0; i < nr_relocs; i++) {
+ p_atomic_inc(&relocs[i].bo->reference.count);
+ }
+
+ item->cache_id = cache_id;
+ item->key = tmp;
+ item->hash = hash;
+ item->key_size = key_size;
+ item->relocs = (struct brw_winsys_reloc *)((char *)tmp + key_size + aux_size);
+ item->nr_relocs = nr_relocs;
+ bo_reference( &item->bo, *bo_out );
+ item->data_size = data_size;
+
+ if (cache->n_items > cache->size * 1.5)
+ rehash(cache);
+
+ hash %= cache->size;
+ item->next = cache->items[hash];
+ cache->items[hash] = item;
+ cache->n_items++;
+
+ if (aux_return) {
+ assert(cache->aux_size[cache_id]);
+ *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+ }
+
+ if (BRW_DEBUG & DEBUG_STATE)
+ debug_printf("upload %s: %d bytes to cache id %d\n",
+ cache->name[cache_id],
+ data_size, cache_id);
+
+ /* Copy data to the buffer */
+ ret = cache->sws->bo_subdata(item->bo,
+ cache_id,
+ 0, data_size, data,
+ relocs, nr_relocs);
+ if (ret)
+ return ret;
+
+ update_cache_last(cache, cache_id, item->bo);
+
+ return PIPE_OK;
+}
+
+
+/**
+ * This doesn't really work with aux data. Use search/upload instead
+ */
+enum pipe_error
+brw_cache_data_sz(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *data,
+ GLuint data_size,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_cache_item *item;
+ GLuint hash = hash_key(data, data_size, relocs, nr_relocs);
+
+ item = search_cache(cache, cache_id, hash, data, data_size,
+ relocs, nr_relocs);
+ if (item) {
+ update_cache_last(cache, cache_id, item->bo);
+
+ bo_reference(bo_out, item->bo);
+ return PIPE_OK;
+ }
+
+ return brw_upload_cache(cache, cache_id,
+ data, data_size,
+ relocs, nr_relocs,
+ data, data_size,
+ NULL, NULL,
+ bo_out);
+}
+
+
+/**
+ * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
+ *
+ * If nr_relocs is nonzero, brw_search_cache()/brw_upload_cache() would be
+ * better to use, as the potentially changing offsets in the data-used-as-key
+ * will result in excessive cache misses.
+ *
+ * XXX: above is no longer true -- can we remove some code?
+ */
+enum pipe_error
+brw_cache_data(struct brw_cache *cache,
+ enum brw_cache_id cache_id,
+ const void *data,
+ struct brw_winsys_reloc *relocs,
+ GLuint nr_relocs,
+ struct brw_winsys_buffer **bo_out)
+{
+ return brw_cache_data_sz(cache, cache_id, data, cache->key_size[cache_id],
+ relocs, nr_relocs, bo_out);
+}
+
+
+static void
+brw_init_cache_id(struct brw_cache *cache,
+ const char *name,
+ enum brw_cache_id id,
+ GLuint key_size,
+ GLuint aux_size)
+{
+ cache->name[id] = strdup(name);
+ cache->key_size[id] = key_size;
+ cache->aux_size[id] = aux_size;
+}
+
+
+static void
+brw_init_general_state_cache(struct brw_context *brw)
+{
+ struct brw_cache *cache = &brw->cache;
+
+ cache->brw = brw;
+ cache->sws = brw->sws;
+
+ cache->buffer_type = BRW_BUFFER_TYPE_GENERAL_STATE;
+
+ cache->size = 7;
+ cache->n_items = 0;
+ cache->items = (struct brw_cache_item **)
+ CALLOC(cache->size, sizeof(struct brw_cache_item *));
+
+ brw_init_cache_id(cache,
+ "CC_VP",
+ BRW_CC_VP,
+ sizeof(struct brw_cc_viewport),
+ 0);
+
+ brw_init_cache_id(cache,
+ "CC_UNIT",
+ BRW_CC_UNIT,
+ sizeof(struct brw_cc_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "WM_PROG",
+ BRW_WM_PROG,
+ sizeof(struct brw_wm_prog_key),
+ sizeof(struct brw_wm_prog_data));
+
+ brw_init_cache_id(cache,
+ "SAMPLER_DEFAULT_COLOR",
+ BRW_SAMPLER_DEFAULT_COLOR,
+ sizeof(struct brw_sampler_default_color),
+ 0);
+
+ brw_init_cache_id(cache,
+ "SAMPLER",
+ BRW_SAMPLER,
+ 0, /* variable key/data size */
+ 0);
+
+ brw_init_cache_id(cache,
+ "WM_UNIT",
+ BRW_WM_UNIT,
+ sizeof(struct brw_wm_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "SF_PROG",
+ BRW_SF_PROG,
+ sizeof(struct brw_sf_prog_key),
+ sizeof(struct brw_sf_prog_data));
+
+ brw_init_cache_id(cache,
+ "SF_VP",
+ BRW_SF_VP,
+ sizeof(struct brw_sf_viewport),
+ 0);
+
+ brw_init_cache_id(cache,
+ "SF_UNIT",
+ BRW_SF_UNIT,
+ sizeof(struct brw_sf_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "VS_UNIT",
+ BRW_VS_UNIT,
+ sizeof(struct brw_vs_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "VS_PROG",
+ BRW_VS_PROG,
+ sizeof(struct brw_vs_prog_key),
+ sizeof(struct brw_vs_prog_data));
+
+ brw_init_cache_id(cache,
+ "CLIP_UNIT",
+ BRW_CLIP_UNIT,
+ sizeof(struct brw_clip_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "CLIP_PROG",
+ BRW_CLIP_PROG,
+ sizeof(struct brw_clip_prog_key),
+ sizeof(struct brw_clip_prog_data));
+
+ brw_init_cache_id(cache,
+ "GS_UNIT",
+ BRW_GS_UNIT,
+ sizeof(struct brw_gs_unit_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "GS_PROG",
+ BRW_GS_PROG,
+ sizeof(struct brw_gs_prog_key),
+ sizeof(struct brw_gs_prog_data));
+}
+
+
+static void
+brw_init_surface_state_cache(struct brw_context *brw)
+{
+ struct brw_cache *cache = &brw->surface_cache;
+
+ cache->brw = brw;
+ cache->sws = brw->sws;
+
+ cache->buffer_type = BRW_BUFFER_TYPE_SURFACE_STATE;
+
+ cache->size = 7;
+ cache->n_items = 0;
+ cache->items = (struct brw_cache_item **)
+ CALLOC(cache->size, sizeof(struct brw_cache_item *));
+
+ brw_init_cache_id(cache,
+ "SS_SURFACE",
+ BRW_SS_SURFACE,
+ sizeof(struct brw_surface_state),
+ 0);
+
+ brw_init_cache_id(cache,
+ "SS_SURF_BIND",
+ BRW_SS_SURF_BIND,
+ 0,
+ 0);
+}
+
+
+void
+brw_init_caches(struct brw_context *brw)
+{
+ brw_init_general_state_cache(brw);
+ brw_init_surface_state_cache(brw);
+}
+
+
+static void
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
+{
+ struct brw_cache_item *c, *next;
+ GLuint i;
+
+ if (BRW_DEBUG & DEBUG_STATE)
+ debug_printf("%s\n", __FUNCTION__);
+
+ for (i = 0; i < cache->size; i++) {
+ for (c = cache->items[i]; c; c = next) {
+ int j;
+
+ next = c->next;
+
+ for (j = 0; j < c->nr_relocs; j++)
+ bo_reference(&c->relocs[j].bo, NULL);
+
+ bo_reference(&c->bo, NULL);
+ FREE((void *)c->key);
+ FREE(c);
+ }
+ cache->items[i] = NULL;
+ }
+
+ cache->n_items = 0;
+
+ if (brw->curbe.last_buf) {
+ FREE(brw->curbe.last_buf);
+ brw->curbe.last_buf = NULL;
+ }
+
+ brw->state.dirty.mesa |= ~0;
+ brw->state.dirty.brw |= ~0;
+ brw->state.dirty.cache |= ~0;
+}
+
+/* Clear all entries from the cache that point to the given bo.
+ *
+ * This lets us release memory for reuse earlier for known-dead buffers,
+ * at the cost of walking the entire hash table.
+ */
+void
+brw_state_cache_bo_delete(struct brw_cache *cache, struct brw_winsys_buffer *bo)
+{
+ struct brw_cache_item **prev;
+ GLuint i;
+
+ if (BRW_DEBUG & DEBUG_STATE)
+ debug_printf("%s\n", __FUNCTION__);
+
+ for (i = 0; i < cache->size; i++) {
+ for (prev = &cache->items[i]; *prev;) {
+ struct brw_cache_item *c = *prev;
+
+ if (cache->sws->bo_references(c->bo, bo)) {
+ int j;
+
+ *prev = c->next;
+
+ for (j = 0; j < c->nr_relocs; j++)
+ bo_reference(&c->relocs[j].bo, NULL);
+
+ bo_reference(&c->bo, NULL);
+
+ FREE((void *)c->key);
+ FREE(c);
+ cache->n_items--;
+ } else {
+ prev = &c->next;
+ }
+ }
+ }
+}
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
+{
+ if (BRW_DEBUG & DEBUG_STATE)
+ debug_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+
+ /* un-tuned guess. We've got around 20 state objects for a total of around
+ * 32k, so 1000 of them is around 1.5MB.
+ */
+ if (brw->cache.n_items > 1000)
+ brw_clear_cache(brw, &brw->cache);
+
+ if (brw->surface_cache.n_items > 1000)
+ brw_clear_cache(brw, &brw->surface_cache);
+}
+
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
+{
+ GLuint i;
+
+ if (BRW_DEBUG & DEBUG_STATE)
+ debug_printf("%s\n", __FUNCTION__);
+
+ brw_clear_cache(brw, cache);
+ for (i = 0; i < BRW_MAX_CACHE; i++) {
+ bo_reference(&cache->last_bo[i], NULL);
+ FREE(cache->name[i]);
+ }
+ FREE(cache->items);
+ cache->items = NULL;
+ cache->size = 0;
+}
+
+
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+ brw_destroy_cache(brw, &brw->cache);
+ brw_destroy_cache(brw, &brw->surface_cache);
+}
diff --git a/src/gallium/drivers/i965/brw_state_debug.c b/src/gallium/drivers/i965/brw_state_debug.c
new file mode 100644
index 00000000000..049c278c93e
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_debug.c
@@ -0,0 +1,153 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+
+
+struct dirty_bit_map {
+ uint32_t bit;
+ char *name;
+ uint32_t count;
+};
+
+#define DEFINE_BIT(name) {name, #name, 0}
+
+static struct dirty_bit_map mesa_bits[] = {
+ DEFINE_BIT(PIPE_NEW_DEPTH_STENCIL_ALPHA),
+ DEFINE_BIT(PIPE_NEW_RAST),
+ DEFINE_BIT(PIPE_NEW_BLEND),
+ DEFINE_BIT(PIPE_NEW_VIEWPORT),
+ DEFINE_BIT(PIPE_NEW_SAMPLERS),
+ DEFINE_BIT(PIPE_NEW_VERTEX_BUFFER),
+ DEFINE_BIT(PIPE_NEW_VERTEX_ELEMENT),
+ DEFINE_BIT(PIPE_NEW_FRAGMENT_SHADER),
+ DEFINE_BIT(PIPE_NEW_VERTEX_SHADER),
+ DEFINE_BIT(PIPE_NEW_FRAGMENT_CONSTANTS),
+ DEFINE_BIT(PIPE_NEW_VERTEX_CONSTANTS),
+ DEFINE_BIT(PIPE_NEW_CLIP),
+ DEFINE_BIT(PIPE_NEW_INDEX_BUFFER),
+ DEFINE_BIT(PIPE_NEW_INDEX_RANGE),
+ DEFINE_BIT(PIPE_NEW_BLEND_COLOR),
+ DEFINE_BIT(PIPE_NEW_POLYGON_STIPPLE),
+ DEFINE_BIT(PIPE_NEW_FRAMEBUFFER_DIMENSIONS),
+ DEFINE_BIT(PIPE_NEW_DEPTH_BUFFER),
+ DEFINE_BIT(PIPE_NEW_COLOR_BUFFERS),
+ DEFINE_BIT(PIPE_NEW_QUERY),
+ DEFINE_BIT(PIPE_NEW_SCISSOR),
+ DEFINE_BIT(PIPE_NEW_BOUND_TEXTURES),
+ DEFINE_BIT(PIPE_NEW_NR_CBUFS),
+ {0, 0, 0}
+};
+
+static struct dirty_bit_map brw_bits[] = {
+ DEFINE_BIT(BRW_NEW_URB_FENCE),
+ DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
+ DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
+ DEFINE_BIT(BRW_NEW_INPUT_DIMENSIONS),
+ DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
+ DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
+ DEFINE_BIT(BRW_NEW_PRIMITIVE),
+ DEFINE_BIT(BRW_NEW_CONTEXT),
+ DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
+ DEFINE_BIT(BRW_NEW_PSP),
+ DEFINE_BIT(BRW_NEW_WM_SURFACES),
+ DEFINE_BIT(BRW_NEW_xxx),
+ DEFINE_BIT(BRW_NEW_INDICES),
+ {0, 0, 0}
+};
+
+static struct dirty_bit_map cache_bits[] = {
+ DEFINE_BIT(CACHE_NEW_CC_VP),
+ DEFINE_BIT(CACHE_NEW_CC_UNIT),
+ DEFINE_BIT(CACHE_NEW_WM_PROG),
+ DEFINE_BIT(CACHE_NEW_SAMPLER_DEFAULT_COLOR),
+ DEFINE_BIT(CACHE_NEW_SAMPLER),
+ DEFINE_BIT(CACHE_NEW_WM_UNIT),
+ DEFINE_BIT(CACHE_NEW_SF_PROG),
+ DEFINE_BIT(CACHE_NEW_SF_VP),
+ DEFINE_BIT(CACHE_NEW_SF_UNIT),
+ DEFINE_BIT(CACHE_NEW_VS_UNIT),
+ DEFINE_BIT(CACHE_NEW_VS_PROG),
+ DEFINE_BIT(CACHE_NEW_GS_UNIT),
+ DEFINE_BIT(CACHE_NEW_GS_PROG),
+ DEFINE_BIT(CACHE_NEW_CLIP_VP),
+ DEFINE_BIT(CACHE_NEW_CLIP_UNIT),
+ DEFINE_BIT(CACHE_NEW_CLIP_PROG),
+ DEFINE_BIT(CACHE_NEW_SURFACE),
+ DEFINE_BIT(CACHE_NEW_SURF_BIND),
+ {0, 0, 0}
+};
+
+
+static void
+brw_update_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+{
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ if (bit_map[i].bit == 0)
+ return;
+
+ if (bit_map[i].bit & bits)
+ bit_map[i].count++;
+ }
+}
+
+static void
+brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
+{
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ if (bit_map[i].bit == 0)
+ return;
+
+ debug_printf("0x%08x: %12d (%s)\n",
+ bit_map[i].bit, bit_map[i].count, bit_map[i].name);
+ }
+}
+
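+/* Accumulate per-bit hit counts for the three dirty-flag words and dump
+ * the running totals roughly once every 1000 calls.
+ */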
+void
+brw_update_dirty_counts( unsigned mesa,
+ unsigned brw,
+ unsigned cache )
+{
+ static int dirty_count = 0;
+
+ brw_update_dirty_count(mesa_bits, mesa);
+ brw_update_dirty_count(brw_bits, brw);
+ brw_update_dirty_count(cache_bits, cache);
+ if (dirty_count++ % 1000 == 0) {
+ brw_print_dirty_count(mesa_bits, mesa);
+ brw_print_dirty_count(brw_bits, brw);
+ brw_print_dirty_count(cache_bits, cache);
+ debug_printf("\n");
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_state_upload.c b/src/gallium/drivers/i965/brw_state_upload.c
new file mode 100644
index 00000000000..f8b91eff816
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_state_upload.c
@@ -0,0 +1,270 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_batchbuffer.h"
+#include "brw_debug.h"
+
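+/* Ordered list of state atoms.  Each atom advertises the PIPE_NEW_*,
+ * BRW_NEW_* and CACHE_NEW_* flags it depends on; brw_validate_state()
+ * runs the prepare() hook and brw_upload_state() runs the emit() hook of
+ * every atom whose flags intersect the current dirty set.  Later atoms
+ * may consume flags produced by earlier ones, so the ordering matters.
+ */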
+const struct brw_tracked_state *atoms[] =
+{
+/* &brw_wm_input_sizes, */
+ &brw_vs_prog,
+ &brw_gs_prog,
+ &brw_clip_prog,
+ &brw_sf_prog,
+ &brw_wm_prog,
+
+ /* Once all the programs are done, we know how large the urb
+ * entries need to be and can decide whether the urb layout needs
+ * to change.
+ */
+ &brw_curbe_offsets,
+ &brw_recalculate_urb_fence,
+
+ &brw_cc_vp,
+ &brw_cc_unit,
+
+ &brw_vs_surfaces, /* must do before unit */
+ /*&brw_wm_constant_surface,*/ /* must do before wm surfaces/bind bo */
+ &brw_wm_surfaces, /* must do before samplers and unit */
+ &brw_wm_samplers,
+
+ &brw_wm_unit,
+ &brw_sf_vp,
+ &brw_sf_unit,
+ &brw_vs_unit, /* always required, enabled or not */
+ &brw_clip_unit,
+ &brw_gs_unit,
+
+ /* Command packets:
+ */
+ &brw_invarient_state,
+ &brw_state_base_address,
+
+ &brw_binding_table_pointers,
+ &brw_blend_constant_color,
+
+ &brw_depthbuffer,
+ &brw_polygon_stipple,
+ &brw_line_stipple,
+
+ &brw_psp_urb_cbs,
+
+ &brw_drawing_rect,
+ &brw_indices,
+ &brw_index_buffer,
+ &brw_vertices,
+
+ &brw_curbe_buffer
+};
+
+
+void brw_init_state( struct brw_context *brw )
+{
+ brw_init_caches(brw);
+}
+
+
+void brw_destroy_state( struct brw_context *brw )
+{
+ brw_destroy_caches(brw);
+ brw_destroy_batch_cache(brw);
+}
+
+/***********************************************************************
+ */
+
+static GLboolean check_state( const struct brw_state_flags *a,
+ const struct brw_state_flags *b )
+{
+ return ((a->mesa & b->mesa) ||
+ (a->brw & b->brw) ||
+ (a->cache & b->cache));
+}
+
+static void accumulate_state( struct brw_state_flags *a,
+ const struct brw_state_flags *b )
+{
+ a->mesa |= b->mesa;
+ a->brw |= b->brw;
+ a->cache |= b->cache;
+}
+
+
+static void xor_states( struct brw_state_flags *result,
+ const struct brw_state_flags *a,
+ const struct brw_state_flags *b )
+{
+ result->mesa = a->mesa ^ b->mesa;
+ result->brw = a->brw ^ b->brw;
+ result->cache = a->cache ^ b->cache;
+}
+
+static void
+brw_clear_validated_bos(struct brw_context *brw)
+{
+ int i;
+
+ /* Clear the last round of validated bos */
+ for (i = 0; i < brw->state.validated_bo_count; i++) {
+ bo_reference(&brw->state.validated_bos[i], NULL);
+ }
+ brw->state.validated_bo_count = 0;
+}
+
+
+/***********************************************************************
+ * Emit all state:
+ */
+enum pipe_error brw_validate_state( struct brw_context *brw )
+{
+ struct brw_state_flags *state = &brw->state.dirty;
+ GLuint i;
+ int ret;
+
+ brw_clear_validated_bos(brw);
+ brw_add_validated_bo(brw, brw->batch->buf);
+
+ if (brw->flags.always_emit_state) {
+ state->mesa |= ~0;
+ state->brw |= ~0;
+ state->cache |= ~0;
+ }
+
+ if (state->mesa == 0 &&
+ state->cache == 0 &&
+ state->brw == 0)
+ return 0;
+
+ if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
+ brw_clear_batch_cache(brw);
+
+ /* do prepare stage for all atoms */
+ for (i = 0; i < Elements(atoms); i++) {
+ const struct brw_tracked_state *atom = atoms[i];
+
+ if (check_state(state, &atom->dirty)) {
+ if (atom->prepare) {
+ ret = atom->prepare(brw);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ /* Make sure the textures referenced by the current brw fragment
+ * program are actually present and valid.
+ * If they are not, we can experience GPU lock-ups.
+ */
+ {
+ const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+ if (fp) {
+ assert(fp->info.file_max[TGSI_FILE_SAMPLER] < (int)brw->curr.num_samplers);
+ /*assert(fp->info.texture_max <= brw->curr.num_textures);*/
+ }
+ }
+
+ return 0;
+}
+
+
+enum pipe_error brw_upload_state(struct brw_context *brw)
+{
+ struct brw_state_flags *state = &brw->state.dirty;
+ int ret;
+ int i;
+
+ brw_clear_validated_bos(brw);
+
+ if (BRW_DEBUG) {
+ /* Debug version which sanity-checks the state flags that each atom
+ * generates and checks, to help ensure the state atoms are ordered
+ * correctly in the list.
+ */
+ struct brw_state_flags examined, prev;
+ memset(&examined, 0, sizeof(examined));
+ prev = *state;
+
+ for (i = 0; i < Elements(atoms); i++) {
+ const struct brw_tracked_state *atom = atoms[i];
+ struct brw_state_flags generated;
+
+ assert(atom->dirty.mesa ||
+ atom->dirty.brw ||
+ atom->dirty.cache);
+
+ if (check_state(state, &atom->dirty)) {
+ if (atom->emit) {
+ ret = atom->emit( brw );
+ if (ret)
+ return ret;
+ }
+ }
+
+ accumulate_state(&examined, &atom->dirty);
+
+ /* generated = (prev ^ state)
+ * if (examined & generated)
+ * fail;
+ */
+ xor_states(&generated, &prev, state);
+ assert(!check_state(&examined, &generated));
+ prev = *state;
+ }
+ }
+ else {
+ for (i = 0; i < Elements(atoms); i++) {
+ const struct brw_tracked_state *atom = atoms[i];
+
+ if (check_state(state, &atom->dirty)) {
+ if (atom->emit) {
+ ret = atom->emit( brw );
+ if (ret)
+ return ret;
+ }
+ }
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_STATE) {
+ brw_update_dirty_counts( state->mesa,
+ state->brw,
+ state->cache );
+ }
+
+ /* Clear dirty flags:
+ */
+ memset(state, 0, sizeof(*state));
+ return 0;
+}
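+/* Note: brw_validate_state() runs the prepare() hooks and sanity-checks
+ * the bound samplers; brw_upload_state() then runs the emit() hooks and
+ * clears the dirty flags.  The two are presumably called back to back by
+ * the draw path (see brw_draw.c).
+ */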
diff --git a/src/gallium/drivers/i965/brw_structs.h b/src/gallium/drivers/i965/brw_structs.h
new file mode 100644
index 00000000000..bf10bc04de7
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs.h
@@ -0,0 +1,1576 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_STRUCTS_H
+#define BRW_STRUCTS_H
+
+#include "brw_types.h"
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF 16
+
+
+/* Command packets:
+ */
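+/* Generic packet header: a 16-bit dword-length field and a 16-bit opcode
+ * packed into a single dword.
+ */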
+struct header
+{
+ GLuint length:16;
+ GLuint opcode:16;
+};
+
+
+union header_union
+{
+ struct header bits;
+ GLuint dword;
+};
+
+struct brw_3d_control
+{
+ struct
+ {
+ GLuint length:8;
+ GLuint notify_enable:1;
+ GLuint pad:3;
+ GLuint wc_flush_enable:1;
+ GLuint depth_stall_enable:1;
+ GLuint operation:2;
+ GLuint opcode:16;
+ } header;
+
+ struct
+ {
+ GLuint pad:2;
+ GLuint dest_addr_type:1;
+ GLuint dest_addr:29;
+ } dest;
+
+ GLuint dword2;
+ GLuint dword3;
+};
+
+
+struct brw_3d_primitive
+{
+ struct
+ {
+ GLuint length:8;
+ GLuint pad:2;
+ GLuint topology:5;
+ GLuint indexed:1;
+ GLuint opcode:16;
+ } header;
+
+ GLuint verts_per_instance;
+ GLuint start_vert_location;
+ GLuint instance_count;
+ GLuint start_instance_location;
+ GLuint base_vert_location;
+};
+
+/* These seem to be passed around as function args, so it works out
+ * better to keep them as #defines:
+ */
+#define BRW_FLUSH_READ_CACHE 0x1
+#define BRW_FLUSH_STATE_CACHE 0x2
+#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4
+#define BRW_FLUSH_SNAPSHOT_COUNTERS 0x8
+
+struct brw_mi_flush
+{
+ GLuint flags:4;
+ GLuint pad:12;
+ GLuint opcode:16;
+};
+
+struct brw_vf_statistics
+{
+ GLuint statistics_enable:1;
+ GLuint pad:15;
+ GLuint opcode:16;
+};
+
+
+
+struct brw_binding_table_pointers
+{
+ struct header header;
+ GLuint vs;
+ GLuint gs;
+ GLuint clp;
+ GLuint sf;
+ GLuint wm;
+};
+
+
+struct brw_blend_constant_color
+{
+ struct header header;
+ GLfloat blend_constant_color[4];
+};
+
+
+struct brw_depthbuffer
+{
+ union header_union header;
+
+ union {
+ struct {
+ GLuint pitch:18;
+ GLuint format:3;
+ GLuint pad:2;
+ GLuint software_tiled_rendering_mode:2;
+ GLuint depth_offset_disable:1;
+ GLuint tile_walk:1;
+ GLuint tiled_surface:1;
+ GLuint pad2:1;
+ GLuint surface_type:3;
+ } bits;
+ GLuint dword;
+ } dword1;
+
+ GLuint dword2_base_addr;
+
+ union {
+ struct {
+ GLuint pad:1;
+ GLuint mipmap_layout:1;
+ GLuint lod:4;
+ GLuint width:13;
+ GLuint height:13;
+ } bits;
+ GLuint dword;
+ } dword3;
+
+ union {
+ struct {
+ GLuint pad:10;
+ GLuint min_array_element:11;
+ GLuint depth:11;
+ } bits;
+ GLuint dword;
+ } dword4;
+};
+
+struct brw_depthbuffer_g4x
+{
+ union header_union header;
+
+ union {
+ struct {
+ GLuint pitch:18;
+ GLuint format:3;
+ GLuint pad:2;
+ GLuint software_tiled_rendering_mode:2;
+ GLuint depth_offset_disable:1;
+ GLuint tile_walk:1;
+ GLuint tiled_surface:1;
+ GLuint pad2:1;
+ GLuint surface_type:3;
+ } bits;
+ GLuint dword;
+ } dword1;
+
+ GLuint dword2_base_addr;
+
+ union {
+ struct {
+ GLuint pad:1;
+ GLuint mipmap_layout:1;
+ GLuint lod:4;
+ GLuint width:13;
+ GLuint height:13;
+ } bits;
+ GLuint dword;
+ } dword3;
+
+ union {
+ struct {
+ GLuint pad:10;
+ GLuint min_array_element:11;
+ GLuint depth:11;
+ } bits;
+ GLuint dword;
+ } dword4;
+
+ union {
+ struct {
+ GLuint xoffset:16;
+ GLuint yoffset:16;
+ } bits;
+ GLuint dword;
+ } dword5; /* NEW in Integrated Graphics Device */
+};
+
+struct brw_drawrect
+{
+ struct header header;
+ GLuint xmin:16;
+ GLuint ymin:16;
+ GLuint xmax:16;
+ GLuint ymax:16;
+ GLuint xorg:16;
+ GLuint yorg:16;
+};
+
+
+
+
+struct brw_global_depth_offset_clamp
+{
+ struct header header;
+ GLfloat depth_offset_clamp;
+};
+
+struct brw_indexbuffer
+{
+ union {
+ struct
+ {
+ GLuint length:8;
+ GLuint index_format:2;
+ GLuint cut_index_enable:1;
+ GLuint pad:5;
+ GLuint opcode:16;
+ } bits;
+ GLuint dword;
+
+ } header;
+
+ GLuint buffer_start;
+ GLuint buffer_end;
+};
+
+/* NEW in Integrated Graphics Device */
+struct brw_aa_line_parameters
+{
+ struct header header;
+
+ struct {
+ GLuint aa_coverage_scope:8;
+ GLuint pad0:8;
+ GLuint aa_coverage_bias:8;
+ GLuint pad1:8;
+ } bits0;
+
+ struct {
+ GLuint aa_coverage_endcap_slope:8;
+ GLuint pad0:8;
+ GLuint aa_coverage_endcap_bias:8;
+ GLuint pad1:8;
+ } bits1;
+};
+
+struct brw_line_stipple
+{
+ struct header header;
+
+ struct
+ {
+ GLuint pattern:16;
+ GLuint pad:16;
+ } bits0;
+
+ struct
+ {
+ GLuint repeat_count:9;
+ GLuint pad:7;
+ GLuint inverse_repeat_count:16;
+ } bits1;
+};
+
+
+struct brw_pipelined_state_pointers
+{
+ struct header header;
+
+ struct {
+ GLuint pad:5;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+ } vs;
+
+ struct
+ {
+ GLuint enable:1;
+ GLuint pad:4;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+ } gs;
+
+ struct
+ {
+ GLuint enable:1;
+ GLuint pad:4;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+ } clp;
+
+ struct
+ {
+ GLuint pad:5;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+ } sf;
+
+ struct
+ {
+ GLuint pad:5;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE */
+ } wm;
+
+ struct
+ {
+ GLuint pad:5;
+ GLuint offset:27; /* Offset from GENERAL_STATE_BASE. KW: check me! */
+ } cc;
+};
+
+
+struct brw_polygon_stipple_offset
+{
+ struct header header;
+
+ struct {
+ GLuint y_offset:5;
+ GLuint pad:3;
+ GLuint x_offset:5;
+ GLuint pad0:19;
+ } bits0;
+};
+
+
+
+struct brw_polygon_stipple
+{
+ struct header header;
+ GLuint stipple[32];
+};
+
+
+
+struct brw_pipeline_select
+{
+ struct
+ {
+ GLuint pipeline_select:1;
+ GLuint pad:15;
+ GLuint opcode:16;
+ } header;
+};
+
+
+struct brw_pipe_control
+{
+ struct
+ {
+ GLuint length:8;
+ GLuint notify_enable:1;
+ GLuint texture_cache_flush_enable:1;
+ GLuint indirect_state_pointers_disable:1;
+ GLuint instruction_state_cache_flush_enable:1;
+ GLuint write_cache_flush_enable:1;
+ GLuint depth_stall_enable:1;
+ GLuint post_sync_operation:2;
+
+ GLuint opcode:16;
+ } header;
+
+ struct
+ {
+ GLuint pad:2;
+ GLuint dest_addr_type:1;
+ GLuint dest_addr:29;
+ } bits1;
+
+ GLuint data0;
+ GLuint data1;
+};
+
+
+struct brw_urb_fence
+{
+ struct
+ {
+ GLuint length:8;
+ GLuint vs_realloc:1;
+ GLuint gs_realloc:1;
+ GLuint clp_realloc:1;
+ GLuint sf_realloc:1;
+ GLuint vfe_realloc:1;
+ GLuint cs_realloc:1;
+ GLuint pad:2;
+ GLuint opcode:16;
+ } header;
+
+ struct
+ {
+ GLuint vs_fence:10;
+ GLuint gs_fence:10;
+ GLuint clp_fence:10;
+ GLuint pad:2;
+ } bits0;
+
+ struct
+ {
+ GLuint sf_fence:10;
+ GLuint vf_fence:10;
+ GLuint cs_fence:11;
+ GLuint pad:1;
+ } bits1;
+};
+
+struct brw_cs_urb_state
+{
+ struct header header;
+
+ struct
+ {
+ GLuint nr_urb_entries:3;
+ GLuint pad:1;
+ GLuint urb_entry_size:5;
+ GLuint pad0:23;
+ } bits0;
+};
+
+struct brw_constant_buffer
+{
+ struct
+ {
+ GLuint length:8;
+ GLuint valid:1;
+ GLuint pad:7;
+ GLuint opcode:16;
+ } header;
+
+ struct
+ {
+ GLuint buffer_length:6;
+ GLuint buffer_address:26;
+ } bits0;
+};
+
+struct brw_state_base_address
+{
+ struct header header;
+
+ struct
+ {
+ GLuint modify_enable:1;
+ GLuint pad:4;
+ GLuint general_state_address:27;
+ } bits0;
+
+ struct
+ {
+ GLuint modify_enable:1;
+ GLuint pad:4;
+ GLuint surface_state_address:27;
+ } bits1;
+
+ struct
+ {
+ GLuint modify_enable:1;
+ GLuint pad:4;
+ GLuint indirect_object_state_address:27;
+ } bits2;
+
+ struct
+ {
+ GLuint modify_enable:1;
+ GLuint pad:11;
+ GLuint general_state_upper_bound:20;
+ } bits3;
+
+ struct
+ {
+ GLuint modify_enable:1;
+ GLuint pad:11;
+ GLuint indirect_object_state_upper_bound:20;
+ } bits4;
+};
+
+struct brw_state_prefetch
+{
+ struct header header;
+
+ struct
+ {
+ GLuint prefetch_count:3;
+ GLuint pad:3;
+ GLuint prefetch_pointer:26;
+ } bits0;
+};
+
+struct brw_system_instruction_pointer
+{
+ struct header header;
+
+ struct
+ {
+ GLuint pad:4;
+ GLuint system_instruction_pointer:28;
+ } bits0;
+};
+
+
+
+
+/* State structs for the various fixed function units:
+ */
+
+
+struct thread0
+{
+ GLuint pad0:1;
+ GLuint grf_reg_count:3;
+ GLuint pad1:2;
+ GLuint kernel_start_pointer:26; /* Offset from GENERAL_STATE_BASE */
+};
+
+struct thread1
+{
+ GLuint ext_halt_exception_enable:1;
+ GLuint sw_exception_enable:1;
+ GLuint mask_stack_exception_enable:1;
+ GLuint timeout_exception_enable:1;
+ GLuint illegal_op_exception_enable:1;
+ GLuint pad0:3;
+ GLuint depth_coef_urb_read_offset:6; /* WM only */
+ GLuint pad1:2;
+ GLuint floating_point_mode:1;
+ GLuint thread_priority:1;
+ GLuint binding_table_entry_count:8;
+ GLuint pad3:5;
+ GLuint single_program_flow:1;
+};
+
+struct thread2
+{
+ GLuint per_thread_scratch_space:4;
+ GLuint pad0:6;
+ GLuint scratch_space_base_pointer:22;
+};
+
+
+struct thread3
+{
+ GLuint dispatch_grf_start_reg:4;
+ GLuint urb_entry_read_offset:6;
+ GLuint pad0:1;
+ GLuint urb_entry_read_length:6;
+ GLuint pad1:1;
+ GLuint const_urb_entry_read_offset:6;
+ GLuint pad2:1;
+ GLuint const_urb_entry_read_length:6;
+ GLuint pad3:1;
+};
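+/* The thread0..thread3 dwords above form the common prefix of the fixed
+ * function unit state structures below (CLIP, SF, GS, VS, WM); the clip
+ * unit spells out its own thread1 variant inline.
+ */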
+
+
+
+struct brw_clip_unit_state
+{
+ struct thread0 thread0;
+ struct
+ {
+ GLuint pad0:7;
+ GLuint sw_exception_enable:1;
+ GLuint pad1:3;
+ GLuint mask_stack_exception_enable:1;
+ GLuint pad2:1;
+ GLuint illegal_op_exception_enable:1;
+ GLuint pad3:2;
+ GLuint floating_point_mode:1;
+ GLuint thread_priority:1;
+ GLuint binding_table_entry_count:8;
+ GLuint pad4:5;
+ GLuint single_program_flow:1;
+ } thread1;
+
+ struct thread2 thread2;
+ struct thread3 thread3;
+
+ struct
+ {
+ GLuint pad0:9;
+ GLuint gs_output_stats:1; /* not always */
+ GLuint stats_enable:1;
+ GLuint nr_urb_entries:7;
+ GLuint pad1:1;
+ GLuint urb_entry_allocation_size:5;
+ GLuint pad2:1;
+ GLuint max_threads:5; /* may be less */
+ GLuint pad3:2;
+ } thread4;
+
+ struct
+ {
+ GLuint pad0:13;
+ GLuint clip_mode:3;
+ GLuint userclip_enable_flags:8;
+ GLuint userclip_must_clip:1;
+ GLuint negative_w_clip_test:1;
+ GLuint guard_band_enable:1;
+ GLuint viewport_z_clip_enable:1;
+ GLuint viewport_xy_clip_enable:1;
+ GLuint vertex_position_space:1;
+ GLuint api_mode:1;
+ GLuint pad2:1;
+ } clip5;
+
+ struct
+ {
+ GLuint pad0:5;
+ GLuint clipper_viewport_state_ptr:27;
+ } clip6;
+
+
+ GLfloat viewport_xmin;
+ GLfloat viewport_xmax;
+ GLfloat viewport_ymin;
+ GLfloat viewport_ymax;
+};
+
+
+
+struct brw_cc_unit_state
+{
+ struct brw_cc0
+ {
+ GLuint pad0:3;
+ GLuint bf_stencil_pass_depth_pass_op:3;
+ GLuint bf_stencil_pass_depth_fail_op:3;
+ GLuint bf_stencil_fail_op:3;
+ GLuint bf_stencil_func:3;
+ GLuint bf_stencil_enable:1;
+ GLuint pad1:2;
+ GLuint stencil_write_enable:1;
+ GLuint stencil_pass_depth_pass_op:3;
+ GLuint stencil_pass_depth_fail_op:3;
+ GLuint stencil_fail_op:3;
+ GLuint stencil_func:3;
+ GLuint stencil_enable:1;
+ } cc0;
+
+
+ struct brw_cc1
+ {
+ GLuint bf_stencil_ref:8;
+ GLuint stencil_write_mask:8;
+ GLuint stencil_test_mask:8;
+ GLuint stencil_ref:8;
+ } cc1;
+
+
+ struct brw_cc2
+ {
+ GLuint logicop_enable:1;
+ GLuint pad0:10;
+ GLuint depth_write_enable:1;
+ GLuint depth_test_function:3;
+ GLuint depth_test:1;
+ GLuint bf_stencil_write_mask:8;
+ GLuint bf_stencil_test_mask:8;
+ } cc2;
+
+
+ struct brw_cc3
+ {
+ GLuint pad0:8;
+ GLuint alpha_test_func:3;
+ GLuint alpha_test:1;
+ GLuint blend_enable:1;
+ GLuint ia_blend_enable:1;
+ GLuint pad1:1;
+ GLuint alpha_test_format:1;
+ GLuint pad2:16;
+ } cc3;
+
+ struct brw_cc4
+ {
+ GLuint pad0:5;
+ GLuint cc_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
+ } cc4;
+
+ struct brw_cc5
+ {
+ GLuint pad0:2;
+ GLuint ia_dest_blend_factor:5;
+ GLuint ia_src_blend_factor:5;
+ GLuint ia_blend_function:3;
+ GLuint statistics_enable:1;
+ GLuint logicop_func:4;
+ GLuint pad1:11;
+ GLuint dither_enable:1;
+ } cc5;
+
+ struct brw_cc6
+ {
+ GLuint clamp_post_alpha_blend:1;
+ GLuint clamp_pre_alpha_blend:1;
+ GLuint clamp_range:2;
+ GLuint pad0:11;
+ GLuint y_dither_offset:2;
+ GLuint x_dither_offset:2;
+ GLuint dest_blend_factor:5;
+ GLuint src_blend_factor:5;
+ GLuint blend_function:3;
+ } cc6;
+
+ struct brw_cc7 {
+ union {
+ GLfloat f;
+ GLubyte ub[4];
+ } alpha_ref;
+ } cc7;
+};
+
+
+
+struct brw_sf_unit_state
+{
+ struct thread0 thread0;
+ struct thread1 thread1;
+ struct thread2 thread2;
+ struct thread3 thread3;
+
+ struct
+ {
+ GLuint pad0:10;
+ GLuint stats_enable:1;
+ GLuint nr_urb_entries:7;
+ GLuint pad1:1;
+ GLuint urb_entry_allocation_size:5;
+ GLuint pad2:1;
+ GLuint max_threads:6;
+ GLuint pad3:1;
+ } thread4;
+
+ struct
+ {
+ GLuint front_winding:1;
+ GLuint viewport_transform:1;
+ GLuint pad0:3;
+ GLuint sf_viewport_state_offset:27; /* Offset from GENERAL_STATE_BASE */
+ } sf5;
+
+ struct
+ {
+ GLuint pad0:9;
+ GLuint dest_org_vbias:4;
+ GLuint dest_org_hbias:4;
+ GLuint scissor:1;
+ GLuint disable_2x2_trifilter:1;
+ GLuint disable_zero_pix_trifilter:1;
+ GLuint point_rast_rule:2;
+ GLuint line_endcap_aa_region_width:2;
+ GLuint line_width:4;
+ GLuint fast_scissor_disable:1;
+ GLuint cull_mode:2;
+ GLuint aa_enable:1;
+ } sf6;
+
+ struct
+ {
+ GLuint point_size:11;
+ GLuint use_point_size_state:1;
+ GLuint subpixel_precision:1;
+ GLuint sprite_point:1;
+ GLuint pad0:10;
+ GLuint aa_line_distance_mode:1;
+ GLuint trifan_pv:2;
+ GLuint linestrip_pv:2;
+ GLuint tristrip_pv:2;
+ GLuint line_last_pixel_enable:1;
+ } sf7;
+
+};
+
+
+struct brw_gs_unit_state
+{
+ struct thread0 thread0;
+ struct thread1 thread1;
+ struct thread2 thread2;
+ struct thread3 thread3;
+
+ struct
+ {
+ GLuint pad0:8;
+ GLuint rendering_enable:1; /* for IGDNG */
+ GLuint pad4:1;
+ GLuint stats_enable:1;
+ GLuint nr_urb_entries:7;
+ GLuint pad1:1;
+ GLuint urb_entry_allocation_size:5;
+ GLuint pad2:1;
+ GLuint max_threads:5;
+ GLuint pad3:2;
+ } thread4;
+
+ struct
+ {
+ GLuint sampler_count:3;
+ GLuint pad0:2;
+ GLuint sampler_state_pointer:27;
+ } gs5;
+
+
+ struct
+ {
+ GLuint max_vp_index:4;
+ GLuint pad0:12;
+ GLuint svbi_post_inc_value:10;
+ GLuint pad1:1;
+ GLuint svbi_post_inc_enable:1;
+ GLuint svbi_payload:1;
+ GLuint discard_adjaceny:1;
+ GLuint reorder_enable:1;
+ GLuint pad2:1;
+ } gs6;
+};
+
+
+struct brw_vs_unit_state
+{
+ struct thread0 thread0;
+ struct thread1 thread1;
+ struct thread2 thread2;
+ struct thread3 thread3;
+
+ struct
+ {
+ GLuint pad0:10;
+ GLuint stats_enable:1;
+ GLuint nr_urb_entries:7;
+ GLuint pad1:1;
+ GLuint urb_entry_allocation_size:5;
+ GLuint pad2:1;
+ GLuint max_threads:6;
+ GLuint pad3:1;
+ } thread4;
+
+ struct
+ {
+ GLuint sampler_count:3;
+ GLuint pad0:2;
+ GLuint sampler_state_pointer:27;
+ } vs5;
+
+ struct
+ {
+ GLuint vs_enable:1;
+ GLuint vert_cache_disable:1;
+ GLuint pad0:30;
+ } vs6;
+};
+
+
+struct brw_wm_unit_state
+{
+ struct thread0 thread0;
+ struct thread1 thread1;
+ struct thread2 thread2;
+ struct thread3 thread3;
+
+ struct {
+ GLuint stats_enable:1;
+ GLuint depth_buffer_clear:1;
+ GLuint sampler_count:3;
+ GLuint sampler_state_pointer:27;
+ } wm4;
+
+ struct
+ {
+ GLuint enable_8_pix:1;
+ GLuint enable_16_pix:1;
+ GLuint enable_32_pix:1;
+ GLuint enable_con_32_pix:1;
+ GLuint enable_con_64_pix:1;
+ GLuint pad0:5;
+ GLuint legacy_global_depth_bias:1;
+ GLuint line_stipple:1;
+ GLuint depth_offset:1;
+ GLuint polygon_stipple:1;
+ GLuint line_aa_region_width:2;
+ GLuint line_endcap_aa_region_width:2;
+ GLuint early_depth_test:1;
+ GLuint thread_dispatch_enable:1;
+ GLuint program_uses_depth:1;
+ GLuint program_computes_depth:1;
+ GLuint program_uses_killpixel:1;
+ GLuint legacy_line_rast: 1;
+ GLuint transposed_urb_read_enable:1;
+ GLuint max_threads:7;
+ } wm5;
+
+ GLfloat global_depth_offset_constant;
+ GLfloat global_depth_offset_scale;
+
+ /* for IGDNG only */
+ struct {
+ GLuint pad0:1;
+ GLuint grf_reg_count_1:3;
+ GLuint pad1:2;
+ GLuint kernel_start_pointer_1:26;
+ } wm8;
+
+ struct {
+ GLuint pad0:1;
+ GLuint grf_reg_count_2:3;
+ GLuint pad1:2;
+ GLuint kernel_start_pointer_2:26;
+ } wm9;
+
+ struct {
+ GLuint pad0:1;
+ GLuint grf_reg_count_3:3;
+ GLuint pad1:2;
+ GLuint kernel_start_pointer_3:26;
+ } wm10;
+};
+
+struct brw_sampler_default_color {
+ GLfloat color[4];
+};
+
+struct brw_sampler_state
+{
+
+ struct brw_ss0
+ {
+ GLuint shadow_function:3;
+ GLuint lod_bias:11;
+ GLuint min_filter:3;
+ GLuint mag_filter:3;
+ GLuint mip_filter:2;
+ GLuint base_level:5;
+ GLuint pad:1;
+ GLuint lod_preclamp:1;
+ GLuint default_color_mode:1;
+ GLuint pad0:1;
+ GLuint disable:1;
+ } ss0;
+
+ struct brw_ss1
+ {
+ GLuint r_wrap_mode:3;
+ GLuint t_wrap_mode:3;
+ GLuint s_wrap_mode:3;
+ GLuint pad:3;
+ GLuint max_lod:10;
+ GLuint min_lod:10;
+ } ss1;
+
+
+ struct brw_ss2
+ {
+ GLuint pad:5;
+ GLuint default_color_pointer:27;
+ } ss2;
+
+ struct brw_ss3
+ {
+ GLuint pad:19;
+ GLuint max_aniso:3;
+ GLuint chroma_key_mode:1;
+ GLuint chroma_key_index:2;
+ GLuint chroma_key_enable:1;
+ GLuint monochrome_filter_width:3;
+ GLuint monochrome_filter_height:3;
+ } ss3;
+};
+
+
+struct brw_clipper_viewport
+{
+ GLfloat xmin;
+ GLfloat xmax;
+ GLfloat ymin;
+ GLfloat ymax;
+};
+
+struct brw_cc_viewport
+{
+ GLfloat min_depth;
+ GLfloat max_depth;
+};
+
+struct brw_sf_viewport
+{
+ struct {
+ GLfloat m00;
+ GLfloat m11;
+ GLfloat m22;
+ GLfloat m30;
+ GLfloat m31;
+ GLfloat m32;
+ } viewport;
+
+ /* scissor coordinates are inclusive */
+ struct {
+ GLshort xmin;
+ GLshort ymin;
+ GLshort xmax;
+ GLshort ymax;
+ } scissor;
+};
+
+/* Documented in the subsystem/shared-functions/sampler chapter...
+ */
+struct brw_surface_state
+{
+ struct brw_surf_ss0 {
+ GLuint cube_pos_z:1;
+ GLuint cube_neg_z:1;
+ GLuint cube_pos_y:1;
+ GLuint cube_neg_y:1;
+ GLuint cube_pos_x:1;
+ GLuint cube_neg_x:1;
+ GLuint pad:4;
+ GLuint mipmap_layout_mode:1;
+ GLuint vert_line_stride_ofs:1;
+ GLuint vert_line_stride:1;
+ GLuint color_blend:1;
+ GLuint writedisable_blue:1;
+ GLuint writedisable_green:1;
+ GLuint writedisable_red:1;
+ GLuint writedisable_alpha:1;
+ GLuint surface_format:9; /**< BRW_SURFACEFORMAT_x */
+ GLuint data_return_format:1;
+ GLuint pad0:1;
+ GLuint surface_type:3; /**< BRW_SURFACE_1D/2D/3D/CUBE */
+ } ss0;
+
+ struct brw_surf_ss1 {
+ GLuint base_addr;
+ } ss1;
+
+ struct brw_surf_ss2 {
+ GLuint pad:2;
+ GLuint mip_count:4;
+ GLuint width:13;
+ GLuint height:13;
+ } ss2;
+
+ struct brw_surf_ss3 {
+ GLuint tile_walk:1;
+ GLuint tiled_surface:1;
+ GLuint pad:1;
+ GLuint pitch:18;
+ GLuint depth:11;
+ } ss3;
+
+ struct brw_surf_ss4 {
+ GLuint multisample_position_palette_index:3;
+ GLuint pad1:1;
+ GLuint num_multisamples:3;
+ GLuint pad0:1;
+ GLuint render_target_view_extent:9;
+ GLuint min_array_elt:11;
+ GLuint min_lod:4;
+ } ss4;
+
+ struct brw_surf_ss5 {
+ GLuint pad1:16;
+ GLuint llc_mapping:1;
+ GLuint mlc_mapping:1;
+ GLuint gfdt:1;
+ GLuint gfdt_src:1;
+ GLuint y_offset:4;
+ GLuint pad0:1;
+ GLuint x_offset:7;
+ } ss5; /* New in G4X */
+
+};
+
+
+
+struct brw_vertex_buffer_state
+{
+ struct {
+ GLuint pitch:11;
+ GLuint pad:15;
+ GLuint access_type:1;
+ GLuint vb_index:5;
+ } vb0;
+
+ GLuint start_addr;
+ GLuint max_index;
+#if 1
+ GLuint instance_data_step_rate; /* not included for sequential/random vertices? */
+#endif
+};
+
+#define BRW_VBP_MAX 17
+
+struct brw_vb_array_state {
+ struct header header;
+ struct brw_vertex_buffer_state vb[BRW_VBP_MAX];
+};
+
+
+struct brw_vertex_element_state
+{
+ struct
+ {
+ GLuint src_offset:11;
+ GLuint pad:5;
+ GLuint src_format:9;
+ GLuint pad0:1;
+ GLuint valid:1;
+ GLuint vertex_buffer_index:5;
+ } ve0;
+
+ struct
+ {
+ GLuint dst_offset:8;
+ GLuint pad:8;
+ GLuint vfcomponent3:4;
+ GLuint vfcomponent2:4;
+ GLuint vfcomponent1:4;
+ GLuint vfcomponent0:4;
+ } ve1;
+};
+
+#define BRW_VEP_MAX 18
+
+struct brw_vertex_element_packet {
+ struct header header;
+ struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */
+};
+
+
+struct brw_urb_immediate {
+ GLuint opcode:4;
+ GLuint offset:6;
+ GLuint swizzle_control:2;
+ GLuint pad:1;
+ GLuint allocate:1;
+ GLuint used:1;
+ GLuint complete:1;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+};
+
+/* Instruction format for the execution units:
+ */
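+/* Each instruction is four dwords: a header, a destination descriptor
+ * (bits1), the first source (bits2) and either the second source or a
+ * message descriptor (bits3).  The unions below give separate layouts for
+ * direct vs. indirect addressing in align1 (da1/ia1) and align16
+ * (da16/ia16) modes, plus the various send-message formats.
+ */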
+
+struct brw_instruction
+{
+ struct
+ {
+ GLuint opcode:7;
+ GLuint pad:1;
+ GLuint access_mode:1;
+ GLuint mask_control:1;
+ GLuint dependency_control:2;
+ GLuint compression_control:2;
+ GLuint thread_control:2;
+ GLuint predicate_control:4;
+ GLuint predicate_inverse:1;
+ GLuint execution_size:3;
+ GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */
+ GLuint pad0:2;
+ GLuint debug_control:1;
+ GLuint saturate:1;
+ } header;
+
+ union {
+ struct
+ {
+ GLuint dest_reg_file:2;
+ GLuint dest_reg_type:3;
+ GLuint src0_reg_file:2;
+ GLuint src0_reg_type:3;
+ GLuint src1_reg_file:2;
+ GLuint src1_reg_type:3;
+ GLuint pad:1;
+ GLuint dest_subreg_nr:5;
+ GLuint dest_reg_nr:8;
+ GLuint dest_horiz_stride:2;
+ GLuint dest_address_mode:1;
+ } da1;
+
+ struct
+ {
+ GLuint dest_reg_file:2;
+ GLuint dest_reg_type:3;
+ GLuint src0_reg_file:2;
+ GLuint src0_reg_type:3;
+ GLuint src1_reg_file:2; /* 0x00000c00 */
+ GLuint src1_reg_type:3; /* 0x00007000 */
+ GLuint pad:1;
+ GLint dest_indirect_offset:10; /* offset against the deref'd address reg */
+ GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */
+ GLuint dest_horiz_stride:2;
+ GLuint dest_address_mode:1;
+ } ia1;
+
+ struct
+ {
+ GLuint dest_reg_file:2;
+ GLuint dest_reg_type:3;
+ GLuint src0_reg_file:2;
+ GLuint src0_reg_type:3;
+ GLuint src1_reg_file:2;
+ GLuint src1_reg_type:3;
+ GLuint pad:1;
+ GLuint dest_writemask:4;
+ GLuint dest_subreg_nr:1;
+ GLuint dest_reg_nr:8;
+ GLuint pad1:2;
+ GLuint dest_address_mode:1;
+ } da16;
+
+ struct
+ {
+ GLuint dest_reg_file:2;
+ GLuint dest_reg_type:3;
+ GLuint src0_reg_file:2;
+ GLuint src0_reg_type:3;
+ GLuint pad0:6;
+ GLuint dest_writemask:4;
+ GLint dest_indirect_offset:6;
+ GLuint dest_subreg_nr:3;
+ GLuint pad1:2;
+ GLuint dest_address_mode:1;
+ } ia16;
+ } bits1;
+
+
+ union {
+ struct
+ {
+ GLuint src0_subreg_nr:5;
+ GLuint src0_reg_nr:8;
+ GLuint src0_abs:1;
+ GLuint src0_negate:1;
+ GLuint src0_address_mode:1;
+ GLuint src0_horiz_stride:2;
+ GLuint src0_width:3;
+ GLuint src0_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad:6;
+ } da1;
+
+ struct
+ {
+ GLint src0_indirect_offset:10;
+ GLuint src0_subreg_nr:3;
+ GLuint src0_abs:1;
+ GLuint src0_negate:1;
+ GLuint src0_address_mode:1;
+ GLuint src0_horiz_stride:2;
+ GLuint src0_width:3;
+ GLuint src0_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad:6;
+ } ia1;
+
+ struct
+ {
+ GLuint src0_swz_x:2;
+ GLuint src0_swz_y:2;
+ GLuint src0_subreg_nr:1;
+ GLuint src0_reg_nr:8;
+ GLuint src0_abs:1;
+ GLuint src0_negate:1;
+ GLuint src0_address_mode:1;
+ GLuint src0_swz_z:2;
+ GLuint src0_swz_w:2;
+ GLuint pad0:1;
+ GLuint src0_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad1:6;
+ } da16;
+
+ struct
+ {
+ GLuint src0_swz_x:2;
+ GLuint src0_swz_y:2;
+ GLint src0_indirect_offset:6;
+ GLuint src0_subreg_nr:3;
+ GLuint src0_abs:1;
+ GLuint src0_negate:1;
+ GLuint src0_address_mode:1;
+ GLuint src0_swz_z:2;
+ GLuint src0_swz_w:2;
+ GLuint pad0:1;
+ GLuint src0_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad1:6;
+ } ia16;
+
+ struct
+ {
+ GLuint pad:26;
+ GLuint end_of_thread:1;
+ GLuint pad1:1;
+ GLuint sfid:4;
+ } send_igdng; /* for IGDNG only */
+
+ } bits2;
+
+ union
+ {
+ struct
+ {
+ GLuint src1_subreg_nr:5;
+ GLuint src1_reg_nr:8;
+ GLuint src1_abs:1;
+ GLuint src1_negate:1;
+ GLuint src1_address_mode:1;
+ GLuint src1_horiz_stride:2;
+ GLuint src1_width:3;
+ GLuint src1_vert_stride:4;
+ GLuint pad0:7;
+ } da1;
+
+ struct
+ {
+ GLuint src1_swz_x:2;
+ GLuint src1_swz_y:2;
+ GLuint src1_subreg_nr:1;
+ GLuint src1_reg_nr:8;
+ GLuint src1_abs:1;
+ GLuint src1_negate:1;
+ GLuint src1_address_mode:1;
+ GLuint src1_swz_z:2;
+ GLuint src1_swz_w:2;
+ GLuint pad1:1;
+ GLuint src1_vert_stride:4;
+ GLuint pad2:7;
+ } da16;
+
+ struct
+ {
+ GLint src1_indirect_offset:10;
+ GLuint src1_subreg_nr:3;
+ GLuint src1_abs:1;
+ GLuint src1_negate:1;
+ GLuint src1_address_mode:1;
+ GLuint src1_horiz_stride:2;
+ GLuint src1_width:3;
+ GLuint src1_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad1:6;
+ } ia1;
+
+ struct
+ {
+ GLuint src1_swz_x:2;
+ GLuint src1_swz_y:2;
+ GLint src1_indirect_offset:6;
+ GLuint src1_subreg_nr:3;
+ GLuint src1_abs:1;
+ GLuint src1_negate:1;
+ GLuint pad0:1;
+ GLuint src1_swz_z:2;
+ GLuint src1_swz_w:2;
+ GLuint pad1:1;
+ GLuint src1_vert_stride:4;
+ GLuint flag_reg_nr:1;
+ GLuint pad2:6;
+ } ia16;
+
+
+ struct
+ {
+ GLint jump_count:16; /* note: signed */
+ GLuint pop_count:4;
+ GLuint pad0:12;
+ } if_else;
+
+ struct {
+ GLuint function:4;
+ GLuint int_type:1;
+ GLuint precision:1;
+ GLuint saturate:1;
+ GLuint data_type:1;
+ GLuint pad0:8;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } math;
+
+ struct {
+ GLuint function:4;
+ GLuint int_type:1;
+ GLuint precision:1;
+ GLuint saturate:1;
+ GLuint data_type:1;
+ GLuint snapshot:1;
+ GLuint pad0:10;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } math_igdng;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint sampler:4;
+ GLuint return_format:2;
+ GLuint msg_type:2;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } sampler;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint sampler:4;
+ GLuint msg_type:4;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } sampler_g4x;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint sampler:4;
+ GLuint msg_type:4;
+ GLuint simd_mode:2;
+ GLuint pad0:1;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } sampler_igdng;
+
+ struct brw_urb_immediate urb;
+
+ struct {
+ GLuint opcode:4;
+ GLuint offset:6;
+ GLuint swizzle_control:2;
+ GLuint pad:1;
+ GLuint allocate:1;
+ GLuint used:1;
+ GLuint complete:1;
+ GLuint pad0:3;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } urb_igdng;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint msg_control:4;
+ GLuint msg_type:2;
+ GLuint target_cache:2;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } dp_read;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint msg_control:3;
+ GLuint msg_type:3;
+ GLuint target_cache:2;
+ GLuint pad0:3;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } dp_read_igdng;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint msg_control:3;
+ GLuint pixel_scoreboard_clear:1;
+ GLuint msg_type:3;
+ GLuint send_commit_msg:1;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } dp_write;
+
+ struct {
+ GLuint binding_table_index:8;
+ GLuint msg_control:3;
+ GLuint pixel_scoreboard_clear:1;
+ GLuint msg_type:3;
+ GLuint send_commit_msg:1;
+ GLuint pad0:3;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } dp_write_igdng;
+
+ struct {
+ GLuint pad:16;
+ GLuint response_length:4;
+ GLuint msg_length:4;
+ GLuint msg_target:4;
+ GLuint pad1:3;
+ GLuint end_of_thread:1;
+ } generic;
+
+ struct {
+ GLuint pad:19;
+ GLuint header_present:1;
+ GLuint response_length:5;
+ GLuint msg_length:4;
+ GLuint pad1:2;
+ GLuint end_of_thread:1;
+ } generic_igdng;
+
+ GLint d;
+ GLuint ud;
+ float f;
+ } bits3;
+};
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_structs_dump.c b/src/gallium/drivers/i965/brw_structs_dump.c
new file mode 100644
index 00000000000..cd40fc6d618
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.c
@@ -0,0 +1,1247 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Dump i965 data structures.
+ *
+ * Generated automatically from brw_structs.h by brw_structs_dump.py.
+ */
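+/* Each brw_dump_*() below prints the named bitfields of the corresponding
+ * struct via debug_printf(), one field per line; pad fields are skipped.
+ */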
+
+#include "util/u_debug.h"
+
+#include "brw_types.h"
+#include "brw_structs.h"
+#include "brw_structs_dump.h"
+
+void
+brw_dump_3d_control(const struct brw_3d_control *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.notify_enable = 0x%x\n", (*ptr).header.notify_enable);
+ debug_printf("\t\t.header.wc_flush_enable = 0x%x\n", (*ptr).header.wc_flush_enable);
+ debug_printf("\t\t.header.depth_stall_enable = 0x%x\n", (*ptr).header.depth_stall_enable);
+ debug_printf("\t\t.header.operation = 0x%x\n", (*ptr).header.operation);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.dest.dest_addr_type = 0x%x\n", (*ptr).dest.dest_addr_type);
+ debug_printf("\t\t.dest.dest_addr = 0x%x\n", (*ptr).dest.dest_addr);
+ debug_printf("\t\t.dword2 = 0x%x\n", (*ptr).dword2);
+ debug_printf("\t\t.dword3 = 0x%x\n", (*ptr).dword3);
+}
+
+void
+brw_dump_3d_primitive(const struct brw_3d_primitive *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.topology = 0x%x\n", (*ptr).header.topology);
+ debug_printf("\t\t.header.indexed = 0x%x\n", (*ptr).header.indexed);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.verts_per_instance = 0x%x\n", (*ptr).verts_per_instance);
+ debug_printf("\t\t.start_vert_location = 0x%x\n", (*ptr).start_vert_location);
+ debug_printf("\t\t.instance_count = 0x%x\n", (*ptr).instance_count);
+ debug_printf("\t\t.start_instance_location = 0x%x\n", (*ptr).start_instance_location);
+ debug_printf("\t\t.base_vert_location = 0x%x\n", (*ptr).base_vert_location);
+}
+
+void
+brw_dump_aa_line_parameters(const struct brw_aa_line_parameters *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.aa_coverage_scope = 0x%x\n", (*ptr).bits0.aa_coverage_scope);
+ debug_printf("\t\t.bits0.aa_coverage_bias = 0x%x\n", (*ptr).bits0.aa_coverage_bias);
+ debug_printf("\t\t.bits1.aa_coverage_endcap_slope = 0x%x\n", (*ptr).bits1.aa_coverage_endcap_slope);
+ debug_printf("\t\t.bits1.aa_coverage_endcap_bias = 0x%x\n", (*ptr).bits1.aa_coverage_endcap_bias);
+}
+
+void
+brw_dump_binding_table_pointers(const struct brw_binding_table_pointers *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.vs = 0x%x\n", (*ptr).vs);
+ debug_printf("\t\t.gs = 0x%x\n", (*ptr).gs);
+ debug_printf("\t\t.clp = 0x%x\n", (*ptr).clp);
+ debug_printf("\t\t.sf = 0x%x\n", (*ptr).sf);
+ debug_printf("\t\t.wm = 0x%x\n", (*ptr).wm);
+}
+
+void
+brw_dump_blend_constant_color(const struct brw_blend_constant_color *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.blend_constant_color[0] = %f\n", (*ptr).blend_constant_color[0]);
+ debug_printf("\t\t.blend_constant_color[1] = %f\n", (*ptr).blend_constant_color[1]);
+ debug_printf("\t\t.blend_constant_color[2] = %f\n", (*ptr).blend_constant_color[2]);
+ debug_printf("\t\t.blend_constant_color[3] = %f\n", (*ptr).blend_constant_color[3]);
+}
+
+void
+brw_dump_cc0(const struct brw_cc0 *ptr)
+{
+ debug_printf("\t\t.bf_stencil_pass_depth_pass_op = 0x%x\n", (*ptr).bf_stencil_pass_depth_pass_op);
+ debug_printf("\t\t.bf_stencil_pass_depth_fail_op = 0x%x\n", (*ptr).bf_stencil_pass_depth_fail_op);
+ debug_printf("\t\t.bf_stencil_fail_op = 0x%x\n", (*ptr).bf_stencil_fail_op);
+ debug_printf("\t\t.bf_stencil_func = 0x%x\n", (*ptr).bf_stencil_func);
+ debug_printf("\t\t.bf_stencil_enable = 0x%x\n", (*ptr).bf_stencil_enable);
+ debug_printf("\t\t.stencil_write_enable = 0x%x\n", (*ptr).stencil_write_enable);
+ debug_printf("\t\t.stencil_pass_depth_pass_op = 0x%x\n", (*ptr).stencil_pass_depth_pass_op);
+ debug_printf("\t\t.stencil_pass_depth_fail_op = 0x%x\n", (*ptr).stencil_pass_depth_fail_op);
+ debug_printf("\t\t.stencil_fail_op = 0x%x\n", (*ptr).stencil_fail_op);
+ debug_printf("\t\t.stencil_func = 0x%x\n", (*ptr).stencil_func);
+ debug_printf("\t\t.stencil_enable = 0x%x\n", (*ptr).stencil_enable);
+}
+
+void
+brw_dump_cc1(const struct brw_cc1 *ptr)
+{
+ debug_printf("\t\t.bf_stencil_ref = 0x%x\n", (*ptr).bf_stencil_ref);
+ debug_printf("\t\t.stencil_write_mask = 0x%x\n", (*ptr).stencil_write_mask);
+ debug_printf("\t\t.stencil_test_mask = 0x%x\n", (*ptr).stencil_test_mask);
+ debug_printf("\t\t.stencil_ref = 0x%x\n", (*ptr).stencil_ref);
+}
+
+void
+brw_dump_cc2(const struct brw_cc2 *ptr)
+{
+ debug_printf("\t\t.logicop_enable = 0x%x\n", (*ptr).logicop_enable);
+ debug_printf("\t\t.depth_write_enable = 0x%x\n", (*ptr).depth_write_enable);
+ debug_printf("\t\t.depth_test_function = 0x%x\n", (*ptr).depth_test_function);
+ debug_printf("\t\t.depth_test = 0x%x\n", (*ptr).depth_test);
+ debug_printf("\t\t.bf_stencil_write_mask = 0x%x\n", (*ptr).bf_stencil_write_mask);
+ debug_printf("\t\t.bf_stencil_test_mask = 0x%x\n", (*ptr).bf_stencil_test_mask);
+}
+
+void
+brw_dump_cc3(const struct brw_cc3 *ptr)
+{
+ debug_printf("\t\t.alpha_test_func = 0x%x\n", (*ptr).alpha_test_func);
+ debug_printf("\t\t.alpha_test = 0x%x\n", (*ptr).alpha_test);
+ debug_printf("\t\t.blend_enable = 0x%x\n", (*ptr).blend_enable);
+ debug_printf("\t\t.ia_blend_enable = 0x%x\n", (*ptr).ia_blend_enable);
+ debug_printf("\t\t.alpha_test_format = 0x%x\n", (*ptr).alpha_test_format);
+}
+
+void
+brw_dump_cc4(const struct brw_cc4 *ptr)
+{
+ debug_printf("\t\t.cc_viewport_state_offset = 0x%x\n", (*ptr).cc_viewport_state_offset);
+}
+
+void
+brw_dump_cc5(const struct brw_cc5 *ptr)
+{
+ debug_printf("\t\t.ia_dest_blend_factor = 0x%x\n", (*ptr).ia_dest_blend_factor);
+ debug_printf("\t\t.ia_src_blend_factor = 0x%x\n", (*ptr).ia_src_blend_factor);
+ debug_printf("\t\t.ia_blend_function = 0x%x\n", (*ptr).ia_blend_function);
+ debug_printf("\t\t.statistics_enable = 0x%x\n", (*ptr).statistics_enable);
+ debug_printf("\t\t.logicop_func = 0x%x\n", (*ptr).logicop_func);
+ debug_printf("\t\t.dither_enable = 0x%x\n", (*ptr).dither_enable);
+}
+
+void
+brw_dump_cc6(const struct brw_cc6 *ptr)
+{
+ debug_printf("\t\t.clamp_post_alpha_blend = 0x%x\n", (*ptr).clamp_post_alpha_blend);
+ debug_printf("\t\t.clamp_pre_alpha_blend = 0x%x\n", (*ptr).clamp_pre_alpha_blend);
+ debug_printf("\t\t.clamp_range = 0x%x\n", (*ptr).clamp_range);
+ debug_printf("\t\t.y_dither_offset = 0x%x\n", (*ptr).y_dither_offset);
+ debug_printf("\t\t.x_dither_offset = 0x%x\n", (*ptr).x_dither_offset);
+ debug_printf("\t\t.dest_blend_factor = 0x%x\n", (*ptr).dest_blend_factor);
+ debug_printf("\t\t.src_blend_factor = 0x%x\n", (*ptr).src_blend_factor);
+ debug_printf("\t\t.blend_function = 0x%x\n", (*ptr).blend_function);
+}
+
+void
+brw_dump_cc7(const struct brw_cc7 *ptr)
+{
+ debug_printf("\t\t.alpha_ref.f = %f\n", (*ptr).alpha_ref.f);
+ debug_printf("\t\t.alpha_ref.ub[0] = 0x%x\n", (*ptr).alpha_ref.ub[0]);
+ debug_printf("\t\t.alpha_ref.ub[1] = 0x%x\n", (*ptr).alpha_ref.ub[1]);
+ debug_printf("\t\t.alpha_ref.ub[2] = 0x%x\n", (*ptr).alpha_ref.ub[2]);
+ debug_printf("\t\t.alpha_ref.ub[3] = 0x%x\n", (*ptr).alpha_ref.ub[3]);
+}
+
+void
+brw_dump_cc_unit_state(const struct brw_cc_unit_state *ptr)
+{
+ debug_printf("\t\t.cc0.bf_stencil_pass_depth_pass_op = 0x%x\n", (*ptr).cc0.bf_stencil_pass_depth_pass_op);
+ debug_printf("\t\t.cc0.bf_stencil_pass_depth_fail_op = 0x%x\n", (*ptr).cc0.bf_stencil_pass_depth_fail_op);
+ debug_printf("\t\t.cc0.bf_stencil_fail_op = 0x%x\n", (*ptr).cc0.bf_stencil_fail_op);
+ debug_printf("\t\t.cc0.bf_stencil_func = 0x%x\n", (*ptr).cc0.bf_stencil_func);
+ debug_printf("\t\t.cc0.bf_stencil_enable = 0x%x\n", (*ptr).cc0.bf_stencil_enable);
+ debug_printf("\t\t.cc0.stencil_write_enable = 0x%x\n", (*ptr).cc0.stencil_write_enable);
+ debug_printf("\t\t.cc0.stencil_pass_depth_pass_op = 0x%x\n", (*ptr).cc0.stencil_pass_depth_pass_op);
+ debug_printf("\t\t.cc0.stencil_pass_depth_fail_op = 0x%x\n", (*ptr).cc0.stencil_pass_depth_fail_op);
+ debug_printf("\t\t.cc0.stencil_fail_op = 0x%x\n", (*ptr).cc0.stencil_fail_op);
+ debug_printf("\t\t.cc0.stencil_func = 0x%x\n", (*ptr).cc0.stencil_func);
+ debug_printf("\t\t.cc0.stencil_enable = 0x%x\n", (*ptr).cc0.stencil_enable);
+ debug_printf("\t\t.cc1.bf_stencil_ref = 0x%x\n", (*ptr).cc1.bf_stencil_ref);
+ debug_printf("\t\t.cc1.stencil_write_mask = 0x%x\n", (*ptr).cc1.stencil_write_mask);
+ debug_printf("\t\t.cc1.stencil_test_mask = 0x%x\n", (*ptr).cc1.stencil_test_mask);
+ debug_printf("\t\t.cc1.stencil_ref = 0x%x\n", (*ptr).cc1.stencil_ref);
+ debug_printf("\t\t.cc2.logicop_enable = 0x%x\n", (*ptr).cc2.logicop_enable);
+ debug_printf("\t\t.cc2.depth_write_enable = 0x%x\n", (*ptr).cc2.depth_write_enable);
+ debug_printf("\t\t.cc2.depth_test_function = 0x%x\n", (*ptr).cc2.depth_test_function);
+ debug_printf("\t\t.cc2.depth_test = 0x%x\n", (*ptr).cc2.depth_test);
+ debug_printf("\t\t.cc2.bf_stencil_write_mask = 0x%x\n", (*ptr).cc2.bf_stencil_write_mask);
+ debug_printf("\t\t.cc2.bf_stencil_test_mask = 0x%x\n", (*ptr).cc2.bf_stencil_test_mask);
+ debug_printf("\t\t.cc3.alpha_test_func = 0x%x\n", (*ptr).cc3.alpha_test_func);
+ debug_printf("\t\t.cc3.alpha_test = 0x%x\n", (*ptr).cc3.alpha_test);
+ debug_printf("\t\t.cc3.blend_enable = 0x%x\n", (*ptr).cc3.blend_enable);
+ debug_printf("\t\t.cc3.ia_blend_enable = 0x%x\n", (*ptr).cc3.ia_blend_enable);
+ debug_printf("\t\t.cc3.alpha_test_format = 0x%x\n", (*ptr).cc3.alpha_test_format);
+ debug_printf("\t\t.cc4.cc_viewport_state_offset = 0x%x\n", (*ptr).cc4.cc_viewport_state_offset);
+ debug_printf("\t\t.cc5.ia_dest_blend_factor = 0x%x\n", (*ptr).cc5.ia_dest_blend_factor);
+ debug_printf("\t\t.cc5.ia_src_blend_factor = 0x%x\n", (*ptr).cc5.ia_src_blend_factor);
+ debug_printf("\t\t.cc5.ia_blend_function = 0x%x\n", (*ptr).cc5.ia_blend_function);
+ debug_printf("\t\t.cc5.statistics_enable = 0x%x\n", (*ptr).cc5.statistics_enable);
+ debug_printf("\t\t.cc5.logicop_func = 0x%x\n", (*ptr).cc5.logicop_func);
+ debug_printf("\t\t.cc5.dither_enable = 0x%x\n", (*ptr).cc5.dither_enable);
+ debug_printf("\t\t.cc6.clamp_post_alpha_blend = 0x%x\n", (*ptr).cc6.clamp_post_alpha_blend);
+ debug_printf("\t\t.cc6.clamp_pre_alpha_blend = 0x%x\n", (*ptr).cc6.clamp_pre_alpha_blend);
+ debug_printf("\t\t.cc6.clamp_range = 0x%x\n", (*ptr).cc6.clamp_range);
+ debug_printf("\t\t.cc6.y_dither_offset = 0x%x\n", (*ptr).cc6.y_dither_offset);
+ debug_printf("\t\t.cc6.x_dither_offset = 0x%x\n", (*ptr).cc6.x_dither_offset);
+ debug_printf("\t\t.cc6.dest_blend_factor = 0x%x\n", (*ptr).cc6.dest_blend_factor);
+ debug_printf("\t\t.cc6.src_blend_factor = 0x%x\n", (*ptr).cc6.src_blend_factor);
+ debug_printf("\t\t.cc6.blend_function = 0x%x\n", (*ptr).cc6.blend_function);
+ debug_printf("\t\t.cc7.alpha_ref.f = %f\n", (*ptr).cc7.alpha_ref.f);
+ debug_printf("\t\t.cc7.alpha_ref.ub[0] = 0x%x\n", (*ptr).cc7.alpha_ref.ub[0]);
+ debug_printf("\t\t.cc7.alpha_ref.ub[1] = 0x%x\n", (*ptr).cc7.alpha_ref.ub[1]);
+ debug_printf("\t\t.cc7.alpha_ref.ub[2] = 0x%x\n", (*ptr).cc7.alpha_ref.ub[2]);
+ debug_printf("\t\t.cc7.alpha_ref.ub[3] = 0x%x\n", (*ptr).cc7.alpha_ref.ub[3]);
+}
+
+void
+brw_dump_cc_viewport(const struct brw_cc_viewport *ptr)
+{
+ debug_printf("\t\t.min_depth = %f\n", (*ptr).min_depth);
+ debug_printf("\t\t.max_depth = %f\n", (*ptr).max_depth);
+}
+
+void
+brw_dump_clip_unit_state(const struct brw_clip_unit_state *ptr)
+{
+ debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", (*ptr).thread0.grf_reg_count);
+ debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", (*ptr).thread0.kernel_start_pointer);
+ debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", (*ptr).thread1.sw_exception_enable);
+ debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", (*ptr).thread1.mask_stack_exception_enable);
+ debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", (*ptr).thread1.illegal_op_exception_enable);
+ debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", (*ptr).thread1.floating_point_mode);
+ debug_printf("\t\t.thread1.thread_priority = 0x%x\n", (*ptr).thread1.thread_priority);
+ debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", (*ptr).thread1.binding_table_entry_count);
+ debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", (*ptr).thread1.single_program_flow);
+ debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", (*ptr).thread2.per_thread_scratch_space);
+ debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", (*ptr).thread2.scratch_space_base_pointer);
+ debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", (*ptr).thread3.dispatch_grf_start_reg);
+ debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", (*ptr).thread3.urb_entry_read_offset);
+ debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", (*ptr).thread3.urb_entry_read_length);
+ debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", (*ptr).thread3.const_urb_entry_read_offset);
+ debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", (*ptr).thread3.const_urb_entry_read_length);
+ debug_printf("\t\t.thread4.gs_output_stats = 0x%x\n", (*ptr).thread4.gs_output_stats);
+ debug_printf("\t\t.thread4.stats_enable = 0x%x\n", (*ptr).thread4.stats_enable);
+ debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", (*ptr).thread4.nr_urb_entries);
+ debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", (*ptr).thread4.urb_entry_allocation_size);
+ debug_printf("\t\t.thread4.max_threads = 0x%x\n", (*ptr).thread4.max_threads);
+ debug_printf("\t\t.clip5.clip_mode = 0x%x\n", (*ptr).clip5.clip_mode);
+ debug_printf("\t\t.clip5.userclip_enable_flags = 0x%x\n", (*ptr).clip5.userclip_enable_flags);
+ debug_printf("\t\t.clip5.userclip_must_clip = 0x%x\n", (*ptr).clip5.userclip_must_clip);
+ debug_printf("\t\t.clip5.negative_w_clip_test = 0x%x\n", (*ptr).clip5.negative_w_clip_test);
+ debug_printf("\t\t.clip5.guard_band_enable = 0x%x\n", (*ptr).clip5.guard_band_enable);
+ debug_printf("\t\t.clip5.viewport_z_clip_enable = 0x%x\n", (*ptr).clip5.viewport_z_clip_enable);
+ debug_printf("\t\t.clip5.viewport_xy_clip_enable = 0x%x\n", (*ptr).clip5.viewport_xy_clip_enable);
+ debug_printf("\t\t.clip5.vertex_position_space = 0x%x\n", (*ptr).clip5.vertex_position_space);
+ debug_printf("\t\t.clip5.api_mode = 0x%x\n", (*ptr).clip5.api_mode);
+ debug_printf("\t\t.clip6.clipper_viewport_state_ptr = 0x%x\n", (*ptr).clip6.clipper_viewport_state_ptr);
+ debug_printf("\t\t.viewport_xmin = %f\n", (*ptr).viewport_xmin);
+ debug_printf("\t\t.viewport_xmax = %f\n", (*ptr).viewport_xmax);
+ debug_printf("\t\t.viewport_ymin = %f\n", (*ptr).viewport_ymin);
+ debug_printf("\t\t.viewport_ymax = %f\n", (*ptr).viewport_ymax);
+}
+
+void
+brw_dump_clipper_viewport(const struct brw_clipper_viewport *ptr)
+{
+ debug_printf("\t\t.xmin = %f\n", (*ptr).xmin);
+ debug_printf("\t\t.xmax = %f\n", (*ptr).xmax);
+ debug_printf("\t\t.ymin = %f\n", (*ptr).ymin);
+ debug_printf("\t\t.ymax = %f\n", (*ptr).ymax);
+}
+
+void
+brw_dump_constant_buffer(const struct brw_constant_buffer *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.valid = 0x%x\n", (*ptr).header.valid);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.buffer_length = 0x%x\n", (*ptr).bits0.buffer_length);
+ debug_printf("\t\t.bits0.buffer_address = 0x%x\n", (*ptr).bits0.buffer_address);
+}
+
+void
+brw_dump_cs_urb_state(const struct brw_cs_urb_state *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.nr_urb_entries = 0x%x\n", (*ptr).bits0.nr_urb_entries);
+ debug_printf("\t\t.bits0.urb_entry_size = 0x%x\n", (*ptr).bits0.urb_entry_size);
+}
+
+void
+brw_dump_depthbuffer(const struct brw_depthbuffer *ptr)
+{
+ debug_printf("\t\t.header.bits.length = 0x%x\n", (*ptr).header.bits.length);
+ debug_printf("\t\t.header.bits.opcode = 0x%x\n", (*ptr).header.bits.opcode);
+ debug_printf("\t\t.dword1.bits.pitch = 0x%x\n", (*ptr).dword1.bits.pitch);
+ debug_printf("\t\t.dword1.bits.format = 0x%x\n", (*ptr).dword1.bits.format);
+ debug_printf("\t\t.dword1.bits.software_tiled_rendering_mode = 0x%x\n", (*ptr).dword1.bits.software_tiled_rendering_mode);
+ debug_printf("\t\t.dword1.bits.depth_offset_disable = 0x%x\n", (*ptr).dword1.bits.depth_offset_disable);
+ debug_printf("\t\t.dword1.bits.tile_walk = 0x%x\n", (*ptr).dword1.bits.tile_walk);
+ debug_printf("\t\t.dword1.bits.tiled_surface = 0x%x\n", (*ptr).dword1.bits.tiled_surface);
+ debug_printf("\t\t.dword1.bits.surface_type = 0x%x\n", (*ptr).dword1.bits.surface_type);
+ debug_printf("\t\t.dword2_base_addr = 0x%x\n", (*ptr).dword2_base_addr);
+ debug_printf("\t\t.dword3.bits.mipmap_layout = 0x%x\n", (*ptr).dword3.bits.mipmap_layout);
+ debug_printf("\t\t.dword3.bits.lod = 0x%x\n", (*ptr).dword3.bits.lod);
+ debug_printf("\t\t.dword3.bits.width = 0x%x\n", (*ptr).dword3.bits.width);
+ debug_printf("\t\t.dword3.bits.height = 0x%x\n", (*ptr).dword3.bits.height);
+ debug_printf("\t\t.dword4.bits.min_array_element = 0x%x\n", (*ptr).dword4.bits.min_array_element);
+ debug_printf("\t\t.dword4.bits.depth = 0x%x\n", (*ptr).dword4.bits.depth);
+}
+
+void
+brw_dump_depthbuffer_g4x(const struct brw_depthbuffer_g4x *ptr)
+{
+ debug_printf("\t\t.header.bits.length = 0x%x\n", (*ptr).header.bits.length);
+ debug_printf("\t\t.header.bits.opcode = 0x%x\n", (*ptr).header.bits.opcode);
+ debug_printf("\t\t.dword1.bits.pitch = 0x%x\n", (*ptr).dword1.bits.pitch);
+ debug_printf("\t\t.dword1.bits.format = 0x%x\n", (*ptr).dword1.bits.format);
+ debug_printf("\t\t.dword1.bits.software_tiled_rendering_mode = 0x%x\n", (*ptr).dword1.bits.software_tiled_rendering_mode);
+ debug_printf("\t\t.dword1.bits.depth_offset_disable = 0x%x\n", (*ptr).dword1.bits.depth_offset_disable);
+ debug_printf("\t\t.dword1.bits.tile_walk = 0x%x\n", (*ptr).dword1.bits.tile_walk);
+ debug_printf("\t\t.dword1.bits.tiled_surface = 0x%x\n", (*ptr).dword1.bits.tiled_surface);
+ debug_printf("\t\t.dword1.bits.surface_type = 0x%x\n", (*ptr).dword1.bits.surface_type);
+ debug_printf("\t\t.dword2_base_addr = 0x%x\n", (*ptr).dword2_base_addr);
+ debug_printf("\t\t.dword3.bits.mipmap_layout = 0x%x\n", (*ptr).dword3.bits.mipmap_layout);
+ debug_printf("\t\t.dword3.bits.lod = 0x%x\n", (*ptr).dword3.bits.lod);
+ debug_printf("\t\t.dword3.bits.width = 0x%x\n", (*ptr).dword3.bits.width);
+ debug_printf("\t\t.dword3.bits.height = 0x%x\n", (*ptr).dword3.bits.height);
+ debug_printf("\t\t.dword4.bits.min_array_element = 0x%x\n", (*ptr).dword4.bits.min_array_element);
+ debug_printf("\t\t.dword4.bits.depth = 0x%x\n", (*ptr).dword4.bits.depth);
+ debug_printf("\t\t.dword5.bits.xoffset = 0x%x\n", (*ptr).dword5.bits.xoffset);
+ debug_printf("\t\t.dword5.bits.yoffset = 0x%x\n", (*ptr).dword5.bits.yoffset);
+}
+
+void
+brw_dump_drawrect(const struct brw_drawrect *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.xmin = 0x%x\n", (*ptr).xmin);
+ debug_printf("\t\t.ymin = 0x%x\n", (*ptr).ymin);
+ debug_printf("\t\t.xmax = 0x%x\n", (*ptr).xmax);
+ debug_printf("\t\t.ymax = 0x%x\n", (*ptr).ymax);
+ debug_printf("\t\t.xorg = 0x%x\n", (*ptr).xorg);
+ debug_printf("\t\t.yorg = 0x%x\n", (*ptr).yorg);
+}
+
+void
+brw_dump_global_depth_offset_clamp(const struct brw_global_depth_offset_clamp *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.depth_offset_clamp = %f\n", (*ptr).depth_offset_clamp);
+}
+
+void
+brw_dump_gs_unit_state(const struct brw_gs_unit_state *ptr)
+{
+ debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", (*ptr).thread0.grf_reg_count);
+ debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", (*ptr).thread0.kernel_start_pointer);
+ debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", (*ptr).thread1.ext_halt_exception_enable);
+ debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", (*ptr).thread1.sw_exception_enable);
+ debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", (*ptr).thread1.mask_stack_exception_enable);
+ debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", (*ptr).thread1.timeout_exception_enable);
+ debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", (*ptr).thread1.illegal_op_exception_enable);
+ debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", (*ptr).thread1.depth_coef_urb_read_offset);
+ debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", (*ptr).thread1.floating_point_mode);
+ debug_printf("\t\t.thread1.thread_priority = 0x%x\n", (*ptr).thread1.thread_priority);
+ debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", (*ptr).thread1.binding_table_entry_count);
+ debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", (*ptr).thread1.single_program_flow);
+ debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", (*ptr).thread2.per_thread_scratch_space);
+ debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", (*ptr).thread2.scratch_space_base_pointer);
+ debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", (*ptr).thread3.dispatch_grf_start_reg);
+ debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", (*ptr).thread3.urb_entry_read_offset);
+ debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", (*ptr).thread3.urb_entry_read_length);
+ debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", (*ptr).thread3.const_urb_entry_read_offset);
+ debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", (*ptr).thread3.const_urb_entry_read_length);
+ debug_printf("\t\t.thread4.rendering_enable = 0x%x\n", (*ptr).thread4.rendering_enable);
+ debug_printf("\t\t.thread4.stats_enable = 0x%x\n", (*ptr).thread4.stats_enable);
+ debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", (*ptr).thread4.nr_urb_entries);
+ debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", (*ptr).thread4.urb_entry_allocation_size);
+ debug_printf("\t\t.thread4.max_threads = 0x%x\n", (*ptr).thread4.max_threads);
+ debug_printf("\t\t.gs5.sampler_count = 0x%x\n", (*ptr).gs5.sampler_count);
+ debug_printf("\t\t.gs5.sampler_state_pointer = 0x%x\n", (*ptr).gs5.sampler_state_pointer);
+ debug_printf("\t\t.gs6.max_vp_index = 0x%x\n", (*ptr).gs6.max_vp_index);
+ debug_printf("\t\t.gs6.svbi_post_inc_value = 0x%x\n", (*ptr).gs6.svbi_post_inc_value);
+ debug_printf("\t\t.gs6.svbi_post_inc_enable = 0x%x\n", (*ptr).gs6.svbi_post_inc_enable);
+ debug_printf("\t\t.gs6.svbi_payload = 0x%x\n", (*ptr).gs6.svbi_payload);
+ debug_printf("\t\t.gs6.discard_adjaceny = 0x%x\n", (*ptr).gs6.discard_adjaceny);
+ debug_printf("\t\t.gs6.reorder_enable = 0x%x\n", (*ptr).gs6.reorder_enable);
+}
+
+void
+brw_dump_indexbuffer(const struct brw_indexbuffer *ptr)
+{
+ debug_printf("\t\t.header.bits.length = 0x%x\n", (*ptr).header.bits.length);
+ debug_printf("\t\t.header.bits.index_format = 0x%x\n", (*ptr).header.bits.index_format);
+ debug_printf("\t\t.header.bits.cut_index_enable = 0x%x\n", (*ptr).header.bits.cut_index_enable);
+ debug_printf("\t\t.header.bits.opcode = 0x%x\n", (*ptr).header.bits.opcode);
+ debug_printf("\t\t.buffer_start = 0x%x\n", (*ptr).buffer_start);
+ debug_printf("\t\t.buffer_end = 0x%x\n", (*ptr).buffer_end);
+}
+
+void
+brw_dump_line_stipple(const struct brw_line_stipple *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.pattern = 0x%x\n", (*ptr).bits0.pattern);
+ debug_printf("\t\t.bits1.repeat_count = 0x%x\n", (*ptr).bits1.repeat_count);
+ debug_printf("\t\t.bits1.inverse_repeat_count = 0x%x\n", (*ptr).bits1.inverse_repeat_count);
+}
+
+void
+brw_dump_mi_flush(const struct brw_mi_flush *ptr)
+{
+ debug_printf("\t\t.flags = 0x%x\n", (*ptr).flags);
+ debug_printf("\t\t.opcode = 0x%x\n", (*ptr).opcode);
+}
+
+void
+brw_dump_pipe_control(const struct brw_pipe_control *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.notify_enable = 0x%x\n", (*ptr).header.notify_enable);
+ debug_printf("\t\t.header.texture_cache_flush_enable = 0x%x\n", (*ptr).header.texture_cache_flush_enable);
+ debug_printf("\t\t.header.indirect_state_pointers_disable = 0x%x\n", (*ptr).header.indirect_state_pointers_disable);
+ debug_printf("\t\t.header.instruction_state_cache_flush_enable = 0x%x\n", (*ptr).header.instruction_state_cache_flush_enable);
+ debug_printf("\t\t.header.write_cache_flush_enable = 0x%x\n", (*ptr).header.write_cache_flush_enable);
+ debug_printf("\t\t.header.depth_stall_enable = 0x%x\n", (*ptr).header.depth_stall_enable);
+ debug_printf("\t\t.header.post_sync_operation = 0x%x\n", (*ptr).header.post_sync_operation);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits1.dest_addr_type = 0x%x\n", (*ptr).bits1.dest_addr_type);
+ debug_printf("\t\t.bits1.dest_addr = 0x%x\n", (*ptr).bits1.dest_addr);
+ debug_printf("\t\t.data0 = 0x%x\n", (*ptr).data0);
+ debug_printf("\t\t.data1 = 0x%x\n", (*ptr).data1);
+}
+
+void
+brw_dump_pipeline_select(const struct brw_pipeline_select *ptr)
+{
+ debug_printf("\t\t.header.pipeline_select = 0x%x\n", (*ptr).header.pipeline_select);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+}
+
+void
+brw_dump_pipelined_state_pointers(const struct brw_pipelined_state_pointers *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.vs.offset = 0x%x\n", (*ptr).vs.offset);
+ debug_printf("\t\t.gs.enable = 0x%x\n", (*ptr).gs.enable);
+ debug_printf("\t\t.gs.offset = 0x%x\n", (*ptr).gs.offset);
+ debug_printf("\t\t.clp.enable = 0x%x\n", (*ptr).clp.enable);
+ debug_printf("\t\t.clp.offset = 0x%x\n", (*ptr).clp.offset);
+ debug_printf("\t\t.sf.offset = 0x%x\n", (*ptr).sf.offset);
+ debug_printf("\t\t.wm.offset = 0x%x\n", (*ptr).wm.offset);
+ debug_printf("\t\t.cc.offset = 0x%x\n", (*ptr).cc.offset);
+}
+
+void
+brw_dump_polygon_stipple(const struct brw_polygon_stipple *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.stipple[0] = 0x%x\n", (*ptr).stipple[0]);
+ debug_printf("\t\t.stipple[1] = 0x%x\n", (*ptr).stipple[1]);
+ debug_printf("\t\t.stipple[2] = 0x%x\n", (*ptr).stipple[2]);
+ debug_printf("\t\t.stipple[3] = 0x%x\n", (*ptr).stipple[3]);
+ debug_printf("\t\t.stipple[4] = 0x%x\n", (*ptr).stipple[4]);
+ debug_printf("\t\t.stipple[5] = 0x%x\n", (*ptr).stipple[5]);
+ debug_printf("\t\t.stipple[6] = 0x%x\n", (*ptr).stipple[6]);
+ debug_printf("\t\t.stipple[7] = 0x%x\n", (*ptr).stipple[7]);
+ debug_printf("\t\t.stipple[8] = 0x%x\n", (*ptr).stipple[8]);
+ debug_printf("\t\t.stipple[9] = 0x%x\n", (*ptr).stipple[9]);
+ debug_printf("\t\t.stipple[10] = 0x%x\n", (*ptr).stipple[10]);
+ debug_printf("\t\t.stipple[11] = 0x%x\n", (*ptr).stipple[11]);
+ debug_printf("\t\t.stipple[12] = 0x%x\n", (*ptr).stipple[12]);
+ debug_printf("\t\t.stipple[13] = 0x%x\n", (*ptr).stipple[13]);
+ debug_printf("\t\t.stipple[14] = 0x%x\n", (*ptr).stipple[14]);
+ debug_printf("\t\t.stipple[15] = 0x%x\n", (*ptr).stipple[15]);
+ debug_printf("\t\t.stipple[16] = 0x%x\n", (*ptr).stipple[16]);
+ debug_printf("\t\t.stipple[17] = 0x%x\n", (*ptr).stipple[17]);
+ debug_printf("\t\t.stipple[18] = 0x%x\n", (*ptr).stipple[18]);
+ debug_printf("\t\t.stipple[19] = 0x%x\n", (*ptr).stipple[19]);
+ debug_printf("\t\t.stipple[20] = 0x%x\n", (*ptr).stipple[20]);
+ debug_printf("\t\t.stipple[21] = 0x%x\n", (*ptr).stipple[21]);
+ debug_printf("\t\t.stipple[22] = 0x%x\n", (*ptr).stipple[22]);
+ debug_printf("\t\t.stipple[23] = 0x%x\n", (*ptr).stipple[23]);
+ debug_printf("\t\t.stipple[24] = 0x%x\n", (*ptr).stipple[24]);
+ debug_printf("\t\t.stipple[25] = 0x%x\n", (*ptr).stipple[25]);
+ debug_printf("\t\t.stipple[26] = 0x%x\n", (*ptr).stipple[26]);
+ debug_printf("\t\t.stipple[27] = 0x%x\n", (*ptr).stipple[27]);
+ debug_printf("\t\t.stipple[28] = 0x%x\n", (*ptr).stipple[28]);
+ debug_printf("\t\t.stipple[29] = 0x%x\n", (*ptr).stipple[29]);
+ debug_printf("\t\t.stipple[30] = 0x%x\n", (*ptr).stipple[30]);
+ debug_printf("\t\t.stipple[31] = 0x%x\n", (*ptr).stipple[31]);
+}
+
+void
+brw_dump_polygon_stipple_offset(const struct brw_polygon_stipple_offset *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.y_offset = 0x%x\n", (*ptr).bits0.y_offset);
+ debug_printf("\t\t.bits0.x_offset = 0x%x\n", (*ptr).bits0.x_offset);
+}
+
+void
+brw_dump_sampler_default_color(const struct brw_sampler_default_color *ptr)
+{
+ debug_printf("\t\t.color[0] = %f\n", (*ptr).color[0]);
+ debug_printf("\t\t.color[1] = %f\n", (*ptr).color[1]);
+ debug_printf("\t\t.color[2] = %f\n", (*ptr).color[2]);
+ debug_printf("\t\t.color[3] = %f\n", (*ptr).color[3]);
+}
+
+void
+brw_dump_sampler_state(const struct brw_sampler_state *ptr)
+{
+ debug_printf("\t\t.ss0.shadow_function = 0x%x\n", (*ptr).ss0.shadow_function);
+ debug_printf("\t\t.ss0.lod_bias = 0x%x\n", (*ptr).ss0.lod_bias);
+ debug_printf("\t\t.ss0.min_filter = 0x%x\n", (*ptr).ss0.min_filter);
+ debug_printf("\t\t.ss0.mag_filter = 0x%x\n", (*ptr).ss0.mag_filter);
+ debug_printf("\t\t.ss0.mip_filter = 0x%x\n", (*ptr).ss0.mip_filter);
+ debug_printf("\t\t.ss0.base_level = 0x%x\n", (*ptr).ss0.base_level);
+ debug_printf("\t\t.ss0.lod_preclamp = 0x%x\n", (*ptr).ss0.lod_preclamp);
+ debug_printf("\t\t.ss0.default_color_mode = 0x%x\n", (*ptr).ss0.default_color_mode);
+ debug_printf("\t\t.ss0.disable = 0x%x\n", (*ptr).ss0.disable);
+ debug_printf("\t\t.ss1.r_wrap_mode = 0x%x\n", (*ptr).ss1.r_wrap_mode);
+ debug_printf("\t\t.ss1.t_wrap_mode = 0x%x\n", (*ptr).ss1.t_wrap_mode);
+ debug_printf("\t\t.ss1.s_wrap_mode = 0x%x\n", (*ptr).ss1.s_wrap_mode);
+ debug_printf("\t\t.ss1.max_lod = 0x%x\n", (*ptr).ss1.max_lod);
+ debug_printf("\t\t.ss1.min_lod = 0x%x\n", (*ptr).ss1.min_lod);
+ debug_printf("\t\t.ss2.default_color_pointer = 0x%x\n", (*ptr).ss2.default_color_pointer);
+ debug_printf("\t\t.ss3.max_aniso = 0x%x\n", (*ptr).ss3.max_aniso);
+ debug_printf("\t\t.ss3.chroma_key_mode = 0x%x\n", (*ptr).ss3.chroma_key_mode);
+ debug_printf("\t\t.ss3.chroma_key_index = 0x%x\n", (*ptr).ss3.chroma_key_index);
+ debug_printf("\t\t.ss3.chroma_key_enable = 0x%x\n", (*ptr).ss3.chroma_key_enable);
+ debug_printf("\t\t.ss3.monochrome_filter_width = 0x%x\n", (*ptr).ss3.monochrome_filter_width);
+ debug_printf("\t\t.ss3.monochrome_filter_height = 0x%x\n", (*ptr).ss3.monochrome_filter_height);
+}
+
+void
+brw_dump_sf_unit_state(const struct brw_sf_unit_state *ptr)
+{
+ debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", (*ptr).thread0.grf_reg_count);
+ debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", (*ptr).thread0.kernel_start_pointer);
+ debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", (*ptr).thread1.ext_halt_exception_enable);
+ debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", (*ptr).thread1.sw_exception_enable);
+ debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", (*ptr).thread1.mask_stack_exception_enable);
+ debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", (*ptr).thread1.timeout_exception_enable);
+ debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", (*ptr).thread1.illegal_op_exception_enable);
+ debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", (*ptr).thread1.depth_coef_urb_read_offset);
+ debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", (*ptr).thread1.floating_point_mode);
+ debug_printf("\t\t.thread1.thread_priority = 0x%x\n", (*ptr).thread1.thread_priority);
+ debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", (*ptr).thread1.binding_table_entry_count);
+ debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", (*ptr).thread1.single_program_flow);
+ debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", (*ptr).thread2.per_thread_scratch_space);
+ debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", (*ptr).thread2.scratch_space_base_pointer);
+ debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", (*ptr).thread3.dispatch_grf_start_reg);
+ debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", (*ptr).thread3.urb_entry_read_offset);
+ debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", (*ptr).thread3.urb_entry_read_length);
+ debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", (*ptr).thread3.const_urb_entry_read_offset);
+ debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", (*ptr).thread3.const_urb_entry_read_length);
+ debug_printf("\t\t.thread4.stats_enable = 0x%x\n", (*ptr).thread4.stats_enable);
+ debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", (*ptr).thread4.nr_urb_entries);
+ debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", (*ptr).thread4.urb_entry_allocation_size);
+ debug_printf("\t\t.thread4.max_threads = 0x%x\n", (*ptr).thread4.max_threads);
+ debug_printf("\t\t.sf5.front_winding = 0x%x\n", (*ptr).sf5.front_winding);
+ debug_printf("\t\t.sf5.viewport_transform = 0x%x\n", (*ptr).sf5.viewport_transform);
+ debug_printf("\t\t.sf5.sf_viewport_state_offset = 0x%x\n", (*ptr).sf5.sf_viewport_state_offset);
+ debug_printf("\t\t.sf6.dest_org_vbias = 0x%x\n", (*ptr).sf6.dest_org_vbias);
+ debug_printf("\t\t.sf6.dest_org_hbias = 0x%x\n", (*ptr).sf6.dest_org_hbias);
+ debug_printf("\t\t.sf6.scissor = 0x%x\n", (*ptr).sf6.scissor);
+ debug_printf("\t\t.sf6.disable_2x2_trifilter = 0x%x\n", (*ptr).sf6.disable_2x2_trifilter);
+ debug_printf("\t\t.sf6.disable_zero_pix_trifilter = 0x%x\n", (*ptr).sf6.disable_zero_pix_trifilter);
+ debug_printf("\t\t.sf6.point_rast_rule = 0x%x\n", (*ptr).sf6.point_rast_rule);
+ debug_printf("\t\t.sf6.line_endcap_aa_region_width = 0x%x\n", (*ptr).sf6.line_endcap_aa_region_width);
+ debug_printf("\t\t.sf6.line_width = 0x%x\n", (*ptr).sf6.line_width);
+ debug_printf("\t\t.sf6.fast_scissor_disable = 0x%x\n", (*ptr).sf6.fast_scissor_disable);
+ debug_printf("\t\t.sf6.cull_mode = 0x%x\n", (*ptr).sf6.cull_mode);
+ debug_printf("\t\t.sf6.aa_enable = 0x%x\n", (*ptr).sf6.aa_enable);
+ debug_printf("\t\t.sf7.point_size = 0x%x\n", (*ptr).sf7.point_size);
+ debug_printf("\t\t.sf7.use_point_size_state = 0x%x\n", (*ptr).sf7.use_point_size_state);
+ debug_printf("\t\t.sf7.subpixel_precision = 0x%x\n", (*ptr).sf7.subpixel_precision);
+ debug_printf("\t\t.sf7.sprite_point = 0x%x\n", (*ptr).sf7.sprite_point);
+ debug_printf("\t\t.sf7.aa_line_distance_mode = 0x%x\n", (*ptr).sf7.aa_line_distance_mode);
+ debug_printf("\t\t.sf7.trifan_pv = 0x%x\n", (*ptr).sf7.trifan_pv);
+ debug_printf("\t\t.sf7.linestrip_pv = 0x%x\n", (*ptr).sf7.linestrip_pv);
+ debug_printf("\t\t.sf7.tristrip_pv = 0x%x\n", (*ptr).sf7.tristrip_pv);
+ debug_printf("\t\t.sf7.line_last_pixel_enable = 0x%x\n", (*ptr).sf7.line_last_pixel_enable);
+}
+
+void
+brw_dump_sf_viewport(const struct brw_sf_viewport *ptr)
+{
+ debug_printf("\t\t.viewport.m00 = %f\n", (*ptr).viewport.m00);
+ debug_printf("\t\t.viewport.m11 = %f\n", (*ptr).viewport.m11);
+ debug_printf("\t\t.viewport.m22 = %f\n", (*ptr).viewport.m22);
+ debug_printf("\t\t.viewport.m30 = %f\n", (*ptr).viewport.m30);
+ debug_printf("\t\t.viewport.m31 = %f\n", (*ptr).viewport.m31);
+ debug_printf("\t\t.viewport.m32 = %f\n", (*ptr).viewport.m32);
+ debug_printf("\t\t.scissor.xmin = 0x%x\n", (*ptr).scissor.xmin);
+ debug_printf("\t\t.scissor.ymin = 0x%x\n", (*ptr).scissor.ymin);
+ debug_printf("\t\t.scissor.xmax = 0x%x\n", (*ptr).scissor.xmax);
+ debug_printf("\t\t.scissor.ymax = 0x%x\n", (*ptr).scissor.ymax);
+}
+
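+/*
+ * The brw_dump_ss0 .. brw_dump_ss3 helpers that follow print the same
+ * sampler-state fields, dword by dword, that brw_dump_sampler_state above
+ * prints in a single call; presumably they are for callers that only hold
+ * an individual dword rather than a whole brw_sampler_state.
+ */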
+void
+brw_dump_ss0(const struct brw_ss0 *ptr)
+{
+ debug_printf("\t\t.shadow_function = 0x%x\n", (*ptr).shadow_function);
+ debug_printf("\t\t.lod_bias = 0x%x\n", (*ptr).lod_bias);
+ debug_printf("\t\t.min_filter = 0x%x\n", (*ptr).min_filter);
+ debug_printf("\t\t.mag_filter = 0x%x\n", (*ptr).mag_filter);
+ debug_printf("\t\t.mip_filter = 0x%x\n", (*ptr).mip_filter);
+ debug_printf("\t\t.base_level = 0x%x\n", (*ptr).base_level);
+ debug_printf("\t\t.lod_preclamp = 0x%x\n", (*ptr).lod_preclamp);
+ debug_printf("\t\t.default_color_mode = 0x%x\n", (*ptr).default_color_mode);
+ debug_printf("\t\t.disable = 0x%x\n", (*ptr).disable);
+}
+
+void
+brw_dump_ss1(const struct brw_ss1 *ptr)
+{
+ debug_printf("\t\t.r_wrap_mode = 0x%x\n", (*ptr).r_wrap_mode);
+ debug_printf("\t\t.t_wrap_mode = 0x%x\n", (*ptr).t_wrap_mode);
+ debug_printf("\t\t.s_wrap_mode = 0x%x\n", (*ptr).s_wrap_mode);
+ debug_printf("\t\t.max_lod = 0x%x\n", (*ptr).max_lod);
+ debug_printf("\t\t.min_lod = 0x%x\n", (*ptr).min_lod);
+}
+
+void
+brw_dump_ss2(const struct brw_ss2 *ptr)
+{
+ debug_printf("\t\t.default_color_pointer = 0x%x\n", (*ptr).default_color_pointer);
+}
+
+void
+brw_dump_ss3(const struct brw_ss3 *ptr)
+{
+ debug_printf("\t\t.max_aniso = 0x%x\n", (*ptr).max_aniso);
+ debug_printf("\t\t.chroma_key_mode = 0x%x\n", (*ptr).chroma_key_mode);
+ debug_printf("\t\t.chroma_key_index = 0x%x\n", (*ptr).chroma_key_index);
+ debug_printf("\t\t.chroma_key_enable = 0x%x\n", (*ptr).chroma_key_enable);
+ debug_printf("\t\t.monochrome_filter_width = 0x%x\n", (*ptr).monochrome_filter_width);
+ debug_printf("\t\t.monochrome_filter_height = 0x%x\n", (*ptr).monochrome_filter_height);
+}
+
+void
+brw_dump_state_base_address(const struct brw_state_base_address *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.modify_enable = 0x%x\n", (*ptr).bits0.modify_enable);
+ debug_printf("\t\t.bits0.general_state_address = 0x%x\n", (*ptr).bits0.general_state_address);
+ debug_printf("\t\t.bits1.modify_enable = 0x%x\n", (*ptr).bits1.modify_enable);
+ debug_printf("\t\t.bits1.surface_state_address = 0x%x\n", (*ptr).bits1.surface_state_address);
+ debug_printf("\t\t.bits2.modify_enable = 0x%x\n", (*ptr).bits2.modify_enable);
+ debug_printf("\t\t.bits2.indirect_object_state_address = 0x%x\n", (*ptr).bits2.indirect_object_state_address);
+ debug_printf("\t\t.bits3.modify_enable = 0x%x\n", (*ptr).bits3.modify_enable);
+ debug_printf("\t\t.bits3.general_state_upper_bound = 0x%x\n", (*ptr).bits3.general_state_upper_bound);
+ debug_printf("\t\t.bits4.modify_enable = 0x%x\n", (*ptr).bits4.modify_enable);
+ debug_printf("\t\t.bits4.indirect_object_state_upper_bound = 0x%x\n", (*ptr).bits4.indirect_object_state_upper_bound);
+}
+
+void
+brw_dump_state_prefetch(const struct brw_state_prefetch *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.prefetch_count = 0x%x\n", (*ptr).bits0.prefetch_count);
+ debug_printf("\t\t.bits0.prefetch_pointer = 0x%x\n", (*ptr).bits0.prefetch_pointer);
+}
+
+void
+brw_dump_surf_ss0(const struct brw_surf_ss0 *ptr)
+{
+ debug_printf("\t\t.cube_pos_z = 0x%x\n", (*ptr).cube_pos_z);
+ debug_printf("\t\t.cube_neg_z = 0x%x\n", (*ptr).cube_neg_z);
+ debug_printf("\t\t.cube_pos_y = 0x%x\n", (*ptr).cube_pos_y);
+ debug_printf("\t\t.cube_neg_y = 0x%x\n", (*ptr).cube_neg_y);
+ debug_printf("\t\t.cube_pos_x = 0x%x\n", (*ptr).cube_pos_x);
+ debug_printf("\t\t.cube_neg_x = 0x%x\n", (*ptr).cube_neg_x);
+ debug_printf("\t\t.mipmap_layout_mode = 0x%x\n", (*ptr).mipmap_layout_mode);
+ debug_printf("\t\t.vert_line_stride_ofs = 0x%x\n", (*ptr).vert_line_stride_ofs);
+ debug_printf("\t\t.vert_line_stride = 0x%x\n", (*ptr).vert_line_stride);
+ debug_printf("\t\t.color_blend = 0x%x\n", (*ptr).color_blend);
+ debug_printf("\t\t.writedisable_blue = 0x%x\n", (*ptr).writedisable_blue);
+ debug_printf("\t\t.writedisable_green = 0x%x\n", (*ptr).writedisable_green);
+ debug_printf("\t\t.writedisable_red = 0x%x\n", (*ptr).writedisable_red);
+ debug_printf("\t\t.writedisable_alpha = 0x%x\n", (*ptr).writedisable_alpha);
+ debug_printf("\t\t.surface_format = 0x%x\n", (*ptr).surface_format);
+ debug_printf("\t\t.data_return_format = 0x%x\n", (*ptr).data_return_format);
+ debug_printf("\t\t.surface_type = 0x%x\n", (*ptr).surface_type);
+}
+
+void
+brw_dump_surf_ss1(const struct brw_surf_ss1 *ptr)
+{
+ debug_printf("\t\t.base_addr = 0x%x\n", (*ptr).base_addr);
+}
+
+void
+brw_dump_surf_ss2(const struct brw_surf_ss2 *ptr)
+{
+ debug_printf("\t\t.mip_count = 0x%x\n", (*ptr).mip_count);
+ debug_printf("\t\t.width = 0x%x\n", (*ptr).width);
+ debug_printf("\t\t.height = 0x%x\n", (*ptr).height);
+}
+
+void
+brw_dump_surf_ss3(const struct brw_surf_ss3 *ptr)
+{
+ debug_printf("\t\t.tile_walk = 0x%x\n", (*ptr).tile_walk);
+ debug_printf("\t\t.tiled_surface = 0x%x\n", (*ptr).tiled_surface);
+ debug_printf("\t\t.pitch = 0x%x\n", (*ptr).pitch);
+ debug_printf("\t\t.depth = 0x%x\n", (*ptr).depth);
+}
+
+void
+brw_dump_surf_ss4(const struct brw_surf_ss4 *ptr)
+{
+ debug_printf("\t\t.multisample_position_palette_index = 0x%x\n", (*ptr).multisample_position_palette_index);
+ debug_printf("\t\t.num_multisamples = 0x%x\n", (*ptr).num_multisamples);
+ debug_printf("\t\t.render_target_view_extent = 0x%x\n", (*ptr).render_target_view_extent);
+ debug_printf("\t\t.min_array_elt = 0x%x\n", (*ptr).min_array_elt);
+ debug_printf("\t\t.min_lod = 0x%x\n", (*ptr).min_lod);
+}
+
+void
+brw_dump_surf_ss5(const struct brw_surf_ss5 *ptr)
+{
+ debug_printf("\t\t.llc_mapping = 0x%x\n", (*ptr).llc_mapping);
+ debug_printf("\t\t.mlc_mapping = 0x%x\n", (*ptr).mlc_mapping);
+ debug_printf("\t\t.gfdt = 0x%x\n", (*ptr).gfdt);
+ debug_printf("\t\t.gfdt_src = 0x%x\n", (*ptr).gfdt_src);
+ debug_printf("\t\t.y_offset = 0x%x\n", (*ptr).y_offset);
+ debug_printf("\t\t.x_offset = 0x%x\n", (*ptr).x_offset);
+}
+
+void
+brw_dump_surface_state(const struct brw_surface_state *ptr)
+{
+ debug_printf("\t\t.ss0.cube_pos_z = 0x%x\n", (*ptr).ss0.cube_pos_z);
+ debug_printf("\t\t.ss0.cube_neg_z = 0x%x\n", (*ptr).ss0.cube_neg_z);
+ debug_printf("\t\t.ss0.cube_pos_y = 0x%x\n", (*ptr).ss0.cube_pos_y);
+ debug_printf("\t\t.ss0.cube_neg_y = 0x%x\n", (*ptr).ss0.cube_neg_y);
+ debug_printf("\t\t.ss0.cube_pos_x = 0x%x\n", (*ptr).ss0.cube_pos_x);
+ debug_printf("\t\t.ss0.cube_neg_x = 0x%x\n", (*ptr).ss0.cube_neg_x);
+ debug_printf("\t\t.ss0.mipmap_layout_mode = 0x%x\n", (*ptr).ss0.mipmap_layout_mode);
+ debug_printf("\t\t.ss0.vert_line_stride_ofs = 0x%x\n", (*ptr).ss0.vert_line_stride_ofs);
+ debug_printf("\t\t.ss0.vert_line_stride = 0x%x\n", (*ptr).ss0.vert_line_stride);
+ debug_printf("\t\t.ss0.color_blend = 0x%x\n", (*ptr).ss0.color_blend);
+ debug_printf("\t\t.ss0.writedisable_blue = 0x%x\n", (*ptr).ss0.writedisable_blue);
+ debug_printf("\t\t.ss0.writedisable_green = 0x%x\n", (*ptr).ss0.writedisable_green);
+ debug_printf("\t\t.ss0.writedisable_red = 0x%x\n", (*ptr).ss0.writedisable_red);
+ debug_printf("\t\t.ss0.writedisable_alpha = 0x%x\n", (*ptr).ss0.writedisable_alpha);
+ debug_printf("\t\t.ss0.surface_format = 0x%x\n", (*ptr).ss0.surface_format);
+ debug_printf("\t\t.ss0.data_return_format = 0x%x\n", (*ptr).ss0.data_return_format);
+ debug_printf("\t\t.ss0.surface_type = 0x%x\n", (*ptr).ss0.surface_type);
+ debug_printf("\t\t.ss1.base_addr = 0x%x\n", (*ptr).ss1.base_addr);
+ debug_printf("\t\t.ss2.mip_count = 0x%x\n", (*ptr).ss2.mip_count);
+ debug_printf("\t\t.ss2.width = 0x%x\n", (*ptr).ss2.width);
+ debug_printf("\t\t.ss2.height = 0x%x\n", (*ptr).ss2.height);
+ debug_printf("\t\t.ss3.tile_walk = 0x%x\n", (*ptr).ss3.tile_walk);
+ debug_printf("\t\t.ss3.tiled_surface = 0x%x\n", (*ptr).ss3.tiled_surface);
+ debug_printf("\t\t.ss3.pitch = 0x%x\n", (*ptr).ss3.pitch);
+ debug_printf("\t\t.ss3.depth = 0x%x\n", (*ptr).ss3.depth);
+ debug_printf("\t\t.ss4.multisample_position_palette_index = 0x%x\n", (*ptr).ss4.multisample_position_palette_index);
+ debug_printf("\t\t.ss4.num_multisamples = 0x%x\n", (*ptr).ss4.num_multisamples);
+ debug_printf("\t\t.ss4.render_target_view_extent = 0x%x\n", (*ptr).ss4.render_target_view_extent);
+ debug_printf("\t\t.ss4.min_array_elt = 0x%x\n", (*ptr).ss4.min_array_elt);
+ debug_printf("\t\t.ss4.min_lod = 0x%x\n", (*ptr).ss4.min_lod);
+ debug_printf("\t\t.ss5.llc_mapping = 0x%x\n", (*ptr).ss5.llc_mapping);
+ debug_printf("\t\t.ss5.mlc_mapping = 0x%x\n", (*ptr).ss5.mlc_mapping);
+ debug_printf("\t\t.ss5.gfdt = 0x%x\n", (*ptr).ss5.gfdt);
+ debug_printf("\t\t.ss5.gfdt_src = 0x%x\n", (*ptr).ss5.gfdt_src);
+ debug_printf("\t\t.ss5.y_offset = 0x%x\n", (*ptr).ss5.y_offset);
+ debug_printf("\t\t.ss5.x_offset = 0x%x\n", (*ptr).ss5.x_offset);
+}
+
+void
+brw_dump_system_instruction_pointer(const struct brw_system_instruction_pointer *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.system_instruction_pointer = 0x%x\n", (*ptr).bits0.system_instruction_pointer);
+}
+
+void
+brw_dump_urb_fence(const struct brw_urb_fence *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.vs_realloc = 0x%x\n", (*ptr).header.vs_realloc);
+ debug_printf("\t\t.header.gs_realloc = 0x%x\n", (*ptr).header.gs_realloc);
+ debug_printf("\t\t.header.clp_realloc = 0x%x\n", (*ptr).header.clp_realloc);
+ debug_printf("\t\t.header.sf_realloc = 0x%x\n", (*ptr).header.sf_realloc);
+ debug_printf("\t\t.header.vfe_realloc = 0x%x\n", (*ptr).header.vfe_realloc);
+ debug_printf("\t\t.header.cs_realloc = 0x%x\n", (*ptr).header.cs_realloc);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.bits0.vs_fence = 0x%x\n", (*ptr).bits0.vs_fence);
+ debug_printf("\t\t.bits0.gs_fence = 0x%x\n", (*ptr).bits0.gs_fence);
+ debug_printf("\t\t.bits0.clp_fence = 0x%x\n", (*ptr).bits0.clp_fence);
+ debug_printf("\t\t.bits1.sf_fence = 0x%x\n", (*ptr).bits1.sf_fence);
+ debug_printf("\t\t.bits1.vf_fence = 0x%x\n", (*ptr).bits1.vf_fence);
+ debug_printf("\t\t.bits1.cs_fence = 0x%x\n", (*ptr).bits1.cs_fence);
+}
+
+void
+brw_dump_urb_immediate(const struct brw_urb_immediate *ptr)
+{
+ debug_printf("\t\t.opcode = 0x%x\n", (*ptr).opcode);
+ debug_printf("\t\t.offset = 0x%x\n", (*ptr).offset);
+ debug_printf("\t\t.swizzle_control = 0x%x\n", (*ptr).swizzle_control);
+ debug_printf("\t\t.allocate = 0x%x\n", (*ptr).allocate);
+ debug_printf("\t\t.used = 0x%x\n", (*ptr).used);
+ debug_printf("\t\t.complete = 0x%x\n", (*ptr).complete);
+ debug_printf("\t\t.response_length = 0x%x\n", (*ptr).response_length);
+ debug_printf("\t\t.msg_length = 0x%x\n", (*ptr).msg_length);
+ debug_printf("\t\t.msg_target = 0x%x\n", (*ptr).msg_target);
+ debug_printf("\t\t.end_of_thread = 0x%x\n", (*ptr).end_of_thread);
+}
+
+void
+brw_dump_vb_array_state(const struct brw_vb_array_state *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.vb[0].vb0.pitch = 0x%x\n", (*ptr).vb[0].vb0.pitch);
+ debug_printf("\t\t.vb[0].vb0.access_type = 0x%x\n", (*ptr).vb[0].vb0.access_type);
+ debug_printf("\t\t.vb[0].vb0.vb_index = 0x%x\n", (*ptr).vb[0].vb0.vb_index);
+ debug_printf("\t\t.vb[0].start_addr = 0x%x\n", (*ptr).vb[0].start_addr);
+ debug_printf("\t\t.vb[0].max_index = 0x%x\n", (*ptr).vb[0].max_index);
+ debug_printf("\t\t.vb[0].instance_data_step_rate = 0x%x\n", (*ptr).vb[0].instance_data_step_rate);
+ debug_printf("\t\t.vb[1].vb0.pitch = 0x%x\n", (*ptr).vb[1].vb0.pitch);
+ debug_printf("\t\t.vb[1].vb0.access_type = 0x%x\n", (*ptr).vb[1].vb0.access_type);
+ debug_printf("\t\t.vb[1].vb0.vb_index = 0x%x\n", (*ptr).vb[1].vb0.vb_index);
+ debug_printf("\t\t.vb[1].start_addr = 0x%x\n", (*ptr).vb[1].start_addr);
+ debug_printf("\t\t.vb[1].max_index = 0x%x\n", (*ptr).vb[1].max_index);
+ debug_printf("\t\t.vb[1].instance_data_step_rate = 0x%x\n", (*ptr).vb[1].instance_data_step_rate);
+ debug_printf("\t\t.vb[2].vb0.pitch = 0x%x\n", (*ptr).vb[2].vb0.pitch);
+ debug_printf("\t\t.vb[2].vb0.access_type = 0x%x\n", (*ptr).vb[2].vb0.access_type);
+ debug_printf("\t\t.vb[2].vb0.vb_index = 0x%x\n", (*ptr).vb[2].vb0.vb_index);
+ debug_printf("\t\t.vb[2].start_addr = 0x%x\n", (*ptr).vb[2].start_addr);
+ debug_printf("\t\t.vb[2].max_index = 0x%x\n", (*ptr).vb[2].max_index);
+ debug_printf("\t\t.vb[2].instance_data_step_rate = 0x%x\n", (*ptr).vb[2].instance_data_step_rate);
+ debug_printf("\t\t.vb[3].vb0.pitch = 0x%x\n", (*ptr).vb[3].vb0.pitch);
+ debug_printf("\t\t.vb[3].vb0.access_type = 0x%x\n", (*ptr).vb[3].vb0.access_type);
+ debug_printf("\t\t.vb[3].vb0.vb_index = 0x%x\n", (*ptr).vb[3].vb0.vb_index);
+ debug_printf("\t\t.vb[3].start_addr = 0x%x\n", (*ptr).vb[3].start_addr);
+ debug_printf("\t\t.vb[3].max_index = 0x%x\n", (*ptr).vb[3].max_index);
+ debug_printf("\t\t.vb[3].instance_data_step_rate = 0x%x\n", (*ptr).vb[3].instance_data_step_rate);
+ debug_printf("\t\t.vb[4].vb0.pitch = 0x%x\n", (*ptr).vb[4].vb0.pitch);
+ debug_printf("\t\t.vb[4].vb0.access_type = 0x%x\n", (*ptr).vb[4].vb0.access_type);
+ debug_printf("\t\t.vb[4].vb0.vb_index = 0x%x\n", (*ptr).vb[4].vb0.vb_index);
+ debug_printf("\t\t.vb[4].start_addr = 0x%x\n", (*ptr).vb[4].start_addr);
+ debug_printf("\t\t.vb[4].max_index = 0x%x\n", (*ptr).vb[4].max_index);
+ debug_printf("\t\t.vb[4].instance_data_step_rate = 0x%x\n", (*ptr).vb[4].instance_data_step_rate);
+ debug_printf("\t\t.vb[5].vb0.pitch = 0x%x\n", (*ptr).vb[5].vb0.pitch);
+ debug_printf("\t\t.vb[5].vb0.access_type = 0x%x\n", (*ptr).vb[5].vb0.access_type);
+ debug_printf("\t\t.vb[5].vb0.vb_index = 0x%x\n", (*ptr).vb[5].vb0.vb_index);
+ debug_printf("\t\t.vb[5].start_addr = 0x%x\n", (*ptr).vb[5].start_addr);
+ debug_printf("\t\t.vb[5].max_index = 0x%x\n", (*ptr).vb[5].max_index);
+ debug_printf("\t\t.vb[5].instance_data_step_rate = 0x%x\n", (*ptr).vb[5].instance_data_step_rate);
+ debug_printf("\t\t.vb[6].vb0.pitch = 0x%x\n", (*ptr).vb[6].vb0.pitch);
+ debug_printf("\t\t.vb[6].vb0.access_type = 0x%x\n", (*ptr).vb[6].vb0.access_type);
+ debug_printf("\t\t.vb[6].vb0.vb_index = 0x%x\n", (*ptr).vb[6].vb0.vb_index);
+ debug_printf("\t\t.vb[6].start_addr = 0x%x\n", (*ptr).vb[6].start_addr);
+ debug_printf("\t\t.vb[6].max_index = 0x%x\n", (*ptr).vb[6].max_index);
+ debug_printf("\t\t.vb[6].instance_data_step_rate = 0x%x\n", (*ptr).vb[6].instance_data_step_rate);
+ debug_printf("\t\t.vb[7].vb0.pitch = 0x%x\n", (*ptr).vb[7].vb0.pitch);
+ debug_printf("\t\t.vb[7].vb0.access_type = 0x%x\n", (*ptr).vb[7].vb0.access_type);
+ debug_printf("\t\t.vb[7].vb0.vb_index = 0x%x\n", (*ptr).vb[7].vb0.vb_index);
+ debug_printf("\t\t.vb[7].start_addr = 0x%x\n", (*ptr).vb[7].start_addr);
+ debug_printf("\t\t.vb[7].max_index = 0x%x\n", (*ptr).vb[7].max_index);
+ debug_printf("\t\t.vb[7].instance_data_step_rate = 0x%x\n", (*ptr).vb[7].instance_data_step_rate);
+ debug_printf("\t\t.vb[8].vb0.pitch = 0x%x\n", (*ptr).vb[8].vb0.pitch);
+ debug_printf("\t\t.vb[8].vb0.access_type = 0x%x\n", (*ptr).vb[8].vb0.access_type);
+ debug_printf("\t\t.vb[8].vb0.vb_index = 0x%x\n", (*ptr).vb[8].vb0.vb_index);
+ debug_printf("\t\t.vb[8].start_addr = 0x%x\n", (*ptr).vb[8].start_addr);
+ debug_printf("\t\t.vb[8].max_index = 0x%x\n", (*ptr).vb[8].max_index);
+ debug_printf("\t\t.vb[8].instance_data_step_rate = 0x%x\n", (*ptr).vb[8].instance_data_step_rate);
+ debug_printf("\t\t.vb[9].vb0.pitch = 0x%x\n", (*ptr).vb[9].vb0.pitch);
+ debug_printf("\t\t.vb[9].vb0.access_type = 0x%x\n", (*ptr).vb[9].vb0.access_type);
+ debug_printf("\t\t.vb[9].vb0.vb_index = 0x%x\n", (*ptr).vb[9].vb0.vb_index);
+ debug_printf("\t\t.vb[9].start_addr = 0x%x\n", (*ptr).vb[9].start_addr);
+ debug_printf("\t\t.vb[9].max_index = 0x%x\n", (*ptr).vb[9].max_index);
+ debug_printf("\t\t.vb[9].instance_data_step_rate = 0x%x\n", (*ptr).vb[9].instance_data_step_rate);
+ debug_printf("\t\t.vb[10].vb0.pitch = 0x%x\n", (*ptr).vb[10].vb0.pitch);
+ debug_printf("\t\t.vb[10].vb0.access_type = 0x%x\n", (*ptr).vb[10].vb0.access_type);
+ debug_printf("\t\t.vb[10].vb0.vb_index = 0x%x\n", (*ptr).vb[10].vb0.vb_index);
+ debug_printf("\t\t.vb[10].start_addr = 0x%x\n", (*ptr).vb[10].start_addr);
+ debug_printf("\t\t.vb[10].max_index = 0x%x\n", (*ptr).vb[10].max_index);
+ debug_printf("\t\t.vb[10].instance_data_step_rate = 0x%x\n", (*ptr).vb[10].instance_data_step_rate);
+ debug_printf("\t\t.vb[11].vb0.pitch = 0x%x\n", (*ptr).vb[11].vb0.pitch);
+ debug_printf("\t\t.vb[11].vb0.access_type = 0x%x\n", (*ptr).vb[11].vb0.access_type);
+ debug_printf("\t\t.vb[11].vb0.vb_index = 0x%x\n", (*ptr).vb[11].vb0.vb_index);
+ debug_printf("\t\t.vb[11].start_addr = 0x%x\n", (*ptr).vb[11].start_addr);
+ debug_printf("\t\t.vb[11].max_index = 0x%x\n", (*ptr).vb[11].max_index);
+ debug_printf("\t\t.vb[11].instance_data_step_rate = 0x%x\n", (*ptr).vb[11].instance_data_step_rate);
+ debug_printf("\t\t.vb[12].vb0.pitch = 0x%x\n", (*ptr).vb[12].vb0.pitch);
+ debug_printf("\t\t.vb[12].vb0.access_type = 0x%x\n", (*ptr).vb[12].vb0.access_type);
+ debug_printf("\t\t.vb[12].vb0.vb_index = 0x%x\n", (*ptr).vb[12].vb0.vb_index);
+ debug_printf("\t\t.vb[12].start_addr = 0x%x\n", (*ptr).vb[12].start_addr);
+ debug_printf("\t\t.vb[12].max_index = 0x%x\n", (*ptr).vb[12].max_index);
+ debug_printf("\t\t.vb[12].instance_data_step_rate = 0x%x\n", (*ptr).vb[12].instance_data_step_rate);
+ debug_printf("\t\t.vb[13].vb0.pitch = 0x%x\n", (*ptr).vb[13].vb0.pitch);
+ debug_printf("\t\t.vb[13].vb0.access_type = 0x%x\n", (*ptr).vb[13].vb0.access_type);
+ debug_printf("\t\t.vb[13].vb0.vb_index = 0x%x\n", (*ptr).vb[13].vb0.vb_index);
+ debug_printf("\t\t.vb[13].start_addr = 0x%x\n", (*ptr).vb[13].start_addr);
+ debug_printf("\t\t.vb[13].max_index = 0x%x\n", (*ptr).vb[13].max_index);
+ debug_printf("\t\t.vb[13].instance_data_step_rate = 0x%x\n", (*ptr).vb[13].instance_data_step_rate);
+ debug_printf("\t\t.vb[14].vb0.pitch = 0x%x\n", (*ptr).vb[14].vb0.pitch);
+ debug_printf("\t\t.vb[14].vb0.access_type = 0x%x\n", (*ptr).vb[14].vb0.access_type);
+ debug_printf("\t\t.vb[14].vb0.vb_index = 0x%x\n", (*ptr).vb[14].vb0.vb_index);
+ debug_printf("\t\t.vb[14].start_addr = 0x%x\n", (*ptr).vb[14].start_addr);
+ debug_printf("\t\t.vb[14].max_index = 0x%x\n", (*ptr).vb[14].max_index);
+ debug_printf("\t\t.vb[14].instance_data_step_rate = 0x%x\n", (*ptr).vb[14].instance_data_step_rate);
+ debug_printf("\t\t.vb[15].vb0.pitch = 0x%x\n", (*ptr).vb[15].vb0.pitch);
+ debug_printf("\t\t.vb[15].vb0.access_type = 0x%x\n", (*ptr).vb[15].vb0.access_type);
+ debug_printf("\t\t.vb[15].vb0.vb_index = 0x%x\n", (*ptr).vb[15].vb0.vb_index);
+ debug_printf("\t\t.vb[15].start_addr = 0x%x\n", (*ptr).vb[15].start_addr);
+ debug_printf("\t\t.vb[15].max_index = 0x%x\n", (*ptr).vb[15].max_index);
+ debug_printf("\t\t.vb[15].instance_data_step_rate = 0x%x\n", (*ptr).vb[15].instance_data_step_rate);
+ debug_printf("\t\t.vb[16].vb0.pitch = 0x%x\n", (*ptr).vb[16].vb0.pitch);
+ debug_printf("\t\t.vb[16].vb0.access_type = 0x%x\n", (*ptr).vb[16].vb0.access_type);
+ debug_printf("\t\t.vb[16].vb0.vb_index = 0x%x\n", (*ptr).vb[16].vb0.vb_index);
+ debug_printf("\t\t.vb[16].start_addr = 0x%x\n", (*ptr).vb[16].start_addr);
+ debug_printf("\t\t.vb[16].max_index = 0x%x\n", (*ptr).vb[16].max_index);
+ debug_printf("\t\t.vb[16].instance_data_step_rate = 0x%x\n", (*ptr).vb[16].instance_data_step_rate);
+}
+
+void
+brw_dump_vertex_buffer_state(const struct brw_vertex_buffer_state *ptr)
+{
+ debug_printf("\t\t.vb0.pitch = 0x%x\n", (*ptr).vb0.pitch);
+ debug_printf("\t\t.vb0.access_type = 0x%x\n", (*ptr).vb0.access_type);
+ debug_printf("\t\t.vb0.vb_index = 0x%x\n", (*ptr).vb0.vb_index);
+ debug_printf("\t\t.start_addr = 0x%x\n", (*ptr).start_addr);
+ debug_printf("\t\t.max_index = 0x%x\n", (*ptr).max_index);
+ debug_printf("\t\t.instance_data_step_rate = 0x%x\n", (*ptr).instance_data_step_rate);
+}
+
+void
+brw_dump_vertex_element_packet(const struct brw_vertex_element_packet *ptr)
+{
+ debug_printf("\t\t.header.length = 0x%x\n", (*ptr).header.length);
+ debug_printf("\t\t.header.opcode = 0x%x\n", (*ptr).header.opcode);
+ debug_printf("\t\t.ve[0].ve0.src_offset = 0x%x\n", (*ptr).ve[0].ve0.src_offset);
+ debug_printf("\t\t.ve[0].ve0.src_format = 0x%x\n", (*ptr).ve[0].ve0.src_format);
+ debug_printf("\t\t.ve[0].ve0.valid = 0x%x\n", (*ptr).ve[0].ve0.valid);
+ debug_printf("\t\t.ve[0].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[0].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[0].ve1.dst_offset = 0x%x\n", (*ptr).ve[0].ve1.dst_offset);
+ debug_printf("\t\t.ve[0].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[0].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[0].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[0].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[0].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[0].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[0].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[0].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[1].ve0.src_offset = 0x%x\n", (*ptr).ve[1].ve0.src_offset);
+ debug_printf("\t\t.ve[1].ve0.src_format = 0x%x\n", (*ptr).ve[1].ve0.src_format);
+ debug_printf("\t\t.ve[1].ve0.valid = 0x%x\n", (*ptr).ve[1].ve0.valid);
+ debug_printf("\t\t.ve[1].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[1].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[1].ve1.dst_offset = 0x%x\n", (*ptr).ve[1].ve1.dst_offset);
+ debug_printf("\t\t.ve[1].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[1].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[1].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[1].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[1].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[1].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[1].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[1].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[2].ve0.src_offset = 0x%x\n", (*ptr).ve[2].ve0.src_offset);
+ debug_printf("\t\t.ve[2].ve0.src_format = 0x%x\n", (*ptr).ve[2].ve0.src_format);
+ debug_printf("\t\t.ve[2].ve0.valid = 0x%x\n", (*ptr).ve[2].ve0.valid);
+ debug_printf("\t\t.ve[2].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[2].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[2].ve1.dst_offset = 0x%x\n", (*ptr).ve[2].ve1.dst_offset);
+ debug_printf("\t\t.ve[2].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[2].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[2].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[2].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[2].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[2].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[2].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[2].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[3].ve0.src_offset = 0x%x\n", (*ptr).ve[3].ve0.src_offset);
+ debug_printf("\t\t.ve[3].ve0.src_format = 0x%x\n", (*ptr).ve[3].ve0.src_format);
+ debug_printf("\t\t.ve[3].ve0.valid = 0x%x\n", (*ptr).ve[3].ve0.valid);
+ debug_printf("\t\t.ve[3].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[3].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[3].ve1.dst_offset = 0x%x\n", (*ptr).ve[3].ve1.dst_offset);
+ debug_printf("\t\t.ve[3].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[3].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[3].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[3].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[3].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[3].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[3].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[3].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[4].ve0.src_offset = 0x%x\n", (*ptr).ve[4].ve0.src_offset);
+ debug_printf("\t\t.ve[4].ve0.src_format = 0x%x\n", (*ptr).ve[4].ve0.src_format);
+ debug_printf("\t\t.ve[4].ve0.valid = 0x%x\n", (*ptr).ve[4].ve0.valid);
+ debug_printf("\t\t.ve[4].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[4].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[4].ve1.dst_offset = 0x%x\n", (*ptr).ve[4].ve1.dst_offset);
+ debug_printf("\t\t.ve[4].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[4].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[4].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[4].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[4].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[4].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[4].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[4].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[5].ve0.src_offset = 0x%x\n", (*ptr).ve[5].ve0.src_offset);
+ debug_printf("\t\t.ve[5].ve0.src_format = 0x%x\n", (*ptr).ve[5].ve0.src_format);
+ debug_printf("\t\t.ve[5].ve0.valid = 0x%x\n", (*ptr).ve[5].ve0.valid);
+ debug_printf("\t\t.ve[5].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[5].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[5].ve1.dst_offset = 0x%x\n", (*ptr).ve[5].ve1.dst_offset);
+ debug_printf("\t\t.ve[5].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[5].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[5].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[5].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[5].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[5].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[5].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[5].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[6].ve0.src_offset = 0x%x\n", (*ptr).ve[6].ve0.src_offset);
+ debug_printf("\t\t.ve[6].ve0.src_format = 0x%x\n", (*ptr).ve[6].ve0.src_format);
+ debug_printf("\t\t.ve[6].ve0.valid = 0x%x\n", (*ptr).ve[6].ve0.valid);
+ debug_printf("\t\t.ve[6].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[6].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[6].ve1.dst_offset = 0x%x\n", (*ptr).ve[6].ve1.dst_offset);
+ debug_printf("\t\t.ve[6].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[6].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[6].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[6].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[6].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[6].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[6].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[6].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[7].ve0.src_offset = 0x%x\n", (*ptr).ve[7].ve0.src_offset);
+ debug_printf("\t\t.ve[7].ve0.src_format = 0x%x\n", (*ptr).ve[7].ve0.src_format);
+ debug_printf("\t\t.ve[7].ve0.valid = 0x%x\n", (*ptr).ve[7].ve0.valid);
+ debug_printf("\t\t.ve[7].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[7].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[7].ve1.dst_offset = 0x%x\n", (*ptr).ve[7].ve1.dst_offset);
+ debug_printf("\t\t.ve[7].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[7].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[7].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[7].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[7].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[7].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[7].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[7].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[8].ve0.src_offset = 0x%x\n", (*ptr).ve[8].ve0.src_offset);
+ debug_printf("\t\t.ve[8].ve0.src_format = 0x%x\n", (*ptr).ve[8].ve0.src_format);
+ debug_printf("\t\t.ve[8].ve0.valid = 0x%x\n", (*ptr).ve[8].ve0.valid);
+ debug_printf("\t\t.ve[8].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[8].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[8].ve1.dst_offset = 0x%x\n", (*ptr).ve[8].ve1.dst_offset);
+ debug_printf("\t\t.ve[8].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[8].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[8].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[8].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[8].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[8].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[8].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[8].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[9].ve0.src_offset = 0x%x\n", (*ptr).ve[9].ve0.src_offset);
+ debug_printf("\t\t.ve[9].ve0.src_format = 0x%x\n", (*ptr).ve[9].ve0.src_format);
+ debug_printf("\t\t.ve[9].ve0.valid = 0x%x\n", (*ptr).ve[9].ve0.valid);
+ debug_printf("\t\t.ve[9].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[9].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[9].ve1.dst_offset = 0x%x\n", (*ptr).ve[9].ve1.dst_offset);
+ debug_printf("\t\t.ve[9].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[9].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[9].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[9].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[9].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[9].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[9].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[9].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[10].ve0.src_offset = 0x%x\n", (*ptr).ve[10].ve0.src_offset);
+ debug_printf("\t\t.ve[10].ve0.src_format = 0x%x\n", (*ptr).ve[10].ve0.src_format);
+ debug_printf("\t\t.ve[10].ve0.valid = 0x%x\n", (*ptr).ve[10].ve0.valid);
+ debug_printf("\t\t.ve[10].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[10].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[10].ve1.dst_offset = 0x%x\n", (*ptr).ve[10].ve1.dst_offset);
+ debug_printf("\t\t.ve[10].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[10].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[10].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[10].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[10].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[10].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[10].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[10].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[11].ve0.src_offset = 0x%x\n", (*ptr).ve[11].ve0.src_offset);
+ debug_printf("\t\t.ve[11].ve0.src_format = 0x%x\n", (*ptr).ve[11].ve0.src_format);
+ debug_printf("\t\t.ve[11].ve0.valid = 0x%x\n", (*ptr).ve[11].ve0.valid);
+ debug_printf("\t\t.ve[11].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[11].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[11].ve1.dst_offset = 0x%x\n", (*ptr).ve[11].ve1.dst_offset);
+ debug_printf("\t\t.ve[11].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[11].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[11].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[11].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[11].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[11].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[11].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[11].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[12].ve0.src_offset = 0x%x\n", (*ptr).ve[12].ve0.src_offset);
+ debug_printf("\t\t.ve[12].ve0.src_format = 0x%x\n", (*ptr).ve[12].ve0.src_format);
+ debug_printf("\t\t.ve[12].ve0.valid = 0x%x\n", (*ptr).ve[12].ve0.valid);
+ debug_printf("\t\t.ve[12].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[12].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[12].ve1.dst_offset = 0x%x\n", (*ptr).ve[12].ve1.dst_offset);
+ debug_printf("\t\t.ve[12].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[12].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[12].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[12].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[12].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[12].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[12].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[12].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[13].ve0.src_offset = 0x%x\n", (*ptr).ve[13].ve0.src_offset);
+ debug_printf("\t\t.ve[13].ve0.src_format = 0x%x\n", (*ptr).ve[13].ve0.src_format);
+ debug_printf("\t\t.ve[13].ve0.valid = 0x%x\n", (*ptr).ve[13].ve0.valid);
+ debug_printf("\t\t.ve[13].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[13].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[13].ve1.dst_offset = 0x%x\n", (*ptr).ve[13].ve1.dst_offset);
+ debug_printf("\t\t.ve[13].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[13].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[13].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[13].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[13].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[13].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[13].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[13].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[14].ve0.src_offset = 0x%x\n", (*ptr).ve[14].ve0.src_offset);
+ debug_printf("\t\t.ve[14].ve0.src_format = 0x%x\n", (*ptr).ve[14].ve0.src_format);
+ debug_printf("\t\t.ve[14].ve0.valid = 0x%x\n", (*ptr).ve[14].ve0.valid);
+ debug_printf("\t\t.ve[14].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[14].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[14].ve1.dst_offset = 0x%x\n", (*ptr).ve[14].ve1.dst_offset);
+ debug_printf("\t\t.ve[14].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[14].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[14].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[14].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[14].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[14].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[14].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[14].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[15].ve0.src_offset = 0x%x\n", (*ptr).ve[15].ve0.src_offset);
+ debug_printf("\t\t.ve[15].ve0.src_format = 0x%x\n", (*ptr).ve[15].ve0.src_format);
+ debug_printf("\t\t.ve[15].ve0.valid = 0x%x\n", (*ptr).ve[15].ve0.valid);
+ debug_printf("\t\t.ve[15].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[15].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[15].ve1.dst_offset = 0x%x\n", (*ptr).ve[15].ve1.dst_offset);
+ debug_printf("\t\t.ve[15].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[15].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[15].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[15].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[15].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[15].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[15].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[15].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[16].ve0.src_offset = 0x%x\n", (*ptr).ve[16].ve0.src_offset);
+ debug_printf("\t\t.ve[16].ve0.src_format = 0x%x\n", (*ptr).ve[16].ve0.src_format);
+ debug_printf("\t\t.ve[16].ve0.valid = 0x%x\n", (*ptr).ve[16].ve0.valid);
+ debug_printf("\t\t.ve[16].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[16].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[16].ve1.dst_offset = 0x%x\n", (*ptr).ve[16].ve1.dst_offset);
+ debug_printf("\t\t.ve[16].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[16].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[16].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[16].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[16].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[16].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[16].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[16].ve1.vfcomponent0);
+ debug_printf("\t\t.ve[17].ve0.src_offset = 0x%x\n", (*ptr).ve[17].ve0.src_offset);
+ debug_printf("\t\t.ve[17].ve0.src_format = 0x%x\n", (*ptr).ve[17].ve0.src_format);
+ debug_printf("\t\t.ve[17].ve0.valid = 0x%x\n", (*ptr).ve[17].ve0.valid);
+ debug_printf("\t\t.ve[17].ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve[17].ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve[17].ve1.dst_offset = 0x%x\n", (*ptr).ve[17].ve1.dst_offset);
+ debug_printf("\t\t.ve[17].ve1.vfcomponent3 = 0x%x\n", (*ptr).ve[17].ve1.vfcomponent3);
+ debug_printf("\t\t.ve[17].ve1.vfcomponent2 = 0x%x\n", (*ptr).ve[17].ve1.vfcomponent2);
+ debug_printf("\t\t.ve[17].ve1.vfcomponent1 = 0x%x\n", (*ptr).ve[17].ve1.vfcomponent1);
+ debug_printf("\t\t.ve[17].ve1.vfcomponent0 = 0x%x\n", (*ptr).ve[17].ve1.vfcomponent0);
+}
+
+void
+brw_dump_vertex_element_state(const struct brw_vertex_element_state *ptr)
+{
+ debug_printf("\t\t.ve0.src_offset = 0x%x\n", (*ptr).ve0.src_offset);
+ debug_printf("\t\t.ve0.src_format = 0x%x\n", (*ptr).ve0.src_format);
+ debug_printf("\t\t.ve0.valid = 0x%x\n", (*ptr).ve0.valid);
+ debug_printf("\t\t.ve0.vertex_buffer_index = 0x%x\n", (*ptr).ve0.vertex_buffer_index);
+ debug_printf("\t\t.ve1.dst_offset = 0x%x\n", (*ptr).ve1.dst_offset);
+ debug_printf("\t\t.ve1.vfcomponent3 = 0x%x\n", (*ptr).ve1.vfcomponent3);
+ debug_printf("\t\t.ve1.vfcomponent2 = 0x%x\n", (*ptr).ve1.vfcomponent2);
+ debug_printf("\t\t.ve1.vfcomponent1 = 0x%x\n", (*ptr).ve1.vfcomponent1);
+ debug_printf("\t\t.ve1.vfcomponent0 = 0x%x\n", (*ptr).ve1.vfcomponent0);
+}
+
+void
+brw_dump_vf_statistics(const struct brw_vf_statistics *ptr)
+{
+ debug_printf("\t\t.statistics_enable = 0x%x\n", (*ptr).statistics_enable);
+ debug_printf("\t\t.opcode = 0x%x\n", (*ptr).opcode);
+}
+
+void
+brw_dump_vs_unit_state(const struct brw_vs_unit_state *ptr)
+{
+ debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", (*ptr).thread0.grf_reg_count);
+ debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", (*ptr).thread0.kernel_start_pointer);
+ debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", (*ptr).thread1.ext_halt_exception_enable);
+ debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", (*ptr).thread1.sw_exception_enable);
+ debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", (*ptr).thread1.mask_stack_exception_enable);
+ debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", (*ptr).thread1.timeout_exception_enable);
+ debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", (*ptr).thread1.illegal_op_exception_enable);
+ debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", (*ptr).thread1.depth_coef_urb_read_offset);
+ debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", (*ptr).thread1.floating_point_mode);
+ debug_printf("\t\t.thread1.thread_priority = 0x%x\n", (*ptr).thread1.thread_priority);
+ debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", (*ptr).thread1.binding_table_entry_count);
+ debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", (*ptr).thread1.single_program_flow);
+ debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", (*ptr).thread2.per_thread_scratch_space);
+ debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", (*ptr).thread2.scratch_space_base_pointer);
+ debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", (*ptr).thread3.dispatch_grf_start_reg);
+ debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", (*ptr).thread3.urb_entry_read_offset);
+ debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", (*ptr).thread3.urb_entry_read_length);
+ debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", (*ptr).thread3.const_urb_entry_read_offset);
+ debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", (*ptr).thread3.const_urb_entry_read_length);
+ debug_printf("\t\t.thread4.stats_enable = 0x%x\n", (*ptr).thread4.stats_enable);
+ debug_printf("\t\t.thread4.nr_urb_entries = 0x%x\n", (*ptr).thread4.nr_urb_entries);
+ debug_printf("\t\t.thread4.urb_entry_allocation_size = 0x%x\n", (*ptr).thread4.urb_entry_allocation_size);
+ debug_printf("\t\t.thread4.max_threads = 0x%x\n", (*ptr).thread4.max_threads);
+ debug_printf("\t\t.vs5.sampler_count = 0x%x\n", (*ptr).vs5.sampler_count);
+ debug_printf("\t\t.vs5.sampler_state_pointer = 0x%x\n", (*ptr).vs5.sampler_state_pointer);
+ debug_printf("\t\t.vs6.vs_enable = 0x%x\n", (*ptr).vs6.vs_enable);
+ debug_printf("\t\t.vs6.vert_cache_disable = 0x%x\n", (*ptr).vs6.vert_cache_disable);
+}
+
+void
+brw_dump_wm_unit_state(const struct brw_wm_unit_state *ptr)
+{
+ debug_printf("\t\t.thread0.grf_reg_count = 0x%x\n", (*ptr).thread0.grf_reg_count);
+ debug_printf("\t\t.thread0.kernel_start_pointer = 0x%x\n", (*ptr).thread0.kernel_start_pointer);
+ debug_printf("\t\t.thread1.ext_halt_exception_enable = 0x%x\n", (*ptr).thread1.ext_halt_exception_enable);
+ debug_printf("\t\t.thread1.sw_exception_enable = 0x%x\n", (*ptr).thread1.sw_exception_enable);
+ debug_printf("\t\t.thread1.mask_stack_exception_enable = 0x%x\n", (*ptr).thread1.mask_stack_exception_enable);
+ debug_printf("\t\t.thread1.timeout_exception_enable = 0x%x\n", (*ptr).thread1.timeout_exception_enable);
+ debug_printf("\t\t.thread1.illegal_op_exception_enable = 0x%x\n", (*ptr).thread1.illegal_op_exception_enable);
+ debug_printf("\t\t.thread1.depth_coef_urb_read_offset = 0x%x\n", (*ptr).thread1.depth_coef_urb_read_offset);
+ debug_printf("\t\t.thread1.floating_point_mode = 0x%x\n", (*ptr).thread1.floating_point_mode);
+ debug_printf("\t\t.thread1.thread_priority = 0x%x\n", (*ptr).thread1.thread_priority);
+ debug_printf("\t\t.thread1.binding_table_entry_count = 0x%x\n", (*ptr).thread1.binding_table_entry_count);
+ debug_printf("\t\t.thread1.single_program_flow = 0x%x\n", (*ptr).thread1.single_program_flow);
+ debug_printf("\t\t.thread2.per_thread_scratch_space = 0x%x\n", (*ptr).thread2.per_thread_scratch_space);
+ debug_printf("\t\t.thread2.scratch_space_base_pointer = 0x%x\n", (*ptr).thread2.scratch_space_base_pointer);
+ debug_printf("\t\t.thread3.dispatch_grf_start_reg = 0x%x\n", (*ptr).thread3.dispatch_grf_start_reg);
+ debug_printf("\t\t.thread3.urb_entry_read_offset = 0x%x\n", (*ptr).thread3.urb_entry_read_offset);
+ debug_printf("\t\t.thread3.urb_entry_read_length = 0x%x\n", (*ptr).thread3.urb_entry_read_length);
+ debug_printf("\t\t.thread3.const_urb_entry_read_offset = 0x%x\n", (*ptr).thread3.const_urb_entry_read_offset);
+ debug_printf("\t\t.thread3.const_urb_entry_read_length = 0x%x\n", (*ptr).thread3.const_urb_entry_read_length);
+ debug_printf("\t\t.wm4.stats_enable = 0x%x\n", (*ptr).wm4.stats_enable);
+ debug_printf("\t\t.wm4.depth_buffer_clear = 0x%x\n", (*ptr).wm4.depth_buffer_clear);
+ debug_printf("\t\t.wm4.sampler_count = 0x%x\n", (*ptr).wm4.sampler_count);
+ debug_printf("\t\t.wm4.sampler_state_pointer = 0x%x\n", (*ptr).wm4.sampler_state_pointer);
+ debug_printf("\t\t.wm5.enable_8_pix = 0x%x\n", (*ptr).wm5.enable_8_pix);
+ debug_printf("\t\t.wm5.enable_16_pix = 0x%x\n", (*ptr).wm5.enable_16_pix);
+ debug_printf("\t\t.wm5.enable_32_pix = 0x%x\n", (*ptr).wm5.enable_32_pix);
+ debug_printf("\t\t.wm5.enable_con_32_pix = 0x%x\n", (*ptr).wm5.enable_con_32_pix);
+ debug_printf("\t\t.wm5.enable_con_64_pix = 0x%x\n", (*ptr).wm5.enable_con_64_pix);
+ debug_printf("\t\t.wm5.legacy_global_depth_bias = 0x%x\n", (*ptr).wm5.legacy_global_depth_bias);
+ debug_printf("\t\t.wm5.line_stipple = 0x%x\n", (*ptr).wm5.line_stipple);
+ debug_printf("\t\t.wm5.depth_offset = 0x%x\n", (*ptr).wm5.depth_offset);
+ debug_printf("\t\t.wm5.polygon_stipple = 0x%x\n", (*ptr).wm5.polygon_stipple);
+ debug_printf("\t\t.wm5.line_aa_region_width = 0x%x\n", (*ptr).wm5.line_aa_region_width);
+ debug_printf("\t\t.wm5.line_endcap_aa_region_width = 0x%x\n", (*ptr).wm5.line_endcap_aa_region_width);
+ debug_printf("\t\t.wm5.early_depth_test = 0x%x\n", (*ptr).wm5.early_depth_test);
+ debug_printf("\t\t.wm5.thread_dispatch_enable = 0x%x\n", (*ptr).wm5.thread_dispatch_enable);
+ debug_printf("\t\t.wm5.program_uses_depth = 0x%x\n", (*ptr).wm5.program_uses_depth);
+ debug_printf("\t\t.wm5.program_computes_depth = 0x%x\n", (*ptr).wm5.program_computes_depth);
+ debug_printf("\t\t.wm5.program_uses_killpixel = 0x%x\n", (*ptr).wm5.program_uses_killpixel);
+ debug_printf("\t\t.wm5.legacy_line_rast = 0x%x\n", (*ptr).wm5.legacy_line_rast);
+ debug_printf("\t\t.wm5.transposed_urb_read_enable = 0x%x\n", (*ptr).wm5.transposed_urb_read_enable);
+ debug_printf("\t\t.wm5.max_threads = 0x%x\n", (*ptr).wm5.max_threads);
+ debug_printf("\t\t.global_depth_offset_constant = %f\n", (*ptr).global_depth_offset_constant);
+ debug_printf("\t\t.global_depth_offset_scale = %f\n", (*ptr).global_depth_offset_scale);
+ debug_printf("\t\t.wm8.grf_reg_count_1 = 0x%x\n", (*ptr).wm8.grf_reg_count_1);
+ debug_printf("\t\t.wm8.kernel_start_pointer_1 = 0x%x\n", (*ptr).wm8.kernel_start_pointer_1);
+ debug_printf("\t\t.wm9.grf_reg_count_2 = 0x%x\n", (*ptr).wm9.grf_reg_count_2);
+ debug_printf("\t\t.wm9.kernel_start_pointer_2 = 0x%x\n", (*ptr).wm9.kernel_start_pointer_2);
+ debug_printf("\t\t.wm10.grf_reg_count_3 = 0x%x\n", (*ptr).wm10.grf_reg_count_3);
+ debug_printf("\t\t.wm10.kernel_start_pointer_3 = 0x%x\n", (*ptr).wm10.kernel_start_pointer_3);
+}
+
diff --git a/src/gallium/drivers/i965/brw_structs_dump.h b/src/gallium/drivers/i965/brw_structs_dump.h
new file mode 100644
index 00000000000..7c02dbfe33f
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.h
@@ -0,0 +1,276 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Dump i965 data structures.
+ *
+ * Generated automatically from brw_structs.h by brw_structs_dump.py.
+ */
+
+#ifndef BRW_STRUCTS_DUMP_H
+#define BRW_STRUCTS_DUMP_H
+
+struct brw_3d_control;
+struct brw_3d_primitive;
+struct brw_aa_line_parameters;
+struct brw_binding_table_pointers;
+struct brw_blend_constant_color;
+struct brw_cc0;
+struct brw_cc1;
+struct brw_cc2;
+struct brw_cc3;
+struct brw_cc4;
+struct brw_cc5;
+struct brw_cc6;
+struct brw_cc7;
+struct brw_cc_unit_state;
+struct brw_cc_viewport;
+struct brw_clip_unit_state;
+struct brw_clipper_viewport;
+struct brw_constant_buffer;
+struct brw_cs_urb_state;
+struct brw_depthbuffer;
+struct brw_depthbuffer_g4x;
+struct brw_drawrect;
+struct brw_global_depth_offset_clamp;
+struct brw_gs_unit_state;
+struct brw_indexbuffer;
+struct brw_line_stipple;
+struct brw_mi_flush;
+struct brw_pipe_control;
+struct brw_pipeline_select;
+struct brw_pipelined_state_pointers;
+struct brw_polygon_stipple;
+struct brw_polygon_stipple_offset;
+struct brw_sampler_default_color;
+struct brw_sampler_state;
+struct brw_sf_unit_state;
+struct brw_sf_viewport;
+struct brw_ss0;
+struct brw_ss1;
+struct brw_ss2;
+struct brw_ss3;
+struct brw_state_base_address;
+struct brw_state_prefetch;
+struct brw_surf_ss0;
+struct brw_surf_ss1;
+struct brw_surf_ss2;
+struct brw_surf_ss3;
+struct brw_surf_ss4;
+struct brw_surf_ss5;
+struct brw_surface_state;
+struct brw_system_instruction_pointer;
+struct brw_urb_fence;
+struct brw_urb_immediate;
+struct brw_vb_array_state;
+struct brw_vertex_buffer_state;
+struct brw_vertex_element_packet;
+struct brw_vertex_element_state;
+struct brw_vf_statistics;
+struct brw_vs_unit_state;
+struct brw_wm_unit_state;
+
+void
+brw_dump_3d_control(const struct brw_3d_control *ptr);
+
+void
+brw_dump_3d_primitive(const struct brw_3d_primitive *ptr);
+
+void
+brw_dump_aa_line_parameters(const struct brw_aa_line_parameters *ptr);
+
+void
+brw_dump_binding_table_pointers(const struct brw_binding_table_pointers *ptr);
+
+void
+brw_dump_blend_constant_color(const struct brw_blend_constant_color *ptr);
+
+void
+brw_dump_cc0(const struct brw_cc0 *ptr);
+
+void
+brw_dump_cc1(const struct brw_cc1 *ptr);
+
+void
+brw_dump_cc2(const struct brw_cc2 *ptr);
+
+void
+brw_dump_cc3(const struct brw_cc3 *ptr);
+
+void
+brw_dump_cc4(const struct brw_cc4 *ptr);
+
+void
+brw_dump_cc5(const struct brw_cc5 *ptr);
+
+void
+brw_dump_cc6(const struct brw_cc6 *ptr);
+
+void
+brw_dump_cc7(const struct brw_cc7 *ptr);
+
+void
+brw_dump_cc_unit_state(const struct brw_cc_unit_state *ptr);
+
+void
+brw_dump_cc_viewport(const struct brw_cc_viewport *ptr);
+
+void
+brw_dump_clip_unit_state(const struct brw_clip_unit_state *ptr);
+
+void
+brw_dump_clipper_viewport(const struct brw_clipper_viewport *ptr);
+
+void
+brw_dump_constant_buffer(const struct brw_constant_buffer *ptr);
+
+void
+brw_dump_cs_urb_state(const struct brw_cs_urb_state *ptr);
+
+void
+brw_dump_depthbuffer(const struct brw_depthbuffer *ptr);
+
+void
+brw_dump_depthbuffer_g4x(const struct brw_depthbuffer_g4x *ptr);
+
+void
+brw_dump_drawrect(const struct brw_drawrect *ptr);
+
+void
+brw_dump_global_depth_offset_clamp(const struct brw_global_depth_offset_clamp *ptr);
+
+void
+brw_dump_gs_unit_state(const struct brw_gs_unit_state *ptr);
+
+void
+brw_dump_indexbuffer(const struct brw_indexbuffer *ptr);
+
+void
+brw_dump_line_stipple(const struct brw_line_stipple *ptr);
+
+void
+brw_dump_mi_flush(const struct brw_mi_flush *ptr);
+
+void
+brw_dump_pipe_control(const struct brw_pipe_control *ptr);
+
+void
+brw_dump_pipeline_select(const struct brw_pipeline_select *ptr);
+
+void
+brw_dump_pipelined_state_pointers(const struct brw_pipelined_state_pointers *ptr);
+
+void
+brw_dump_polygon_stipple(const struct brw_polygon_stipple *ptr);
+
+void
+brw_dump_polygon_stipple_offset(const struct brw_polygon_stipple_offset *ptr);
+
+void
+brw_dump_sampler_default_color(const struct brw_sampler_default_color *ptr);
+
+void
+brw_dump_sampler_state(const struct brw_sampler_state *ptr);
+
+void
+brw_dump_sf_unit_state(const struct brw_sf_unit_state *ptr);
+
+void
+brw_dump_sf_viewport(const struct brw_sf_viewport *ptr);
+
+void
+brw_dump_ss0(const struct brw_ss0 *ptr);
+
+void
+brw_dump_ss1(const struct brw_ss1 *ptr);
+
+void
+brw_dump_ss2(const struct brw_ss2 *ptr);
+
+void
+brw_dump_ss3(const struct brw_ss3 *ptr);
+
+void
+brw_dump_state_base_address(const struct brw_state_base_address *ptr);
+
+void
+brw_dump_state_prefetch(const struct brw_state_prefetch *ptr);
+
+void
+brw_dump_surf_ss0(const struct brw_surf_ss0 *ptr);
+
+void
+brw_dump_surf_ss1(const struct brw_surf_ss1 *ptr);
+
+void
+brw_dump_surf_ss2(const struct brw_surf_ss2 *ptr);
+
+void
+brw_dump_surf_ss3(const struct brw_surf_ss3 *ptr);
+
+void
+brw_dump_surf_ss4(const struct brw_surf_ss4 *ptr);
+
+void
+brw_dump_surf_ss5(const struct brw_surf_ss5 *ptr);
+
+void
+brw_dump_surface_state(const struct brw_surface_state *ptr);
+
+void
+brw_dump_system_instruction_pointer(const struct brw_system_instruction_pointer *ptr);
+
+void
+brw_dump_urb_fence(const struct brw_urb_fence *ptr);
+
+void
+brw_dump_urb_immediate(const struct brw_urb_immediate *ptr);
+
+void
+brw_dump_vb_array_state(const struct brw_vb_array_state *ptr);
+
+void
+brw_dump_vertex_buffer_state(const struct brw_vertex_buffer_state *ptr);
+
+void
+brw_dump_vertex_element_packet(const struct brw_vertex_element_packet *ptr);
+
+void
+brw_dump_vertex_element_state(const struct brw_vertex_element_state *ptr);
+
+void
+brw_dump_vf_statistics(const struct brw_vf_statistics *ptr);
+
+void
+brw_dump_vs_unit_state(const struct brw_vs_unit_state *ptr);
+
+void
+brw_dump_wm_unit_state(const struct brw_wm_unit_state *ptr);
+
+
+#endif /* BRW_STRUCTS_DUMP_H */
diff --git a/src/gallium/drivers/i965/brw_structs_dump.py b/src/gallium/drivers/i965/brw_structs_dump.py
new file mode 100755
index 00000000000..6dba49ad911
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_structs_dump.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python
+'''
+Generates dumpers for the i965 state structures using pygccxml.
+
+Run as
+
+ PYTHONPATH=/path/to/pygccxml-1.0.0 python brw_structs_dump.py
+
+Jose Fonseca <jfonseca@vmware.com>
+'''
+
+copyright = '''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+ '''
+
+import os
+import sys
+import re
+
+from pygccxml import parser
+from pygccxml import declarations
+
+from pygccxml.declarations import algorithm
+from pygccxml.declarations import decl_visitor
+from pygccxml.declarations import type_traits
+from pygccxml.declarations import type_visitor
+
+
+enums = True
+
+
+def vars_filter(variable):
+ name = variable.name
+ return not re.match('^pad\d*', name) and name != 'dword'
+
+
+class decl_dumper_t(decl_visitor.decl_visitor_t):
+
+ def __init__(self, stream, instance = '', decl = None):
+ decl_visitor.decl_visitor_t.__init__(self)
+ self.stream = stream
+ self._instance = instance
+ self.decl = decl
+
+ def clone(self):
+ return decl_dumper_t(self.stream, self._instance, self.decl)
+
+ def visit_class(self):
+ class_ = self.decl
+ assert self.decl.class_type in ('struct', 'union')
+
+ for variable in class_.variables(recursive = False):
+ if vars_filter(variable):
+ dump_type(self.stream, self._instance + '.' + variable.name, variable.type)
+
+ def visit_enumeration(self):
+ if enums:
+ self.stream.write(' switch(%s) {\n' % ("(*ptr)" + self._instance,))
+ for name, value in self.decl.values:
+ self.stream.write(' case %s:\n' % (name,))
+ self.stream.write(' debug_printf("\\t\\t%s = %s\\n");\n' % (self._instance, name))
+ self.stream.write(' break;\n')
+ self.stream.write(' default:\n')
+ self.stream.write(' debug_printf("\\t\\t%s = %%i\\n", %s);\n' % (self._instance, "(*ptr)" + self._instance))
+ self.stream.write(' break;\n')
+ self.stream.write(' }\n')
+ else:
+ self.stream.write(' debug_printf("\\t\\t%s = %%i\\n", %s);\n' % (self._instance, "(*ptr)" + self._instance))
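+
+ # Illustrative only (the field name '.foo' and enumerator BAR are made up):
+ # for an enum member this emits C along the lines of
+ #
+ #    switch((*ptr).foo) {
+ #    case BAR:
+ #       debug_printf("\t\t.foo = BAR\n");
+ #       break;
+ #    default:
+ #       debug_printf("\t\t.foo = %i\n", (*ptr).foo);
+ #       break;
+ #    }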
+
+
+def dump_decl(stream, instance, decl):
+ dumper = decl_dumper_t(stream, instance, decl)
+ algorithm.apply_visitor(dumper, decl)
+
+
+class type_dumper_t(type_visitor.type_visitor_t):
+
+ def __init__(self, stream, instance, type_):
+ type_visitor.type_visitor_t.__init__(self)
+ self.stream = stream
+ self.instance = instance
+ self.type = type_
+
+ def clone(self):
+ return type_dumper_t(self.stream, self.instance, self.type)
+
+ def visit_bool(self):
+ self.print_instance('%i')
+
+ def visit_char(self):
+ #self.print_instance('%i')
+ self.print_instance('0x%x')
+
+ def visit_unsigned_char(self):
+ #self.print_instance('%u')
+ self.print_instance('0x%x')
+
+ def visit_signed_char(self):
+ #self.print_instance('%i')
+ self.print_instance('0x%x')
+
+ def visit_wchar(self):
+ self.print_instance('0x%x')
+
+ def visit_short_int(self):
+ #self.print_instance('%i')
+ self.print_instance('0x%x')
+
+ def visit_short_unsigned_int(self):
+ #self.print_instance('%u')
+ self.print_instance('0x%x')
+
+ def visit_int(self):
+ #self.print_instance('%i')
+ self.print_instance('0x%x')
+
+ def visit_unsigned_int(self):
+ #self.print_instance('%u')
+ self.print_instance('0x%x')
+
+ def visit_long_int(self):
+ #self.print_instance('%li')
+ self.print_instance('0x%lx')
+
+ def visit_long_unsigned_int(self):
+ #self.print_instance('%lu')
+ self.print_instance('0x%lx')
+
+ def visit_long_long_int(self):
+ #self.print_instance('%lli')
+ self.print_instance('0x%llx')
+
+ def visit_long_long_unsigned_int(self):
+ #self.print_instance('%llu')
+ self.print_instance('0x%llx')
+
+ def visit_float(self):
+ self.print_instance('%f')
+
+ def visit_double(self):
+ self.print_instance('%f')
+
+ def visit_array(self):
+ for i in range(type_traits.array_size(self.type)):
+ dump_type(self.stream, self.instance + '[%i]' % i, type_traits.base_type(self.type))
+
+ def visit_pointer(self):
+ self.print_instance('%p')
+
+ def visit_declarated(self):
+ #stream.write('decl = %r\n' % self.type.decl_string)
+ decl = type_traits.remove_declarated(self.type)
+ dump_decl(self.stream, self.instance, decl)
+
+ def print_instance(self, format):
+ self.stream.write(' debug_printf("\\t\\t%s = %s\\n", %s);\n' % (self.instance, format, "(*ptr)" + self.instance))
+
+
+
+def dump_type(stream, instance, type_):
+ type_ = type_traits.remove_alias(type_)
+ visitor = type_dumper_t(stream, instance, type_)
+ algorithm.apply_visitor(visitor, type_)
+
+
+def dump_struct_interface(stream, class_, suffix = ';'):
+ name = class_.name
+ assert name.startswith('brw_')
+ name = name[:4] + 'dump_' + name[4:]
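+ # e.g. the class name 'brw_vs_unit_state' becomes 'brw_dump_vs_unit_state',
+ # emitted below as: void brw_dump_vs_unit_state(const struct brw_vs_unit_state *ptr);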
+ stream.write('void\n')
+ stream.write('%s(const struct %s *ptr)%s\n' % (name, class_.name, suffix))
+
+
+def dump_struct_implementation(stream, decls, class_):
+ dump_struct_interface(stream, class_, suffix = '')
+ stream.write('{\n')
+ dump_decl(stream, '', class_)
+ stream.write('}\n')
+ stream.write('\n')
+
+
+def dump_header(stream):
+ stream.write(copyright.strip() + '\n')
+ stream.write('\n')
+ stream.write('/**\n')
+ stream.write(' * @file\n')
+ stream.write(' * Dump i965 data structures.\n')
+ stream.write(' *\n')
+ stream.write(' * Generated automatically from brw_structs.h by brw_structs_dump.py.\n')
+ stream.write(' */\n')
+ stream.write('\n')
+
+
+def dump_interfaces(decls, global_ns, names):
+ stream = open('brw_structs_dump.h', 'wt')
+
+ dump_header(stream)
+
+ stream.write('#ifndef BRW_STRUCTS_DUMP_H\n')
+ stream.write('#define BRW_STRUCTS_DUMP_H\n')
+ stream.write('\n')
+
+ for name in names:
+ stream.write('struct %s;\n' % (name,))
+ stream.write('\n')
+
+ for name in names:
+ (class_,) = global_ns.classes(name = name)
+ dump_struct_interface(stream, class_)
+ stream.write('\n')
+ stream.write('\n')
+
+ stream.write('#endif /* BRW_STRUCTS_DUMP_H */\n')
+
+
+def dump_implementations(decls, global_ns, names):
+ stream = open('brw_structs_dump.c', 'wt')
+
+ dump_header(stream)
+
+ stream.write('#include "util/u_debug.h"\n')
+ stream.write('\n')
+ stream.write('#include "brw_types.h"\n')
+ stream.write('#include "brw_structs.h"\n')
+ stream.write('#include "brw_structs_dump.h"\n')
+ stream.write('\n')
+
+ for name in names:
+ (class_,) = global_ns.classes(name = name)
+ dump_struct_implementation(stream, decls, class_)
+
+
+def decl_filter(decl):
+ '''Filter the declarations we're interested in'''
+ name = decl.name
+ return name.startswith('brw_') and name not in ('brw_instruction',)
+
+
+def main():
+
+ config = parser.config_t(
+ include_paths = [
+ '../../include',
+ ],
+ compiler = 'gcc',
+ )
+
+ headers = [
+ 'brw_types.h',
+ 'brw_structs.h',
+ ]
+
+ decls = parser.parse(headers, config, parser.COMPILATION_MODE.ALL_AT_ONCE)
+ global_ns = declarations.get_global_namespace(decls)
+
+ names = []
+ for class_ in global_ns.classes(decl_filter):
+ names.append(class_.name)
+ names.sort()
+
+ dump_interfaces(decls, global_ns, names)
+ dump_implementations(decls, global_ns, names)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/gallium/drivers/i965/brw_swtnl.c b/src/gallium/drivers/i965/brw_swtnl.c
new file mode 100644
index 00000000000..464013e7c40
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_swtnl.c
@@ -0,0 +1,95 @@
+
+#include "brw_context.h"
+#include "brw_pipe_rast.h"
+
+
+static GLboolean need_swtnl( struct brw_context *brw )
+{
+ const struct pipe_rasterizer_state *rast = &brw->curr.rast->templ;
+
+ /* If we don't require strict OpenGL conformance, never
+ * use fallbacks. If we're forcing fallbacks, always
+ * use fallbacks.
+ */
+ if (brw->flags.no_swtnl)
+ return FALSE;
+
+ if (brw->flags.force_swtnl)
+ return TRUE;
+
+ /* Exceeding hw limits on number of VS inputs?
+ */
+ if (brw->curr.num_vertex_elements == 0 ||
+ brw->curr.num_vertex_elements >= BRW_VEP_MAX) {
+ return TRUE;
+ }
+
+ /* Position array with zero stride?
+ *
+ * XXX: position isn't always at zero...
+ * XXX: eliminate zero-stride arrays
+ */
+ {
+ int ve0_vb = brw->curr.vertex_element[0].vertex_buffer_index;
+
+ if (brw->curr.vertex_buffer[ve0_vb].stride == 0)
+ return TRUE;
+ }
+
+ /* XXX: short-circuit
+ */
+ return FALSE;
+
+ if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) {
+ if (rast->poly_smooth)
+ return TRUE;
+
+ }
+
+ if (brw->reduced_primitive == PIPE_PRIM_LINES ||
+ (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+ (rast->fill_cw == PIPE_POLYGON_MODE_LINE ||
+ rast->fill_ccw == PIPE_POLYGON_MODE_LINE)))
+ {
+ /* BRW hardware will do AA lines, but they appear to be
+ * non-conformant. TBD whether we keep this fallback:
+ */
+ if (rast->line_smooth)
+ return TRUE;
+
+ /* XXX: was a fallback in mesa (gs doesn't get enough
+ * information to know when to reset stipple counter), but there
+ * must be a way around it.
+ */
+ if (rast->line_stipple_enable &&
+ (brw->reduced_primitive == PIPE_PRIM_TRIANGLES ||
+ brw->primitive == PIPE_PRIM_LINE_LOOP ||
+ brw->primitive == PIPE_PRIM_LINE_STRIP))
+ return TRUE;
+ }
+
+
+ if (brw->reduced_primitive == PIPE_PRIM_POINTS ||
+ (brw->reduced_primitive == PIPE_PRIM_TRIANGLES &&
+ (rast->fill_cw == PIPE_POLYGON_MODE_POINT ||
+ rast->fill_ccw == PIPE_POLYGON_MODE_POINT)))
+ {
+ if (rast->point_smooth)
+ return TRUE;
+ }
+
+ /* BRW hardware doesn't handle CLAMP texturing correctly;
+ * brw_wm_sampler_state:translate_wrap_mode() treats CLAMP
+ * as CLAMP_TO_EDGE instead. If we're using CLAMP, and
+ * we want strict conformance, force the fallback.
+ *
+ * XXX: need a workaround for this.
+ */
+
+ /* Nothing stopping us from the fast path now */
+ return FALSE;
+}
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_types.h b/src/gallium/drivers/i965/brw_types.h
new file mode 100644
index 00000000000..89e08a5c804
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_types.h
@@ -0,0 +1,21 @@
+#ifndef BRW_TYPES_H
+#define BRW_TYPES_H
+
+#include "pipe/p_compiler.h"
+
+typedef uint32_t GLuint;
+typedef uint8_t GLubyte;
+typedef uint16_t GLushort;
+typedef int32_t GLint;
+typedef int8_t GLbyte;
+typedef int16_t GLshort;
+typedef float GLfloat;
+
+/* no GLenum, translate all away */
+
+typedef uint8_t GLboolean;
+
+#define GL_FALSE FALSE
+#define GL_TRUE TRUE
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_urb.c b/src/gallium/drivers/i965/brw_urb.c
new file mode 100644
index 00000000000..907ec56c6ca
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_urb.c
@@ -0,0 +1,263 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_debug.h"
+
+#define VS 0
+#define GS 1
+#define CLP 2
+#define SF 3
+#define CS 4
+
+/** @file brw_urb.c
+ *
+ * Manages the division of the URB space between the various fixed-function
+ * units.
+ *
+ * See the Thread Initiation Management section of the GEN4 B-Spec, and
+ * the individual *_STATE structures for restrictions on numbers of
+ * entries and threads.
+ */
+
+/*
+ * Generally, a unit requires a min_nr_entries based on how many entries
+ * it produces before the downstream unit gets unblocked and can use and
+ * dereference some of its handles.
+ *
+ * The SF unit preallocates a PUE at the start of thread dispatch, and only
+ * uses that one. So it requires one entry per thread.
+ *
+ * For CLIP, the SF unit will hold the previous primitive while the
+ * next is getting assembled, meaning that linestrips require 3 CLIP VUEs
+ * (vertices) to ensure continued processing, trifans require 4, and tristrips
+ * require 5. There can be 1 or 2 threads, and each has the same requirement.
+ *
+ * GS has the same requirement as CLIP, but it never handles tristrips,
+ * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces.
+ * We only run it single-threaded.
+ *
+ * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X).
+ * Each thread processes 2 preallocated VUEs (vertices) at a time, and they
+ * get streamed down as soon as threads processing earlier vertices get
+ * theirs accepted.
+ *
+ * Each unit will take the number of URB entries we give it (based on the
+ * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs,
+ * and brw_curbe.c for the CURBEs) and decide the maximum number of
+ * threads it can support based on that, in brw_*_state.c.
+ *
+ * XXX: Are the min_entry_size numbers useful?
+ * XXX: Verify min_nr_entries, esp for VS.
+ * XXX: Verify SF min_entry_size.
+ */
+static const struct urb_limits {
+ GLuint min_nr_entries;
+ GLuint preferred_nr_entries;
+ GLuint min_entry_size;
+ GLuint max_entry_size;
+} limits[CS+1] = {
+ { 16, 32, 1, 5 }, /* vs */
+ { 4, 8, 1, 5 }, /* gs */
+ { 5, 10, 1, 5 }, /* clp */
+ { 1, 8, 1, 12 }, /* sf */
+ { 1, 4, 1, 32 } /* cs */
+};
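+
+/* Hypothetical worked example (the entry sizes here are made up; the real
+ * ones come from the compiled VS/SF programs and the CURBE): with
+ * vsize = 4, sfsize = 2, csize = 2 and the preferred entry counts above,
+ * check_urb_layout() below computes
+ *
+ *    vs_start   = 0
+ *    gs_start   = 32 * 4        = 128
+ *    clip_start = 128 + 8 * 4   = 160
+ *    sf_start   = 160 + 10 * 4  = 200
+ *    cs_start   = 200 + 8 * 2   = 216
+ *    end        = 216 + 4 * 2   = 224
+ *
+ * which fits as long as URB_SIZES(brw) is at least 224 (the fence comment
+ * in brw_upload_urb_fence() below mentions 256/384 URB register pairs in
+ * total).
+ */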
+
+
+static GLboolean check_urb_layout( struct brw_context *brw )
+{
+ brw->urb.vs_start = 0;
+ brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize;
+ brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize;
+ brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
+ brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
+
+ return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= URB_SIZES(brw);
+}
+
+/* Most minimal update, forces re-emit of URB fence packet after GS
+ * unit turned on/off.
+ */
+static int recalculate_urb_fence( struct brw_context *brw )
+{
+ GLuint csize = brw->curbe.total_size;
+ GLuint vsize = brw->vs.prog_data->urb_entry_size;
+ GLuint sfsize = brw->sf.prog_data->urb_entry_size;
+
+ if (csize < limits[CS].min_entry_size)
+ csize = limits[CS].min_entry_size;
+
+ if (vsize < limits[VS].min_entry_size)
+ vsize = limits[VS].min_entry_size;
+
+ if (sfsize < limits[SF].min_entry_size)
+ sfsize = limits[SF].min_entry_size;
+
+ if (brw->urb.vsize < vsize ||
+ brw->urb.sfsize < sfsize ||
+ brw->urb.csize < csize ||
+ (brw->urb.constrained && (brw->urb.vsize > vsize ||
+ brw->urb.sfsize > sfsize ||
+ brw->urb.csize > csize))) {
+
+
+ brw->urb.csize = csize;
+ brw->urb.sfsize = sfsize;
+ brw->urb.vsize = vsize;
+
+ brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+ brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;
+ brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
+ brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+ brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;
+
+ brw->urb.constrained = 0;
+
+ if (BRW_IS_IGDNG(brw)) {
+ brw->urb.nr_vs_entries = 128;
+ brw->urb.nr_sf_entries = 48;
+ if (check_urb_layout(brw)) {
+ goto done;
+ } else {
+ brw->urb.constrained = 1;
+ brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+ brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+ }
+ } else if (BRW_IS_G4X(brw)) {
+ brw->urb.nr_vs_entries = 64;
+ if (check_urb_layout(brw)) {
+ goto done;
+ } else {
+ brw->urb.constrained = 1;
+ brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_MIN_URB) {
+ brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
+ brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
+ brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
+ brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
+ brw->urb.nr_cs_entries = limits[CS].min_nr_entries;
+ brw->urb.constrained = 1;
+ }
+
+ if (!check_urb_layout(brw)) {
+ brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
+ brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
+ brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
+ brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
+ brw->urb.nr_cs_entries = limits[CS].min_nr_entries;
+
+ /* Mark us as operating with constrained nr_entries, so that next
+ * time we recalculate we'll resize the fences in the hope of
+ * escaping constrained mode and getting back to normal performance.
+ */
+ brw->urb.constrained = 1;
+
+ if (!check_urb_layout(brw)) {
+ /* This is impossible, given the maximal sizes of urb
+ * entries and the values for minimum nr of entries
+ * provided above.
+ */
+ debug_printf("couldn't calculate URB layout!\n");
+ exit(1);
+ }
+
+ if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
+ debug_printf("URB CONSTRAINED\n");
+ }
+
+done:
+ if (BRW_DEBUG & DEBUG_URB)
+ debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+ brw->urb.vs_start,
+ brw->urb.gs_start,
+ brw->urb.clip_start,
+ brw->urb.sf_start,
+ brw->urb.cs_start,
+ URB_SIZES(brw));
+
+ brw->state.dirty.brw |= BRW_NEW_URB_FENCE;
+ }
+
+ return 0;
+}
+
+
+const struct brw_tracked_state brw_recalculate_urb_fence = {
+ .dirty = {
+ .mesa = 0,
+ .brw = BRW_NEW_CURBE_OFFSETS,
+ .cache = (CACHE_NEW_VS_PROG |
+ CACHE_NEW_SF_PROG)
+ },
+ .prepare = recalculate_urb_fence
+};
+
+
+
+
+
+int brw_upload_urb_fence(struct brw_context *brw)
+{
+ struct brw_urb_fence uf;
+ memset(&uf, 0, sizeof(uf));
+
+ uf.header.opcode = CMD_URB_FENCE;
+ uf.header.length = sizeof(uf)/4-2;
+ uf.header.vs_realloc = 1;
+ uf.header.gs_realloc = 1;
+ uf.header.clp_realloc = 1;
+ uf.header.sf_realloc = 1;
+ uf.header.vfe_realloc = 1;
+ uf.header.cs_realloc = 1;
+
+ /* The ordering below is correct, not the layout in the
+ * instruction.
+ *
+ * There are 256/384 urb reg pairs in total.
+ */
+ uf.bits0.vs_fence = brw->urb.gs_start;
+ uf.bits0.gs_fence = brw->urb.clip_start;
+ uf.bits0.clp_fence = brw->urb.sf_start;
+ uf.bits1.sf_fence = brw->urb.cs_start;
+ uf.bits1.cs_fence = URB_SIZES(brw);
+
+ BRW_BATCH_STRUCT(brw, &uf);
+ return 0;
+}
diff --git a/src/gallium/drivers/i965/brw_util.c b/src/gallium/drivers/i965/brw_util.c
new file mode 100644
index 00000000000..458058d668d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_util.c
@@ -0,0 +1,38 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_util.h"
+#include "brw_defines.h"
+
+
+
+
diff --git a/src/gallium/drivers/i965/brw_util.h b/src/gallium/drivers/i965/brw_util.h
new file mode 100644
index 00000000000..b5f9a36e7bc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_util.h
@@ -0,0 +1,44 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_UTIL_H
+#define BRW_UTIL_H
+
+#include "brw_types.h"
+
+extern GLuint brw_count_bits( GLuint val );
+extern GLuint brw_translate_blend_factor( unsigned factor );
+extern GLuint brw_translate_blend_equation( unsigned mode );
+
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_vs.c b/src/gallium/drivers/i965/brw_vs.c
new file mode 100644
index 00000000000..e3ea5a3a135
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.c
@@ -0,0 +1,131 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "tgsi/tgsi_dump.h"
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_util.h"
+#include "brw_state.h"
+#include "brw_pipe_rast.h"
+
+
+
+static enum pipe_error do_vs_prog( struct brw_context *brw,
+ struct brw_vertex_shader *vp,
+ struct brw_vs_prog_key *key,
+ struct brw_winsys_buffer **bo_out)
+{
+ enum pipe_error ret;
+ GLuint program_size;
+ const GLuint *program;
+ struct brw_vs_compile c;
+
+ memset(&c, 0, sizeof(c));
+ memcpy(&c.key, key, sizeof(*key));
+
+ brw_init_compile(brw, &c.func);
+ c.vp = vp;
+
+ c.prog_data.nr_outputs = vp->info.num_outputs;
+ c.prog_data.nr_inputs = vp->info.num_inputs;
+
+ if (1)
+ tgsi_dump(c.vp->tokens, 0);
+
+ /* Emit GEN4 code.
+ */
+ brw_vs_emit(&c);
+
+ /* get the program
+ */
+ ret = brw_get_program(&c.func, &program, &program_size);
+ if (ret)
+ return ret;
+
+ ret = brw_upload_cache( &brw->cache, BRW_VS_PROG,
+ &c.key, brw_vs_prog_key_size(&c.key),
+ NULL, 0,
+ program, program_size,
+ &c.prog_data,
+ &brw->vs.prog_data,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+static enum pipe_error brw_upload_vs_prog(struct brw_context *brw)
+{
+ struct brw_vs_prog_key key;
+ struct brw_vertex_shader *vp = brw->curr.vertex_shader;
+ struct brw_fs_signature *sig = &brw->curr.fragment_shader->signature;
+ enum pipe_error ret;
+
+ memset(&key, 0, sizeof(key));
+
+ key.program_string_id = vp->id;
+ key.nr_userclip = brw->curr.ucp.nr;
+
+ memcpy(&key.fs_signature, sig, brw_fs_signature_size(sig));
+
+
+ /* Make an early check for the key.
+ */
+ if (brw_search_cache(&brw->cache, BRW_VS_PROG,
+ &key, brw_vs_prog_key_size(&key),
+ NULL, 0,
+ &brw->vs.prog_data,
+ &brw->vs.prog_bo))
+ return PIPE_OK;
+
+ ret = do_vs_prog(brw, vp, &key, &brw->vs.prog_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+/* See brw_vs.c:
+ */
+const struct brw_tracked_state brw_vs_prog = {
+ .dirty = {
+ .mesa = (PIPE_NEW_CLIP |
+ PIPE_NEW_RAST |
+ PIPE_NEW_FRAGMENT_SIGNATURE),
+ .brw = BRW_NEW_VERTEX_PROGRAM,
+ .cache = 0
+ },
+ .prepare = brw_upload_vs_prog
+};
diff --git a/src/gallium/drivers/i965/brw_vs.h b/src/gallium/drivers/i965/brw_vs.h
new file mode 100644
index 00000000000..944d88c84cc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs.h
@@ -0,0 +1,106 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_VS_H
+#define BRW_VS_H
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+
+struct brw_vs_prog_key {
+ GLuint program_string_id;
+ GLuint nr_userclip:4;
+ GLuint pad:26;
+ struct brw_fs_signature fs_signature;
+};
+
+#define brw_vs_prog_key_size(s) (offsetof(struct brw_vs_prog_key, fs_signature) + \
+ brw_fs_signature_size(&(s)->fs_signature))
+
+
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+
+struct brw_vs_compile {
+ struct brw_compile func;
+ struct brw_vs_prog_key key;
+ struct brw_vs_prog_data prog_data;
+ struct brw_chipset chipset;
+
+ struct brw_vertex_shader *vp;
+
+ GLuint nr_inputs;
+ GLuint nr_outputs;
+ GLuint nr_immediates;
+ GLfloat immediate[128][4];
+
+ GLuint overflow_grf_start;
+ GLuint overflow_count;
+
+ GLuint first_tmp;
+ GLuint last_tmp;
+
+ struct brw_reg r0;
+ struct brw_reg r1;
+ struct brw_reg regs[TGSI_FILE_COUNT][128];
+ struct brw_reg tmp;
+ struct brw_reg stack;
+
+ struct {
+ GLboolean used_in_src;
+ struct brw_reg reg;
+ } output_regs[128];
+
+ struct brw_reg userplane[6];
+
+ /** we may need up to 3 constants per instruction (if use_const_buffer) */
+ struct {
+ GLint index;
+ struct brw_reg reg;
+ } current_const[3];
+
+ struct brw_instruction *if_inst[MAX_IF_DEPTH];
+ struct brw_instruction *loop_inst[MAX_LOOP_DEPTH];
+ GLuint insn;
+ GLuint if_depth;
+ GLuint loop_depth;
+ GLuint end_offset;
+
+ struct brw_indirect stack_index;
+};
+
+
+void brw_vs_emit( struct brw_vs_compile *c );
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
new file mode 100644
index 00000000000..8a16205d2f6
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -0,0 +1,1654 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_vs.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+/* Choose one of the 4 vec4's which can be packed into each 16-wide reg.
+ */
+static INLINE struct brw_reg brw_vec4_grf_repeat( GLuint reg, GLuint slot )
+{
+ int nr = reg + slot/2;
+ int subnr = (slot%2) * 4;
+
+ return stride(brw_vec4_grf(nr, subnr), 0, 4, 1);
+}
+
+
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+ struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+
+ if (++c->last_tmp > c->prog_data.total_grf)
+ c->prog_data.total_grf = c->last_tmp;
+
+ return tmp;
+}
+
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{
+ if (tmp.nr == c->last_tmp-1)
+ c->last_tmp--;
+}
+
+static void release_tmps( struct brw_vs_compile *c )
+{
+ c->last_tmp = c->first_tmp;
+}
+
+
+static boolean is_position_output( struct brw_vs_compile *c,
+ unsigned vs_output )
+{
+ const struct brw_vertex_shader *vs = c->vp;
+ unsigned semantic = vs->info.output_semantic_name[vs_output];
+ unsigned index = vs->info.output_semantic_index[vs_output];
+
+ return (semantic == TGSI_SEMANTIC_POSITION &&
+ index == 0);
+}
+
+
+static boolean find_output_slot( struct brw_vs_compile *c,
+ unsigned vs_output,
+ unsigned *fs_input_slot )
+{
+ const struct brw_vertex_shader *vs = c->vp;
+ unsigned semantic = vs->info.output_semantic_name[vs_output];
+ unsigned index = vs->info.output_semantic_index[vs_output];
+ unsigned i;
+
+ for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
+ if (c->key.fs_signature.input[i].semantic == semantic &&
+ c->key.fs_signature.input[i].semantic_index == index) {
+ *fs_input_slot = i;
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+
+/**
+ * Preallocate GRF register before code emit.
+ * Do things as simply as possible. Allocate and populate all regs
+ * ahead of time.
+ */
+static void brw_vs_alloc_regs( struct brw_vs_compile *c )
+{
+ GLuint i, reg = 0, subreg = 0, mrf;
+ int attributes_in_vue;
+
+ /* Determine whether to use a real constant buffer or use a block
+ * of GRF registers for constants. The latter is faster but only
+ * works if everything fits in the GRF.
+ * XXX this heuristic/check may need some fine tuning...
+ */
+ if (c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1 +
+ c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
+ c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 > BRW_MAX_GRF)
+ c->vp->use_const_buffer = GL_TRUE;
+ else {
+ /* XXX: immediates can go elsewhere if necessary:
+ */
+ assert(c->vp->info.file_max[TGSI_FILE_IMMEDIATE] + 1 +
+ c->vp->info.file_max[TGSI_FILE_TEMPORARY] + 1 + 21 <= BRW_MAX_GRF);
+
+ c->vp->use_const_buffer = GL_FALSE;
+ }
+
+ /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
+
+ /* r0 -- reserved as usual
+ */
+ c->r0 = brw_vec8_grf(reg, 0);
+ reg++;
+
+ /* User clip planes from curbe:
+ */
+ if (c->key.nr_userclip) {
+ /* Skip over fixed planes: Or never read them into vs unit?
+ */
+ subreg += 6;
+
+ for (i = 0; i < c->key.nr_userclip; i++, subreg++) {
+ c->userplane[i] =
+ stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+ }
+
+ /* Deal with curbe alignment:
+ */
+ subreg = align(subreg, 2);
+ /*reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;*/
+ }
+
+
+ /* Immediates: always in the curbe.
+ *
+ * XXX: Can try to encode some immediates as brw immediates
+ * XXX: Make sure ureg sets minimal immediate size and respect it
+ * here.
+ */
+ for (i = 0; i < c->vp->info.immediate_count; i++, subreg++) {
+ c->regs[TGSI_FILE_IMMEDIATE][i] =
+ stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+ }
+ c->prog_data.nr_params = c->vp->info.immediate_count * 4;
+
+
+ /* Vertex constant buffer.
+ *
+ * Constants from the buffer can be either cached in the curbe or
+ * loaded as needed from the actual constant buffer.
+ */
+ if (!c->vp->use_const_buffer) {
+ GLuint nr_params = c->vp->info.file_max[TGSI_FILE_CONSTANT] + 1;
+
+ for (i = 0; i < nr_params; i++, subreg++) {
+ c->regs[TGSI_FILE_CONSTANT][i] =
+ stride( brw_vec4_grf(reg+subreg/2, (subreg%2) * 4), 0, 4, 1);
+ }
+
+ c->prog_data.nr_params += nr_params * 4;
+ }
+
+ /* All regs allocated
+ */
+ reg += (subreg + 1) / 2;
+ c->prog_data.curb_read_length = reg - 1;
+
+
+ /* Allocate input regs:
+ */
+ c->nr_inputs = c->vp->info.num_inputs;
+ for (i = 0; i < c->nr_inputs; i++) {
+ c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0);
+ reg++;
+ }
+
+ /* If there are no inputs, we'll still be reading one attribute's worth
+ * because it's required -- see urb_read_length setting.
+ */
+ if (c->nr_inputs == 0)
+ reg++;
+
+
+
+ /* Allocate outputs. The non-position outputs go straight into message regs.
+ */
+ c->nr_outputs = c->prog_data.nr_outputs;
+
+ if (c->chipset.is_igdng)
+ mrf = 8;
+ else
+ mrf = 4;
+
+
+ if (c->key.fs_signature.nr_inputs > BRW_MAX_MRF) {
+ c->overflow_grf_start = reg;
+ c->overflow_count = c->key.fs_signature.nr_inputs - BRW_MAX_MRF;
+ reg += c->overflow_count;
+ }
+
+ /* XXX: need to access vertex output semantics here:
+ */
+ for (i = 0; i < c->nr_outputs; i++) {
+ unsigned slot;
+
+ /* XXX: Put output position in slot zero always. Clipper, etc,
+ * need access to this reg.
+ */
+ if (is_position_output(c, i)) {
+ c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); /* copy to mrf 0 */
+ reg++;
+ }
+ else if (find_output_slot(c, i, &slot)) {
+
+ if (0 /* is_psize_output(c, i) */ ) {
+ /* c->psize_out.grf = reg; */
+ /* c->psize_out.mrf = i; */
+ }
+
+ /* The first (16-4) outputs can go straight into the message regs.
+ */
+ if (slot + mrf < BRW_MAX_MRF) {
+ c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(slot + mrf);
+ }
+ else {
+ int grf = c->overflow_grf_start + slot - BRW_MAX_MRF;
+ c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(grf, 0);
+ }
+ }
+ else {
+ c->regs[TGSI_FILE_OUTPUT][i] = brw_null_reg();
+ }
+ }
+
+ /* Allocate program temporaries:
+ */
+
+ for (i = 0; i < c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1; i++) {
+ c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0);
+ reg++;
+ }
+
+ /* Address reg(s). Don't try to use the internal address reg until
+ * deref time.
+ */
+ for (i = 0; i < c->vp->info.file_max[TGSI_FILE_ADDRESS]+1; i++) {
+ c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
+ reg,
+ 0,
+ BRW_REGISTER_TYPE_D,
+ BRW_VERTICAL_STRIDE_8,
+ BRW_WIDTH_8,
+ BRW_HORIZONTAL_STRIDE_1,
+ BRW_SWIZZLE_XXXX,
+ BRW_WRITEMASK_X);
+ reg++;
+ }
+
+ if (c->vp->use_const_buffer) {
+ for (i = 0; i < 3; i++) {
+ c->current_const[i].index = -1;
+ c->current_const[i].reg = brw_vec8_grf(reg, 0);
+ reg++;
+ }
+ }
+
+#if 0
+ for (i = 0; i < 128; i++) {
+ if (c->output_regs[i].used_in_src) {
+ c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+ reg++;
+ }
+ }
+#endif
+
+ if (c->vp->has_flow_control) {
+ c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
+ reg += 2;
+ }
+
+ /* Some opcodes need an internal temporary:
+ */
+ c->first_tmp = reg;
+ c->last_tmp = reg; /* for allocation purposes */
+
+ /* Each input reg holds data from two vertices. The
+ * urb_read_length is the number of registers read from *each*
+ * vertex urb, so is half the amount:
+ */
+ c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+
+ /* Setting this field to 0 leads to undefined behavior according to
+ * the VS_STATE docs. Our VUEs will always have at least one attribute
+ * sitting in them, even if it's padding.
+ */
+ if (c->prog_data.urb_read_length == 0)
+ c->prog_data.urb_read_length = 1;
+
+ /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+ * them to fit the biggest thing they need to.
+ */
+ attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
+
+ if (c->chipset.is_igdng)
+ c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+ else
+ c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
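+
+ /* Hypothetical example (the counts are made up): with 5 inputs and 7
+ * outputs, urb_read_length = (5 + 1) / 2 = 3 registers per vertex,
+ * attributes_in_vue = MAX2(7, 5) = 7, and the non-IGDNG entry size
+ * works out to (7 + 2 + 3) / 4 = 3.
+ */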
+
+ c->prog_data.total_grf = reg;
+
+ if (BRW_DEBUG & DEBUG_VS) {
+ debug_printf("%s NumAddrRegs %d\n", __FUNCTION__,
+ c->vp->info.file_max[TGSI_FILE_ADDRESS]+1);
+ debug_printf("%s NumTemps %d\n", __FUNCTION__,
+ c->vp->info.file_max[TGSI_FILE_TEMPORARY]+1);
+ debug_printf("%s reg = %d\n", __FUNCTION__, reg);
+ }
+}
+
+
+/**
+ * If an instruction uses a temp reg both as a src and the dest, we
+ * sometimes need to allocate an intermediate temporary.
+ */
+static void unalias1( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ void (*func)( struct brw_vs_compile *,
+ struct brw_reg,
+ struct brw_reg ))
+{
+ if (dst.file == arg0.file && dst.nr == arg0.nr) {
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+ func(c, tmp, arg0);
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+ else {
+ func(c, dst, arg0);
+ }
+}
+
+/**
+ * \sa unalias1
+ * Checks whether a 2-operand instruction needs an intermediate temporary.
+ */
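+/* For example (hypothetical TGSI): "XPD TEMP[0], TEMP[0], TEMP[1]" must
+ * not write TEMP[0].x before it has read TEMP[0].x to compute the y
+ * component, so the result is built in a temporary and copied back.
+ */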
+static void unalias2( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1,
+ void (*func)( struct brw_vs_compile *,
+ struct brw_reg,
+ struct brw_reg,
+ struct brw_reg ))
+{
+ if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
+ (dst.file == arg1.file && dst.nr == arg1.nr)) {
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+ func(c, tmp, arg0, arg1);
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+ else {
+ func(c, dst, arg0, arg1);
+ }
+}
+
+/**
+ * \sa unalias2
+ * Checks whether a 3-operand instruction needs an intermediate temporary.
+ */
+static void unalias3( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1,
+ struct brw_reg arg2,
+ void (*func)( struct brw_vs_compile *,
+ struct brw_reg,
+ struct brw_reg,
+ struct brw_reg,
+ struct brw_reg ))
+{
+ if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
+ (dst.file == arg1.file && dst.nr == arg1.nr) ||
+ (dst.file == arg2.file && dst.nr == arg2.nr)) {
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
+ func(c, tmp, arg0, arg1, arg2);
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+ else {
+ func(c, dst, arg0, arg1, arg2);
+ }
+}
+
+static void emit_sop( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1,
+ GLuint cond)
+{
+ brw_MOV(p, dst, brw_imm_f(0.0f));
+ brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
+ brw_MOV(p, dst, brw_imm_f(1.0f));
+ brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+static void emit_seq( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
+}
+
+static void emit_sne( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
+}
+static void emit_slt( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L);
+}
+
+static void emit_sle( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE);
+}
+
+static void emit_sgt( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G);
+}
+
+static void emit_sge( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE);
+}
+
+static void emit_max( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+ brw_SEL(p, dst, arg1, arg0);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+static void emit_min( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1 )
+{
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+ brw_SEL(p, dst, arg0, arg1);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+
+static void emit_math1( struct brw_vs_compile *c,
+ GLuint function,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ GLuint precision)
+{
+ /* There are various odd behaviours with SEND on the simulator. In
+ * addition there are documented issues with the fact that the GEN4
+ * processor doesn't do dependency control properly on SEND
+ * results. So, on balance, this kludge to get around failures
+ * with writemasked math results looks like it might be necessary
+ * whether that turns out to be a simulator bug or not:
+ */
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = dst;
+ GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
+ dst.file != BRW_GENERAL_REGISTER_FILE);
+
+ if (need_tmp)
+ tmp = get_tmp(c);
+
+ brw_math(p,
+ tmp,
+ function,
+ BRW_MATH_SATURATE_NONE,
+ 2,
+ arg0,
+ BRW_MATH_DATA_SCALAR,
+ precision);
+
+ if (need_tmp) {
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+}
+
+
+static void emit_math2( struct brw_vs_compile *c,
+ GLuint function,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1,
+ GLuint precision)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = dst;
+ GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
+ dst.file != BRW_GENERAL_REGISTER_FILE);
+
+ if (need_tmp)
+ tmp = get_tmp(c);
+
+ brw_MOV(p, brw_message_reg(3), arg1);
+
+ brw_math(p,
+ tmp,
+ function,
+ BRW_MATH_SATURATE_NONE,
+ 2,
+ arg0,
+ BRW_MATH_DATA_SCALAR,
+ precision);
+
+ if (need_tmp) {
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+}
+
+
+static void emit_exp_noalias( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0 )
+{
+ struct brw_compile *p = &c->func;
+
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_X) {
+ struct brw_reg tmp = get_tmp(c);
+ struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
+
+ /* tmp_d = floor(arg0.x) */
+ brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
+
+ /* result[0] = 2.0 ^ tmp */
+
+ /* Adjust exponent for floating point:
+ * exp += 127
+ */
+ brw_ADD(p, brw_writemask(tmp_d, BRW_WRITEMASK_X), tmp_d, brw_imm_d(127));
+
+ /* Install exponent and sign.
+ * Excess drops off the edge:
+ */
+ brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), BRW_WRITEMASK_X),
+ tmp_d, brw_imm_d(23));
+
+ release_tmp(c, tmp);
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y) {
+ /* result[1] = arg0.x - floor(arg0.x) */
+ brw_FRC(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0, 0));
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
+ /* As with the LOG instruction, we might be better off just
+ * doing a Taylor expansion here, seeing as we have to do all
+ * the prep work.
+ *
+ * If mathbox partial precision is too low, consider also:
+ * result[2] = result[0] * EXP(result[1])
+ */
+ emit_math1(c,
+ BRW_MATH_FUNCTION_EXP,
+ brw_writemask(dst, BRW_WRITEMASK_Z),
+ brw_swizzle1(arg0, 0),
+ BRW_MATH_PRECISION_FULL);
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
+ /* result[3] = 1.0; */
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), brw_imm_f(1));
+ }
+}
+
+
+static void emit_log_noalias( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0 )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = dst;
+ struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
+ struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
+ GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
+ dst.file != BRW_GENERAL_REGISTER_FILE);
+
+ if (need_tmp) {
+ tmp = get_tmp(c);
+ tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
+ }
+
+ /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mant
+ * according to spec:
+ *
+ * These almost look like they could be joined up, but not really
+ * practical:
+ *
+ * result[0].f = ((x.i & ((1u<<31)-1)) >> 23) - 127
+ * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
+ */
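+ /* The X intermediate is computed whenever X or Z is requested, and Y
+ * whenever Y or Z is, because the Z result below is built from both.
+ */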
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_XZ) {
+ brw_AND(p,
+ brw_writemask(tmp_ud, BRW_WRITEMASK_X),
+ brw_swizzle1(arg0_ud, 0),
+ brw_imm_ud((1U<<31)-1));
+
+ brw_SHR(p,
+ brw_writemask(tmp_ud, BRW_WRITEMASK_X),
+ tmp_ud,
+ brw_imm_ud(23));
+
+ brw_ADD(p,
+ brw_writemask(tmp, BRW_WRITEMASK_X),
+ retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
+ brw_imm_d(-127));
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_YZ) {
+ brw_AND(p,
+ brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
+ brw_swizzle1(arg0_ud, 0),
+ brw_imm_ud((1<<23)-1));
+
+ brw_OR(p,
+ brw_writemask(tmp_ud, BRW_WRITEMASK_Y),
+ tmp_ud,
+ brw_imm_ud(127<<23));
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z) {
+ /* result[2] = result[0] + LOG2(result[1]); */
+
+ /* Why bother? The above is just a hint at how to do this with a
+ * Taylor series. Maybe we *should* use a Taylor series, as by
+ * the time all the above has been done it's almost certainly
+ * quicker than calling the mathbox, even with low precision.
+ *
+ * Options are:
+ * - result[0] + mathbox.LOG2(result[1])
+ * - mathbox.LOG2(arg0.x)
+ * - result[0] + inline_taylor_approx(result[1])
+ */
+ emit_math1(c,
+ BRW_MATH_FUNCTION_LOG,
+ brw_writemask(tmp, BRW_WRITEMASK_Z),
+ brw_swizzle1(tmp, 1),
+ BRW_MATH_PRECISION_FULL);
+
+ brw_ADD(p,
+ brw_writemask(tmp, BRW_WRITEMASK_Z),
+ brw_swizzle1(tmp, 2),
+ brw_swizzle1(tmp, 0));
+ }
+
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_W) {
+ /* result[3] = 1.0; */
+ brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_W), brw_imm_f(1));
+ }
+
+ if (need_tmp) {
+ brw_MOV(p, dst, tmp);
+ release_tmp(c, tmp);
+ }
+}
+
+
+/* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
+ */
+static void emit_dst_noalias( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1)
+{
+ struct brw_compile *p = &c->func;
+
+ /* There must be a better way to do this:
+ */
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_X)
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_X), brw_imm_f(1.0));
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_Y)
+ brw_MUL(p, brw_writemask(dst, BRW_WRITEMASK_Y), arg0, arg1);
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_Z)
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Z), arg0);
+ if (dst.dw1.bits.writemask & BRW_WRITEMASK_W)
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_W), arg1);
+}
+
+
+static void emit_xpd( struct brw_compile *p,
+ struct brw_reg dst,
+ struct brw_reg t,
+ struct brw_reg u)
+{
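+ /* Cross product via the accumulator:
+ * acc = t.yzxw * u.zxyw
+ * dst = acc - t.zxyw * u.yzxw (MAC with the first source negated)
+ */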
+ brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
+ brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
+}
+
+
+static void emit_lit_noalias( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0 )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_instruction *if_insn;
+ struct brw_reg tmp = dst;
+ GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
+
+ if (need_tmp)
+ tmp = get_tmp(c);
+
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_YZ), brw_imm_f(0));
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_XW), brw_imm_f(1));
+
+ /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
+ * to get all channels active inside the IF. In the clipping code
+ * we run with NoMask, so it's not an option and we can use
+ * BRW_EXECUTE_1 for all comparisons.
+ */
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
+ if_insn = brw_IF(p, BRW_EXECUTE_8);
+ {
+ brw_MOV(p, brw_writemask(dst, BRW_WRITEMASK_Y), brw_swizzle1(arg0,0));
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
+ brw_MOV(p, brw_writemask(tmp, BRW_WRITEMASK_Z), brw_swizzle1(arg0,1));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+ emit_math2(c,
+ BRW_MATH_FUNCTION_POW,
+ brw_writemask(dst, BRW_WRITEMASK_Z),
+ brw_swizzle1(tmp, 2),
+ brw_swizzle1(arg0, 3),
+ BRW_MATH_PRECISION_PARTIAL);
+ }
+
+ brw_ENDIF(p, if_insn);
+
+ release_tmp(c, tmp);
+}
+
+static void emit_lrp_noalias(struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ struct brw_reg arg1,
+ struct brw_reg arg2)
+{
+ struct brw_compile *p = &c->func;
+
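+ /* dst = arg0 * arg1 + (1 - arg0) * arg2, with the second product
+ * staged through the accumulator:
+ */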
+ brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
+ brw_MUL(p, brw_null_reg(), dst, arg2);
+ brw_MAC(p, dst, arg0, arg1);
+}
+
+/** 3 or 4-component vector normalization */
+static void emit_nrm( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0,
+ int num_comps)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = get_tmp(c);
+
+ /* tmp = dot(arg0, arg0) */
+ if (num_comps == 3)
+ brw_DP3(p, tmp, arg0, arg0);
+ else
+ brw_DP4(p, tmp, arg0, arg0);
+
+ /* tmp = 1 / sqrt(tmp) */
+ emit_math1(c, BRW_MATH_FUNCTION_RSQ, tmp, tmp, BRW_MATH_PRECISION_FULL);
+
+ /* dst = arg0 * tmp */
+ brw_MUL(p, dst, arg0, tmp);
+
+ release_tmp(c, tmp);
+}
+
+
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+ GLuint argIndex,
+ GLuint index,
+ GLboolean relAddr)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg const_reg;
+ struct brw_reg const2_reg;
+
+ assert(argIndex < 3);
+
+ if (c->current_const[argIndex].index != index || relAddr) {
+ struct brw_reg addrReg = c->regs[TGSI_FILE_ADDRESS][0];
+
+ c->current_const[argIndex].index = index;
+
+#if 0
+ printf(" fetch const[%d] for arg %d into reg %d\n",
+ index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+ /* need to fetch the constant now */
+ brw_dp_READ_4_vs(p,
+ c->current_const[argIndex].reg,/* writeback dest */
+ 0, /* oword */
+ relAddr, /* relative indexing? */
+ addrReg, /* address register */
+ 16 * index, /* byte offset */
+ SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
+ );
+
+ if (relAddr) {
+ /* second read */
+ const2_reg = get_tmp(c);
+
+ /* use upper half of address reg for second read */
+ addrReg = stride(addrReg, 0, 4, 0);
+ addrReg.subnr = 16;
+
+ brw_dp_READ_4_vs(p,
+ const2_reg, /* writeback dest */
+ 1, /* oword */
+ relAddr, /* relative indexing? */
+ addrReg, /* address register */
+ 16 * index, /* byte offset */
+ SURF_INDEX_VERT_CONST_BUFFER
+ );
+ }
+ }
+
+ const_reg = c->current_const[argIndex].reg;
+
+ if (relAddr) {
+ /* merge the two Owords into the constant register */
+ /* const_reg[7..4] = const2_reg[7..4] */
+ brw_MOV(p,
+ suboffset(stride(const_reg, 0, 4, 1), 4),
+ suboffset(stride(const2_reg, 0, 4, 1), 4));
+ release_tmp(c, const2_reg);
+ }
+ else {
+ /* replicate lower four floats into upper half (to get XYZWXYZW) */
+ const_reg = stride(const_reg, 0, 4, 0);
+ const_reg.subnr = 0;
+ }
+
+ return const_reg;
+}
+
+
+
+/* TODO: relative addressing!
+ */
+static struct brw_reg get_reg( struct brw_vs_compile *c,
+ enum tgsi_file_type file,
+ GLuint index )
+{
+ switch (file) {
+ case TGSI_FILE_TEMPORARY:
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_OUTPUT:
+ case TGSI_FILE_CONSTANT:
+ assert(c->regs[file][index].nr != 0);
+ return c->regs[file][index];
+
+ case TGSI_FILE_ADDRESS:
+ assert(index == 0);
+ return c->regs[file][index];
+
+ case TGSI_FILE_NULL: /* undef values */
+ return brw_null_reg();
+
+ default:
+ assert(0);
+ return brw_null_reg();
+ }
+}
+
+
+/**
+ * Indirect addressing: get reg[[arg] + offset].
+ */
+static struct brw_reg deref( struct brw_vs_compile *c,
+ struct brw_reg arg,
+ GLint offset)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = vec4(get_tmp(c));
+ struct brw_reg addr_reg = c->regs[TGSI_FILE_ADDRESS][0];
+ struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
+ GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
+ struct brw_reg indirect = brw_vec4_indirect(0,0);
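+ /* A GRF register is 32 bytes and each vec4 element is 16 bytes, so
+ * the constant part of the indirect address is reg*32 + subreg +
+ * offset*16; the variable part comes from the address register.
+ */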
+
+ {
+ brw_push_insn_state(p);
+ brw_set_access_mode(p, BRW_ALIGN_1);
+
+ /* This is pretty clunky - load the address register twice and
+ * fetch each 4-dword value in turn. There must be a way to do
+ * this in a single pass, but I couldn't get it to work.
+ */
+ brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset));
+ brw_MOV(p, tmp, indirect);
+
+ brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset));
+ brw_MOV(p, suboffset(tmp, 4), indirect);
+
+ brw_pop_insn_state(p);
+ }
+
+ /* NOTE: tmp not released */
+ return vec8(tmp);
+}
+
+
+/**
+ * Get brw reg corresponding to the instruction's [argIndex] src reg.
+ * TODO: relative addressing!
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+ GLuint argIndex,
+ GLuint file,
+ GLint index,
+ GLboolean relAddr )
+{
+
+ switch (file) {
+ case TGSI_FILE_TEMPORARY:
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_OUTPUT:
+ if (relAddr) {
+ return deref(c, c->regs[file][0], index);
+ }
+ else {
+ assert(c->regs[file][index].nr != 0);
+ return c->regs[file][index];
+ }
+
+ case TGSI_FILE_IMMEDIATE:
+ return c->regs[file][index];
+
+ case TGSI_FILE_CONSTANT:
+ if (c->vp->use_const_buffer) {
+ return get_constant(c, argIndex, index, relAddr);
+ }
+ else if (relAddr) {
+ return deref(c, c->regs[TGSI_FILE_CONSTANT][0], index);
+ }
+ else {
+ assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0);
+ return c->regs[TGSI_FILE_CONSTANT][index];
+ }
+ case TGSI_FILE_ADDRESS:
+ assert(index == 0);
+ return c->regs[file][index];
+
+ case TGSI_FILE_NULL:
+ /* this is a normal case since we loop over all three src args */
+ return brw_null_reg();
+
+ default:
+ assert(0);
+ return brw_null_reg();
+ }
+}
+
+
+static void emit_arl( struct brw_vs_compile *c,
+ struct brw_reg dst,
+ struct brw_reg arg0 )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg tmp = dst;
+ GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
+
+ if (need_tmp)
+ tmp = get_tmp(c);
+
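+ /* The address register holds a byte offset and each vec4 element is
+ * 16 bytes, hence the scale by 16 (see deref()).
+ */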
+ brw_RNDD(p, tmp, arg0); /* tmp = floor(arg0) */
+ brw_MUL(p, dst, tmp, brw_imm_d(16)); /* dst = tmp * 16 */
+
+ if (need_tmp)
+ release_tmp(c, tmp);
+}
+
+
+/**
+ * Return the brw reg for the given instruction's src argument.
+ */
+static struct brw_reg get_arg( struct brw_vs_compile *c,
+ const struct tgsi_full_src_register *src,
+ GLuint argIndex )
+{
+ struct brw_reg reg;
+
+ if (src->Register.File == TGSI_FILE_NULL)
+ return brw_null_reg();
+
+ reg = get_src_reg(c, argIndex,
+ src->Register.File,
+ src->Register.Index,
+ src->Register.Indirect);
+
+ /* Pack the per-component TGSI swizzle selects into the hardware's
+ * 2-bit-per-channel swizzle field.
+ */
+ reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->Register.SwizzleX,
+ src->Register.SwizzleY,
+ src->Register.SwizzleZ,
+ src->Register.SwizzleW);
+
+ reg.negate = src->Register.Negate ? 1 : 0;
+
+ /* XXX: abs, absneg
+ */
+
+ return reg;
+}
+
+
+/**
+ * Get brw register for the given program dest register.
+ */
+static struct brw_reg get_dst( struct brw_vs_compile *c,
+ unsigned file,
+ unsigned index,
+ unsigned writemask )
+{
+ struct brw_reg reg;
+
+ switch (file) {
+ case TGSI_FILE_TEMPORARY:
+ case TGSI_FILE_OUTPUT:
+ assert(c->regs[file][index].nr != 0);
+ reg = c->regs[file][index];
+ break;
+ case TGSI_FILE_ADDRESS:
+ assert(index == 0);
+ reg = c->regs[file][index];
+ break;
+ case TGSI_FILE_NULL:
+ /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+ reg = brw_null_reg();
+ break;
+ default:
+ assert(0);
+ reg = brw_null_reg();
+ }
+
+ reg.dw1.bits.writemask = writemask;
+
+ return reg;
+}
+
+
+
+
+/**
+ * Post-vertex-program processing. Send the results to the URB.
+ */
+static void emit_vertex_write( struct brw_vs_compile *c)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg m0 = brw_message_reg(0);
+ struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_HPOS];
+ struct brw_reg ndc;
+ int eot;
+ int i;
+ GLuint len_vertex_header = 2;
+
+ /* Build ndc coords */
+ ndc = get_tmp(c);
+ /* ndc = 1.0 / pos.w */
+ emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+ /* ndc.xyz = pos * ndc */
+ brw_MUL(p, brw_writemask(ndc, BRW_WRITEMASK_XYZ), pos, ndc);
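+ /* ndc.xyz now holds the projected position; ndc.w still holds 1/pos.w
+ * (rhw).
+ */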
+
+ /* Update the header for point size, user clipping flags, and -ve rhw
+ * workaround.
+ */
+ if (c->prog_data.writes_psiz ||
+ c->key.nr_userclip ||
+ c->chipset.is_965)
+ {
+ struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
+ GLuint i;
+
+ brw_MOV(p, header1, brw_imm_ud(0));
+
+ brw_set_access_mode(p, BRW_ALIGN_16);
+
+ if (c->prog_data.writes_psiz) {
+ struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][VERT_RESULT_PSIZ];
+ brw_MUL(p, brw_writemask(header1, BRW_WRITEMASK_W), brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
+ brw_AND(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(0x7ff<<8));
+ }
+
+ for (i = 0; i < c->key.nr_userclip; i++) {
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+ brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
+ brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<i));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+
+ /* i965 clipping workaround:
+ * 1) Test for -ve rhw
+ * 2) If set,
+ * set ndc = (0,0,0,0)
+ * set ucp[6] = 1
+ *
+ * Later, clipping will detect ucp[6] and ensure the primitive is
+ * clipped against all fixed planes.
+ */
+ if (c->chipset.is_965) {
+ brw_CMP(p,
+ vec8(brw_null_reg()),
+ BRW_CONDITIONAL_L,
+ brw_swizzle1(ndc, 3),
+ brw_imm_f(0));
+
+ brw_OR(p, brw_writemask(header1, BRW_WRITEMASK_W), header1, brw_imm_ud(1<<6));
+ brw_MOV(p, ndc, brw_imm_f(0));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+
+ brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
+ brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
+ brw_set_access_mode(p, BRW_ALIGN_16);
+
+ release_tmp(c, header1);
+ }
+ else {
+ brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+ }
+
+ /* Emit the (interleaved) headers for the two vertices - an 8-reg
+ * of zeros followed by two sets of NDC coordinates:
+ */
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, offset(m0, 2), ndc);
+
+ if (c->chipset.is_igdng) {
+ /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+ brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+ /* m4, m5 contain the distances from the vertex to the user clip
+ * planes (XXX: seems to be unused by us).
+ * m6 is used for padding, so that the remainder of the vertex
+ * elements is reg-aligned.
+ */
+ brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+ len_vertex_header = 6;
+ } else {
+ brw_MOV(p, offset(m0, 3), pos);
+ len_vertex_header = 2;
+ }
+
+ eot = (c->overflow_count == 0);
+
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ 0, /* starting mrf reg nr */
+ c->r0, /* src */
+ 0, /* allocate */
+ 1, /* used */
+ MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */
+ 0, /* response len */
+ eot, /* eot */
+ eot, /* writes complete */
+ 0, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+
+ /* Not all of the vertex outputs/results fit into the MRF.
+ * Move the overflowed attributes from the GRF to the MRF and
+ * issue another brw_urb_WRITE().
+ */
+ for (i = 0; i < c->overflow_count; i += BRW_MAX_MRF) {
+ unsigned nr = MIN2(c->overflow_count - i, BRW_MAX_MRF);
+ GLuint j;
+
+ eot = (i + nr >= c->overflow_count);
+
+ /* XXX I'm not 100% sure about which MRF regs to use here. Starting
+ * at mrf[4] atm...
+ */
+ for (j = 0; j < nr; j++) {
+ brw_MOV(p, brw_message_reg(4+j),
+ brw_vec8_grf(c->overflow_grf_start + i + j, 0));
+ }
+
+ brw_urb_WRITE(p,
+ brw_null_reg(), /* dest */
+ 4, /* starting mrf reg nr */
+ c->r0, /* src */
+ 0, /* allocate */
+ 1, /* used */
+ nr+1, /* msg len */
+ 0, /* response len */
+ eot, /* eot */
+ eot, /* writes complete */
+ i-1, /* urb destination offset */
+ BRW_URB_SWIZZLE_INTERLEAVE);
+ }
+}
+
+
+/**
+ * Called after code generation to resolve subroutine calls and the
+ * END instruction.
+ * \param end_inst points to brw code for END instruction
+ * \param last_inst points to last instruction emitted before vertex write
+ */
+static void
+post_vs_emit( struct brw_vs_compile *c,
+ struct brw_instruction *end_inst,
+ struct brw_instruction *last_inst )
+{
+ GLint offset;
+
+ brw_resolve_cals(&c->func);
+
+ /* patch up the END code to jump past subroutines, etc */
+ offset = last_inst - end_inst;
+ if (offset > 1) {
+ brw_set_src1(end_inst, brw_imm_d(offset * 16));
+ } else {
+ end_inst->header.opcode = BRW_OPCODE_NOP;
+ }
+}
+
+static uint32_t
+get_predicate(const struct tgsi_full_instruction *inst)
+{
+ /* XXX: disabling for now
+ */
+#if 0
+ if (inst->dst.CondMask == COND_TR)
+ return BRW_PREDICATE_NONE;
+
+ /* All of GLSL only produces predicates for COND_NE and one channel per
+ * vector. Fail badly if someone starts doing something else, as it might
+ * mean infinite looping or something.
+ *
+ * We'd like to support all the condition codes, but our hardware doesn't
+ * quite match the Mesa IR, which is modeled after the NV extensions. For
+ * those, the instruction may update the condition codes or not, then any
+ * later instruction may use one of those condition codes. For gen4, the
+ * instruction may update the flags register based on one of the condition
+ * codes output by the instruction, and then further instructions may
+ * predicate on that. We can probably support this, but it won't
+ * necessarily be easy.
+ */
+/* assert(inst->dst.CondMask == COND_NE); */
+
+ switch (inst->dst.CondSwizzle) {
+ case SWIZZLE_XXXX:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+ case SWIZZLE_YYYY:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+ case SWIZZLE_ZZZZ:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+ case SWIZZLE_WWWW:
+ return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+ default:
+ debug_printf("Unexpected predicate: 0x%08x\n",
+ inst->dst.CondMask);
+ return BRW_PREDICATE_NORMAL;
+ }
+#else
+ return BRW_PREDICATE_NORMAL;
+#endif
+}
+
+static void emit_insn(struct brw_vs_compile *c,
+ const struct tgsi_full_instruction *inst)
+{
+ unsigned opcode = inst->Instruction.Opcode;
+ unsigned label = inst->Label.Label;
+ struct brw_compile *p = &c->func;
+ struct brw_reg args[3], dst;
+ GLuint i;
+
+#if 0
+ printf("%d: ", insn);
+ _mesa_print_instruction(inst);
+#endif
+
+ /* Get argument regs.
+ */
+ for (i = 0; i < 3; i++) {
+ args[i] = get_arg(c, &inst->Src[i], i);
+ }
+
+ /* Get dest regs. Note that it is possible for a reg to be both
+ * dst and arg, given the static allocation of registers. So
+ * care needs to be taken emitting multi-operation instructions.
+ */
+ dst = get_dst(c,
+ inst->Dst[0].Register.File,
+ inst->Dst[0].Register.Index,
+ inst->Dst[0].Register.WriteMask);
+
+ /* XXX: saturate
+ */
+ if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
+ debug_printf("Unsupported saturate in vertex shader");
+ }
+
+ switch (opcode) {
+ case TGSI_OPCODE_ABS:
+ brw_MOV(p, dst, brw_abs(args[0]));
+ break;
+ case TGSI_OPCODE_ADD:
+ brw_ADD(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_COS:
+ emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_DP3:
+ brw_DP3(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_DP4:
+ brw_DP4(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_DPH:
+ brw_DPH(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_NRM:
+ emit_nrm(c, dst, args[0], 3);
+ break;
+ case TGSI_OPCODE_NRM4:
+ emit_nrm(c, dst, args[0], 4);
+ break;
+ case TGSI_OPCODE_DST:
+ unalias2(c, dst, args[0], args[1], emit_dst_noalias);
+ break;
+ case TGSI_OPCODE_EXP:
+ unalias1(c, dst, args[0], emit_exp_noalias);
+ break;
+ case TGSI_OPCODE_EX2:
+ emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_ARL:
+ emit_arl(c, dst, args[0]);
+ break;
+ case TGSI_OPCODE_FLR:
+ brw_RNDD(p, dst, args[0]);
+ break;
+ case TGSI_OPCODE_FRC:
+ brw_FRC(p, dst, args[0]);
+ break;
+ case TGSI_OPCODE_LOG:
+ unalias1(c, dst, args[0], emit_log_noalias);
+ break;
+ case TGSI_OPCODE_LG2:
+ emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_LIT:
+ unalias1(c, dst, args[0], emit_lit_noalias);
+ break;
+ case TGSI_OPCODE_LRP:
+ unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
+ break;
+ case TGSI_OPCODE_MAD:
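+ /* Seed the accumulator with arg2, then dst = arg0 * arg1 + acc. */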
+ brw_MOV(p, brw_acc_reg(), args[2]);
+ brw_MAC(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_MAX:
+ emit_max(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_MIN:
+ emit_min(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_MOV:
+ brw_MOV(p, dst, args[0]);
+ break;
+ case TGSI_OPCODE_MUL:
+ brw_MUL(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_POW:
+ emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_RCP:
+ emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_RSQ:
+ emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst,
+ brw_swizzle(args[0], 0,0,0,0), BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_SEQ:
+ emit_seq(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SIN:
+ emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
+ break;
+ case TGSI_OPCODE_SNE:
+ emit_sne(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SGE:
+ emit_sge(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SGT:
+ emit_sgt(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SLT:
+ emit_slt(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SLE:
+ emit_sle(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SUB:
+ brw_ADD(p, dst, args[0], negate(args[1]));
+ break;
+ case TGSI_OPCODE_TRUNC:
+ /* round toward zero */
+ brw_RNDZ(p, dst, args[0]);
+ break;
+ case TGSI_OPCODE_XPD:
+ emit_xpd(p, dst, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_IF:
+ assert(c->if_depth < MAX_IF_DEPTH);
+ c->if_inst[c->if_depth] = brw_IF(p, BRW_EXECUTE_8);
+ /* Note that brw_IF smashes the predicate_control field. */
+ c->if_inst[c->if_depth]->header.predicate_control = get_predicate(inst);
+ c->if_depth++;
+ break;
+ case TGSI_OPCODE_ELSE:
+ c->if_inst[c->if_depth-1] = brw_ELSE(p, c->if_inst[c->if_depth-1]);
+ break;
+ case TGSI_OPCODE_ENDIF:
+ assert(c->if_depth > 0);
+ brw_ENDIF(p, c->if_inst[--c->if_depth]);
+ break;
+ case TGSI_OPCODE_BGNLOOP:
+ c->loop_inst[c->loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+ break;
+ case TGSI_OPCODE_BRK:
+ brw_set_predicate_control(p, get_predicate(inst));
+ brw_BREAK(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case TGSI_OPCODE_CONT:
+ brw_set_predicate_control(p, get_predicate(inst));
+ brw_CONT(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case TGSI_OPCODE_ENDLOOP:
+ {
+ struct brw_instruction *inst0, *inst1;
+ GLuint br = 1;
+
+ c->loop_depth--;
+
+ if (c->chipset.is_igdng)
+ br = 2;
+
+ inst0 = inst1 = brw_WHILE(p, c->loop_inst[c->loop_depth]);
+ /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+ while (inst0 > c->loop_inst[c->loop_depth]) {
+ inst0--;
+ if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ }
+ }
+ break;
+ case TGSI_OPCODE_BRA:
+ brw_set_predicate_control(p, get_predicate(inst));
+ brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case TGSI_OPCODE_CAL:
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_ADD(p, deref_1d(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_ADD(p, get_addr_reg(c->stack_index),
+ get_addr_reg(c->stack_index), brw_imm_d(4));
+ brw_save_call(p, label, p->nr_insn);
+ brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+ break;
+ case TGSI_OPCODE_RET:
+ brw_ADD(p, get_addr_reg(c->stack_index),
+ get_addr_reg(c->stack_index), brw_imm_d(-4));
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, brw_ip_reg(), deref_1d(c->stack_index, 0));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ break;
+ case TGSI_OPCODE_END:
+ c->end_offset = p->nr_insn;
+ /* this instruction will get patched later to jump past subroutine
+ * code, etc.
+ */
+ brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+ break;
+ case TGSI_OPCODE_BGNSUB:
+ brw_save_label(p, p->nr_insn, p->nr_insn);
+ break;
+ case TGSI_OPCODE_ENDSUB:
+ /* no-op */
+ break;
+ default:
+ debug_printf("Unsupported opcode %i (%s) in vertex shader",
+ opcode,
+ tgsi_get_opcode_name(opcode));
+ }
+
+ /* Set the predication update on the last instruction of the native
+ * instruction sequence.
+ *
+ * This would be problematic if it was set on a math instruction,
+ * but that shouldn't be the case with the current GLSL compiler.
+ */
+#if 0
+ /* XXX: disabled
+ */
+ if (inst->CondUpdate) {
+ struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+ assert(hw_insn->header.destreg__conditionalmod == 0);
+ hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+ }
+#endif
+
+ release_tmps(c);
+}
+
+
+/* Emit the vertex program instructions here.
+ */
+void brw_vs_emit(struct brw_vs_compile *c)
+{
+ struct brw_compile *p = &c->func;
+ const struct tgsi_token *tokens = c->vp->tokens;
+ struct brw_instruction *end_inst, *last_inst;
+ struct tgsi_parse_context parse;
+ struct tgsi_full_instruction *inst;
+
+ if (BRW_DEBUG & DEBUG_VS)
+ tgsi_dump(c->vp->tokens, 0);
+
+ c->stack_index = brw_indirect(0, 0);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_access_mode(p, BRW_ALIGN_16);
+
+
+ /* Static register allocation
+ */
+ brw_vs_alloc_regs(c);
+
+ if (c->vp->has_flow_control) {
+ brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack));
+ }
+
+ /* Instructions
+ */
+ tgsi_parse_init( &parse, tokens );
+ while( !tgsi_parse_end_of_tokens( &parse ) ) {
+ tgsi_parse_token( &parse );
+
+ switch( parse.FullToken.Token.Type ) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ case TGSI_TOKEN_TYPE_IMMEDIATE:
+ break;
+
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ inst = &parse.FullToken.FullInstruction;
+ emit_insn( c, inst );
+ break;
+
+ default:
+ assert( 0 );
+ }
+ }
+ tgsi_parse_free( &parse );
+
+ end_inst = &p->store[c->end_offset];
+ last_inst = &p->store[p->nr_insn];
+
+ /* The END instruction will be patched to jump to this code */
+ emit_vertex_write(c);
+
+ post_vs_emit(c, end_inst, last_inst);
+
+ if (BRW_DEBUG & DEBUG_VS) {
+ debug_printf("vs-native:\n");
+ brw_disasm(stderr, p->store, p->nr_insn);
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_vs_state.c b/src/gallium/drivers/i965/brw_vs_state.c
new file mode 100644
index 00000000000..dadbb622e4d
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_state.c
@@ -0,0 +1,201 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+
+
+#include "brw_debug.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+struct brw_vs_unit_key {
+ unsigned int total_grf;
+ unsigned int urb_entry_read_length;
+ unsigned int curb_entry_read_length;
+
+ unsigned int curbe_offset;
+
+ unsigned int nr_urb_entries, urb_size;
+
+ unsigned int nr_surfaces;
+};
+
+static void
+vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
+{
+ memset(key, 0, sizeof(*key));
+
+ /* CACHE_NEW_VS_PROG */
+ key->total_grf = brw->vs.prog_data->total_grf;
+ key->urb_entry_read_length = brw->vs.prog_data->urb_read_length;
+ key->curb_entry_read_length = brw->vs.prog_data->curb_read_length;
+
+ /* BRW_NEW_URB_FENCE */
+ key->nr_urb_entries = brw->urb.nr_vs_entries;
+ key->urb_size = brw->urb.vsize;
+
+ /* BRW_NEW_NR_VS_SURFACES */
+ key->nr_surfaces = brw->vs.nr_surfaces;
+
+ /* PIPE_NEW_CLIP */
+ if (brw->curr.ucp.nr) {
+ /* Note that we read in the userclip planes as well, hence
+ * clip_start:
+ */
+ key->curbe_offset = brw->curbe.clip_start;
+ }
+ else {
+ key->curbe_offset = brw->curbe.vs_start;
+ }
+}
+
+static enum pipe_error
+vs_unit_create_from_key(struct brw_context *brw,
+ struct brw_vs_unit_key *key,
+ struct brw_winsys_reloc *reloc,
+ struct brw_winsys_buffer **bo_out)
+{
+ enum pipe_error ret;
+ struct brw_vs_unit_state vs;
+ int chipset_max_threads;
+
+ memset(&vs, 0, sizeof(vs));
+
+ vs.thread0.kernel_start_pointer = 0; /* reloc */
+ vs.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+ vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+ /* Choosing multiple program flow means that we may get 2-vertex threads,
+ * which will have the channel mask for dwords 4-7 enabled in the thread,
+ * and those dwords will be written to the second URB handle when we
+ * brw_urb_WRITE() the results.
+ */
+ vs.thread1.single_program_flow = 0;
+
+ if (BRW_IS_IGDNG(brw))
+ vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
+ else
+ vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
+ vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
+ vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+ vs.thread3.dispatch_grf_start_reg = 1;
+ vs.thread3.urb_entry_read_offset = 0;
+ vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+
+ if (BRW_IS_IGDNG(brw))
+ vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+ else
+ vs.thread4.nr_urb_entries = key->nr_urb_entries;
+
+ vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
+
+ if (BRW_IS_IGDNG(brw))
+ chipset_max_threads = 72;
+ else if (BRW_IS_G4X(brw))
+ chipset_max_threads = 32;
+ else
+ chipset_max_threads = 16;
+
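+ /* Each VS thread processes up to two vertices, so the thread count is
+ * bounded by half the URB entries as well as the per-chipset maximum.
+ */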
+ vs.thread4.max_threads = CLAMP(key->nr_urb_entries / 2,
+ 1, chipset_max_threads) - 1;
+
+ if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+ vs.thread4.max_threads = 0;
+
+ /* No samplers for vertex programs, and the count must be 0 for IGDNG:
+ */
+ vs.vs5.sampler_count = 0;
+
+ if (BRW_DEBUG & DEBUG_STATS)
+ vs.thread4.stats_enable = 1;
+
+ /* Vertex program always enabled:
+ */
+ vs.vs6.vs_enable = 1;
+
+ ret = brw_upload_cache(&brw->cache, BRW_VS_UNIT,
+ key, sizeof(*key),
+ reloc, 1,
+ &vs, sizeof(vs),
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+static int prepare_vs_unit(struct brw_context *brw)
+{
+ struct brw_vs_unit_key key;
+ enum pipe_error ret;
+ struct brw_winsys_reloc reloc[1];
+ unsigned grf_reg_count;
+
+ vs_unit_populate_key(brw, &key);
+
+ grf_reg_count = (align(key.total_grf, 16) / 16 - 1);
+
+ /* Emit VS program relocation */
+ make_reloc(&reloc[0],
+ BRW_USAGE_STATE,
+ grf_reg_count << 1,
+ offsetof(struct brw_vs_unit_state, thread0),
+ brw->vs.prog_bo);
+
+
+ if (brw_search_cache(&brw->cache, BRW_VS_UNIT,
+ &key, sizeof(key),
+ reloc, 1,
+ NULL,
+ &brw->vs.state_bo))
+ return PIPE_OK;
+
+ ret = vs_unit_create_from_key(brw, &key, reloc, &brw->vs.state_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_vs_unit = {
+ .dirty = {
+ .mesa = (PIPE_NEW_CLIP),
+ .brw = (BRW_NEW_CURBE_OFFSETS |
+ BRW_NEW_NR_VS_SURFACES |
+ BRW_NEW_URB_FENCE),
+ .cache = CACHE_NEW_VS_PROG
+ },
+ .prepare = prepare_vs_unit,
+};
diff --git a/src/gallium/drivers/i965/brw_vs_surface_state.c b/src/gallium/drivers/i965/brw_vs_surface_state.c
new file mode 100644
index 00000000000..177a5170d23
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_vs_surface_state.c
@@ -0,0 +1,232 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_winsys.h"
+
+/* XXX: disabled true constant buffer functionality
+ */
+
+
+/* Creates a new VS constant buffer reflecting the current VS program's
+ * constants, if needed by the VS program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+#if 0
+static struct brw_winsys_buffer *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+ /* XXX: true constant buffers
+ */
+ struct brw_vertex_program *vp =
+ (struct brw_vertex_program *) brw->vertex_program;
+ const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+ const int size = params->NumParameters * 4 * sizeof(GLfloat);
+ drm_intel_bo *const_buffer;
+
+ /* BRW_NEW_VERTEX_PROGRAM */
+ if (!vp->use_const_buffer)
+ return NULL;
+
+ const_buffer = brw->sws->bo_alloc(brw->sws,
+ BRW_BUFFER_TYPE_SHADER_CONSTANTS,
+ size, 64);
+
+ /* _NEW_PROGRAM_CONSTANTS */
+ brw->sws->bo_subdata(const_buffer, 0, size, params->ParameterValues,
+ NULL, 0);
+
+ return const_buffer;
+}
+#endif
+
+/**
+ * Update the surface state for a VS constant buffer.
+ *
+ * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
+ */
+#if 0
+static void
+brw_update_vs_constant_surface( struct brw_context *brw,
+ GLuint surf)
+{
+ struct brw_surface_key key;
+ struct pipe_buffer *cb = brw->curr.vs_constants;
+ enum pipe_error ret;
+
+ assert(surf == 0);
+
+ /* If we're in this state update atom, we need to update VS constants, so
+ * free the old buffer and create a new one for the new contents.
+ */
+ ret = brw_vs_update_constant_buffer(brw, &vp->const_buffer);
+ if (ret)
+ return ret;
+
+ /* If there's no constant buffer, then no surface BO is needed to point at
+ * it.
+ */
+ if (vp->const_buffer == NULL) {
+ bo_reference(brw->vs.surf_bo[surf], NULL);
+ return PIPE_OK;
+ }
+
+ memset(&key, 0, sizeof(key));
+
+ key.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+ key.bo = vp->const_buffer;
+ key.depthmode = GL_NONE;
+ key.pitch = params->NumParameters;
+ key.width = params->NumParameters;
+ key.height = 1;
+ key.depth = 1;
+ key.cpp = 16;
+
+ /*
+ printf("%s:\n", __FUNCTION__);
+ printf(" width %d height %d depth %d cpp %d pitch %d\n",
+ key.width, key.height, key.depth, key.cpp, key.pitch);
+ */
+
+ if (brw_search_cache(&brw->surface_cache,
+ BRW_SS_SURFACE,
+ &key, sizeof(key),
+ &key.bo, key.bo ? 1 : 0,
+ NULL,
+ &brw->vs.surf_bo[surf]))
+ return PIPE_OK;
+
+ ret = brw_create_constant_surface(brw, &key,
+ &brw->vs.surf_bo[surf]);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+#endif
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ */
+static enum pipe_error
+brw_vs_get_binding_table(struct brw_context *brw,
+ struct brw_winsys_buffer **bo_out)
+{
+#if 0
+ static GLuint data[BRW_VS_MAX_SURF]; /* always zero */
+ struct brw_winsys_reloc reloc[BRW_VS_MAX_SURF];
+ enum pipe_error ret;
+ int i;
+
+ /* Emit binding table relocations to surface state */
+ for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+ make_reloc(&reloc[i],
+ BRW_USAGE_STATE,
+ 0,
+ i * 4,
+ brw->vs.surf_bo[i]);
+ }
+
+ ret = brw_cache_data( &brw->surface_cache,
+ BRW_SS_SURF_BIND,
+ NULL, 0,
+ reloc, BRW_VS_MAX_SURF,
+ data, sizeof data,
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+#else
+ return PIPE_OK;
+#endif
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * This consumes the state updates for the constant buffer needing
+ * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
+ * CACHE_NEW_SURF_BIND for the binding table upload.
+ */
+static enum pipe_error prepare_vs_surfaces(struct brw_context *brw )
+{
+ enum pipe_error ret;
+
+#if 0
+ int i;
+ int nr_surfaces = 0;
+
+ brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+ for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+ if (brw->vs.surf_bo[i] != NULL) {
+ nr_surfaces = i + 1;
+ }
+ }
+
+ if (brw->vs.nr_surfaces != nr_surfaces) {
+ brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+ brw->vs.nr_surfaces = nr_surfaces;
+ }
+#endif
+
+ /* Note that we don't end up updating the bind_bo if we don't have a
+ * surface to be pointing at. This should be relatively harmless, as it
+ * just slightly increases our working set size.
+ */
+ if (brw->vs.nr_surfaces != 0) {
+ ret = brw_vs_get_binding_table(brw, &brw->vs.bind_bo);
+ if (ret)
+ return ret;
+ }
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+ .dirty = {
+ .mesa = (PIPE_NEW_VERTEX_CONSTANTS |
+ PIPE_NEW_VERTEX_SHADER),
+ .brw = 0,
+ .cache = 0
+ },
+ .prepare = prepare_vs_surfaces,
+};
+
+
+
diff --git a/src/gallium/drivers/i965/brw_winsys.h b/src/gallium/drivers/i965/brw_winsys.h
new file mode 100644
index 00000000000..a242e31218a
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys.h
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright © 2009 Jakob Bornecrantz
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef BRW_WINSYS_H
+#define BRW_WINSYS_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_refcnt.h"
+
+struct brw_winsys;
+struct pipe_fence_handle;
+
+/* Not sure why the winsys needs this:
+ */
+#define BRW_BATCH_SIZE (32*1024)
+
+struct brw_winsys_screen;
+
+/* Need a tiny bit of information inside the abstract buffer struct:
+ */
+struct brw_winsys_buffer {
+ struct pipe_reference reference;
+ struct brw_winsys_screen *sws;
+ unsigned size;
+};
+
+
+/* It should be possible to validate the buffer usages (defined below)
+ * against these buffer creation types:
+ */
+enum brw_buffer_type
+{
+ BRW_BUFFER_TYPE_TEXTURE,
+ BRW_BUFFER_TYPE_SCANOUT, /**< a texture used for scanning out from */
+ BRW_BUFFER_TYPE_VERTEX,
+ BRW_BUFFER_TYPE_CURBE,
+ BRW_BUFFER_TYPE_QUERY,
+ BRW_BUFFER_TYPE_SHADER_CONSTANTS,
+ BRW_BUFFER_TYPE_SHADER_SCRATCH,
+ BRW_BUFFER_TYPE_BATCH,
+ BRW_BUFFER_TYPE_GENERAL_STATE,
+ BRW_BUFFER_TYPE_SURFACE_STATE,
+ BRW_BUFFER_TYPE_PIXEL, /* image uploads, pbo's, etc */
+ BRW_BUFFER_TYPE_GENERIC, /* unknown */
+ BRW_BUFFER_TYPE_MAX /* Count of possible values */
+};
+
+
+/* Describe the usage of a particular buffer in a relocation. The DRM
+ * winsys will translate these back to GEM read/write domain flags.
+ */
+enum brw_buffer_usage {
+ BRW_USAGE_STATE, /* INSTRUCTION, 0 */
+ BRW_USAGE_QUERY_RESULT, /* INSTRUCTION, INSTRUCTION */
+ BRW_USAGE_RENDER_TARGET, /* RENDER, 0 */
+ BRW_USAGE_DEPTH_BUFFER, /* RENDER, RENDER */
+ BRW_USAGE_BLIT_SOURCE, /* RENDER, 0 */
+ BRW_USAGE_BLIT_DEST, /* RENDER, RENDER */
+ BRW_USAGE_SAMPLER, /* SAMPLER, 0 */
+ BRW_USAGE_VERTEX, /* VERTEX, 0 */
+ BRW_USAGE_SCRATCH, /* 0, 0 */
+ BRW_USAGE_MAX
+};
+
+enum brw_buffer_data_type {
+ BRW_DATA_GS_CC_VP,
+ BRW_DATA_GS_CC_UNIT,
+ BRW_DATA_GS_WM_PROG,
+ BRW_DATA_GS_SAMPLER_DEFAULT_COLOR,
+ BRW_DATA_GS_SAMPLER,
+ BRW_DATA_GS_WM_UNIT,
+ BRW_DATA_GS_SF_PROG,
+ BRW_DATA_GS_SF_VP,
+ BRW_DATA_GS_SF_UNIT,
+ BRW_DATA_GS_VS_UNIT,
+ BRW_DATA_GS_VS_PROG,
+ BRW_DATA_GS_GS_UNIT,
+ BRW_DATA_GS_GS_PROG,
+ BRW_DATA_GS_CLIP_VP,
+ BRW_DATA_GS_CLIP_UNIT,
+ BRW_DATA_GS_CLIP_PROG,
+ BRW_DATA_SS_SURFACE,
+ BRW_DATA_SS_SURF_BIND,
+ BRW_DATA_CONSTANT_BUFFER,
+ BRW_DATA_BATCH_BUFFER,
+ BRW_DATA_OTHER,
+ BRW_DATA_MAX
+};
+
+
+/* Matches the i915_drm definitions:
+ */
+#define BRW_TILING_NONE 0
+#define BRW_TILING_X 1
+#define BRW_TILING_Y 2
+
+
+/* Relocations to be applied with subdata in a call to sws->bo_subdata, below.
+ *
+ * Effectively this encodes:
+ *
+ * *(unsigned *)(subdata + offset) = bo->offset + delta
+ */
+struct brw_winsys_reloc {
+ enum brw_buffer_usage usage; /* debug only */
+ unsigned delta;
+ unsigned offset;
+ struct brw_winsys_buffer *bo;
+};
+
+static INLINE void make_reloc(struct brw_winsys_reloc *reloc,
+ enum brw_buffer_usage usage,
+ unsigned delta,
+ unsigned offset,
+ struct brw_winsys_buffer *bo)
+{
+ reloc->usage = usage;
+ reloc->delta = delta;
+ reloc->offset = offset;
+ reloc->bo = bo; /* Note - not taking a reference yet */
+}
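+
+/* For example, a unit-state upload points its thread0 kernel pointer at
+ * a program BO with something like:
+ *
+ * make_reloc(&reloc[0], BRW_USAGE_STATE, grf_reg_count << 1,
+ * offsetof(struct brw_vs_unit_state, thread0), brw->vs.prog_bo);
+ *
+ * and then hands the reloc array to bo_subdata() (usually via
+ * brw_upload_cache()).
+ */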
+
+
+
+struct brw_winsys_screen {
+
+
+ /**
+ * Buffer functions.
+ */
+
+ /*@{*/
+ /**
+ * Create a buffer.
+ */
+ enum pipe_error (*bo_alloc)(struct brw_winsys_screen *sws,
+ enum brw_buffer_type type,
+ unsigned size,
+ unsigned alignment,
+ struct brw_winsys_buffer **bo_out);
+
+ /* Destroy a buffer when our refcount goes to zero:
+ */
+ void (*bo_destroy)(struct brw_winsys_buffer *buffer);
+
+ /* delta -- added to b2->offset, and written into the buffer
+ * offset -- location within the buffer at which the above value is written
+ */
+ enum pipe_error (*bo_emit_reloc)(struct brw_winsys_buffer *buffer,
+ enum brw_buffer_usage usage,
+ unsigned delta,
+ unsigned offset,
+ struct brw_winsys_buffer *b2);
+
+ enum pipe_error (*bo_exec)(struct brw_winsys_buffer *buffer,
+ unsigned bytes_used);
+
+ enum pipe_error (*bo_subdata)(struct brw_winsys_buffer *buffer,
+ enum brw_buffer_data_type data_type,
+ size_t offset,
+ size_t size,
+ const void *data,
+ const struct brw_winsys_reloc *reloc,
+ unsigned nr_reloc );
+
+ boolean (*bo_is_busy)(struct brw_winsys_buffer *buffer);
+ boolean (*bo_references)(struct brw_winsys_buffer *a,
+ struct brw_winsys_buffer *b);
+
+ /* XXX: couldn't this be handled by returning true/false on
+ * bo_emit_reloc?
+ */
+ enum pipe_error (*check_aperture_space)(struct brw_winsys_screen *iws,
+ struct brw_winsys_buffer **buffers,
+ unsigned count);
+
+ /**
+ * Map a buffer.
+ */
+ void *(*bo_map)(struct brw_winsys_buffer *buffer,
+ enum brw_buffer_data_type data_type,
+ unsigned offset,
+ unsigned length,
+ boolean write,
+ boolean discard,
+ boolean flush_explicit);
+
+ void (*bo_flush_range)(struct brw_winsys_buffer *buffer,
+ unsigned offset,
+ unsigned length);
+
+ /**
+ * Unmap a buffer.
+ */
+ void (*bo_unmap)(struct brw_winsys_buffer *buffer);
+ /*@}*/
+
+
+ /* Wait for buffer to go idle. Similar to map+unmap, but doesn't
+ * mark buffer contents as dirty.
+ */
+ void (*bo_wait_idle)(struct brw_winsys_buffer *buffer);
+
+ /**
+ * Destroy the winsys.
+ */
+ void (*destroy)(struct brw_winsys_screen *iws);
+};
+
+static INLINE void *
+bo_map_read(struct brw_winsys_screen *sws, struct brw_winsys_buffer *buf)
+{
+ return sws->bo_map( buf,
+ BRW_DATA_OTHER,
+ 0, buf->size,
+ FALSE, FALSE, FALSE );
+}
+
+static INLINE void
+bo_reference(struct brw_winsys_buffer **ptr, struct brw_winsys_buffer *buf)
+{
+ struct brw_winsys_buffer *old_buf = *ptr;
+
+ if (pipe_reference(&(*ptr)->reference, &buf->reference))
+ old_buf->sws->bo_destroy(old_buf);
+
+ *ptr = buf;
+}
+
+
+/**
+ * Create brw pipe_screen.
+ */
+struct pipe_screen *brw_create_screen(struct brw_winsys_screen *iws, unsigned pci_id);
+
+/**
+ * Create a brw pipe_context.
+ */
+struct pipe_context *brw_create_context(struct pipe_screen *screen);
+
+/**
+ * Get the brw_winsys buffer backing the texture.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture;
+boolean brw_texture_get_winsys_buffer(struct pipe_texture *texture,
+ struct brw_winsys_buffer **buffer,
+ unsigned *stride);
+
+/**
+ * Wrap a brw_winsys buffer with a texture blanket.
+ *
+ * TODO UGLY
+ */
+struct pipe_texture *
+brw_texture_blanket_winsys_buffer(struct pipe_screen *screen,
+ const struct pipe_texture *template,
+ unsigned pitch,
+ unsigned tiling,
+ struct brw_winsys_buffer *buffer);
+
+
+/*************************************************************************
+ * Cooperative dumping between winsys and driver. TODO: make this
+ * driver-only by wrapping calls to winsys->bo_subdata().
+ */
+
+#ifdef DEBUG
+extern int BRW_DUMP;
+#else
+#define BRW_DUMP 0
+#endif
+
+#define DUMP_ASM 0x1
+#define DUMP_STATE 0x2
+#define DUMP_BATCH 0x4
+
+void brw_dump_data( unsigned pci_id,
+ enum brw_buffer_data_type data_type,
+ unsigned offset,
+ const void *data,
+ size_t size );
+
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_winsys_debug.c b/src/gallium/drivers/i965/brw_winsys_debug.c
new file mode 100644
index 00000000000..f8f6a539bc9
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_winsys_debug.c
@@ -0,0 +1,87 @@
+#include "brw_winsys.h"
+#include "brw_disasm.h"
+#include "brw_structs_dump.h"
+#include "brw_structs.h"
+#include "intel_decode.h"
+
+
+void brw_dump_data( unsigned pci_id,
+ enum brw_buffer_data_type data_type,
+ unsigned offset,
+ const void *data,
+ size_t size )
+{
+ if (BRW_DUMP & DUMP_ASM) {
+ switch (data_type) {
+ case BRW_DATA_GS_WM_PROG:
+ case BRW_DATA_GS_SF_PROG:
+ case BRW_DATA_GS_VS_PROG:
+ case BRW_DATA_GS_GS_PROG:
+ case BRW_DATA_GS_CLIP_PROG:
+ brw_disasm( stderr, data, size / sizeof(struct brw_instruction) );
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (BRW_DUMP & DUMP_STATE) {
+ switch (data_type) {
+ case BRW_DATA_GS_CC_VP:
+ brw_dump_cc_viewport( data );
+ break;
+ case BRW_DATA_GS_CC_UNIT:
+ brw_dump_cc_unit_state( data );
+ break;
+ case BRW_DATA_GS_SAMPLER_DEFAULT_COLOR:
+ brw_dump_sampler_default_color( data );
+ break;
+ case BRW_DATA_GS_SAMPLER:
+ brw_dump_sampler_state( data );
+ break;
+ case BRW_DATA_GS_WM_UNIT:
+ brw_dump_wm_unit_state( data );
+ break;
+ case BRW_DATA_GS_SF_VP:
+ brw_dump_sf_viewport( data );
+ break;
+ case BRW_DATA_GS_SF_UNIT:
+ brw_dump_sf_unit_state( data );
+ break;
+ case BRW_DATA_GS_VS_UNIT:
+ brw_dump_vs_unit_state( data );
+ break;
+ case BRW_DATA_GS_GS_UNIT:
+ brw_dump_gs_unit_state( data );
+ break;
+ case BRW_DATA_GS_CLIP_VP:
+ brw_dump_clipper_viewport( data );
+ break;
+ case BRW_DATA_GS_CLIP_UNIT:
+ brw_dump_clip_unit_state( data );
+ break;
+ case BRW_DATA_SS_SURFACE:
+ brw_dump_surface_state( data );
+ break;
+ case BRW_DATA_SS_SURF_BIND:
+ break;
+ case BRW_DATA_OTHER:
+ break;
+ case BRW_DATA_CONSTANT_BUFFER:
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (BRW_DUMP & DUMP_BATCH) {
+ switch (data_type) {
+ case BRW_DATA_BATCH_BUFFER:
+ intel_decode(data, size / 4, offset, pci_id);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm.c b/src/gallium/drivers/i965/brw_wm.c
new file mode 100644
index 00000000000..fdf820a9aae
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm.c
@@ -0,0 +1,319 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_screen.h"
+#include "brw_util.h"
+#include "brw_wm.h"
+#include "brw_state.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
+
+
+/** Return number of src args for given instruction */
+GLuint brw_wm_nr_args( GLuint opcode )
+{
+ switch (opcode) {
+ case WM_FRONTFACING:
+ case WM_PIXELXY:
+ return 0;
+ case WM_CINTERP:
+ case WM_WPOSXY:
+ case WM_DELTAXY:
+ return 1;
+ case WM_LINTERP:
+ case WM_PIXELW:
+ return 2;
+ case WM_FB_WRITE:
+ case WM_PINTERP:
+ return 3;
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXP:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXD:
+ /* sampler arg is held as a field in the instruction, not in an
+ * actual register:
+ */
+ return tgsi_get_opcode_info(opcode)->num_src - 1;
+
+ default:
+ assert(opcode < MAX_OPCODE);
+ return tgsi_get_opcode_info(opcode)->num_src;
+ }
+}
+
+
+GLuint brw_wm_is_scalar_result( GLuint opcode )
+{
+ switch (opcode) {
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_EX2:
+ case TGSI_OPCODE_LG2:
+ case TGSI_OPCODE_POW:
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_RSQ:
+ case TGSI_OPCODE_SIN:
+ case TGSI_OPCODE_DP3:
+ case TGSI_OPCODE_DP4:
+ case TGSI_OPCODE_DPH:
+ case TGSI_OPCODE_DST:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+
+/**
+ * Do GPU code generation for shaders without flow control. Shaders
+ * without flow control instructions can more readily be analysed for
+ * SSA-style optimizations.
+ */
+static void
+brw_wm_linear_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+ /* Augment fragment program. Add instructions for pre- and
+ * post-fragment-program tasks such as interpolation and fogging.
+ */
+ brw_wm_pass_fp(c);
+
+ /* Translate to intermediate representation. Build register usage
+ * chains.
+ */
+ brw_wm_pass0(c);
+
+ /* Dead code removal.
+ */
+ brw_wm_pass1(c);
+
+ /* Register allocation.
+ * Divide by two because we operate on 16 pixels at a time and require
+ * two GRF entries for each logical shader register.
+ */
+ c->grf_limit = BRW_WM_MAX_GRF / 2;
+
+ brw_wm_pass2(c);
+
+ /* how many general-purpose registers are used */
+ c->prog_data.total_grf = c->max_wm_grf;
+
+ /* Scratch space is used for register spilling */
+ if (c->last_scratch) {
+ c->prog_data.total_scratch = c->last_scratch + 0x40;
+ }
+ else {
+ c->prog_data.total_scratch = 0;
+ }
+
+ /* Emit GEN4 code.
+ */
+ brw_wm_emit(c);
+}
+
+
+/**
+ * All Mesa program -> GPU code generation goes through this function.
+ * Depending on the instructions used (i.e. flow control instructions)
+ * we'll use one of two code generators.
+ */
+static enum pipe_error do_wm_prog( struct brw_context *brw,
+ struct brw_fragment_shader *fp,
+ struct brw_wm_prog_key *key,
+ struct brw_winsys_buffer **bo_out)
+{
+ enum pipe_error ret;
+ struct brw_wm_compile *c;
+ const GLuint *program;
+ GLuint program_size;
+
+ if (brw->wm.compile_data == NULL) {
+ brw->wm.compile_data = MALLOC(sizeof(*brw->wm.compile_data));
+ if (!brw->wm.compile_data)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+
+ c = brw->wm.compile_data;
+ memset(c, 0, sizeof *c);
+
+ c->key = *key;
+ c->fp = fp;
+ c->env_param = NULL; /*brw->intel.ctx.FragmentProgram.Parameters;*/
+
+ brw_init_compile(brw, &c->func);
+
+ /*
+ * Shaders which use GLSL features such as flow control are handled
+ * differently from "simple" shaders.
+ */
+ if (fp->has_flow_control) {
+ c->dispatch_width = 8;
+ /* XXX: GLSL support
+ */
+ exit(1);
+ /* brw_wm_branching_shader_emit(brw, c); */
+ }
+ else {
+ c->dispatch_width = 16;
+ brw_wm_linear_shader_emit(brw, c);
+ }
+
+ if (BRW_DEBUG & DEBUG_WM)
+ debug_printf("\n");
+
+ /* get the program
+ */
+ ret = brw_get_program(&c->func, &program, &program_size);
+ if (ret)
+ return ret;
+
+ ret = brw_upload_cache( &brw->cache, BRW_WM_PROG,
+ &c->key, sizeof(c->key),
+ NULL, 0,
+ program, program_size,
+ &c->prog_data,
+ &brw->wm.prog_data,
+ bo_out );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+
+static void brw_wm_populate_key( struct brw_context *brw,
+ struct brw_wm_prog_key *key )
+{
+ unsigned lookup, line_aa;
+ unsigned i;
+
+ memset(key, 0, sizeof(*key));
+
+ /* PIPE_NEW_FRAGMENT_SHADER
+ * PIPE_NEW_DEPTH_STENCIL_ALPHA
+ */
+ lookup = (brw->curr.zstencil->iz_lookup |
+ brw->curr.fragment_shader->iz_lookup);
+
+
+ /* PIPE_NEW_RAST
+ * BRW_NEW_REDUCED_PRIMITIVE
+ */
+ switch (brw->reduced_primitive) {
+ case PIPE_PRIM_POINTS:
+ line_aa = AA_NEVER;
+ break;
+ case PIPE_PRIM_LINES:
+ line_aa = (brw->curr.rast->templ.line_smooth ?
+ AA_ALWAYS : AA_NEVER);
+ break;
+ default:
+ line_aa = brw->curr.rast->unfilled_aa_line;
+ break;
+ }
+
+ brw_wm_lookup_iz(line_aa,
+ lookup,
+ brw->curr.fragment_shader->uses_depth,
+ key);
+
+ /* PIPE_NEW_RAST */
+ key->flat_shade = brw->curr.rast->templ.flatshade;
+
+
+ /* PIPE_NEW_BOUND_TEXTURES */
+ for (i = 0; i < brw->curr.num_textures; i++) {
+ const struct brw_texture *tex = brw_texture(brw->curr.texture[i]);
+
+ if (tex->base.format == PIPE_FORMAT_YCBCR)
+ key->yuvtex_mask |= 1 << i;
+
+ if (tex->base.format == PIPE_FORMAT_YCBCR_REV)
+ key->yuvtex_swap_mask |= 1 << i;
+
+ /* XXX: shadow texture
+ */
+ /* key->shadowtex_mask |= 1<<i; */
+ }
+
+ /* CACHE_NEW_VS_PROG */
+ key->vp_nr_outputs = brw->vs.prog_data->nr_outputs;
+
+ key->nr_cbufs = brw->curr.fb.nr_cbufs;
+
+ key->nr_inputs = brw->curr.fragment_shader->info.num_inputs;
+
+ /* The unique fragment program ID */
+ key->program_string_id = brw->curr.fragment_shader->id;
+}
+
+
+static enum pipe_error brw_prepare_wm_prog(struct brw_context *brw)
+{
+ struct brw_wm_prog_key key;
+ struct brw_fragment_shader *fs = brw->curr.fragment_shader;
+ enum pipe_error ret;
+
+ brw_wm_populate_key(brw, &key);
+
+ /* Make an early check for the key.
+ */
+ if (brw_search_cache(&brw->cache, BRW_WM_PROG,
+ &key, sizeof(key),
+ NULL, 0,
+ &brw->wm.prog_data,
+ &brw->wm.prog_bo))
+ return PIPE_OK;
+
+ ret = do_wm_prog(brw, fs, &key, &brw->wm.prog_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+const struct brw_tracked_state brw_wm_prog = {
+ .dirty = {
+ .mesa = (PIPE_NEW_FRAGMENT_SHADER |
+ PIPE_NEW_DEPTH_STENCIL_ALPHA |
+ PIPE_NEW_RAST |
+ PIPE_NEW_NR_CBUFS |
+ PIPE_NEW_BOUND_TEXTURES),
+ .brw = (BRW_NEW_WM_INPUT_DIMENSIONS |
+ BRW_NEW_REDUCED_PRIMITIVE),
+ .cache = CACHE_NEW_VS_PROG,
+ },
+ .prepare = brw_prepare_wm_prog
+};
+
diff --git a/src/gallium/drivers/i965/brw_wm.h b/src/gallium/drivers/i965/brw_wm.h
new file mode 100644
index 00000000000..f1ca9f63696
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm.h
@@ -0,0 +1,344 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRW_WM_H
+#define BRW_WM_H
+
+#include "brw_context.h"
+#include "brw_eu.h"
+
+#define SATURATE (1<<5)
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution. These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+#define IZ_PS_KILL_ALPHATEST_BIT 0x1
+#define IZ_PS_COMPUTES_DEPTH_BIT 0x2
+#define IZ_DEPTH_WRITE_ENABLE_BIT 0x4
+#define IZ_DEPTH_TEST_ENABLE_BIT 0x8
+#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10
+#define IZ_STENCIL_TEST_ENABLE_BIT 0x20
+#define IZ_BIT_MAX 0x40
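+
+/* For example, a (hypothetical) state with depth test and depth write
+ * enabled, whose fragment shader also computes its own depth, would be
+ * looked up with
+ *
+ *    lookup = IZ_DEPTH_TEST_ENABLE_BIT |
+ *             IZ_DEPTH_WRITE_ENABLE_BIT |
+ *             IZ_PS_COMPUTES_DEPTH_BIT;         (== 0x0e)
+ *
+ * brw_wm_lookup_iz() then fills in the source/dest depth register fields
+ * of brw_wm_prog_key for that combination.
+ */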
+
+#define AA_NEVER 0
+#define AA_SOMETIMES 1
+#define AA_ALWAYS 2
+
+struct brw_wm_prog_key {
+ GLuint source_depth_reg:3;
+ GLuint aa_dest_stencil_reg:3;
+ GLuint dest_depth_reg:3;
+ GLuint nr_depth_regs:3;
+ GLuint computes_depth:1;
+ GLuint source_depth_to_render_target:1;
+ GLuint flat_shade:1;
+ GLuint runtime_check_aads_emit:1;
+
+ GLuint shadowtex_mask:16;
+ GLuint yuvtex_mask:16;
+   GLuint yuvtex_swap_mask:16;	/* UV swapped */
+
+ GLuint vp_nr_outputs:6;
+ GLuint nr_inputs:6;
+ GLuint nr_cbufs:3;
+ GLuint has_flow_control:1;
+
+ GLuint program_string_id;
+};
+
+
+/* A bit of a glossary:
+ *
+ * brw_wm_value: A computed value or program input. Values are
+ * constant, they are created once and are never modified. When a
+ * fragment program register is written or overwritten, new values are
+ * created fresh, preserving the rule that values are constant.
+ *
+ * brw_wm_ref: A reference to a value.  Wherever a value is used by an
+ * instruction or as a program output, that is tracked with an
+ * instance of this struct. All references to a value occur after it
+ * is created. After the last reference, a value is dead and can be
+ * discarded.
+ *
+ * brw_wm_grf: Represents a physical hardware register. May be either
+ * empty or hold a value. Register allocation is the process of
+ * assigning values to grf registers. This occurs in pass2 and the
+ * brw_wm_grf struct is not used before that.
+ *
+ * Fragment program registers: These are time-varying constructs that
+ * are hard to reason about and which we translate away in pass0. A
+ * single fragment program register element (eg. temp[0].x) will be
+ * translated to one or more brw_wm_value structs, one for each time
+ * that temp[0].x is written to during the program.
+ */
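+
+/* For instance, a fragment program fragment along the lines of
+ *
+ *    MUL TEMP[0].x, IN[0], IN[1]
+ *    ADD TEMP[0].x, TEMP[0], IN[2]
+ *
+ * creates two distinct brw_wm_values for temp[0].x, one per write.  The
+ * ADD's use of temp[0].x becomes a brw_wm_ref pointing at the first value,
+ * and once that last reference has been emitted the first value is dead
+ * and its register can be reused.
+ */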
+
+
+
+/* Used in pass2 to track register allocation.
+ */
+struct brw_wm_grf {
+ struct brw_wm_value *value;
+ GLuint nextuse;
+};
+
+struct brw_wm_value {
+ struct brw_reg hw_reg; /* emitted to this reg, may not always be there */
+ struct brw_wm_ref *lastuse;
+ struct brw_wm_grf *resident;
+ GLuint contributes_to_output:1;
+ GLuint spill_slot:16; /* if non-zero, spill immediately after calculation */
+};
+
+struct brw_wm_ref {
+   struct brw_reg hw_reg;	/* nr filled in during pass2, everything else during pass0 */
+ struct brw_wm_value *value;
+ struct brw_wm_ref *prevuse;
+ GLuint unspill_reg:7; /* unspill to reg */
+ GLuint emitted:1;
+ GLuint insn:24;
+};
+
+struct brw_wm_instruction {
+ struct brw_wm_value *dst[4];
+ struct brw_wm_ref *src[3][4];
+ GLuint opcode:8;
+ GLuint saturate:1;
+ GLuint writemask:4;
+ GLuint sampler:4;
+ GLuint tex_unit:4; /* texture/sampler unit for texture instructions */
+ GLuint target:4; /* TGSI_TEXTURE_x for texture instructions,
+ * target binding table index for FB_WRITE
+ */
+   GLuint eot:1;    	/* End of thread indicator for FB_WRITE */
+};
+
+
+#define BRW_WM_MAX_INSN 2048
+#define BRW_WM_MAX_GRF 128 /* hardware limit */
+#define BRW_WM_MAX_VREG (BRW_WM_MAX_INSN * 4)
+#define BRW_WM_MAX_REF (BRW_WM_MAX_INSN * 12)
+#define BRW_WM_MAX_PARAM 256
+#define BRW_WM_MAX_CONST 256
+#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS
+#define BRW_WM_MAX_SUBROUTINE 16
+
+
+/* New opcodes to track internal operations required for WM unit.
+ * These are added early so that the registers used can be tracked,
+ * freed and reused like those of other instructions.
+ */
+#define MAX_OPCODE TGSI_OPCODE_LAST
+#define WM_PIXELXY (MAX_OPCODE)
+#define WM_DELTAXY (MAX_OPCODE + 1)
+#define WM_PIXELW (MAX_OPCODE + 2)
+#define WM_LINTERP (MAX_OPCODE + 3)
+#define WM_PINTERP (MAX_OPCODE + 4)
+#define WM_CINTERP (MAX_OPCODE + 5)
+#define WM_WPOSXY (MAX_OPCODE + 6)
+#define WM_FB_WRITE (MAX_OPCODE + 7)
+#define WM_FRONTFACING (MAX_OPCODE + 8)
+#define MAX_WM_OPCODE (MAX_OPCODE + 9)
+
+#define BRW_FILE_PAYLOAD (TGSI_FILE_COUNT)
+#define PAYLOAD_DEPTH (PIPE_MAX_SHADER_INPUTS) /* ?? */
+
+#define X 0
+#define Y 1
+#define Z 2
+#define W 3
+
+
+struct brw_fp_src {
+ unsigned file:4;
+ unsigned index:16;
+ unsigned swizzle:8;
+ unsigned indirect:1;
+ unsigned negate:1;
+ unsigned abs:1;
+};
+
+struct brw_fp_dst {
+ unsigned file:4;
+ unsigned index:16;
+ unsigned writemask:4;
+ unsigned indirect:1;
+ unsigned saturate:1;
+};
+
+struct brw_fp_instruction {
+ struct brw_fp_dst dst;
+ struct brw_fp_src src[3];
+ unsigned opcode:8;
+ unsigned target:8; /* XXX: special usage for FB_WRITE */
+ unsigned tex_unit:4;
+ unsigned sampler:4;
+ unsigned pad:8;
+};
+
+
+struct brw_wm_compile {
+ struct brw_compile func;
+ struct brw_wm_prog_key key;
+ struct brw_wm_prog_data prog_data;
+
+ struct brw_fragment_shader *fp;
+
+ GLfloat (*env_param)[4];
+
+ enum {
+ START,
+ PASS2_DONE
+ } state;
+
+   /* Initial pass - translate the fragment shader's TGSI instructions
+    * into brw_fp_instructions, simplifying and adding instructions for
+    * interpolation and framebuffer writes.
+ */
+ struct {
+ GLfloat v[4];
+ unsigned nr;
+ } immediate[BRW_WM_MAX_CONST+3];
+ GLuint nr_immediates;
+
+ struct brw_fp_instruction fp_instructions[BRW_WM_MAX_INSN];
+ GLuint nr_fp_insns;
+ GLuint fp_temp;
+ GLuint fp_interp_emitted;
+ GLuint fp_fragcolor_emitted;
+ GLuint fp_first_internal_temp;
+
+ struct brw_fp_src fp_pixel_xy;
+ struct brw_fp_src fp_delta_xy;
+ struct brw_fp_src fp_pixel_w;
+
+
+ /* Subsequent passes using SSA representation:
+ */
+ struct brw_wm_value vreg[BRW_WM_MAX_VREG];
+ GLuint nr_vreg;
+
+ struct brw_wm_value creg[BRW_WM_MAX_PARAM];
+ GLuint nr_creg;
+
+ struct {
+ struct brw_wm_value depth[4]; /* includes r0/r1 */
+ struct brw_wm_value input_interp[PIPE_MAX_SHADER_INPUTS];
+ } payload;
+
+
+ const struct brw_wm_ref *pass0_fp_reg[BRW_FILE_PAYLOAD+1][256][4];
+
+ struct brw_wm_ref undef_ref;
+ struct brw_wm_value undef_value;
+
+ struct brw_wm_ref refs[BRW_WM_MAX_REF];
+ GLuint nr_refs;
+
+ struct brw_wm_instruction instruction[BRW_WM_MAX_INSN];
+ GLuint nr_insns;
+
+ struct brw_wm_grf pass2_grf[BRW_WM_MAX_GRF/2];
+
+ GLuint grf_limit;
+ GLuint max_wm_grf;
+ GLuint last_scratch;
+
+ GLuint cur_inst; /**< index of current instruction */
+
+ GLboolean out_of_regs; /**< ran out of GRF registers? */
+
+ /** Mapping from Mesa registers to hardware registers */
+ struct {
+ GLboolean inited;
+ struct brw_reg reg;
+ } wm_regs[BRW_FILE_PAYLOAD+1][256][4];
+
+ GLboolean used_grf[BRW_WM_MAX_GRF];
+ GLuint first_free_grf;
+ struct brw_reg stack;
+ struct brw_reg emit_mask_reg;
+ GLuint tmp_regs[BRW_WM_MAX_GRF];
+ GLuint tmp_index;
+ GLuint tmp_max;
+ GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+ GLuint dispatch_width;
+
+ /** we may need up to 3 constants per instruction (if use_const_buffer) */
+ struct {
+ GLint index;
+ struct brw_reg reg;
+ } current_const[3];
+
+ GLuint error;
+};
+
+
+GLuint brw_wm_nr_args( GLuint opcode );
+GLuint brw_wm_is_scalar_result( GLuint opcode );
+
+int brw_wm_pass_fp( struct brw_wm_compile *c );
+void brw_wm_pass0( struct brw_wm_compile *c );
+void brw_wm_pass1( struct brw_wm_compile *c );
+void brw_wm_pass2( struct brw_wm_compile *c );
+void brw_wm_emit( struct brw_wm_compile *c );
+
+void brw_wm_print_value( struct brw_wm_compile *c,
+ struct brw_wm_value *value );
+
+void brw_wm_print_ref( struct brw_wm_compile *c,
+ struct brw_wm_ref *ref );
+
+void brw_wm_print_insn( struct brw_wm_compile *c,
+ struct brw_wm_instruction *inst );
+
+void brw_wm_print_program( struct brw_wm_compile *c,
+ const char *stage );
+
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+ const char *stage );
+
+void brw_wm_lookup_iz( GLuint line_aa,
+ GLuint lookup,
+ GLboolean ps_uses_depth,
+ struct brw_wm_prog_key *key );
+
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c);
+
+void emit_ddxy(struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ GLboolean is_ddx,
+ const struct brw_reg *arg0);
+
+#endif
diff --git a/src/gallium/drivers/i965/brw_wm_constant_buffer.c b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
new file mode 100644
index 00000000000..6434c6acf73
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_constant_buffer.c
@@ -0,0 +1,165 @@
+/* XXX: Constant buffers disabled
+ */
+
+
+/**
+ * Create the constant buffer surface. Vertex/fragment shader constants will be
+ * read from this buffer with Data Port Read instructions/messages.
+ */
+enum pipe_error
+brw_create_constant_surface( struct brw_context *brw,
+ struct brw_surface_key *key,
+ struct brw_winsys_buffer **bo_out )
+{
+ const GLint w = key->width - 1;
+   struct brw_surface_state surf;
+ struct brw_winsys_reloc reloc[1];
+ enum pipe_error ret;
+
+ /* Emit relocation to surface contents */
+ make_reloc(&reloc[0],
+ BRW_USAGE_SAMPLER,
+ 0,
+ offsetof(struct brw_surface_state, ss1),
+ key->bo);
+
+
+ memset(&surf, 0, sizeof(surf));
+
+ surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+ surf.ss0.surface_type = BRW_SURFACE_BUFFER;
+ surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+ surf.ss1.base_addr = 0; /* reloc */
+
+ surf.ss2.width = w & 0x7f; /* bits 6:0 of size or width */
+ surf.ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */
+ surf.ss3.depth = (w >> 20) & 0x7f; /* bits 26:20 of size or width */
+ surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
+ brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
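+
+   /* Worked example (hypothetical numbers): a constant buffer holding 256
+    * float4 entries gives w = 255, which is split across the surface state
+    * as
+    *
+    *    ss2.width  = 255 & 0x7f          = 127
+    *    ss2.height = (255 >> 7) & 0x1fff = 1
+    *    ss3.depth  = (255 >> 20) & 0x7f  = 0
+    *
+    * i.e. the element count minus one is a 27-bit value spread over the
+    * three fields.
+    */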
+
+ ret = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+ key, sizeof(*key),
+ reloc, Elements(reloc),
+ &surf, sizeof(surf),
+ NULL, NULL,
+                          bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+
+/**
+ * Update the surface state for a WM constant buffer.
+ * The constant buffer will be (re)allocated here if needed.
+ */
+static enum pipe_error
+brw_update_wm_constant_surface( struct brw_context *brw,
+ GLuint surf)
+{
+ struct brw_surface_key key;
+ struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+ struct pipe_buffer *cbuf = brw->curr.fragment_constants;
+   int pitch = cbuf ? cbuf->size / (4 * sizeof(float)) : 0;
+ enum pipe_error ret;
+
+ /* If we're in this state update atom, we need to update WM constants, so
+ * free the old buffer and create a new one for the new contents.
+ */
+ ret = brw_wm_update_constant_buffer(brw, &fp->const_buffer);
+ if (ret)
+ return ret;
+
+ /* If there's no constant buffer, then no surface BO is needed to point at
+ * it.
+ */
+ if (cbuf == NULL) {
+ bo_reference(&brw->wm.surf_bo[surf], NULL);
+ return PIPE_OK;
+ }
+
+ memset(&key, 0, sizeof(key));
+
+ key.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+ key.ss0.surface_type = BRW_SURFACE_BUFFER;
+ key.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+ key.bo = brw_buffer(cbuf)->bo;
+
+ key.ss2.width = (pitch-1) & 0x7f; /* bits 6:0 of size or width */
+ key.ss2.height = ((pitch-1) >> 7) & 0x1fff; /* bits 19:7 of size or width */
+ key.ss3.depth = ((pitch-1) >> 20) & 0x7f; /* bits 26:20 of size or width */
+ key.ss3.pitch = (pitch * 4 * sizeof(float)) - 1; /* ignored?? */
+   /* constant buffer surfaces are not tiled */
+
+
+ /*
+ printf("%s:\n", __FUNCTION__);
+ printf(" width %d height %d depth %d cpp %d pitch %d\n",
+ key.width, key.height, key.depth, key.cpp, key.pitch);
+ */
+
+ if (brw_search_cache(&brw->surface_cache,
+ BRW_SS_SURFACE,
+ &key, sizeof(key),
+ &key.bo, 1,
+ NULL,
+ &brw->wm.surf_bo[surf]))
+ return PIPE_OK;
+
+ ret = brw_create_constant_surface(brw, &key, &brw->wm.surf_bo[surf]);
+ if (ret)
+ return ret;
+
+ brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+ return PIPE_OK;
+}
+
+/**
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
+ * inclusion in the binding table.
+ */
+static enum pipe_error prepare_wm_constant_surface(struct brw_context *brw )
+{
+   struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
+   enum pipe_error ret;
+
+ ret = brw_wm_update_constant_buffer(brw,
+ &fp->const_buffer);
+ if (ret)
+ return ret;
+
+ /* If there's no constant buffer, then no surface BO is needed to point at
+ * it.
+ */
+ if (fp->const_buffer == 0) {
+ if (brw->wm.surf_bo[surf] != NULL) {
+ bo_reference(&brw->wm.surf_bo[surf], NULL);
+ brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+ }
+ return PIPE_OK;
+ }
+
+   ret = brw_update_wm_constant_surface(brw, surf);
+ if (ret)
+ return ret;
+
+   return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_wm_constant_surface = {
+ .dirty = {
+ .mesa = (_NEW_PROGRAM_CONSTANTS),
+ .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+ .cache = 0
+ },
+ .prepare = prepare_wm_constant_surface,
+};
diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c
new file mode 100644
index 00000000000..3d11fa074cc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_debug.c
@@ -0,0 +1,256 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_wm.h"
+
+static void print_writemask( unsigned writemask )
+{
+ if (writemask != BRW_WRITEMASK_XYZW)
+ debug_printf(".%s%s%s%s",
+ (writemask & BRW_WRITEMASK_X) ? "x" : "",
+ (writemask & BRW_WRITEMASK_Y) ? "y" : "",
+ (writemask & BRW_WRITEMASK_Z) ? "z" : "",
+ (writemask & BRW_WRITEMASK_W) ? "w" : "");
+}
+
+static void print_swizzle( unsigned swizzle )
+{
+   const char *swz = "xyzw";
+ if (swizzle != BRW_SWIZZLE_XYZW)
+ debug_printf(".%c%c%c%c",
+ swz[BRW_GET_SWZ(swizzle, X)],
+ swz[BRW_GET_SWZ(swizzle, Y)],
+ swz[BRW_GET_SWZ(swizzle, Z)],
+ swz[BRW_GET_SWZ(swizzle, W)]);
+}
+
+static void print_opcode( unsigned opcode )
+{
+ switch (opcode) {
+ case WM_PIXELXY:
+ debug_printf("PIXELXY");
+ break;
+ case WM_DELTAXY:
+ debug_printf("DELTAXY");
+ break;
+ case WM_PIXELW:
+ debug_printf("PIXELW");
+ break;
+ case WM_WPOSXY:
+ debug_printf("WPOSXY");
+ break;
+ case WM_PINTERP:
+ debug_printf("PINTERP");
+ break;
+ case WM_LINTERP:
+ debug_printf("LINTERP");
+ break;
+ case WM_CINTERP:
+ debug_printf("CINTERP");
+ break;
+ case WM_FB_WRITE:
+ debug_printf("FB_WRITE");
+ break;
+ case WM_FRONTFACING:
+ debug_printf("FRONTFACING");
+ break;
+ default:
+ debug_printf("%s", tgsi_get_opcode_info(opcode)->mnemonic);
+ break;
+ }
+}
+
+void brw_wm_print_value( struct brw_wm_compile *c,
+ struct brw_wm_value *value )
+{
+ assert(value);
+ if (c->state >= PASS2_DONE)
+ brw_print_reg(value->hw_reg);
+ else if( value == &c->undef_value )
+ debug_printf("undef");
+ else if( value - c->vreg >= 0 &&
+ value - c->vreg < BRW_WM_MAX_VREG)
+ debug_printf("r%d", value - c->vreg);
+ else if (value - c->creg >= 0 &&
+ value - c->creg < BRW_WM_MAX_PARAM)
+ debug_printf("c%d", value - c->creg);
+ else if (value - c->payload.input_interp >= 0 &&
+ value - c->payload.input_interp < PIPE_MAX_SHADER_INPUTS)
+ debug_printf("i%d", value - c->payload.input_interp);
+ else if (value - c->payload.depth >= 0 &&
+ value - c->payload.depth < PIPE_MAX_SHADER_INPUTS)
+ debug_printf("d%d", value - c->payload.depth);
+ else
+ debug_printf("?");
+}
+
+void brw_wm_print_ref( struct brw_wm_compile *c,
+ struct brw_wm_ref *ref )
+{
+ struct brw_reg hw_reg = ref->hw_reg;
+
+ if (ref->unspill_reg)
+ debug_printf("UNSPILL(%x)/", ref->value->spill_slot);
+
+ if (c->state >= PASS2_DONE)
+ brw_print_reg(ref->hw_reg);
+ else {
+ debug_printf("%s", hw_reg.negate ? "-" : "");
+ debug_printf("%s", hw_reg.abs ? "abs/" : "");
+ brw_wm_print_value(c, ref->value);
+ if ((hw_reg.nr&1) || hw_reg.subnr) {
+ debug_printf("->%d.%d", (hw_reg.nr&1), hw_reg.subnr);
+ }
+ }
+}
+
+void brw_wm_print_insn( struct brw_wm_compile *c,
+ struct brw_wm_instruction *inst )
+{
+ GLuint i, arg;
+ GLuint nr_args = brw_wm_nr_args(inst->opcode);
+
+ debug_printf("[");
+ for (i = 0; i < 4; i++) {
+ if (inst->dst[i]) {
+ brw_wm_print_value(c, inst->dst[i]);
+ if (inst->dst[i]->spill_slot)
+ debug_printf("/SPILL(%x)",inst->dst[i]->spill_slot);
+ }
+ else
+ debug_printf("#");
+ if (i < 3)
+ debug_printf(",");
+ }
+ debug_printf("]");
+ print_writemask(inst->writemask);
+
+ debug_printf(" = ");
+ print_opcode(inst->opcode);
+
+ if (inst->saturate)
+ debug_printf("_SAT");
+
+ for (arg = 0; arg < nr_args; arg++) {
+
+ debug_printf(" [");
+
+ for (i = 0; i < 4; i++) {
+ if (inst->src[arg][i]) {
+ brw_wm_print_ref(c, inst->src[arg][i]);
+ }
+ else
+ debug_printf("%%");
+
+ if (i < 3)
+ debug_printf(",");
+ else
+ debug_printf("]");
+ }
+ }
+ debug_printf("\n");
+}
+
+void brw_wm_print_program( struct brw_wm_compile *c,
+ const char *stage )
+{
+ GLuint insn;
+
+ debug_printf("%s:\n", stage);
+ for (insn = 0; insn < c->nr_insns; insn++)
+ brw_wm_print_insn(c, &c->instruction[insn]);
+ debug_printf("\n");
+}
+
+static const char *file_strings[TGSI_FILE_COUNT+1] = {
+ "NULL",
+ "CONST",
+ "IN",
+ "OUT",
+ "TEMP",
+ "SAMPLER",
+ "ADDR",
+ "IMM",
+ "LOOP",
+ "PAYLOAD"
+};
+
+static void brw_wm_print_fp_insn( struct brw_wm_compile *c,
+ struct brw_fp_instruction *inst )
+{
+ GLuint i;
+ GLuint nr_args = brw_wm_nr_args(inst->opcode);
+
+ print_opcode(inst->opcode);
+ if (inst->dst.saturate)
+ debug_printf("_SAT");
+ debug_printf(" ");
+
+ if (inst->dst.indirect)
+ debug_printf("[");
+
+ debug_printf("%s[%d]",
+ file_strings[inst->dst.file],
+ inst->dst.index );
+ print_writemask(inst->dst.writemask);
+
+ if (inst->dst.indirect)
+ debug_printf("]");
+
+ debug_printf(nr_args ? ", " : "\n");
+
+ for (i = 0; i < nr_args; i++) {
+ debug_printf("%s%s%s[%d]%s",
+ inst->src[i].negate ? "-" : "",
+ inst->src[i].abs ? "ABS(" : "",
+ file_strings[inst->src[i].file],
+ inst->src[i].index,
+ inst->src[i].abs ? ")" : "");
+ print_swizzle(inst->src[i].swizzle);
+ debug_printf("%s", i == nr_args - 1 ? "\n" : ", ");
+ }
+}
+
+
+void brw_wm_print_fp_program( struct brw_wm_compile *c,
+ const char *stage )
+{
+ GLuint insn;
+
+ debug_printf("%s:\n", stage);
+ for (insn = 0; insn < c->nr_fp_insns; insn++)
+ brw_wm_print_fp_insn(c, &c->fp_instructions[insn]);
+ debug_printf("\n");
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_emit.c b/src/gallium/drivers/i965/brw_wm_emit.c
new file mode 100644
index 00000000000..8f983a60ae8
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_emit.c
@@ -0,0 +1,1521 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+#include "tgsi/tgsi_info.h"
+
+#include "brw_context.h"
+#include "brw_wm.h"
+#include "brw_debug.h"
+#include "brw_disasm.h"
+
+/* Not quite sure how correct this is - need to understand horiz
+ * vs. vertical strides a little better.
+ */
+static INLINE struct brw_reg sechalf( struct brw_reg reg )
+{
+ if (reg.vstride)
+ reg.nr++;
+ return reg;
+}
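+
+/* For example (illustrative register numbers only): a 16-wide float value
+ * allocated to the GRF pair g10/g11 has a non-zero vstride, so sechalf(g10)
+ * addresses g11 -- the half operated on by the SecHalf compressed moves in
+ * emit_math2() and emit_fb_write() below.
+ */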
+
+/* Payload R0:
+ *
+ * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 quads,
+ * corresponding to each of the 16 execution channels.
+ * R0.1..8 -- ?
+ * R1.0 -- triangle vertex 0.X
+ * R1.1 -- triangle vertex 0.Y
+ * R1.2 -- quad 0 x,y coords (2 packed uwords)
+ * R1.3 -- quad 1 x,y coords (2 packed uwords)
+ * R1.4 -- quad 2 x,y coords (2 packed uwords)
+ * R1.5 -- quad 3 x,y coords (2 packed uwords)
+ * R1.6 -- ?
+ * R1.7 -- ?
+ * R1.8 -- ?
+ */
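+
+/* A rough sketch of how emit_pixel_xy() below uses this layout (region
+ * behaviour assumed from the usual <vstride;width,hstride> rules): reading
+ * R1.2..R1.5 as unsigned words through a <2;4,0> region at subregister 4
+ * replicates each quad's X origin four times,
+ *
+ *    x0 x0 x0 x0  x1 x1 x1 x1  ...
+ *
+ * and adding the packed 4-bit vector immediate 0x10101010 (0,1,0,1,...)
+ * turns that into per-pixel X coordinates for each 2x2 quad.  The Y path
+ * does the same with subregister 5 and 0x11001100 (0,0,1,1,...).
+ */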
+
+
+static void emit_pixel_xy(struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+ struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+ /* Calculate pixel centers by adding 1 or 0 to each of the
+ * micro-tile coordinates passed in r1.
+ */
+ if (mask & BRW_WRITEMASK_X) {
+ brw_ADD(p,
+ vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw, 4), 2, 4, 0),
+ brw_imm_v(0x10101010));
+ }
+
+ if (mask & BRW_WRITEMASK_Y) {
+ brw_ADD(p,
+ vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw,5), 2, 4, 0),
+ brw_imm_v(0x11001100));
+ }
+
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+}
+
+
+
+static void emit_delta_xy(struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+
+ /* Calc delta X,Y by subtracting origin in r1 from the pixel
+ * centers.
+ */
+ if (mask & BRW_WRITEMASK_X) {
+ brw_ADD(p,
+ dst[0],
+ retype(arg0[0], BRW_REGISTER_TYPE_UW),
+ negate(r1));
+ }
+
+ if (mask & BRW_WRITEMASK_Y) {
+ brw_ADD(p,
+ dst[1],
+ retype(arg0[1], BRW_REGISTER_TYPE_UW),
+ negate(suboffset(r1,1)));
+
+ }
+}
+
+static void emit_wpos_xy(struct brw_wm_compile *c,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0)
+{
+ struct brw_compile *p = &c->func;
+
+ if (mask & BRW_WRITEMASK_X) {
+ /* X' = X */
+ brw_MOV(p,
+ dst[0],
+ retype(arg0[0], BRW_REGISTER_TYPE_W));
+ }
+
+ /* XXX: is this needed any more, or is this a NOOP?
+ */
+ if (mask & BRW_WRITEMASK_Y) {
+#if 0
+ /* Y' = height - 1 - Y */
+ brw_ADD(p,
+ dst[1],
+ negate(retype(arg0[1], BRW_REGISTER_TYPE_W)),
+ brw_imm_d(c->key.drawable_height - 1));
+#else
+      brw_MOV(p,
+              dst[1],
+              retype(arg0[1], BRW_REGISTER_TYPE_W));
+#endif
+ }
+}
+
+
+static void emit_pixel_w( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *deltas)
+{
+ /* Don't need this if all you are doing is interpolating color, for
+ * instance.
+ */
+ if (mask & BRW_WRITEMASK_W) {
+ struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
+
+ /* Calc 1/w - just linterp wpos[3] optimized by putting the
+ * result straight into a message reg.
+ */
+ brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
+ brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), deltas[1]);
+
+ /* Calc w */
+ brw_math_16( p, dst[3],
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 2, brw_null_reg(),
+ BRW_MATH_PRECISION_FULL);
+ }
+}
+
+
+
+static void emit_linterp( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *deltas )
+{
+ struct brw_reg interp[4];
+ GLuint nr = arg0[0].nr;
+ GLuint i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
+ brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+ }
+ }
+}
+
+
+static void emit_pinterp( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *deltas,
+ const struct brw_reg *w)
+{
+ struct brw_reg interp[4];
+ GLuint nr = arg0[0].nr;
+ GLuint i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
+ brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
+ }
+ }
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MUL(p, dst[i], dst[i], w[3]);
+ }
+ }
+}
+
+
+static void emit_cinterp( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0 )
+{
+ struct brw_reg interp[4];
+ GLuint nr = arg0[0].nr;
+ GLuint i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MOV(p, dst[i], suboffset(interp[i],3)); /* TODO: optimize away like other moves */
+ }
+ }
+}
+
+/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
+static void emit_frontfacing( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask )
+{
+ struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+ GLuint i;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MOV(p, dst[i], brw_imm_f(0.0));
+ }
+ }
+
+ /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+ * us front face
+ */
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MOV(p, dst[i], brw_imm_f(1.0));
+ }
+ }
+ brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: q0.tl q0.tr q0.bl q0.br q1.tl q1.tr q1.bl q1.br
+ *
+ * and we're trying to produce:
+ *
+ * DDX DDY
+ * dst: (q0.tr - q0.tl) (q0.tl - q0.bl)
+ * (q0.tr - q0.tl) (q0.tr - q0.br)
+ * (q0.br - q0.bl) (q0.tl - q0.bl)
+ * (q0.br - q0.bl) (q0.tr - q0.br)
+ * (q1.tr - q1.tl) (q1.tl - q1.bl)
+ * (q1.tr - q1.tl) (q1.tr - q1.br)
+ * (q1.br - q1.bl) (q1.tl - q1.bl)
+ * (q1.br - q1.bl) (q1.tr - q1.br)
+ *
+ * and add two more quads if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
+ * between each other. We could probably do it like ddx and swizzle the right
+ * order later, but bail for now and just produce
+ * ((q0.tl - q0.bl)x4 (q1.tl - q1.bl)x4)
+ */
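+
+/* Concretely (assuming the usual <vstride;width,hstride> region rules), the
+ * two DDX source regions built below read, per quad:
+ *
+ *    src0 = <2;2,0> at suboffset 1:   tr tr br br
+ *    src1 = <2;2,0> at suboffset 0:   tl tl bl bl
+ *
+ * so dst = src0 - src1 gives (tr-tl, tr-tl, br-bl, br-bl), matching the
+ * DDX column in the table above.
+ */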
+void emit_ddxy(struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ GLboolean is_ddx,
+ const struct brw_reg *arg0)
+{
+ int i;
+ struct brw_reg src0, src1;
+
+ if (mask & SATURATE)
+ brw_set_saturate(p, 1);
+ for (i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ if (is_ddx) {
+ src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_2,
+ BRW_WIDTH_2,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+ src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_2,
+ BRW_WIDTH_2,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+ } else {
+ src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+ src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
+ BRW_REGISTER_TYPE_F,
+ BRW_VERTICAL_STRIDE_4,
+ BRW_WIDTH_4,
+ BRW_HORIZONTAL_STRIDE_0,
+ BRW_SWIZZLE_XYZW, BRW_WRITEMASK_XYZW);
+ }
+ brw_ADD(p, dst[i], src0, negate(src1));
+ }
+ }
+ if (mask & SATURATE)
+ brw_set_saturate(p, 0);
+}
+
+static void emit_alu1( struct brw_compile *p,
+ struct brw_instruction *(*func)(struct brw_compile *,
+ struct brw_reg,
+ struct brw_reg),
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0 )
+{
+ GLuint i;
+
+ if (mask & SATURATE)
+ brw_set_saturate(p, 1);
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ func(p, dst[i], arg0[i]);
+ }
+ }
+
+ if (mask & SATURATE)
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_alu2( struct brw_compile *p,
+ struct brw_instruction *(*func)(struct brw_compile *,
+ struct brw_reg,
+ struct brw_reg,
+ struct brw_reg),
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ GLuint i;
+
+ if (mask & SATURATE)
+ brw_set_saturate(p, 1);
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ func(p, dst[i], arg0[i], arg1[i]);
+ }
+ }
+
+ if (mask & SATURATE)
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_mad( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1,
+ const struct brw_reg *arg2 )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MUL(p, dst[i], arg0[i], arg1[i]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_ADD(p, dst[i], dst[i], arg2[i]);
+ brw_set_saturate(p, 0);
+ }
+ }
+}
+
+static void emit_trunc( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0)
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_RNDZ(p, dst[i], arg0[i]);
+ }
+ }
+}
+
+static void emit_lrp( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1,
+ const struct brw_reg *arg2 )
+{
+ GLuint i;
+
+ /* Uses dst as a temporary:
+ */
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ /* Can I use the LINE instruction for this?
+ */
+ brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
+ brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MAC(p, dst[i], arg0[i], arg1[i]);
+ brw_set_saturate(p, 0);
+ }
+ }
+}
+
+static void emit_sop( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ GLuint cond,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_MOV(p, dst[i], brw_imm_f(0));
+ brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
+ brw_MOV(p, dst[i], brw_imm_f(1.0));
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+}
+
+static void emit_slt( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
+}
+
+static void emit_sle( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
+}
+
+static void emit_sgt( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
+}
+
+static void emit_sge( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
+}
+
+static void emit_seq( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
+}
+
+static void emit_sne( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
+}
+
+static void emit_cmp( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1,
+ const struct brw_reg *arg2 )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg2[i]);
+ brw_set_saturate(p, 0);
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg1[i]);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+}
+
+static void emit_max( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg0[i]);
+ brw_set_saturate(p, 0);
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg1[i]);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+}
+
+static void emit_min( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg1[i]);
+ brw_set_saturate(p, 0);
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[i], arg0[i]);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+}
+
+
+static void emit_dp3( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return; /* Do not emit dead code */
+
+ assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+ brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+ brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_dp4( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return; /* Do not emit dead code */
+
+ assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+ brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+ brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+ brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_dph( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ const int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return; /* Do not emit dead code */
+
+ assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+ brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
+ brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
+ brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_xpd( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1 )
+{
+ GLuint i;
+
+ assert((mask & BRW_WRITEMASK_W) != BRW_WRITEMASK_W);
+
+ for (i = 0 ; i < 3; i++) {
+ if (mask & (1<<i)) {
+ GLuint i2 = (i+2)%3;
+ GLuint i1 = (i+1)%3;
+
+ brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
+
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
+ brw_set_saturate(p, 0);
+ }
+ }
+}
+
+
+static void emit_math1( struct brw_compile *p,
+ GLuint function,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0 )
+{
+ int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return; /* Do not emit dead code */
+
+ assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+ brw_MOV(p, brw_message_reg(2), arg0[0]);
+
+ /* Send two messages to perform all 16 operations:
+ */
+ brw_math_16(p,
+ dst[dst_chan],
+ function,
+ (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_PRECISION_FULL);
+}
+
+
+static void emit_math2( struct brw_compile *p,
+ GLuint function,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0,
+ const struct brw_reg *arg1)
+{
+ int dst_chan = ffs(mask & BRW_WRITEMASK_XYZW) - 1;
+
+ if (!(mask & BRW_WRITEMASK_XYZW))
+ return; /* Do not emit dead code */
+
+ assert(util_is_power_of_two(mask & BRW_WRITEMASK_XYZW));
+
+ brw_push_insn_state(p);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, brw_message_reg(2), arg0[0]);
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_MOV(p, brw_message_reg(4), sechalf(arg0[0]));
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, brw_message_reg(3), arg1[0]);
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
+
+
+ /* Send two messages to perform all 16 operations:
+ */
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_math(p,
+ dst[dst_chan],
+ function,
+ (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_math(p,
+ offset(dst[dst_chan],1),
+ function,
+ (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 4,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+
+ brw_pop_insn_state(p);
+}
+
+
+
+static void emit_tex( struct brw_wm_compile *c,
+ const struct brw_wm_instruction *inst,
+ struct brw_reg *dst,
+ GLuint dst_flags,
+ struct brw_reg *coord,
+ GLuint sampler)
+{
+ struct brw_compile *p = &c->func;
+ GLuint msgLength, responseLength;
+ GLuint i, nr;
+ GLuint emit;
+ GLuint msg_type;
+ GLboolean shadow = FALSE;
+
+ /* How many input regs are there?
+ */
+ switch (inst->target) {
+ case TGSI_TEXTURE_1D:
+ emit = BRW_WRITEMASK_X;
+ nr = 1;
+ break;
+ case TGSI_TEXTURE_SHADOW1D:
+ emit = BRW_WRITEMASK_XW;
+ nr = 4;
+ shadow = TRUE;
+ break;
+ case TGSI_TEXTURE_2D:
+ emit = BRW_WRITEMASK_XY;
+ nr = 2;
+ break;
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ emit = BRW_WRITEMASK_XYW;
+ nr = 4;
+ shadow = TRUE;
+ break;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ emit = BRW_WRITEMASK_XYZ;
+ nr = 3;
+ break;
+ default:
+ /* unexpected target */
+ abort();
+ }
+
+ msgLength = 1;
+
+ for (i = 0; i < nr; i++) {
+ static const GLuint swz[4] = {0,1,2,2};
+ if (emit & (1<<i))
+ brw_MOV(p, brw_message_reg(msgLength+1), coord[swz[i]]);
+ else
+ brw_MOV(p, brw_message_reg(msgLength+1), brw_imm_f(0));
+ msgLength += 2;
+ }
+
+ responseLength = 8; /* always */
+
+ if (BRW_IS_IGDNG(p->brw)) {
+ if (shadow)
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
+ else
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+ } else {
+ if (shadow)
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+ else
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+ }
+
+ brw_SAMPLE(p,
+ retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+ 1,
+ retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
+ BTI_TEXTURE(inst->tex_unit),
+ sampler, /* sampler index */
+ inst->writemask,
+ msg_type,
+ responseLength,
+ msgLength,
+ 0,
+ 1,
+ BRW_SAMPLER_SIMD_MODE_SIMD16);
+}
+
+
+static void emit_txb( struct brw_wm_compile *c,
+ const struct brw_wm_instruction *inst,
+ struct brw_reg *dst,
+ GLuint dst_flags,
+ struct brw_reg *coord,
+ GLuint sampler )
+{
+ struct brw_compile *p = &c->func;
+ GLuint msgLength;
+ GLuint msg_type;
+ /* Shadow ignored for txb.
+ */
+ switch (inst->target) {
+ case TGSI_TEXTURE_1D:
+ case TGSI_TEXTURE_SHADOW1D:
+ brw_MOV(p, brw_message_reg(2), coord[0]);
+ brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+ brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+ break;
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ brw_MOV(p, brw_message_reg(2), coord[0]);
+ brw_MOV(p, brw_message_reg(4), coord[1]);
+ brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+ break;
+ case TGSI_TEXTURE_3D:
+ case TGSI_TEXTURE_CUBE:
+ brw_MOV(p, brw_message_reg(2), coord[0]);
+ brw_MOV(p, brw_message_reg(4), coord[1]);
+ brw_MOV(p, brw_message_reg(6), coord[2]);
+ break;
+ default:
+ /* unexpected target */
+ abort();
+ }
+
+ brw_MOV(p, brw_message_reg(8), coord[3]);
+ msgLength = 9;
+
+ if (BRW_IS_IGDNG(p->brw))
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
+ else
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+
+ brw_SAMPLE(p,
+ retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
+ 1,
+ retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
+ BTI_TEXTURE(inst->tex_unit),
+ sampler, /* sampler index */
+ inst->writemask,
+ msg_type,
+ 8, /* responseLength */
+ msgLength,
+ 0,
+ 1,
+ BRW_SAMPLER_SIMD_MODE_SIMD16);
+}
+
+
+static void emit_lit( struct brw_compile *p,
+ const struct brw_reg *dst,
+ GLuint mask,
+ const struct brw_reg *arg0 )
+{
+ assert((mask & BRW_WRITEMASK_XW) == 0);
+
+ if (mask & BRW_WRITEMASK_Y) {
+ brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
+ brw_MOV(p, dst[1], arg0[0]);
+ brw_set_saturate(p, 0);
+ }
+
+ if (mask & BRW_WRITEMASK_Z) {
+ emit_math2(p, BRW_MATH_FUNCTION_POW,
+ &dst[2],
+ BRW_WRITEMASK_X | (mask & SATURATE),
+ &arg0[1],
+ &arg0[3]);
+ }
+
+ /* Ordinarily you'd use an iff statement to skip or shortcircuit
+ * some of the POW calculations above, but 16-wide iff statements
+ * seem to lock c1 hardware, so this is a nasty workaround:
+ */
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
+ {
+ if (mask & BRW_WRITEMASK_Y)
+ brw_MOV(p, dst[1], brw_imm_f(0));
+
+ if (mask & BRW_WRITEMASK_Z)
+ brw_MOV(p, dst[2], brw_imm_f(0));
+ }
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+}
+
+
+/* Kill pixel - set execution mask to zero for those pixels which
+ * fail.
+ */
+static void emit_kil( struct brw_wm_compile *c,
+ struct brw_reg *arg0)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+ GLuint i;
+
+ /* XXX - usually won't need 4 compares!
+ */
+ for (i = 0; i < 4; i++) {
+ brw_push_insn_state(p);
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
+ brw_set_predicate_control_flag_value(p, 0xff);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_AND(p, r0uw, brw_flag_reg(), r0uw);
+ brw_pop_insn_state(p);
+ }
+}
+
+/* KILLP kills the pixels that are currently executing, not based on a test
+ * of the arguments.
+ */
+static void emit_killp( struct brw_wm_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
+ brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
+ brw_pop_insn_state(p);
+}
+
+static void fire_fb_write( struct brw_wm_compile *c,
+ GLuint base_reg,
+ GLuint nr,
+ GLuint target,
+ GLuint eot )
+{
+ struct brw_compile *p = &c->func;
+
+ /* Pass through control information:
+ */
+/* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
+ {
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p,
+ brw_message_reg(base_reg + 1),
+ brw_vec8_grf(1, 0));
+ brw_pop_insn_state(p);
+ }
+
+ /* Send framebuffer write message: */
+/* send (16) null.0<1>:uw m0 r0.0<8;8,1>:uw 0x85a04000:ud { Align1 EOT } */
+ brw_fb_WRITE(p,
+ retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+ base_reg,
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+ target,
+ nr,
+ 0,
+ eot);
+}
+
+
+static void emit_aa( struct brw_wm_compile *c,
+ struct brw_reg *arg1,
+ GLuint reg )
+{
+ struct brw_compile *p = &c->func;
+ GLuint comp = c->key.aa_dest_stencil_reg / 2;
+ GLuint off = c->key.aa_dest_stencil_reg % 2;
+ struct brw_reg aa = offset(arg1[comp], off);
+
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
+ brw_MOV(p, brw_message_reg(reg), aa);
+ brw_pop_insn_state(p);
+}
+
+
+/* Post-fragment-program processing. Send the results to the
+ * framebuffer.
+ * \param arg0 the fragment color
+ * \param arg1 the pass-through depth value
+ * \param arg2 the shader-computed depth value
+ */
+static void emit_fb_write( struct brw_wm_compile *c,
+ struct brw_reg *arg0,
+ struct brw_reg *arg1,
+ struct brw_reg *arg2,
+ GLuint target,
+ GLuint eot)
+{
+ struct brw_compile *p = &c->func;
+ GLuint nr = 2;
+ GLuint channel;
+
+ /* Reserve a space for AA - may not be needed:
+ */
+ if (c->key.aa_dest_stencil_reg)
+ nr += 1;
+
+ /* I don't really understand how this achieves the color interleave
+ * (ie RGBARGBA) in the result: [Do the saturation here]
+ */
+ {
+ brw_push_insn_state(p);
+
+ for (channel = 0; channel < 4; channel++) {
+ /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
+ /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p,
+ brw_message_reg(nr + channel),
+ arg0[channel]);
+
+ brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
+ brw_MOV(p,
+ brw_message_reg(nr + channel + 4),
+ sechalf(arg0[channel]));
+ }
+
+ /* skip over the regs populated above:
+ */
+ nr += 8;
+
+ brw_pop_insn_state(p);
+ }
+
+ if (c->key.source_depth_to_render_target)
+ {
+ if (c->key.computes_depth)
+ brw_MOV(p, brw_message_reg(nr), arg2[2]);
+ else
+ brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
+
+ nr += 2;
+ }
+
+ if (c->key.dest_depth_reg)
+ {
+ GLuint comp = c->key.dest_depth_reg / 2;
+ GLuint off = c->key.dest_depth_reg % 2;
+
+ if (off != 0) {
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+ brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+ /* 2nd half? */
+ brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+ brw_pop_insn_state(p);
+ }
+ else {
+ brw_MOV(p, brw_message_reg(nr), arg1[comp]);
+ }
+ nr += 2;
+ }
+
+ if (!c->key.runtime_check_aads_emit) {
+ if (c->key.aa_dest_stencil_reg)
+ emit_aa(c, arg1, 2);
+
+ fire_fb_write(c, 0, nr, target, eot);
+ }
+ else {
+ struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+ struct brw_reg ip = brw_ip_reg();
+ struct brw_instruction *jmp;
+
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+ brw_AND(p,
+ v1_null_ud,
+ get_element_ud(brw_vec8_grf(1,0), 6),
+ brw_imm_ud(1<<26));
+
+ jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
+ {
+ emit_aa(c, arg1, 2);
+ fire_fb_write(c, 0, nr, target, eot);
+ /* note - thread killed in subroutine */
+ }
+ brw_land_fwd_jump(p, jmp);
+
+ /* ELSE: Shuffle up one register to fill in the hole left for AA:
+ */
+ fire_fb_write(c, 1, nr-1, target, eot);
+ }
+}
+
+
+/**
+ * Move a GPR to scratch memory.
+ */
+static void emit_spill( struct brw_wm_compile *c,
+ struct brw_reg reg,
+ GLuint slot )
+{
+ struct brw_compile *p = &c->func;
+
+ /*
+ mov (16) m2.0<1>:ud r2.0<8;8,1>:ud { Align1 Compr }
+ */
+ brw_MOV(p, brw_message_reg(2), reg);
+
+ /*
+ mov (1) r0.2<1>:d 0x00000080:d { Align1 NoMask }
+ send (16) null.0<1>:uw m1 r0.0<8;8,1>:uw 0x053003ff:ud { Align1 }
+ */
+ brw_dp_WRITE_16(p,
+ retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
+ slot);
+}
+
+
+/**
+ * Load a GPR from scratch memory.
+ */
+static void emit_unspill( struct brw_wm_compile *c,
+ struct brw_reg reg,
+ GLuint slot )
+{
+ struct brw_compile *p = &c->func;
+
+ /* Slot 0 is the undef value.
+ */
+ if (slot == 0) {
+ brw_MOV(p, reg, brw_imm_f(0));
+ return;
+ }
+
+ /*
+ mov (1) r0.2<1>:d 0x000000c0:d { Align1 NoMask }
+ send (16) r110.0<1>:uw m1 r0.0<8;8,1>:uw 0x041243ff:ud { Align1 }
+ */
+
+ brw_dp_READ_16(p,
+ retype(vec16(reg), BRW_REGISTER_TYPE_UW),
+ slot);
+}
+
+
+/**
+ * Retrieve up to 4 GEN4 register pairs for the given wm reg:
+ * Args with unspill_reg != 0 will be loaded from scratch memory.
+ */
+static void get_argument_regs( struct brw_wm_compile *c,
+ struct brw_wm_ref *arg[],
+ struct brw_reg *regs )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (arg[i]) {
+ if (arg[i]->unspill_reg)
+ emit_unspill(c,
+ brw_vec8_grf(arg[i]->unspill_reg, 0),
+ arg[i]->value->spill_slot);
+
+ regs[i] = arg[i]->hw_reg;
+ }
+ else {
+ regs[i] = brw_null_reg();
+ }
+ }
+}
+
+
+/**
+ * For values that have a spill_slot!=0, write those regs to scratch memory.
+ */
+static void spill_values( struct brw_wm_compile *c,
+ struct brw_wm_value *values,
+ GLuint nr )
+{
+ GLuint i;
+
+ for (i = 0; i < nr; i++)
+ if (values[i].spill_slot)
+ emit_spill(c, values[i].hw_reg, values[i].spill_slot);
+}
+
+
+/* Emit the fragment program instructions here.
+ */
+void brw_wm_emit( struct brw_wm_compile *c )
+{
+ struct brw_compile *p = &c->func;
+ GLuint insn;
+
+ brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+
+ /* Check if any of the payload regs need to be spilled:
+ */
+ spill_values(c, c->payload.depth, 4);
+ spill_values(c, c->creg, c->nr_creg);
+ spill_values(c, c->payload.input_interp, PIPE_MAX_SHADER_INPUTS);
+
+
+ for (insn = 0; insn < c->nr_insns; insn++) {
+
+ struct brw_wm_instruction *inst = &c->instruction[insn];
+ struct brw_reg args[3][4], dst[4];
+ GLuint i, dst_flags;
+
+ /* Get argument regs:
+ */
+ for (i = 0; i < 3; i++)
+ get_argument_regs(c, inst->src[i], args[i]);
+
+ /* Get dest regs:
+ */
+ for (i = 0; i < 4; i++)
+ if (inst->dst[i])
+ dst[i] = inst->dst[i]->hw_reg;
+ else
+ dst[i] = brw_null_reg();
+
+ /* Flags
+ */
+ dst_flags = inst->writemask;
+ if (inst->saturate)
+ dst_flags |= SATURATE;
+
+ switch (inst->opcode) {
+ /* Generated instructions for calculating triangle interpolants:
+ */
+ case WM_PIXELXY:
+ emit_pixel_xy(p, dst, dst_flags);
+ break;
+
+ case WM_DELTAXY:
+ emit_delta_xy(p, dst, dst_flags, args[0]);
+ break;
+
+ case WM_WPOSXY:
+ emit_wpos_xy(c, dst, dst_flags, args[0]);
+ break;
+
+ case WM_PIXELW:
+ emit_pixel_w(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case WM_LINTERP:
+ emit_linterp(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case WM_PINTERP:
+ emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
+ break;
+
+ case WM_CINTERP:
+ emit_cinterp(p, dst, dst_flags, args[0]);
+ break;
+
+ case WM_FB_WRITE:
+ emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
+ break;
+
+ case WM_FRONTFACING:
+ emit_frontfacing(p, dst, dst_flags);
+ break;
+
+ /* Straightforward arithmetic:
+ */
+ case TGSI_OPCODE_ADD:
+ emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_FRC:
+ emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_FLR:
+ emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_DDX:
+ emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
+ break;
+
+ case TGSI_OPCODE_DDY:
+ emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
+ break;
+
+ case TGSI_OPCODE_DP3:
+ emit_dp3(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_DP4:
+ emit_dp4(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_DPH:
+ emit_dph(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_TRUNC:
+ emit_trunc(p, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_LRP:
+ emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
+ break;
+
+ case TGSI_OPCODE_MAD:
+ emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
+ break;
+
+ case TGSI_OPCODE_MOV:
+ emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_MUL:
+ emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_XPD:
+ emit_xpd(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ /* Higher math functions:
+ */
+ case TGSI_OPCODE_RCP:
+ emit_math1(p, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_RSQ:
+ emit_math1(p, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_SIN:
+ emit_math1(p, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_COS:
+ emit_math1(p, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_EX2:
+ emit_math1(p, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_LG2:
+ emit_math1(p, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
+ break;
+
+ case TGSI_OPCODE_SCS:
+ /* There is an SCS math function, but it would need some
+ * fixup for 16-element execution.
+ */
+ if (dst_flags & BRW_WRITEMASK_X)
+ emit_math1(p, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+ if (dst_flags & BRW_WRITEMASK_Y)
+ emit_math1(p, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|BRW_WRITEMASK_X, args[0]);
+ break;
+
+ case TGSI_OPCODE_POW:
+ emit_math2(p, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
+ break;
+
+ /* Comparisons:
+ */
+ case TGSI_OPCODE_CMP:
+ emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
+ break;
+
+ case TGSI_OPCODE_MAX:
+ emit_max(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_MIN:
+ emit_min(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_SLT:
+ emit_slt(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_SLE:
+ emit_sle(p, dst, dst_flags, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SGT:
+ emit_sgt(p, dst, dst_flags, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SGE:
+ emit_sge(p, dst, dst_flags, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SEQ:
+ emit_seq(p, dst, dst_flags, args[0], args[1]);
+ break;
+ case TGSI_OPCODE_SNE:
+ emit_sne(p, dst, dst_flags, args[0], args[1]);
+ break;
+
+ case TGSI_OPCODE_LIT:
+ emit_lit(p, dst, dst_flags, args[0]);
+ break;
+
+ /* Texturing operations:
+ */
+ case TGSI_OPCODE_TEX:
+ emit_tex(c, inst, dst, dst_flags, args[0], inst->sampler);
+ break;
+
+ case TGSI_OPCODE_TXB:
+ emit_txb(c, inst, dst, dst_flags, args[0], inst->sampler);
+ break;
+
+ case TGSI_OPCODE_KIL:
+ emit_kil(c, args[0]);
+ break;
+
+ case TGSI_OPCODE_KILP:
+ emit_killp(c);
+ break;
+
+ default:
+ debug_printf("Unsupported opcode %i (%s) in fragment shader\n",
+ inst->opcode,
+ tgsi_get_opcode_info(inst->opcode)->mnemonic);
+ }
+
+ for (i = 0; i < 4; i++)
+ if (inst->dst[i] && inst->dst[i]->spill_slot)
+ emit_spill(c,
+ inst->dst[i]->hw_reg,
+ inst->dst[i]->spill_slot);
+ }
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ debug_printf("wm-native:\n");
+ brw_disasm(stderr, p->store, p->nr_insn);
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_fp.c b/src/gallium/drivers/i965/brw_wm_fp.c
new file mode 100644
index 00000000000..9c5b527f897
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_fp.c
@@ -0,0 +1,1224 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_util.h"
+
+#include "brw_wm.h"
+#include "brw_util.h"
+#include "brw_debug.h"
+
+
+/***********************************************************************
+ * Source regs
+ */
+
+static struct brw_fp_src src_reg(GLuint file, GLuint idx)
+{
+ struct brw_fp_src reg;
+ reg.file = file;
+ reg.index = idx;
+ reg.swizzle = BRW_SWIZZLE_XYZW;
+ reg.indirect = 0;
+ reg.negate = 0;
+ reg.abs = 0;
+ return reg;
+}
+
+static struct brw_fp_src src_reg_from_dst(struct brw_fp_dst dst)
+{
+ return src_reg(dst.file, dst.index);
+}
+
+static struct brw_fp_src src_undef( void )
+{
+ return src_reg(TGSI_FILE_NULL, 0);
+}
+
+static GLboolean src_is_undef(struct brw_fp_src src)
+{
+ return src.file == TGSI_FILE_NULL;
+}
+
+static struct brw_fp_src src_swizzle( struct brw_fp_src reg, int x, int y, int z, int w )
+{
+ unsigned swz = reg.swizzle;
+
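+ /* Compose the requested selection on top of the existing swizzle:
+ * each 2-bit field of the new swizzle picks whichever channel the
+ * incoming swizzle already maps that position to.
+ */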
+ reg.swizzle = ( BRW_GET_SWZ(swz, x) << 0 |
+ BRW_GET_SWZ(swz, y) << 2 |
+ BRW_GET_SWZ(swz, z) << 4 |
+ BRW_GET_SWZ(swz, w) << 6 );
+
+ return reg;
+}
+
+static struct brw_fp_src src_scalar( struct brw_fp_src reg, int x )
+{
+ return src_swizzle(reg, x, x, x, x);
+}
+
+static struct brw_fp_src src_abs( struct brw_fp_src src )
+{
+ src.negate = 0;
+ src.abs = 1;
+ return src;
+}
+
+static struct brw_fp_src src_negate( struct brw_fp_src src )
+{
+ src.negate = 1;
+ src.abs = 0;
+ return src;
+}
+
+
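+/* Try to locate every component of v[] in the existing immediate
+ * vector v2[], appending components that are missing while there is
+ * room (at most 4).  On success *swizzle records where each input
+ * component ended up; returns FALSE if v2[] cannot hold them all.
+ */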
+static int match_or_expand_immediate( const float *v,
+ unsigned nr,
+ float *v2,
+ unsigned *nr2,
+ unsigned *swizzle )
+{
+ unsigned i, j;
+
+ *swizzle = 0;
+
+ for (i = 0; i < nr; i++) {
+ boolean found = FALSE;
+
+ for (j = 0; j < *nr2 && !found; j++) {
+ if (v[i] == v2[j]) {
+ *swizzle |= j << (i * 2);
+ found = TRUE;
+ }
+ }
+
+ if (!found) {
+ if (*nr2 >= 4)
+ return FALSE;
+
+ v2[*nr2] = v[i];
+ *swizzle |= *nr2 << (i * 2);
+ (*nr2)++;
+ }
+ }
+
+ return TRUE;
+}
+
+
+
+/* Internally generated immediates: overkill...
+ */
+static struct brw_fp_src src_imm( struct brw_wm_compile *c,
+ const GLfloat *v,
+ unsigned nr)
+{
+ unsigned i, j;
+ unsigned swizzle;
+
+ /* Could do a first pass where we examine all existing immediates
+ * without expanding.
+ */
+
+ for (i = 0; i < c->nr_immediates; i++) {
+ if (match_or_expand_immediate( v,
+ nr,
+ c->immediate[i].v,
+ &c->immediate[i].nr,
+ &swizzle ))
+ goto out;
+ }
+
+ if (c->nr_immediates < Elements(c->immediate)) {
+ i = c->nr_immediates++;
+ if (match_or_expand_immediate( v,
+ nr,
+ c->immediate[i].v,
+ &c->immediate[i].nr,
+ &swizzle ))
+ goto out;
+ }
+
+ c->error = 1;
+ return src_undef();
+
+out:
+ /* Make sure that all referenced elements are from this immediate.
+ * Has the effect of making size-one immediates into scalars.
+ */
+ for (j = nr; j < 4; j++)
+ swizzle |= (swizzle & 0x3) << (j * 2);
+
+ return src_swizzle( src_reg( TGSI_FILE_IMMEDIATE, i ),
+ BRW_GET_SWZ(swizzle, X),
+ BRW_GET_SWZ(swizzle, Y),
+ BRW_GET_SWZ(swizzle, Z),
+ BRW_GET_SWZ(swizzle, W) );
+}
+
+
+
+static struct brw_fp_src src_imm1f( struct brw_wm_compile *c,
+ GLfloat f )
+{
+ return src_imm(c, &f, 1);
+}
+
+static struct brw_fp_src src_imm4f( struct brw_wm_compile *c,
+ GLfloat x,
+ GLfloat y,
+ GLfloat z,
+ GLfloat w)
+{
+ GLfloat f[4] = {x,y,z,w};
+ return src_imm(c, f, 4);
+}
+
+
+
+/***********************************************************************
+ * Dest regs
+ */
+
+static struct brw_fp_dst dst_reg(GLuint file, GLuint idx)
+{
+ struct brw_fp_dst reg;
+ reg.file = file;
+ reg.index = idx;
+ reg.writemask = BRW_WRITEMASK_XYZW;
+ reg.indirect = 0;
+ reg.saturate = 0;
+ return reg;
+}
+
+static struct brw_fp_dst dst_mask( struct brw_fp_dst reg, int mask )
+{
+ reg.writemask &= mask;
+ return reg;
+}
+
+static struct brw_fp_dst dst_undef( void )
+{
+ return dst_reg(TGSI_FILE_NULL, 0);
+}
+
+static boolean dst_is_undef( struct brw_fp_dst dst )
+{
+ return dst.file == TGSI_FILE_NULL;
+}
+
+static struct brw_fp_dst dst_saturate( struct brw_fp_dst reg, boolean flag )
+{
+ reg.saturate = flag;
+ return reg;
+}
+
+static struct brw_fp_dst get_temp( struct brw_wm_compile *c )
+{
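+ /* c->fp_temp is a bitmask of internal temporaries currently in use;
+ * the first clear bit picks the next free TGSI temporary above
+ * fp_first_internal_temp.
+ */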
+ int bit = ffs( ~c->fp_temp );
+
+ if (!bit) {
+ debug_printf("%s: out of temporaries\n", __FILE__);
+ }
+
+ c->fp_temp |= 1<<(bit-1);
+ return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp+(bit-1));
+}
+
+
+static void release_temp( struct brw_wm_compile *c, struct brw_fp_dst temp )
+{
+ c->fp_temp &= ~(1 << (temp.index - c->fp_first_internal_temp));
+}
+
+
+/***********************************************************************
+ * Instructions
+ */
+
+static struct brw_fp_instruction *get_fp_inst(struct brw_wm_compile *c)
+{
+ return &c->fp_instructions[c->nr_fp_insns++];
+}
+
+static struct brw_fp_instruction * emit_tex_op(struct brw_wm_compile *c,
+ GLuint op,
+ struct brw_fp_dst dest,
+ GLuint tex_unit,
+ GLuint target,
+ GLuint sampler,
+ struct brw_fp_src src0,
+ struct brw_fp_src src1,
+ struct brw_fp_src src2 )
+{
+ struct brw_fp_instruction *inst = get_fp_inst(c);
+
+ if (tex_unit || target)
+ assert(op == TGSI_OPCODE_TXP ||
+ op == TGSI_OPCODE_TXB ||
+ op == TGSI_OPCODE_TEX ||
+ op == WM_FB_WRITE);
+
+ inst->opcode = op;
+ inst->dst = dest;
+ inst->tex_unit = tex_unit;
+ inst->target = target;
+ inst->sampler = sampler;
+ inst->src[0] = src0;
+ inst->src[1] = src1;
+ inst->src[2] = src2;
+
+ return inst;
+}
+
+
+static INLINE void emit_op3(struct brw_wm_compile *c,
+ GLuint op,
+ struct brw_fp_dst dest,
+ struct brw_fp_src src0,
+ struct brw_fp_src src1,
+ struct brw_fp_src src2 )
+{
+ emit_tex_op(c, op, dest, 0, 0, 0, src0, src1, src2);
+}
+
+
+static INLINE void emit_op2(struct brw_wm_compile *c,
+ GLuint op,
+ struct brw_fp_dst dest,
+ struct brw_fp_src src0,
+ struct brw_fp_src src1)
+{
+ emit_tex_op(c, op, dest, 0, 0, 0, src0, src1, src_undef());
+}
+
+static INLINE void emit_op1(struct brw_wm_compile *c,
+ GLuint op,
+ struct brw_fp_dst dest,
+ struct brw_fp_src src0)
+{
+ emit_tex_op(c, op, dest, 0, 0, 0, src0, src_undef(), src_undef());
+}
+
+static INLINE void emit_op0(struct brw_wm_compile *c,
+ GLuint op,
+ struct brw_fp_dst dest)
+{
+ emit_tex_op(c, op, dest, 0, 0, 0, src_undef(), src_undef(), src_undef());
+}
+
+
+
+/* Many opcodes produce the same value across all the result channels.
+ * We'd rather not have to support that splatting in the opcode implementations,
+ * and brw_wm_pass*.c wants to optimize them out by shuffling references around
+ * anyway. We can easily get both by emitting the opcode to one channel, and
+ * then MOVing it to the others, which brw_wm_pass*.c already understands.
+ */
+static void emit_scalar_insn(struct brw_wm_compile *c,
+ unsigned opcode,
+ struct brw_fp_dst dst,
+ struct brw_fp_src src0,
+ struct brw_fp_src src1,
+ struct brw_fp_src src2 )
+{
+ unsigned first_chan, first_mask;
+
+ if (dst.writemask == 0)
+ return;
+
+ first_chan = ffs(dst.writemask) - 1;
+ first_mask = 1 << first_chan;
+
+ emit_op3( c, opcode,
+ dst_mask(dst, first_mask),
+ src0, src1, src2 );
+
+ if (dst.writemask != first_mask) {
+ emit_op1(c, TGSI_OPCODE_MOV,
+ dst_mask(dst, ~first_mask),
+ src_scalar(src_reg_from_dst(dst), first_chan));
+ }
+}
+
+
+/***********************************************************************
+ * Special instructions for interpolation and other tasks
+ */
+
+static struct brw_fp_src get_pixel_xy( struct brw_wm_compile *c )
+{
+ if (src_is_undef(c->fp_pixel_xy)) {
+ struct brw_fp_dst pixel_xy = get_temp(c);
+ struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
+
+
+ /* Emit the calculation up front and hold onto the result so later
+ * uses can reference it; a temporary register is required to carry
+ * the value.
+ */
+ /* pixel_xy.xy = PIXELXY payload[0];
+ */
+ emit_op1(c,
+ WM_PIXELXY,
+ dst_mask(pixel_xy, BRW_WRITEMASK_XY),
+ payload_r0_depth);
+
+ c->fp_pixel_xy = src_reg_from_dst(pixel_xy);
+ }
+
+ return c->fp_pixel_xy;
+}
+
+static struct brw_fp_src get_delta_xy( struct brw_wm_compile *c )
+{
+ if (src_is_undef(c->fp_delta_xy)) {
+ struct brw_fp_dst delta_xy = get_temp(c);
+ struct brw_fp_src pixel_xy = get_pixel_xy(c);
+ struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
+
+ /* deltas.xy = DELTAXY pixel_xy, payload[0]
+ */
+ emit_op3(c,
+ WM_DELTAXY,
+ dst_mask(delta_xy, BRW_WRITEMASK_XY),
+ pixel_xy,
+ payload_r0_depth,
+ src_undef());
+
+ c->fp_delta_xy = src_reg_from_dst(delta_xy);
+ }
+
+ return c->fp_delta_xy;
+}
+
+static struct brw_fp_src get_pixel_w( struct brw_wm_compile *c )
+{
+ if (src_is_undef(c->fp_pixel_w)) {
+ struct brw_fp_dst pixel_w = get_temp(c);
+ struct brw_fp_src deltas = get_delta_xy(c);
+
+ /* XXX: assuming position is always first -- valid?
+ */
+ struct brw_fp_src interp_wpos = src_reg(BRW_FILE_PAYLOAD, 0);
+
+ /* pixel_w.w = PIXELW payload.interp_wpos, deltas
+ */
+ emit_op3(c,
+ WM_PIXELW,
+ dst_mask(pixel_w, BRW_WRITEMASK_W),
+ interp_wpos,
+ deltas,
+ src_undef());
+
+
+ c->fp_pixel_w = src_reg_from_dst(pixel_w);
+ }
+
+ return c->fp_pixel_w;
+}
+
+
+/***********************************************************************
+ * Emit INTERP instructions ahead of first use of each attrib.
+ */
+
+static void emit_interp( struct brw_wm_compile *c,
+ GLuint idx,
+ GLuint semantic,
+ GLuint interp_mode )
+{
+ struct brw_fp_dst dst = dst_reg(TGSI_FILE_INPUT, idx);
+ struct brw_fp_src interp = src_reg(BRW_FILE_PAYLOAD, idx);
+ struct brw_fp_src deltas = get_delta_xy(c);
+
+ /* Need to use PINTERP on attributes which have been
+ * multiplied by 1/W in the SF program, and LINTERP on those
+ * which have not:
+ */
+ switch (semantic) {
+ case TGSI_SEMANTIC_POSITION:
+ /* Have to treat wpos.xy specially:
+ */
+ emit_op1(c,
+ WM_WPOSXY,
+ dst_mask(dst, BRW_WRITEMASK_XY),
+ get_pixel_xy(c));
+
+ /* TGSI_FILE_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
+ */
+ emit_op2(c,
+ WM_LINTERP,
+ dst_mask(dst, BRW_WRITEMASK_ZW),
+ interp,
+ deltas);
+ break;
+
+ case TGSI_SEMANTIC_COLOR:
+ if (c->key.flat_shade) {
+ emit_op1(c,
+ WM_CINTERP,
+ dst,
+ interp);
+ }
+ else if (interp_mode == TGSI_INTERPOLATE_LINEAR) {
+ emit_op2(c,
+ WM_LINTERP,
+ dst,
+ interp,
+ deltas);
+ }
+ else {
+ emit_op3(c,
+ WM_PINTERP,
+ dst,
+ interp,
+ deltas,
+ get_pixel_w(c));
+ }
+
+ break;
+
+ case TGSI_SEMANTIC_FOG:
+ /* Interpolate the fog coordinate */
+ emit_op3(c,
+ WM_PINTERP,
+ dst_mask(dst, BRW_WRITEMASK_X),
+ interp,
+ deltas,
+ get_pixel_w(c));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_YZ),
+ src_imm1f(c, 0.0));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_W),
+ src_imm1f(c, 1.0));
+ break;
+
+ case TGSI_SEMANTIC_FACE:
+ /* XXX review/test this case */
+ emit_op0(c,
+ WM_FRONTFACING,
+ dst_mask(dst, BRW_WRITEMASK_X));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_YZ),
+ src_imm1f(c, 0.0));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_W),
+ src_imm1f(c, 1.0));
+ break;
+
+ case TGSI_SEMANTIC_PSIZE:
+ /* XXX review/test this case */
+ emit_op3(c,
+ WM_PINTERP,
+ dst_mask(dst, BRW_WRITEMASK_XY),
+ interp,
+ deltas,
+ get_pixel_w(c));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_Z),
+ src_imm1f(c, 0.0f));
+
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_W),
+ src_imm1f(c, 1.0f));
+ break;
+
+ default:
+ switch (interp_mode) {
+ case TGSI_INTERPOLATE_CONSTANT:
+ emit_op1(c,
+ WM_CINTERP,
+ dst,
+ interp);
+ break;
+
+ case TGSI_INTERPOLATE_LINEAR:
+ emit_op2(c,
+ WM_LINTERP,
+ dst,
+ interp,
+ deltas);
+ break;
+
+ case TGSI_INTERPOLATE_PERSPECTIVE:
+ emit_op3(c,
+ WM_PINTERP,
+ dst,
+ interp,
+ deltas,
+ get_pixel_w(c));
+ break;
+ }
+ break;
+ }
+}
+
+
+/***********************************************************************
+ * Expand various instructions here to simpler forms.
+ */
+static void precalc_dst( struct brw_wm_compile *c,
+ struct brw_fp_dst dst,
+ struct brw_fp_src src0,
+ struct brw_fp_src src1 )
+{
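+ /* TGSI DST produces (1, src0.y*src1.y, src0.z, src1.w); expand it
+ * into per-channel MOV/MUL instructions according to the writemask.
+ */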
+ if (dst.writemask & BRW_WRITEMASK_Y) {
+ /* dst.y = mul src0.y, src1.y
+ */
+ emit_op2(c,
+ TGSI_OPCODE_MUL,
+ dst_mask(dst, BRW_WRITEMASK_Y),
+ src0,
+ src1);
+ }
+
+ if (dst.writemask & BRW_WRITEMASK_XZ) {
+ /* dst.z = mov src0.zzzz
+ */
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_Z),
+ src_scalar(src0, Z));
+
+ /* dst.x = imm1f(1.0)
+ */
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_saturate(dst_mask(dst, BRW_WRITEMASK_X), 0),
+ src_imm1f(c, 1.0));
+ }
+ if (dst.writemask & BRW_WRITEMASK_W) {
+ /* dst.w = mov src1.w
+ */
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_mask(dst, BRW_WRITEMASK_W),
+ src1);
+ }
+}
+
+
+static void precalc_lit( struct brw_wm_compile *c,
+ struct brw_fp_dst dst,
+ struct brw_fp_src src0 )
+{
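+ /* LIT always produces 1.0 in x and w; only y and z need the real
+ * LIT computation, so split the writemask accordingly.
+ */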
+ if (dst.writemask & BRW_WRITEMASK_XW) {
+ /* dst.xw = imm(1.0f)
+ */
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ dst_saturate(dst_mask(dst, BRW_WRITEMASK_XW), 0),
+ src_imm1f(c, 1.0f));
+ }
+
+ if (dst.writemask & BRW_WRITEMASK_YZ) {
+ emit_op1(c,
+ TGSI_OPCODE_LIT,
+ dst_mask(dst, BRW_WRITEMASK_YZ),
+ src0);
+ }
+}
+
+
+/**
+ * Some TEX instructions require extra code, cube map coordinate
+ * normalization, or coordinate scaling for RECT textures, etc.
+ * This function emits those extra instructions and the TEX
+ * instruction itself.
+ */
+static void precalc_tex( struct brw_wm_compile *c,
+ struct brw_fp_dst dst,
+ unsigned target,
+ unsigned unit,
+ struct brw_fp_src src0,
+ struct brw_fp_src sampler )
+{
+ struct brw_fp_src coord = src_undef();
+ struct brw_fp_dst tmp = dst_undef();
+
+ assert(unit < BRW_MAX_TEX_UNIT);
+
+ /* Cubemap: find longest component of coord vector and normalize
+ * it.
+ */
+ if (target == TGSI_TEXTURE_CUBE) {
+ struct brw_fp_src tmpsrc;
+
+ tmp = get_temp(c);
+ tmpsrc = src_reg_from_dst(tmp);
+
+ /* tmp = abs(src0) */
+ emit_op1(c,
+ TGSI_OPCODE_MOV,
+ tmp,
+ src_abs(src0));
+
+ /* tmp.X = MAX(tmp.X, tmp.Y) */
+ emit_op2(c, TGSI_OPCODE_MAX,
+ dst_mask(tmp, BRW_WRITEMASK_X),
+ src_scalar(tmpsrc, X),
+ src_scalar(tmpsrc, Y));
+
+ /* tmp.X = MAX(tmp.X, tmp.Z) */
+ emit_op2(c, TGSI_OPCODE_MAX,
+ dst_mask(tmp, BRW_WRITEMASK_X),
+ tmpsrc,
+ src_scalar(tmpsrc, Z));
+
+ /* tmp.X = 1 / tmp.X */
+ emit_op1(c, TGSI_OPCODE_RCP,
+ dst_mask(tmp, BRW_WRITEMASK_X),
+ tmpsrc);
+
+ /* tmp = src0 * tmp.xxxx */
+ emit_op2(c, TGSI_OPCODE_MUL,
+ tmp,
+ src0,
+ src_scalar(tmpsrc, X));
+
+ coord = tmpsrc;
+ }
+ else if (target == TGSI_TEXTURE_RECT ||
+ target == TGSI_TEXTURE_SHADOWRECT) {
+ /* XXX: need a mechanism for internally generated constants.
+ */
+ coord = src0;
+ }
+ else {
+ coord = src0;
+ }
+
+ /* Need to emit YUV texture conversions by hand. Probably need to
+ * do this here - the alternative is in brw_wm_emit.c, but the
+ * conversion requires allocating a temporary, and there is no
+ * facility for doing that so late in the compilation.
+ */
+ if (c->key.yuvtex_mask & (1 << unit)) {
+ /* convert ycbcr to RGBA */
+ GLboolean swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
+ struct brw_fp_dst tmp = get_temp(c);
+ struct brw_fp_src tmpsrc = src_reg_from_dst(tmp);
+ struct brw_fp_src C0 = src_imm4f( c, -.5, -.0625, -.5, 1.164 );
+ struct brw_fp_src C1 = src_imm4f( c, 1.596, -0.813, 2.018, -.391 );
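+ /* These look like the usual video-range BT.601 YCbCr->RGB constants:
+ * C0 carries the channel offsets plus the 1.164 luma scale, C1 the
+ * Cr/Cb cross terms (an inference from the values, not documented in
+ * the original code).
+ */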
+
+ /* tmp = TEX ...
+ */
+ emit_tex_op(c,
+ TGSI_OPCODE_TEX,
+ dst_saturate(tmp, dst.saturate),
+ unit,
+ target,
+ sampler.index,
+ coord,
+ src_undef(),
+ src_undef());
+
+ /* tmp.xyz = ADD TMP, C0
+ */
+ emit_op2(c, TGSI_OPCODE_ADD,
+ dst_mask(tmp, BRW_WRITEMASK_XYZ),
+ tmpsrc,
+ C0);
+
+ /* YUV.y = MUL YUV.y, C0.w
+ */
+ emit_op2(c, TGSI_OPCODE_MUL,
+ dst_mask(tmp, BRW_WRITEMASK_Y),
+ tmpsrc,
+ src_scalar(C0, W));
+
+ /*
+ * if (UV swapped)
+ * RGB.xyz = MAD YUV.zzx, C1, YUV.y
+ * else
+ * RGB.xyz = MAD YUV.xxz, C1, YUV.y
+ */
+
+ emit_op3(c, TGSI_OPCODE_MAD,
+ dst_mask(dst, BRW_WRITEMASK_XYZ),
+ ( swap_uv ?
+ src_swizzle(tmpsrc, Z,Z,X,X) :
+ src_swizzle(tmpsrc, X,X,Z,Z)),
+ C1,
+ src_scalar(tmpsrc, Y));
+
+ /* RGB.y = MAD YUV.z, C1.w, RGB.y
+ */
+ emit_op3(c,
+ TGSI_OPCODE_MAD,
+ dst_mask(dst, BRW_WRITEMASK_Y),
+ src_scalar(tmpsrc, Z),
+ src_scalar(C1, W),
+ src_scalar(src_reg_from_dst(dst), Y));
+
+ release_temp(c, tmp);
+ }
+ else {
+ /* ordinary RGBA tex instruction */
+ emit_tex_op(c,
+ TGSI_OPCODE_TEX,
+ dst,
+ unit,
+ target,
+ sampler.index,
+ coord,
+ src_undef(),
+ src_undef());
+ }
+
+ /* XXX: add GL_EXT_texture_swizzle support to gallium -- by
+ * generating shader variants in the Mesa state tracker.
+ */
+
+ /* Release this temp if we ended up allocating it:
+ */
+ if (!dst_is_undef(tmp))
+ release_temp(c, tmp);
+}
+
+
+/**
+ * Check if the given TXP instruction really needs the divide-by-W step.
+ */
+static GLboolean projtex( struct brw_wm_compile *c,
+ unsigned target,
+ struct brw_fp_src src )
+{
+ /* Only try to detect the simplest cases. Could detect (later)
+ * cases where we are trying to emit code like RCP {1.0}, MUL x,
+ * {1.0}, and so on.
+ *
+ * More complex cases than this typically only arise from
+ * user-provided fragment programs anyway:
+ */
+ if (target == TGSI_TEXTURE_CUBE)
+ return GL_FALSE; /* ut2004 gun rendering !?! */
+
+ if (src.file == TGSI_FILE_INPUT &&
+ BRW_GET_SWZ(src.swizzle, W) == W &&
+ c->fp->info.input_interpolate[src.index] != TGSI_INTERPOLATE_PERSPECTIVE)
+ return GL_FALSE;
+
+ return GL_TRUE;
+}
+
+
+/**
+ * Emit code for TXP.
+ */
+static void precalc_txp( struct brw_wm_compile *c,
+ struct brw_fp_dst dst,
+ unsigned target,
+ unsigned unit,
+ struct brw_fp_src src0,
+ struct brw_fp_src sampler )
+{
+ if (projtex(c, target, src0)) {
+ struct brw_fp_dst tmp = get_temp(c);
+
+ /* tmp0.w = RCP inst.arg[0][3]
+ */
+ emit_op1(c,
+ TGSI_OPCODE_RCP,
+ dst_mask(tmp, BRW_WRITEMASK_W),
+ src_scalar(src0, W));
+
+ /* tmp0.xyz = MUL inst.arg[0], tmp0.wwww
+ */
+ emit_op2(c,
+ TGSI_OPCODE_MUL,
+ dst_mask(tmp, BRW_WRITEMASK_XYZ),
+ src0,
+ src_scalar(src_reg_from_dst(tmp), W));
+
+ /* dst = TEX tmp0
+ */
+ precalc_tex(c,
+ dst,
+ target,
+ unit,
+ src_reg_from_dst(tmp),
+ sampler );
+
+ release_temp(c, tmp);
+ }
+ else
+ {
+ /* dst = TEX src0
+ */
+ precalc_tex(c, dst, target, unit, src0, sampler);
+ }
+}
+
+
+/* XXX: note this returns a src_reg.
+ */
+static struct brw_fp_src
+find_output_by_semantic( struct brw_wm_compile *c,
+ unsigned semantic,
+ unsigned index )
+{
+ const struct tgsi_shader_info *info = &c->fp->info;
+ unsigned i;
+
+ for (i = 0; i < info->num_outputs; i++)
+ if (info->output_semantic_name[i] == semantic &&
+ info->output_semantic_index[i] == index)
+ return src_reg( TGSI_FILE_OUTPUT, i );
+
+ /* If not found, return some arbitrary immediate value:
+ *
+ * XXX: this is a good idea but immediates currently end up generating
+ * extra curbe entries, as they would have in the original driver.
+ */
+ return src_reg( TGSI_FILE_OUTPUT, 0 ); /* src_imm1f(c, 1.0); */
+}
+
+
+static void emit_fb_write( struct brw_wm_compile *c )
+{
+ struct brw_fp_src payload_r0_depth = src_reg(BRW_FILE_PAYLOAD, PAYLOAD_DEPTH);
+ struct brw_fp_src outdepth = find_output_by_semantic(c, TGSI_SEMANTIC_POSITION, 0);
+ GLuint i;
+
+
+ outdepth = src_scalar(outdepth, Z);
+
+ for (i = 0 ; i < c->key.nr_cbufs; i++) {
+ struct brw_fp_src outcolor;
+
+ outcolor = find_output_by_semantic(c, TGSI_SEMANTIC_COLOR, i);
+
+ /* Use emit_tex_op so that we can specify the inst->target
+ * field, which is abused to contain the FB write target and the
+ * EOT marker
+ */
+ emit_tex_op(c, WM_FB_WRITE,
+ dst_undef(),
+ (i == c->key.nr_cbufs - 1), /* EOT */
+ i,
+ 0, /* no sampler */
+ outcolor,
+ payload_r0_depth,
+ outdepth);
+ }
+}
+
+
+static struct brw_fp_dst translate_dst( struct brw_wm_compile *c,
+ const struct tgsi_full_dst_register *dst,
+ unsigned saturate )
+{
+ struct brw_fp_dst out;
+
+ out.file = dst->Register.File;
+ out.index = dst->Register.Index;
+ out.writemask = dst->Register.WriteMask;
+ out.indirect = dst->Register.Indirect;
+ out.saturate = (saturate == TGSI_SAT_ZERO_ONE);
+
+ if (out.indirect) {
+ assert(dst->Indirect.File == TGSI_FILE_ADDRESS);
+ assert(dst->Indirect.Index == 0);
+ }
+
+ return out;
+}
+
+
+static struct brw_fp_src translate_src( struct brw_wm_compile *c,
+ const struct tgsi_full_src_register *src )
+{
+ struct brw_fp_src out;
+
+ out.file = src->Register.File;
+ out.index = src->Register.Index;
+ out.indirect = src->Register.Indirect;
+
+ out.swizzle = ((src->Register.SwizzleX << 0) |
+ (src->Register.SwizzleY << 2) |
+ (src->Register.SwizzleZ << 4) |
+ (src->Register.SwizzleW << 6));
+
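+ /* Fold the TGSI sign mode into abs/negate flags:
+ * SIGN_CLEAR -> |x|, SIGN_SET -> -|x|,
+ * SIGN_TOGGLE -> -x, SIGN_KEEP -> x
+ */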
+ switch (tgsi_util_get_full_src_register_sign_mode( src, 0 )) {
+ case TGSI_UTIL_SIGN_CLEAR:
+ out.abs = 1;
+ out.negate = 0;
+ break;
+
+ case TGSI_UTIL_SIGN_SET:
+ out.abs = 1;
+ out.negate = 1;
+ break;
+
+ case TGSI_UTIL_SIGN_TOGGLE:
+ out.abs = 0;
+ out.negate = 1;
+ break;
+
+ case TGSI_UTIL_SIGN_KEEP:
+ default:
+ out.abs = 0;
+ out.negate = 0;
+ break;
+ }
+
+ if (out.indirect) {
+ assert(src->Indirect.File == TGSI_FILE_ADDRESS);
+ assert(src->Indirect.Index == 0);
+ }
+
+ return out;
+}
+
+
+
+static void emit_insn( struct brw_wm_compile *c,
+ const struct tgsi_full_instruction *inst )
+{
+ unsigned opcode = inst->Instruction.Opcode;
+ struct brw_fp_dst dst;
+ struct brw_fp_src src[3];
+ int i;
+
+ dst = translate_dst( c, &inst->Dst[0],
+ inst->Instruction.Saturate );
+
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++)
+ src[i] = translate_src( c, &inst->Src[i] );
+
+ switch (opcode) {
+ case TGSI_OPCODE_ABS:
+ emit_op1(c, TGSI_OPCODE_MOV,
+ dst,
+ src_abs(src[0]));
+ break;
+
+ case TGSI_OPCODE_SUB:
+ emit_op2(c, TGSI_OPCODE_ADD,
+ dst,
+ src[0],
+ src_negate(src[1]));
+ break;
+
+ case TGSI_OPCODE_SCS:
+ emit_op1(c, TGSI_OPCODE_SCS,
+ dst_mask(dst, BRW_WRITEMASK_XY),
+ src[0]);
+ break;
+
+ case TGSI_OPCODE_DST:
+ precalc_dst(c, dst, src[0], src[1]);
+ break;
+
+ case TGSI_OPCODE_LIT:
+ precalc_lit(c, dst, src[0]);
+ break;
+
+ case TGSI_OPCODE_TEX:
+ precalc_tex(c, dst,
+ inst->Texture.Texture,
+ src[1].index, /* use sampler unit for tex idx */
+ src[0], /* coord */
+ src[1]); /* sampler */
+ break;
+
+ case TGSI_OPCODE_TXP:
+ precalc_txp(c, dst,
+ inst->Texture.Texture,
+ src[1].index, /* use sampler unit for tex idx */
+ src[0], /* coord */
+ src[1]); /* sampler */
+ break;
+
+ case TGSI_OPCODE_TXB:
+ /* XXX: TXB not done
+ */
+ precalc_tex(c, dst,
+ inst->Texture.Texture,
+ src[1].index, /* use sampler unit for tex idx*/
+ src[0],
+ src[1]);
+ break;
+
+ case TGSI_OPCODE_XPD:
+ emit_op2(c, TGSI_OPCODE_XPD,
+ dst_mask(dst, BRW_WRITEMASK_XYZ),
+ src[0],
+ src[1]);
+ break;
+
+ case TGSI_OPCODE_KIL:
+ emit_op1(c, TGSI_OPCODE_KIL,
+ dst_mask(dst_undef(), 0),
+ src[0]);
+ break;
+
+ case TGSI_OPCODE_END:
+ emit_fb_write(c);
+ break;
+ default:
+ if (!c->key.has_flow_control &&
+ brw_wm_is_scalar_result(opcode))
+ emit_scalar_insn(c, opcode, dst, src[0], src[1], src[2]);
+ else
+ emit_op3(c, opcode, dst, src[0], src[1], src[2]);
+ break;
+ }
+}
+
+/**
+ * Initial pass for fragment program code generation.
+ * This function is used by both the GLSL and non-GLSL paths.
+ */
+int brw_wm_pass_fp( struct brw_wm_compile *c )
+{
+ struct brw_fragment_shader *fs = c->fp;
+ struct tgsi_parse_context parse;
+ struct tgsi_full_instruction *inst;
+ struct tgsi_full_declaration *decl;
+ const float *imm;
+ GLuint size;
+ GLuint i;
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ debug_printf("pre-fp:\n");
+ tgsi_dump(fs->tokens, 0);
+ }
+
+ c->fp_pixel_xy = src_undef();
+ c->fp_delta_xy = src_undef();
+ c->fp_pixel_w = src_undef();
+ c->nr_fp_insns = 0;
+ c->nr_immediates = 0;
+
+
+ /* Loop over all instructions doing assorted simplifications and
+ * transformations.
+ */
+ tgsi_parse_init( &parse, fs->tokens );
+ while( !tgsi_parse_end_of_tokens( &parse ) ) {
+ tgsi_parse_token( &parse );
+
+ switch( parse.FullToken.Token.Type ) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ /* Turn input declarations into special WM_* instructions.
+ *
+ * XXX: For non-branching shaders, consider deferring variable
+ * initialization as late as possible to minimize register
+ * usage. This is how the original BRW driver worked.
+ *
+ * In a branching shader, the interpolation instructions must be
+ * emitted up front at declaration time, as the order of
+ * instructions in the shader does not correspond to the order in
+ * which they are actually executed.
+ *
+ * This is where special instructions such as WM_CINTERP,
+ * WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+ * compute shader inputs from the payload registers and pixel
+ * position.
+ */
+ decl = &parse.FullToken.FullDeclaration;
+ if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+ unsigned first, last, mask;
+ unsigned attrib;
+
+ first = decl->Range.First;
+ last = decl->Range.Last;
+ mask = decl->Declaration.UsageMask;
+
+ for (attrib = first; attrib <= last; attrib++) {
+ emit_interp(c,
+ attrib,
+ decl->Semantic.Name,
+ decl->Declaration.Interpolate );
+ }
+ }
+
+ break;
+
+ case TGSI_TOKEN_TYPE_IMMEDIATE:
+ /* Unlike VS programs we can probably manage fine encoding
+ * immediate values directly into the emitted EU
+ * instructions, as we probably only need to reference one
+ * float value per instruction. Just save the data for now
+ * and use directly later.
+ */
+ if (c->nr_immediates >= BRW_WM_MAX_CONST)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ /* The immediate data for this token lives in u[0..size-1]: */
+ imm = &parse.FullToken.FullImmediate.u[0].Float;
+ size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+
+ for (i = 0; i < size; i++)
+ c->immediate[c->nr_immediates].v[i] = imm[i];
+
+ for (; i < 4; i++)
+ c->immediate[c->nr_immediates].v[i] = 0.0;
+
+ c->immediate[c->nr_immediates].nr = size;
+ c->nr_immediates++;
+ break;
+
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ inst = &parse.FullToken.FullInstruction;
+ emit_insn(c, inst);
+ break;
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_fp_program( c, "pass_fp" );
+ debug_printf("\n");
+ }
+
+ return c->error;
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_glsl.c b/src/gallium/drivers/i965/brw_wm_glsl.c
new file mode 100644
index 00000000000..3b3afc39d3c
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_glsl.c
@@ -0,0 +1,2032 @@
+#include "util/u_math.h"
+
+
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+
+
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ GLuint component);
+
+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/** Mark GRF register as used. */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+ c->used_grf[r] = GL_TRUE;
+}
+
+
+/** Mark given GRF register as not in use. */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+ /*assert(c->used_grf[r]);*/
+ c->used_grf[r] = GL_FALSE;
+ c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/** Return index of a free GRF, mark it as used. */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+ GLuint r;
+ for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+ if (!c->used_grf[r]) {
+ c->used_grf[r] = GL_TRUE;
+ c->first_free_grf = r + 1; /* a guess */
+ return r;
+ }
+ }
+
+ /* no free temps, try to reclaim some */
+ reclaim_temps(c);
+ c->first_free_grf = 0;
+
+ /* try alloc again */
+ for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+ if (!c->used_grf[r]) {
+ c->used_grf[r] = GL_TRUE;
+ c->first_free_grf = r + 1; /* a guess */
+ return r;
+ }
+ }
+
+ for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+ assert(c->used_grf[r]);
+ }
+
+ /* really, no free GRF regs found */
+ if (!c->out_of_regs) {
+ /* print warning once per compilation */
+ debug_printf("%s: ran out of registers for fragment program", __FUNCTION__);
+ c->out_of_regs = GL_TRUE;
+ }
+
+ return -1;
+}
+
+
+/** Return number of GRF registers used */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+ int r;
+ for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+ if (c->used_grf[r])
+ return r + 1;
+ return 0;
+}
+
+
+
+/**
+ * Record the mapping of a Mesa register to a hardware register.
+ */
+static void set_reg(struct brw_wm_compile *c, int file, int index,
+ int component, struct brw_reg reg)
+{
+ c->wm_regs[file][index][component].reg = reg;
+ c->wm_regs[file][index][component].inited = GL_TRUE;
+}
+
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+ struct brw_reg reg;
+
+ /* if we need to allocate another temp, grow the tmp_regs[] array */
+ if (c->tmp_index == c->tmp_max) {
+ int r = alloc_grf(c);
+ if (r < 0) {
+ /*printf("Out of temps in %s\n", __FUNCTION__);*/
+ r = 50; /* XXX random register! */
+ }
+ c->tmp_regs[ c->tmp_max++ ] = r;
+ }
+
+ /* form the GRF register */
+ reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+ /*printf("alloc_temp %d\n", reg.nr);*/
+ assert(reg.nr < BRW_WM_MAX_GRF);
+ return reg;
+
+}
+
+/**
+ * Save current temp register info.
+ * There must be a matching call to release_tmps().
+ */
+static int mark_tmps(struct brw_wm_compile *c)
+{
+ return c->tmp_index;
+}
+
+static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
+{
+ return brw_vec8_grf( c->tmp_regs[ index ], 0 );
+}
+
+static void release_tmps(struct brw_wm_compile *c, int mark)
+{
+ c->tmp_index = mark;
+}
+
+/**
+ * Convert Mesa src register to brw register.
+ *
+ * Since we're running in SOA mode each Mesa register corresponds to four
+ * hardware registers. We allocate the hardware registers as needed here.
+ *
+ * \param file register file, one of the TGSI_FILE_x values
+ * \param index register number
+ * \param component src component (X=0, Y=1, Z=2, W=3)
+ * \param nr not used?!?
+ * \param neg negate value?
+ * \param abs take absolute value?
+ */
+static struct brw_reg
+get_reg(struct brw_wm_compile *c, int file, int index, int component,
+ int nr, GLuint neg, GLuint abs)
+{
+ struct brw_reg reg;
+ switch (file) {
+ case TGSI_FILE_NULL:
+ return brw_null_reg();
+
+ case TGSI_FILE_CONSTANT:
+ case TGSI_FILE_TEMPORARY:
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_OUTPUT:
+ case BRW_FILE_PAYLOAD:
+ break;
+
+ default:
+ debug_printf("%s: Unexpected file type\n", __FUNCTION__);
+ return brw_null_reg();
+ }
+
+ assert(index < 256);
+ assert(component < 4);
+
+ /* see if we've already allocated a HW register for this Mesa register */
+ if (c->wm_regs[file][index][component].inited) {
+ /* yes, re-use */
+ reg = c->wm_regs[file][index][component].reg;
+ }
+ else {
+ /* no, allocate new register */
+ int grf = alloc_grf(c);
+ /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
+ if (grf < 0) {
+ /* totally out of temps */
+ grf = 51; /* XXX random register! */
+ }
+
+ reg = brw_vec8_grf(grf, 0);
+ /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
+
+ set_reg(c, file, index, component, reg);
+ }
+
+ if (neg & (1 << component)) {
+ reg = negate(reg);
+ }
+ if (abs)
+ reg = brw_abs(reg);
+ return reg;
+}
+
+
+
+
+/**
+ * Find first/last instruction that references each temporary register.
+ */
+GLboolean
+_mesa_find_temp_intervals(const struct prog_instruction *instructions,
+ GLuint numInstructions,
+ GLint intBegin[MAX_PROGRAM_TEMPS],
+ GLint intEnd[MAX_PROGRAM_TEMPS])
+{
+ struct loop_info
+ {
+ GLuint Start, End; /**< Start, end instructions of loop */
+ };
+ struct loop_info loopStack[MAX_LOOP_NESTING];
+ GLuint loopStackDepth = 0;
+ GLuint i;
+
+ for (i = 0; i < MAX_PROGRAM_TEMPS; i++){
+ intBegin[i] = intEnd[i] = -1;
+ }
+
+ /* Scan instructions looking for temporary registers */
+ for (i = 0; i < numInstructions; i++) {
+ const struct prog_instruction *inst = instructions + i;
+ if (inst->Opcode == OPCODE_BGNLOOP) {
+ loopStack[loopStackDepth].Start = i;
+ loopStack[loopStackDepth].End = inst->BranchTarget;
+ loopStackDepth++;
+ }
+ else if (inst->Opcode == OPCODE_ENDLOOP) {
+ loopStackDepth--;
+ }
+ else if (inst->Opcode == OPCODE_CAL) {
+ return GL_FALSE;
+ }
+ else {
+ const GLuint numSrc = 3;
+ GLuint j;
+ for (j = 0; j < numSrc; j++) {
+ if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+ const GLuint index = inst->SrcReg[j].Index;
+ if (inst->SrcReg[j].RelAddr)
+ return GL_FALSE;
+ update_interval(intBegin, intEnd, index, i);
+ if (loopStackDepth > 0) {
+ /* extend temp register's interval to end of loop */
+ GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+ update_interval(intBegin, intEnd, index, loopEnd);
+ }
+ }
+ }
+ if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+ const GLuint index = inst->DstReg.Index;
+ if (inst->DstReg.RelAddr)
+ return GL_FALSE;
+ update_interval(intBegin, intEnd, index, i);
+ if (loopStackDepth > 0) {
+ /* extend temp register's interval to end of loop */
+ GLuint loopEnd = loopStack[loopStackDepth - 1].End;
+ update_interval(intBegin, intEnd, index, loopEnd);
+ }
+ }
+ }
+ }
+
+ return GL_TRUE;
+}
+
+
+/**
+ * This is called if we run out of GRF registers. Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+ GLint intBegin[BRW_WM_MAX_TEMPS];
+ GLint intEnd[BRW_WM_MAX_TEMPS];
+ int index;
+
+ /*printf("Reclaim temps:\n");*/
+
+ _mesa_find_temp_intervals(c->fp_instructions, c->nr_fp_insns,
+ intBegin, intEnd);
+
+ for (index = 0; index < BRW_WM_MAX_TEMPS; index++) {
+ if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+ /* program temp[i] can be freed */
+ int component;
+ /*printf(" temp[%d] is dead\n", index);*/
+ for (component = 0; component < 4; component++) {
+ if (c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited) {
+ int r = c->wm_regs[TGSI_FILE_TEMPORARY][index][component].reg.nr;
+ release_grf(c, r);
+ /*
+ printf(" Reclaim temp %d, reg %d at inst %d\n",
+ index, r, c->cur_inst);
+ */
+ c->wm_regs[TGSI_FILE_TEMPORARY][index][component].inited = GL_FALSE;
+ }
+ }
+ }
+ }
+}
+
+
+
+
+/**
+ * Preallocate registers. This sets up the Mesa to hardware register
+ * mapping for certain registers, such as constants (uniforms/state vars)
+ * and shader inputs.
+ */
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+ int i, j;
+ struct brw_reg reg;
+ int urb_read_length = 0;
+ GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
+ GLuint reg_index = 0;
+
+ memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+ c->first_free_grf = 0;
+
+ for (i = 0; i < 4; i++) {
+ if (i < c->key.nr_depth_regs)
+ reg = brw_vec8_grf(i * 2, 0);
+ else
+ reg = brw_vec8_grf(0, 0);
+ set_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, i, reg);
+ }
+ reg_index += 2 * c->key.nr_depth_regs;
+
+ /* constants */
+ {
+ const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+ const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
+
+ /* use a real constant buffer, or just use a section of the GRF? */
+ /* XXX this heuristic may need adjustment... */
+ if ((nr_params + nr_temps) * 4 + reg_index > 80)
+ c->fp->use_const_buffer = GL_TRUE;
+ else
+ c->fp->use_const_buffer = GL_FALSE;
+ /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
+
+ if (c->fp->use_const_buffer) {
+ /* We'll use a real constant buffer and fetch constants from
+ * it with a dataport read message.
+ */
+
+ /* number of float constants in CURBE */
+ c->prog_data.nr_params = 0;
+ }
+ else {
+ const struct gl_program_parameter_list *plist =
+ c->fp->program.Base.Parameters;
+ int index = 0;
+
+ /* number of float constants in CURBE */
+ c->prog_data.nr_params = 4 * nr_params;
+
+ /* loop over program constants (float[4]) */
+ for (i = 0; i < nr_params; i++) {
+ /* loop over XYZW channels */
+ for (j = 0; j < 4; j++, index++) {
+ reg = brw_vec1_grf(reg_index + index / 8, index % 8);
+ /* Save pointer to parameter/constant value.
+ * Constants will be copied in prepare_constant_buffer()
+ */
+ c->prog_data.param[index] = &plist->ParameterValues[i][j];
+ set_reg(c, TGSI_FILE_STATE_VAR, i, j, reg);
+ }
+ }
+ /* number of constant regs used (each reg is float[8]) */
+ c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
+ reg_index += c->nr_creg;
+ }
+ }
+
+ /* fragment shader inputs */
+ for (i = 0; i < VERT_RESULT_MAX; i++) {
+ int fp_input;
+
+ if (i >= VERT_RESULT_VAR0)
+ fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
+ else if (i <= VERT_RESULT_TEX7)
+ fp_input = i;
+ else
+ fp_input = -1;
+
+ if (fp_input >= 0 && inputs & (1 << fp_input)) {
+ urb_read_length = reg_index;
+ reg = brw_vec8_grf(reg_index, 0);
+ for (j = 0; j < 4; j++)
+ set_reg(c, TGSI_FILE_PAYLOAD, fp_input, j, reg);
+ }
+ if (c->key.nr_vp_outputs > i) {
+ reg_index += 2;
+ }
+ }
+
+ c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+ c->prog_data.urb_read_length = urb_read_length;
+ c->prog_data.curb_read_length = c->nr_creg;
+ c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+ reg_index++;
+ c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+ reg_index += 2;
+
+ /* mark GRF regs [0..reg_index-1] as in-use */
+ for (i = 0; i < reg_index; i++)
+ prealloc_grf(c, i);
+
+ /* Don't use GRF 126, 127. Using them seems to lead to GPU lock-ups */
+ prealloc_grf(c, 126);
+ prealloc_grf(c, 127);
+
+ for (i = 0; i < c->nr_fp_insns; i++) {
+ const struct brw_fp_instruction *inst = &c->fp_instructions[i];
+ struct brw_reg dst[4];
+
+ switch (inst->Opcode) {
+ case OPCODE_TEX:
+ case OPCODE_TXB:
+ /* Allocate the channels of texture results contiguously,
+ * since they are written out that way by the sampler unit.
+ */
+ for (j = 0; j < 4; j++) {
+ dst[j] = get_dst_reg(c, inst, j);
+ if (j != 0)
+ assert(dst[j].nr == dst[j - 1].nr + 1);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* An instruction may reference up to three constants.
+ * They'll be found in these registers.
+ * XXX alloc these on demand!
+ */
+ if (c->fp->use_const_buffer) {
+ for (i = 0; i < 3; i++) {
+ c->current_const[i].index = -1;
+ c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
+ }
+ }
+#if 0
+ printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
+ printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
+#endif
+}
+
+
+/**
+ * Check if any of the instruction's src registers are constants, uniforms,
+ * or statevars. If so, fetch any constants that we don't already have in
+ * the three GRF slots.
+ */
+static void fetch_constants(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint i;
+
+ /* loop over instruction src regs */
+ for (i = 0; i < 3; i++) {
+ const struct prog_src_register *src = &inst->SrcReg[i];
+ if (src->File == TGSI_FILE_IMMEDIATE ||
+ src->File == TGSI_FILE_CONSTANT) {
+ c->current_const[i].index = src->Index;
+
+#if 0
+ printf(" fetch const[%d] for arg %d into reg %d\n",
+ src->Index, i, c->current_const[i].reg.nr);
+#endif
+
+ /* need to fetch the constant now */
+ brw_dp_READ_4(p,
+ c->current_const[i].reg, /* writeback dest */
+ src->RelAddr, /* relative indexing? */
+ 16 * src->Index, /* byte offset */
+ SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
+ );
+ }
+ }
+}
+
+
+/**
+ * Convert Mesa dst register to brw register.
+ */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ GLuint component)
+{
+ const int nr = 1;
+ return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
+ 0, 0);
+}
+
+
+static struct brw_reg
+get_src_reg_const(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ GLuint srcRegIndex, GLuint component)
+{
+ /* We should have already fetched the constant from the constant
+ * buffer in fetch_constants(). Now we just have to return a
+ * register description that extracts the needed component and
+ * smears it across all eight vector components.
+ */
+ const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+ struct brw_reg const_reg;
+
+ assert(component < 4);
+ assert(srcRegIndex < 3);
+ assert(c->current_const[srcRegIndex].index != -1);
+ const_reg = c->current_const[srcRegIndex].reg;
+
+ /* extract desired float from the const_reg, and smear */
+ const_reg = stride(const_reg, 0, 1, 0);
+ const_reg.subnr = component * 4;
+
+ if (src->Negate)
+ const_reg = negate(const_reg);
+ if (src->Abs)
+ const_reg = brw_abs(const_reg);
+
+#if 0
+ printf(" form const[%d].%d for arg %d, reg %d\n",
+ c->current_const[srcRegIndex].index,
+ component,
+ srcRegIndex,
+ const_reg.nr);
+#endif
+
+ return const_reg;
+}
+
+
+/**
+ * Convert Mesa src register to brw register.
+ */
+static struct brw_reg get_src_reg(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ GLuint srcRegIndex, GLuint channel)
+{
+ const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+ const GLuint nr = 1;
+ const GLuint component = BRW_GET_SWZ(src->Swizzle, channel);
+
+ /* Extended swizzle terms */
+ if (component == SWIZZLE_ZERO) {
+ return brw_imm_f(0.0F);
+ }
+ else if (component == SWIZZLE_ONE) {
+ return brw_imm_f(1.0F);
+ }
+
+ if (c->fp->use_const_buffer &&
+ (src->File == TGSI_FILE_STATE_VAR ||
+ src->File == TGSI_FILE_CONSTANT ||
+ src->File == TGSI_FILE_UNIFORM)) {
+ return get_src_reg_const(c, inst, srcRegIndex, component);
+ }
+ else {
+ /* other type of source register */
+ return get_reg(c, src->File, src->Index, component, nr,
+ src->Negate, src->Abs);
+ }
+}
+
+
+/**
+ * Same as get_src_reg(), but if the register is an immediate, emit
+ * a brw_reg encoding the immediate.
+ * Note that a brw instruction only allows one src operand to be an immediate.
+ * For instructions with more than one operand, only the second can be an
+ * immediate. This means that we treat some immediates as constants
+ * (which is why TGSI_FILE_IMMEDIATE is checked in fetch_constants()).
+ *
+ */
+static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ GLuint srcRegIndex, GLuint channel)
+{
+ const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+ if (src->File == TGSI_FILE_IMMEDIATE) {
+ /* an immediate */
+ const int component = BRW_GET_SWZ(src->Swizzle, channel);
+ const GLfloat *param =
+ c->fp->program.Base.Parameters->ParameterValues[src->Index];
+ GLfloat value = param[component];
+ if (src->Negate)
+ value = -value;
+ if (src->Abs)
+ value = FABSF(value);
+#if 0
+ printf(" form immed value %f for chan %d\n", value, channel);
+#endif
+ return brw_imm_f(value);
+ }
+ else {
+ return get_src_reg(c, inst, srcRegIndex, channel);
+ }
+}
+
+
+/**
+ * Subroutines are minimal support for reusable instruction sequences.
+ * They are implemented as simply as possible to minimise overhead: there
+ * is no explicit support for communication between the caller and callee
+ * other than saving the return address in a temporary register, nor is
+ * there any automatic local storage. This implies that great care is
+ * required before attempting reentrancy or any kind of nested
+ * subroutine invocations.
+ */
+static void invoke_subroutine( struct brw_wm_compile *c,
+ enum _subroutine subroutine,
+ void (*emit)( struct brw_wm_compile * ) )
+{
+ struct brw_compile *p = &c->func;
+
+ assert( subroutine < BRW_WM_MAX_SUBROUTINE );
+
+ if( c->subroutines[ subroutine ] ) {
+ /* subroutine previously emitted: reuse existing instructions */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ int here = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
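+ /* Gen4 EU instructions are 16 bytes, so jump offsets are expressed
+ * as an instruction count shifted left by 4. The return address is
+ * two instructions past the current IP, i.e. the instruction
+ * following the IP-adjusting ADD below.
+ */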
+ brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
+
+ brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+ brw_imm_d( ( c->subroutines[ subroutine ] -
+ here - 1 ) << 4 ) );
+ brw_pop_insn_state(p);
+
+ release_tmps( c, mark );
+ } else {
+ /* previously unused subroutine: emit, and mark for later reuse */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ struct brw_instruction *calc;
+ int base = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
+ brw_pop_insn_state(p);
+
+ c->subroutines[ subroutine ] = p->nr_insn;
+
+ emit( c );
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV( p, brw_ip_reg(), return_address );
+ brw_pop_insn_state(p);
+
+ brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
+
+ release_tmps( c, mark );
+ }
+}
+
+static void emit_trunc( struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ struct brw_reg src, dst;
+ dst = get_dst_reg(c, inst, i);
+ src = get_src_reg(c, inst, 0, i);
+ brw_RNDZ(p, dst, src);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_mov( struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ struct brw_reg src, dst;
+ dst = get_dst_reg(c, inst, i);
+ /* XXX some moves from immediate value don't work reliably!!! */
+ /*src = get_src_reg_imm(c, inst, 0, i);*/
+ src = get_src_reg(c, inst, 0, i);
+ brw_MOV(p, dst, src);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_pixel_xy(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+ struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+
+ struct brw_reg dst0, dst1;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ dst0 = get_dst_reg(c, inst, 0);
+ dst1 = get_dst_reg(c, inst, 1);
+ /* Calculate pixel centers by adding 1 or 0 to each of the
+ * micro-tile coordinates passed in r1.
+ */
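+ /* The packed 4-bit vector immediates supply the per-pixel 0/1
+ * offsets: 0x10101010 decodes to 0,1,0,1,... for X and 0x11001100
+ * to 0,0,1,1,... for Y, matching the 2x2 subspan ordering of the
+ * pixels in the payload.
+ */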
+ if (mask & WRITEMASK_X) {
+ brw_ADD(p,
+ vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw, 4), 2, 4, 0),
+ brw_imm_v(0x10101010));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ brw_ADD(p,
+ vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw, 5), 2, 4, 0),
+ brw_imm_v(0x11001100));
+ }
+}
+
+static void emit_delta_xy(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+ struct brw_reg dst0, dst1, src0, src1;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ dst0 = get_dst_reg(c, inst, 0);
+ dst1 = get_dst_reg(c, inst, 1);
+ src0 = get_src_reg(c, inst, 0, 0);
+ src1 = get_src_reg(c, inst, 0, 1);
+ /* Calc delta X,Y by subtracting origin in r1 from the pixel
+ * centers.
+ */
+ if (mask & WRITEMASK_X) {
+ brw_ADD(p,
+ dst0,
+ retype(src0, BRW_REGISTER_TYPE_UW),
+ negate(r1));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ brw_ADD(p,
+ dst1,
+ retype(src1, BRW_REGISTER_TYPE_UW),
+ negate(suboffset(r1,1)));
+
+ }
+}
+
+static void fire_fb_write( struct brw_wm_compile *c,
+ GLuint base_reg,
+ GLuint nr,
+ GLuint target,
+ GLuint eot)
+{
+ struct brw_compile *p = &c->func;
+ /* Pass through control information:
+ */
+ /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
+ {
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+ brw_MOV(p,
+ brw_message_reg(base_reg + 1),
+ brw_vec8_grf(1, 0));
+ brw_pop_insn_state(p);
+ }
+ /* Send framebuffer write message: */
+ brw_fb_WRITE(p,
+ retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+ base_reg,
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+ target,
+ nr,
+ 0,
+ eot);
+}
+
+static void emit_fb_write(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ int nr = 2;
+ int channel;
+ GLuint target, eot;
+ struct brw_reg src0;
+
+ /* Reserve a space for AA - may not be needed:
+ */
+ if (c->key.aa_dest_stencil_reg)
+ nr += 1;
+
+ brw_push_insn_state(p);
+ for (channel = 0; channel < 4; channel++) {
+ src0 = get_src_reg(c, inst, 0, channel);
+ /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
+ /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
+ brw_MOV(p, brw_message_reg(nr + channel), src0);
+ }
+ /* skip over the regs populated above: */
+ nr += 8;
+ brw_pop_insn_state(p);
+
+ if (c->key.source_depth_to_render_target) {
+ if (c->key.computes_depth) {
+ src0 = get_src_reg(c, inst, 2, 2);
+ brw_MOV(p, brw_message_reg(nr), src0);
+ }
+ else {
+ src0 = get_src_reg(c, inst, 1, 1);
+ brw_MOV(p, brw_message_reg(nr), src0);
+ }
+
+ nr += 2;
+ }
+
+ if (c->key.dest_depth_reg) {
+ const GLuint comp = c->key.dest_depth_reg / 2;
+ const GLuint off = c->key.dest_depth_reg % 2;
+
+ if (off != 0) {
+ /* XXX this code needs review/testing */
+ struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+ struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
+
+ brw_push_insn_state(p);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+ brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
+ /* 2nd half? */
+ brw_MOV(p, brw_message_reg(nr+1), arg1_1);
+ brw_pop_insn_state(p);
+ }
+ else
+ {
+ struct brw_reg src = get_src_reg(c, inst, 1, 1);
+ brw_MOV(p, brw_message_reg(nr), src);
+ }
+ nr += 2;
+ }
+
+ target = inst->Aux >> 1;
+ eot = inst->Aux & 1;
+ fire_fb_write(c, 0, nr, target, eot);
+}
+
+static void emit_pixel_w( struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ if (mask & WRITEMASK_W) {
+ struct brw_reg dst, src0, delta0, delta1;
+ struct brw_reg interp3;
+
+ dst = get_dst_reg(c, inst, 3);
+ src0 = get_src_reg(c, inst, 0, 0);
+ delta0 = get_src_reg(c, inst, 1, 0);
+ delta1 = get_src_reg(c, inst, 1, 1);
+
+ interp3 = brw_vec1_grf(src0.nr+1, 4);
+ /* Calc 1/w - just linterp wpos[3] optimized by putting the
+ * result straight into a message reg.
+ */
+ brw_LINE(p, brw_null_reg(), interp3, delta0);
+ brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
+
+ /* Calc w */
+ brw_math_16( p, dst,
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 2, brw_null_reg(),
+ BRW_MATH_PRECISION_FULL);
+ }
+}
+
+static void emit_linterp(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg interp[4];
+ struct brw_reg dst, delta0, delta1;
+ struct brw_reg src0;
+ GLuint nr, i;
+
+ src0 = get_src_reg(c, inst, 0, 0);
+ delta0 = get_src_reg(c, inst, 1, 0);
+ delta1 = get_src_reg(c, inst, 1, 1);
+ nr = src0.nr;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
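+ /* Each interp[n] aliases the plane-equation coefficients for one
+ * result channel in the setup payload; LINE followed by MAC
+ * evaluates a0 + dA/dx * delta_x + dA/dy * delta_y for that channel.
+ */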
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ brw_LINE(p, brw_null_reg(), interp[i], delta0);
+ brw_MAC(p, dst, suboffset(interp[i],1), delta1);
+ }
+ }
+}
+
+static void emit_cinterp(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ struct brw_reg interp[4];
+ struct brw_reg dst, src0;
+ GLuint nr, i;
+
+ src0 = get_src_reg(c, inst, 0, 0);
+ nr = src0.nr;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ brw_MOV(p, dst, suboffset(interp[i],3));
+ }
+ }
+}
+
+static void emit_pinterp(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ struct brw_reg interp[4];
+ struct brw_reg dst, delta0, delta1;
+ struct brw_reg src0, w;
+ GLuint nr, i;
+
+ src0 = get_src_reg(c, inst, 0, 0);
+ delta0 = get_src_reg(c, inst, 1, 0);
+ delta1 = get_src_reg(c, inst, 1, 1);
+ w = get_src_reg(c, inst, 2, 3);
+ nr = src0.nr;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ brw_LINE(p, brw_null_reg(), interp[i], delta0);
+ brw_MAC(p, dst, suboffset(interp[i],1),
+ delta1);
+ brw_MUL(p, dst, dst, w);
+ }
+ }
+}
+
+/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
+static void emit_frontfacing(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+ struct brw_reg dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ brw_MOV(p, dst, brw_imm_f(0.0));
+ }
+ }
+
+ /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+ * us front face
+ */
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
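+ /* brw_CMP with a null destination leaves the following instructions
+ * predicated on the comparison, so the 1.0 MOVs below only land on
+ * front-facing channels; predication is switched back off afterwards.
+ */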
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ brw_MOV(p, dst, brw_imm_f(1.0));
+ }
+ }
+ brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+static void emit_xpd(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
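+ /* Per channel i: the MUL loads -src0[i2]*src1[i1] into the accumulator
+ * and the MAC adds src0[i1]*src1[i2], i.e. dst.x = s0.y*s1.z - s0.z*s1.y.
+ */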
+ for (i = 0; i < 4; i++) {
+ GLuint i2 = (i+2)%3;
+ GLuint i1 = (i+1)%3;
+ if (mask & (1<<i)) {
+ struct brw_reg src0, src1, dst;
+ dst = get_dst_reg(c, inst, i);
+ src0 = negate(get_src_reg(c, inst, 0, i2));
+ src1 = get_src_reg_imm(c, inst, 1, i1);
+ brw_MUL(p, brw_null_reg(), src0, src1);
+ src0 = get_src_reg(c, inst, 0, i1);
+ src1 = get_src_reg_imm(c, inst, 1, i2);
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ brw_MAC(p, dst, src0, src1);
+ brw_set_saturate(p, 0);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dp3(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_reg src0[3], src1[3], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
+
+ if (!(mask & WRITEMASK_XYZW))
+ return;
+
+ assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+ for (i = 0; i < 3; i++) {
+ src0[i] = get_src_reg(c, inst, 0, i);
+ src1[i] = get_src_reg_imm(c, inst, 1, i);
+ }
+
+ dst = get_dst_reg(c, inst, dst_chan);
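+ /* Build the dot product in the accumulator: MUL computes the first
+ * product, the MACs add the rest, and only the final MAC writes the
+ * (optionally saturated) sum to the single destination channel.
+ */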
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0[2], src1[2]);
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dp4(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_reg src0[4], src1[4], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
+
+ if (!(mask & WRITEMASK_XYZW))
+ return;
+
+ assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+ for (i = 0; i < 4; i++) {
+ src0[i] = get_src_reg(c, inst, 0, i);
+ src1[i] = get_src_reg_imm(c, inst, 1, i);
+ }
+ dst = get_dst_reg(c, inst, dst_chan);
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0[3], src1[3]);
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dph(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_reg src0[4], src1[4], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
+
+ if (!(mask & WRITEMASK_XYZW))
+ return;
+
+ assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+ for (i = 0; i < 4; i++) {
+ src0[i] = get_src_reg(c, inst, 0, i);
+ src1[i] = get_src_reg_imm(c, inst, 1, i);
+ }
+ dst = get_dst_reg(c, inst, dst_chan);
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_MAC(p, dst, src0[2], src1[2]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_ADD(p, dst, dst, src1[3]);
+ brw_set_saturate(p, 0);
+}
+
+/**
+ * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
+ * Note that the result of the function is smeared across the dest
+ * register's X, Y, Z and W channels (subject to writemasking of course).
+ */
+static void emit_math1(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst, GLuint func)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
+
+ if (!(mask & WRITEMASK_XYZW))
+ return;
+
+ assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+ /* Get first component of source register */
+ dst = get_dst_reg(c, inst, dst_chan);
+ src0 = get_src_reg(c, inst, 0, 0);
+
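+ /* The extended math unit takes its operand from a message register,
+ * so copy the scalar argument into m2 before issuing the send.
+ */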
+ brw_MOV(p, brw_message_reg(2), src0);
+ brw_math(p,
+ dst,
+ func,
+ (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void emit_rcp(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+}
+
+static void emit_rsq(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+}
+
+static void emit_sin(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+}
+
+static void emit_cos(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+}
+
+static void emit_ex2(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+}
+
+static void emit_lg2(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+}
+
+static void emit_add(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg(c, inst, 0, i);
+ src1 = get_src_reg_imm(c, inst, 1, i);
+ brw_ADD(p, dst, src0, src1);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_arl(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, addr_reg;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+ BRW_ARF_ADDRESS, 0);
+ src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
+ brw_MOV(p, addr_reg, src0);
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_mul(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg(c, inst, 0, i);
+ src1 = get_src_reg_imm(c, inst, 1, i);
+ brw_MUL(p, dst, src0, src1);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_frc(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg_imm(c, inst, 0, i);
+ brw_FRC(p, dst, src0);
+ }
+ }
+ if (inst->SaturateMode != SATURATE_OFF)
+ brw_set_saturate(p, 0);
+}
+
+static void emit_flr(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg_imm(c, inst, 0, i);
+ brw_RNDD(p, dst, src0);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+
+static void emit_min_max(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ const GLuint mask = inst->DstReg.WriteMask;
+ const int mark = mark_tmps(c);
+ int i;
+ brw_push_insn_state(p);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ struct brw_reg real_dst = get_dst_reg(c, inst, i);
+ struct brw_reg src0 = get_src_reg(c, inst, 0, i);
+ struct brw_reg src1 = get_src_reg(c, inst, 1, i);
+ struct brw_reg dst;
+ /* if real_dst==src0 or real_dst==src1 we need to use a temp reg */
+ GLboolean use_temp = brw_same_reg(real_dst, src0) ||
+ brw_same_reg(real_dst, src1);
+ if (use_temp)
+ dst = alloc_tmp(c);
+ else
+ dst = real_dst;
+
+ /*
+ printf(" Min/max: dst %d src0 %d src1 %d\n",
+ dst.nr, src0.nr, src1.nr);
+ */
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MOV(p, dst, src0);
+ brw_set_saturate(p, 0);
+
+ if (inst->Opcode == OPCODE_MIN)
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+ else
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
+
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_MOV(p, dst, src1);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ if (use_temp)
+ brw_MOV(p, real_dst, dst);
+ }
+ }
+ brw_pop_insn_state(p);
+ release_tmps(c, mark);
+}
+
+static void emit_pow(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst, src0, src1;
+ GLuint mask = inst->DstReg.WriteMask;
+ int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
+
+ if (!(mask & WRITEMASK_XYZW))
+ return;
+
+ assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+ dst = get_dst_reg(c, inst, dst_chan);
+ src0 = get_src_reg_imm(c, inst, 0, 0);
+ src1 = get_src_reg_imm(c, inst, 1, 0);
+
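+ /* POW takes two operands through the message registers: base in m2,
+ * exponent in m3.
+ */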
+ brw_MOV(p, brw_message_reg(2), src0);
+ brw_MOV(p, brw_message_reg(3), src1);
+
+ brw_math(p,
+ dst,
+ BRW_MATH_FUNCTION_POW,
+ (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void emit_lrp(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
+ int i;
+ int mark = mark_tmps(c);
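+ /* LRP: dst = src0 * src1 + (1 - src0) * src2. Temporaries are needed
+ * when src1 or src2 aliases the destination, since dst is clobbered
+ * before the final MAC.
+ */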
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg(c, inst, 0, i);
+
+ src1 = get_src_reg_imm(c, inst, 1, i);
+
+ if (src1.nr == dst.nr) {
+ tmp1 = alloc_tmp(c);
+ brw_MOV(p, tmp1, src1);
+ } else
+ tmp1 = src1;
+
+ src2 = get_src_reg(c, inst, 2, i);
+ if (src2.nr == dst.nr) {
+ tmp2 = alloc_tmp(c);
+ brw_MOV(p, tmp2, src2);
+ } else
+ tmp2 = src2;
+
+ brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
+ brw_MUL(p, brw_null_reg(), dst, tmp2);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0, tmp1);
+ brw_set_saturate(p, 0);
+ }
+ release_tmps(c, mark);
+ }
+}
+
+/**
+ * For GLSL shaders, this KIL will be unconditional.
+ * It may be contained inside an IF/ENDIF structure of course.
+ */
+static void emit_kil(struct brw_wm_compile *c)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
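+ /* 'depth' aliases the pixel mask word in the R0 payload header; ANDing
+ * it with the complement of the current execution mask clears the bits
+ * of the active channels, discarding those pixels.
+ */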
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+ brw_AND(p, depth, c->emit_mask_reg, depth);
+ brw_pop_insn_state(p);
+}
+
+static void emit_mad(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, src0, src1, src2;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg(c, inst, 0, i);
+ src1 = get_src_reg_imm(c, inst, 1, i);
+ src2 = get_src_reg_imm(c, inst, 2, i);
+ brw_MUL(p, dst, src0, src1);
+
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_ADD(p, dst, dst, src2);
+ brw_set_saturate(p, 0);
+ }
+ }
+}
+
+static void emit_sop(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst, GLuint cond)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, src0, src1;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i);
+ src0 = get_src_reg(c, inst, 0, i);
+ src1 = get_src_reg_imm(c, inst, 1, i);
+ brw_push_insn_state(p);
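+ /* CMP sets the flag register; the unpredicated MOV writes 0.0 to the
+ * channel and the predicated MOV then overwrites it with 1.0 where the
+ * comparison passed.
+ */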
+ brw_CMP(p, brw_null_reg(), cond, src0, src1);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_MOV(p, dst, brw_imm_f(0.0));
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_MOV(p, dst, brw_imm_f(1.0));
+ brw_pop_insn_state(p);
+ }
+ }
+}
+
+static void emit_slt(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_L);
+}
+
+static void emit_sle(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_LE);
+}
+
+static void emit_sgt(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_G);
+}
+
+static void emit_sge(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_GE);
+}
+
+static void emit_seq(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+}
+
+static void emit_sne(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+}
+
+static INLINE struct brw_reg high_words( struct brw_reg reg )
+{
+ return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
+ 0, 8, 2 );
+}
+
+static INLINE struct brw_reg low_words( struct brw_reg reg )
+{
+ return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
+}
+
+static INLINE struct brw_reg even_bytes( struct brw_reg reg )
+{
+ return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
+}
+
+static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
+{
+ return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
+ 0, 16, 2 );
+}
+
+
+
+static void emit_wpos_xy(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg src0[2], dst[2];
+
+ dst[0] = get_dst_reg(c, inst, 0);
+ dst[1] = get_dst_reg(c, inst, 1);
+
+ src0[0] = get_src_reg(c, inst, 0, 0);
+ src0[1] = get_src_reg(c, inst, 0, 1);
+
+ /* Calculate the pixel offset from window bottom left into destination
+ * X and Y channels.
+ */
+ if (mask & WRITEMASK_X) {
+ /* X' = X */
+ brw_MOV(p,
+ dst[0],
+ retype(src0[0], BRW_REGISTER_TYPE_W));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ /* Y' = height - 1 - Y */
+ brw_ADD(p,
+ dst[1],
+ negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
+ brw_imm_d(c->key.drawable_height - 1));
+ }
+}
+
+/* TODO
+ BIAS on SIMD8 not working yet...
+ */
+static void emit_txb(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst[4], src[4], payload_reg;
+ /* Note: tex_unit was already looked up through SamplerTextures[] */
+ const GLuint unit = inst->tex_unit;
+ GLuint i;
+ GLuint msg_type;
+
+ assert(unit < BRW_MAX_TEX_UNIT);
+
+ payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+ for (i = 0; i < 4; i++)
+ dst[i] = get_dst_reg(c, inst, i);
+ for (i = 0; i < 4; i++)
+ src[i] = get_src_reg(c, inst, 0, i);
+
+ switch (inst->tex_target) {
+ case TEXTURE_1D_INDEX:
+ brw_MOV(p, brw_message_reg(2), src[0]); /* s coord */
+ brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); /* t coord */
+ brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); /* r coord */
+ break;
+ case TEXTURE_2D_INDEX:
+ case TEXTURE_RECT_INDEX:
+ brw_MOV(p, brw_message_reg(2), src[0]);
+ brw_MOV(p, brw_message_reg(3), src[1]);
+ brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+ break;
+ case TEXTURE_3D_INDEX:
+ case TEXTURE_CUBE_INDEX:
+ brw_MOV(p, brw_message_reg(2), src[0]);
+ brw_MOV(p, brw_message_reg(3), src[1]);
+ brw_MOV(p, brw_message_reg(4), src[2]);
+ break;
+ default:
+ /* invalid target */
+ abort();
+ }
+ brw_MOV(p, brw_message_reg(5), src[3]); /* bias */
+ brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); /* ref (unused?) */
+
+ if (BRW_IS_IGDNG(p->brw)) {
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
+ } else {
+ /* Does it work well on SIMD8? */
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+ }
+
+ brw_SAMPLE(p,
+ retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
+ 1, /* msg_reg_nr */
+ retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
+ SURF_INDEX_TEXTURE(unit),
+ unit, /* sampler */
+ inst->DstReg.WriteMask, /* writemask */
+ msg_type, /* msg_type */
+ 4, /* response_length */
+ 4, /* msg_length */
+ 0, /* eot */
+ 1,
+ BRW_SAMPLER_SIMD_MODE_SIMD8);
+}
+
+
+static void emit_tex(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst[4], src[4], payload_reg;
+ /* Note: tex_unit was already looked up through SamplerTextures[] */
+ const GLuint unit = inst->tex_unit;
+ GLuint msg_len;
+ GLuint i, nr;
+ GLuint emit;
+ GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+ GLuint msg_type;
+
+ assert(unit < BRW_MAX_TEX_UNIT);
+
+ payload_reg = get_reg(c, TGSI_FILE_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+ for (i = 0; i < 4; i++)
+ dst[i] = get_dst_reg(c, inst, i);
+ for (i = 0; i < 4; i++)
+ src[i] = get_src_reg(c, inst, 0, i);
+
+ switch (inst->tex_target) {
+ case TEXTURE_1D_INDEX:
+ emit = WRITEMASK_X;
+ nr = 1;
+ break;
+ case TEXTURE_2D_INDEX:
+ case TEXTURE_RECT_INDEX:
+ emit = WRITEMASK_XY;
+ nr = 2;
+ break;
+ case TEXTURE_3D_INDEX:
+ case TEXTURE_CUBE_INDEX:
+ emit = WRITEMASK_XYZ;
+ nr = 3;
+ break;
+ default:
+ /* invalid target */
+ abort();
+ }
+ msg_len = 1;
+
+ /* move/load S, T, R coords */
+ for (i = 0; i < nr; i++) {
+ static const GLuint swz[4] = {0,1,2,2};
+ if (emit & (1<<i))
+ brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
+ else
+ brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
+ msg_len += 1;
+ }
+
+ if (shadow) {
+ brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); /* lod / bias */
+ brw_MOV(p, brw_message_reg(6), src[2]); /* ref value / R coord */
+ }
+
+ if (BRW_IS_IGDNG(p->brw)) {
+ if (shadow)
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
+ else
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
+ } else {
+ /* Does it work for shadow on SIMD8 ? */
+ msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+ }
+
+ brw_SAMPLE(p,
+ retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
+ 1, /* msg_reg_nr */
+ retype(payload_reg, BRW_REGISTER_TYPE_UW), /* src0 */
+ SURF_INDEX_TEXTURE(unit),
+ unit, /* sampler */
+ inst->DstReg.WriteMask, /* writemask */
+ msg_type, /* msg_type */
+ 4, /* response_length */
+ shadow ? 6 : 4, /* msg_length */
+ 0, /* eot */
+ 1,
+ BRW_SAMPLER_SIMD_MODE_SIMD8);
+
+ if (shadow)
+ brw_MOV(p, dst[3], brw_imm_f(1.0));
+}
+
+
+/**
+ * Resolve subroutine calls after code emit is done.
+ */
+static void post_wm_emit( struct brw_wm_compile *c )
+{
+ brw_resolve_cals(&c->func);
+}
+
+static void
+get_argument_regs(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst,
+ int index,
+ struct brw_reg *regs,
+ int mask)
+{
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1 << i))
+ regs[i] = get_src_reg(c, inst, index, i);
+ }
+}
+
+static void brw_wm_emit_branching_shader(struct brw_context *brw, struct brw_wm_compile *c)
+{
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
+ struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+ GLuint i, if_depth = 0, loop_depth = 0;
+ struct brw_compile *p = &c->func;
+ struct brw_indirect stack_index = brw_indirect(0, 0);
+
+ c->out_of_regs = GL_FALSE;
+
+ prealloc_reg(c);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+
+ for (i = 0; i < c->nr_fp_insns; i++) {
+ const struct brw_fp_instruction *inst = &c->fp_instructions[i];
+ int dst_flags;
+ struct brw_reg args[3][4], dst[4];
+ int j;
+
+ c->cur_inst = i;
+
+#if 0
+ debug_printf("Inst %d: ", i);
+ _mesa_print_instruction(inst);
+#endif
+
+ /* fetch any constants that this instruction needs */
+ if (c->fp->use_const_buffer)
+ fetch_constants(c, inst);
+
+ if (inst->CondUpdate)
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ else
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
+ dst_flags = inst->DstReg.WriteMask;
+ if (inst->SaturateMode == SATURATE_ZERO_ONE)
+ dst_flags |= SATURATE;
+
+ switch (inst->Opcode) {
+ case WM_PIXELXY:
+ emit_pixel_xy(c, inst);
+ break;
+ case WM_DELTAXY:
+ emit_delta_xy(c, inst);
+ break;
+ case WM_PIXELW:
+ emit_pixel_w(c, inst);
+ break;
+ case WM_LINTERP:
+ emit_linterp(c, inst);
+ break;
+ case WM_PINTERP:
+ emit_pinterp(c, inst);
+ break;
+ case WM_CINTERP:
+ emit_cinterp(c, inst);
+ break;
+ case WM_WPOSXY:
+ emit_wpos_xy(c, inst);
+ break;
+ case WM_FB_WRITE:
+ emit_fb_write(c, inst);
+ break;
+ case WM_FRONTFACING:
+ emit_frontfacing(c, inst);
+ break;
+ case OPCODE_ADD:
+ emit_add(c, inst);
+ break;
+ case OPCODE_ARL:
+ emit_arl(c, inst);
+ break;
+ case OPCODE_FRC:
+ emit_frc(c, inst);
+ break;
+ case OPCODE_FLR:
+ emit_flr(c, inst);
+ break;
+ case OPCODE_LRP:
+ emit_lrp(c, inst);
+ break;
+ case OPCODE_TRUNC:
+ emit_trunc(c, inst);
+ break;
+ case OPCODE_MOV:
+ emit_mov(c, inst);
+ break;
+ case OPCODE_DP3:
+ emit_dp3(c, inst);
+ break;
+ case OPCODE_DP4:
+ emit_dp4(c, inst);
+ break;
+ case OPCODE_XPD:
+ emit_xpd(c, inst);
+ break;
+ case OPCODE_DPH:
+ emit_dph(c, inst);
+ break;
+ case OPCODE_RCP:
+ emit_rcp(c, inst);
+ break;
+ case OPCODE_RSQ:
+ emit_rsq(c, inst);
+ break;
+ case OPCODE_SIN:
+ emit_sin(c, inst);
+ break;
+ case OPCODE_COS:
+ emit_cos(c, inst);
+ break;
+ case OPCODE_EX2:
+ emit_ex2(c, inst);
+ break;
+ case OPCODE_LG2:
+ emit_lg2(c, inst);
+ break;
+ case OPCODE_MIN:
+ case OPCODE_MAX:
+ emit_min_max(c, inst);
+ break;
+ case OPCODE_DDX:
+ case OPCODE_DDY:
+ for (j = 0; j < 4; j++) {
+ if (inst->DstReg.WriteMask & (1 << j))
+ dst[j] = get_dst_reg(c, inst, j);
+ else
+ dst[j] = brw_null_reg();
+ }
+ get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
+ emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
+ args[0]);
+ break;
+ case OPCODE_SLT:
+ emit_slt(c, inst);
+ break;
+ case OPCODE_SLE:
+ emit_sle(c, inst);
+ break;
+ case OPCODE_SGT:
+ emit_sgt(c, inst);
+ break;
+ case OPCODE_SGE:
+ emit_sge(c, inst);
+ break;
+ case OPCODE_SEQ:
+ emit_seq(c, inst);
+ break;
+ case OPCODE_SNE:
+ emit_sne(c, inst);
+ break;
+ case OPCODE_MUL:
+ emit_mul(c, inst);
+ break;
+ case OPCODE_POW:
+ emit_pow(c, inst);
+ break;
+ case OPCODE_MAD:
+ emit_mad(c, inst);
+ break;
+ case OPCODE_TEX:
+ emit_tex(c, inst);
+ break;
+ case OPCODE_TXB:
+ emit_txb(c, inst);
+ break;
+ case OPCODE_KIL_NV:
+ emit_kil(c);
+ break;
+ case OPCODE_IF:
+ assert(if_depth < MAX_IF_DEPTH);
+ if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
+ break;
+ case OPCODE_ELSE:
+ if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
+ break;
+ case OPCODE_ENDIF:
+ assert(if_depth > 0);
+ brw_ENDIF(p, if_inst[--if_depth]);
+ break;
+ case OPCODE_BGNSUB:
+ brw_save_label(p, inst->Comment, p->nr_insn);
+ break;
+ case OPCODE_ENDSUB:
+ /* no-op */
+ break;
+ case OPCODE_CAL:
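+ /* Push the return address (the instruction just past the jump ADD
+ * below, i.e. IP + 3 instructions) onto the stack, bump the stack
+ * pointer by one dword, record the call site for brw_resolve_cals(),
+ * then branch by adding to the IP register.
+ */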
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_ADD(p, get_addr_reg(stack_index),
+ get_addr_reg(stack_index), brw_imm_d(4));
+ brw_save_call(&c->func, inst->label, p->nr_insn);
+ brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+ brw_pop_insn_state(p);
+ break;
+
+ case OPCODE_RET:
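+ /* Pop the stack pointer back by one dword and reload the saved
+ * return address into the IP register.
+ */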
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_ADD(p, get_addr_reg(stack_index),
+ get_addr_reg(stack_index), brw_imm_d(-4));
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_pop_insn_state(p);
+
+ break;
+ case OPCODE_BGNLOOP:
+ /* XXX may need to invalidate the current_constant regs */
+ loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+ break;
+ case OPCODE_BRK:
+ brw_BREAK(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_CONT:
+ brw_CONT(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_ENDLOOP:
+ {
+ struct brw_instruction *inst0, *inst1;
+ GLuint br = 1;
+
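+ /* IGDNG measures jump offsets in 64-bit halves of an instruction,
+ * so the patched BREAK/CONT jump counts must be doubled there.
+ */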
+ if (BRW_IS_IGDNG(brw))
+ br = 2;
+
+ loop_depth--;
+ inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+ /* patch all the BREAK/CONT instructions from last BGNLOOP */
+ while (inst0 > loop_inst[loop_depth]) {
+ inst0--;
+ if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+ inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ }
+ }
+ break;
+ default:
+ debug_printf("unsupported IR in fragment shader %d\n",
+ inst->Opcode);
+ }
+
+ if (inst->CondUpdate)
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ else
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+ post_wm_emit(c);
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ debug_printf("wm-native:\n");
+ brw_disasm(stderr, p->store, p->nr_insn);
+ }
+}
+
+/**
+ * Do GPU code generation for shaders that use GLSL features such as
+ * flow control. Other shaders will be compiled with the simpler,
+ * non-branching code generator.
+ */
+void brw_wm_branching_shader_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+ if (BRW_DEBUG & DEBUG_WM) {
+ debug_printf("%s:\n", __FUNCTION__);
+ }
+
+ /* initial instruction translation/simplification */
+ brw_wm_pass_fp(c);
+
+ /* actual code generation */
+ brw_wm_emit_branching_shader(brw, c);
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_program(c, "brw_wm_branching_shader_emit done");
+ }
+
+ c->prog_data.total_grf = num_grf_used(c);
+ c->prog_data.total_scratch = 0;
+}
diff --git a/src/gallium/drivers/i965/brw_wm_iz.c b/src/gallium/drivers/i965/brw_wm_iz.c
new file mode 100644
index 00000000000..6f1e9fcc3c1
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_iz.c
@@ -0,0 +1,156 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_wm.h"
+
+
+#undef P /* prompted depth */
+#undef C /* computed */
+#undef N /* non-promoted? */
+
+#define P 0
+#define C 1
+#define N 2
+
+const struct {
+ GLuint mode:2;
+ GLuint sd_present:1;
+ GLuint sd_to_rt:1;
+ GLuint dd_present:1;
+ GLuint ds_present:1;
+} wm_iz_table[IZ_BIT_MAX] =
+{
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 0, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 }
+};
+
+/**
+ * \param line_aa AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup bitmask of IZ_* flags
+ */
+void brw_wm_lookup_iz( GLuint line_aa,
+ GLuint lookup,
+ GLboolean ps_uses_depth,
+ struct brw_wm_prog_key *key )
+{
+ GLuint reg = 2;
+
+ assert (lookup < IZ_BIT_MAX);
+
+ if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
+ key->computes_depth = 1;
+
+ if (wm_iz_table[lookup].sd_present || ps_uses_depth) {
+ key->source_depth_reg = reg;
+ reg += 2;
+ }
+
+ if (wm_iz_table[lookup].sd_to_rt)
+ key->source_depth_to_render_target = 1;
+
+ if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
+ key->aa_dest_stencil_reg = reg;
+ key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
+ line_aa == AA_SOMETIMES);
+ reg++;
+ }
+
+ if (wm_iz_table[lookup].dd_present) {
+ key->dest_depth_reg = reg;
+ reg+=2;
+ }
+
+ key->nr_depth_regs = (reg+1)/2;
+}
+
diff --git a/src/gallium/drivers/i965/brw_wm_pass0.c b/src/gallium/drivers/i965/brw_wm_pass0.c
new file mode 100644
index 00000000000..0bacad2b0f0
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass0.c
@@ -0,0 +1,366 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "brw_debug.h"
+#include "brw_wm.h"
+
+
+
+/***********************************************************************
+ */
+
+static struct brw_wm_ref *get_ref( struct brw_wm_compile *c )
+{
+ assert(c->nr_refs < BRW_WM_MAX_REF);
+ return &c->refs[c->nr_refs++];
+}
+
+static struct brw_wm_value *get_value( struct brw_wm_compile *c)
+{
+ assert(c->nr_vreg < BRW_WM_MAX_VREG);
+ return &c->vreg[c->nr_vreg++];
+}
+
+/** return pointer to a newly allocated instruction */
+static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
+{
+ assert(c->nr_insns < BRW_WM_MAX_INSN);
+ return &c->instruction[c->nr_insns++];
+}
+
+/***********************************************************************
+ */
+
+/** Init the "undef" register */
+static void pass0_init_undef( struct brw_wm_compile *c)
+{
+ struct brw_wm_ref *ref = &c->undef_ref;
+ ref->value = &c->undef_value;
+ ref->hw_reg = brw_vec8_grf(0, 0);
+ ref->insn = 0;
+ ref->prevuse = NULL;
+}
+
+/** Set a FP register to a value */
+static void pass0_set_fpreg_value( struct brw_wm_compile *c,
+ GLuint file,
+ GLuint idx,
+ GLuint component,
+ struct brw_wm_value *value )
+{
+ struct brw_wm_ref *ref = get_ref(c);
+ ref->value = value;
+ ref->hw_reg = brw_vec8_grf(0, 0);
+ ref->insn = 0;
+ ref->prevuse = NULL;
+ c->pass0_fp_reg[file][idx][component] = ref;
+}
+
+/** Set a FP register to a ref */
+static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
+ GLuint file,
+ GLuint idx,
+ GLuint component,
+ const struct brw_wm_ref *src_ref )
+{
+ c->pass0_fp_reg[file][idx][component] = src_ref;
+}
+
+static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
+ unsigned idx,
+ unsigned component)
+{
+ GLuint i = idx * 4 + component;
+
+ if (i >= BRW_WM_MAX_PARAM) {
+ debug_printf("%s: out of params\n", __FUNCTION__);
+ c->prog_data.error = 1;
+ return NULL;
+ }
+ else {
+ struct brw_wm_ref *ref = get_ref(c);
+
+ c->nr_creg = MAX2(c->nr_creg, (i+16)/16);
+
+ /* Push the offsets into hw_reg. These will be added to the
+ * real register numbers once one is allocated in pass2.
+ */
+ ref->hw_reg = brw_vec1_grf((i&8)?1:0, i%8);
+ ref->value = &c->creg[i/16];
+ ref->insn = 0;
+ ref->prevuse = NULL;
+
+ return ref;
+ }
+}
+
+
+
+
+/* Lookup our internal registers
+ */
+static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
+ GLuint file,
+ GLuint idx,
+ GLuint component )
+{
+ const struct brw_wm_ref *ref = c->pass0_fp_reg[file][idx][component];
+
+ if (!ref) {
+ switch (file) {
+ case TGSI_FILE_INPUT:
+ case TGSI_FILE_TEMPORARY:
+ case TGSI_FILE_OUTPUT:
+ case BRW_FILE_PAYLOAD:
+ /* should already be done?? */
+ break;
+
+ case TGSI_FILE_CONSTANT:
+ ref = get_param_ref(c,
+ c->fp->info.immediate_count + idx,
+ component);
+ break;
+
+ case TGSI_FILE_IMMEDIATE:
+ ref = get_param_ref(c,
+ idx,
+ component);
+ break;
+
+ default:
+ assert(0);
+ break;
+ }
+
+ c->pass0_fp_reg[file][idx][component] = ref;
+ }
+
+ if (!ref)
+ ref = &c->undef_ref;
+
+ return ref;
+}
+
+
+
+/***********************************************************************
+ * Straight translation to internal instruction format
+ */
+
+static void pass0_set_dst( struct brw_wm_compile *c,
+ struct brw_wm_instruction *out,
+ const struct brw_fp_instruction *inst,
+ GLuint writemask )
+{
+ const struct brw_fp_dst dst = inst->dst;
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ if (writemask & (1<<i)) {
+ out->dst[i] = get_value(c);
+ pass0_set_fpreg_value(c, dst.file, dst.index, i, out->dst[i]);
+ }
+ }
+
+ out->writemask = writemask;
+}
+
+
+static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
+ struct brw_fp_src src,
+ GLuint i )
+{
+ return pass0_get_reg(c, src.file, src.index, BRW_GET_SWZ(src.swizzle,i));
+}
+
+
+static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
+ struct brw_fp_src src,
+ GLuint i,
+ struct brw_wm_instruction *insn)
+{
+ const struct brw_wm_ref *ref = get_fp_src_reg_ref(c, src, i);
+ struct brw_wm_ref *newref = get_ref(c);
+
+ newref->value = ref->value;
+ newref->hw_reg = ref->hw_reg;
+
+ if (insn) {
+ newref->insn = insn - c->instruction;
+ newref->prevuse = newref->value->lastuse;
+ newref->value->lastuse = newref;
+ }
+
+ if (src.negate)
+ newref->hw_reg.negate ^= 1;
+
+ if (src.abs) {
+ newref->hw_reg.negate = 0;
+ newref->hw_reg.abs = 1;
+ }
+
+ return newref;
+}
+
+
+static void
+translate_insn(struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst)
+{
+ struct brw_wm_instruction *out = get_instruction(c);
+ GLuint writemask = inst->dst.writemask;
+ GLuint nr_args = brw_wm_nr_args(inst->opcode);
+ GLuint i, j;
+
+ /* Copy some data out of the instruction
+ */
+ out->opcode = inst->opcode;
+ out->saturate = inst->dst.saturate;
+ out->tex_unit = inst->tex_unit;
+ out->target = inst->target;
+
+ /* Nasty hack:
+ */
+ out->eot = (inst->opcode == WM_FB_WRITE &&
+ inst->tex_unit != 0);
+
+
+ /* Args:
+ */
+ for (i = 0; i < nr_args; i++) {
+ for (j = 0; j < 4; j++) {
+ out->src[i][j] = get_new_ref(c, inst->src[i], j, out);
+ }
+ }
+
+ /* Dst:
+ */
+ pass0_set_dst(c, out, inst, writemask);
+}
+
+
+
+/***********************************************************************
+ * Optimize moves and swizzles away:
+ */
+static void pass0_precalc_mov( struct brw_wm_compile *c,
+ const struct brw_fp_instruction *inst )
+{
+ const struct brw_fp_dst dst = inst->dst;
+ GLuint writemask = dst.writemask;
+ struct brw_wm_ref *refs[4];
+ GLuint i;
+
+ /* Get the effect of a MOV by manipulating our register table:
+ * First get all refs, then assign refs. This ensures that "in-place"
+ * swizzles such as:
+ * MOV t, t.xxyx
+ * are handled correctly. Previously, these two steps were done in
+ * one loop and the above case was incorrectly handled.
+ */
+ for (i = 0; i < 4; i++) {
+ refs[i] = get_new_ref(c, inst->src[0], i, NULL);
+ }
+ for (i = 0; i < 4; i++) {
+ if (writemask & (1 << i)) {
+ pass0_set_fpreg_ref( c, dst.file, dst.index, i, refs[i]);
+ }
+ }
+}
+
+
+/* Initialize payload "registers".
+ */
+static void pass0_init_payload( struct brw_wm_compile *c )
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ GLuint j = i >= c->key.nr_depth_regs ? 0 : i;
+ pass0_set_fpreg_value( c, BRW_FILE_PAYLOAD, PAYLOAD_DEPTH, i,
+ &c->payload.depth[j] );
+ }
+
+ for (i = 0; i < c->key.nr_inputs; i++)
+ pass0_set_fpreg_value( c, BRW_FILE_PAYLOAD, i, 0,
+ &c->payload.input_interp[i] );
+}
+
+
+/***********************************************************************
+ * PASS 0
+ *
+ * Work forwards to give each calculated value a unique number. Where
+ * an instruction produces duplicate values (eg DP3), all are given
+ * the same number.
+ *
+ * Translate away swizzling and eliminate non-saturating moves.
+ *
+ * Translate instructions from our fp_instruction structs to our
+ * internal brw_wm_instruction representation.
+ */
+void brw_wm_pass0( struct brw_wm_compile *c )
+{
+ GLuint insn;
+
+ c->nr_vreg = 0;
+ c->nr_insns = 0;
+
+ pass0_init_undef(c);
+ pass0_init_payload(c);
+
+ for (insn = 0; insn < c->nr_fp_insns; insn++) {
+ const struct brw_fp_instruction *inst = &c->fp_instructions[insn];
+
+ /* Optimize away moves, otherwise emit translated instruction:
+ */
+ switch (inst->opcode) {
+ case TGSI_OPCODE_MOV:
+ if (!inst->dst.saturate) {
+ pass0_precalc_mov(c, inst);
+ }
+ else {
+ translate_insn(c, inst);
+ }
+ break;
+ default:
+ translate_insn(c, inst);
+ break;
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_program(c, "pass0");
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_pass1.c b/src/gallium/drivers/i965/brw_wm_pass1.c
new file mode 100644
index 00000000000..005747f00ba
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass1.c
@@ -0,0 +1,292 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_wm.h"
+#include "brw_debug.h"
+
+
+static GLuint get_tracked_mask(struct brw_wm_compile *c,
+ struct brw_wm_instruction *inst)
+{
+ GLuint i;
+ for (i = 0; i < 4; i++) {
+ if (inst->writemask & (1<<i)) {
+ if (!inst->dst[i]->contributes_to_output) {
+ inst->writemask &= ~(1<<i);
+ inst->dst[i] = 0;
+ }
+ }
+ }
+
+ return inst->writemask;
+}
+
+/* Remove a reference from a value's usage chain.
+ */
+static void unlink_ref(struct brw_wm_ref *ref)
+{
+ struct brw_wm_value *value = ref->value;
+
+ if (ref == value->lastuse) {
+ value->lastuse = ref->prevuse;
+ }
+ else {
+ struct brw_wm_ref *i = value->lastuse;
+ while (i->prevuse != ref) i = i->prevuse;
+ i->prevuse = ref->prevuse;
+ }
+}
+
+static void track_arg(struct brw_wm_compile *c,
+ struct brw_wm_instruction *inst,
+ GLuint arg,
+ GLuint readmask)
+{
+ GLuint i;
+
+ for (i = 0; i < 4; i++) {
+ struct brw_wm_ref *ref = inst->src[arg][i];
+ if (ref) {
+ if (readmask & (1<<i)) {
+ ref->value->contributes_to_output = 1;
+ }
+ else {
+ unlink_ref(ref);
+ inst->src[arg][i] = NULL;
+ }
+ }
+ }
+}
+
+static GLuint get_texcoord_mask( GLuint tex_idx )
+{
+ switch (tex_idx) {
+ case TGSI_TEXTURE_1D:
+ return BRW_WRITEMASK_X;
+ case TGSI_TEXTURE_2D:
+ case TGSI_TEXTURE_RECT:
+ return BRW_WRITEMASK_XY;
+ case TGSI_TEXTURE_3D:
+ return BRW_WRITEMASK_XYZ;
+ case TGSI_TEXTURE_CUBE:
+ return BRW_WRITEMASK_XYZ;
+
+ case TGSI_TEXTURE_SHADOW1D:
+ return BRW_WRITEMASK_XZ;
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ return BRW_WRITEMASK_XYZ;
+ default:
+ assert(0);
+ return 0;
+ }
+}
+
+
+/* Step two: Basically this is dead code elimination.
+ *
+ * Iterate backwards over instructions, noting which values
+ * contribute to the final result. Adjust writemasks to only
+ * calculate these values.
+ */
+void brw_wm_pass1( struct brw_wm_compile *c )
+{
+ GLint insn;
+
+ for (insn = c->nr_insns-1; insn >= 0; insn--) {
+ struct brw_wm_instruction *inst = &c->instruction[insn];
+ GLuint writemask;
+ GLuint read0, read1, read2;
+
+ if (inst->opcode == TGSI_OPCODE_KIL) {
+ track_arg(c, inst, 0, BRW_WRITEMASK_XYZW); /* All args contribute to final */
+ continue;
+ }
+
+ if (inst->opcode == WM_FB_WRITE) {
+ track_arg(c, inst, 0, BRW_WRITEMASK_XYZW);
+ track_arg(c, inst, 1, BRW_WRITEMASK_XYZW);
+ if (c->key.source_depth_to_render_target &&
+ c->key.computes_depth)
+ track_arg(c, inst, 2, BRW_WRITEMASK_Z);
+ else
+ track_arg(c, inst, 2, 0);
+ continue;
+ }
+
+ /* Lookup all the registers which were written by this
+ * instruction and get a mask of those that contribute to the output:
+ */
+ writemask = get_tracked_mask(c, inst);
+ if (!writemask) {
+ GLuint arg;
+ for (arg = 0; arg < 3; arg++)
+ track_arg(c, inst, arg, 0);
+ continue;
+ }
+
+ read0 = 0;
+ read1 = 0;
+ read2 = 0;
+
+ /* Mark all inputs which contribute to the marked outputs:
+ */
+ switch (inst->opcode) {
+ case TGSI_OPCODE_ABS:
+ case TGSI_OPCODE_FLR:
+ case TGSI_OPCODE_FRC:
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_TRUNC:
+ read0 = writemask;
+ break;
+
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_MAX:
+ case TGSI_OPCODE_MIN:
+ case TGSI_OPCODE_MUL:
+ read0 = writemask;
+ read1 = writemask;
+ break;
+
+ case TGSI_OPCODE_DDX:
+ case TGSI_OPCODE_DDY:
+ read0 = writemask;
+ break;
+
+ case TGSI_OPCODE_MAD:
+ case TGSI_OPCODE_CMP:
+ case TGSI_OPCODE_LRP:
+ read0 = writemask;
+ read1 = writemask;
+ read2 = writemask;
+ break;
+
+ case TGSI_OPCODE_XPD:
+ if (writemask & BRW_WRITEMASK_X) read0 |= BRW_WRITEMASK_YZ;
+ if (writemask & BRW_WRITEMASK_Y) read0 |= BRW_WRITEMASK_XZ;
+ if (writemask & BRW_WRITEMASK_Z) read0 |= BRW_WRITEMASK_XY;
+ read1 = read0;
+ break;
+
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_EX2:
+ case TGSI_OPCODE_LG2:
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_RSQ:
+ case TGSI_OPCODE_SIN:
+ case TGSI_OPCODE_SCS:
+ case WM_CINTERP:
+ case WM_PIXELXY:
+ read0 = BRW_WRITEMASK_X;
+ break;
+
+ case TGSI_OPCODE_POW:
+ read0 = BRW_WRITEMASK_X;
+ read1 = BRW_WRITEMASK_X;
+ break;
+
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXP:
+ read0 = get_texcoord_mask(inst->target);
+ break;
+
+ case TGSI_OPCODE_TXB:
+ read0 = get_texcoord_mask(inst->target) | BRW_WRITEMASK_W;
+ break;
+
+ case WM_WPOSXY:
+ read0 = writemask & BRW_WRITEMASK_XY;
+ break;
+
+ case WM_DELTAXY:
+ read0 = writemask & BRW_WRITEMASK_XY;
+ read1 = BRW_WRITEMASK_X;
+ break;
+
+ case WM_PIXELW:
+ read0 = BRW_WRITEMASK_X;
+ read1 = BRW_WRITEMASK_XY;
+ break;
+
+ case WM_LINTERP:
+ read0 = BRW_WRITEMASK_X;
+ read1 = BRW_WRITEMASK_XY;
+ break;
+
+ case WM_PINTERP:
+ read0 = BRW_WRITEMASK_X; /* interpolant */
+ read1 = BRW_WRITEMASK_XY; /* deltas */
+ read2 = BRW_WRITEMASK_W; /* pixel w */
+ break;
+
+ case TGSI_OPCODE_DP3:
+ read0 = BRW_WRITEMASK_XYZ;
+ read1 = BRW_WRITEMASK_XYZ;
+ break;
+
+ case TGSI_OPCODE_DPH:
+ read0 = BRW_WRITEMASK_XYZ;
+ read1 = BRW_WRITEMASK_XYZW;
+ break;
+
+ case TGSI_OPCODE_DP4:
+ read0 = BRW_WRITEMASK_XYZW;
+ read1 = BRW_WRITEMASK_XYZW;
+ break;
+
+ case TGSI_OPCODE_LIT:
+ read0 = BRW_WRITEMASK_XYW;
+ break;
+
+ case TGSI_OPCODE_DST:
+ case WM_FRONTFACING:
+ case TGSI_OPCODE_KILP:
+ default:
+ break;
+ }
+
+ track_arg(c, inst, 0, read0);
+ track_arg(c, inst, 1, read1);
+ track_arg(c, inst, 2, read2);
+ }
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_program(c, "pass1");
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_pass2.c b/src/gallium/drivers/i965/brw_wm_pass2.c
new file mode 100644
index 00000000000..19248b45195
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_pass2.c
@@ -0,0 +1,334 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#include "brw_debug.h"
+#include "brw_wm.h"
+
+
+/* Use these to force spilling so that the spilling code can be
+ * tested with known-good examples rather than having to construct new
+ * tests.
+ */
+#define TEST_PAYLOAD_SPILLS 0
+#define TEST_DST_SPILLS 0
+
+static void spill_value(struct brw_wm_compile *c,
+ struct brw_wm_value *value);
+
+static void prealloc_reg(struct brw_wm_compile *c,
+ struct brw_wm_value *value,
+ GLuint reg)
+{
+ if (value->lastuse) {
+ /* Set nextuse to zero, it will be corrected by
+ * update_register_usage().
+ */
+ c->pass2_grf[reg].value = value;
+ c->pass2_grf[reg].nextuse = 0;
+
+ value->resident = &c->pass2_grf[reg];
+ value->hw_reg = brw_vec8_grf(reg*2, 0);
+
+ if (TEST_PAYLOAD_SPILLS)
+ spill_value(c, value);
+ }
+}
+
+
+/* Initialize all the register values. Do the initial setup
+ * calculations for interpolants.
+ */
+static void init_registers( struct brw_wm_compile *c )
+{
+ GLuint reg = 0;
+ GLuint j;
+
+ for (j = 0; j < c->grf_limit; j++)
+ c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN;
+
+ /* Pre-allocate incoming payload regs:
+ */
+ for (j = 0; j < c->key.nr_depth_regs; j++)
+ prealloc_reg(c, &c->payload.depth[j], reg++);
+
+ for (j = 0; j < c->nr_creg; j++)
+ prealloc_reg(c, &c->creg[j], reg++);
+
+ reg++; /* XXX: skip over position output */
+
+ /* XXX: currently just hope the VS outputs line up with FS inputs:
+ */
+ for (j = 0; j < c->key.nr_inputs; j++)
+ prealloc_reg(c, &c->payload.input_interp[j], reg++);
+
+ c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+ c->prog_data.urb_read_length = (c->key.nr_inputs + 1) * 2;
+ c->prog_data.curb_read_length = c->nr_creg * 2;
+
+ /* Note this allocation:
+ */
+ c->max_wm_grf = reg * 2;
+}
+
+
+/* Update the nextuse value for each register in our file.
+ */
+static void update_register_usage(struct brw_wm_compile *c,
+ GLuint thisinsn)
+{
+ GLuint i;
+
+ for (i = 1; i < c->grf_limit; i++) {
+ struct brw_wm_grf *grf = &c->pass2_grf[i];
+
+ /* Only search those which can change:
+ */
+ if (grf->nextuse < thisinsn) {
+ const struct brw_wm_ref *ref = grf->value->lastuse;
+
+ /* Has last use of value been passed?
+ */
+ if (ref->insn < thisinsn) {
+ grf->value->resident = 0;
+ grf->value = 0;
+ grf->nextuse = BRW_WM_MAX_INSN;
+ }
+ else {
+ /* Else loop through chain to update:
+ */
+ while (ref->prevuse && ref->prevuse->insn >= thisinsn)
+ ref = ref->prevuse;
+
+ grf->nextuse = ref->insn;
+ }
+ }
+ }
+}
+
+
+static void spill_value(struct brw_wm_compile *c,
+ struct brw_wm_value *value)
+{
+ /* Allocate a spill slot. Note that allocations start from 0x40 -
+ * the first slot is reserved to mean "undef" in brw_wm_emit.c
+ */
+ if (!value->spill_slot) {
+ c->last_scratch += 0x40;
+ value->spill_slot = c->last_scratch;
+ }
+
+ /* The spill will be done in brw_wm_emit.c immediately after the
+ * value is calculated, so we can just take this reg without any
+ * further work.
+ */
+ value->resident->value = NULL;
+ value->resident->nextuse = BRW_WM_MAX_INSN;
+ value->resident = NULL;
+}
+
+
+
+/* Search for contiguous region with the most distant nearest
+ * member. Free regs count as very distant.
+ *
+ * TODO: implement spill-to-reg so that we can rearrange discontiguous
+ * free regs and then spill the oldest non-free regs in sequence.
+ * This would mean inserting instructions in this pass.
+ */
+static GLuint search_contiguous_regs(struct brw_wm_compile *c,
+ GLuint nr,
+ GLuint thisinsn)
+{
+ struct brw_wm_grf *grf = c->pass2_grf;
+ GLuint furthest = 0;
+ GLuint reg = 0;
+ GLuint i, j;
+
+ /* Start search at 1: r0 is special and can't be used or spilled.
+ */
+ for (i = 1; i < c->grf_limit && furthest < BRW_WM_MAX_INSN; i++) {
+ GLuint group_nextuse = BRW_WM_MAX_INSN;
+
+ for (j = 0; j < nr; j++) {
+ if (grf[i+j].nextuse < group_nextuse)
+ group_nextuse = grf[i+j].nextuse;
+ }
+
+ if (group_nextuse > furthest) {
+ furthest = group_nextuse;
+ reg = i;
+ }
+ }
+
+ assert(furthest != thisinsn);
+
+ /* Any non-empty regs will need to be spilled:
+ */
+ for (j = 0; j < nr; j++)
+ if (grf[reg+j].value)
+ spill_value(c, grf[reg+j].value);
+
+ return reg;
+}
+
+
+static void alloc_contiguous_dest(struct brw_wm_compile *c,
+ struct brw_wm_value *dst[],
+ GLuint nr,
+ GLuint thisinsn)
+{
+ GLuint reg = search_contiguous_regs(c, nr, thisinsn);
+ GLuint i;
+
+ for (i = 0; i < nr; i++) {
+ if (!dst[i]) {
+ /* Need to grab a dummy value in TEX case. Don't introduce
+ * it into the tracking scheme.
+ */
+ dst[i] = &c->vreg[c->nr_vreg++];
+ }
+ else {
+ assert(!dst[i]->resident);
+ assert(c->pass2_grf[reg+i].nextuse != thisinsn);
+
+ c->pass2_grf[reg+i].value = dst[i];
+ c->pass2_grf[reg+i].nextuse = thisinsn;
+
+ dst[i]->resident = &c->pass2_grf[reg+i];
+ }
+
+ dst[i]->hw_reg = brw_vec8_grf((reg+i)*2, 0);
+ }
+
+ if ((reg+nr)*2 > c->max_wm_grf)
+ c->max_wm_grf = (reg+nr) * 2;
+}
+
+
+static void load_args(struct brw_wm_compile *c,
+ struct brw_wm_instruction *inst)
+{
+ GLuint thisinsn = inst - c->instruction;
+ GLuint i,j;
+
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 4; j++) {
+ struct brw_wm_ref *ref = inst->src[i][j];
+
+ if (ref) {
+ if (!ref->value->resident) {
+ /* Need to bring the value in from scratch space. The code for
+ * this will be done in brw_wm_emit.c, here we just do the
+ * register allocation and mark the ref as requiring a fill.
+ */
+ GLuint reg = search_contiguous_regs(c, 1, thisinsn);
+
+ c->pass2_grf[reg].value = ref->value;
+ c->pass2_grf[reg].nextuse = thisinsn;
+
+ ref->value->resident = &c->pass2_grf[reg];
+
+ /* Note that a fill is required:
+ */
+ ref->unspill_reg = reg*2;
+ }
+
+ /* Adjust the hw_reg to point at the value's current location:
+ */
+ assert(ref->value == ref->value->resident->value);
+ ref->hw_reg.nr += (ref->value->resident - c->pass2_grf) * 2;
+ }
+ }
+ }
+}
+
+
+
+/* Step 3: Work forwards once again. Perform register allocations,
+ * taking into account instructions like TEX which require contiguous
+ * result registers. Where necessary spill registers to scratch space
+ * and reload later.
+ */
+void brw_wm_pass2( struct brw_wm_compile *c )
+{
+ GLuint insn;
+ GLuint i;
+
+ init_registers(c);
+
+ for (insn = 0; insn < c->nr_insns; insn++) {
+ struct brw_wm_instruction *inst = &c->instruction[insn];
+
+ /* Update registers' nextuse values:
+ */
+ update_register_usage(c, insn);
+
+ /* May need to unspill some args.
+ */
+ load_args(c, inst);
+
+ /* Allocate registers to hold results:
+ */
+ switch (inst->opcode) {
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXP:
+ alloc_contiguous_dest(c, inst->dst, 4, insn);
+ break;
+
+ default:
+ for (i = 0; i < 4; i++) {
+ if (inst->writemask & (1<<i)) {
+ assert(inst->dst[i]);
+ alloc_contiguous_dest(c, &inst->dst[i], 1, insn);
+ }
+ }
+ break;
+ }
+
+ if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) {
+ for (i = 0; i < 4; i++)
+ if (inst->dst[i])
+ spill_value(c, inst->dst[i]);
+ }
+ }
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_program(c, "pass2");
+ }
+
+ c->state = PASS2_DONE;
+
+ if (BRW_DEBUG & DEBUG_WM) {
+ brw_wm_print_program(c, "pass2/done");
+ }
+}
diff --git a/src/gallium/drivers/i965/brw_wm_sampler_state.c b/src/gallium/drivers/i965/brw_wm_sampler_state.c
new file mode 100644
index 00000000000..a8bc31c9cef
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_sampler_state.c
@@ -0,0 +1,229 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+#include "util/u_format.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_screen.h"
+
+
+/* Samplers aren't strictly wm state from the hardware's perspective,
+ * but that is the only situation in which we use them in this driver.
+ */
+
+
+
+static enum pipe_error
+upload_default_color( struct brw_context *brw,
+ const GLfloat *color,
+ struct brw_winsys_buffer **bo_out )
+{
+ struct brw_sampler_default_color sdc;
+ enum pipe_error ret;
+
+ COPY_4V(sdc.color, color);
+
+ ret = brw_cache_data( &brw->cache, BRW_SAMPLER_DEFAULT_COLOR, &sdc,
+ NULL, 0, bo_out );
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+struct wm_sampler_key {
+ int sampler_count;
+ struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT];
+};
+
+
+/** Sets up the cache key for sampler state for all texture units */
+static void
+brw_wm_sampler_populate_key(struct brw_context *brw,
+ struct wm_sampler_key *key)
+{
+ int i;
+
+ memset(key, 0, sizeof(*key));
+
+ key->sampler_count = MIN2(brw->curr.num_textures,
+ brw->curr.num_samplers);
+
+ for (i = 0; i < key->sampler_count; i++) {
+ const struct brw_texture *tex = brw_texture(brw->curr.texture[i]);
+ const struct brw_sampler *sampler = brw->curr.sampler[i];
+ struct brw_sampler_state *entry = &key->sampler[i];
+
+ entry->ss0 = sampler->ss0;
+ entry->ss1 = sampler->ss1;
+ entry->ss2.default_color_pointer = 0; /* reloc */
+ entry->ss3 = sampler->ss3;
+
+ /* Cube-maps on 965 and later must use the same wrap mode for all 3
+ * coordinate dimensions. Further, only CUBE and CLAMP are valid.
+ */
+ if (tex->base.target == PIPE_TEXTURE_CUBE) {
+ if (FALSE &&
+ (sampler->ss0.min_filter != BRW_MAPFILTER_NEAREST ||
+ sampler->ss0.mag_filter != BRW_MAPFILTER_NEAREST)) {
+ entry->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+ entry->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+ entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+ } else {
+ entry->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+ entry->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+ entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+ }
+ } else if (tex->base.target == PIPE_TEXTURE_1D) {
+ /* There's a bug in 1D texture sampling - it actually pays
+ * attention to the wrap_t value, though it should not.
+ * Override the wrap_t value here to GL_REPEAT to keep
+ * any nonexistent border pixels from floating in.
+ */
+ entry->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
+ }
+ }
+}
+
+
+static enum pipe_error
+brw_wm_sampler_update_default_colors(struct brw_context *brw)
+{
+ enum pipe_error ret;
+ int nr = MIN2(brw->curr.num_textures,
+ brw->curr.num_samplers);
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ const struct brw_texture *tex = brw_texture(brw->curr.texture[i]);
+ const struct brw_sampler *sampler = brw->curr.sampler[i];
+ const float *bc;
+
+ if (util_format_is_depth_or_stencil(tex->base.format)) {
+ float bordercolor[4] = {
+ sampler->border_color[0],
+ sampler->border_color[0],
+ sampler->border_color[0],
+ sampler->border_color[0]
+ };
+
+ bc = bordercolor;
+ }
+ else {
+ bc = sampler->border_color;
+ }
+
+ /* GL specs that border color for depth textures is taken from the
+ * R channel, while the hardware uses A. Spam R into all the
+ * channels for safety.
+ */
+ ret = upload_default_color(brw,
+ bc,
+ &brw->wm.sdc_bo[i]);
+ if (ret)
+ return ret;
+ }
+
+ return PIPE_OK;
+}
+
+
+
+/* All samplers must be uploaded in a single contiguous array.
+ */
+static int upload_wm_samplers( struct brw_context *brw )
+{
+ struct wm_sampler_key key;
+ struct brw_winsys_reloc reloc[BRW_MAX_TEX_UNIT];
+ enum pipe_error ret;
+ int i;
+
+ brw_wm_sampler_update_default_colors(brw);
+ brw_wm_sampler_populate_key(brw, &key);
+
+ if (brw->wm.sampler_count != key.sampler_count) {
+ brw->wm.sampler_count = key.sampler_count;
+ brw->state.dirty.cache |= CACHE_NEW_SAMPLER;
+ }
+
+ if (brw->wm.sampler_count == 0) {
+ bo_reference(&brw->wm.sampler_bo, NULL);
+ return PIPE_OK;
+ }
+
+ /* Emit SDC relocations */
+ for (i = 0; i < key.sampler_count; i++) {
+ make_reloc( &reloc[i],
+ BRW_USAGE_SAMPLER,
+ 0,
+ i * sizeof(struct brw_sampler_state) +
+ offsetof(struct brw_sampler_state, ss2),
+ brw->wm.sdc_bo[i]);
+ }
+
+
+ if (brw_search_cache(&brw->cache, BRW_SAMPLER,
+ &key, sizeof(key),
+ reloc, key.sampler_count,
+ NULL,
+ &brw->wm.sampler_bo))
+ return PIPE_OK;
+
+ /* If we didn't find it in the cache, compute the state and put it in the
+ * cache.
+ */
+ ret = brw_upload_cache(&brw->cache, BRW_SAMPLER,
+ &key, sizeof(key),
+ reloc, key.sampler_count,
+ &key.sampler, sizeof(key.sampler),
+ NULL, NULL,
+ &brw->wm.sampler_bo);
+ if (ret)
+ return ret;
+
+
+ return 0;
+}
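+
+/* Minimal sketch of the idea behind the key comparison used by
+ * brw_search_cache() above: two uploads can share one buffer object
+ * when their keys (and, presumably, their relocation arrays) are
+ * byte-for-byte identical. The type and function here are examples
+ * only, not driver API.
+ */
+struct example_cache_item {
+   const void *key;
+   unsigned key_size;
+   void *bo;                       /* stand-in for brw_winsys_buffer */
+};
+
+static void *
+example_search(const struct example_cache_item *items, unsigned n,
+               const void *key, unsigned key_size)
+{
+   unsigned i;
+   for (i = 0; i < n; i++)
+      if (items[i].key_size == key_size &&
+          memcmp(items[i].key, key, key_size) == 0)
+         return items[i].bo;       /* hit: reuse the cached buffer */
+   return NULL;                    /* miss: upload and remember it */
+}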
+
+const struct brw_tracked_state brw_wm_samplers = {
+ .dirty = {
+ .mesa = PIPE_NEW_BOUND_TEXTURES | PIPE_NEW_SAMPLERS,
+ .brw = 0,
+ .cache = 0
+ },
+ .prepare = upload_wm_samplers,
+};
+
+
diff --git a/src/gallium/drivers/i965/brw_wm_state.c b/src/gallium/drivers/i965/brw_wm_state.c
new file mode 100644
index 00000000000..ee970ac75bc
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_state.c
@@ -0,0 +1,339 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "util/u_math.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_wm.h"
+#include "brw_debug.h"
+#include "brw_pipe_rast.h"
+
+/***********************************************************************
+ * WM unit - fragment programs and rasterization
+ */
+
+struct brw_wm_unit_key {
+ unsigned int total_grf, total_scratch;
+ unsigned int urb_entry_read_length;
+ unsigned int curb_entry_read_length;
+ unsigned int dispatch_grf_start_reg;
+
+ unsigned int curbe_offset;
+ unsigned int urb_size;
+
+ unsigned int max_threads;
+
+ unsigned int nr_surfaces, sampler_count;
+ GLboolean uses_depth, computes_depth, uses_kill, has_flow_control;
+ GLboolean polygon_stipple, stats_wm, line_stipple, offset_enable;
+ GLfloat offset_units, offset_factor;
+};
+
+static void
+wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
+{
+ const struct brw_fragment_shader *fp = brw->curr.fragment_shader;
+
+ memset(key, 0, sizeof(*key));
+
+ if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+ key->max_threads = 1;
+ else {
+ /* WM maximum threads is number of EUs times number of threads per EU. */
+ if (BRW_IS_IGDNG(brw))
+ key->max_threads = 12 * 6;
+ else if (BRW_IS_G4X(brw))
+ key->max_threads = 10 * 5;
+ else
+ key->max_threads = 8 * 4;
+ }
+
+ /* CACHE_NEW_WM_PROG */
+ key->total_grf = brw->wm.prog_data->total_grf;
+ key->urb_entry_read_length = brw->wm.prog_data->urb_read_length;
+ key->curb_entry_read_length = brw->wm.prog_data->curb_read_length;
+ key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf;
+ key->total_scratch = align(brw->wm.prog_data->total_scratch, 1024);
+
+ /* BRW_NEW_URB_FENCE */
+ key->urb_size = brw->urb.vsize;
+
+ /* BRW_NEW_CURBE_OFFSETS */
+ key->curbe_offset = brw->curbe.wm_start;
+
+ /* BRW_NEW_NR_SURFACEs */
+ key->nr_surfaces = brw->wm.nr_surfaces;
+
+ /* CACHE_NEW_SAMPLER */
+ key->sampler_count = brw->wm.sampler_count;
+
+ /* PIPE_NEW_RAST */
+ key->polygon_stipple = brw->curr.rast->templ.poly_stipple_enable;
+
+ /* PIPE_NEW_FRAGMENT_PROGRAM */
+ key->uses_depth = fp->uses_depth;
+ key->computes_depth = fp->info.writes_z;
+
+ /* PIPE_NEW_DEPTH_BUFFER
+ *
+ * Override for NULL depthbuffer case, required by the Pixel Shader Computed
+ * Depth field.
+ */
+ if (brw->curr.fb.zsbuf == NULL)
+ key->computes_depth = 0;
+
+ /* PIPE_NEW_DEPTH_STENCIL_ALPHA */
+ key->uses_kill = (fp->info.uses_kill ||
+ brw->curr.zstencil->cc3.alpha_test);
+
+ key->has_flow_control = fp->has_flow_control;
+
+ /* temporary sanity check assertion */
+ assert(fp->has_flow_control == 0);
+
+ /* PIPE_NEW_QUERY */
+ key->stats_wm = (brw->query.stats_wm != 0);
+
+ /* PIPE_NEW_RAST */
+ key->line_stipple = brw->curr.rast->templ.line_stipple_enable;
+
+
+ key->offset_enable = (brw->curr.rast->templ.offset_cw ||
+ brw->curr.rast->templ.offset_ccw);
+
+ key->offset_units = brw->curr.rast->templ.offset_units;
+ key->offset_factor = brw->curr.rast->templ.offset_scale;
+}
+
+/**
+ * Setup wm hardware state. See page 225 of Volume 2
+ */
+static enum pipe_error
+wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
+ struct brw_winsys_reloc *reloc,
+ unsigned nr_reloc,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_wm_unit_state wm;
+ enum pipe_error ret;
+
+ memset(&wm, 0, sizeof(wm));
+
+ wm.thread0.grf_reg_count = align(key->total_grf, 16) / 16 - 1;
+ wm.thread0.kernel_start_pointer = 0; /* reloc */
+ wm.thread1.depth_coef_urb_read_offset = 1;
+ wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
+
+ if (BRW_IS_IGDNG(brw))
+ wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
+ else
+ wm.thread1.binding_table_entry_count = key->nr_surfaces;
+
+ if (key->total_scratch != 0) {
+ wm.thread2.scratch_space_base_pointer = 0; /* reloc */
+ wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
+ } else {
+ wm.thread2.scratch_space_base_pointer = 0;
+ wm.thread2.per_thread_scratch_space = 0;
+ }
+
+ wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
+ wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
+ wm.thread3.urb_entry_read_offset = 0;
+ wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
+ wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
+
+ if (BRW_IS_IGDNG(brw))
+ wm.wm4.sampler_count = 0; /* hardware requirement */
+ else
+ wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+
+ /* reloc */
+ wm.wm4.sampler_state_pointer = 0;
+
+ wm.wm5.program_uses_depth = key->uses_depth;
+ wm.wm5.program_computes_depth = key->computes_depth;
+ wm.wm5.program_uses_killpixel = key->uses_kill;
+
+ if (key->has_flow_control)
+ wm.wm5.enable_8_pix = 1;
+ else
+ wm.wm5.enable_16_pix = 1;
+
+ wm.wm5.max_threads = key->max_threads - 1;
+ wm.wm5.thread_dispatch_enable = 1; /* AKA: color_write */
+ wm.wm5.legacy_line_rast = 0;
+ wm.wm5.legacy_global_depth_bias = 0;
+ wm.wm5.early_depth_test = 1; /* never need to disable */
+ wm.wm5.line_aa_region_width = 0;
+ wm.wm5.line_endcap_aa_region_width = 1;
+
+ wm.wm5.polygon_stipple = key->polygon_stipple;
+
+ if (key->offset_enable) {
+ wm.wm5.depth_offset = 1;
+ /* Something weird going on with legacy_global_depth_bias,
+ * offset_constant, scaling and MRD. This value passes glean
+ * but gives some odd results elsewhere (e.g. the
+ * quad-offset-units test).
+ */
+ wm.global_depth_offset_constant = key->offset_units * 2;
+
+ /* This is the only value that passes glean:
+ */
+ wm.global_depth_offset_scale = key->offset_factor;
+ }
+
+ wm.wm5.line_stipple = key->line_stipple;
+
+ if ((BRW_DEBUG & DEBUG_STATS) || key->stats_wm)
+ wm.wm4.stats_enable = 1;
+
+ ret = brw_upload_cache(&brw->cache, BRW_WM_UNIT,
+ key, sizeof(*key),
+ reloc, nr_reloc,
+ &wm, sizeof(wm),
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+static enum pipe_error upload_wm_unit( struct brw_context *brw )
+{
+ struct brw_wm_unit_key key;
+ struct brw_winsys_reloc reloc[3];
+ unsigned nr_reloc = 0;
+ enum pipe_error ret;
+ unsigned grf_reg_count;
+ unsigned per_thread_scratch_space;
+ unsigned stats_enable;
+ unsigned sampler_count;
+
+ wm_unit_populate_key(brw, &key);
+
+
+ /* Allocate the necessary scratch space if we haven't already. Don't
+ * bother reducing the allocation later, since we use scratch so
+ * rarely.
+ */
+ assert(key.total_scratch <= 12 * 1024);
+ if (key.total_scratch) {
+ GLuint total = key.total_scratch * key.max_threads;
+
+ /* Do we need a new buffer:
+ */
+ if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size)
+ bo_reference(&brw->wm.scratch_bo, NULL);
+
+ if (brw->wm.scratch_bo == NULL) {
+ ret = brw->sws->bo_alloc(brw->sws,
+ BRW_BUFFER_TYPE_SHADER_SCRATCH,
+ total,
+ 4096,
+ &brw->wm.scratch_bo);
+ if (ret)
+ return ret;
+ }
+ }
+
+
+ /* XXX: temporary:
+ */
+ grf_reg_count = (align(key.total_grf, 16) / 16 - 1);
+ per_thread_scratch_space = key.total_scratch / 1024 - 1;
+ stats_enable = (BRW_DEBUG & DEBUG_STATS) || key.stats_wm;
+ sampler_count = BRW_IS_IGDNG(brw) ? 0 : (key.sampler_count + 1) / 4;
+
+ /* Emit WM program relocation */
+ make_reloc(&reloc[nr_reloc++],
+ BRW_USAGE_STATE,
+ grf_reg_count << 1,
+ offsetof(struct brw_wm_unit_state, thread0),
+ brw->wm.prog_bo);
+
+ /* Emit scratch space relocation */
+ if (key.total_scratch != 0) {
+ make_reloc(&reloc[nr_reloc++],
+ BRW_USAGE_SCRATCH,
+ per_thread_scratch_space,
+ offsetof(struct brw_wm_unit_state, thread2),
+ brw->wm.scratch_bo);
+ }
+
+ /* Emit sampler state relocation */
+ if (key.sampler_count != 0) {
+ make_reloc(&reloc[nr_reloc++],
+ BRW_USAGE_STATE,
+ stats_enable | (sampler_count << 2),
+ offsetof(struct brw_wm_unit_state, wm4),
+ brw->wm.sampler_bo);
+ }
+
+
+ if (brw_search_cache(&brw->cache, BRW_WM_UNIT,
+ &key, sizeof(key),
+ reloc, nr_reloc,
+ NULL,
+ &brw->wm.state_bo))
+ return PIPE_OK;
+
+ ret = wm_unit_create_from_key(brw, &key,
+ reloc, nr_reloc,
+ &brw->wm.state_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
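+
+/* Worked example of the scratch sizing above, using the values this
+ * code would produce on a G4X part (10 EUs * 5 threads = 50 threads).
+ * The per-thread figure of 1500 bytes is made up for illustration.
+ *
+ *   key.total_scratch        = align(1500, 1024)  = 2048
+ *   scratch buffer size      = 2048 * 50          = 102400 bytes
+ *   per_thread_scratch_space = 2048 / 1024 - 1    = 1
+ */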
+
+const struct brw_tracked_state brw_wm_unit = {
+ .dirty = {
+ .mesa = (PIPE_NEW_FRAGMENT_SHADER |
+ PIPE_NEW_DEPTH_BUFFER |
+ PIPE_NEW_RAST |
+ PIPE_NEW_DEPTH_STENCIL_ALPHA |
+ PIPE_NEW_QUERY),
+
+ .brw = (BRW_NEW_CURBE_OFFSETS |
+ BRW_NEW_NR_WM_SURFACES),
+
+ .cache = (CACHE_NEW_WM_PROG |
+ CACHE_NEW_SAMPLER)
+ },
+ .prepare = upload_wm_unit,
+};
+
diff --git a/src/gallium/drivers/i965/brw_wm_surface_state.c b/src/gallium/drivers/i965/brw_wm_surface_state.c
new file mode 100644
index 00000000000..f92b8198ed3
--- /dev/null
+++ b/src/gallium/drivers/i965/brw_wm_surface_state.c
@@ -0,0 +1,294 @@
+/*
+ Copyright (C) Intel Corp. 2006. All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "pipe/p_format.h"
+
+#include "brw_batchbuffer.h"
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+#include "brw_screen.h"
+
+
+
+
+static enum pipe_error
+brw_update_texture_surface( struct brw_context *brw,
+ struct brw_texture *tex,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_winsys_reloc reloc[1];
+ enum pipe_error ret;
+
+ /* Emit relocation to surface contents */
+ make_reloc(&reloc[0],
+ BRW_USAGE_SAMPLER,
+ 0,
+ offsetof(struct brw_surface_state, ss1),
+ tex->bo);
+
+ if (brw_search_cache(&brw->surface_cache,
+ BRW_SS_SURFACE,
+ &tex->ss, sizeof tex->ss,
+ reloc, Elements(reloc),
+ NULL,
+ bo_out))
+ return PIPE_OK;
+
+ ret = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
+ &tex->ss, sizeof tex->ss,
+ reloc, Elements(reloc),
+ &tex->ss, sizeof tex->ss,
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+
+
+
+
+
+
+/**
+ * Sets up a surface state structure to point at the given region.
+ * While it is only used for the front/back buffer currently, it should be
+ * usable for further buffers when doing ARB_draw_buffer support.
+ */
+static enum pipe_error
+brw_update_render_surface(struct brw_context *brw,
+ struct brw_surface *surface,
+ struct brw_winsys_buffer **bo_out)
+{
+ struct brw_surf_ss0 blend_ss0 = brw->curr.blend->ss0;
+ struct brw_surface_state ss;
+ struct brw_winsys_reloc reloc[1];
+ enum pipe_error ret;
+
+ /* XXX: we will only be rendering to this surface:
+ */
+ make_reloc(&reloc[0],
+ BRW_USAGE_RENDER_TARGET,
+ 0,
+ offsetof(struct brw_surface_state, ss1),
+ surface->bo);
+
+ /* Surfaces are potentially shared between contexts, so can't
+ * scribble the in-place ss0 value in the surface.
+ */
+ memcpy(&ss, &surface->ss, sizeof ss);
+
+ ss.ss0.color_blend = blend_ss0.color_blend;
+ ss.ss0.writedisable_blue = blend_ss0.writedisable_blue;
+ ss.ss0.writedisable_green = blend_ss0.writedisable_green;
+ ss.ss0.writedisable_red = blend_ss0.writedisable_red;
+ ss.ss0.writedisable_alpha = blend_ss0.writedisable_alpha;
+
+ if (brw_search_cache(&brw->surface_cache,
+ BRW_SS_SURFACE,
+ &ss, sizeof(ss),
+ reloc, Elements(reloc),
+ NULL,
+ bo_out))
+ return PIPE_OK;
+
+ ret = brw_upload_cache(&brw->surface_cache,
+ BRW_SS_SURFACE,
+ &ss, sizeof ss,
+ reloc, Elements(reloc),
+ &ss, sizeof ss,
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+
+/**
+ * Constructs the binding table for the WM surface state, which maps unit
+ * numbers to surface state objects.
+ */
+static enum pipe_error
+brw_wm_get_binding_table(struct brw_context *brw,
+ struct brw_winsys_buffer **bo_out )
+{
+ enum pipe_error ret;
+ struct brw_winsys_reloc reloc[BRW_WM_MAX_SURF];
+ uint32_t data[BRW_WM_MAX_SURF];
+ GLuint nr_relocs = 0;
+ GLuint data_size = brw->wm.nr_surfaces * sizeof data[0];
+ int i;
+
+ assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+ assert(brw->wm.nr_surfaces > 0);
+
+ /* Emit binding table relocations to surface state
+ */
+ for (i = 0; i < brw->wm.nr_surfaces; i++) {
+ if (brw->wm.surf_bo[i]) {
+ make_reloc(&reloc[nr_relocs++],
+ BRW_USAGE_STATE,
+ 0,
+ i * sizeof(GLuint),
+ brw->wm.surf_bo[i]);
+ }
+ }
+
+ /* Note there is no key for this search beyond the values in the
+ * relocation array:
+ */
+ if (brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+ NULL, 0,
+ reloc, nr_relocs,
+ NULL,
+ bo_out))
+ return PIPE_OK;
+
+ /* Upload zero data; it will all be overwritten with relocation
+ * offsets:
+ */
+ for (i = 0; i < brw->wm.nr_surfaces; i++)
+ data[i] = 0;
+
+ ret = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+ NULL, 0,
+ reloc, nr_relocs,
+ data, data_size,
+ NULL, NULL,
+ bo_out);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
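+
+/* Sketch of what the binding table built above amounts to: one 32-bit
+ * entry per surface slot, uploaded as zeros and then, presumably,
+ * patched by the winsys when it applies the relocations, so that
+ * entry i ends up holding the offset of surf_bo[i]'s surface state.
+ * The function below only mirrors the zero-fill step and is an
+ * example, not driver code.
+ */
+static void
+example_init_binding_table(uint32_t *table, unsigned nr_surfaces)
+{
+   unsigned i;
+   for (i = 0; i < nr_surfaces; i++)
+      table[i] = 0;   /* placeholder; the relocation supplies the offset */
+}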
+
+static enum pipe_error prepare_wm_surfaces(struct brw_context *brw )
+{
+ enum pipe_error ret;
+ int nr_surfaces = 0;
+ GLuint i;
+
+ /* PIPE_NEW_COLOR_BUFFERS | PIPE_NEW_BLEND
+ *
+ * Update surfaces for drawing buffers. Mixes in colormask and
+ * blend state.
+ *
+ * XXX: no color buffer case
+ */
+ for (i = 0; i < brw->curr.fb.nr_cbufs; i++) {
+ ret = brw_update_render_surface(brw,
+ brw_surface(brw->curr.fb.cbufs[i]),
+ &brw->wm.surf_bo[BTI_COLOR_BUF(i)]);
+ if (ret)
+ return ret;
+
+ nr_surfaces = BTI_COLOR_BUF(i) + 1;
+ }
+
+
+
+ /* PIPE_NEW_FRAGMENT_CONSTANTS
+ */
+#if 0
+ if (brw->curr.fragment_constants) {
+ ret = brw_update_fragment_constant_surface(
+ brw,
+ brw->curr.fragment_constants,
+ &brw->wm.surf_bo[BTI_FRAGMENT_CONSTANTS]);
+
+ if (ret)
+ return ret;
+
+ nr_surfaces = BTI_FRAGMENT_CONSTANTS + 1;
+ }
+ else {
+ bo_reference(&brw->wm.surf_bo[SURF_FRAG_CONSTANTS], NULL);
+ }
+#endif
+
+
+ /* PIPE_NEW_TEXTURE
+ */
+ for (i = 0; i < brw->curr.num_textures; i++) {
+ ret = brw_update_texture_surface(brw,
+ brw_texture(brw->curr.texture[i]),
+ &brw->wm.surf_bo[BTI_TEXTURE(i)]);
+ if (ret)
+ return ret;
+
+ nr_surfaces = BTI_TEXTURE(i) + 1;
+ }
+
+ /* Clear any inactive entries:
+ */
+ for (i = brw->curr.fb.nr_cbufs; i < BRW_MAX_DRAW_BUFFERS; i++)
+ bo_reference(&brw->wm.surf_bo[BTI_COLOR_BUF(i)], NULL);
+
+ if (!brw->curr.fragment_constants)
+ bo_reference(&brw->wm.surf_bo[BTI_FRAGMENT_CONSTANTS], NULL);
+
+ /* XXX: no pipe_max_textures define?? */
+ for (i = brw->curr.num_textures; i < PIPE_MAX_SAMPLERS; i++)
+ bo_reference(&brw->wm.surf_bo[BTI_TEXTURE(i)], NULL);
+
+ if (brw->wm.nr_surfaces != nr_surfaces) {
+ brw->wm.nr_surfaces = nr_surfaces;
+ brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
+ }
+
+ ret = brw_wm_get_binding_table(brw, &brw->wm.bind_bo);
+ if (ret)
+ return ret;
+
+ return PIPE_OK;
+}
+
+const struct brw_tracked_state brw_wm_surfaces = {
+ .dirty = {
+ .mesa = (PIPE_NEW_COLOR_BUFFERS |
+ PIPE_NEW_BOUND_TEXTURES |
+ PIPE_NEW_FRAGMENT_CONSTANTS |
+ PIPE_NEW_BLEND),
+ .brw = (BRW_NEW_CONTEXT |
+ BRW_NEW_WM_SURFACES),
+ .cache = 0
+ },
+ .prepare = prepare_wm_surfaces,
+};
+
+
+
diff --git a/src/gallium/drivers/i965/intel_decode.c b/src/gallium/drivers/i965/intel_decode.c
new file mode 100644
index 00000000000..3166958bad3
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_decode.c
@@ -0,0 +1,1790 @@
+/* -*- c-basic-offset: 4 -*- */
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_decode.c
+ * This file contains code to print out batchbuffer contents in a
+ * human-readable format.
+ *
+ * The current version only supports i915 packets, and only pretty-prints a
+ * subset of them. The intention is for it to make just a best attempt to
+ * decode, but never crash in the process.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "intel_decode.h"
+
+/*#include "intel_chipset.h"*/
+#define IS_965(x) 1 /* XXX */
+#define IS_9XX(x) 1 /* XXX */
+
+#define BUFFER_FAIL(_count, _len, _name) do { \
+ fprintf(out, "Buffer size too small in %s (%d < %d)\n", \
+ (_name), (_count), (_len)); \
+ (*failures)++; \
+ return count; \
+} while (0)
+
+static FILE *out;
+static uint32_t saved_s2 = 0, saved_s4 = 0;
+static char saved_s2_set = 0, saved_s4_set = 0;
+
+static float
+int_as_float(uint32_t intval)
+{
+ union intfloat {
+ uint32_t i;
+ float f;
+ } uval;
+
+ uval.i = intval;
+ return uval.f;
+}
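+
+/* For reading the decoded output below, some IEEE-754 reference
+ * points: int_as_float(0x00000000) == 0.0f,
+ * int_as_float(0x3f800000) == 1.0f and
+ * int_as_float(0xbf800000) == -1.0f.
+ */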
+
+static void
+instr_out(const uint32_t *data, uint32_t hw_offset, unsigned int index,
+ char *fmt, ...)
+{
+ va_list va;
+
+ fprintf(out, "0x%08x: 0x%08x:%s ", hw_offset + index * 4, data[index],
+ index == 0 ? "" : " ");
+ va_start(va, fmt);
+ vfprintf(out, fmt, va);
+ va_end(va);
+}
+
+
+static int
+decode_mi(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ unsigned int opcode;
+
+ struct {
+ uint32_t opcode;
+ int len_mask;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_mi[] = {
+ { 0x08, 0, 1, 1, "MI_ARB_ON_OFF" },
+ { 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" },
+ { 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" },
+ { 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" },
+ { 0x04, 0, 1, 1, "MI_FLUSH" },
+ { 0x22, 0, 3, 3, "MI_LOAD_REGISTER_IMM" },
+ { 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" },
+ { 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" },
+ { 0x00, 0, 1, 1, "MI_NOOP" },
+ { 0x11, 0x3f, 2, 2, "MI_OVERLAY_FLIP" },
+ { 0x07, 0, 1, 1, "MI_REPORT_HEAD" },
+ { 0x18, 0x3f, 2, 2, "MI_SET_CONTEXT" },
+ { 0x20, 0x3f, 3, 4, "MI_STORE_DATA_IMM" },
+ { 0x21, 0x3f, 3, 4, "MI_STORE_DATA_INDEX" },
+ { 0x24, 0x3f, 3, 3, "MI_STORE_REGISTER_MEM" },
+ { 0x02, 0, 1, 1, "MI_USER_INTERRUPT" },
+ { 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" },
+ };
+
+
+ for (opcode = 0; opcode < sizeof(opcodes_mi) / sizeof(opcodes_mi[0]);
+ opcode++) {
+ if ((data[0] & 0x1f800000) >> 23 == opcodes_mi[opcode].opcode) {
+ unsigned int len = 1, i;
+
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_mi[opcode].name);
+ if (opcodes_mi[opcode].max_len > 1) {
+ len = (data[0] & opcodes_mi[opcode].len_mask) + 2;
+ if (len < opcodes_mi[opcode].min_len ||
+ len > opcodes_mi[opcode].max_len)
+ {
+ fprintf(out, "Bad length (%d) in %s, [%d, %d]\n",
+ len, opcodes_mi[opcode].name,
+ opcodes_mi[opcode].min_len,
+ opcodes_mi[opcode].max_len);
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_mi[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "MI UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
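+
+/* Note on the decode above: an MI command is identified by bits 28:23
+ * of the header dword ((data[0] & 0x1f800000) >> 23), and multi-dword
+ * packets derive their length as (data[0] & len_mask) + 2. For
+ * example, an MI_STORE_DATA_IMM header whose low six bits are 2
+ * decodes as a 4-dword packet, within the [3, 4] range in the table.
+ */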
+
+static int
+decode_2d(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ unsigned int opcode, len;
+ char *format = NULL;
+
+ struct {
+ uint32_t opcode;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_2d[] = {
+ { 0x40, 5, 5, "COLOR_BLT" },
+ { 0x43, 6, 6, "SRC_COPY_BLT" },
+ { 0x01, 8, 8, "XY_SETUP_BLT" },
+ { 0x11, 9, 9, "XY_SETUP_MONO_PATTERN_SL_BLT" },
+ { 0x03, 3, 3, "XY_SETUP_CLIP_BLT" },
+ { 0x24, 2, 2, "XY_PIXEL_BLT" },
+ { 0x25, 3, 3, "XY_SCANLINES_BLT" },
+ { 0x26, 4, 4, "XY_TEXT_BLT" },
+ { 0x31, 5, 134, "XY_TEXT_IMMEDIATE_BLT" },
+ { 0x50, 6, 6, "XY_COLOR_BLT" },
+ { 0x51, 6, 6, "XY_PAT_BLT" },
+ { 0x76, 8, 8, "XY_PAT_CHROMA_BLT" },
+ { 0x72, 7, 135, "XY_PAT_BLT_IMMEDIATE" },
+ { 0x77, 9, 137, "XY_PAT_CHROMA_BLT_IMMEDIATE" },
+ { 0x52, 9, 9, "XY_MONO_PAT_BLT" },
+ { 0x59, 7, 7, "XY_MONO_PAT_FIXED_BLT" },
+ { 0x53, 8, 8, "XY_SRC_COPY_BLT" },
+ { 0x54, 8, 8, "XY_MONO_SRC_COPY_BLT" },
+ { 0x71, 9, 137, "XY_MONO_SRC_COPY_IMMEDIATE_BLT" },
+ { 0x55, 9, 9, "XY_FULL_BLT" },
+ { 0x55, 9, 137, "XY_FULL_IMMEDIATE_PATTERN_BLT" },
+ { 0x56, 9, 9, "XY_FULL_MONO_SRC_BLT" },
+ { 0x75, 10, 138, "XY_FULL_MONO_SRC_IMMEDIATE_PATTERN_BLT" },
+ { 0x57, 12, 12, "XY_FULL_MONO_PATTERN_BLT" },
+ { 0x58, 12, 12, "XY_FULL_MONO_PATTERN_MONO_SRC_BLT" },
+ };
+
+ switch ((data[0] & 0x1fc00000) >> 22) {
+ case 0x50:
+ instr_out(data, hw_offset, 0,
+ "XY_COLOR_BLT (rgb %sabled, alpha %sabled, dst tile %d)\n",
+ (data[0] & (1 << 20)) ? "en" : "dis",
+ (data[0] & (1 << 21)) ? "en" : "dis",
+ (data[0] >> 11) & 1);
+
+ len = (data[0] & 0x000000ff) + 2;
+ if (len != 6)
+ fprintf(out, "Bad count in XY_COLOR_BLT\n");
+ if (count < 6)
+ BUFFER_FAIL(count, len, "XY_COLOR_BLT");
+
+ switch ((data[1] >> 24) & 0x3) {
+ case 0:
+ format="8";
+ break;
+ case 1:
+ format="565";
+ break;
+ case 2:
+ format="1555";
+ break;
+ case 3:
+ format="8888";
+ break;
+ }
+
+ instr_out(data, hw_offset, 1, "format %s, pitch %d, "
+ "clipping %sabled\n", format,
+ (short)(data[1] & 0xffff),
+ data[1] & (1 << 30) ? "en" : "dis");
+ instr_out(data, hw_offset, 2, "(%d,%d)\n",
+ data[2] & 0xffff, data[2] >> 16);
+ instr_out(data, hw_offset, 3, "(%d,%d)\n",
+ data[3] & 0xffff, data[3] >> 16);
+ instr_out(data, hw_offset, 4, "offset 0x%08x\n", data[4]);
+ instr_out(data, hw_offset, 5, "color\n");
+ return len;
+ case 0x53:
+ instr_out(data, hw_offset, 0,
+ "XY_SRC_COPY_BLT (rgb %sabled, alpha %sabled, "
+ "src tile %d, dst tile %d)\n",
+ (data[0] & (1 << 20)) ? "en" : "dis",
+ (data[0] & (1 << 21)) ? "en" : "dis",
+ (data[0] >> 15) & 1,
+ (data[0] >> 11) & 1);
+
+ len = (data[0] & 0x000000ff) + 2;
+ if (len != 8)
+ fprintf(out, "Bad count in XY_SRC_COPY_BLT\n");
+ if (count < 8)
+ BUFFER_FAIL(count, len, "XY_SRC_COPY_BLT");
+
+ switch ((data[1] >> 24) & 0x3) {
+ case 0:
+ format="8";
+ break;
+ case 1:
+ format="565";
+ break;
+ case 2:
+ format="1555";
+ break;
+ case 3:
+ format="8888";
+ break;
+ }
+
+ instr_out(data, hw_offset, 1, "format %s, dst pitch %d, "
+ "clipping %sabled\n", format,
+ (short)(data[1] & 0xffff),
+ data[1] & (1 << 30) ? "en" : "dis");
+ instr_out(data, hw_offset, 2, "dst (%d,%d)\n",
+ data[2] & 0xffff, data[2] >> 16);
+ instr_out(data, hw_offset, 3, "dst (%d,%d)\n",
+ data[3] & 0xffff, data[3] >> 16);
+ instr_out(data, hw_offset, 4, "dst offset 0x%08x\n", data[4]);
+ instr_out(data, hw_offset, 5, "src (%d,%d)\n",
+ data[5] & 0xffff, data[5] >> 16);
+ instr_out(data, hw_offset, 6, "src pitch %d\n",
+ (short)(data[6] & 0xffff));
+ instr_out(data, hw_offset, 7, "src offset 0x%08x\n", data[7]);
+ return len;
+ }
+
+ for (opcode = 0; opcode < sizeof(opcodes_2d) / sizeof(opcodes_2d[0]);
+ opcode++) {
+ if ((data[0] & 0x1fc00000) >> 22 == opcodes_2d[opcode].opcode) {
+ unsigned int i;
+
+ len = 1;
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_2d[opcode].name);
+ if (opcodes_2d[opcode].max_len > 1) {
+ len = (data[0] & 0x000000ff) + 2;
+ if (len < opcodes_2d[opcode].min_len ||
+ len > opcodes_2d[opcode].max_len)
+ {
+ fprintf(out, "Bad count in %s\n", opcodes_2d[opcode].name);
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_2d[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "2D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+static int
+decode_3d_1c(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ switch ((data[0] & 0x00f80000) >> 19) {
+ case 0x11:
+ instr_out(data, hw_offset, 0, "3DSTATE_DEPTH_SUBRECTANGLE_DISALBE\n");
+ return 1;
+ case 0x10:
+ instr_out(data, hw_offset, 0, "3DSTATE_SCISSOR_ENABLE\n");
+ return 1;
+ case 0x01:
+ instr_out(data, hw_offset, 0, "3DSTATE_MAP_COORD_SET_I830\n");
+ return 1;
+ case 0x0a:
+ instr_out(data, hw_offset, 0, "3DSTATE_MAP_CUBE_I830\n");
+ return 1;
+ case 0x05:
+ instr_out(data, hw_offset, 0, "3DSTATE_MAP_TEX_STREAM_I830\n");
+ return 1;
+ }
+
+ instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+/** Sets the string dstname to describe the destination of the PS instruction */
+static void
+i915_get_instruction_dst(const uint32_t *data, int i, char *dstname, int do_mask)
+{
+ uint32_t a0 = data[i];
+ int dst_nr = (a0 >> 14) & 0xf;
+ char dstmask[8];
+ char *sat;
+
+ if (do_mask) {
+ if (((a0 >> 10) & 0xf) == 0xf) {
+ dstmask[0] = 0;
+ } else {
+ int dstmask_index = 0;
+
+ dstmask[dstmask_index++] = '.';
+ if (a0 & (1 << 10))
+ dstmask[dstmask_index++] = 'x';
+ if (a0 & (1 << 11))
+ dstmask[dstmask_index++] = 'y';
+ if (a0 & (1 << 12))
+ dstmask[dstmask_index++] = 'z';
+ if (a0 & (1 << 13))
+ dstmask[dstmask_index++] = 'w';
+ dstmask[dstmask_index++] = 0;
+ }
+
+ if (a0 & (1 << 22))
+ sat = ".sat";
+ else
+ sat = "";
+ } else {
+ dstmask[0] = 0;
+ sat = "";
+ }
+
+ switch ((a0 >> 19) & 0x7) {
+ case 0:
+ if (dst_nr > 15)
+ fprintf(out, "bad destination reg R%d\n", dst_nr);
+ sprintf(dstname, "R%d%s%s", dst_nr, dstmask, sat);
+ break;
+ case 4:
+ if (dst_nr > 0)
+ fprintf(out, "bad destination reg oC%d\n", dst_nr);
+ sprintf(dstname, "oC%s%s", dstmask, sat);
+ break;
+ case 5:
+ if (dst_nr > 0)
+ fprintf(out, "bad destination reg oD%d\n", dst_nr);
+ sprintf(dstname, "oD%s%s", dstmask, sat);
+ break;
+ case 6:
+ if (dst_nr > 2)
+ fprintf(out, "bad destination reg U%d\n", dst_nr);
+ sprintf(dstname, "U%d%s%s", dst_nr, dstmask, sat);
+ break;
+ default:
+ sprintf(dstname, "RESERVED");
+ break;
+ }
+}
+
+static char *
+i915_get_channel_swizzle(uint32_t select)
+{
+ switch (select & 0x7) {
+ case 0:
+ return (select & 8) ? "-x" : "x";
+ case 1:
+ return (select & 8) ? "-y" : "y";
+ case 2:
+ return (select & 8) ? "-z" : "z";
+ case 3:
+ return (select & 8) ? "-w" : "w";
+ case 4:
+ return (select & 8) ? "-0" : "0";
+ case 5:
+ return (select & 8) ? "-1" : "1";
+ default:
+ return (select & 8) ? "-bad" : "bad";
+ }
+}
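+
+/* In the swizzle encodings handled above, the low three bits of each
+ * per-channel selector pick the source (x, y, z, w, the constant 0 or
+ * the constant 1) and bit 3 negates it, so 0x1 reads back as "y" and
+ * 0x9 as "-y".
+ */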
+
+static void
+i915_get_instruction_src_name(uint32_t src_type, uint32_t src_nr, char *name)
+{
+ switch (src_type) {
+ case 0:
+ sprintf(name, "R%d", src_nr);
+ if (src_nr > 15)
+ fprintf(out, "bad src reg %s\n", name);
+ break;
+ case 1:
+ if (src_nr < 8)
+ sprintf(name, "T%d", src_nr);
+ else if (src_nr == 8)
+ sprintf(name, "DIFFUSE");
+ else if (src_nr == 9)
+ sprintf(name, "SPECULAR");
+ else if (src_nr == 10)
+ sprintf(name, "FOG");
+ else {
+ fprintf(out, "bad src reg T%d\n", src_nr);
+ sprintf(name, "RESERVED");
+ }
+ break;
+ case 2:
+ sprintf(name, "C%d", src_nr);
+ if (src_nr > 31)
+ fprintf(out, "bad src reg %s\n", name);
+ break;
+ case 4:
+ sprintf(name, "oC");
+ if (src_nr > 0)
+ fprintf(out, "bad src reg oC%d\n", src_nr);
+ break;
+ case 5:
+ sprintf(name, "oD");
+ if (src_nr > 0)
+ fprintf(out, "bad src reg oD%d\n", src_nr);
+ break;
+ case 6:
+ sprintf(name, "U%d", src_nr);
+ if (src_nr > 2)
+ fprintf(out, "bad src reg %s\n", name);
+ break;
+ default:
+ fprintf(out, "bad src reg type %d\n", src_type);
+ sprintf(name, "RESERVED");
+ break;
+ }
+}
+
+static void
+i915_get_instruction_src0(const uint32_t *data, int i, char *srcname)
+{
+ uint32_t a0 = data[i];
+ uint32_t a1 = data[i + 1];
+ int src_nr = (a0 >> 2) & 0x1f;
+ char *swizzle_x = i915_get_channel_swizzle((a1 >> 28) & 0xf);
+ char *swizzle_y = i915_get_channel_swizzle((a1 >> 24) & 0xf);
+ char *swizzle_z = i915_get_channel_swizzle((a1 >> 20) & 0xf);
+ char *swizzle_w = i915_get_channel_swizzle((a1 >> 16) & 0xf);
+ char swizzle[100];
+
+ i915_get_instruction_src_name((a0 >> 7) & 0x7, src_nr, srcname);
+ sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w);
+ if (strcmp(swizzle, ".xyzw") != 0)
+ strcat(srcname, swizzle);
+}
+
+static void
+i915_get_instruction_src1(const uint32_t *data, int i, char *srcname)
+{
+ uint32_t a1 = data[i + 1];
+ uint32_t a2 = data[i + 2];
+ int src_nr = (a1 >> 8) & 0x1f;
+ char *swizzle_x = i915_get_channel_swizzle((a1 >> 4) & 0xf);
+ char *swizzle_y = i915_get_channel_swizzle((a1 >> 0) & 0xf);
+ char *swizzle_z = i915_get_channel_swizzle((a2 >> 28) & 0xf);
+ char *swizzle_w = i915_get_channel_swizzle((a2 >> 24) & 0xf);
+ char swizzle[100];
+
+ i915_get_instruction_src_name((a1 >> 13) & 0x7, src_nr, srcname);
+ sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w);
+ if (strcmp(swizzle, ".xyzw") != 0)
+ strcat(srcname, swizzle);
+}
+
+static void
+i915_get_instruction_src2(const uint32_t *data, int i, char *srcname)
+{
+ uint32_t a2 = data[i + 2];
+ int src_nr = (a2 >> 16) & 0x1f;
+ char *swizzle_x = i915_get_channel_swizzle((a2 >> 12) & 0xf);
+ char *swizzle_y = i915_get_channel_swizzle((a2 >> 8) & 0xf);
+ char *swizzle_z = i915_get_channel_swizzle((a2 >> 4) & 0xf);
+ char *swizzle_w = i915_get_channel_swizzle((a2 >> 0) & 0xf);
+ char swizzle[100];
+
+ i915_get_instruction_src_name((a2 >> 21) & 0x7, src_nr, srcname);
+ sprintf(swizzle, ".%s%s%s%s", swizzle_x, swizzle_y, swizzle_z, swizzle_w);
+ if (strcmp(swizzle, ".xyzw") != 0)
+ strcat(srcname, swizzle);
+}
+
+static void
+i915_get_instruction_addr(uint32_t src_type, uint32_t src_nr, char *name)
+{
+ switch (src_type) {
+ case 0:
+ sprintf(name, "R%d", src_nr);
+ if (src_nr > 15)
+ fprintf(out, "bad src reg %s\n", name);
+ break;
+ case 1:
+ if (src_nr < 8)
+ sprintf(name, "T%d", src_nr);
+ else if (src_nr == 8)
+ sprintf(name, "DIFFUSE");
+ else if (src_nr == 9)
+ sprintf(name, "SPECULAR");
+ else if (src_nr == 10)
+ sprintf(name, "FOG");
+ else {
+ fprintf(out, "bad src reg T%d\n", src_nr);
+ sprintf(name, "RESERVED");
+ }
+ break;
+ case 4:
+ sprintf(name, "oC");
+ if (src_nr > 0)
+ fprintf(out, "bad src reg oC%d\n", src_nr);
+ break;
+ case 5:
+ sprintf(name, "oD");
+ if (src_nr > 0)
+ fprintf(out, "bad src reg oD%d\n", src_nr);
+ break;
+ default:
+ fprintf(out, "bad src reg type %d\n", src_type);
+ sprintf(name, "RESERVED");
+ break;
+ }
+}
+
+static void
+i915_decode_alu1(const uint32_t *data, uint32_t hw_offset,
+ int i, char *instr_prefix, char *op_name)
+{
+ char dst[100], src0[100];
+
+ i915_get_instruction_dst(data, i, dst, 1);
+ i915_get_instruction_src0(data, i, src0);
+
+ instr_out(data, hw_offset, i++, "%s: %s %s, %s\n", instr_prefix,
+ op_name, dst, src0);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+}
+
+static void
+i915_decode_alu2(const uint32_t *data, uint32_t hw_offset,
+ int i, char *instr_prefix, char *op_name)
+{
+ char dst[100], src0[100], src1[100];
+
+ i915_get_instruction_dst(data, i, dst, 1);
+ i915_get_instruction_src0(data, i, src0);
+ i915_get_instruction_src1(data, i, src1);
+
+ instr_out(data, hw_offset, i++, "%s: %s %s, %s, %s\n", instr_prefix,
+ op_name, dst, src0, src1);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+}
+
+static void
+i915_decode_alu3(const uint32_t *data, uint32_t hw_offset,
+ int i, char *instr_prefix, char *op_name)
+{
+ char dst[100], src0[100], src1[100], src2[100];
+
+ i915_get_instruction_dst(data, i, dst, 1);
+ i915_get_instruction_src0(data, i, src0);
+ i915_get_instruction_src1(data, i, src1);
+ i915_get_instruction_src2(data, i, src2);
+
+ instr_out(data, hw_offset, i++, "%s: %s %s, %s, %s, %s\n", instr_prefix,
+ op_name, dst, src0, src1, src2);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+}
+
+static void
+i915_decode_tex(const uint32_t *data, uint32_t hw_offset, int i, char *instr_prefix,
+ char *tex_name)
+{
+ uint32_t t0 = data[i];
+ uint32_t t1 = data[i + 1];
+ char dst_name[100];
+ char addr_name[100];
+ int sampler_nr;
+
+ i915_get_instruction_dst(data, i, dst_name, 0);
+ i915_get_instruction_addr((t1 >> 24) & 0x7,
+ (t1 >> 17) & 0xf,
+ addr_name);
+ sampler_nr = t0 & 0xf;
+
+ instr_out(data, hw_offset, i++, "%s: %s %s, S%d, %s\n", instr_prefix,
+ tex_name, dst_name, sampler_nr, addr_name);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+}
+
+static void
+i915_decode_dcl(const uint32_t *data, uint32_t hw_offset, int i, char *instr_prefix)
+{
+ uint32_t d0 = data[i];
+ char *sampletype;
+ int dcl_nr = (d0 >> 14) & 0xf;
+ char *dcl_x = d0 & (1 << 10) ? "x" : "";
+ char *dcl_y = d0 & (1 << 11) ? "y" : "";
+ char *dcl_z = d0 & (1 << 12) ? "z" : "";
+ char *dcl_w = d0 & (1 << 13) ? "w" : "";
+ char dcl_mask[10];
+
+ switch ((d0 >> 19) & 0x3) {
+ case 1:
+ sprintf(dcl_mask, ".%s%s%s%s", dcl_x, dcl_y, dcl_z, dcl_w);
+ if (strcmp(dcl_mask, ".") == 0)
+ fprintf(out, "bad (empty) dcl mask\n");
+
+ if (dcl_nr > 10)
+ fprintf(out, "bad T%d dcl register number\n", dcl_nr);
+ if (dcl_nr < 8) {
+ if (strcmp(dcl_mask, ".x") != 0 &&
+ strcmp(dcl_mask, ".xy") != 0 &&
+ strcmp(dcl_mask, ".xz") != 0 &&
+ strcmp(dcl_mask, ".w") != 0 &&
+ strcmp(dcl_mask, ".xyzw") != 0) {
+ fprintf(out, "bad T%d.%s dcl mask\n", dcl_nr, dcl_mask);
+ }
+ instr_out(data, hw_offset, i++, "%s: DCL T%d%s\n", instr_prefix,
+ dcl_nr, dcl_mask);
+ } else {
+ if (strcmp(dcl_mask, ".xz") == 0)
+ fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+ else if (strcmp(dcl_mask, ".xw") == 0)
+ fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+ else if (strcmp(dcl_mask, ".xzw") == 0)
+ fprintf(out, "errataed bad dcl mask %s\n", dcl_mask);
+
+ if (dcl_nr == 8) {
+ instr_out(data, hw_offset, i++, "%s: DCL DIFFUSE%s\n", instr_prefix,
+ dcl_mask);
+ } else if (dcl_nr == 9) {
+ instr_out(data, hw_offset, i++, "%s: DCL SPECULAR%s\n", instr_prefix,
+ dcl_mask);
+ } else if (dcl_nr == 10) {
+ instr_out(data, hw_offset, i++, "%s: DCL FOG%s\n", instr_prefix,
+ dcl_mask);
+ }
+ }
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ break;
+ case 3:
+ switch ((d0 >> 22) & 0x3) {
+ case 0:
+ sampletype = "2D";
+ break;
+ case 1:
+ sampletype = "CUBE";
+ break;
+ case 2:
+ sampletype = "3D";
+ break;
+ default:
+ sampletype = "RESERVED";
+ break;
+ }
+ if (dcl_nr > 15)
+ fprintf(out, "bad S%d dcl register number\n", dcl_nr);
+ instr_out(data, hw_offset, i++, "%s: DCL S%d %s\n", instr_prefix,
+ dcl_nr, sampletype);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ break;
+ default:
+ instr_out(data, hw_offset, i++, "%s: DCL RESERVED%d\n", instr_prefix, dcl_nr);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ }
+}
+
+static void
+i915_decode_instruction(const uint32_t *data, uint32_t hw_offset,
+ int i, char *instr_prefix)
+{
+ switch ((data[i] >> 24) & 0x1f) {
+ case 0x0:
+ instr_out(data, hw_offset, i++, "%s: NOP\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ break;
+ case 0x01:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "ADD");
+ break;
+ case 0x02:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "MOV");
+ break;
+ case 0x03:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "MUL");
+ break;
+ case 0x04:
+ i915_decode_alu3(data, hw_offset, i, instr_prefix, "MAD");
+ break;
+ case 0x05:
+ i915_decode_alu3(data, hw_offset, i, instr_prefix, "DP2ADD");
+ break;
+ case 0x06:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "DP3");
+ break;
+ case 0x07:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "DP4");
+ break;
+ case 0x08:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "FRC");
+ break;
+ case 0x09:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "RCP");
+ break;
+ case 0x0a:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "RSQ");
+ break;
+ case 0x0b:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "EXP");
+ break;
+ case 0x0c:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "LOG");
+ break;
+ case 0x0d:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "CMP");
+ break;
+ case 0x0e:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "MIN");
+ break;
+ case 0x0f:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "MAX");
+ break;
+ case 0x10:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "FLR");
+ break;
+ case 0x11:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "MOD");
+ break;
+ case 0x12:
+ i915_decode_alu1(data, hw_offset, i, instr_prefix, "TRC");
+ break;
+ case 0x13:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "SGE");
+ break;
+ case 0x14:
+ i915_decode_alu2(data, hw_offset, i, instr_prefix, "SLT");
+ break;
+ case 0x15:
+ i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLD");
+ break;
+ case 0x16:
+ i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLDP");
+ break;
+ case 0x17:
+ i915_decode_tex(data, hw_offset, i, instr_prefix, "TEXLDB");
+ break;
+ case 0x19:
+ i915_decode_dcl(data, hw_offset, i, instr_prefix);
+ break;
+ default:
+ instr_out(data, hw_offset, i++, "%s: unknown\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ instr_out(data, hw_offset, i++, "%s\n", instr_prefix);
+ break;
+ }
+}
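+
+/* Every i915 fragment-program instruction occupies three dwords, which
+ * is why the helpers above always emit three output lines per
+ * instruction and why decode_3d_1d() steps through
+ * 3DSTATE_PIXEL_SHADER_PROGRAM three dwords at a time.
+ */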
+
+static int
+decode_3d_1d(const uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
+{
+ unsigned int len, i, c, opcode, word, map, sampler, instr;
+ char *format;
+
+ struct {
+ uint32_t opcode;
+ int i830_only;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_3d_1d[] = {
+ { 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
+ { 0x86, 0, 4, 4, "3DSTATE_CHROMA_KEY" },
+ { 0x9c, 0, 1, 1, "3DSTATE_CLEAR_PARAMETERS" },
+ { 0x88, 0, 2, 2, "3DSTATE_CONSTANT_BLEND_COLOR" },
+ { 0x99, 0, 2, 2, "3DSTATE_DEFAULT_DIFFUSE" },
+ { 0x9a, 0, 2, 2, "3DSTATE_DEFAULT_SPECULAR" },
+ { 0x98, 0, 2, 2, "3DSTATE_DEFAULT_Z" },
+ { 0x97, 0, 2, 2, "3DSTATE_DEPTH_OFFSET_SCALE" },
+ { 0x85, 0, 2, 2, "3DSTATE_DEST_BUFFER_VARIABLES" },
+ { 0x80, 0, 5, 5, "3DSTATE_DRAWING_RECTANGLE" },
+ { 0x8e, 0, 3, 3, "3DSTATE_BUFFER_INFO" },
+ { 0x9d, 0, 65, 65, "3DSTATE_FILTER_COEFFICIENTS_4X4" },
+ { 0x9e, 0, 4, 4, "3DSTATE_MONO_FILTER" },
+ { 0x89, 0, 4, 4, "3DSTATE_FOG_MODE" },
+ { 0x8f, 0, 2, 16, "3DSTATE_MAP_PALLETE_LOAD_32" },
+ { 0x81, 0, 3, 3, "3DSTATE_SCISSOR_RECTANGLE" },
+ { 0x83, 0, 2, 2, "3DSTATE_SPAN_STIPPLE" },
+ { 0x8c, 1, 2, 2, "3DSTATE_MAP_COORD_TRANSFORM_I830" },
+ { 0x8b, 1, 2, 2, "3DSTATE_MAP_VERTEX_TRANSFORM_I830" },
+ { 0x8d, 1, 3, 3, "3DSTATE_W_STATE_I830" },
+ { 0x01, 1, 2, 2, "3DSTATE_COLOR_FACTOR_I830" },
+ { 0x02, 1, 2, 2, "3DSTATE_MAP_COORD_SETBIND_I830" },
+ };
+
+ switch ((data[0] & 0x00ff0000) >> 16) {
+ case 0x07:
+ /* This instruction is unusual. A 0 length means just 1 DWORD instead of
+ * 2. The 0 length is specified in one place to be unsupported, but
+ * stated to be required in another, and 0 length LOAD_INDIRECTs appear
+ * to cause no harm at least.
+ */
+ instr_out(data, hw_offset, 0, "3DSTATE_LOAD_INDIRECT\n");
+ len = (data[0] & 0x000000ff) + 1;
+ i = 1;
+ if (data[0] & (0x01 << 8)) {
+ if (i + 2 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "SIS.0\n");
+ instr_out(data, hw_offset, i++, "SIS.1\n");
+ }
+ if (data[0] & (0x02 << 8)) {
+ if (i + 1 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "DIS.0\n");
+ }
+ if (data[0] & (0x04 << 8)) {
+ if (i + 2 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "SSB.0\n");
+ instr_out(data, hw_offset, i++, "SSB.1\n");
+ }
+ if (data[0] & (0x08 << 8)) {
+ if (i + 2 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "MSB.0\n");
+ instr_out(data, hw_offset, i++, "MSB.1\n");
+ }
+ if (data[0] & (0x10 << 8)) {
+ if (i + 2 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "PSP.0\n");
+ instr_out(data, hw_offset, i++, "PSP.1\n");
+ }
+ if (data[0] & (0x20 << 8)) {
+ if (i + 2 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_INDIRECT");
+ instr_out(data, hw_offset, i++, "PSC.0\n");
+ instr_out(data, hw_offset, i++, "PSC.1\n");
+ }
+ if (len != i) {
+ fprintf(out, "Bad count in 3DSTATE_LOAD_INDIRECT\n");
+ (*failures)++;
+ return len;
+ }
+ return len;
+ case 0x04:
+ instr_out(data, hw_offset, 0, "3DSTATE_LOAD_STATE_IMMEDIATE_1\n");
+ len = (data[0] & 0x0000000f) + 2;
+ i = 1;
+ for (word = 0; word <= 7; word++) {
+ if (data[0] & (1 << (4 + word))) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_LOAD_STATE_IMMEDIATE_1");
+
+ /* save vertex state for decode */
+ if (word == 2) {
+ saved_s2_set = 1;
+ saved_s2 = data[i];
+ }
+ if (word == 4) {
+ saved_s4_set = 1;
+ saved_s4 = data[i];
+ }
+
+ instr_out(data, hw_offset, i++, "S%d\n", word);
+ }
+ }
+ if (len != i) {
+ fprintf(out, "Bad count in 3DSTATE_LOAD_INDIRECT\n");
+ (*failures)++;
+ }
+ return len;
+ case 0x00:
+ instr_out(data, hw_offset, 0, "3DSTATE_MAP_STATE\n");
+ len = (data[0] & 0x0000003f) + 2;
+ instr_out(data, hw_offset, 1, "mask\n");
+
+ i = 2;
+ for (map = 0; map <= 15; map++) {
+ if (data[1] & (1 << map)) {
+ if (i + 3 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_MAP_STATE");
+ instr_out(data, hw_offset, i++, "map %d MS2\n", map);
+ instr_out(data, hw_offset, i++, "map %d MS3\n", map);
+ instr_out(data, hw_offset, i++, "map %d MS4\n", map);
+ }
+ }
+ if (len != i) {
+ fprintf(out, "Bad count in 3DSTATE_MAP_STATE\n");
+ (*failures)++;
+ return len;
+ }
+ return len;
+ case 0x06:
+ instr_out(data, hw_offset, 0, "3DSTATE_PIXEL_SHADER_CONSTANTS\n");
+ len = (data[0] & 0x000000ff) + 2;
+
+ i = 2;
+ for (c = 0; c <= 31; c++) {
+ if (data[1] & (1 << c)) {
+ if (i + 4 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_PIXEL_SHADER_CONSTANTS");
+ instr_out(data, hw_offset, i, "C%d.X = %f\n",
+ c, int_as_float(data[i]));
+ i++;
+ instr_out(data, hw_offset, i, "C%d.Y = %f\n",
+ c, int_as_float(data[i]));
+ i++;
+ instr_out(data, hw_offset, i, "C%d.Z = %f\n",
+ c, int_as_float(data[i]));
+ i++;
+ instr_out(data, hw_offset, i, "C%d.W = %f\n",
+ c, int_as_float(data[i]));
+ i++;
+ }
+ }
+ if (len != i) {
+ fprintf(out, "Bad count in 3DSTATE_PIXEL_SHADER_CONSTANTS\n");
+ (*failures)++;
+ }
+ return len;
+ case 0x05:
+ instr_out(data, hw_offset, 0, "3DSTATE_PIXEL_SHADER_PROGRAM\n");
+ len = (data[0] & 0x000000ff) + 2;
+ if ((len - 1) % 3 != 0 || len > 370) {
+ fprintf(out, "Bad count in 3DSTATE_PIXEL_SHADER_PROGRAM\n");
+ (*failures)++;
+ }
+ i = 1;
+ for (instr = 0; instr < (len - 1) / 3; instr++) {
+ char instr_prefix[10];
+
+ if (i + 3 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_PIXEL_SHADER_PROGRAM");
+ sprintf(instr_prefix, "PS%03d", instr);
+ i915_decode_instruction(data, hw_offset, i, instr_prefix);
+ i += 3;
+ }
+ return len;
+ case 0x01:
+ if (i830)
+ break;
+ instr_out(data, hw_offset, 0, "3DSTATE_SAMPLER_STATE\n");
+ instr_out(data, hw_offset, 1, "mask\n");
+ len = (data[0] & 0x0000003f) + 2;
+ i = 2;
+ for (sampler = 0; sampler <= 15; sampler++) {
+ if (data[1] & (1 << sampler)) {
+ if (i + 3 >= count)
+ BUFFER_FAIL(count, len, "3DSTATE_SAMPLER_STATE");
+ instr_out(data, hw_offset, i++, "sampler %d SS2\n",
+ sampler);
+ instr_out(data, hw_offset, i++, "sampler %d SS3\n",
+ sampler);
+ instr_out(data, hw_offset, i++, "sampler %d SS4\n",
+ sampler);
+ }
+ }
+ if (len != i) {
+ fprintf(out, "Bad count in 3DSTATE_SAMPLER_STATE\n");
+ (*failures)++;
+ }
+ return len;
+ case 0x85:
+ len = (data[0] & 0x0000000f) + 2;
+
+ if (len != 2)
+ fprintf(out, "Bad count in 3DSTATE_DEST_BUFFER_VARIABLES\n");
+ if (count < 2)
+ BUFFER_FAIL(count, len, "3DSTATE_DEST_BUFFER_VARIABLES");
+
+ instr_out(data, hw_offset, 0,
+ "3DSTATE_DEST_BUFFER_VARIABLES\n");
+
+ switch ((data[1] >> 8) & 0xf) {
+ case 0x0: format = "g8"; break;
+ case 0x1: format = "x1r5g5b5"; break;
+ case 0x2: format = "r5g6b5"; break;
+ case 0x3: format = "a8r8g8b8"; break;
+ case 0x4: format = "ycrcb_swapy"; break;
+ case 0x5: format = "ycrcb_normal"; break;
+ case 0x6: format = "ycrcb_swapuv"; break;
+ case 0x7: format = "ycrcb_swapuvy"; break;
+ case 0x8: format = "a4r4g4b4"; break;
+ case 0x9: format = "a1r5g5b5"; break;
+ case 0xa: format = "a2r10g10b10"; break;
+ default: format = "BAD"; break;
+ }
+ instr_out(data, hw_offset, 1, "%s format, early Z %sabled\n",
+ format,
+ (data[1] & (1 << 31)) ? "en" : "dis");
+ return len;
+ }
+
+ for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
+ opcode++)
+ {
+ if (opcodes_3d_1d[opcode].i830_only && !i830)
+ continue;
+
+ if (((data[0] & 0x00ff0000) >> 16) == opcodes_3d_1d[opcode].opcode) {
+ len = 1;
+
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_3d_1d[opcode].name);
+ if (opcodes_3d_1d[opcode].max_len > 1) {
+ len = (data[0] & 0x0000ffff) + 2;
+ if (len < opcodes_3d_1d[opcode].min_len ||
+ len > opcodes_3d_1d[opcode].max_len)
+ {
+ fprintf(out, "Bad count in %s\n",
+ opcodes_3d_1d[opcode].name);
+ (*failures)++;
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_3d_1d[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+static int
+decode_3d_primitive(const uint32_t *data, int count, uint32_t hw_offset,
+ int *failures)
+{
+ char immediate = (data[0] & (1 << 23)) == 0;
+ unsigned int len, i;
+ char *primtype;
+
+ switch ((data[0] >> 18) & 0xf) {
+ case 0x0: primtype = "TRILIST"; break;
+ case 0x1: primtype = "TRISTRIP"; break;
+ case 0x2: primtype = "TRISTRIP_REVERSE"; break;
+ case 0x3: primtype = "TRIFAN"; break;
+ case 0x4: primtype = "POLYGON"; break;
+ case 0x5: primtype = "LINELIST"; break;
+ case 0x6: primtype = "LINESTRIP"; break;
+ case 0x7: primtype = "RECTLIST"; break;
+ case 0x8: primtype = "POINTLIST"; break;
+ case 0x9: primtype = "DIB"; break;
+ case 0xa: primtype = "CLEAR_RECT"; break;
+ default: primtype = "unknown"; break;
+ }
+
+ /* XXX: 3DPRIM_DIB not supported */
+ if (immediate) {
+ len = (data[0] & 0x0003ffff) + 2;
+ instr_out(data, hw_offset, 0, "3DPRIMITIVE inline %s\n", primtype);
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DPRIMITIVE inline");
+ if (!saved_s2_set || !saved_s4_set) {
+ fprintf(out, "unknown vertex format\n");
+ for (i = 1; i < len; i++) {
+ instr_out(data, hw_offset, i,
+ " vertex data (%f float)\n",
+ int_as_float(data[i]));
+ }
+ } else {
+ unsigned int vertex = 0;
+ for (i = 1; i < len;) {
+ unsigned int tc;
+
+#define VERTEX_OUT(fmt, ...) do { \
+ if (i < len) \
+ instr_out(data, hw_offset, i, " V%d."fmt"\n", vertex, __VA_ARGS__); \
+ else \
+ fprintf(out, " missing data in V%d\n", vertex); \
+ i++; \
+} while (0)
+
+ VERTEX_OUT("X = %f", int_as_float(data[i]));
+ VERTEX_OUT("Y = %f", int_as_float(data[i]));
+ switch (saved_s4 >> 6 & 0x7) {
+ case 0x1:
+ VERTEX_OUT("Z = %f", int_as_float(data[i]));
+ break;
+ case 0x2:
+ VERTEX_OUT("Z = %f", int_as_float(data[i]));
+ VERTEX_OUT("W = %f", int_as_float(data[i]));
+ break;
+ case 0x3:
+ break;
+ case 0x4:
+ VERTEX_OUT("W = %f", int_as_float(data[i]));
+ break;
+ default:
+ fprintf(out, "bad S4 position mask\n");
+ }
+
+ if (saved_s4 & (1 << 10)) {
+ VERTEX_OUT("color = (A=0x%02x, R=0x%02x, G=0x%02x, "
+ "B=0x%02x)",
+ data[i] >> 24,
+ (data[i] >> 16) & 0xff,
+ (data[i] >> 8) & 0xff,
+ data[i] & 0xff);
+ }
+ if (saved_s4 & (1 << 11)) {
+ VERTEX_OUT("spec = (A=0x%02x, R=0x%02x, G=0x%02x, "
+ "B=0x%02x)",
+ data[i] >> 24,
+ (data[i] >> 16) & 0xff,
+ (data[i] >> 8) & 0xff,
+ data[i] & 0xff);
+ }
+ if (saved_s4 & (1 << 12))
+ VERTEX_OUT("width = 0x%08x)", data[i]);
+
+ for (tc = 0; tc <= 7; tc++) {
+ switch ((saved_s2 >> (tc * 4)) & 0xf) {
+ case 0x0:
+ VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+ break;
+ case 0x1:
+ VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i]));
+ break;
+ case 0x2:
+ VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.Y = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.Z = %f", tc, int_as_float(data[i]));
+ VERTEX_OUT("T%d.W = %f", tc, int_as_float(data[i]));
+ break;
+ case 0x3:
+ VERTEX_OUT("T%d.X = %f", tc, int_as_float(data[i]));
+ break;
+ case 0x4:
+ VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]);
+ break;
+ case 0x5:
+ VERTEX_OUT("T%d.XY = 0x%08x half-float", tc, data[i]);
+ VERTEX_OUT("T%d.ZW = 0x%08x half-float", tc, data[i]);
+ break;
+ case 0xf:
+ break;
+ default:
+ fprintf(out, "bad S2.T%d format\n", tc);
+ }
+ }
+ vertex++;
+ }
+ }
+ } else {
+ /* indirect vertices */
+ len = data[0] & 0x0000ffff; /* index count */
+ if (data[0] & (1 << 17)) {
+ /* random vertex access */
+ if (count < (len + 1) / 2 + 1) {
+ BUFFER_FAIL(count, (len + 1) / 2 + 1,
+ "3DPRIMITIVE random indirect");
+ }
+ instr_out(data, hw_offset, 0,
+ "3DPRIMITIVE random indirect %s (%d)\n", primtype, len);
+ if (len == 0) {
+ /* vertex indices continue until 0xffff is found */
+ for (i = 1; i < count; i++) {
+ if ((data[i] & 0xffff) == 0xffff) {
+ instr_out(data, hw_offset, i,
+ " indices: (terminator)\n");
+ return i;
+ } else if ((data[i] >> 16) == 0xffff) {
+ instr_out(data, hw_offset, i,
+ " indices: 0x%04x, "
+ "(terminator)\n",
+ data[i] & 0xffff);
+ return i;
+ } else {
+ instr_out(data, hw_offset, i,
+ " indices: 0x%04x, 0x%04x\n",
+ data[i] & 0xffff, data[i] >> 16);
+ }
+ }
+ fprintf(out,
+ "3DPRIMITIVE: no terminator found in index buffer\n");
+ (*failures)++;
+ return count;
+ } else {
+ /* fixed size vertex index buffer */
+ for (i = 0; i < len; i += 2) {
+ if (i * 2 == len - 1) {
+ instr_out(data, hw_offset, i,
+ " indices: 0x%04x\n",
+ data[i] & 0xffff);
+ } else {
+ instr_out(data, hw_offset, i,
+ " indices: 0x%04x, 0x%04x\n",
+ data[i] & 0xffff, data[i] >> 16);
+ }
+ }
+ }
+ return (len + 1) / 2 + 1;
+ } else {
+ /* sequential vertex access */
+ if (count < 2)
+ BUFFER_FAIL(count, 2, "3DPRIMITIVE seq indirect");
+ instr_out(data, hw_offset, 0,
+ "3DPRIMITIVE sequential indirect %s, %d starting from "
+ "%d\n", primtype, len, data[1] & 0xffff);
+ instr_out(data, hw_offset, 1, " start\n");
+ return 2;
+ }
+ }
+
+ return len;
+}
+
+static int
+decode_3d(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ unsigned int opcode;
+
+ struct {
+ uint32_t opcode;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_3d[] = {
+ { 0x06, 1, 1, "3DSTATE_ANTI_ALIASING" },
+ { 0x08, 1, 1, "3DSTATE_BACKFACE_STENCIL_OPS" },
+ { 0x09, 1, 1, "3DSTATE_BACKFACE_STENCIL_MASKS" },
+ { 0x16, 1, 1, "3DSTATE_COORD_SET_BINDINGS" },
+ { 0x15, 1, 1, "3DSTATE_FOG_COLOR" },
+ { 0x0b, 1, 1, "3DSTATE_INDEPENDENT_ALPHA_BLEND" },
+ { 0x0d, 1, 1, "3DSTATE_MODES_4" },
+ { 0x0c, 1, 1, "3DSTATE_MODES_5" },
+ { 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" },
+ };
+
+ switch ((data[0] & 0x1f000000) >> 24) {
+ case 0x1f:
+ return decode_3d_primitive(data, count, hw_offset, failures);
+ case 0x1d:
+ return decode_3d_1d(data, count, hw_offset, failures, 0);
+ case 0x1c:
+ return decode_3d_1c(data, count, hw_offset, failures);
+ }
+
+ for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+ opcode++) {
+ if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) {
+ unsigned int len = 1, i;
+
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
+ if (opcodes_3d[opcode].max_len > 1) {
+ len = (data[0] & 0xff) + 2;
+ if (len < opcodes_3d[opcode].min_len ||
+ len > opcodes_3d[opcode].max_len)
+ {
+ fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+static const char *
+get_965_surfacetype(unsigned int surfacetype)
+{
+ switch (surfacetype) {
+ case 0: return "1D";
+ case 1: return "2D";
+ case 2: return "3D";
+ case 3: return "CUBE";
+ case 4: return "BUFFER";
+ case 7: return "NULL";
+ default: return "unknown";
+ }
+}
+
+static const char *
+get_965_depthformat(unsigned int depthformat)
+{
+ switch (depthformat) {
+ case 0: return "s8_z24float";
+ case 1: return "z32float";
+ case 2: return "z24s8";
+ case 5: return "z16";
+ default: return "unknown";
+ }
+}
+
+static const char *
+get_965_element_component(uint32_t data, int component)
+{
+ uint32_t component_control = (data >> (16 + (3 - component) * 4)) & 0x7;
+
+ switch (component_control) {
+ case 0:
+ return "nostore";
+ case 1:
+ switch (component) {
+ case 0: return "X";
+ case 1: return "Y";
+ case 2: return "Z";
+ case 3: return "W";
+ default: return "fail";
+ }
+ case 2:
+ return "0.0";
+ case 3:
+ return "1.0";
+ case 4:
+ return "0x1";
+ case 5:
+ return "VID";
+ default:
+ return "fail";
+ }
+}
+
+static const char *
+get_965_prim_type(uint32_t data)
+{
+ uint32_t primtype = (data >> 10) & 0x1f;
+
+ switch (primtype) {
+ case 0x01: return "point list";
+ case 0x02: return "line list";
+ case 0x03: return "line strip";
+ case 0x04: return "tri list";
+ case 0x05: return "tri strip";
+ case 0x06: return "tri fan";
+ case 0x07: return "quad list";
+ case 0x08: return "quad strip";
+ case 0x09: return "line list adj";
+ case 0x0a: return "line strip adj";
+ case 0x0b: return "tri list adj";
+ case 0x0c: return "tri strip adj";
+ case 0x0d: return "tri strip reverse";
+ case 0x0e: return "polygon";
+ case 0x0f: return "rect list";
+ case 0x10: return "line loop";
+ case 0x11: return "point list bf";
+ case 0x12: return "line strip cont";
+ case 0x13: return "line strip bf";
+ case 0x14: return "line strip cont bf";
+ case 0x15: return "tri fan no stipple";
+ default: return "fail";
+ }
+}
+
+static int
+decode_3d_965(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ unsigned int opcode, len;
+ int i;
+
+ struct {
+ uint32_t opcode;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_3d[] = {
+ { 0x6000, 3, 3, "URB_FENCE" },
+ { 0x6001, 2, 2, "CS_URB_STATE" },
+ { 0x6002, 2, 2, "CONSTANT_BUFFER" },
+ { 0x6101, 6, 6, "STATE_BASE_ADDRESS" },
+ { 0x6102, 2, 2, "STATE_SIP" },
+ { 0x6104, 1, 1, "3DSTATE_PIPELINE_SELECT" },
+ { 0x680b, 1, 1, "3DSTATE_VF_STATISTICS" },
+ { 0x6904, 1, 1, "3DSTATE_PIPELINE_SELECT" },
+ { 0x7800, 7, 7, "3DSTATE_PIPELINED_POINTERS" },
+ { 0x7801, 6, 6, "3DSTATE_BINDING_TABLE_POINTERS" },
+ { 0x780b, 1, 1, "3DSTATE_VF_STATISTICS" },
+ { 0x7808, 5, 257, "3DSTATE_VERTEX_BUFFERS" },
+ { 0x7809, 3, 256, "3DSTATE_VERTEX_ELEMENTS" },
+ { 0x780a, 3, 3, "3DSTATE_INDEX_BUFFER" },
+ { 0x7900, 4, 4, "3DSTATE_DRAWING_RECTANGLE" },
+ { 0x7901, 5, 5, "3DSTATE_CONSTANT_COLOR" },
+ { 0x7905, 5, 7, "3DSTATE_DEPTH_BUFFER" },
+ { 0x7906, 2, 2, "3DSTATE_POLY_STIPPLE_OFFSET" },
+ { 0x7907, 33, 33, "3DSTATE_POLY_STIPPLE_PATTERN" },
+ { 0x7908, 3, 3, "3DSTATE_LINE_STIPPLE" },
+ { 0x7909, 2, 2, "3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP" },
+ { 0x790a, 3, 3, "3DSTATE_AA_LINE_PARAMETERS" },
+ { 0x7b00, 6, 6, "3DPRIMITIVE" },
+ };
+
+ len = (data[0] & 0x0000ffff) + 2;
+
+ switch ((data[0] & 0xffff0000) >> 16) {
+ case 0x6101:
+ if (len != 6)
+ fprintf(out, "Bad count in STATE_BASE_ADDRESS\n");
+ if (count < 6)
+ BUFFER_FAIL(count, len, "STATE_BASE_ADDRESS");
+
+ instr_out(data, hw_offset, 0,
+ "STATE_BASE_ADDRESS\n");
+
+ if (data[1] & 1) {
+ instr_out(data, hw_offset, 1, "General state at 0x%08x\n",
+ data[1] & ~1);
+ } else
+ instr_out(data, hw_offset, 1, "General state not updated\n");
+
+ if (data[2] & 1) {
+ instr_out(data, hw_offset, 2, "Surface state at 0x%08x\n",
+ data[2] & ~1);
+ } else
+ instr_out(data, hw_offset, 2, "Surface state not updated\n");
+
+ if (data[3] & 1) {
+ instr_out(data, hw_offset, 3, "Indirect state at 0x%08x\n",
+ data[3] & ~1);
+ } else
+ instr_out(data, hw_offset, 3, "Indirect state not updated\n");
+
+ if (data[4] & 1) {
+ instr_out(data, hw_offset, 4, "General state upper bound 0x%08x\n",
+ data[4] & ~1);
+ } else
+ instr_out(data, hw_offset, 4, "General state not updated\n");
+
+ if (data[5] & 1) {
+ instr_out(data, hw_offset, 5, "Indirect state upper bound 0x%08x\n",
+ data[5] & ~1);
+ } else
+ instr_out(data, hw_offset, 5, "Indirect state not updated\n");
+
+ return len;
+ case 0x7800:
+ if (len != 7)
+ fprintf(out, "Bad count in 3DSTATE_PIPELINED_POINTERS\n");
+ if (count < 7)
+ BUFFER_FAIL(count, len, "3DSTATE_PIPELINED_POINTERS");
+
+ instr_out(data, hw_offset, 0,
+ "3DSTATE_PIPELINED_POINTERS\n");
+ instr_out(data, hw_offset, 1, "VS state\n");
+ instr_out(data, hw_offset, 2, "GS state\n");
+ instr_out(data, hw_offset, 3, "Clip state\n");
+ instr_out(data, hw_offset, 4, "SF state\n");
+ instr_out(data, hw_offset, 5, "WM state\n");
+ instr_out(data, hw_offset, 6, "CC state\n");
+ return len;
+ case 0x7801:
+ if (len != 6)
+ fprintf(out, "Bad count in 3DSTATE_BINDING_TABLE_POINTERS\n");
+ if (count < 6)
+ BUFFER_FAIL(count, len, "3DSTATE_BINDING_TABLE_POINTERS");
+
+ instr_out(data, hw_offset, 0,
+ "3DSTATE_BINDING_TABLE_POINTERS\n");
+ instr_out(data, hw_offset, 1, "VS binding table\n");
+ instr_out(data, hw_offset, 2, "GS binding table\n");
+ instr_out(data, hw_offset, 3, "Clip binding table\n");
+ instr_out(data, hw_offset, 4, "SF binding table\n");
+ instr_out(data, hw_offset, 5, "WM binding table\n");
+
+ return len;
+
+ case 0x7808:
+ len = (data[0] & 0xff) + 2;
+ if ((len - 1) % 4 != 0)
+ fprintf(out, "Bad count in 3DSTATE_VERTEX_BUFFERS\n");
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DSTATE_VERTEX_BUFFERS");
+ instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_BUFFERS\n");
+
+ for (i = 1; i < len;) {
+ instr_out(data, hw_offset, i, "buffer %d: %s, pitch %db\n",
+ data[i] >> 27,
+ data[i] & (1 << 26) ? "random" : "sequential",
+ data[i] & 0x07ff);
+ i++;
+ instr_out(data, hw_offset, i++, "buffer address\n");
+ instr_out(data, hw_offset, i++, "max index\n");
+ instr_out(data, hw_offset, i++, "mbz\n");
+ }
+ return len;
+
+ case 0x7809:
+ len = (data[0] & 0xff) + 2;
+ if ((len + 1) % 2 != 0)
+ fprintf(out, "Bad count in 3DSTATE_VERTEX_ELEMENTS\n");
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DSTATE_VERTEX_ELEMENTS");
+ instr_out(data, hw_offset, 0, "3DSTATE_VERTEX_ELEMENTS\n");
+
+ for (i = 1; i < len;) {
+ instr_out(data, hw_offset, i, "buffer %d: %svalid, type 0x%04x, "
+ "src offset 0x%04x bytes\n",
+ data[i] >> 27,
+ data[i] & (1 << 26) ? "" : "in",
+ (data[i] >> 16) & 0x1ff,
+ data[i] & 0x07ff);
+ i++;
+ instr_out(data, hw_offset, i, "(%s, %s, %s, %s), "
+ "dst offset 0x%02x bytes\n",
+ get_965_element_component(data[i], 0),
+ get_965_element_component(data[i], 1),
+ get_965_element_component(data[i], 2),
+ get_965_element_component(data[i], 3),
+ (data[i] & 0xff) * 4);
+ i++;
+ }
+ return len;
+
+ case 0x780a:
+ len = (data[0] & 0xff) + 2;
+ if (len != 3)
+ fprintf(out, "Bad count in 3DSTATE_INDEX_BUFFER\n");
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DSTATE_INDEX_BUFFER");
+ instr_out(data, hw_offset, 0, "3DSTATE_INDEX_BUFFER\n");
+ instr_out(data, hw_offset, 1, "beginning buffer address\n");
+ instr_out(data, hw_offset, 2, "ending buffer address\n");
+ return len;
+
+ case 0x7900:
+ if (len != 4)
+ fprintf(out, "Bad count in 3DSTATE_DRAWING_RECTANGLE\n");
+ if (count < 4)
+ BUFFER_FAIL(count, len, "3DSTATE_DRAWING_RECTANGLE");
+
+ instr_out(data, hw_offset, 0,
+ "3DSTATE_DRAWING_RECTANGLE\n");
+ instr_out(data, hw_offset, 1, "top left: %d,%d\n",
+ data[1] & 0xffff,
+ (data[1] >> 16) & 0xffff);
+ instr_out(data, hw_offset, 2, "bottom right: %d,%d\n",
+ data[2] & 0xffff,
+ (data[2] >> 16) & 0xffff);
+ instr_out(data, hw_offset, 3, "origin: %d,%d\n",
+ (int)data[3] & 0xffff,
+ ((int)data[3] >> 16) & 0xffff);
+
+ return len;
+
+ case 0x7905:
+ if (len != 5 && len != 6)
+ fprintf(out, "Bad count in 3DSTATE_DEPTH_BUFFER\n");
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DSTATE_DEPTH_BUFFER");
+
+ instr_out(data, hw_offset, 0,
+ "3DSTATE_DEPTH_BUFFER\n");
+ instr_out(data, hw_offset, 1, "%s, %s, pitch = %d bytes, %stiled\n",
+ get_965_surfacetype(data[1] >> 29),
+ get_965_depthformat((data[1] >> 18) & 0x7),
+ (data[1] & 0x0001ffff) + 1,
+ data[1] & (1 << 27) ? "" : "not ");
+ instr_out(data, hw_offset, 2, "depth offset\n");
+ instr_out(data, hw_offset, 3, "%dx%d\n",
+ ((data[3] & 0x0007ffc0) >> 6) + 1,
+ ((data[3] & 0xfff80000) >> 19) + 1);
+ instr_out(data, hw_offset, 4, "volume depth\n");
+ if (len == 6)
+ instr_out(data, hw_offset, 5, "\n");
+
+ return len;
+
+ case 0x7b00:
+ len = (data[0] & 0xff) + 2;
+ if (len != 6)
+ fprintf(out, "Bad count in 3DPRIMITIVE\n");
+ if (count < len)
+ BUFFER_FAIL(count, len, "3DPRIMITIVE");
+
+ instr_out(data, hw_offset, 0,
+ "3DPRIMITIVE: %s %s\n",
+ get_965_prim_type(data[0]),
+ (data[0] & (1 << 15)) ? "random" : "sequential");
+ instr_out(data, hw_offset, 1, "vertex count\n");
+ instr_out(data, hw_offset, 2, "start vertex\n");
+ instr_out(data, hw_offset, 3, "instance count\n");
+ instr_out(data, hw_offset, 4, "start instance\n");
+ instr_out(data, hw_offset, 5, "index bias\n");
+ return len;
+ }
+
+ for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+ opcode++) {
+ if ((data[0] & 0xffff0000) >> 16 == opcodes_3d[opcode].opcode) {
+ unsigned int i;
+ len = 1;
+
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
+ if (opcodes_3d[opcode].max_len > 1) {
+ len = (data[0] & 0xff) + 2;
+ if (len < opcodes_3d[opcode].min_len ||
+ len > opcodes_3d[opcode].max_len)
+ {
+ fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+static int
+decode_3d_i830(const uint32_t *data, int count, uint32_t hw_offset, int *failures)
+{
+ unsigned int opcode;
+
+ struct {
+ uint32_t opcode;
+ int min_len;
+ int max_len;
+ char *name;
+ } opcodes_3d[] = {
+ { 0x02, 1, 1, "3DSTATE_MODES_3" },
+ { 0x03, 1, 1, "3DSTATE_ENABLES_1"},
+ { 0x04, 1, 1, "3DSTATE_ENABLES_2"},
+ { 0x05, 1, 1, "3DSTATE_VFT0"},
+ { 0x06, 1, 1, "3DSTATE_AA"},
+ { 0x07, 1, 1, "3DSTATE_RASTERIZATION_RULES" },
+ { 0x08, 1, 1, "3DSTATE_MODES_1" },
+ { 0x09, 1, 1, "3DSTATE_STENCIL_TEST" },
+ { 0x0a, 1, 1, "3DSTATE_VFT1"},
+ { 0x0b, 1, 1, "3DSTATE_INDPT_ALPHA_BLEND" },
+ { 0x0c, 1, 1, "3DSTATE_MODES_5" },
+ { 0x0d, 1, 1, "3DSTATE_MAP_BLEND_OP" },
+ { 0x0e, 1, 1, "3DSTATE_MAP_BLEND_ARG" },
+ { 0x0f, 1, 1, "3DSTATE_MODES_2" },
+ { 0x15, 1, 1, "3DSTATE_FOG_COLOR" },
+ { 0x16, 1, 1, "3DSTATE_MODES_4" },
+ };
+
+ switch ((data[0] & 0x1f000000) >> 24) {
+ case 0x1f:
+ return decode_3d_primitive(data, count, hw_offset, failures);
+ case 0x1d:
+ return decode_3d_1d(data, count, hw_offset, failures, 1);
+ case 0x1c:
+ return decode_3d_1c(data, count, hw_offset, failures);
+ }
+
+ for (opcode = 0; opcode < sizeof(opcodes_3d) / sizeof(opcodes_3d[0]);
+ opcode++) {
+ if ((data[0] & 0x1f000000) >> 24 == opcodes_3d[opcode].opcode) {
+ unsigned int len = 1, i;
+
+ instr_out(data, hw_offset, 0, "%s\n", opcodes_3d[opcode].name);
+ if (opcodes_3d[opcode].max_len > 1) {
+ len = (data[0] & 0xff) + 2;
+ if (len < opcodes_3d[opcode].min_len ||
+ len > opcodes_3d[opcode].max_len)
+ {
+ fprintf(out, "Bad count in %s\n", opcodes_3d[opcode].name);
+ }
+ }
+
+ for (i = 1; i < len; i++) {
+ if (i >= count)
+ BUFFER_FAIL(count, len, opcodes_3d[opcode].name);
+ instr_out(data, hw_offset, i, "dword %d\n", i);
+ }
+ return len;
+ }
+ }
+
+ instr_out(data, hw_offset, 0, "3D UNKNOWN\n");
+ (*failures)++;
+ return 1;
+}
+
+/**
+ * Decodes an i830-i965 batch buffer, writing the disassembly to stderr.
+ *
+ * \param data batch buffer contents
+ * \param count number of DWORDs to decode in the batch buffer
+ * \param hw_offset hardware address for the buffer
+ * \param devid PCI device id, used to select the per-generation decoder
+ */
+int
+intel_decode(const uint32_t *data, int count, uint32_t hw_offset, uint32_t devid)
+{
+ int index = 0;
+ int failures = 0;
+
+ out = stderr;
+
+ while (index < count) {
+ switch ((data[index] & 0xe0000000) >> 29) {
+ case 0x0:
+ index += decode_mi(data + index, count - index,
+ hw_offset + index * 4, &failures);
+ break;
+ case 0x2:
+ index += decode_2d(data + index, count - index,
+ hw_offset + index * 4, &failures);
+ break;
+ case 0x3:
+ if (IS_965(devid)) {
+ index += decode_3d_965(data + index, count - index,
+ hw_offset + index * 4, &failures);
+ } else if (IS_9XX(devid)) {
+ index += decode_3d(data + index, count - index,
+ hw_offset + index * 4, &failures);
+ } else {
+ index += decode_3d_i830(data + index, count - index,
+ hw_offset + index * 4, &failures);
+ }
+ break;
+ default:
+ instr_out(data, hw_offset, index, "UNKNOWN\n");
+ failures++;
+ index++;
+ break;
+ }
+ fflush(out);
+ }
+
+ return failures;
+}
+
+void intel_decode_context_reset(void)
+{
+ saved_s2_set = 0;
+ saved_s4_set = 1;
+}
+
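A minimal, hypothetical caller of the decoder above might look like the sketch below; the two batch dwords (MI_NOOP followed by MI_BATCH_BUFFER_END) and the 965GM device id 0x2a02 are illustrative values, not taken from this driver.

/* Hypothetical usage sketch; the batch contents and device id are examples. */
#include <stdint.h>
#include <stdio.h>
#include "intel_decode.h"

int main(void)
{
   /* MI_NOOP followed by MI_BATCH_BUFFER_END (opcode 0x0a << 23). */
   static const uint32_t batch[2] = { 0x00000000, 0x05000000 };
   int failures;

   intel_decode_context_reset();
   failures = intel_decode(batch, 2, 0x0 /* hw_offset */, 0x2a02 /* 965GM */);
   if (failures)
      fprintf(stderr, "decode reported %d failure(s)\n", failures);
   return 0;
}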
diff --git a/src/gallium/drivers/i965/intel_decode.h b/src/gallium/drivers/i965/intel_decode.h
new file mode 100644
index 00000000000..7683097b869
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_decode.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Eric Anholt <eric@anholt.net>
+ *
+ */
+
+int intel_decode(const uint32_t *data, int count, uint32_t hw_offset, uint32_t devid);
+void intel_decode_context_reset(void);
diff --git a/src/gallium/drivers/i965/intel_structs.h b/src/gallium/drivers/i965/intel_structs.h
new file mode 100644
index 00000000000..522e3bd92c2
--- /dev/null
+++ b/src/gallium/drivers/i965/intel_structs.h
@@ -0,0 +1,132 @@
+#ifndef INTEL_STRUCTS_H
+#define INTEL_STRUCTS_H
+
+struct br0 {
+ GLuint length:8;
+ GLuint pad0:3;
+ GLuint dst_tiled:1;
+ GLuint pad1:8;
+ GLuint write_rgb:1;
+ GLuint write_alpha:1;
+ GLuint opcode:7;
+ GLuint client:3;
+};
+
+
+struct br13 {
+ GLint dest_pitch:16;
+ GLuint rop:8;
+ GLuint color_depth:2;
+ GLuint pad1:3;
+ GLuint mono_source_transparency:1;
+ GLuint clipping_enable:1;
+ GLuint pad0:1;
+};
+
+
+
+/* This is an attempt to move some of the 2D interaction in this
+ * driver to using structs for packets rather than a bunch of #defines
+ * and dwords.
+ */
+struct xy_color_blit {
+ struct br0 br0;
+ struct br13 br13;
+
+ struct {
+ GLuint dest_x1:16;
+ GLuint dest_y1:16;
+ } dw2;
+
+ struct {
+ GLuint dest_x2:16;
+ GLuint dest_y2:16;
+ } dw3;
+
+ GLuint dest_base_addr;
+ GLuint color;
+};
+
+struct xy_src_copy_blit {
+ struct br0 br0;
+ struct br13 br13;
+
+ struct {
+ GLuint dest_x1:16;
+ GLuint dest_y1:16;
+ } dw2;
+
+ struct {
+ GLuint dest_x2:16;
+ GLuint dest_y2:16;
+ } dw3;
+
+ GLuint dest_base_addr;
+
+ struct {
+ GLuint src_x1:16;
+ GLuint src_y1:16;
+ } dw5;
+
+ struct {
+ GLint src_pitch:16;
+ GLuint pad:16;
+ } dw6;
+
+ GLuint src_base_addr;
+};
+
+struct xy_setup_blit {
+ struct br0 br0;
+ struct br13 br13;
+
+ struct {
+ GLuint clip_x1:16;
+ GLuint clip_y1:16;
+ } dw2;
+
+ struct {
+ GLuint clip_x2:16;
+ GLuint clip_y2:16;
+ } dw3;
+
+ GLuint dest_base_addr;
+ GLuint background_color;
+ GLuint foreground_color;
+ GLuint pattern_base_addr;
+};
+
+
+struct xy_text_immediate_blit {
+ struct {
+ GLuint length:8;
+ GLuint pad2:3;
+ GLuint dst_tiled:1;
+ GLuint pad1:4;
+ GLuint byte_packed:1;
+ GLuint pad0:5;
+ GLuint opcode:7;
+ GLuint client:3;
+ } dw0;
+
+ struct {
+ GLuint dest_x1:16;
+ GLuint dest_y1:16;
+ } dw1;
+
+ struct {
+ GLuint dest_x2:16;
+ GLuint dest_y2:16;
+ } dw2;
+
+ /* Src bitmap data follows as inline dwords.
+ */
+};
+
+
+#define CLIENT_2D 0x2
+#define OPCODE_XY_SETUP_BLT 0x1
+#define OPCODE_XY_COLOR_BLT 0x50
+#define OPCODE_XY_TEXT_IMMEDIATE_BLT 0x31
+
+#endif
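To illustrate the packet-as-struct idea described in the comment above, a hypothetical XY_COLOR_BLT fill could be assembled as in the sketch below; the field values, the 32 bpp color_depth encoding and the way the six dwords reach the hardware are assumptions for the example, not something this header defines.

/* Hypothetical sketch of filling an XY_COLOR_BLT packet through the structs
 * above instead of raw dwords.  Values are illustrative only. */
#include <stdint.h>
#include <string.h>
#include "intel_structs.h"

static void example_color_blit(uint32_t *out_dwords)
{
   struct xy_color_blit blit;

   memset(&blit, 0, sizeof(blit));
   blit.br0.client = CLIENT_2D;
   blit.br0.opcode = OPCODE_XY_COLOR_BLT;
   blit.br0.write_rgb = 1;
   blit.br0.write_alpha = 1;
   blit.br0.length = sizeof(blit) / 4 - 2;   /* dword count minus two */
   blit.br13.dest_pitch = 4 * 64;            /* bytes per row */
   blit.br13.rop = 0xf0;                     /* PATCOPY */
   blit.br13.color_depth = 3;                /* assumed 32 bpp encoding */
   blit.dw2.dest_x1 = 0;
   blit.dw2.dest_y1 = 0;
   blit.dw3.dest_x2 = 64;
   blit.dw3.dest_y2 = 64;
   blit.dest_base_addr = 0;                  /* patched via relocation */
   blit.color = 0xffff0000;                  /* ARGB red */

   memcpy(out_dwords, &blit, sizeof(blit));  /* 6 dwords */
}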
diff --git a/src/gallium/drivers/identity/id_context.c b/src/gallium/drivers/identity/id_context.c
index 4e700089e33..f9063d90fb1 100644
--- a/src/gallium/drivers/identity/id_context.c
+++ b/src/gallium/drivers/identity/id_context.c
@@ -46,17 +46,6 @@ identity_destroy(struct pipe_context *_pipe)
}
static void
-identity_set_edgeflags(struct pipe_context *_pipe,
- const unsigned *bitfield)
-{
- struct identity_context *id_pipe = identity_context(_pipe);
- struct pipe_context *pipe = id_pipe->pipe;
-
- pipe->set_edgeflags(pipe,
- bitfield);
-}
-
-static boolean
identity_draw_arrays(struct pipe_context *_pipe,
unsigned prim,
unsigned start,
@@ -65,13 +54,13 @@ identity_draw_arrays(struct pipe_context *_pipe,
struct identity_context *id_pipe = identity_context(_pipe);
struct pipe_context *pipe = id_pipe->pipe;
- return pipe->draw_arrays(pipe,
- prim,
- start,
- count);
+ pipe->draw_arrays(pipe,
+ prim,
+ start,
+ count);
}
-static boolean
+static void
identity_draw_elements(struct pipe_context *_pipe,
struct pipe_buffer *_indexBuffer,
unsigned indexSize,
@@ -84,15 +73,15 @@ identity_draw_elements(struct pipe_context *_pipe,
struct pipe_context *pipe = id_pipe->pipe;
struct pipe_buffer *indexBuffer = id_buffer->buffer;
- return pipe->draw_elements(pipe,
- indexBuffer,
- indexSize,
- prim,
- start,
- count);
+ pipe->draw_elements(pipe,
+ indexBuffer,
+ indexSize,
+ prim,
+ start,
+ count);
}
-static boolean
+static void
identity_draw_range_elements(struct pipe_context *_pipe,
struct pipe_buffer *_indexBuffer,
unsigned indexSize,
@@ -107,14 +96,14 @@ identity_draw_range_elements(struct pipe_context *_pipe,
struct pipe_context *pipe = id_pipe->pipe;
struct pipe_buffer *indexBuffer = id_buffer->buffer;
- return pipe->draw_range_elements(pipe,
- indexBuffer,
- indexSize,
- minIndex,
- maxIndex,
- mode,
- start,
- count);
+ pipe->draw_range_elements(pipe,
+ indexBuffer,
+ indexSize,
+ minIndex,
+ maxIndex,
+ mode,
+ start,
+ count);
}
static struct pipe_query *
@@ -221,16 +210,29 @@ identity_create_sampler_state(struct pipe_context *_pipe,
}
static void
-identity_bind_sampler_states(struct pipe_context *_pipe,
- unsigned num,
- void **samplers)
+identity_bind_fragment_sampler_states(struct pipe_context *_pipe,
+ unsigned num_samplers,
+ void **samplers)
+{
+ struct identity_context *id_pipe = identity_context(_pipe);
+ struct pipe_context *pipe = id_pipe->pipe;
+
+ pipe->bind_fragment_sampler_states(pipe,
+ num_samplers,
+ samplers);
+}
+
+static void
+identity_bind_vertex_sampler_states(struct pipe_context *_pipe,
+ unsigned num_samplers,
+ void **samplers)
{
struct identity_context *id_pipe = identity_context(_pipe);
struct pipe_context *pipe = id_pipe->pipe;
- pipe->bind_sampler_states(pipe,
- num,
- samplers);
+ pipe->bind_vertex_sampler_states(pipe,
+ num_samplers,
+ samplers);
}
static void
@@ -402,17 +404,17 @@ static void
identity_set_constant_buffer(struct pipe_context *_pipe,
uint shader,
uint index,
- const struct pipe_constant_buffer *_buffer)
+ struct pipe_buffer *_buffer)
{
struct identity_context *id_pipe = identity_context(_pipe);
struct pipe_context *pipe = id_pipe->pipe;
- struct pipe_constant_buffer unwrapped_buffer;
- struct pipe_constant_buffer *buffer = NULL;
+ struct pipe_buffer *unwrapped_buffer;
+ struct pipe_buffer *buffer = NULL;
- /* unwrap the input state */
+ /* XXX hmm? unwrap the input state */
if (_buffer) {
- unwrapped_buffer.buffer = identity_buffer_unwrap(_buffer->buffer);
- buffer = &unwrapped_buffer;
+ unwrapped_buffer = identity_buffer_unwrap(_buffer);
+ buffer = unwrapped_buffer;
}
pipe->set_constant_buffer(pipe,
@@ -480,9 +482,9 @@ identity_set_viewport_state(struct pipe_context *_pipe,
}
static void
-identity_set_sampler_textures(struct pipe_context *_pipe,
- unsigned num_textures,
- struct pipe_texture **_textures)
+identity_set_fragment_sampler_textures(struct pipe_context *_pipe,
+ unsigned num_textures,
+ struct pipe_texture **_textures)
{
struct identity_context *id_pipe = identity_context(_pipe);
struct pipe_context *pipe = id_pipe->pipe;
@@ -499,9 +501,34 @@ identity_set_sampler_textures(struct pipe_context *_pipe,
textures = unwrapped_textures;
}
- pipe->set_sampler_textures(pipe,
- num_textures,
- textures);
+ pipe->set_fragment_sampler_textures(pipe,
+ num_textures,
+ textures);
+}
+
+static void
+identity_set_vertex_sampler_textures(struct pipe_context *_pipe,
+ unsigned num_textures,
+ struct pipe_texture **_textures)
+{
+ struct identity_context *id_pipe = identity_context(_pipe);
+ struct pipe_context *pipe = id_pipe->pipe;
+ struct pipe_texture *unwrapped_textures[PIPE_MAX_VERTEX_SAMPLERS];
+ struct pipe_texture **textures = NULL;
+ unsigned i;
+
+ if (_textures) {
+ for (i = 0; i < num_textures; i++)
+ unwrapped_textures[i] = identity_texture_unwrap(_textures[i]);
+ for (; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
+ unwrapped_textures[i] = NULL;
+
+ textures = unwrapped_textures;
+ }
+
+ pipe->set_vertex_sampler_textures(pipe,
+ num_textures,
+ textures);
}
static void
@@ -669,7 +696,6 @@ identity_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
id_pipe->base.draw = NULL;
id_pipe->base.destroy = identity_destroy;
- id_pipe->base.set_edgeflags = identity_set_edgeflags;
id_pipe->base.draw_arrays = identity_draw_arrays;
id_pipe->base.draw_elements = identity_draw_elements;
id_pipe->base.draw_range_elements = identity_draw_range_elements;
@@ -682,7 +708,8 @@ identity_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
id_pipe->base.bind_blend_state = identity_bind_blend_state;
id_pipe->base.delete_blend_state = identity_delete_blend_state;
id_pipe->base.create_sampler_state = identity_create_sampler_state;
- id_pipe->base.bind_sampler_states = identity_bind_sampler_states;
+ id_pipe->base.bind_fragment_sampler_states = identity_bind_fragment_sampler_states;
+ id_pipe->base.bind_vertex_sampler_states = identity_bind_vertex_sampler_states;
id_pipe->base.delete_sampler_state = identity_delete_sampler_state;
id_pipe->base.create_rasterizer_state = identity_create_rasterizer_state;
id_pipe->base.bind_rasterizer_state = identity_bind_rasterizer_state;
@@ -703,7 +730,8 @@ identity_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
id_pipe->base.set_polygon_stipple = identity_set_polygon_stipple;
id_pipe->base.set_scissor_state = identity_set_scissor_state;
id_pipe->base.set_viewport_state = identity_set_viewport_state;
- id_pipe->base.set_sampler_textures = identity_set_sampler_textures;
+ id_pipe->base.set_fragment_sampler_textures = identity_set_fragment_sampler_textures;
+ id_pipe->base.set_vertex_sampler_textures = identity_set_vertex_sampler_textures;
id_pipe->base.set_vertex_buffers = identity_set_vertex_buffers;
id_pipe->base.set_vertex_elements = identity_set_vertex_elements;
id_pipe->base.surface_copy = identity_surface_copy;
diff --git a/src/gallium/drivers/identity/id_objects.c b/src/gallium/drivers/identity/id_objects.c
index e893e599408..bc9bc7121d5 100644
--- a/src/gallium/drivers/identity/id_objects.c
+++ b/src/gallium/drivers/identity/id_objects.c
@@ -180,3 +180,42 @@ identity_transfer_destroy(struct identity_transfer *id_transfer)
screen->tex_transfer_destroy(id_transfer->transfer);
FREE(id_transfer);
}
+
+struct pipe_video_surface *
+identity_video_surface_create(struct identity_screen *id_screen,
+ struct pipe_video_surface *video_surface)
+{
+ struct identity_video_surface *id_video_surface;
+
+ if (!video_surface) {
+ goto error;
+ }
+
+ assert(video_surface->screen == id_screen->screen);
+
+ id_video_surface = CALLOC_STRUCT(identity_video_surface);
+ if (!id_video_surface) {
+ goto error;
+ }
+
+ memcpy(&id_video_surface->base,
+ video_surface,
+ sizeof(struct pipe_video_surface));
+
+ pipe_reference_init(&id_video_surface->base.reference, 1);
+ id_video_surface->base.screen = &id_screen->base;
+ id_video_surface->video_surface = video_surface;
+
+ return &id_video_surface->base;
+
+error:
+ pipe_video_surface_reference(&video_surface, NULL);
+ return NULL;
+}
+
+void
+identity_video_surface_destroy(struct identity_video_surface *id_video_surface)
+{
+ pipe_video_surface_reference(&id_video_surface->video_surface, NULL);
+ FREE(id_video_surface);
+}
diff --git a/src/gallium/drivers/identity/id_objects.h b/src/gallium/drivers/identity/id_objects.h
index ce58faa3c7c..77cc7190798 100644
--- a/src/gallium/drivers/identity/id_objects.h
+++ b/src/gallium/drivers/identity/id_objects.h
@@ -31,6 +31,7 @@
#include "pipe/p_compiler.h"
#include "pipe/p_state.h"
+#include "pipe/p_video_state.h"
#include "id_screen.h"
@@ -67,6 +68,14 @@ struct identity_transfer
};
+struct identity_video_surface
+{
+ struct pipe_video_surface base;
+
+ struct pipe_video_surface *video_surface;
+};
+
+
static INLINE struct identity_buffer *
identity_buffer(struct pipe_buffer *_buffer)
{
@@ -103,6 +112,15 @@ identity_transfer(struct pipe_transfer *_transfer)
return (struct identity_transfer *)_transfer;
}
+static INLINE struct identity_video_surface *
+identity_video_surface(struct pipe_video_surface *_video_surface)
+{
+ if (!_video_surface) {
+ return NULL;
+ }
+ (void)identity_screen(_video_surface->screen);
+ return (struct identity_video_surface *)_video_surface;
+}
static INLINE struct pipe_buffer *
identity_buffer_unwrap(struct pipe_buffer *_buffer)
@@ -165,5 +183,12 @@ identity_transfer_create(struct identity_texture *id_texture,
void
identity_transfer_destroy(struct identity_transfer *id_transfer);
+struct pipe_video_surface *
+identity_video_surface_create(struct identity_screen *id_screen,
+ struct pipe_video_surface *video_surface);
+
+void
+identity_video_surface_destroy(struct identity_video_surface *id_video_surface);
+
#endif /* ID_OBJECTS_H */
diff --git a/src/gallium/drivers/identity/id_public.h b/src/gallium/drivers/identity/id_public.h
index cac14cfd604..3d2862eaa01 100644
--- a/src/gallium/drivers/identity/id_public.h
+++ b/src/gallium/drivers/identity/id_public.h
@@ -37,4 +37,4 @@ identity_screen_create(struct pipe_screen *screen);
struct pipe_context *
identity_context_create(struct pipe_screen *screen, struct pipe_context *pipe);
-#endif /* PT_PUBLIC_H */
+#endif /* ID_PUBLIC_H */
diff --git a/src/gallium/drivers/identity/id_screen.c b/src/gallium/drivers/identity/id_screen.c
index 26439637d08..53eae3ef544 100644
--- a/src/gallium/drivers/identity/id_screen.c
+++ b/src/gallium/drivers/identity/id_screen.c
@@ -379,6 +379,33 @@ identity_screen_buffer_destroy(struct pipe_buffer *_buffer)
identity_buffer_destroy(identity_buffer(_buffer));
}
+static struct pipe_video_surface *
+identity_screen_video_surface_create(struct pipe_screen *_screen,
+ enum pipe_video_chroma_format chroma_format,
+ unsigned width,
+ unsigned height)
+{
+ struct identity_screen *id_screen = identity_screen(_screen);
+ struct pipe_screen *screen = id_screen->screen;
+ struct pipe_video_surface *result;
+
+ result = screen->video_surface_create(screen,
+ chroma_format,
+ width,
+ height);
+
+ if (result) {
+ return identity_video_surface_create(id_screen, result);
+ }
+ return NULL;
+}
+
+static void
+identity_screen_video_surface_destroy(struct pipe_video_surface *_vsfc)
+{
+ identity_video_surface_destroy(identity_video_surface(_vsfc));
+}
+
static void
identity_screen_flush_frontbuffer(struct pipe_screen *_screen,
struct pipe_surface *_surface,
@@ -472,6 +499,12 @@ identity_screen_create(struct pipe_screen *screen)
if (screen->buffer_unmap)
id_screen->base.buffer_unmap = identity_screen_buffer_unmap;
id_screen->base.buffer_destroy = identity_screen_buffer_destroy;
+ if (screen->video_surface_create) {
+ id_screen->base.video_surface_create = identity_screen_video_surface_create;
+ }
+ if (screen->video_surface_destroy) {
+ id_screen->base.video_surface_destroy = identity_screen_video_surface_destroy;
+ }
id_screen->base.flush_frontbuffer = identity_screen_flush_frontbuffer;
id_screen->base.fence_reference = identity_screen_fence_reference;
id_screen->base.fence_signalled = identity_screen_fence_signalled;
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index e038a5229e5..7c6e46006b9 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -50,7 +50,6 @@ C_SOURCES = \
lp_state_vs.c \
lp_surface.c \
lp_tex_cache.c \
- lp_tex_sample_c.c \
lp_tex_sample_llvm.c \
lp_texture.c \
lp_tile_cache.c \
diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 89d08834a3c..72d9f39658f 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -51,47 +51,42 @@ Requirements
- Linux
- - udis86, http://udis86.sourceforge.net/ . Use my repository, which decodes
- opcodes not yet supported by upstream.
+ - An x86 or amd64 processor. 64-bit mode is preferred.
- git clone git://people.freedesktop.org/~jrfonseca/udis86
- cd udis86
- ./configure --with-pic
- make
- sudo make install
+ Support for sse2 is strongly encouraged. Support for ssse3 and sse4.1 will
+ yield the most efficient code. The fewer features the CPU has, the more
+ likely it is that you will run into underperforming, buggy, or incomplete code.
+
+ See /proc/cpuinfo to know what your CPU supports.
+
+ - LLVM 2.6.
- - LLVM 2.5. On Debian based distributions do:
+ For Linux, on a recent Debian based distribution do:
aptitude install llvm-dev
- There is a typo in one of the llvm-dev 2.5 headers, that causes compilation
- errors in the debug build:
-
- --- /usr/include/llvm-c/Core.h.orig 2009-08-10 15:38:54.000000000 +0100
- +++ /usr/include/llvm-c/Core.h 2009-08-10 15:38:25.000000000 +0100
- @@ -831,7 +831,7 @@
- template<typename T>
- inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
- #if DEBUG
- - for (LLVMValueRef *I = Vals, E = Vals + Length; I != E; ++I)
- + for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
- cast<T>(*I);
- #endif
- return reinterpret_cast<T**>(Vals);
+ For Windows, download pre-built MSVC 9.0 or MinGW binaries from
+ http://people.freedesktop.org/~jrfonseca/llvm/ and set the LLVM environment
+ variable to the extracted path.
+
+ - scons (optional)
+
+ - udis86, http://udis86.sourceforge.net/ (optional):
- - A x86 or amd64 processor with support for sse2, sse3, and sse4.1 SIMD
- instructions. This is necessary because we emit several SSE intrinsics for
- convenience. See /proc/cpuinfo to know what your CPU supports.
+ git clone git://udis86.git.sourceforge.net/gitroot/udis86/udis86
+ cd udis86
+ ./autogen.sh
+ ./configure --with-pic
+ make
+ sudo make install
- - scons
-
Building
========
-To build everything invoke scons as:
+To build everything on Linux invoke scons as:
- scons debug=yes statetrackers=mesa drivers=llvmpipe winsys=xlib dri=false -k
+ scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=xlib dri=false
Alternatively, you can build it with GNU make, if you prefer, by invoking it as
@@ -99,12 +94,15 @@ Alternatively, you can build it with GNU make, if you prefer, by invoking it as
but the rest of these instructions assume that scons is used.
+For Windows everything is the same except the winsys:
+
+ scons debug=yes statetrackers=mesa drivers=trace,llvmpipe winsys=gdi dri=false
Using
=====
-Building will create a drop-in alternative for libGL.so. To use it set the
-environment variables:
+On Linux, building will create a drop-in alternative for libGL.so. To use it
+set the environment variables:
export LD_LIBRARY_PATH=$PWD/build/linux-x86_64-debug/lib:$LD_LIBRARY_PATH
@@ -115,6 +113,11 @@ or
For performance evaluation pass debug=no to scons, and use the corresponding
lib directory without the "-debug" suffix.
+On Windows, building will create a drop-in alternative for opengl32.dll. To use
+it, put it in the same directory as the application. It can also be used by
+replacing the native ICD driver, but that is quite advanced usage, so if you
+need to ask, don't even try it.
+
Unit testing
============
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 3bd2e700138..6bb545a501f 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -9,6 +9,8 @@ if not env.has_key('LLVM_VERSION'):
env.Tool('udis86')
+env.Append(CPPPATH = ['.'])
+
env.CodeGenerate(
target = 'lp_tile_soa.c',
script = 'lp_tile_soa.py',
@@ -64,7 +66,6 @@ llvmpipe = env.ConvenienceLibrary(
'lp_state_vs.c',
'lp_surface.c',
'lp_tex_cache.c',
- 'lp_tex_sample_c.c',
'lp_tex_sample_llvm.c',
'lp_texture.c',
'lp_tile_cache.c',
@@ -74,21 +75,19 @@ llvmpipe = env.ConvenienceLibrary(
env = env.Clone()
-env.Prepend(LIBS = [llvmpipe] + auxiliaries)
+env.Prepend(LIBS = [llvmpipe] + gallium)
-env.Program(
- target = 'lp_test_format',
- source = ['lp_test_format.c', 'lp_test_main.c'],
-)
+tests = [
+ 'format',
+ 'blend',
+ 'conv',
+]
-env.Program(
- target = 'lp_test_blend',
- source = ['lp_test_blend.c', 'lp_test_main.c'],
-)
-
-env.Program(
- target = 'lp_test_conv',
- source = ['lp_test_conv.c', 'lp_test_main.c'],
-)
+for test in tests:
+ target = env.Program(
+ target = 'lp_test_' + test,
+ source = ['lp_test_' + test + '.c', 'lp_test_main.c'],
+ )
+ env.InstallProgram(target)
Export('llvmpipe')
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index f5d9db70fb5..1aee9b35f31 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -628,7 +628,8 @@ lp_build_abs(struct lp_build_context *bld,
if(type.floating) {
/* Mask out the sign bit */
LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
- LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
+ unsigned long absMask = ~(1 << (type.width - 1));
+ LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long) absMask));
a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
a = LLVMBuildAnd(bld->builder, a, mask, "");
a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
@@ -1082,7 +1083,7 @@ lp_build_log(struct lp_build_context *bld,
LLVMValueRef x)
{
/* log(2) */
- LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
+ LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
}
@@ -1094,7 +1095,7 @@ lp_build_log(struct lp_build_context *bld,
/**
* Generate polynomial.
- * Ex: x^2 * coeffs[0] + x * coeffs[1] + coeffs[2].
+ * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
*/
static LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
@@ -1284,13 +1285,13 @@ lp_build_log2_approx(struct lp_build_context *bld,
/* mant = (float) mantissa(x) */
mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
mant = LLVMBuildOr(bld->builder, mant, one, "");
- mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
+ mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
Elements(lp_build_log2_polynomial));
/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
- logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
+ logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
}
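The two arithmetic fixes above are easier to see in scalar form: the new abs mask clears only the IEEE sign bit (the old mask, (1 << width) - 1, kept every bit and made the AND a no-op), and 0.69314718055994529 is ln(2), the factor needed to turn a base-2 logarithm into a natural one, whereas the old 1.4426950408889634 is log2(e). A small stand-alone check, assuming 32-bit floats and written outside the driver:

/* Stand-alone scalar check of the abs mask and the ln(2) constant above. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float abs_via_sign_mask(float x)
{
   uint32_t bits;
   memcpy(&bits, &x, sizeof bits);
   bits &= ~(1u << 31);              /* clear only the sign bit */
   memcpy(&x, &bits, sizeof x);
   return x;
}

int main(void)
{
   double x = 10.0;
   printf("ln(2)*log2(x) = %f, log(x) = %f\n",
          0.69314718055994529 * log2(x), log(x));
   printf("abs(-3.5) = %f\n", abs_via_sign_mask(-3.5f));
   return 0;
}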
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_debug.c b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
index 59d8f492e60..39dfc51e503 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_debug.c
@@ -77,10 +77,10 @@ lp_disassemble(const void* func)
while (ud_disassemble(&ud_obj)) {
#ifdef PIPE_ARCH_X86
- debug_printf("%08lx: ", (unsigned long)ud_insn_off(&ud_obj));
+ debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
#endif
#ifdef PIPE_ARCH_X86_64
- debug_printf("%016llx: ", (unsigned long long)ud_insn_off(&ud_obj));
+ debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
#endif
#if 0
@@ -115,9 +115,16 @@ lp_disassemble(const void* func)
}
}
- if (ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret)
+ if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
+ ud_obj.mnemonic == UD_Iinvalid)
break;
}
+
+#if 0
+ /* Print GDB command, useful to verify udis86 output */
+ debug_printf("disassemble %p %p\n", func, (void*)(uintptr_t)ud_obj.pc);
+#endif
+
debug_printf("\n");
#else
(void)func;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 818c0e943e3..49dab8ab61e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -303,8 +303,8 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
unsigned first, last, mask;
unsigned attrib;
- first = decl->DeclarationRange.First;
- last = decl->DeclarationRange.Last;
+ first = decl->Range.First;
+ last = decl->Range.Last;
mask = decl->Declaration.UsageMask;
for( attrib = first; attrib <= last; ++attrib ) {
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
index d3f78c06d92..6e79438ead0 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
+++ b/src/gallium/drivers/llvmpipe/lp_bld_misc.cpp
@@ -59,3 +59,17 @@ LLVMInitializeNativeTarget(void)
#endif
+
+
+/*
+ * Hack to allow the linking of release LLVM static libraries on a debug build.
+ *
+ * See also:
+ * - http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/7234ea2b-0042-42ed-b4e2-5d8644dfb57d
+ */
+#if defined(_MSC_VER) && defined(_DEBUG)
+#include <crtdefs.h>
+extern "C" {
+ _CRTIMP void __cdecl _invalid_parameter_noinfo(void) {}
+}
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample.c b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
index 4d272bea87e..9003e108c1c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample.c
@@ -59,9 +59,9 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
state->format = texture->format;
state->target = texture->target;
- state->pot_width = util_is_pot(texture->width[0]);
- state->pot_height = util_is_pot(texture->height[0]);
- state->pot_depth = util_is_pot(texture->depth[0]);
+ state->pot_width = util_is_pot(texture->width0);
+ state->pot_height = util_is_pot(texture->height0);
+ state->pot_depth = util_is_pot(texture->depth0);
state->wrap_s = sampler->wrap_s;
state->wrap_t = sampler->wrap_t;
@@ -69,8 +69,8 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
state->min_img_filter = sampler->min_img_filter;
state->min_mip_filter = sampler->min_mip_filter;
state->mag_img_filter = sampler->mag_img_filter;
- if(sampler->compare_mode) {
- state->compare_mode = sampler->compare_mode;
+ state->compare_mode = sampler->compare_mode;
+ if(sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
state->compare_func = sampler->compare_func;
}
state->normalized_coords = sampler->normalized_coords;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
index 47b68b71e25..5ee8d556a68 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -488,7 +488,7 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
LLVMValueRef res;
unsigned chan;
- if(!bld->static_state->compare_mode)
+ if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
return;
/* TODO: Compare before swizzling, to avoid redundant computations */
@@ -577,7 +577,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
lp_build_sample_2d_nearest_soa(&bld, s, t, width, height, stride, data_ptr, texel);
break;
case PIPE_TEX_FILTER_LINEAR:
- case PIPE_TEX_FILTER_ANISO:
if(lp_format_is_rgba8(bld.format_desc))
lp_build_sample_2d_linear_aos(&bld, s, t, width, height, stride, data_ptr, texel);
else
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index e722bca1a6c..85e3b1bdd42 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -62,7 +62,7 @@
for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
- ((INST)->FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+ ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
@@ -155,7 +155,7 @@ emit_fetch(
unsigned index,
const unsigned chan_index )
{
- const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[index];
+ const struct tgsi_full_src_register *reg = &inst->Src[index];
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
LLVMValueRef res;
@@ -165,9 +165,9 @@ emit_fetch(
case TGSI_SWIZZLE_Z:
case TGSI_SWIZZLE_W:
- switch (reg->SrcRegister.File) {
+ switch (reg->Register.File) {
case TGSI_FILE_CONSTANT: {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->SrcRegister.Index*4 + swizzle, 0);
+ LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
res = lp_build_broadcast_scalar(&bld->base, scalar);
@@ -175,17 +175,17 @@ emit_fetch(
}
case TGSI_FILE_IMMEDIATE:
- res = bld->immediates[reg->SrcRegister.Index][swizzle];
+ res = bld->immediates[reg->Register.Index][swizzle];
assert(res);
break;
case TGSI_FILE_INPUT:
- res = bld->inputs[reg->SrcRegister.Index][swizzle];
+ res = bld->inputs[reg->Register.Index][swizzle];
assert(res);
break;
case TGSI_FILE_TEMPORARY:
- res = bld->temps[reg->SrcRegister.Index][swizzle];
+ res = bld->temps[reg->Register.Index][swizzle];
if(!res)
return bld->base.undef;
break;
@@ -265,7 +265,7 @@ emit_store(
unsigned chan_index,
LLVMValueRef value)
{
- const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[index];
+ const struct tgsi_full_dst_register *reg = &inst->Dst[index];
switch( inst->Instruction.Saturate ) {
case TGSI_SAT_NONE:
@@ -285,13 +285,13 @@ emit_store(
assert(0);
}
- switch( reg->DstRegister.File ) {
+ switch( reg->Register.File ) {
case TGSI_FILE_OUTPUT:
- bld->outputs[reg->DstRegister.Index][chan_index] = value;
+ bld->outputs[reg->Register.Index][chan_index] = value;
break;
case TGSI_FILE_TEMPORARY:
- bld->temps[reg->DstRegister.Index][chan_index] = value;
+ bld->temps[reg->Register.Index][chan_index] = value;
break;
case TGSI_FILE_ADDRESS:
@@ -317,14 +317,14 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
boolean projected,
LLVMValueRef *texel)
{
- const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+ const uint unit = inst->Src[1].Register.Index;
LLVMValueRef lodbias;
LLVMValueRef oow = NULL;
LLVMValueRef coords[3];
unsigned num_coords;
unsigned i;
- switch (inst->InstructionExtTexture.Texture) {
+ switch (inst->Texture.Texture) {
case TGSI_TEXTURE_1D:
num_coords = 1;
break;
@@ -359,6 +359,9 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
if (projected)
coords[i] = lp_build_mul(&bld->base, coords[i], oow);
}
+ for (i = num_coords; i < 3; i++) {
+ coords[i] = bld->base.undef;
+ }
bld->sampler->emit_fetch_texel(bld->sampler,
bld->base.builder,
@@ -373,7 +376,7 @@ emit_kil(
struct lp_build_tgsi_soa_context *bld,
const struct tgsi_full_instruction *inst )
{
- const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[0];
+ const struct tgsi_full_src_register *reg = &inst->Src[0];
LLVMValueRef terms[NUM_CHANNELS];
LLVMValueRef mask;
unsigned chan_index;
@@ -421,15 +424,15 @@ indirect_temp_reference(const struct tgsi_full_instruction *inst)
{
uint i;
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
- const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
- if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
- reg->SrcRegister.Indirect)
+ const struct tgsi_full_src_register *reg = &inst->Src[i];
+ if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+ reg->Register.Indirect)
return TRUE;
}
for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
- const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
- if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
- reg->DstRegister.Indirect)
+ const struct tgsi_full_dst_register *reg = &inst->Dst[i];
+ if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+ reg->Register.Indirect)
return TRUE;
}
return FALSE;
@@ -766,7 +769,7 @@ emit_instruction(
FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
src0 = emit_fetch( bld, inst, 0, chan_index );
tmp0 = lp_build_floor(&bld->base, src0);
- tmp0 = lp_build_sub(&bld->base, tmp0, src0);
+ tmp0 = lp_build_sub(&bld->base, src0, tmp0);
dst0[chan_index] = tmp0;
}
break;
@@ -1313,7 +1316,7 @@ emit_instruction(
return 0;
break;
- case TGSI_OPCODE_SHR:
+ case TGSI_OPCODE_ISHR:
/* deprecated? */
assert(0);
return 0;
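The FRC operand swap above follows from the definition frac(x) = x - floor(x); with the operands the other way around the result is negated. A scalar sanity check, not part of the driver:

/* Scalar check of the FRC operand order: frac(2.75) should be 0.75. */
#include <math.h>
#include <stdio.h>

int main(void)
{
   double x = 2.75;
   printf("x - floor(x) = %f, floor(x) - x = %f\n",
          x - floor(x), floor(x) - x);    /* 0.75 vs -0.75 */
   return 0;
}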
diff --git a/src/gallium/drivers/llvmpipe/lp_clear.c b/src/gallium/drivers/llvmpipe/lp_clear.c
index bdcff94b9bf..08d9f2e2735 100644
--- a/src/gallium/drivers/llvmpipe/lp_clear.c
+++ b/src/gallium/drivers/llvmpipe/lp_clear.c
@@ -50,6 +50,7 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
double depth, unsigned stencil)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ union util_color uc;
unsigned cv;
uint i;
@@ -64,8 +65,8 @@ llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
struct pipe_surface *ps = llvmpipe->framebuffer.cbufs[i];
- util_pack_color(rgba, ps->format, &cv);
- lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, cv);
+ util_pack_color(rgba, ps->format, &uc);
+ lp_tile_cache_clear(llvmpipe->cbuf_cache[i], rgba, uc.ui);
}
llvmpipe->dirty_render_cache = TRUE;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 57e71f3e986..aaa675aec77 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -118,9 +118,14 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
pipe_texture_reference(&llvmpipe->texture[i], NULL);
}
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ lp_destroy_tex_tile_cache(llvmpipe->vertex_tex_cache[i]);
+ pipe_texture_reference(&llvmpipe->vertex_textures[i], NULL);
+ }
+
for (i = 0; i < Elements(llvmpipe->constants); i++) {
- if (llvmpipe->constants[i].buffer) {
- pipe_buffer_reference(&llvmpipe->constants[i].buffer, NULL);
+ if (llvmpipe->constants[i]) {
+ pipe_buffer_reference(&llvmpipe->constants[i], NULL);
}
}
@@ -135,6 +140,7 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
unsigned i;
+ /* check if any of the bound drawing surfaces are this texture */
if(llvmpipe->dirty_render_cache) {
for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
if(llvmpipe->framebuffer.cbufs[i] &&
@@ -145,6 +151,18 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
llvmpipe->framebuffer.zsbuf->texture == texture)
return PIPE_REFERENCED_FOR_WRITE;
}
+
+ /* check if any of the tex_cache textures are this texture */
+ for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+ if (llvmpipe->tex_cache[i] &&
+ llvmpipe->tex_cache[i]->texture == texture)
+ return PIPE_REFERENCED_FOR_READ;
+ }
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ if (llvmpipe->vertex_tex_cache[i] &&
+ llvmpipe->vertex_tex_cache[i]->texture == texture)
+ return PIPE_REFERENCED_FOR_READ;
+ }
return PIPE_UNREFERENCED;
}
@@ -180,7 +198,8 @@ llvmpipe_create( struct pipe_screen *screen )
llvmpipe->pipe.delete_blend_state = llvmpipe_delete_blend_state;
llvmpipe->pipe.create_sampler_state = llvmpipe_create_sampler_state;
- llvmpipe->pipe.bind_sampler_states = llvmpipe_bind_sampler_states;
+ llvmpipe->pipe.bind_fragment_sampler_states = llvmpipe_bind_sampler_states;
+ llvmpipe->pipe.bind_vertex_sampler_states = llvmpipe_bind_vertex_sampler_states;
llvmpipe->pipe.delete_sampler_state = llvmpipe_delete_sampler_state;
llvmpipe->pipe.create_depth_stencil_alpha_state = llvmpipe_create_depth_stencil_state;
@@ -205,7 +224,8 @@ llvmpipe_create( struct pipe_screen *screen )
llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
llvmpipe->pipe.set_polygon_stipple = llvmpipe_set_polygon_stipple;
llvmpipe->pipe.set_scissor_state = llvmpipe_set_scissor_state;
- llvmpipe->pipe.set_sampler_textures = llvmpipe_set_sampler_textures;
+ llvmpipe->pipe.set_fragment_sampler_textures = llvmpipe_set_sampler_textures;
+ llvmpipe->pipe.set_vertex_sampler_textures = llvmpipe_set_vertex_sampler_textures;
llvmpipe->pipe.set_viewport_state = llvmpipe_set_viewport_state;
llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers;
@@ -214,8 +234,6 @@ llvmpipe_create( struct pipe_screen *screen )
llvmpipe->pipe.draw_arrays = llvmpipe_draw_arrays;
llvmpipe->pipe.draw_elements = llvmpipe_draw_elements;
llvmpipe->pipe.draw_range_elements = llvmpipe_draw_range_elements;
- llvmpipe->pipe.set_edgeflags = llvmpipe_set_edgeflags;
-
llvmpipe->pipe.clear = llvmpipe_clear;
llvmpipe->pipe.flush = llvmpipe_flush;
@@ -234,24 +252,10 @@ llvmpipe_create( struct pipe_screen *screen )
for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
llvmpipe->tex_cache[i] = lp_create_tex_tile_cache( screen );
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++)
+ llvmpipe->vertex_tex_cache[i] = lp_create_tex_tile_cache(screen);
- /* vertex shader samplers */
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- llvmpipe->tgsi.vert_samplers[i].base.get_samples = lp_get_samples;
- llvmpipe->tgsi.vert_samplers[i].processor = TGSI_PROCESSOR_VERTEX;
- llvmpipe->tgsi.vert_samplers[i].cache = llvmpipe->tex_cache[i];
- llvmpipe->tgsi.vert_samplers_list[i] = &llvmpipe->tgsi.vert_samplers[i];
- }
-
- /* fragment shader samplers */
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
- llvmpipe->tgsi.frag_samplers[i].processor = TGSI_PROCESSOR_FRAGMENT;
- llvmpipe->tgsi.frag_samplers[i].cache = llvmpipe->tex_cache[i];
- llvmpipe->tgsi.frag_samplers_list[i] = &llvmpipe->tgsi.frag_samplers[i];
- }
-
/*
* Create drawing context and plug our rendering stage into it.
*/
@@ -259,10 +263,7 @@ llvmpipe_create( struct pipe_screen *screen )
if (!llvmpipe->draw)
goto fail;
- draw_texture_samplers(llvmpipe->draw,
- PIPE_MAX_SAMPLERS,
- (struct tgsi_sampler **)
- llvmpipe->tgsi.vert_samplers_list);
+ /* FIXME: devise alternative to draw_texture_samplers */
if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
llvmpipe->no_rast = TRUE;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 3ad95d0bfc2..426d6eb4a12 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -55,6 +55,7 @@ struct llvmpipe_context {
/** Constant state objects */
const struct pipe_blend_state *blend;
const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+ struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
const struct pipe_depth_stencil_alpha_state *depth_stencil;
const struct pipe_rasterizer_state *rasterizer;
struct lp_fragment_shader *fs;
@@ -63,17 +64,20 @@ struct llvmpipe_context {
/** Other rendering state */
struct pipe_blend_color blend_color[4][16];
struct pipe_clip_state clip;
- struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+ struct pipe_buffer *constants[PIPE_SHADER_TYPES];
struct pipe_framebuffer_state framebuffer;
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+ struct pipe_texture *vertex_textures[PIPE_MAX_VERTEX_SAMPLERS];
struct pipe_viewport_state viewport;
struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
unsigned num_samplers;
unsigned num_textures;
+ unsigned num_vertex_samplers;
+ unsigned num_vertex_textures;
unsigned num_vertex_elements;
unsigned num_vertex_buffers;
@@ -111,14 +115,6 @@ struct llvmpipe_context {
unsigned line_stipple_counter;
- /** TGSI exec things */
- struct {
- struct lp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
- struct lp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
- struct lp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
- struct lp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
- } tgsi;
-
/** The primitive drawing context */
struct draw_context *draw;
@@ -136,6 +132,7 @@ struct llvmpipe_context {
unsigned tex_timestamp;
struct llvmpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+ struct llvmpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
unsigned no_rast : 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
new file mode 100644
index 00000000000..74b27574942
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -0,0 +1,71 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_DEBUG_H
+#define LP_DEBUG_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+extern void
+st_print_current(void);
+
+
+#define DEBUG_PIPE 0x1
+#define DEBUG_TGSI 0x2
+#define DEBUG_TEX 0x4
+#define DEBUG_ASM 0x8
+#define DEBUG_SETUP 0x10
+#define DEBUG_RAST 0x20
+#define DEBUG_QUERY 0x40
+#define DEBUG_SCREEN 0x80
+#define DEBUG_JIT 0x100
+
+#ifdef DEBUG
+extern int LP_DEBUG;
+#else
+#define LP_DEBUG 0
+#endif
+
+void st_debug_init( void );
+
+static INLINE void
+LP_DBG( unsigned flag, const char *fmt, ... )
+{
+ if (LP_DEBUG & flag)
+ {
+ va_list args;
+
+ va_start( args, fmt );
+ debug_vprintf( fmt, args );
+ va_end( args );
+ }
+}
+
+
+#endif /* LP_DEBUG_H */
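
Note: a usage sketch for the new LP_DBG() helper; the flag names come from this header, and the LP_DEBUG environment variable is parsed in the lp_screen.c hunk below via debug_get_flags_option(). The function below is illustrative only.

   #include "lp_debug.h"

   /* Emits output only when LP_DEBUG contains "setup", e.g. LP_DEBUG=setup,jit. */
   static void
   example_report(unsigned nr_tris)
   {
      LP_DBG(DEBUG_SETUP, "%s: %u triangles\n", __FUNCTION__, nr_tris);
   }

In non-DEBUG builds LP_DEBUG is the constant 0, so the conditional body compiles away.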
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index a43e438064c..a0316194c6a 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -43,11 +43,11 @@
-boolean
+void
llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
unsigned start, unsigned count)
{
- return llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+ llvmpipe_draw_elements(pipe, NULL, 0, mode, start, count);
}
@@ -56,7 +56,7 @@ llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
* Basically, map the vertex buffers (and drawing surfaces), then hand off
* the drawing to the 'draw' module.
*/
-boolean
+void
llvmpipe_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -101,7 +101,7 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
draw_arrays(draw, mode, start, count);
/*
- * unmap vertex/index buffers - will cause draw module to flush
+ * unmap vertex/index buffers
*/
for (i = 0; i < lp->num_vertex_buffers; i++) {
draw_set_mapped_vertex_buffer(draw, i, NULL);
@@ -110,31 +110,28 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
draw_set_mapped_element_buffer(draw, 0, NULL);
}
+ /*
+ * TODO: Flush only when a user vertex/index buffer is present
+ * (or even better, modify draw module to do this
+ * internally when this condition is seen?)
+ */
+ draw_flush(draw);
/* Note: leave drawing surfaces mapped */
lp->dirty_render_cache = TRUE;
-
- return TRUE;
}
-boolean
+void
llvmpipe_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start, unsigned count)
{
- return llvmpipe_draw_range_elements( pipe, indexBuffer,
- indexSize,
- 0, 0xffffffff,
- mode, start, count );
+ llvmpipe_draw_range_elements( pipe, indexBuffer,
+ indexSize,
+ 0, 0xffffffff,
+ mode, start, count );
}
-
-void
-llvmpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
-{
- struct llvmpipe_context *lp = llvmpipe_context(pipe);
- draw_set_edgeflags(lp->draw, edgeflags);
-}
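
Note: with the boolean return gone, the resulting draw path is (names taken from the hunk above):

   /* Sketch: the explicit flush ensures the draw module no longer references
    * vertex/index data after the buffers are unmapped. */
   draw_arrays(draw, mode, start, count);
   for (i = 0; i < lp->num_vertex_buffers; i++)
      draw_set_mapped_vertex_buffer(draw, i, NULL);
   if (indexBuffer)
      draw_set_mapped_element_buffer(draw, 0, NULL);
   draw_flush(draw);   /* TODO in the patch: flush only for user buffers */

llvmpipe_set_edgeflags() is removed outright, together with its prototype in the lp_state.h hunk below.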
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index b5aa7d680f1..9e0118c55b6 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -78,25 +78,22 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
/* struct lp_jit_context */
{
- LLVMTypeRef elem_types[5];
+ LLVMTypeRef elem_types[4];
LLVMTypeRef context_type;
elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* constants */
- elem_types[1] = LLVMPointerType(LLVMInt8Type(), 0); /* samplers */
- elem_types[2] = LLVMFloatType(); /* alpha_ref_value */
- elem_types[3] = LLVMPointerType(LLVMInt8Type(), 0); /* blend_color */
- elem_types[4] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+ elem_types[1] = LLVMFloatType(); /* alpha_ref_value */
+ elem_types[2] = LLVMPointerType(LLVMInt8Type(), 0); /* blend_color */
+ elem_types[3] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants,
screen->target, context_type, 0);
- LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers,
- screen->target, context_type, 1);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value,
- screen->target, context_type, 2);
+ screen->target, context_type, 1);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, blend_color,
- screen->target, context_type, 3);
+ screen->target, context_type, 2);
LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures,
screen->target, context_type,
LP_JIT_CONTEXT_TEXTURES_INDEX);
@@ -108,24 +105,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
screen->context_ptr_type = LLVMPointerType(context_type, 0);
}
- /* fetch_texel
- */
- {
- LLVMTypeRef ret_type;
- LLVMTypeRef arg_types[3];
- LLVMValueRef fetch_texel;
-
- ret_type = LLVMVoidType();
- arg_types[0] = LLVMPointerType(LLVMInt8Type(), 0); /* samplers */
- arg_types[1] = LLVMInt32Type(); /* unit */
- arg_types[2] = LLVMPointerType(LLVMVectorType(LLVMFloatType(), 4), 0); /* store */
-
- fetch_texel = lp_declare_intrinsic(screen->module, "fetch_texel",
- ret_type, arg_types, Elements(arg_types));
-
- LLVMAddGlobalMapping(screen->engine, fetch_texel, lp_fetch_texel_soa);
- }
-
#ifdef DEBUG
LLVMDumpModule(screen->module);
#endif
@@ -153,6 +132,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
#if 0
/* For simulating less capable machines */
util_cpu_caps.has_sse3 = 0;
+ util_cpu_caps.has_ssse3 = 0;
util_cpu_caps.has_sse4_1 = 0;
#endif
@@ -166,7 +146,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
if (LLVMCreateJITCompiler(&screen->engine, screen->provider, 1, &error)) {
_debug_printf("%s\n", error);
LLVMDisposeMessage(error);
- abort();
+ assert(0);
}
screen->target = LLVMGetExecutionEngineTargetData(screen->engine);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 58f716ede29..277b690c02c 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -41,7 +41,6 @@
#include "pipe/p_state.h"
-struct tgsi_sampler;
struct llvmpipe_screen;
@@ -78,8 +77,6 @@ struct lp_jit_context
{
const float *constants;
- struct tgsi_sampler **samplers;
-
float alpha_ref_value;
/* FIXME: store (also?) in floats */
@@ -92,16 +89,13 @@ struct lp_jit_context
#define lp_jit_context_constants(_builder, _ptr) \
lp_build_struct_get(_builder, _ptr, 0, "constants")
-#define lp_jit_context_samplers(_builder, _ptr) \
- lp_build_struct_get(_builder, _ptr, 1, "samplers")
-
#define lp_jit_context_alpha_ref_value(_builder, _ptr) \
- lp_build_struct_get(_builder, _ptr, 2, "alpha_ref_value")
+ lp_build_struct_get(_builder, _ptr, 1, "alpha_ref_value")
#define lp_jit_context_blend_color(_builder, _ptr) \
- lp_build_struct_get(_builder, _ptr, 3, "blend_color")
+ lp_build_struct_get(_builder, _ptr, 2, "blend_color")
-#define LP_JIT_CONTEXT_TEXTURES_INDEX 4
+#define LP_JIT_CONTEXT_TEXTURES_INDEX 3
#define lp_jit_context_textures(_builder, _ptr) \
lp_build_struct_get_ptr(_builder, _ptr, LP_JIT_CONTEXT_TEXTURES_INDEX, "textures")
@@ -118,12 +112,6 @@ typedef void
void *color,
void *depth);
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
- uint32_t unit,
- float *store );
-
-
void
lp_jit_screen_cleanup(struct llvmpipe_screen *screen);
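
Note: after the lp_jit.c and lp_jit.h hunks, the JIT context shrinks to four members, and the C struct and LLVM struct-type indices must stay in sync. A reconstruction of the resulting layout (member order from the hunks; the two trailing members are abbreviated):

   /* Sketch -- indices must match the LLVMStructType built in lp_jit_init_globals(). */
   struct lp_jit_context
   {
      const float *constants;   /* index 0 */
      float alpha_ref_value;    /* index 1 */
      /* blend_color pointer       index 2 */
      /* textures[PIPE_MAX_SAMPLERS] at LP_JIT_CONTEXT_TEXTURES_INDEX (3) */
   };

The samplers pointer and the lp_fetch_texel_soa() C fallback are gone entirely; texture access is now emitted directly into the generated fragment function.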
diff --git a/src/gallium/drivers/llvmpipe/lp_quad.h b/src/gallium/drivers/llvmpipe/lp_quad.h
index 7eb05de77a1..c3a48700a4e 100644
--- a/src/gallium/drivers/llvmpipe/lp_quad.h
+++ b/src/gallium/drivers/llvmpipe/lp_quad.h
@@ -31,6 +31,7 @@
#ifndef LP_QUAD_H
#define LP_QUAD_H
+#include "pipe/p_compiler.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_exec.h"
@@ -83,7 +84,7 @@ struct quad_header_inout
struct quad_header_output
{
/** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
- float ALIGN16_ATTRIB color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+ PIPE_ALIGN_VAR(16) float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
};
@@ -92,9 +93,9 @@ struct quad_header_output
*/
struct quad_interp_coef
{
- float ALIGN16_ATTRIB a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
- float ALIGN16_ATTRIB dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
- float ALIGN16_ATTRIB dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+ PIPE_ALIGN_VAR(16) float a0[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+ PIPE_ALIGN_VAR(16) float dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+ PIPE_ALIGN_VAR(16) float dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
};
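
Note: ALIGN16_ATTRIB / ALIGN_STACK are replaced throughout this patch by the portable PIPE_ALIGN_VAR(16) / PIPE_ALIGN_STACK macros from p_compiler.h (hence the new include above). The expansion below is an assumption for illustration, not a quote of that header:

   /* Assumed expansion -- illustrative only. */
   #if defined(__GNUC__)
   #  define PIPE_ALIGN_VAR(alignment) __attribute__((aligned(alignment)))
   #elif defined(_MSC_VER)
   #  define PIPE_ALIGN_VAR(alignment) __declspec(align(alignment))
   #endif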
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 05189274589..9b47415f003 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -27,6 +27,7 @@
#include "util/u_memory.h"
+#include "util/u_format.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
@@ -35,6 +36,24 @@
#include "lp_winsys.h"
#include "lp_jit.h"
#include "lp_screen.h"
+#include "lp_debug.h"
+
+#ifdef DEBUG
+int LP_DEBUG = 0;
+
+static const struct debug_named_value lp_debug_flags[] = {
+ { "pipe", DEBUG_PIPE },
+ { "tgsi", DEBUG_TGSI },
+ { "tex", DEBUG_TEX },
+ { "asm", DEBUG_ASM },
+ { "setup", DEBUG_SETUP },
+ { "rast", DEBUG_RAST },
+ { "query", DEBUG_QUERY },
+ { "screen", DEBUG_SCREEN },
+ { "jit", DEBUG_JIT },
+ {NULL, 0}
+};
+#endif
static const char *
@@ -58,7 +77,9 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return PIPE_MAX_SAMPLERS;
case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
- return PIPE_MAX_SAMPLERS;
+ return PIPE_MAX_VERTEX_SAMPLERS;
+ case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+ return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
case PIPE_CAP_NPOT_TEXTURES:
return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
@@ -131,17 +152,17 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
{
struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
struct llvmpipe_winsys *winsys = screen->winsys;
+ const struct util_format_description *format_desc;
+
+ format_desc = util_format_description(format);
+ if(!format_desc)
+ return FALSE;
assert(target == PIPE_TEXTURE_1D ||
target == PIPE_TEXTURE_2D ||
target == PIPE_TEXTURE_3D ||
target == PIPE_TEXTURE_CUBE);
- if(format == PIPE_FORMAT_Z16_UNORM)
- return FALSE;
- if(format == PIPE_FORMAT_S8_UNORM)
- return FALSE;
-
switch(format) {
case PIPE_FORMAT_DXT1_RGB:
case PIPE_FORMAT_DXT1_RGBA:
@@ -152,8 +173,51 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
break;
}
- if(tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
- return winsys->is_displaytarget_format_supported(winsys, format);
+ if(tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+ if(format_desc->block.width != 1 ||
+ format_desc->block.height != 1)
+ return FALSE;
+
+ if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
+ format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+ format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+ return FALSE;
+
+ if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
+ format_desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB)
+ return FALSE;
+ }
+
+ if(tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET) {
+ if(!winsys->is_displaytarget_format_supported(winsys, format))
+ return FALSE;
+ }
+
+ if(tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
+ if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+
+ /* FIXME: Temporary restriction. See lp_state_fs.c. */
+ if(format_desc->block.bits != 32)
+ return FALSE;
+ }
+
+ /* FIXME: Temporary restrictions. See lp_bld_sample_soa.c */
+ if(tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) {
+ if(format_desc->block.width != 1 ||
+ format_desc->block.height != 1)
+ return FALSE;
+
+ if(format_desc->layout != UTIL_FORMAT_LAYOUT_SCALAR &&
+ format_desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+ format_desc->layout != UTIL_FORMAT_LAYOUT_ARRAY)
+ return FALSE;
+
+ if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB &&
+ format_desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB &&
+ format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+ }
return TRUE;
}
@@ -213,6 +277,10 @@ llvmpipe_create_screen(struct llvmpipe_winsys *winsys)
{
struct llvmpipe_screen *screen = CALLOC_STRUCT(llvmpipe_screen);
+#ifdef DEBUG
+ LP_DEBUG = debug_get_flags_option("LP_DEBUG", lp_debug_flags, 0 );
+#endif
+
if (!screen)
return NULL;
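
Note: llvmpipe_is_format_supported() is now driven by the format description table rather than a hard-coded blacklist. The fields consulted above, for reference (the constraints shown are the ones this hunk imposes, not a complete list):

   const struct util_format_description *desc = util_format_description(format);
   /* desc->block.width, desc->block.height : must be 1x1 for render targets and samplers  */
   /* desc->block.bits                      : depth/stencil temporarily limited to 32 bits */
   /* desc->layout                          : SCALAR, ARITH or ARRAY only                  */
   /* desc->colorspace                      : RGB/SRGB for render targets; RGB/SRGB/ZS for samplers */

The LP_DEBUG parsing added to llvmpipe_create_screen() exists only in DEBUG builds, matching the #ifdef in lp_debug.h.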
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 95bc66e624d..0b2d3a28014 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -40,6 +40,8 @@
#include "draw/draw_private.h"
#include "draw/draw_vertex.h"
#include "pipe/p_shader_tokens.h"
+#include "pipe/p_thread.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "lp_bld_debug.h"
@@ -115,7 +117,7 @@ struct setup_context {
/**
* Execute fragment shader for the four fragments in the quad.
*/
-ALIGN_STACK
+PIPE_ALIGN_STACK
static void
shade_quads(struct llvmpipe_context *llvmpipe,
struct quad_header *quads[],
@@ -128,7 +130,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
uint8_t *tile;
uint8_t *color;
void *depth;
- uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
+ PIPE_ALIGN_VAR(16) uint32_t mask[4][NUM_CHANNELS];
unsigned chan_index;
unsigned q;
@@ -165,7 +167,7 @@ shade_quads(struct llvmpipe_context *llvmpipe,
assert((y % 2) == 0);
depth = llvmpipe->zsbuf_map +
y*llvmpipe->zsbuf_transfer->stride +
- 2*x*llvmpipe->zsbuf_transfer->block.size;
+ 2*x*util_format_get_blocksize(llvmpipe->zsbuf_transfer->texture->format);
}
else
depth = NULL;
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 7b26ce61a38..e16793186be 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -56,7 +56,6 @@
#define LP_NEW_QUERY 0x4000
-struct tgsi_sampler;
struct vertex_info;
struct pipe_context;
struct llvmpipe_context;
@@ -126,6 +125,10 @@ void *
llvmpipe_create_sampler_state(struct pipe_context *,
const struct pipe_sampler_state *);
void llvmpipe_bind_sampler_states(struct pipe_context *, unsigned, void **);
+void
+llvmpipe_bind_vertex_sampler_states(struct pipe_context *,
+ unsigned num_samplers,
+ void **samplers);
void llvmpipe_delete_sampler_state(struct pipe_context *, void *);
void *
@@ -151,7 +154,7 @@ void llvmpipe_set_clip_state( struct pipe_context *,
void llvmpipe_set_constant_buffer(struct pipe_context *,
uint shader, uint index,
- const struct pipe_constant_buffer *buf);
+ struct pipe_buffer *buf);
void *llvmpipe_create_fs_state(struct pipe_context *,
const struct pipe_shader_state *);
@@ -172,6 +175,11 @@ void llvmpipe_set_sampler_textures( struct pipe_context *,
unsigned num,
struct pipe_texture ** );
+void
+llvmpipe_set_vertex_sampler_textures(struct pipe_context *,
+ unsigned num_textures,
+ struct pipe_texture **);
+
void llvmpipe_set_viewport_state( struct pipe_context *,
const struct pipe_viewport_state * );
@@ -188,14 +196,14 @@ void llvmpipe_update_fs(struct llvmpipe_context *lp);
void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe );
-boolean llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+void llvmpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
unsigned start, unsigned count);
-boolean llvmpipe_draw_elements(struct pipe_context *pipe,
+void llvmpipe_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start, unsigned count);
-boolean
+void
llvmpipe_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -204,10 +212,6 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count);
void
-llvmpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
-
-
-void
llvmpipe_map_transfers(struct llvmpipe_context *lp);
void
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
index b2e75d3b14e..a94cd05ef20 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -34,6 +34,7 @@
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_debug_dump.h"
+#include "draw/draw_context.h"
#include "lp_screen.h"
#include "lp_context.h"
#include "lp_state.h"
@@ -51,6 +52,11 @@ void llvmpipe_bind_blend_state( struct pipe_context *pipe,
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ if (llvmpipe->blend == blend)
+ return;
+
+ draw_flush(llvmpipe->draw);
+
llvmpipe->blend = blend;
llvmpipe->dirty |= LP_NEW_BLEND;
@@ -69,6 +75,11 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
unsigned i, j;
+ if(memcmp(&llvmpipe->blend_color, blend_color, sizeof *blend_color) == 0)
+ return;
+
+ draw_flush(llvmpipe->draw);
+
memcpy(&llvmpipe->blend_color, blend_color, sizeof *blend_color);
if(!llvmpipe->jit_context.blend_color)
@@ -99,7 +110,12 @@ llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe,
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
- llvmpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+ if (llvmpipe->depth_stencil == depth_stencil)
+ return;
+
+ draw_flush(llvmpipe->draw);
+
+ llvmpipe->depth_stencil = depth_stencil;
if(llvmpipe->depth_stencil)
llvmpipe->jit_context.alpha_ref_value = llvmpipe->depth_stencil->alpha.ref_value;
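
Note: this file introduces the bind-time pattern that recurs in the rasterizer, sampler, fs and vs hunks below: skip redundant binds, and flush the draw module before swapping out state that in-flight geometry may depend on. A generic sketch (placeholder names, not real fields):

   /* Sketch of the pattern; 'member' and LP_NEW_FOO are placeholders. */
   if (llvmpipe->member == new_cso)
      return;                       /* no-op bind */
   draw_flush(llvmpipe->draw);      /* drain queued primitives first */
   llvmpipe->member = new_cso;
   llvmpipe->dirty |= LP_NEW_FOO;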
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index c753b183c0c..6c1ef6bc42d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -66,7 +66,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
/* compute vertex layout now */
const struct lp_fragment_shader *lpfs = llvmpipe->fs;
struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
- const uint num = draw_num_vs_outputs(llvmpipe->draw);
+ const uint num = draw_current_shader_outputs(llvmpipe->draw);
uint i;
/* Tell draw_vbuf to simply emit the whole post-xform vertex
@@ -116,13 +116,13 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
}
/* this includes texcoords and varying vars */
- src = draw_find_vs_output(llvmpipe->draw,
+ src = draw_find_shader_output(llvmpipe->draw,
lpfs->info.input_semantic_name[i],
lpfs->info.input_semantic_index[i]);
draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
}
- llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
+ llvmpipe->psize_slot = draw_find_shader_output(llvmpipe->draw,
TGSI_SEMANTIC_PSIZE, 0);
if (llvmpipe->psize_slot > 0) {
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
@@ -192,32 +192,6 @@ compute_cliprect(struct llvmpipe_context *lp)
}
-static void
-update_tgsi_samplers( struct llvmpipe_context *llvmpipe )
-{
- unsigned i;
-
- /* vertex shader samplers */
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- llvmpipe->tgsi.vert_samplers[i].sampler = llvmpipe->sampler[i];
- llvmpipe->tgsi.vert_samplers[i].texture = llvmpipe->texture[i];
- llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
- }
-
- /* fragment shader samplers */
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- llvmpipe->tgsi.frag_samplers[i].sampler = llvmpipe->sampler[i];
- llvmpipe->tgsi.frag_samplers[i].texture = llvmpipe->texture[i];
- llvmpipe->tgsi.frag_samplers[i].base.get_samples = lp_get_samples;
- }
-
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- lp_tex_tile_cache_validate_texture( llvmpipe->tex_cache[i] );
- }
-
- llvmpipe->jit_context.samplers = (struct tgsi_sampler **)llvmpipe->tgsi.frag_samplers_list;
-}
-
/* Hopefully this will remain quite simple, otherwise need to pull in
* something like the state tracker mechanism.
*/
@@ -233,8 +207,9 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
}
if (llvmpipe->dirty & (LP_NEW_SAMPLER |
- LP_NEW_TEXTURE))
- update_tgsi_samplers( llvmpipe );
+ LP_NEW_TEXTURE)) {
+ /* TODO */
+ }
if (llvmpipe->dirty & (LP_NEW_RASTERIZER |
LP_NEW_FS |
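
Note: draw_num_vs_outputs()/draw_find_vs_output() become draw_current_shader_outputs()/draw_find_shader_output(), i.e. the vertex layout is built from whichever shader stage the draw module runs last rather than from the vertex shader specifically (an interpretation of the rename, not stated in this hunk):

   /* Calls as used above; semantic_name/semantic_index stand in for the
    * fragment-shader input semantics being looked up. */
   const uint num = draw_current_shader_outputs(llvmpipe->draw);
   src = draw_find_shader_output(llvmpipe->draw, semantic_name, semantic_index);

update_tgsi_samplers() disappears along with the lp_shader_sampler machinery; the LP_NEW_SAMPLER/LP_NEW_TEXTURE branch keeps a TODO until the vertex-texture path is wired up.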
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 2297cbb76f8..70e2a082cc0 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -85,6 +85,7 @@
#include "lp_buffer.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
+#include "lp_debug.h"
static const unsigned char quad_offset_x[4] = {0, 1, 0, 1};
@@ -146,6 +147,20 @@ generate_depth(LLVMBuilderRef builder,
format_desc = util_format_description(key->zsbuf_format);
assert(format_desc);
+ /*
+ * Depths are expected to be between 0 and 1, even if they are stored in
+ * floats. Setting these bits here will ensure that the lp_build_conv() call
+ * below won't try to unnecessarily clamp the incoming values.
+ */
+ if(src_type.floating) {
+ src_type.sign = FALSE;
+ src_type.norm = TRUE;
+ }
+ else {
+ assert(!src_type.sign);
+ assert(src_type.norm);
+ }
+
/* Pick the depth type. */
dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
@@ -153,14 +168,11 @@ generate_depth(LLVMBuilderRef builder,
assert(dst_type.width == src_type.width);
assert(dst_type.length == src_type.length);
-#if 1
- src = lp_build_clamped_float_to_unsigned_norm(builder,
- src_type,
- dst_type.width,
- src);
-#else
lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-#endif
+
+ dst_ptr = LLVMBuildBitCast(builder,
+ dst_ptr,
+ LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
lp_build_depth_test(builder,
&key->depth,
@@ -395,59 +407,58 @@ generate_fragment(struct llvmpipe_context *lp,
unsigned i;
unsigned chan;
-#ifdef DEBUG
- tgsi_dump(shader->base.tokens, 0);
- if(key->depth.enabled) {
- debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
- debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
- debug_printf("depth.writemask = %u\n", key->depth.writemask);
- }
- if(key->alpha.enabled) {
- debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
- debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
- }
- if(key->blend.logicop_enable) {
- debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
- }
- else if(key->blend.blend_enable) {
- debug_printf("blend.rgb_func = %s\n", debug_dump_blend_func (key->blend.rgb_func, TRUE));
- debug_printf("rgb_src_factor = %s\n", debug_dump_blend_factor(key->blend.rgb_src_factor, TRUE));
- debug_printf("rgb_dst_factor = %s\n", debug_dump_blend_factor(key->blend.rgb_dst_factor, TRUE));
- debug_printf("alpha_func = %s\n", debug_dump_blend_func (key->blend.alpha_func, TRUE));
- debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_src_factor, TRUE));
- debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
- }
- debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
- for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
- if(key->sampler[i].format) {
- debug_printf("sampler[%u] = \n", i);
- debug_printf(" .format = %s\n",
- pf_name(key->sampler[i].format));
- debug_printf(" .target = %s\n",
- debug_dump_tex_target(key->sampler[i].target, TRUE));
- debug_printf(" .pot = %u %u %u\n",
- key->sampler[i].pot_width,
- key->sampler[i].pot_height,
- key->sampler[i].pot_depth);
- debug_printf(" .wrap = %s %s %s\n",
- debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
- debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
- debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
- debug_printf(" .min_img_filter = %s\n",
- debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
- debug_printf(" .min_mip_filter = %s\n",
- debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
- debug_printf(" .mag_img_filter = %s\n",
- debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
- if(key->sampler[i].compare_mode)
- debug_printf(" .compare_mode = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
- debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords);
- debug_printf(" .prefilter = %u\n", key->sampler[i].prefilter);
+ if (LP_DEBUG & DEBUG_JIT) {
+ tgsi_dump(shader->base.tokens, 0);
+ if(key->depth.enabled) {
+ debug_printf("depth.format = %s\n", pf_name(key->zsbuf_format));
+ debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
+ debug_printf("depth.writemask = %u\n", key->depth.writemask);
+ }
+ if(key->alpha.enabled) {
+ debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
+ debug_printf("alpha.ref_value = %f\n", key->alpha.ref_value);
+ }
+ if(key->blend.logicop_enable) {
+ debug_printf("blend.logicop_func = %u\n", key->blend.logicop_func);
+ }
+ else if(key->blend.blend_enable) {
+ debug_printf("blend.rgb_func = %s\n", debug_dump_blend_func (key->blend.rgb_func, TRUE));
+ debug_printf("rgb_src_factor = %s\n", debug_dump_blend_factor(key->blend.rgb_src_factor, TRUE));
+ debug_printf("rgb_dst_factor = %s\n", debug_dump_blend_factor(key->blend.rgb_dst_factor, TRUE));
+ debug_printf("alpha_func = %s\n", debug_dump_blend_func (key->blend.alpha_func, TRUE));
+ debug_printf("alpha_src_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_src_factor, TRUE));
+ debug_printf("alpha_dst_factor = %s\n", debug_dump_blend_factor(key->blend.alpha_dst_factor, TRUE));
+ }
+ debug_printf("blend.colormask = 0x%x\n", key->blend.colormask);
+ for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
+ if(key->sampler[i].format) {
+ debug_printf("sampler[%u] = \n", i);
+ debug_printf(" .format = %s\n",
+ pf_name(key->sampler[i].format));
+ debug_printf(" .target = %s\n",
+ debug_dump_tex_target(key->sampler[i].target, TRUE));
+ debug_printf(" .pot = %u %u %u\n",
+ key->sampler[i].pot_width,
+ key->sampler[i].pot_height,
+ key->sampler[i].pot_depth);
+ debug_printf(" .wrap = %s %s %s\n",
+ debug_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+ debug_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+ debug_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+ debug_printf(" .min_img_filter = %s\n",
+ debug_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+ debug_printf(" .min_mip_filter = %s\n",
+ debug_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+ debug_printf(" .mag_img_filter = %s\n",
+ debug_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+ if(key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+ debug_printf(" .compare_func = %s\n", debug_dump_func(key->sampler[i].compare_func, TRUE));
+ debug_printf(" .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+ debug_printf(" .prefilter = %u\n", key->sampler[i].prefilter);
+ }
}
}
-#endif
-
variant = CALLOC_STRUCT(lp_fragment_shader_variant);
if(!variant)
return NULL;
@@ -537,13 +548,8 @@ generate_fragment(struct llvmpipe_context *lp,
a0_ptr, dadx_ptr, dady_ptr,
x0, y0, 2, 0);
-#if 0
- /* C texture sampling */
- sampler = lp_c_sampler_soa_create(context_ptr);
-#else
/* code generated texture sampling */
sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
-#endif
for(i = 0; i < num_fs; ++i) {
LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
@@ -586,8 +592,8 @@ generate_fragment(struct llvmpipe_context *lp,
}
lp_build_conv_mask(builder, fs_type, blend_type,
- fs_mask, num_fs,
- &blend_mask, 1);
+ fs_mask, num_fs,
+ &blend_mask, 1);
/*
* Blending.
@@ -609,23 +615,24 @@ generate_fragment(struct llvmpipe_context *lp,
* Translate the LLVM IR into machine code.
*/
+#ifdef DEBUG
if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
LLVMDumpValue(variant->function);
- abort();
+ assert(0);
}
+#endif
LLVMRunFunctionPassManager(screen->pass, variant->function);
-#ifdef DEBUG
- LLVMDumpValue(variant->function);
- debug_printf("\n");
-#endif
+ if (LP_DEBUG & DEBUG_JIT) {
+ LLVMDumpValue(variant->function);
+ debug_printf("\n");
+ }
variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
-#ifdef DEBUG
- lp_disassemble(variant->jit_function);
-#endif
+ if (LP_DEBUG & DEBUG_ASM)
+ lp_disassemble(variant->jit_function);
variant->next = shader->variants;
shader->variants = variant;
@@ -659,7 +666,12 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
- llvmpipe->fs = (struct lp_fragment_shader *) fs;
+ if (llvmpipe->fs == fs)
+ return;
+
+ draw_flush(llvmpipe->draw);
+
+ llvmpipe->fs = fs;
llvmpipe->dirty |= LP_NEW_FS;
}
@@ -700,28 +712,27 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
void
llvmpipe_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *constants)
+ struct pipe_buffer *constants)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
- struct pipe_buffer *buffer = constants ? constants->buffer : NULL;
- unsigned size = buffer ? buffer->size : 0;
- const void *data = buffer ? llvmpipe_buffer(buffer)->data : NULL;
+ unsigned size = constants ? constants->size : 0;
+ const void *data = constants ? llvmpipe_buffer(constants)->data : NULL;
assert(shader < PIPE_SHADER_TYPES);
assert(index == 0);
- if(shader == PIPE_SHADER_VERTEX)
- draw_flush(llvmpipe->draw);
+ draw_flush(llvmpipe->draw);
/* note: reference counting */
- pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
+ pipe_buffer_reference(&llvmpipe->constants[shader], constants);
if(shader == PIPE_SHADER_FRAGMENT) {
llvmpipe->jit_context.constants = data;
}
if(shader == PIPE_SHADER_VERTEX) {
- draw_set_mapped_constant_buffer(llvmpipe->draw, data, size);
+ draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX,
+ data, size);
}
llvmpipe->dirty |= LP_NEW_CONSTANTS;
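
Note: two interface changes meet in this file. JIT debug output is now selected at run time (LP_DEBUG & DEBUG_JIT / DEBUG_ASM) instead of being compiled in unconditionally, and constant buffers are plain pipe_buffer pointers rather than pipe_constant_buffer wrappers. Condensed from the hunk above:

   /* Sketch of the new constant-buffer path; llvmpipe_buffer() is llvmpipe's
    * buffer wrapper exposing a 'data' pointer. */
   unsigned size    = constants ? constants->size : 0;
   const void *data = constants ? llvmpipe_buffer(constants)->data : NULL;
   pipe_buffer_reference(&llvmpipe->constants[shader], constants);
   if (shader == PIPE_SHADER_VERTEX)
      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX, data, size);

Note also that draw_flush() is now called for fragment constants as well, not only vertex constants.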
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index 4561c6b8456..aa3b5a3f91e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -41,14 +41,17 @@ llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
}
void llvmpipe_bind_rasterizer_state(struct pipe_context *pipe,
- void *setup)
+ void *rasterizer)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ if (llvmpipe->rasterizer == rasterizer)
+ return;
+
/* pass-through to draw module */
- draw_set_rasterizer_state(llvmpipe->draw, setup);
+ draw_set_rasterizer_state(llvmpipe->draw, rasterizer);
- llvmpipe->rasterizer = (struct pipe_rasterizer_state *)setup;
+ llvmpipe->rasterizer = rasterizer;
llvmpipe->dirty |= LP_NEW_RASTERIZER;
}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index c69d90c723a..d382f9ca87e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -78,6 +78,34 @@ llvmpipe_bind_sampler_states(struct pipe_context *pipe,
void
+llvmpipe_bind_vertex_sampler_states(struct pipe_context *pipe,
+ unsigned num_samplers,
+ void **samplers)
+{
+ struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ unsigned i;
+
+ assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_samplers == llvmpipe->num_vertex_samplers &&
+ !memcmp(llvmpipe->vertex_samplers, samplers, num_samplers * sizeof(void *)))
+ return;
+
+ draw_flush(llvmpipe->draw);
+
+ for (i = 0; i < num_samplers; ++i)
+ llvmpipe->vertex_samplers[i] = samplers[i];
+ for (i = num_samplers; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+ llvmpipe->vertex_samplers[i] = NULL;
+
+ llvmpipe->num_vertex_samplers = num_samplers;
+
+ llvmpipe->dirty |= LP_NEW_SAMPLER;
+}
+
+
+void
llvmpipe_set_sampler_textures(struct pipe_context *pipe,
unsigned num, struct pipe_texture **texture)
{
@@ -102,8 +130,8 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
if(tex) {
struct llvmpipe_texture *lp_tex = llvmpipe_texture(tex);
struct lp_jit_texture *jit_tex = &llvmpipe->jit_context.textures[i];
- jit_tex->width = tex->width[0];
- jit_tex->height = tex->height[0];
+ jit_tex->width = tex->width0;
+ jit_tex->height = tex->height0;
jit_tex->stride = lp_tex->stride[0];
if(!lp_tex->dt)
jit_tex->data = lp_tex->data;
@@ -117,6 +145,37 @@ llvmpipe_set_sampler_textures(struct pipe_context *pipe,
void
+llvmpipe_set_vertex_sampler_textures(struct pipe_context *pipe,
+ unsigned num_textures,
+ struct pipe_texture **textures)
+{
+ struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ uint i;
+
+ assert(num_textures <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_textures == llvmpipe->num_vertex_textures &&
+ !memcmp(llvmpipe->vertex_textures, textures, num_textures * sizeof(struct pipe_texture *))) {
+ return;
+ }
+
+ draw_flush(llvmpipe->draw);
+
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ struct pipe_texture *tex = i < num_textures ? textures[i] : NULL;
+
+ pipe_texture_reference(&llvmpipe->vertex_textures[i], tex);
+ lp_tex_tile_cache_set_texture(llvmpipe->vertex_tex_cache[i], tex);
+ }
+
+ llvmpipe->num_vertex_textures = num_textures;
+
+ llvmpipe->dirty |= LP_NEW_TEXTURE;
+}
+
+
+void
llvmpipe_delete_sampler_state(struct pipe_context *pipe,
void *sampler)
{
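
Note: besides the new vertex-sampler entry points, this hunk picks up the pipe_texture change from width[0]/height[0] arrays to single width0/height0 base-level fields; per-level sizes are derived with u_minify() (see the lp_tex_cache.c hunk below).

   /* As used above -- only base-level dimensions live in pipe_texture now. */
   jit_tex->width  = tex->width0;     /* was tex->width[0]  */
   jit_tex->height = tex->height0;    /* was tex->height[0] */

The vertex variants mirror the fragment ones but reference-count into llvmpipe->vertex_textures[] and validate through the new vertex_tex_cache[] array added in lp_context.h.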
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 9f745856742..3b08b0d1d70 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -34,6 +34,8 @@
#include "draw/draw_context.h"
+#include "util/u_format.h"
+
/**
* XXX this might get moved someday
@@ -48,6 +50,8 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
struct llvmpipe_context *lp = llvmpipe_context(pipe);
uint i;
+ draw_flush(lp->draw);
+
for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
/* check if changing cbuf */
if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
@@ -87,8 +91,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
if (lp->framebuffer.zsbuf) {
int depth_bits;
double mrd;
- depth_bits = pf_get_component_bits(lp->framebuffer.zsbuf->format,
- PIPE_FORMAT_COMP_Z);
+ depth_bits = util_format_get_component_bits(lp->framebuffer.zsbuf->format,
+ UTIL_FORMAT_COLORSPACE_ZS,
+ 0);
if (depth_bits > 16) {
mrd = 0.0000001;
}
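
Note: the depth-bit query switches from pf_get_component_bits() to util_format_get_component_bits(), which takes an explicit colorspace and component index. As used above:

   /* Only the >16-bit branch of the mrd computation is visible in this hunk;
    * the remaining cases are not shown here. */
   depth_bits = util_format_get_component_bits(lp->framebuffer.zsbuf->format,
                                               UTIL_FORMAT_COLORSPACE_ZS, 0);
   if (depth_bits > 16)
      mrd = 0.0000001;

The unconditional draw_flush() at the top of llvmpipe_set_framebuffer_state() follows the same "flush before state change" rule as the CSO binds.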
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vs.c b/src/gallium/drivers/llvmpipe/lp_state_vs.c
index 15c30296144..884e3878e62 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vs.c
@@ -70,14 +70,18 @@ fail:
void
-llvmpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
+llvmpipe_bind_vs_state(struct pipe_context *pipe, void *_vs)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ const struct lp_vertex_shader *vs = (const struct lp_vertex_shader *)_vs;
- llvmpipe->vs = (const struct lp_vertex_shader *)vs;
+ if (llvmpipe->vs == vs)
+ return;
- draw_bind_vertex_shader(llvmpipe->draw,
- (llvmpipe->vs ? llvmpipe->vs->draw_data : NULL));
+ draw_bind_vertex_shader(llvmpipe->draw,
+ vs ? vs->draw_data : NULL);
+
+ llvmpipe->vs = vs;
llvmpipe->dirty |= LP_NEW_VS;
}
@@ -92,5 +96,6 @@ llvmpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
(struct lp_vertex_shader *)vs;
draw_delete_vertex_shader(llvmpipe->draw, state->draw_data);
+ FREE( (void *)state->shader.tokens );
FREE( state );
}
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index 411e99d06c9..7b65bab43a4 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -461,7 +461,7 @@ compute_blend_ref(const struct pipe_blend_state *blend,
}
-ALIGN_STACK
+PIPE_ALIGN_STACK
static boolean
test_one(unsigned verbose,
FILE *fp,
@@ -530,11 +530,11 @@ test_one(unsigned verbose,
success = TRUE;
for(i = 0; i < n && success; ++i) {
if(mode == AoS) {
- ALIGN16_ATTRIB uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t src[LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t dst[LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t con[LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t res[LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t ref[LP_NATIVE_VECTOR_WIDTH/8];
int64_t start_counter = 0;
int64_t end_counter = 0;
@@ -595,11 +595,11 @@ test_one(unsigned verbose,
if(mode == SoA) {
const unsigned stride = type.length*type.width/8;
- ALIGN16_ATTRIB uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
- ALIGN16_ATTRIB uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t src[4*LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t dst[4*LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t con[4*LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t res[4*LP_NATIVE_VECTOR_WIDTH/8];
+ PIPE_ALIGN_VAR(16) uint8_t ref[4*LP_NATIVE_VECTOR_WIDTH/8];
int64_t start_counter = 0;
int64_t end_counter = 0;
boolean mismatch;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index faddfb96779..c1abee424c9 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -142,7 +142,7 @@ add_conv_test(LLVMModuleRef module,
}
-ALIGN_STACK
+PIPE_ALIGN_STACK
static boolean
test_one(unsigned verbose,
FILE *fp,
@@ -230,8 +230,8 @@ test_one(unsigned verbose,
for(i = 0; i < n && success; ++i) {
unsigned src_stride = src_type.length*src_type.width/8;
unsigned dst_stride = dst_type.length*dst_type.width/8;
- ALIGN16_ATTRIB uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
- ALIGN16_ATTRIB uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+ PIPE_ALIGN_VAR(16) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
+ PIPE_ALIGN_VAR(16) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH];
int64_t start_counter = 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index 23ea9ebbe7d..2b258f1052e 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -199,7 +199,7 @@ add_store_rgba_test(LLVMModuleRef module,
}
-ALIGN_STACK
+PIPE_ALIGN_STACK
static boolean
test_format(unsigned verbose, FILE *fp, const struct pixel_test_case *test)
{
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.c b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
index b28f8dc5da2..8094625d74d 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.c
@@ -36,6 +36,7 @@
#include "util/u_memory.h"
#include "util/u_tile.h"
#include "util/u_format.h"
+#include "util/u_math.h"
#include "lp_context.h"
#include "lp_texture.h"
#include "lp_tex_cache.h"
@@ -153,7 +154,6 @@ lp_tex_tile_cache_validate_texture(struct llvmpipe_tex_tile_cache *tc)
if (lpt->timestamp != tc->timestamp) {
/* texture was modified, invalidate all cached tiles */
uint i;
- debug_printf("INV %d %d\n", tc->timestamp, lpt->timestamp);
for (i = 0; i < NUM_ENTRIES; i++) {
tc->entries[i].addr.bits.invalid = 1;
}
@@ -269,8 +269,8 @@ lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
addr.bits.level,
addr.bits.z,
PIPE_TRANSFER_READ, 0, 0,
- tc->texture->width[addr.bits.level],
- tc->texture->height[addr.bits.level]);
+ u_minify(tc->texture->width0, addr.bits.level),
+ u_minify(tc->texture->height0, addr.bits.level));
tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
@@ -289,7 +289,7 @@ lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
assert(0);
}
- util_format_read_4ub(tc->tex_trans->format,
+ util_format_read_4ub(tc->tex_trans->texture->format,
(uint8_t *)tile->color, sizeof tile->color[0],
tc->tex_trans_map, tc->tex_trans->stride,
x, y, w, h);
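
Note: per-level texture sizes are now computed with u_minify() from util/u_math.h (hence the new include), and transfers no longer carry their own format/block fields, so the owning texture's format is queried instead. The assumed definition of u_minify(), for reference:

   /* Assumed to match util/u_math.h -- halve per level, never below 1. */
   static INLINE unsigned
   u_minify(unsigned value, unsigned levels)
   {
      return MAX2(1, value >> levels);
   }

The debug_printf() removed above was unconditional cache-invalidation chatter; LP_DBG(DEBUG_TEX, ...) would be the natural replacement if that trace is still wanted.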
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index 9ad1bde9565..cb59a94464a 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -31,64 +31,11 @@
#include <llvm-c/Core.h>
-#include "tgsi/tgsi_exec.h"
-
-struct llvmpipe_tex_tile_cache;
struct lp_sampler_static_state;
/**
- * Subclass of tgsi_sampler
- */
-struct lp_shader_sampler
-{
- struct tgsi_sampler base; /**< base class */
-
- unsigned processor;
-
- /* For lp_get_samples_2d_linear_POT:
- */
- unsigned xpot;
- unsigned ypot;
- unsigned level;
-
- const struct pipe_texture *texture;
- const struct pipe_sampler_state *sampler;
-
- struct llvmpipe_tex_tile_cache *cache;
-};
-
-
-
-static INLINE struct lp_shader_sampler *
-lp_shader_sampler(const struct tgsi_sampler *sampler)
-{
- return (struct lp_shader_sampler *) sampler;
-}
-
-
-
-extern void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE]);
-
-
-/**
- * Texture sampling code generator that just calls lp_get_samples C function
- * for the actual sampling computation.
- *
- * @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
- */
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr);
-
-
-/**
* Pure-LLVM texture sampling code generator.
*
* @param context_ptr LLVM value with the pointer to the struct lp_jit_context.
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
deleted file mode 100644
index d1f5d9505d7..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
+++ /dev/null
@@ -1,1712 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * Copyright 2008 VMware, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * Texture sampling
- *
- * Authors:
- * Brian Paul
- */
-
-#include "lp_context.h"
-#include "lp_quad.h"
-#include "lp_texture.h"
-#include "lp_tex_sample.h"
-#include "lp_tex_cache.h"
-#include "pipe/p_context.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-
-
-/*
- * Note, the FRAC macro has to work perfectly. Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
- * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
- */
-#define FRAC(f) ((f) - util_ifloor(f))
-
-
-/**
- * Linear interpolation macro
- */
-static INLINE float
-lerp(float a, float v0, float v1)
-{
- return v0 + a * (v1 - v0);
-}
-
-
-/**
- * Do 2D/biliner interpolation of float values.
- * v00, v10, v01 and v11 are typically four texture samples in a square/box.
- * a and b are the horizontal and vertical interpolants.
- * It's important that this function is inlined when compiled with
- * optimization! If we find that's not true on some systems, convert
- * to a macro.
- */
-static INLINE float
-lerp_2d(float a, float b,
- float v00, float v10, float v01, float v11)
-{
- const float temp0 = lerp(a, v00, v10);
- const float temp1 = lerp(a, v01, v11);
- return lerp(b, temp0, temp1);
-}
-
-
-/**
- * As above, but 3D interpolation of 8 values.
- */
-static INLINE float
-lerp_3d(float a, float b, float c,
- float v000, float v100, float v010, float v110,
- float v001, float v101, float v011, float v111)
-{
- const float temp0 = lerp_2d(a, b, v000, v100, v010, v110);
- const float temp1 = lerp_2d(a, b, v001, v101, v011, v111);
- return lerp(c, temp0, temp1);
-}
-
-
-
-/**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat). Just casting to unsigned fixes that.
- */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
-
-
-/**
- * Apply texture coord wrapping mode and return integer texture indexes
- * for a vector of four texcoords (S or T or P).
- * \param wrapMode PIPE_TEX_WRAP_x
- * \param s the incoming texcoords
- * \param size the texture image size
- * \param icoord returns the integer texcoords
- * \return integer texture index
- */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
- int icoord[4])
-{
- uint ch;
- switch (wrapMode) {
- case PIPE_TEX_WRAP_REPEAT:
- /* s limited to [0,1) */
- /* i limited to [0,size-1] */
- for (ch = 0; ch < 4; ch++) {
- int i = util_ifloor(s[ch] * size);
- icoord[ch] = REMAINDER(i, size);
- }
- return;
- case PIPE_TEX_WRAP_CLAMP:
- /* s limited to [0,1] */
- /* i limited to [0,size-1] */
- for (ch = 0; ch < 4; ch++) {
- if (s[ch] <= 0.0F)
- icoord[ch] = 0;
- else if (s[ch] >= 1.0F)
- icoord[ch] = size - 1;
- else
- icoord[ch] = util_ifloor(s[ch] * size);
- }
- return;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- {
- /* s limited to [min,max] */
- /* i limited to [0, size-1] */
- const float min = 1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- if (s[ch] < min)
- icoord[ch] = 0;
- else if (s[ch] > max)
- icoord[ch] = size - 1;
- else
- icoord[ch] = util_ifloor(s[ch] * size);
- }
- }
- return;
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- {
- /* s limited to [min,max] */
- /* i limited to [-1, size] */
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- if (s[ch] <= min)
- icoord[ch] = -1;
- else if (s[ch] >= max)
- icoord[ch] = size;
- else
- icoord[ch] = util_ifloor(s[ch] * size);
- }
- }
- return;
- case PIPE_TEX_WRAP_MIRROR_REPEAT:
- {
- const float min = 1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- const int flr = util_ifloor(s[ch]);
- float u;
- if (flr & 1)
- u = 1.0F - (s[ch] - (float) flr);
- else
- u = s[ch] - (float) flr;
- if (u < min)
- icoord[ch] = 0;
- else if (u > max)
- icoord[ch] = size - 1;
- else
- icoord[ch] = util_ifloor(u * size);
- }
- }
- return;
- case PIPE_TEX_WRAP_MIRROR_CLAMP:
- for (ch = 0; ch < 4; ch++) {
- /* s limited to [0,1] */
- /* i limited to [0,size-1] */
- const float u = fabsf(s[ch]);
- if (u <= 0.0F)
- icoord[ch] = 0;
- else if (u >= 1.0F)
- icoord[ch] = size - 1;
- else
- icoord[ch] = util_ifloor(u * size);
- }
- return;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
- {
- /* s limited to [min,max] */
- /* i limited to [0, size-1] */
- const float min = 1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- const float u = fabsf(s[ch]);
- if (u < min)
- icoord[ch] = 0;
- else if (u > max)
- icoord[ch] = size - 1;
- else
- icoord[ch] = util_ifloor(u * size);
- }
- }
- return;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
- {
- /* s limited to [min,max] */
- /* i limited to [0, size-1] */
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- const float u = fabsf(s[ch]);
- if (u < min)
- icoord[ch] = -1;
- else if (u > max)
- icoord[ch] = size;
- else
- icoord[ch] = util_ifloor(u * size);
- }
- }
- return;
- default:
- assert(0);
- }
-}
-
-
-/**
- * Used to compute texel locations for linear sampling for four texcoords.
- * \param wrapMode PIPE_TEX_WRAP_x
- * \param s the texcoords
- * \param size the texture image size
- * \param icoord0 returns first texture indexes
- * \param icoord1 returns second texture indexes (usually icoord0 + 1)
- * \param w returns blend factor/weight between texture indexes
- * \param icoord returns the computed integer texture coords
- */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
- int icoord0[4], int icoord1[4], float w[4])
-{
- uint ch;
-
- switch (wrapMode) {
- case PIPE_TEX_WRAP_REPEAT:
- for (ch = 0; ch < 4; ch++) {
- float u = s[ch] * size - 0.5F;
- icoord0[ch] = REMAINDER(util_ifloor(u), size);
- icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_CLAMP:
- for (ch = 0; ch < 4; ch++) {
- float u = CLAMP(s[ch], 0.0F, 1.0F);
- u = u * size - 0.5f;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- for (ch = 0; ch < 4; ch++) {
- float u = CLAMP(s[ch], 0.0F, 1.0F);
- u = u * size - 0.5f;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- if (icoord0[ch] < 0)
- icoord0[ch] = 0;
- if (icoord1[ch] >= (int) size)
- icoord1[ch] = size - 1;
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- {
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- float u = CLAMP(s[ch], min, max);
- u = u * size - 0.5f;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- w[ch] = FRAC(u);
- }
- }
- break;;
- case PIPE_TEX_WRAP_MIRROR_REPEAT:
- for (ch = 0; ch < 4; ch++) {
- const int flr = util_ifloor(s[ch]);
- float u;
- if (flr & 1)
- u = 1.0F - (s[ch] - (float) flr);
- else
- u = s[ch] - (float) flr;
- u = u * size - 0.5F;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- if (icoord0[ch] < 0)
- icoord0[ch] = 0;
- if (icoord1[ch] >= (int) size)
- icoord1[ch] = size - 1;
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_MIRROR_CLAMP:
- for (ch = 0; ch < 4; ch++) {
- float u = fabsf(s[ch]);
- if (u >= 1.0F)
- u = (float) size;
- else
- u *= size;
- u -= 0.5F;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
- for (ch = 0; ch < 4; ch++) {
- float u = fabsf(s[ch]);
- if (u >= 1.0F)
- u = (float) size;
- else
- u *= size;
- u -= 0.5F;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- if (icoord0[ch] < 0)
- icoord0[ch] = 0;
- if (icoord1[ch] >= (int) size)
- icoord1[ch] = size - 1;
- w[ch] = FRAC(u);
- }
- break;;
- case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
- {
- const float min = -1.0F / (2.0F * size);
- const float max = 1.0F - min;
- for (ch = 0; ch < 4; ch++) {
- float u = fabsf(s[ch]);
- if (u <= min)
- u = min * size;
- else if (u >= max)
- u = max * size;
- else
- u *= size;
- u -= 0.5F;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- w[ch] = FRAC(u);
- }
- }
- break;;
- default:
- assert(0);
- }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
- int icoord[4])
-{
- uint ch;
- switch (wrapMode) {
- case PIPE_TEX_WRAP_CLAMP:
- for (ch = 0; ch < 4; ch++) {
- int i = util_ifloor(s[ch]);
- icoord[ch]= CLAMP(i, 0, (int) size-1);
- }
- return;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- /* fall-through */
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- for (ch = 0; ch < 4; ch++) {
- icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
- }
- return;
- default:
- assert(0);
- }
-}
-
-
-/**
- * For RECT textures / unnormalized texcoords.
- * Only a subset of wrap modes supported.
- */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
- int icoord0[4], int icoord1[4], float w[4])
-{
- uint ch;
- switch (wrapMode) {
- case PIPE_TEX_WRAP_CLAMP:
- for (ch = 0; ch < 4; ch++) {
- /* Not exactly what the spec says, but it matches NVIDIA output */
- float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- w[ch] = FRAC(u);
- }
- return;
- case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- /* fall-through */
- case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- for (ch = 0; ch < 4; ch++) {
- float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
- u -= 0.5F;
- icoord0[ch] = util_ifloor(u);
- icoord1[ch] = icoord0[ch] + 1;
- if (icoord1[ch] > (int) size - 1)
- icoord1[ch] = size - 1;
- w[ch] = FRAC(u);
- }
- break;
- default:
- assert(0);
- }
-}
-
-
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
-{
- /*
- major axis
- direction target sc tc ma
- ---------- ------------------------------- --- --- ---
- +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx
- -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx
- +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry
- -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry
- +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz
- -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz
- */
- const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
- unsigned face;
- float sc, tc, ma;
-
- if (arx > ary && arx > arz) {
- if (rx >= 0.0F) {
- face = PIPE_TEX_FACE_POS_X;
- sc = -rz;
- tc = -ry;
- ma = arx;
- }
- else {
- face = PIPE_TEX_FACE_NEG_X;
- sc = rz;
- tc = -ry;
- ma = arx;
- }
- }
- else if (ary > arx && ary > arz) {
- if (ry >= 0.0F) {
- face = PIPE_TEX_FACE_POS_Y;
- sc = rx;
- tc = rz;
- ma = ary;
- }
- else {
- face = PIPE_TEX_FACE_NEG_Y;
- sc = rx;
- tc = -rz;
- ma = ary;
- }
- }
- else {
- if (rz > 0.0F) {
- face = PIPE_TEX_FACE_POS_Z;
- sc = rx;
- tc = -ry;
- ma = arz;
- }
- else {
- face = PIPE_TEX_FACE_NEG_Z;
- sc = -rx;
- tc = -ry;
- ma = arz;
- }
- }
-
- *newS = ( sc / ma + 1.0F ) * 0.5F;
- *newT = ( tc / ma + 1.0F ) * 0.5F;
-
- return face;
-}
-
-
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias)
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
- float rho, lambda;
-
- if (samp->processor == TGSI_PROCESSOR_VERTEX)
- return lodbias;
-
- assert(sampler->normalized_coords);
-
- assert(s);
- {
- float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
- float dsdy = s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT];
- dsdx = fabsf(dsdx);
- dsdy = fabsf(dsdy);
- rho = MAX2(dsdx, dsdy) * texture->width[0];
- }
- if (t) {
- float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
- float dtdy = t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT];
- float max;
- dtdx = fabsf(dtdx);
- dtdy = fabsf(dtdy);
- max = MAX2(dtdx, dtdy) * texture->height[0];
- rho = MAX2(rho, max);
- }
- if (p) {
- float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
- float dpdy = p[QUAD_TOP_LEFT] - p[QUAD_BOTTOM_LEFT];
- float max;
- dpdx = fabsf(dpdx);
- dpdy = fabsf(dpdy);
- max = MAX2(dpdx, dpdy) * texture->depth[0];
- rho = MAX2(rho, max);
- }
-
- lambda = util_fast_log2(rho);
- lambda += lodbias + sampler->lod_bias;
- lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
- return lambda;
-}
-
-
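A standalone sketch of the same arithmetic with made-up derivative values (link with -lm): rho is the largest texel-space rate of change across the quad, and lambda is its base-2 log plus the combined bias, clamped to the sampler's LOD range.

#include <math.h>
#include <stdio.h>

int
main(void)
{
   const float dsdx = 0.02f, dsdy = 0.01f;   /* ds across the 2x2 quad */
   const float width = 256.0f;               /* level-0 width */
   const float lod_bias = 0.0f, min_lod = 0.0f, max_lod = 8.0f;
   float rho = fmaxf(fabsf(dsdx), fabsf(dsdy)) * width;
   float lambda = log2f(rho) + lod_bias;
   if (lambda < min_lod) lambda = min_lod;
   if (lambda > max_lod) lambda = max_lod;
   printf("rho=%f lambda=%f\n", rho, lambda);   /* rho=5.12, lambda~=2.36 */
   return 0;
}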
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0 Returns first mipmap level to sample from
- * \param level1 Returns second mipmap level to sample from
- * \param levelBlend Returns blend factor between levels, in [0,1]
- * \param imgFilter Returns either the min or mag filter, depending on lambda
- */
-static void
-choose_mipmap_levels(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- unsigned *level0, unsigned *level1, float *levelBlend,
- unsigned *imgFilter)
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
-
- if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
- /* no mipmap selection needed */
- *level0 = *level1 = CLAMP((int) sampler->min_lod,
- 0, (int) texture->last_level);
-
- if (sampler->min_img_filter != sampler->mag_img_filter) {
- /* non-mipmapped texture, but still need to determine if doing
- * minification or magnification.
- */
- float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
- if (lambda <= 0.0) {
- *imgFilter = sampler->mag_img_filter;
- }
- else {
- *imgFilter = sampler->min_img_filter;
- }
- }
- else {
- *imgFilter = sampler->mag_img_filter;
- }
- }
- else {
- float lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
-
- if (lambda <= 0.0) { /* XXX threshold depends on the filter */
- /* magnifying */
- *imgFilter = sampler->mag_img_filter;
- *level0 = *level1 = 0;
- }
- else {
- /* minifying */
- *imgFilter = sampler->min_img_filter;
-
- /* choose mipmap level(s) and compute the blend factor between them */
- if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
- /* Nearest mipmap level */
- const int lvl = (int) (lambda + 0.5);
- *level0 =
- *level1 = CLAMP(lvl, 0, (int) texture->last_level);
- }
- else {
- /* Linear interpolation between mipmap levels */
- const int lvl = (int) lambda;
- *level0 = CLAMP(lvl, 0, (int) texture->last_level);
- *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
- *levelBlend = FRAC(lambda); /* blending weight between levels */
- }
- }
- }
-}
-
-
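To make the two minification branches concrete, a small standalone sketch with an assumed lambda and last_level (values illustrative, lower-bound clamping omitted, link with -lm): nearest mip filtering rounds lambda to one level, linear mip filtering takes the two surrounding levels plus a blend weight.

#include <math.h>
#include <stdio.h>

int
main(void)
{
   const float lambda = 2.356f;
   const int last_level = 9;
   int nearest = (int) (lambda + 0.5f);              /* 2 */
   int level0 = (int) lambda;                        /* 2 */
   int level1 = level0 + 1;                          /* 3 */
   float blend = lambda - floorf(lambda);            /* ~0.356 */
   if (nearest > last_level) nearest = last_level;
   if (level1 > last_level) level1 = last_level;
   printf("nearest=%d  linear=%d/%d blend=%f\n", nearest, level0, level1, blend);
   return 0;
}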
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face the cube face in 0..5
- * \param level the mipmap level
- * \param x the x coord of texel within 2D image
- * \param y the y coord of texel within 2D image
- * \param z which slice of a 3D texture
- * \param rgba the quad to put the texel/color into
- * \param j which element of the rgba quad to write to
- *
- * XXX maybe move this into lp_tile_cache.c and merge with the
- * lp_get_cached_tile_tex() function. Also, get 4 texels instead of 1...
- */
-static void
-get_texel_quad_2d(const struct tgsi_sampler *tgsi_sampler,
- unsigned face, unsigned level, int x, int y,
- const uint8_t *out[4])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
- const struct llvmpipe_cached_tex_tile *tile
- = lp_get_cached_tex_tile(samp->cache,
- tex_tile_address(x, y, 0, face, level));
-
- y %= TEX_TILE_SIZE;
- x %= TEX_TILE_SIZE;
-
- out[0] = &tile->color[y ][x ][0];
- out[1] = &tile->color[y ][x+1][0];
- out[2] = &tile->color[y+1][x ][0];
- out[3] = &tile->color[y+1][x+1][0];
-}
-
-static INLINE const uint8_t *
-get_texel_2d_ptr(const struct tgsi_sampler *tgsi_sampler,
- unsigned face, unsigned level, int x, int y)
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
-
- const struct llvmpipe_cached_tex_tile *tile
- = lp_get_cached_tex_tile(samp->cache,
- tex_tile_address(x, y, 0, face, level));
-
- y %= TEX_TILE_SIZE;
- x %= TEX_TILE_SIZE;
-
- return &tile->color[y][x][0];
-}
-
-
-static void
-get_texel_quad_2d_mt(const struct tgsi_sampler *tgsi_sampler,
- unsigned face, unsigned level,
- int x0, int y0,
- int x1, int y1,
- const uint8_t *out[4])
-{
- unsigned i;
-
- for (i = 0; i < 4; i++) {
- unsigned tx = (i & 1) ? x1 : x0;
- unsigned ty = (i >> 1) ? y1 : y0;
-
- out[i] = get_texel_2d_ptr( tgsi_sampler, face, level, tx, ty );
- }
-}
-
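get_texel_quad_2d() can hand back four pointers into a single cached tile only when the 2x2 footprint does not straddle a tile boundary; otherwise the _mt variant resolves each texel through its own tile lookup. A standalone sketch of the tile-boundary half of that decision, with TEX_TILE_SIZE assumed to be 64 (the real fast-path test above is more conservative and also folds in the texture edge for REPEAT wrapping):

#include <stdio.h>

#define TEX_TILE_SIZE 64

int
main(void)
{
   int x = 63, y = 10;                  /* right on a tile boundary in x */
   int in_tile_x = x % TEX_TILE_SIZE;   /* 63 */
   int in_tile_y = y % TEX_TILE_SIZE;   /* 10 */
   int single_tile = (in_tile_x < TEX_TILE_SIZE - 1) &&
                     (in_tile_y < TEX_TILE_SIZE - 1);
   printf("single_tile=%d\n", single_tile);   /* 0: must take the _mt path */
   return 0;
}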
-static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
- unsigned face, unsigned level, int x, int y, int z,
- float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
-
- if (x < 0 || x >= (int) texture->width[level] ||
- y < 0 || y >= (int) texture->height[level] ||
- z < 0 || z >= (int) texture->depth[level]) {
- rgba[0][j] = sampler->border_color[0];
- rgba[1][j] = sampler->border_color[1];
- rgba[2][j] = sampler->border_color[2];
- rgba[3][j] = sampler->border_color[3];
- }
- else {
- const unsigned tx = x % TEX_TILE_SIZE;
- const unsigned ty = y % TEX_TILE_SIZE;
- const struct llvmpipe_cached_tex_tile *tile;
-
- tile = lp_get_cached_tex_tile(samp->cache,
- tex_tile_address(x, y, z, face, level));
-
- rgba[0][j] = ubyte_to_float(tile->color[ty][tx][0]);
- rgba[1][j] = ubyte_to_float(tile->color[ty][tx][1]);
- rgba[2][j] = ubyte_to_float(tile->color[ty][tx][2]);
- rgba[3][j] = ubyte_to_float(tile->color[ty][tx][3]);
- if (0)
- {
- debug_printf("Get texel %f %f %f %f from %s\n",
- rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
- pf_name(texture->format));
- }
- }
-}
-
-
-/**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels. We look at the red channel here.
- * \param rgba quad of (depth) texel values
- * \param p texture 'P' components for four pixels in quad
- * \param j which pixel in the quad to test [0..3]
- */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
- float rgba[NUM_CHANNELS][QUAD_SIZE],
- const float p[QUAD_SIZE],
- uint j)
-{
- int k;
- switch (sampler->compare_func) {
- case PIPE_FUNC_LESS:
- k = p[j] < rgba[0][j];
- break;
- case PIPE_FUNC_LEQUAL:
- k = p[j] <= rgba[0][j];
- break;
- case PIPE_FUNC_GREATER:
- k = p[j] > rgba[0][j];
- break;
- case PIPE_FUNC_GEQUAL:
- k = p[j] >= rgba[0][j];
- break;
- case PIPE_FUNC_EQUAL:
- k = p[j] == rgba[0][j];
- break;
- case PIPE_FUNC_NOTEQUAL:
- k = p[j] != rgba[0][j];
- break;
- case PIPE_FUNC_ALWAYS:
- k = 1;
- break;
- case PIPE_FUNC_NEVER:
- k = 0;
- break;
- default:
- k = 0;
- assert(0);
- break;
- }
-
- /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
- rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
- rgba[3][j] = 1.0F;
-}
-
-
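A standalone example of one such comparison (LEQUAL, made-up depth values): the pass/fail result is broadcast to R, G and B and alpha is forced to one, matching the GL_LUMINANCE depth-texture convention noted above.

#include <stdio.h>

int
main(void)
{
   const float p = 0.40f;       /* reference depth from the texcoord */
   const float texel = 0.55f;   /* depth value stored in the texture */
   float k = (p <= texel) ? 1.0f : 0.0f;
   float rgba[4] = { k, k, k, 1.0f };
   printf("%f %f %f %f\n", rgba[0], rgba[1], rgba[2], rgba[3]);
   return 0;
}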
-/**
- * As above, but do four z/texture comparisons.
- */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
- float rgba[NUM_CHANNELS][QUAD_SIZE],
- const float p[QUAD_SIZE])
-{
- int j, k0, k1, k2, k3;
- float val;
-
- /* compare four texcoords vs. four texture samples */
- switch (sampler->compare_func) {
- case PIPE_FUNC_LESS:
- k0 = p[0] < rgba[0][0];
- k1 = p[1] < rgba[0][1];
- k2 = p[2] < rgba[0][2];
- k3 = p[3] < rgba[0][3];
- break;
- case PIPE_FUNC_LEQUAL:
- k0 = p[0] <= rgba[0][0];
- k1 = p[1] <= rgba[0][1];
- k2 = p[2] <= rgba[0][2];
- k3 = p[3] <= rgba[0][3];
- break;
- case PIPE_FUNC_GREATER:
- k0 = p[0] > rgba[0][0];
- k1 = p[1] > rgba[0][1];
- k2 = p[2] > rgba[0][2];
- k3 = p[3] > rgba[0][3];
- break;
- case PIPE_FUNC_GEQUAL:
- k0 = p[0] >= rgba[0][0];
- k1 = p[1] >= rgba[0][1];
- k2 = p[2] >= rgba[0][2];
- k3 = p[3] >= rgba[0][3];
- break;
- case PIPE_FUNC_EQUAL:
- k0 = p[0] == rgba[0][0];
- k1 = p[1] == rgba[0][1];
- k2 = p[2] == rgba[0][2];
- k3 = p[3] == rgba[0][3];
- break;
- case PIPE_FUNC_NOTEQUAL:
- k0 = p[0] != rgba[0][0];
- k1 = p[1] != rgba[0][1];
- k2 = p[2] != rgba[0][2];
- k3 = p[3] != rgba[0][3];
- break;
- case PIPE_FUNC_ALWAYS:
- k0 = k1 = k2 = k3 = 1;
- break;
- case PIPE_FUNC_NEVER:
- k0 = k1 = k2 = k3 = 0;
- break;
- default:
- k0 = k1 = k2 = k3 = 0;
- assert(0);
- break;
- }
-
- /* convert four pass/fail values to an intensity in [0,1] */
- val = 0.25F * (k0 + k1 + k2 + k3);
-
- /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
- for (j = 0; j < 4; j++) {
- rgba[0][j] = rgba[1][j] = rgba[2][j] = val;
- rgba[3][j] = 1.0F;
- }
-}
-
-
-
-static void
-lp_get_samples_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- unsigned j;
- unsigned level = samp->level;
- unsigned xpot = 1 << (samp->xpot - level);
- unsigned ypot = 1 << (samp->ypot - level);
- unsigned xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */
- unsigned ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */
-
- for (j = 0; j < QUAD_SIZE; j++) {
- int c;
-
- float u = s[j] * xpot - 0.5F;
- float v = t[j] * ypot - 0.5F;
-
- int uflr = util_ifloor(u);
- int vflr = util_ifloor(v);
-
- float xw = u - (float)uflr;
- float yw = v - (float)vflr;
-
- int x0 = uflr & (xpot - 1);
- int y0 = vflr & (ypot - 1);
-
- const uint8_t *tx[4];
-
-
- /* Can we fetch all four at once:
- */
- if (x0 < xmax && y0 < ymax)
- {
- get_texel_quad_2d(tgsi_sampler, 0, level, x0, y0, tx);
- }
- else
- {
- unsigned x1 = (x0 + 1) & (xpot - 1);
- unsigned y1 = (y0 + 1) & (ypot - 1);
- get_texel_quad_2d_mt(tgsi_sampler, 0, level,
- x0, y0, x1, y1, tx);
- }
-
-
- /* interpolate R, G, B, A */
- for (c = 0; c < 4; c++) {
- rgba[c][j] = lerp_2d(xw, yw,
- ubyte_to_float(tx[0][c]), ubyte_to_float(tx[1][c]),
- ubyte_to_float(tx[2][c]), ubyte_to_float(tx[3][c]));
- }
- }
-}
-
-
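Two things make this the fast path: the REPEAT wrap is a plain `& (xpot - 1)` mask because the level size is a power of two, and the filtering itself is an ordinary bilinear blend. A standalone sketch of that blend (this lerp_2d is a reimplementation for illustration; the file's own helper is assumed to take the weights and the four corner texels in the same order as the call above): lerp along x on both rows, then lerp between the rows along y.

#include <stdio.h>

static float
lerp_2d(float xw, float yw, float v00, float v10, float v01, float v11)
{
   float top = v00 + xw * (v10 - v00);   /* row y0 */
   float bot = v01 + xw * (v11 - v01);   /* row y1 */
   return top + yw * (bot - top);
}

int
main(void)
{
   /* weights 0.25/0.75 between four corner values */
   printf("%f\n", lerp_2d(0.25f, 0.75f, 0.0f, 1.0f, 0.0f, 1.0f));  /* 0.25 */
   return 0;
}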
-static void
-lp_get_samples_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- unsigned j;
- unsigned level = samp->level;
- unsigned xpot = 1 << (samp->xpot - level);
- unsigned ypot = 1 << (samp->ypot - level);
-
- for (j = 0; j < QUAD_SIZE; j++) {
- int c;
-
- float u = s[j] * xpot;
- float v = t[j] * ypot;
-
- int uflr = util_ifloor(u);
- int vflr = util_ifloor(v);
-
- int x0 = uflr & (xpot - 1);
- int y0 = vflr & (ypot - 1);
-
- const uint8_t *out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
- for (c = 0; c < 4; c++) {
- rgba[c][j] = ubyte_to_float(out[c]);
- }
- }
-}
-
-
-static void
-lp_get_samples_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- unsigned j;
- unsigned level = samp->level;
- unsigned xpot = 1 << (samp->xpot - level);
- unsigned ypot = 1 << (samp->ypot - level);
-
- for (j = 0; j < QUAD_SIZE; j++) {
- int c;
-
- float u = s[j] * xpot;
- float v = t[j] * ypot;
-
- int x0, y0;
- const uint8_t *out;
-
- x0 = util_ifloor(u);
- if (x0 < 0)
- x0 = 0;
- else if (x0 > xpot - 1)
- x0 = xpot - 1;
-
- y0 = util_ifloor(v);
- if (y0 < 0)
- y0 = 0;
- else if (y0 > ypot - 1)
- y0 = ypot - 1;
-
- out = get_texel_2d_ptr(tgsi_sampler, 0, level, x0, y0);
-
- for (c = 0; c < 4; c++) {
- rgba[c][j] = ubyte_to_float(out[c]);
- }
- }
-}
-
-
-static void
-lp_get_samples_2d_linear_mip_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- int level0;
- float lambda;
-
- lambda = compute_lambda(tgsi_sampler, s, t, p, lodbias);
- level0 = (int)lambda;
-
- if (lambda < 0.0) {
- samp->level = 0;
- lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
- s, t, p, 0, rgba );
- }
- else if (level0 >= texture->last_level) {
- samp->level = texture->last_level;
- lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
- s, t, p, 0, rgba );
- }
- else {
- float levelBlend = lambda - level0;
- float rgba0[4][4];
- float rgba1[4][4];
- int c,j;
-
- samp->level = level0;
- lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
- s, t, p, 0, rgba0 );
-
- samp->level = level0+1;
- lp_get_samples_2d_linear_repeat_POT( tgsi_sampler,
- s, t, p, 0, rgba1 );
-
- for (j = 0; j < QUAD_SIZE; j++) {
- for (c = 0; c < 4; c++) {
- rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
- }
- }
- }
-}
-
-/**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
- */
-static void
-lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE],
- const unsigned faces[4])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
- unsigned level0, level1, j, imgFilter;
- int width, height;
- float levelBlend = 0.0F;
-
- choose_mipmap_levels(tgsi_sampler, s, t, p,
- lodbias,
- &level0, &level1, &levelBlend, &imgFilter);
-
- assert(sampler->normalized_coords);
-
- width = texture->width[level0];
- height = texture->height[level0];
-
- assert(width > 0);
-
- switch (imgFilter) {
- case PIPE_TEX_FILTER_NEAREST:
- {
- int x[4], y[4];
- nearest_texcoord_4(sampler->wrap_s, s, width, x);
- nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
- for (j = 0; j < QUAD_SIZE; j++) {
- get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
- shadow_compare(sampler, rgba, p, j);
- }
-
- if (level0 != level1) {
- /* get texels from second mipmap level and blend */
- float rgba2[4][4];
- unsigned c;
- x[j] /= 2;
- y[j] /= 2;
- get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
- rgba2, j);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
- shadow_compare(sampler, rgba2, p, j);
- }
-
- for (c = 0; c < NUM_CHANNELS; c++) {
- rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
- }
- }
- }
- }
- break;
- case PIPE_TEX_FILTER_LINEAR:
- case PIPE_TEX_FILTER_ANISO:
- {
- int x0[4], y0[4], x1[4], y1[4];
- float xw[4], yw[4]; /* weights */
-
- linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
- linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
- for (j = 0; j < QUAD_SIZE; j++) {
- float tx[4][4]; /* texels */
- int c;
- get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
- get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
- get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
- get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
- shadow_compare4(sampler, tx, p);
- }
-
- /* interpolate R, G, B, A */
- for (c = 0; c < 4; c++) {
- rgba[c][j] = lerp_2d(xw[j], yw[j],
- tx[c][0], tx[c][1],
- tx[c][2], tx[c][3]);
- }
-
- if (level0 != level1) {
- /* get texels from second mipmap level and blend */
- float rgba2[4][4];
-
- /* XXX: This is incorrect -- will often end up with (x0
- * == x1 && y0 == y1), meaning that we fetch the same
- * texel four times and linearly interpolate between
- * identical values. The correct approach would be to
- * call linear_texcoord again for the second level.
- */
- x0[j] /= 2;
- y0[j] /= 2;
- x1[j] /= 2;
- y1[j] /= 2;
- get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
- get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
- get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
- get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
- shadow_compare4(sampler, tx, p);
- }
-
- /* interpolate R, G, B, A */
- for (c = 0; c < 4; c++) {
- rgba2[c][j] = lerp_2d(xw[j], yw[j],
- tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
- }
-
- for (c = 0; c < NUM_CHANNELS; c++) {
- rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
- }
- }
- }
- }
- break;
- default:
- assert(0);
- }
-}
-
-
-static INLINE void
-lp_get_samples_1d(struct tgsi_sampler *sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- static const unsigned faces[4] = {0, 0, 0, 0};
- static const float tzero[4] = {0, 0, 0, 0};
- lp_get_samples_2d_common(sampler, s, tzero, NULL,
- lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_2d(struct tgsi_sampler *sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- static const unsigned faces[4] = {0, 0, 0, 0};
- lp_get_samples_2d_common(sampler, s, t, p,
- lodbias, rgba, faces);
-}
-
-
-static INLINE void
-lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
- /* get/map pipe_surfaces corresponding to 3D tex slices */
- unsigned level0, level1, j, imgFilter;
- int width, height, depth;
- float levelBlend;
- const uint face = 0;
-
- choose_mipmap_levels(tgsi_sampler, s, t, p,
- lodbias,
- &level0, &level1, &levelBlend, &imgFilter);
-
- assert(sampler->normalized_coords);
-
- width = texture->width[level0];
- height = texture->height[level0];
- depth = texture->depth[level0];
-
- assert(width > 0);
- assert(height > 0);
- assert(depth > 0);
-
- switch (imgFilter) {
- case PIPE_TEX_FILTER_NEAREST:
- {
- int x[4], y[4], z[4];
- nearest_texcoord_4(sampler->wrap_s, s, width, x);
- nearest_texcoord_4(sampler->wrap_t, t, height, y);
- nearest_texcoord_4(sampler->wrap_r, p, depth, z);
- for (j = 0; j < QUAD_SIZE; j++) {
- get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
- if (level0 != level1) {
- /* get texels from second mipmap level and blend */
- float rgba2[4][4];
- unsigned c;
- x[j] /= 2;
- y[j] /= 2;
- z[j] /= 2;
- get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
- for (c = 0; c < NUM_CHANNELS; c++) {
- rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
- }
- }
- }
- }
- break;
- case PIPE_TEX_FILTER_LINEAR:
- case PIPE_TEX_FILTER_ANISO:
- {
- int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
- float xw[4], yw[4], zw[4]; /* interpolation weights */
- linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
- linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
- linear_texcoord_4(sampler->wrap_r, p, depth, z0, z1, zw);
-
- for (j = 0; j < QUAD_SIZE; j++) {
- int c;
- float tx0[4][4], tx1[4][4];
- get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
- get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
- get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
- get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
- get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
- get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
- get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
- get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
- /* interpolate R, G, B, A */
- for (c = 0; c < 4; c++) {
- rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
- tx0[c][0], tx0[c][1],
- tx0[c][2], tx0[c][3],
- tx1[c][0], tx1[c][1],
- tx1[c][2], tx1[c][3]);
- }
-
- if (level0 != level1) {
- /* get texels from second mipmap level and blend */
- float rgba2[4][4];
- x0[j] /= 2;
- y0[j] /= 2;
- z0[j] /= 2;
- x1[j] /= 2;
- y1[j] /= 2;
- z1[j] /= 2;
- get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
- get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
- get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
- get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
- get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
- get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
- get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
- get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
- /* interpolate R, G, B, A */
- for (c = 0; c < 4; c++) {
- rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
- tx0[c][0], tx0[c][1],
- tx0[c][2], tx0[c][3],
- tx1[c][0], tx1[c][1],
- tx1[c][2], tx1[c][3]);
- }
-
- /* blend mipmap levels */
- for (c = 0; c < NUM_CHANNELS; c++) {
- rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
- }
- }
- }
- }
- break;
- default:
- assert(0);
- }
-}
-
-
-static void
-lp_get_samples_cube(struct tgsi_sampler *sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- unsigned faces[QUAD_SIZE], j;
- float ssss[4], tttt[4];
- for (j = 0; j < QUAD_SIZE; j++) {
- faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
- }
- lp_get_samples_2d_common(sampler, ssss, tttt, NULL,
- lodbias, rgba, faces);
-}
-
-
-static void
-lp_get_samples_rect(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- const struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
- const uint face = 0;
- unsigned level0, level1, j, imgFilter;
- int width, height;
- float levelBlend;
-
- choose_mipmap_levels(tgsi_sampler, s, t, p,
- lodbias,
- &level0, &level1, &levelBlend, &imgFilter);
-
- /* texture RECTS cannot be mipmapped */
- assert(level0 == level1);
-
- width = texture->width[level0];
- height = texture->height[level0];
-
- assert(width > 0);
-
- switch (imgFilter) {
- case PIPE_TEX_FILTER_NEAREST:
- {
- int x[4], y[4];
- nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
- nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
- for (j = 0; j < QUAD_SIZE; j++) {
- get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
- shadow_compare(sampler, rgba, p, j);
- }
- }
- }
- break;
- case PIPE_TEX_FILTER_LINEAR:
- case PIPE_TEX_FILTER_ANISO:
- {
- int x0[4], y0[4], x1[4], y1[4];
- float xw[4], yw[4]; /* weights */
- linear_texcoord_unnorm_4(sampler->wrap_s, s, width, x0, x1, xw);
- linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
- for (j = 0; j < QUAD_SIZE; j++) {
- float tx[4][4]; /* texels */
- int c;
- get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
- get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
- get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
- get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
- if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
- shadow_compare4(sampler, tx, p);
- }
- for (c = 0; c < 4; c++) {
- rgba[c][j] = lerp_2d(xw[j], yw[j],
- tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
- }
- }
- }
- break;
- default:
- assert(0);
- }
-}
-
-
-/**
- * Error condition handler
- */
-static INLINE void
-lp_get_samples_null(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- int i,j;
-
- for (i = 0; i < 4; i++)
- for (j = 0; j < 4; j++)
- rgba[i][j] = 1.0;
-}
-
-/**
- * Called via tgsi_sampler::get_samples() when using a sampler for the
- * first time. Determine the actual sampler function, link it in and
- * call it.
- */
-void
-lp_get_samples(struct tgsi_sampler *tgsi_sampler,
- const float s[QUAD_SIZE],
- const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias,
- float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
- struct lp_shader_sampler *samp = lp_shader_sampler(tgsi_sampler);
- const struct pipe_texture *texture = samp->texture;
- const struct pipe_sampler_state *sampler = samp->sampler;
-
- /* Default to the 'undefined' case:
- */
- tgsi_sampler->get_samples = lp_get_samples_null;
-
- if (!texture) {
- assert(0); /* is this legal?? */
- goto out;
- }
-
- if (!sampler->normalized_coords) {
- assert (texture->target == PIPE_TEXTURE_2D);
- tgsi_sampler->get_samples = lp_get_samples_rect;
- goto out;
- }
-
- switch (texture->target) {
- case PIPE_TEXTURE_1D:
- tgsi_sampler->get_samples = lp_get_samples_1d;
- break;
- case PIPE_TEXTURE_2D:
- tgsi_sampler->get_samples = lp_get_samples_2d;
- break;
- case PIPE_TEXTURE_3D:
- tgsi_sampler->get_samples = lp_get_samples_3d;
- break;
- case PIPE_TEXTURE_CUBE:
- tgsi_sampler->get_samples = lp_get_samples_cube;
- break;
- default:
- assert(0);
- break;
- }
-
- /* Do this elsewhere:
- */
- samp->xpot = util_unsigned_logbase2( samp->texture->width[0] );
- samp->ypot = util_unsigned_logbase2( samp->texture->height[0] );
-
- /* Try to hook in a faster sampler. Ultimately we'll have to
- * code-generate these. Luckily most of this looks like it is
- * orthogonal state within the sampler.
- */
- if (texture->target == PIPE_TEXTURE_2D &&
- sampler->min_img_filter == sampler->mag_img_filter &&
- sampler->wrap_s == sampler->wrap_t &&
- sampler->compare_mode == FALSE &&
- sampler->normalized_coords)
- {
- if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
- samp->level = CLAMP((int) sampler->min_lod,
- 0, (int) texture->last_level);
-
- if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
- switch (sampler->min_img_filter) {
- case PIPE_TEX_FILTER_NEAREST:
- tgsi_sampler->get_samples = lp_get_samples_2d_nearest_repeat_POT;
- break;
- case PIPE_TEX_FILTER_LINEAR:
- tgsi_sampler->get_samples = lp_get_samples_2d_linear_repeat_POT;
- break;
- default:
- break;
- }
- }
- else if (sampler->wrap_s == PIPE_TEX_WRAP_CLAMP) {
- switch (sampler->min_img_filter) {
- case PIPE_TEX_FILTER_NEAREST:
- tgsi_sampler->get_samples = lp_get_samples_2d_nearest_clamp_POT;
- break;
- default:
- break;
- }
- }
- }
- else if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
- if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) {
- switch (sampler->min_img_filter) {
- case PIPE_TEX_FILTER_LINEAR:
- tgsi_sampler->get_samples = lp_get_samples_2d_linear_mip_linear_repeat_POT;
- break;
- default:
- break;
- }
- }
- }
- }
- else if (0) {
- _debug_printf("target %d/%d min_mip %d/%d min_img %d/%d wrap %d/%d compare %d/%d norm %d/%d\n",
- texture->target, PIPE_TEXTURE_2D,
- sampler->min_mip_filter, PIPE_TEX_MIPFILTER_NONE,
- sampler->min_img_filter, sampler->mag_img_filter,
- sampler->wrap_s, sampler->wrap_t,
- sampler->compare_mode, FALSE,
- sampler->normalized_coords, TRUE);
- }
-
-out:
- tgsi_sampler->get_samples( tgsi_sampler, s, t, p, lodbias, rgba );
-}
-
-
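The dispatcher above relies on a self-replacing function pointer: get_samples initially points at lp_get_samples(), which installs a specialised sampler (one of the POT fast paths when the state allows it, otherwise a generic or null handler) and then forwards the call through the pointer it just wrote. A standalone sketch of that pattern with placeholder names:

#include <stdio.h>

struct sampler {
   void (*sample)(struct sampler *s);
};

static void
sample_fast(struct sampler *s)
{
   (void) s;
   printf("fast path\n");
}

static void
sample_first_call(struct sampler *s)
{
   s->sample = sample_fast;   /* bind once... */
   s->sample(s);              /* ...and forward this call */
}

int
main(void)
{
   struct sampler s = { sample_first_call };
   s.sample(&s);   /* "fast path", via the first-call hook */
   s.sample(&s);   /* "fast path", now direct */
   return 0;
}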
-void PIPE_CDECL
-lp_fetch_texel_soa( struct tgsi_sampler **samplers,
- uint32_t unit,
- float *store )
-{
- struct tgsi_sampler *sampler = samplers[unit];
-
-#if 0
- uint j;
-
- debug_printf("%s sampler: %p (%p) store: %p\n",
- __FUNCTION__,
- sampler, *sampler,
- store );
-
- debug_printf("lodbias %f\n", store[12]);
-
- for (j = 0; j < 4; j++)
- debug_printf("sample %d texcoord %f %f\n",
- j,
- store[0+j],
- store[4+j]);
-#endif
-
- {
- float rgba[NUM_CHANNELS][QUAD_SIZE];
- sampler->get_samples(sampler,
- &store[0],
- &store[4],
- &store[8],
- 0.0f, /*store[12], lodbias */
- rgba);
- memcpy(store, rgba, sizeof rgba);
- }
-
-#if 0
- for (j = 0; j < 4; j++)
- debug_printf("sample %d result %f %f %f %f\n",
- j,
- store[0+j],
- store[4+j],
- store[8+j],
- store[12+j]);
-#endif
-}
-
-
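The `store` argument is laid out structure-of-arrays: store[0..3] holds s for the four quad pixels, store[4..7] t, store[8..11] p, store[12] the (currently unused) lod bias, and the sampled RGBA is written back over the same block. A standalone sketch of filling such a block before the call (values illustrative):

#include <stdio.h>
#include <string.h>

int
main(void)
{
   float store[16];
   float s[4] = { 0.1f, 0.2f, 0.3f, 0.4f };
   float t[4] = { 0.5f, 0.5f, 0.6f, 0.6f };
   memcpy(&store[0], s, sizeof s);            /* s for the four quad pixels */
   memcpy(&store[4], t, sizeof t);            /* t for the four quad pixels */
   memset(&store[8], 0, 4 * sizeof(float));   /* p unused for 2D sampling */
   store[12] = 0.0f;                          /* lod bias slot */
   /* ...get_samples() would now overwrite store with the RGBA results... */
   printf("s1=%f t1=%f\n", store[1], store[5]);
   return 0;
}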
-#include "lp_bld_type.h"
-#include "lp_bld_intr.h"
-#include "lp_bld_tgsi.h"
-
-
-struct lp_c_sampler_soa
-{
- struct lp_build_sampler_soa base;
-
- LLVMValueRef context_ptr;
-
- LLVMValueRef samplers_ptr;
-
- /** Coords/texels store */
- LLVMValueRef store_ptr;
-};
-
-
-static void
-lp_c_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
-{
- FREE(sampler);
-}
-
-
-static void
-lp_c_sampler_soa_emit_fetch_texel(struct lp_build_sampler_soa *_sampler,
- LLVMBuilderRef builder,
- struct lp_type type,
- unsigned unit,
- unsigned num_coords,
- const LLVMValueRef *coords,
- LLVMValueRef lodbias,
- LLVMValueRef *texel)
-{
- struct lp_c_sampler_soa *sampler = (struct lp_c_sampler_soa *)_sampler;
- LLVMTypeRef vec_type = LLVMTypeOf(coords[0]);
- LLVMValueRef args[3];
- unsigned i;
-
- if(!sampler->samplers_ptr)
- sampler->samplers_ptr = lp_jit_context_samplers(builder, sampler->context_ptr);
-
- if(!sampler->store_ptr)
- sampler->store_ptr = LLVMBuildArrayAlloca(builder,
- vec_type,
- LLVMConstInt(LLVMInt32Type(), 4, 0),
- "texel_store");
-
- for (i = 0; i < num_coords; i++) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
- LLVMValueRef coord_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
- LLVMBuildStore(builder, coords[i], coord_ptr);
- }
-
- args[0] = sampler->samplers_ptr;
- args[1] = LLVMConstInt(LLVMInt32Type(), unit, 0);
- args[2] = sampler->store_ptr;
-
- lp_build_intrinsic(builder, "fetch_texel", LLVMVoidType(), args, 3);
-
- for (i = 0; i < NUM_CHANNELS; ++i) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
- LLVMValueRef texel_ptr = LLVMBuildGEP(builder, sampler->store_ptr, &index, 1, "");
- texel[i] = LLVMBuildLoad(builder, texel_ptr, "");
- }
-}
-
-
-struct lp_build_sampler_soa *
-lp_c_sampler_soa_create(LLVMValueRef context_ptr)
-{
- struct lp_c_sampler_soa *sampler;
-
- sampler = CALLOC_STRUCT(lp_c_sampler_soa);
- if(!sampler)
- return NULL;
-
- sampler->base.destroy = lp_c_sampler_soa_destroy;
- sampler->base.emit_fetch_texel = lp_c_sampler_soa_emit_fetch_texel;
- sampler->context_ptr = context_ptr;
-
- return &sampler->base;
-}
-
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 948e3de34d9..039539d682d 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -33,6 +33,9 @@
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "pipe/internal/p_winsys_screen.h"
+
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -46,7 +49,6 @@
/* Simple, maximally packed layout.
*/
-
/* Conventional allocation path for non-display textures:
*/
static boolean
@@ -55,29 +57,21 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
{
struct pipe_texture *pt = &lpt->base;
unsigned level;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned depth = pt->depth[0];
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned depth = pt->depth0;
unsigned buffer_size = 0;
- pf_get_block(lpt->base.format, &lpt->base.block);
-
for (level = 0; level <= pt->last_level; level++) {
unsigned nblocksx, nblocksy;
- pt->width[level] = width;
- pt->height[level] = height;
- pt->depth[level] = depth;
- pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);
-
/* Allocate storage for whole quads. This is particularly important
* for depth surfaces, which are currently stored in a swizzled format. */
- nblocksx = pf_get_nblocksx(&pt->block, align(width, 2));
- nblocksy = pf_get_nblocksy(&pt->block, align(height, 2));
+ nblocksx = util_format_get_nblocksx(pt->format, align(width, 2));
+ nblocksy = util_format_get_nblocksy(pt->format, align(height, 2));
- lpt->stride[level] = align(nblocksx*pt->block.size, 16);
+ lpt->stride[level] = align(nblocksx * util_format_get_blocksize(pt->format), 16);
lpt->level_offset[level] = buffer_size;
@@ -85,9 +79,9 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
lpt->stride[level]);
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
lpt->data = align_malloc(buffer_size, 16);
@@ -101,14 +95,10 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
{
struct llvmpipe_winsys *winsys = screen->winsys;
- pf_get_block(lpt->base.format, &lpt->base.block);
- lpt->base.nblocksx[0] = pf_get_nblocksx(&lpt->base.block, lpt->base.width[0]);
- lpt->base.nblocksy[0] = pf_get_nblocksy(&lpt->base.block, lpt->base.height[0]);
-
lpt->dt = winsys->displaytarget_create(winsys,
lpt->base.format,
- lpt->base.width[0],
- lpt->base.height[0],
+ lpt->base.width0,
+ lpt->base.height0,
16,
&lpt->stride[0] );
@@ -170,7 +160,7 @@ llvmpipe_texture_blanket(struct pipe_screen * screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -181,8 +171,6 @@ llvmpipe_texture_blanket(struct pipe_screen * screen,
lpt->base = *base;
pipe_reference_init(&lpt->base.reference, 1);
lpt->base.screen = screen;
- lpt->base.nblocksx[0] = pf_get_nblocksx(&lpt->base.block, lpt->base.width[0]);
- lpt->base.nblocksy[0] = pf_get_nblocksy(&lpt->base.block, lpt->base.height[0]);
lpt->stride[0] = stride[0];
pipe_buffer_reference(&lpt->buffer, buffer);
@@ -227,8 +215,8 @@ llvmpipe_get_tex_surface(struct pipe_screen *screen,
pipe_reference_init(&ps->reference, 1);
pipe_texture_reference(&ps->texture, pt);
ps->format = pt->format;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
+ ps->width = u_minify(pt->width0, level);
+ ps->height = u_minify(pt->height0, level);
ps->offset = lpt->level_offset[level];
ps->usage = usage;
@@ -256,11 +244,17 @@ llvmpipe_get_tex_surface(struct pipe_screen *screen,
ps->level = level;
ps->zslice = zslice;
+ /* XXX shouldn't that rather be
+ tex_height = align(ps->height, 2);
+ to account for alignment done in llvmpipe_texture_layout ?
+ */
if (pt->target == PIPE_TEXTURE_CUBE) {
- ps->offset += face * pt->nblocksy[level] * lpt->stride[level];
+ unsigned tex_height = ps->height;
+ ps->offset += face * util_format_get_nblocksy(pt->format, tex_height) * lpt->stride[level];
}
else if (pt->target == PIPE_TEXTURE_3D) {
- ps->offset += zslice * pt->nblocksy[level] * lpt->stride[level];
+ unsigned tex_height = ps->height;
+ ps->offset += zslice * util_format_get_nblocksy(pt->format, tex_height) * lpt->stride[level];
}
else {
assert(face == 0);
@@ -301,14 +295,10 @@ llvmpipe_get_tex_transfer(struct pipe_screen *screen,
if (lpt) {
struct pipe_transfer *pt = &lpt->base;
pipe_texture_reference(&pt->texture, texture);
- pt->format = texture->format;
- pt->block = texture->block;
pt->x = x;
pt->y = y;
pt->width = w;
pt->height = h;
- pt->nblocksx = texture->nblocksx[level];
- pt->nblocksy = texture->nblocksy[level];
pt->stride = lptex->stride[level];
pt->usage = usage;
pt->face = face;
@@ -317,11 +307,17 @@ llvmpipe_get_tex_transfer(struct pipe_screen *screen,
lpt->offset = lptex->level_offset[level];
+ /* XXX shouldn't that rather be
+ tex_height = align(u_minify(texture->height0, level), 2)
+ to account for alignment done in llvmpipe_texture_layout ?
+ */
if (texture->target == PIPE_TEXTURE_CUBE) {
- lpt->offset += face * pt->nblocksy * pt->stride;
+ unsigned tex_height = u_minify(texture->height0, level);
+ lpt->offset += face * util_format_get_nblocksy(texture->format, tex_height) * pt->stride;
}
else if (texture->target == PIPE_TEXTURE_3D) {
- lpt->offset += zslice * pt->nblocksy * pt->stride;
+ unsigned tex_height = u_minify(texture->height0, level);
+ lpt->offset += zslice * util_format_get_nblocksy(texture->format, tex_height) * pt->stride;
}
else {
assert(face == 0);
@@ -353,9 +349,11 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
ubyte *map, *xfer_map;
struct llvmpipe_texture *lpt;
+ enum pipe_format format;
assert(transfer->texture);
lpt = llvmpipe_texture(transfer->texture);
+ format = lpt->base.format;
if(lpt->dt) {
struct llvmpipe_winsys *winsys = screen->winsys;
@@ -380,8 +378,8 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
}
xfer_map = map + llvmpipe_transfer(transfer)->offset +
- transfer->y / transfer->block.height * transfer->stride +
- transfer->x / transfer->block.width * transfer->block.size;
+ transfer->y / util_format_get_blockheight(format) * transfer->stride +
+ transfer->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
/*printf("map = %p xfer map = %p\n", map, xfer_map);*/
return xfer_map;
}
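The new transfer-map arithmetic uses the util_format block helpers instead of the removed per-texture block struct. A standalone sketch of the same byte-offset computation for an assumed 1x1-block, 4-byte format (stride and origin are illustrative):

#include <stdio.h>

int
main(void)
{
   const unsigned blockwidth = 1, blockheight = 1, blocksize = 4;
   const unsigned stride = 1024;        /* bytes per row of blocks */
   const unsigned x = 16, y = 8;        /* transfer origin in pixels */
   unsigned offset = y / blockheight * stride + x / blockwidth * blocksize;
   printf("offset=%u\n", offset);       /* 8*1024 + 16*4 = 8256 */
   return 0;
}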
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index a555149730b..971d9333331 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -250,13 +250,13 @@ lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
case LP_TILE_STATUS_CLEAR:
/* Actually clear the tiles which were flagged as being in a
* clear state. */
- util_fill_rect(tc->transfer_map, &pt->block, pt->stride,
+ util_fill_rect(tc->transfer_map, pt->texture->format, pt->stride,
x, y, w, h,
tc->clear_val);
break;
case LP_TILE_STATUS_DEFINED:
- lp_tile_write_4ub(pt->format,
+ lp_tile_write_4ub(pt->texture->format,
tile->color,
tc->transfer_map, pt->stride,
x, y, w, h);
@@ -289,6 +289,11 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
assert(tc->surface);
assert(tc->transfer);
+ if(!tc->transfer_map)
+ lp_tile_cache_map_transfers(tc);
+
+ assert(tc->transfer_map);
+
switch(tile->status) {
case LP_TILE_STATUS_CLEAR:
/* don't get tile from framebuffer, just clear it */
@@ -304,7 +309,7 @@ lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
y &= ~(TILE_SIZE - 1);
if (!pipe_clip_tile(x, y, &w, &h, tc->transfer))
- lp_tile_read_4ub(pt->format,
+ lp_tile_read_4ub(pt->texture->format,
tile->color,
tc->transfer_map, tc->transfer->stride,
x, y, w, h);
diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h
deleted file mode 100644
index 9c235080a55..00000000000
--- a/src/gallium/drivers/nouveau/nouveau_push.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef __NOUVEAU_PUSH_H__
-#define __NOUVEAU_PUSH_H__
-
-#include "nouveau/nouveau_winsys.h"
-
-#ifndef NOUVEAU_PUSH_CONTEXT
-#error undefined push context
-#endif
-
-#define OUT_RING(data) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- (*pc->base.channel->pushbuf->cur++) = (data); \
-} while(0)
-
-#define OUT_RINGp(src,size) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- memcpy(pc->base.channel->pushbuf->cur, (src), (size) * 4); \
- pc->base.channel->pushbuf->cur += (size); \
-} while(0)
-
-#define OUT_RINGf(data) do { \
- union { float v; uint32_t u; } c; \
- c.v = (data); \
- OUT_RING(c.u); \
-} while(0)
-
-#define BEGIN_RING(obj,mthd,size) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- struct nouveau_channel *chan = pc->base.channel; \
- if (chan->pushbuf->remaining < ((size) + 1)) \
- nouveau_pushbuf_flush(chan, ((size) + 1)); \
- OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd)); \
- chan->pushbuf->remaining -= ((size) + 1); \
-} while(0)
-
-#define BEGIN_RING_NI(obj,mthd,size) do { \
- BEGIN_RING(obj, (mthd) | 0x40000000, (size)); \
-} while(0)
-
-static inline void
-DO_FIRE_RING(struct nouveau_channel *chan, struct pipe_fence_handle **fence)
-{
- nouveau_pushbuf_flush(chan, 0);
- if (fence)
- *fence = NULL;
-}
-
-#define FIRE_RING(fence) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- DO_FIRE_RING(pc->base.channel, fence); \
-} while(0)
-
-#define OUT_RELOC(bo,data,flags,vor,tor) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- struct nouveau_channel *chan = pc->base.channel; \
- nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, nouveau_bo(bo), \
- (data), 0, (flags), (vor), (tor)); \
-} while(0)
-
-/* Raw data + flags depending on FB/TT buffer */
-#define OUT_RELOCd(bo,data,flags,vor,tor) do { \
- OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor)); \
-} while(0)
-
-/* FB/TT object handle */
-#define OUT_RELOCo(bo,flags) do { \
- OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR, \
- pc->base.channel->vram->handle, \
- pc->base.channel->gart->handle); \
-} while(0)
-
-/* Low 32-bits of offset */
-#define OUT_RELOCl(bo,delta,flags) do { \
- OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0); \
-} while(0)
-
-/* High 32-bits of offset */
-#define OUT_RELOCh(bo,delta,flags) do { \
- OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0); \
-} while(0)
-
-/* A reloc which'll recombine into a NV_DMA_METHOD packet header */
-#define OUT_RELOCm(bo, flags, obj, mthd, size) do { \
- NOUVEAU_PUSH_CONTEXT(pc); \
- struct nouveau_channel *chan = pc->base.channel; \
- if (chan->pushbuf->remaining < ((size) + 1)) \
- nouveau_pushbuf_flush(chan, ((size) + 1)); \
- OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd), \
- (flags), 0, 0); \
- chan->pushbuf->remaining -= ((size) + 1); \
-} while(0)
-
-#endif
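This header is removed in favour of the channel-explicit BEGIN_RING/OUT_RING style used later in this series (see the nv04 changes below). For reference, a standalone sketch of the method header word the old BEGIN_RING macro built: subchannel in bits 13-15, dword count from bit 18 up, method offset in the low bits, with bit 30 marking the non-incrementing (NI) form.

#include <stdio.h>
#include <stdint.h>

static uint32_t
method_header(unsigned subc, unsigned mthd, unsigned size)
{
   return (subc << 13) | (size << 18) | mthd;
}

int
main(void)
{
   printf("0x%08x\n", (unsigned) method_header(7, 0x0304, 1));  /* 0x0004e304 */
   return 0;
}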
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index e4cf91c005c..1ad539d2858 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -31,7 +31,7 @@ nouveau_screen_bo_skel(struct pipe_screen *pscreen, struct nouveau_bo *bo,
unsigned alignment, unsigned usage, unsigned size)
{
struct pipe_buffer *pb;
-
+
pb = CALLOC(1, sizeof(struct pipe_buffer)+sizeof(struct nouveau_bo *));
if (!pb) {
nouveau_bo_ref(NULL, &bo);
@@ -127,8 +127,18 @@ nouveau_screen_bo_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
unsigned usage)
{
struct nouveau_bo *bo = nouveau_bo(pb);
+ struct nouveau_screen *nscreen = nouveau_screen(pscreen);
int ret;
+ if (nscreen->pre_pipebuffer_map_callback) {
+ ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+ if (ret) {
+ debug_printf("pre_pipebuffer_map_callback failed %d\n",
+ ret);
+ return NULL;
+ }
+ }
+
ret = nouveau_bo_map(bo, nouveau_screen_map_flags(usage));
if (ret) {
debug_printf("map failed: %d\n", ret);
@@ -143,11 +153,22 @@ nouveau_screen_bo_map_range(struct pipe_screen *pscreen, struct pipe_buffer *pb,
unsigned offset, unsigned length, unsigned usage)
{
struct nouveau_bo *bo = nouveau_bo(pb);
+ struct nouveau_screen *nscreen = nouveau_screen(pscreen);
uint32_t flags = nouveau_screen_map_flags(usage);
int ret;
+ if (nscreen->pre_pipebuffer_map_callback) {
+ ret = nscreen->pre_pipebuffer_map_callback(pscreen, pb, usage);
+ if (ret) {
+ debug_printf("pre_pipebuffer_map_callback failed %d\n",
+ ret);
+ return NULL;
+ }
+ }
+
ret = nouveau_bo_map_range(bo, offset, length, flags);
if (ret) {
+ nouveau_bo_unmap(bo);
if (!(flags & NOUVEAU_BO_NOWAIT) || ret != -EBUSY)
debug_printf("map_range failed: %d\n", ret);
return NULL;
@@ -239,5 +260,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
void
nouveau_screen_fini(struct nouveau_screen *screen)
{
+ struct pipe_winsys *ws = screen->base.winsys;
+ nouveau_channel_free(&screen->channel);
+ ws->destroy(ws);
}
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index ebfc67ad1c1..a7927d88dfc 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -5,6 +5,9 @@ struct nouveau_screen {
struct pipe_screen base;
struct nouveau_device *device;
struct nouveau_channel *channel;
+
+ int (*pre_pipebuffer_map_callback) (struct pipe_screen *pscreen,
+ struct pipe_buffer *pb, unsigned usage);
};
static inline struct nouveau_screen *
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index b595405357f..e844f6abb3d 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -3,41 +3,95 @@
#include "util/u_debug.h"
+#ifdef DEBUG
+#define DEBUG_NOUVEAU_STATEOBJ
+#endif /* DEBUG */
+
struct nouveau_stateobj_reloc {
struct nouveau_bo *bo;
- unsigned offset;
- unsigned packet;
+ struct nouveau_grobj *gr;
+ uint32_t push_offset;
+ uint32_t mthd;
- unsigned data;
+ uint32_t data;
unsigned flags;
unsigned vor;
unsigned tor;
};
+struct nouveau_stateobj_start {
+ struct nouveau_grobj *gr;
+ uint32_t mthd;
+ uint32_t size;
+ unsigned offset;
+};
+
struct nouveau_stateobj {
struct pipe_reference reference;
- unsigned *push;
+ struct nouveau_stateobj_start *start;
struct nouveau_stateobj_reloc *reloc;
- unsigned *cur;
- unsigned cur_packet;
+ /* Common memory pool for data. */
+ uint32_t *pool;
+ unsigned pool_cur;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ unsigned start_alloc;
+ unsigned reloc_alloc;
+ unsigned pool_alloc;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+ unsigned total; /* includes begin_ring */
+ unsigned cur; /* excludes begin_ring, offset from "cur_start" */
+ unsigned cur_start;
unsigned cur_reloc;
};
+static INLINE void
+so_dump(struct nouveau_stateobj *so)
+{
+ unsigned i, nr, total = 0;
+
+ for (i = 0; i < so->cur_start; i++) {
+ if (so->start[i].gr->subc > -1)
+ debug_printf("+0x%04x: 0x%08x\n", total++,
+ (so->start[i].size << 18) | (so->start[i].gr->subc << 13)
+ | so->start[i].mthd);
+ else
+ debug_printf("+0x%04x: 0x%08x\n", total++,
+ (so->start[i].size << 18) | so->start[i].mthd);
+ for (nr = 0; nr < so->start[i].size; nr++, total++)
+ debug_printf("+0x%04x: 0x%08x\n", total,
+ so->pool[so->start[i].offset + nr]);
+ }
+}
+
static INLINE struct nouveau_stateobj *
-so_new(unsigned push, unsigned reloc)
+so_new(unsigned start, unsigned push, unsigned reloc)
{
struct nouveau_stateobj *so;
so = MALLOC(sizeof(struct nouveau_stateobj));
pipe_reference_init(&so->reference, 1);
- so->push = MALLOC(sizeof(unsigned) * push);
- so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc);
+ so->total = so->cur = so->cur_start = so->cur_reloc = 0;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ so->start_alloc = start;
+ so->reloc_alloc = reloc;
+ so->pool_alloc = push;
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
- so->cur = so->push;
- so->cur_reloc = so->cur_packet = 0;
+ so->start = MALLOC(start * sizeof(struct nouveau_stateobj_start));
+ so->reloc = MALLOC(reloc * sizeof(struct nouveau_stateobj_reloc));
+ so->pool = MALLOC(push * sizeof(uint32_t));
+ so->pool_cur = 0;
+
+ if (!so->start || !so->reloc || !so->pool) {
+ debug_printf("malloc failed\n");
+ assert(0);
+ }
return so;
}
@@ -48,62 +102,128 @@ so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso)
struct nouveau_stateobj *so = *pso;
int i;
- if (pipe_reference((struct pipe_reference**)pso, &ref->reference)) {
- free(so->push);
+ if (pipe_reference(&(*pso)->reference, &ref->reference)) {
+ FREE(so->start);
for (i = 0; i < so->cur_reloc; i++)
nouveau_bo_ref(NULL, &so->reloc[i].bo);
- free(so->reloc);
- free(so);
+ FREE(so->reloc);
+ FREE(so->pool);
+ FREE(so);
}
+ *pso = ref;
}
static INLINE void
-so_data(struct nouveau_stateobj *so, unsigned data)
+so_data(struct nouveau_stateobj *so, uint32_t data)
{
- (*so->cur++) = (data);
- so->cur_packet += 4;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->cur >= so->start[so->cur_start - 1].size) {
+ debug_printf("exceeding specified size\n");
+ assert(0);
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+ so->pool[so->start[so->cur_start - 1].offset + so->cur++] = data;
}
static INLINE void
-so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size)
+so_datap(struct nouveau_stateobj *so, uint32_t *data, unsigned size)
{
- so->cur_packet += (4 * size);
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if ((so->cur + size) > so->start[so->cur_start - 1].size) {
+ debug_printf("exceeding specified size\n");
+ assert(0);
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
while (size--)
- (*so->cur++) = (*data++);
+ so->pool[so->start[so->cur_start - 1].offset + so->cur++] =
+ *data++;
}
static INLINE void
so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
unsigned mthd, unsigned size)
{
- so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4);
- so_data(so, (gr->subc << 13) | (size << 18) | mthd);
+ struct nouveau_stateobj_start *start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->start_alloc <= so->cur_start) {
+ debug_printf("exceeding num_start size\n");
+ assert(0);
+ } else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+ start = so->start;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->cur_start > 0 && start[so->cur_start - 1].size > so->cur) {
+ debug_printf("previous so_method was not filled\n");
+ assert(0);
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+ so->start = start;
+ start[so->cur_start].gr = gr;
+ start[so->cur_start].mthd = mthd;
+ start[so->cur_start].size = size;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->pool_alloc < (size + so->pool_cur)) {
+ debug_printf("exceeding num_pool size\n");
+ assert(0);
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+ start[so->cur_start].offset = so->pool_cur;
+ so->pool_cur += size;
+
+ so->cur_start++;
+ /* The 1 is for *this* begin_ring. */
+ so->total += so->cur + 1;
+ so->cur = 0;
}
static INLINE void
so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
unsigned data, unsigned flags, unsigned vor, unsigned tor)
{
- struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++];
-
- r->bo = NULL;
- nouveau_bo_ref(bo, &r->bo);
- r->offset = so->cur - so->push;
- r->packet = so->cur_packet;
- r->data = data;
- r->flags = flags;
- r->vor = vor;
- r->tor = tor;
+ struct nouveau_stateobj_reloc *r;
+
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->reloc_alloc <= so->cur_reloc) {
+ debug_printf("exceeding num_reloc size\n");
+ assert(0);
+ } else
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+ r = so->reloc;
+
+ so->reloc = r;
+ r[so->cur_reloc].bo = NULL;
+ nouveau_bo_ref(bo, &(r[so->cur_reloc].bo));
+ r[so->cur_reloc].gr = so->start[so->cur_start-1].gr;
+ r[so->cur_reloc].push_offset = so->total + so->cur;
+ r[so->cur_reloc].data = data;
+ r[so->cur_reloc].flags = flags;
+ r[so->cur_reloc].mthd = so->start[so->cur_start-1].mthd +
+ (so->cur << 2);
+ r[so->cur_reloc].vor = vor;
+ r[so->cur_reloc].tor = tor;
+
so_data(so, data);
+ so->cur_reloc++;
}
-static INLINE void
-so_dump(struct nouveau_stateobj *so)
+/* Determine if this buffer object is referenced by this state object. */
+static INLINE boolean
+so_bo_is_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo)
{
- unsigned i, nr = so->cur - so->push;
+ int i;
+
+ for (i = 0; i < so->cur_reloc; i++)
+ if (so->reloc[i].bo == bo)
+ return true;
- for (i = 0; i < nr; i++)
- debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]);
+ return false;
}
static INLINE void
@@ -111,48 +231,95 @@ so_emit(struct nouveau_channel *chan, struct nouveau_stateobj *so)
{
struct nouveau_pushbuf *pb = chan->pushbuf;
unsigned nr, i;
+ int ret = 0;
- nr = so->cur - so->push;
- if (pb->remaining < nr)
- nouveau_pushbuf_flush(chan, nr);
- pb->remaining -= nr;
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (so->start[so->cur_start - 1].size > so->cur) {
+ debug_printf("emit: previous so_method was not filled\n");
+ assert(0);
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+	/* Don't update so->total here; so_emit may be called again on this object. */
+ nr = so->total + so->cur;
+
+ /* This will flush if we need space.
+ * We don't actually need the marker.
+ */
+ if ((ret = nouveau_pushbuf_marker_emit(chan, nr, so->cur_reloc))) {
+ debug_printf("so_emit failed marker emit with error %d\n", ret);
+ assert(0);
+ }
+
+ /* Submit data. This will ensure proper binding of objects. */
+ for (i = 0; i < so->cur_start; i++) {
+ BEGIN_RING(chan, so->start[i].gr, so->start[i].mthd, so->start[i].size);
+ OUT_RINGp(chan, &(so->pool[so->start[i].offset]), so->start[i].size);
+ }
- memcpy(pb->cur, so->push, nr * 4);
for (i = 0; i < so->cur_reloc; i++) {
struct nouveau_stateobj_reloc *r = &so->reloc[i];
- nouveau_pushbuf_emit_reloc(chan, pb->cur + r->offset,
- r->bo, r->data, 0, r->flags,
- r->vor, r->tor);
+ if ((ret = nouveau_pushbuf_emit_reloc(chan, pb->cur - nr +
+ r->push_offset, r->bo, r->data,
+ 0, r->flags, r->vor, r->tor))) {
+ debug_printf("so_emit failed reloc with error %d\n", ret);
+ assert(0);
+ }
}
- pb->cur += nr;
}
static INLINE void
so_emit_reloc_markers(struct nouveau_channel *chan, struct nouveau_stateobj *so)
{
struct nouveau_pushbuf *pb = chan->pushbuf;
+ struct nouveau_grobj *gr = NULL;
unsigned i;
+ int ret = 0;
if (!so)
return;
- i = so->cur_reloc << 1;
- if (pb->remaining < i)
- nouveau_pushbuf_flush(chan, i);
- pb->remaining -= i;
-
+ /* If we need to flush in flush notify, then we have a problem anyway. */
for (i = 0; i < so->cur_reloc; i++) {
struct nouveau_stateobj_reloc *r = &so->reloc[i];
- nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo, r->packet, 0,
- (r->flags & (NOUVEAU_BO_VRAM |
- NOUVEAU_BO_GART |
- NOUVEAU_BO_RDWR)) |
- NOUVEAU_BO_DUMMY, 0, 0);
- nouveau_pushbuf_emit_reloc(chan, pb->cur++, r->bo, r->data, 0,
- r->flags | NOUVEAU_BO_DUMMY,
- r->vor, r->tor);
+#ifdef DEBUG_NOUVEAU_STATEOBJ
+ if (r->mthd & 0x40000000) {
+ debug_printf("error: NI mthd 0x%08X\n", r->mthd);
+ continue;
+ }
+#endif /* DEBUG_NOUVEAU_STATEOBJ */
+
+ /* The object needs to be bound and the system must know the
+ * subchannel is being used. Otherwise it will discard it.
+ */
+ if (gr != r->gr) {
+ BEGIN_RING(chan, r->gr, 0x100, 1);
+ OUT_RING(chan, 0);
+ gr = r->gr;
+ }
+
+			/* Some relocs really don't like to be hammered;
+			 * NOUVEAU_BO_DUMMY makes sure that only
+			 * happens when needed.
+ */
+ ret = OUT_RELOC(chan, r->bo, (r->gr->subc << 13) | (1<< 18) |
+ r->mthd, (r->flags & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART
+ | NOUVEAU_BO_RDWR)) | NOUVEAU_BO_DUMMY, 0, 0);
+ if (ret) {
+ debug_printf("OUT_RELOC failed %d\n", ret);
+ assert(0);
+ }
+
+ ret = OUT_RELOC(chan, r->bo, r->data, r->flags |
+ NOUVEAU_BO_DUMMY, r->vor, r->tor);
+ if (ret) {
+ debug_printf("OUT_RELOC failed %d\n", ret);
+ assert(0);
+ }
+
+ pb->remaining -= 2;
}
}
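The reworked state object replaces the raw dword array with per-method start records plus a shared data pool, so so_emit() can replay each method as a proper BEGIN_RING packet every time the object is emitted. A standalone sketch of that record-plus-pool idea, independent of the nouveau API (the method number and data words are made up):

#include <stdio.h>
#include <stdint.h>

struct start_rec { uint32_t mthd; unsigned size; unsigned offset; };

int
main(void)
{
   uint32_t pool[8];
   struct start_rec start[2];
   unsigned pool_cur = 0, nstart = 0, i, n;

   /* like "so_method(so, gr, 0x0200, 2)" followed by two so_data() calls */
   start[nstart].mthd = 0x0200;
   start[nstart].size = 2;
   start[nstart].offset = pool_cur;
   pool_cur += 2;
   pool[start[nstart].offset + 0] = 0xdeadbeef;
   pool[start[nstart].offset + 1] = 0xcafef00d;
   nstart++;

   /* like so_emit(): replay each record as a header plus its pool data */
   for (i = 0; i < nstart; i++) {
      printf("hdr  mthd=0x%04x size=%u\n", start[i].mthd, start[i].size);
      for (n = 0; n < start[i].size; n++)
         printf("data 0x%08x\n", pool[start[i].offset + n]);
   }
   return 0;
}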
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 42c77e5e778..4c3e08a43f5 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -23,6 +23,9 @@
#define NOUVEAU_BUFFER_USAGE_ZETA (1 << 17)
#define NOUVEAU_BUFFER_USAGE_TRANSFER (1 << 18)
+/* use along with GPU_WRITE for 2D-only writes */
+#define NOUVEAU_BUFFER_USAGE_NO_RENDER (1 << 19)
+
extern struct pipe_screen *
nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
index 10d984ace9b..edd96859cf8 100644
--- a/src/gallium/drivers/nv04/nv04_context.c
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -10,10 +10,14 @@ nv04_flush(struct pipe_context *pipe, unsigned flags,
struct pipe_fence_handle **fence)
{
struct nv04_context *nv04 = nv04_context(pipe);
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
draw_flush(nv04->draw);
- FIRE_RING(fence);
+ FIRE_RING(chan);
+ if (fence)
+ *fence = NULL;
}
static void
@@ -27,40 +31,39 @@ nv04_destroy(struct pipe_context *pipe)
FREE(nv04);
}
-static void
-nv04_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
-}
-
static boolean
nv04_init_hwctx(struct nv04_context *nv04)
{
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
// requires a valid handle
-// BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1);
+// BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
// OUT_RING(0);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1);
- OUT_RING(0);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
+ OUT_RING(chan, 0);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
- OUT_RING(0x40182800);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+ OUT_RING(chan, 0x40182800);
// OUT_RING(1<<20/*no cull*/);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
// OUT_RING(0x24|(1<<6)|(1<<8));
- OUT_RING(0x120001a4);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1);
- OUT_RING(0x332213a1);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1);
- OUT_RING(0x11001010);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1);
- OUT_RING(0x0);
-// BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1);
+ OUT_RING(chan, 0x120001a4);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
+ OUT_RING(chan, 0x332213a1);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
+ OUT_RING(chan, 0x11001010);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
+ OUT_RING(chan, 0x0);
+// BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
// OUT_RING(SCREEN_OFFSET);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1);
- OUT_RING(0xff000000);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
+ OUT_RING(chan, 0xff000000);
- FIRE_RING (NULL);
+ FIRE_RING (chan);
return TRUE;
}
@@ -83,7 +86,6 @@ nv04_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv04->pipe.winsys = ws;
nv04->pipe.screen = pscreen;
nv04->pipe.destroy = nv04_destroy;
- nv04->pipe.set_edgeflags = nv04_set_edgeflags;
nv04->pipe.draw_arrays = nv04_draw_arrays;
nv04->pipe.draw_elements = nv04_draw_elements;
nv04->pipe.clear = nv04_clear;
diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h
index 55326c787a8..fe3b527423c 100644
--- a/src/gallium/drivers/nv04/nv04_context.h
+++ b/src/gallium/drivers/nv04/nv04_context.h
@@ -15,10 +15,6 @@
#include "nouveau/nouveau_gldefs.h"
#include "nouveau/nouveau_context.h"
-#define NOUVEAU_PUSH_CONTEXT(ctx) \
- struct nv04_screen *ctx = nv04->screen
-#include "nouveau/nouveau_push.h"
-
#include "nv04_state.h"
#define NOUVEAU_ERR(fmt, args...) \
@@ -141,9 +137,9 @@ extern void nv04_emit_hw_state(struct nv04_context *nv04);
extern void nv04_state_tex_update(struct nv04_context *nv04);
/* nv04_vbo.c */
-extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv04_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv04_draw_elements( struct pipe_context *pipe,
+extern void nv04_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c
index 21f990fd536..c152b52119a 100644
--- a/src/gallium/drivers/nv04/nv04_fragtex.c
+++ b/src/gallium/drivers/nv04/nv04_fragtex.c
@@ -4,7 +4,7 @@
#define _(m,tf) \
{ \
PIPE_FORMAT_##m, \
- NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf, \
+ NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf, \
}
struct nv04_texture_format {
@@ -53,14 +53,14 @@ nv04_fragtex_build(struct nv04_context *nv04, int unit)
return;
}
- nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER
- | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER
+ nv04->fragtex.format = NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER
+ | NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER
| nv04_fragtex_format(pt->format)
- | ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT )
- | ( log2i(pt->width[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT )
- | ( log2i(pt->height[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT )
- | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE
- | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE
+ | ( (pt->last_level + 1) << NV04_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT )
+ | ( log2i(pt->width0) << NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT )
+ | ( log2i(pt->height0) << NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT )
+ | NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE
+ | NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE
;
}
diff --git a/src/gallium/drivers/nv04/nv04_miptree.c b/src/gallium/drivers/nv04/nv04_miptree.c
index 93f752faec9..e0a6948aeb4 100644
--- a/src/gallium/drivers/nv04/nv04_miptree.c
+++ b/src/gallium/drivers/nv04/nv04_miptree.c
@@ -1,6 +1,7 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_math.h"
#include "nv04_context.h"
#include "nv04_screen.h"
@@ -9,31 +10,22 @@ static void
nv04_miptree_layout(struct nv04_miptree *nv04mt)
{
struct pipe_texture *pt = &nv04mt->base;
- uint width = pt->width[0], height = pt->height[0];
uint offset = 0;
int nr_faces, l;
nr_faces = 1;
for (l = 0; l <= pt->last_level; l++) {
- pt->width[l] = width;
- pt->height[l] = height;
-
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
-
- nv04mt->level[l].pitch = pt->width[0];
+ nv04mt->level[l].pitch = pt->width0;
nv04mt->level[l].pitch = (nv04mt->level[l].pitch + 63) & ~63;
-
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
}
for (l = 0; l <= pt->last_level; l++) {
-
nv04mt->level[l].image_offset =
CALLOC(nr_faces, sizeof(unsigned));
- offset += nv04mt->level[l].pitch * pt->height[l];
+ /* XXX: best guess; this image_offset assignment was obviously missing */
+ nv04mt->level[l].image_offset[0] = offset;
+ offset += nv04mt->level[l].pitch * u_minify(pt->height0, l);
}
nv04mt->total_size = offset;
@@ -63,7 +55,7 @@ nv04_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
FREE(mt);
return NULL;
}
-
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -75,7 +67,7 @@ nv04_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv04_miptree);
@@ -89,6 +81,7 @@ nv04_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
pipe_buffer_reference(&mt->buffer, pb);
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -120,8 +113,8 @@ nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&ns->base.texture, pt);
ns->base.format = pt->format;
- ns->base.width = pt->width[level];
- ns->base.height = pt->height[level];
+ ns->base.width = u_minify(pt->width0, level);
+ ns->base.height = u_minify(pt->height0, level);
ns->base.usage = flags;
pipe_reference_init(&ns->base.reference, 1);
ns->base.face = face;
@@ -129,7 +122,7 @@ nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
ns->base.zslice = zslice;
ns->pitch = nv04mt->level[level].pitch;
- ns->base.offset = nv04mt->level[level].image_offset;
+ ns->base.offset = nv04mt->level[level].image_offset[0];
return &ns->base;
}
diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
index f6458232ae5..0b795ea2430 100644
--- a/src/gallium/drivers/nv04/nv04_prim_vbuf.c
+++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
@@ -93,33 +93,45 @@ nv04_vbuf_render_set_primitive( struct vbuf_render *render,
static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5)
{
- BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
- OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v4,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v5,8);
- OUT_RING(0xFEDCBA);
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA), 49);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v4,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v5,8);
+ OUT_RING(chan, 0xFEDCBA);
}
static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2)
{
- BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
- OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
- OUT_RING(0xFED);
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD), 25);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+ OUT_RING(chan, 0xFED);
}
static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3)
{
- BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
- OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
- OUT_RINGp(buffer + VERTEX_SIZE * v3,8);
- OUT_RING(0xFECEDC);
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC), 33);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v0,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v1,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v2,8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * v3,8);
+ OUT_RING(chan, 0xFECEDC);
}
static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render * render, const ushort * indices, uint nr_indices)
@@ -156,7 +168,10 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
{
const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC};
unsigned char* buffer = render->buffer;
- struct nv04_context* nv04 = render->nv04;
+ struct nv04_context *nv04 = render->nv04;
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
int i,j;
for(i = 0; i<nr_indices; i+=14)
@@ -166,15 +181,15 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
if (numvert<3)
break;
- BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8);
for(j = 0; j<numvert; j++)
- OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 );
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * indices [i+j], 8 );
- BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 );
+ BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
for(j = 0; j<numtri/2; j++ )
- OUT_RING(striptbl[j]);
+ OUT_RING(chan, striptbl[j]);
if (numtri%2)
- OUT_RING(striptbl[numtri/2]&0xFFF);
+ OUT_RING(chan, striptbl[numtri/2]&0xFFF);
}
}
@@ -182,11 +197,14 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
{
const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0};
unsigned char* buffer = render->buffer;
- struct nv04_context* nv04 = render->nv04;
+ struct nv04_context *nv04 = render->nv04;
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
int i,j;
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
- OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[0], 8);
for(i = 1; i<nr_indices; i+=14)
{
@@ -195,16 +213,16 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
if (numvert < 3)
break;
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
for(j=0;j<numvert;j++)
- OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 );
+ OUT_RINGp(chan, buffer + VERTEX_SIZE * indices[ i+j ], 8 );
- BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2);
+ BEGIN_RING_NI(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
for(j = 0; j<numtri/2; j++)
- OUT_RING(fantbl[j]);
+ OUT_RING(chan, fantbl[j]);
if (numtri%2)
- OUT_RING(fantbl[numtri/2]&0xFFF);
+ OUT_RING(chan, fantbl[numtri/2]&0xFFF);
}
}
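A note on the packed index words used in this file (0xFEDCBA, 0xFECEDC, striptbl[], fantbl[]): each nibble appears to name a TLVERTEX slot, each group of three nibbles one triangle, two triangles per 32-bit DRAWPRIMITIVE word, with only the low 12 bits used for an odd trailing triangle (hence the "& 0xFFF" above). The stand-alone sketch below is not part of the patch; it simply decodes a few of the words emitted here, under that assumed encoding.

	#include <stdint.h>
	#include <stdio.h>

	/* Decode a packed DRAWPRIMITIVE word: one nibble per vertex slot,
	 * three nibbles per triangle, up to two triangles per word. */
	static void decode_draw_word(uint32_t w, int ntris)
	{
		int t;
		for (t = 0; t < ntris; t++) {
			unsigned a = (w >> (12 * t + 0)) & 0xf;
			unsigned b = (w >> (12 * t + 4)) & 0xf;
			unsigned c = (w >> (12 * t + 8)) & 0xf;
			printf("triangle from slots %X %X %X\n", a, b, c);
		}
	}

	int main(void)
	{
		decode_draw_word(0xFEDCBA, 2); /* two triangles: (A,B,C) (D,E,F) */
		decode_draw_word(0xFECEDC, 2); /* quad:          (C,D,E) (C,E,F) */
		decode_draw_word(0x321210, 2); /* strip word:    (0,1,2) (1,2,3) */
		decode_draw_word(0xFED, 1);    /* odd tail:      (D,E,F)         */
		return 0;
	}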
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index 170ce3eb7e5..7c5b6e8229a 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -119,6 +119,8 @@ nv04_screen_destroy(struct pipe_screen *pscreen)
nouveau_grobj_free(&screen->fahrenheit);
nv04_surface_2d_takedown(&screen->eng2d);
+ nouveau_screen_fini(&screen->base);
+
FREE(pscreen);
}
@@ -163,10 +165,10 @@ nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
fahrenheit_class = 0;
sub3d_class = 0;
} else if (dev->chipset >= 0x10) {
- fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE;
+ fahrenheit_class = NV10_TEXTURED_TRIANGLE;
sub3d_class = NV10_CONTEXT_SURFACES_3D;
} else {
- fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE;
+ fahrenheit_class=NV04_TEXTURED_TRIANGLE;
sub3d_class = NV04_CONTEXT_SURFACES_3D;
}
diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c
index d356ebd8b36..b67f1e16b1c 100644
--- a/src/gallium/drivers/nv04/nv04_state.c
+++ b/src/gallium/drivers/nv04/nv04_state.c
@@ -50,28 +50,28 @@ wrap_mode(unsigned wrap) {
switch (wrap) {
case PIPE_TEX_WRAP_REPEAT:
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT;
break;
case PIPE_TEX_WRAP_MIRROR_REPEAT:
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT;
break;
case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE;
break;
case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER;
break;
case PIPE_TEX_WRAP_CLAMP:
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
break;
case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
case PIPE_TEX_WRAP_MIRROR_CLAMP:
default:
NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
- ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+ ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
}
- return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT;
+ return ret >> NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT;
}
static void *
@@ -84,20 +84,20 @@ nv04_sampler_state_create(struct pipe_context *pipe,
ss = MALLOC(sizeof(struct nv04_sampler_state));
- ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) |
- (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT));
+ ss->format = ((wrap_mode(cso->wrap_s) << NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) |
+ (wrap_mode(cso->wrap_t) << NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT));
if (cso->max_anisotropy > 1.0) {
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE;
}
switch (cso->mag_img_filter) {
case PIPE_TEX_FILTER_LINEAR:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR;
break;
case PIPE_TEX_FILTER_NEAREST:
default:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST;
break;
}
@@ -105,14 +105,14 @@ nv04_sampler_state_create(struct pipe_context *pipe,
case PIPE_TEX_FILTER_LINEAR:
switch (cso->min_mip_filter) {
case PIPE_TEX_MIPFILTER_NEAREST:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
break;
case PIPE_TEX_MIPFILTER_NONE:
default:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR;
break;
}
break;
@@ -120,14 +120,14 @@ nv04_sampler_state_create(struct pipe_context *pipe,
default:
switch (cso->min_mip_filter) {
case PIPE_TEX_MIPFILTER_NEAREST:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
break;
case PIPE_TEX_MIPFILTER_LINEAR:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
break;
case PIPE_TEX_MIPFILTER_NONE:
default:
- filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST;
+ filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST;
break;
}
break;
@@ -181,7 +181,7 @@ nv04_rasterizer_state_create(struct pipe_context *pipe,
*/
rs = MALLOC(sizeof(struct nv04_rasterizer_state));
- rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD;
+ rs->blend = cso->flatshade ? NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD;
return (void *)rs;
}
@@ -229,16 +229,16 @@ nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe,
hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state));
hw->control = float_to_ubyte(cso->alpha.ref_value);
- hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT );
- hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0;
- hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN;
- hw->control |= cso->depth.enabled ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0;
- hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT );
- hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module
- hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE;
- hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE;
- hw->control |= cso->depth.writemask ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0;
- hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format
+ hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT );
+ hw->control |= cso->alpha.enabled ? NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_ENABLE : 0;
+ hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_ORIGIN;
+ hw->control |= cso->depth.enabled ? NV04_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE : 0;
+ hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT );
+ hw->control |= 1 << NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module
+ hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE;
+ hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE;
+ hw->control |= cso->depth.writemask ? NV04_TEXTURED_TRIANGLE_CONTROL_Z_WRITE : 0;
+ hw->control |= 1 << NV04_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format
return (void *)hw;
}
@@ -332,7 +332,7 @@ nv04_set_clip_state(struct pipe_context *pipe,
static void
nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv04_context *nv04 = nv04_context(pipe);
struct pipe_screen *pscreen = pipe->screen;
@@ -342,13 +342,13 @@ nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
if (buf) {
void *mapped;
- if (buf->buffer && buf->buffer->size &&
- (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+ if (buf && buf->size &&
+ (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
{
- memcpy(nv04->constbuf[shader], mapped, buf->buffer->size);
+ memcpy(nv04->constbuf[shader], mapped, buf->size);
nv04->constbuf_nr[shader] =
- buf->buffer->size / (4 * sizeof(float));
- pipe_buffer_unmap(pscreen, buf->buffer);
+ buf->size / (4 * sizeof(float));
+ pipe_buffer_unmap(pscreen, buf);
}
}
}
@@ -377,7 +377,7 @@ nv04_set_scissor_state(struct pipe_context *pipe,
/* struct nv04_context *nv04 = nv04_context(pipe);
// XXX
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2);
+ BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2);
OUT_RING (((s->maxx - s->minx) << 16) | s->minx);
OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/
}
@@ -425,9 +425,9 @@ nv04_init_state_functions(struct nv04_context *nv04)
nv04->pipe.delete_blend_state = nv04_blend_state_delete;
nv04->pipe.create_sampler_state = nv04_sampler_state_create;
- nv04->pipe.bind_sampler_states = nv04_sampler_state_bind;
+ nv04->pipe.bind_fragment_sampler_states = nv04_sampler_state_bind;
nv04->pipe.delete_sampler_state = nv04_sampler_state_delete;
- nv04->pipe.set_sampler_textures = nv04_set_sampler_texture;
+ nv04->pipe.set_fragment_sampler_textures = nv04_set_sampler_texture;
nv04->pipe.create_rasterizer_state = nv04_rasterizer_state_create;
nv04->pipe.bind_rasterizer_state = nv04_rasterizer_state_bind;
diff --git a/src/gallium/drivers/nv04/nv04_state.h b/src/gallium/drivers/nv04/nv04_state.h
index 399f750dbe7..81d1d2ebaa9 100644
--- a/src/gallium/drivers/nv04/nv04_state.h
+++ b/src/gallium/drivers/nv04/nv04_state.h
@@ -31,6 +31,7 @@ struct nv04_rasterizer_state {
struct nv04_miptree {
struct pipe_texture base;
+ struct nouveau_bo *bo;
struct pipe_buffer *buffer;
uint total_size;
diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c
index eb2c1c57c67..b8d6dc560f0 100644
--- a/src/gallium/drivers/nv04/nv04_state_emit.c
+++ b/src/gallium/drivers/nv04/nv04_state_emit.c
@@ -57,13 +57,19 @@ static uint32_t nv04_blend_func(uint32_t f)
static void nv04_emit_control(struct nv04_context* nv04)
{
uint32_t control = nv04->dsa->control;
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
- OUT_RING(control);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+ OUT_RING(chan, control);
}
static void nv04_emit_blend(struct nv04_context* nv04)
{
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
uint32_t blend;
blend=0x4; // texture MODULATE_ALPHA
@@ -75,19 +81,23 @@ static void nv04_emit_blend(struct nv04_context* nv04)
blend|=(nv04_blend_func(nv04->blend->b_src)<<24);
blend|=(nv04_blend_func(nv04->blend->b_dst)<<28);
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
- OUT_RING(blend);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
+ OUT_RING(chan, blend);
}
static void nv04_emit_sampler(struct nv04_context *nv04, int unit)
{
struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
struct pipe_texture *pt = &nv04mt->base;
-
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3);
- OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
- OUT_RING(nv04->sampler[unit]->filter);
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+ struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
+
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
+ OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+ OUT_RING(chan, nv04->sampler[unit]->filter);
}
static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
@@ -97,6 +107,10 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
uint32_t rt_format, w, h;
int colour_format = 0, zeta_format = 0;
struct nv04_miptree *nv04mt = 0;
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
+ struct nouveau_bo *bo;
w = fb->cbufs[0]->width;
h = fb->cbufs[0]->height;
@@ -128,24 +142,29 @@ static void nv04_state_emit_framebuffer(struct nv04_context* nv04)
assert(0);
}
- BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
- OUT_RING(rt_format);
+ BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1);
+ OUT_RING(chan, rt_format);
nv04mt = (struct nv04_miptree *)rt->base.texture;
+ bo = nouveau_bo(nv04mt->buffer);
/* FIXME pitches have to be aligned ! */
- BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
- OUT_RING(rt->pitch|(zeta->pitch<<16));
- OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+ OUT_RING(chan, rt->pitch|(zeta->pitch<<16));
+ OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
if (fb->zsbuf) {
nv04mt = (struct nv04_miptree *)zeta->base.texture;
- BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
- OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv04mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); /* use the zeta miptree's BO, not the colour buffer's */
}
}
void
nv04_emit_hw_state(struct nv04_context *nv04)
{
+ struct nv04_screen *screen = nv04->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *fahrenheit = screen->fahrenheit;
+ struct nouveau_grobj *context_surfaces_3d = screen->context_surfaces_3d;
int i;
if (nv04->dirty & NV04_NEW_VERTPROG) {
@@ -163,8 +182,8 @@ nv04_emit_hw_state(struct nv04_context *nv04)
if (nv04->dirty & NV04_NEW_CONTROL) {
nv04->dirty &= ~NV04_NEW_CONTROL;
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
- OUT_RING(nv04->dsa->control);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
+ OUT_RING(chan, nv04->dsa->control);
}
if (nv04->dirty & NV04_NEW_BLEND) {
@@ -205,12 +224,12 @@ nv04_emit_hw_state(struct nv04_context *nv04)
unsigned rt_pitch = ((struct nv04_surface *)nv04->rt)->pitch;
unsigned zeta_pitch = ((struct nv04_surface *)nv04->zeta)->pitch;
- BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
- OUT_RING(rt_pitch|(zeta_pitch<<16));
- OUT_RELOCl(nv04->rt, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2);
+ OUT_RING(chan, rt_pitch|(zeta_pitch<<16));
+ OUT_RELOCl(chan, nouveau_bo(nv04->rt), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
if (nv04->zeta) {
- BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
- OUT_RELOCl(nv04->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv04->zeta), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
}
/* Texture images */
@@ -218,9 +237,10 @@ nv04_emit_hw_state(struct nv04_context *nv04)
if (!(nv04->fp_samplers & (1 << i)))
continue;
struct nv04_miptree *nv04mt = nv04->tex_miptree[i];
- BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2);
- OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+ struct nouveau_bo *bo = nouveau_bo(nv04mt->buffer);
+ BEGIN_RING(chan, fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
+ OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RELOCd(chan, bo, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
}
}
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 8be134b83dd..b24a9cee5ae 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -1,5 +1,6 @@
#include "pipe/p_context.h"
#include "pipe/p_format.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -76,7 +77,7 @@ nv04_scaled_image_format(enum pipe_format format)
}
static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y)
+nv04_swizzle_bits_square(unsigned x, unsigned y)
{
unsigned u = (x & 0x001) << 0 |
(x & 0x002) << 1 |
@@ -106,6 +107,15 @@ nv04_swizzle_bits(unsigned x, unsigned y)
return v | u;
}
+/* rectangular swizzled textures are linear concatenations of swizzled square tiles */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+ unsigned s = MIN2(w, h);
+ unsigned m = s - 1;
+ return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
+}
+
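To make the addressing concrete, here is a minimal stand-alone sketch (not taken from the patch): nv04_swizzle_bits() treats a w x h surface as a row of s x s Morton-ordered tiles with s = MIN2(w, h); morton2d() below is an illustrative stand-in for nv04_swizzle_bits_square().

	#include <stdio.h>

	/* Z-order interleave: x bits at even positions, y bits at odd ones. */
	static unsigned morton2d(unsigned x, unsigned y)
	{
		unsigned v = 0, b;
		for (b = 0; b < 12; b++)
			v |= (((x >> b) & 1) << (2 * b)) | (((y >> b) & 1) << (2 * b + 1));
		return v;
	}

	/* Rectangular swizzle as above: pick the square tile, then Z-order
	 * within it. */
	static unsigned swizzle(unsigned x, unsigned y, unsigned w, unsigned h)
	{
		unsigned s = w < h ? w : h;
		unsigned m = s - 1;
		return (((x | y) & ~m) * s) | morton2d(x & m, y & m);
	}

	int main(void)
	{
		/* Texel (17, 3) of a 64x16 surface: tile 1, index 11 inside it. */
		printf("%u\n", swizzle(17, 3, 64, 16)); /* prints 267 = 16*16 + 11 */
		return 0;
	}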
static int
nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
struct pipe_surface *dst, int dx, int dy,
@@ -133,6 +143,9 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
assert(sub_w == w || util_is_pot(sub_w));
assert(sub_h == h || util_is_pot(sub_h));
+ MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
+ ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
+
BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
OUT_RELOCo(chan, dst_bo,
NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
@@ -154,20 +167,19 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
for (x = 0; x < w; x += sub_w) {
sub_w = MIN2(sub_w, w - x);
- /* Must be 64-byte aligned */
- assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y) * dst->texture->block.size) & 63));
+ assert(!(dst->offset & 63));
BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
- OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y) * dst->texture->block.size,
+ OUT_RELOCl(chan, dst_bo, dst->offset,
NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
OUT_RING (chan, nv04_scaled_image_format(src->format));
OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
- OUT_RING (chan, 0);
+ OUT_RING (chan, (x + dx) | ((y + dy) << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
OUT_RING (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
- OUT_RING (chan, 0);
+ OUT_RING (chan, (x + dx) | ((y + dy) << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
OUT_RING (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
OUT_RING (chan, 1 << 20);
OUT_RING (chan, 1 << 20);
@@ -177,7 +189,7 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
OUT_RING (chan, src_pitch |
NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
- OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * src->texture->block.size,
+ OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
OUT_RING (chan, 0);
}
@@ -198,11 +210,11 @@ nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
unsigned dst_offset = dst->offset + dy * dst_pitch +
- dx * dst->texture->block.size;
+ dx * util_format_get_blocksize(dst->texture->format);
unsigned src_offset = src->offset + sy * src_pitch +
- sx * src->texture->block.size;
+ sx * util_format_get_blocksize(src->texture->format);
- WAIT_RING (chan, 3 + ((h / 2047) + 1) * 9);
+ MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2);
BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
OUT_RELOCo(chan, src_bo,
NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
@@ -219,7 +231,7 @@ nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
OUT_RING (chan, src_pitch);
OUT_RING (chan, dst_pitch);
- OUT_RING (chan, w * src->texture->block.size);
+ OUT_RING (chan, w * util_format_get_blocksize(src->texture->format));
OUT_RING (chan, count);
OUT_RING (chan, 0x0101);
OUT_RING (chan, 0);
@@ -250,7 +262,7 @@ nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
if (format < 0)
return 1;
- WAIT_RING (chan, 12);
+ MARK_RING (chan, 12, 4);
BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
@@ -315,7 +327,7 @@ nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
gdirect_format = nv04_rect_format(dst->format);
assert(gdirect_format >= 0);
- WAIT_RING (chan, 16);
+ MARK_RING (chan, 16, 4);
BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
@@ -487,3 +499,49 @@ nv04_surface_2d_init(struct nouveau_screen *screen)
ctx->fill = nv04_surface_fill;
return ctx;
}
+
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+ int temp_flags;
+
+ // printf("creating temp, flags is %i!\n", flags);
+
+ if(ns->base.usage & PIPE_BUFFER_USAGE_DISCARD)
+ {
+ temp_flags = ns->base.usage | PIPE_BUFFER_USAGE_GPU_READ;
+ ns->base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_DISCARD;
+ }
+ else
+ {
+ temp_flags = ns->base.usage | PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+ ns->base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_GPU_READ;
+ }
+
+ struct pipe_texture templ;
+ memset(&templ, 0, sizeof(templ));
+ templ.format = ns->base.texture->format;
+ templ.target = PIPE_TEXTURE_2D;
+ templ.width0 = ns->base.width;
+ templ.height0 = ns->base.height;
+ templ.depth0 = 1;
+ templ.last_level = 0;
+
+ // TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
+ templ.nr_samples = ns->base.texture->nr_samples;
+
+ templ.tex_usage = ns->base.texture->tex_usage | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+ struct pipe_texture* temp_tex = pscreen->texture_create(pscreen, &templ);
+ struct nv04_surface* temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+ temp_ns->backing = ns;
+
+ if(ns->base.usage & PIPE_BUFFER_USAGE_GPU_READ)
+ eng2d->copy(eng2d, &temp_ns->base, 0, 0, &ns->base, 0, 0, ns->base.width, ns->base.height); /* seed the temporary with the surface's current contents */
+
+ return temp_ns;
+}
+
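A hypothetical call-site sketch for the new helper (not taken from the patch; the resolve step is assumed to be driven by the caller through the backing pointer set above):

	struct nv04_surface *temp =
		nv04_surface_wrap_for_render(pscreen, eng2d, ns);

	/* ... bind &temp->base as the colour buffer and render with the 3D engine ... */

	/* later, resolve the temporary back onto the surface it backs */
	eng2d->copy(eng2d, &temp->backing->base, 0, 0,
		    &temp->base, 0, 0, temp->base.width, temp->base.height);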
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h
index 02b3f56ba8b..ce696a11a39 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.h
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.h
@@ -4,6 +4,7 @@
struct nv04_surface {
struct pipe_surface base;
unsigned pitch;
+ struct nv04_surface* backing;
};
struct nv04_surface_2d {
@@ -30,4 +31,7 @@ nv04_surface_2d_init(struct nouveau_screen *screen);
void
nv04_surface_2d_takedown(struct nv04_surface_2d **);
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
#endif
diff --git a/src/gallium/drivers/nv04/nv04_transfer.c b/src/gallium/drivers/nv04/nv04_transfer.c
index 6618660743d..2dd2e146a8f 100644
--- a/src/gallium/drivers/nv04/nv04_transfer.c
+++ b/src/gallium/drivers/nv04/nv04_transfer.c
@@ -1,7 +1,9 @@
#include <pipe/p_state.h>
#include <pipe/p_defines.h>
#include <pipe/p_inlines.h>
+#include <util/u_format.h>
#include <util/u_memory.h>
+#include <util/u_math.h>
#include <nouveau/nouveau_winsys.h>
#include "nv04_context.h"
#include "nv04_screen.h"
@@ -10,22 +12,19 @@
struct nv04_transfer {
struct pipe_transfer base;
struct pipe_surface *surface;
- bool direct;
+ boolean direct;
};
static void
-nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
struct pipe_texture *template)
{
memset(template, 0, sizeof(struct pipe_texture));
template->target = pt->target;
template->format = pt->format;
- template->width[0] = pt->width[level];
- template->height[0] = pt->height[level];
- template->depth[0] = 1;
- template->block = pt->block;
- template->nblocksx[0] = pt->nblocksx[level];
- template->nblocksy[0] = pt->nblocksx[level];
+ template->width0 = width;
+ template->height0 = height;
+ template->depth0 = 1;
template->last_level = 0;
template->nr_samples = pt->nr_samples;
@@ -48,14 +47,10 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
tx->base.x = x;
tx->base.y = y;
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
tx->base.stride = mt->level[level].pitch;
tx->base.usage = usage;
tx->base.face = face;
@@ -76,7 +71,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
tx->direct = false;
- nv04_compatible_transfer_tex(pt, level, &tx_tex_template);
+ nv04_compatible_transfer_tex(pt, w, h, &tx_tex_template);
tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
if (!tx_tex)
@@ -85,6 +80,8 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
}
+ tx->base.stride = ((struct nv04_miptree*)tx_tex)->level[0].pitch;
+
tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
face, level, zslice,
pipe_transfer_buffer_flags(&tx->base));
@@ -110,8 +107,8 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
/* TODO: Check if SIFM can un-swizzle */
nvscreen->eng2d->copy(nvscreen->eng2d,
tx->surface, 0, 0,
- src, 0, 0,
- src->width, src->height);
+ src, x, y,
+ w, h);
pipe_surface_reference(&src, NULL);
}
@@ -135,9 +132,9 @@ nv04_transfer_del(struct pipe_transfer *ptx)
/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
nvscreen->eng2d->copy(nvscreen->eng2d,
- dst, 0, 0,
+ dst, tx->base.x, tx->base.y,
tx->surface, 0, 0,
- dst->width, dst->height);
+ tx->base.width, tx->base.height);
pipe_surface_reference(&dst, NULL);
}
@@ -156,8 +153,10 @@ nv04_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
void *map = pipe_buffer_map(pscreen, mt->buffer,
pipe_transfer_buffer_flags(ptx));
- return map + ns->base.offset +
- ptx->y * ns->pitch + ptx->x * ptx->block.size;
+ if(!tx->direct)
+ return map + ns->base.offset;
+ else
+ return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
}
static void
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
index e3167814f2b..34847718145 100644
--- a/src/gallium/drivers/nv04/nv04_vbo.c
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -9,7 +9,7 @@
#include "nouveau/nouveau_channel.h"
#include "nouveau/nouveau_pushbuf.h"
-boolean nv04_draw_elements( struct pipe_context *pipe,
+void nv04_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count)
@@ -45,7 +45,7 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
draw_set_mapped_element_buffer(draw, 0, NULL);
}
- draw_set_mapped_constant_buffer(draw,
+ draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
nv04->constbuf[PIPE_SHADER_VERTEX],
nv04->constbuf_nr[PIPE_SHADER_VERTEX]);
@@ -65,15 +65,13 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
pipe_buffer_unmap(pscreen, indexBuffer);
draw_set_mapped_element_buffer(draw, 0, NULL);
}
-
- return TRUE;
}
-boolean nv04_draw_arrays( struct pipe_context *pipe,
- unsigned prim, unsigned start, unsigned count)
+void nv04_draw_arrays( struct pipe_context *pipe,
+ unsigned prim, unsigned start, unsigned count)
{
printf("coucou in draw arrays\n");
- return nv04_draw_elements(pipe, NULL, 0, prim, start, count);
+ nv04_draw_elements(pipe, NULL, 0, prim, start, count);
}
diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c
index 65a22b175e1..1ecb73d06e8 100644
--- a/src/gallium/drivers/nv10/nv10_context.c
+++ b/src/gallium/drivers/nv10/nv10_context.c
@@ -10,10 +10,14 @@ nv10_flush(struct pipe_context *pipe, unsigned flags,
struct pipe_fence_handle **fence)
{
struct nv10_context *nv10 = nv10_context(pipe);
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
draw_flush(nv10->draw);
- FIRE_RING(fence);
+ FIRE_RING(chan);
+ if (fence)
+ *fence = NULL;
}
static void
@@ -31,230 +35,226 @@ static void nv10_init_hwctx(struct nv10_context *nv10)
{
struct nv10_screen *screen = nv10->screen;
struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
int i;
float projectionmatrix[16];
- BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1);
- OUT_RING (screen->sync->handle);
- BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2);
- OUT_RING (chan->vram->handle);
- OUT_RING (chan->gart->handle);
- BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2);
- OUT_RING (chan->vram->handle);
- OUT_RING (chan->vram->handle);
+ BEGIN_RING(chan, celsius, NV10TCL_DMA_NOTIFY, 1);
+ OUT_RING (chan, screen->sync->handle);
+ BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY0, 2);
+ OUT_RING (chan, chan->vram->handle);
+ OUT_RING (chan, chan->gart->handle);
+ BEGIN_RING(chan, celsius, NV10TCL_DMA_IN_MEMORY2, 2);
+ OUT_RING (chan, chan->vram->handle);
+ OUT_RING (chan, chan->vram->handle);
- BEGIN_RING(celsius, NV10TCL_NOP, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+ OUT_RING (chan, 0);
- BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2);
- OUT_RING (0);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
- OUT_RING ((0x7ff<<16)|0x800);
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
- OUT_RING ((0x7ff<<16)|0x800);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+ OUT_RING (chan, (0x7ff<<16)|0x800);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1);
+ OUT_RING (chan, (0x7ff<<16)|0x800);
for (i=1;i<8;i++) {
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(celsius, 0x290, 1);
- OUT_RING ((0x10<<16)|1);
- BEGIN_RING(celsius, 0x3f4, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, 0x290, 1);
+ OUT_RING (chan, (0x10<<16)|1);
+ BEGIN_RING(chan, celsius, 0x3f4, 1);
+ OUT_RING (chan, 0);
- BEGIN_RING(celsius, NV10TCL_NOP, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+ OUT_RING (chan, 0);
if (nv10->screen->celsius->grclass != NV10TCL) {
/* For nv11, nv17 */
- BEGIN_RING(celsius, 0x120, 3);
- OUT_RING (0);
- OUT_RING (1);
- OUT_RING (2);
+ BEGIN_RING(chan, celsius, 0x120, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 1);
+ OUT_RING (chan, 2);
- BEGIN_RING(celsius, NV10TCL_NOP, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(celsius, NV10TCL_NOP, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+ OUT_RING (chan, 0);
/* Set state */
- BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
- OUT_RING (0x207);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2);
- OUT_RING (0);
- OUT_RING (0);
-
- BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12);
- OUT_RING (0x30141010);
- OUT_RING (0);
- OUT_RING (0x20040000);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0x00000c00);
- OUT_RING (0);
- OUT_RING (0x00000c00);
- OUT_RING (0x18000000);
- OUT_RING (0x300e0300);
- OUT_RING (0x0c091c80);
-
- BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2);
- OUT_RING (1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4);
- OUT_RING (1);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0x8006);
- BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8);
- OUT_RING (0xff);
- OUT_RING (0x207);
- OUT_RING (0);
- OUT_RING (0xff);
- OUT_RING (0x1e00);
- OUT_RING (0x1e00);
- OUT_RING (0x1e00);
- OUT_RING (0x1d01);
- BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
- OUT_RING (0x201);
- BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
- OUT_RING (8);
- BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1);
- OUT_RING (8);
- BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
- OUT_RING (0x1b02);
- OUT_RING (0x1b02);
- BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
- OUT_RING (0x405);
- OUT_RING (0x901);
- BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8);
+ BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 2);
+ OUT_RING (chan, 0x207);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(0), 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+
+ BEGIN_RING(chan, celsius, NV10TCL_RC_IN_ALPHA(0), 12);
+ OUT_RING (chan, 0x30141010);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0x20040000);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0x00000c00);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0x00000c00);
+ OUT_RING (chan, 0x18000000);
+ OUT_RING (chan, 0x300e0300);
+ OUT_RING (chan, 0x0c091c80);
+
+ BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 2);
+ OUT_RING (chan, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_SRC, 4);
+ OUT_RING (chan, 1);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0x8006);
+ BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 8);
+ OUT_RING (chan, 0xff);
+ OUT_RING (chan, 0x207);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0xff);
+ OUT_RING (chan, 0x1e00);
+ OUT_RING (chan, 0x1e00);
+ OUT_RING (chan, 0x1e00);
+ OUT_RING (chan, 0x1d01);
+ BEGIN_RING(chan, celsius, NV10TCL_NORMALIZE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_FOG_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_LIGHT_MODEL, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_COLOR_CONTROL, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_ENABLED_LIGHTS, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+ OUT_RING (chan, 0x201);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+ OUT_RING (chan, 8);
+ BEGIN_RING(chan, celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_LINE_WIDTH, 1);
+ OUT_RING (chan, 8);
+ BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+ OUT_RING (chan, 0x1b02);
+ OUT_RING (chan, 0x1b02);
+ BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+ OUT_RING (chan, 0x405);
+ OUT_RING (chan, 0x901);
+ BEGIN_RING(chan, celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_TX_GEN_S(0), 8);
for (i=0;i<8;i++) {
- OUT_RING (0);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
- OUT_RING (0x3fc00000); /* -1.50 */
- OUT_RING (0xbdb8aa0a); /* -0.09 */
- OUT_RING (0); /* 0.00 */
+ BEGIN_RING(chan, celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3);
+ OUT_RING (chan, 0x3fc00000); /* -1.50 */
+ OUT_RING (chan, 0xbdb8aa0a); /* -0.09 */
+ OUT_RING (chan, 0); /* 0.00 */
- BEGIN_RING(celsius, NV10TCL_NOP, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_NOP, 1);
+ OUT_RING (chan, 0);
- BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2);
- OUT_RING (0x802);
- OUT_RING (2);
+ BEGIN_RING(chan, celsius, NV10TCL_FOG_MODE, 2);
+ OUT_RING (chan, 0x802);
+ OUT_RING (chan, 2);
/* for some reason VIEW_MATRIX_ENABLE needs to be 6 instead of 4 when
* using texturing, except when using the texture matrix
*/
- BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
- OUT_RING (6);
- BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
- OUT_RING (0x01010101);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1);
+ OUT_RING (chan, 6);
+ BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+ OUT_RING (chan, 0x01010101);
/* Set vertex component */
- BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4);
- OUT_RINGf (1.0);
- OUT_RINGf (1.0);
- OUT_RINGf (1.0);
- OUT_RINGf (1.0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RINGf (1.0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
- OUT_RINGf (1.0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
- OUT_RINGf (1.0);
- BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1);
- OUT_RINGf (0.0);
- BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL_4F_R, 4);
+ OUT_RINGf (chan, 1.0);
+ OUT_RINGf (chan, 1.0);
+ OUT_RINGf (chan, 1.0);
+ OUT_RINGf (chan, 1.0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_COL2_3F_R, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_NOR_3F_X, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RINGf (chan, 1.0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX0_4F_S, 4);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 1.0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_TX1_4F_S, 4);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 1.0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_FOG_1F, 1);
+ OUT_RINGf (chan, 0.0);
+ BEGIN_RING(chan, celsius, NV10TCL_EDGEFLAG_ENABLE, 1);
+ OUT_RING (chan, 1);
memset(projectionmatrix, 0, sizeof(projectionmatrix));
- BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
+ BEGIN_RING(chan, celsius, NV10TCL_PROJECTION_MATRIX(0), 16);
projectionmatrix[0*4+0] = 1.0;
projectionmatrix[1*4+1] = 1.0;
projectionmatrix[2*4+2] = 1.0;
projectionmatrix[3*4+3] = 1.0;
for (i=0;i<16;i++) {
- OUT_RINGf (projectionmatrix[i]);
+ OUT_RINGf (chan, projectionmatrix[i]);
}
- BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
- OUT_RING (0.0);
- OUT_RINGf (16777216.0);
-
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
- OUT_RINGf (-2048.0);
- OUT_RINGf (-2048.0);
- OUT_RINGf (16777215.0 * 0.5);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_RANGE_NEAR, 2);
+ OUT_RING (chan, 0.0);
+ OUT_RINGf (chan, 16777216.0);
- FIRE_RING (NULL);
-}
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_TRANSLATE_X, 4);
+ OUT_RINGf (chan, -2048.0);
+ OUT_RINGf (chan, -2048.0);
+ OUT_RINGf (chan, 16777215.0 * 0.5);
+ OUT_RING (chan, 0);
-static void
-nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
+ FIRE_RING (chan);
}
struct pipe_context *
@@ -276,7 +276,6 @@ nv10_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv10->pipe.winsys = ws;
nv10->pipe.screen = pscreen;
nv10->pipe.destroy = nv10_destroy;
- nv10->pipe.set_edgeflags = nv10_set_edgeflags;
nv10->pipe.draw_arrays = nv10_draw_arrays;
nv10->pipe.draw_elements = nv10_draw_elements;
nv10->pipe.clear = nv10_clear;
diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h
index 36a6aa7a74e..ab4b825487d 100644
--- a/src/gallium/drivers/nv10/nv10_context.h
+++ b/src/gallium/drivers/nv10/nv10_context.h
@@ -15,10 +15,6 @@
#include "nouveau/nouveau_gldefs.h"
#include "nouveau/nouveau_context.h"
-#define NOUVEAU_PUSH_CONTEXT(ctx) \
- struct nv10_screen *ctx = nv10->screen
-#include "nouveau/nouveau_push.h"
-
#include "nv10_state.h"
#define NOUVEAU_ERR(fmt, args...) \
@@ -144,9 +140,9 @@ extern void nv10_emit_hw_state(struct nv10_context *nv10);
extern void nv10_state_tex_update(struct nv10_context *nv10);
/* nv10_vbo.c */
-extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv10_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv10_draw_elements( struct pipe_context *pipe,
+extern void nv10_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c
index 27f2f875847..c1f7ccb9ab6 100644
--- a/src/gallium/drivers/nv10/nv10_fragtex.c
+++ b/src/gallium/drivers/nv10/nv10_fragtex.c
@@ -52,6 +52,9 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
struct nv10_miptree *nv10mt = nv10->tex_miptree[unit];
struct pipe_texture *pt = &nv10mt->base;
struct nv10_texture_format *tf;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
uint32_t txf, txs, txp;
tf = nv10_fragtex_format(pt->format);
@@ -62,9 +65,9 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
txf = tf->format << 8;
txf |= (pt->last_level + 1) << 16;
- txf |= log2i(pt->width[0]) << 20;
- txf |= log2i(pt->height[0]) << 24;
- txf |= log2i(pt->depth[0]) << 28;
+ txf |= log2i(pt->width0) << 20;
+ txf |= log2i(pt->height0) << 24;
+ txf |= log2i(pt->depth0) << 28;
txf |= 8;
switch (pt->target) {
@@ -82,15 +85,15 @@ nv10_fragtex_build(struct nv10_context *nv10, int unit)
return;
}
- BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8);
- OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
- OUT_RING (ps->wrap);
- OUT_RING (0x40000000); /* enable */
- OUT_RING (txs);
- OUT_RING (ps->filt | 0x2000 /* magic */);
- OUT_RING ((pt->width[0] << 16) | pt->height[0]);
- OUT_RING (ps->bcol);
+ BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(unit), 8);
+ OUT_RELOCl(chan, nouveau_bo(nv10mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RELOCd(chan, nouveau_bo(nv10mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+ OUT_RING (chan, ps->wrap);
+ OUT_RING (chan, 0x40000000); /* enable */
+ OUT_RING (chan, txs);
+ OUT_RING (chan, ps->filt | 0x2000 /* magic */);
+ OUT_RING (chan, (pt->width0 << 16) | pt->height0);
+ OUT_RING (chan, ps->bcol);
#endif
}
@@ -99,6 +102,9 @@ nv10_fragtex_bind(struct nv10_context *nv10)
{
#if 0
struct nv10_fragment_program *fp = nv10->fragprog.active;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
unsigned samplers, unit;
samplers = nv10->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv10_fragtex_bind(struct nv10_context *nv10)
unit = ffs(samplers) - 1;
samplers &= ~(1 << unit);
- BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_TX_ENABLE(unit), 1);
+ OUT_RING (chan, 0);
}
samplers = nv10->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv10/nv10_miptree.c b/src/gallium/drivers/nv10/nv10_miptree.c
index 34e3c2ebd77..908482ad854 100644
--- a/src/gallium/drivers/nv10/nv10_miptree.c
+++ b/src/gallium/drivers/nv10/nv10_miptree.c
@@ -1,6 +1,8 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
#include "nv10_context.h"
#include "nv10_screen.h"
@@ -10,7 +12,7 @@ nv10_miptree_layout(struct nv10_miptree *nv10mt)
{
struct pipe_texture *pt = &nv10mt->base;
boolean swizzled = FALSE;
- uint width = pt->width[0], height = pt->height[0];
+ uint width = pt->width0;
uint offset = 0;
int nr_faces, l, f;
@@ -21,29 +23,23 @@ nv10_miptree_layout(struct nv10_miptree *nv10mt)
}
for (l = 0; l <= pt->last_level; l++) {
- pt->width[l] = width;
- pt->height[l] = height;
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
-
if (swizzled)
- nv10mt->level[l].pitch = pt->nblocksx[l] * pt->block.size;
+ nv10mt->level[l].pitch = util_format_get_stride(pt->format, width);
else
- nv10mt->level[l].pitch = pt->nblocksx[0] * pt->block.size;
+ nv10mt->level[l].pitch = util_format_get_stride(pt->format, pt->width0);
nv10mt->level[l].pitch = (nv10mt->level[l].pitch + 63) & ~63;
nv10mt->level[l].image_offset =
CALLOC(nr_faces, sizeof(unsigned));
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
+ width = u_minify(width, 1);
}
for (f = 0; f < nr_faces; f++) {
for (l = 0; l <= pt->last_level; l++) {
nv10mt->level[l].image_offset[f] = offset;
- offset += nv10mt->level[l].pitch * pt->height[l];
+ offset += nv10mt->level[l].pitch * u_minify(pt->height0, l);
}
}
@@ -58,7 +54,7 @@ nv10_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv10_miptree);
@@ -72,6 +68,7 @@ nv10_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
pipe_buffer_reference(&mt->buffer, pb);
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -95,6 +92,7 @@ nv10_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
FREE(mt);
return NULL;
}
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -133,8 +131,8 @@ nv10_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&ns->base.texture, pt);
ns->base.format = pt->format;
- ns->base.width = pt->width[level];
- ns->base.height = pt->height[level];
+ ns->base.width = u_minify(pt->width0, level);
+ ns->base.height = u_minify(pt->height0, level);
ns->base.usage = flags;
pipe_reference_init(&ns->base.reference, 1);
ns->base.face = face;
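The miptree hunks above drop the per-level width[]/height[]/nblocks[] arrays in favour of the base-level width0/height0 plus the util helpers included at the top of the file. A short sketch of the per-level arithmetic, mirroring the layout code above and assuming hypothetical format, width0, height0 and level values:

    #include "util/u_format.h"
    #include "util/u_math.h"

    /* u_minify(x, l) is MAX2(1, x >> l): the size of mip level l */
    unsigned level_width  = u_minify(width0, level);
    unsigned level_height = u_minify(height0, level);

    /* bytes per row for that level, derived from the format's block size */
    unsigned pitch = util_format_get_stride(format, level_width);

    /* the nv10 layout above additionally rounds each pitch up to 64 bytes */
    pitch = (pitch + 63) & ~63;

    /* and advances the image offset by pitch * level height for each face */
    unsigned level_size = pitch * level_height;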
diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
index 7ba9777a222..c5dbe43dbc8 100644
--- a/src/gallium/drivers/nv10/nv10_prim_vbuf.c
+++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c
@@ -67,12 +67,15 @@ struct nv10_vbuf_render {
void nv10_vtxbuf_bind( struct nv10_context* nv10 )
{
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
int i;
for(i = 0; i < 8; i++) {
- BEGIN_RING(celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
- OUT_RING(0/*nv10->vtxbuf*/);
- BEGIN_RING(celsius, NV10TCL_VTXFMT(i), 1);
- OUT_RING(0/*XXX*/);
+ BEGIN_RING(chan, celsius, NV10TCL_VTXBUF_ADDRESS(i), 1);
+ OUT_RING(chan, 0/*nv10->vtxbuf*/);
+ BEGIN_RING(chan, celsius, NV10TCL_VTXFMT(i), 1);
+ OUT_RING(chan, 0/*XXX*/);
}
}
@@ -163,19 +166,22 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
{
struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render);
struct nv10_context *nv10 = nv10_render->nv10;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
int push, i;
nv10_emit_hw_state(nv10);
- BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
- OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv10_render->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
- OUT_RING(nv10_render->hwprim);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+ OUT_RING(chan, nv10_render->hwprim);
if (nr_indices & 1) {
- BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1);
- OUT_RING (indices[0]);
+ BEGIN_RING(chan, celsius, NV10TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, indices[0]);
indices++; nr_indices--;
}
@@ -183,16 +189,16 @@ nv10_vbuf_render_draw( struct vbuf_render *render,
// XXX too big/small ? check the size
push = MIN2(nr_indices, 1200 * 2);
- BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, celsius, NV10TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((indices[i+1] << 16) | indices[i]);
+ OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
nr_indices -= push;
indices += push;
}
- BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+ OUT_RING (chan, 0);
}
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index ee5901e743e..69a6dab866a 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -115,6 +115,9 @@ nv10_screen_destroy(struct pipe_screen *pscreen)
nouveau_notifier_free(&screen->sync);
nouveau_grobj_free(&screen->celsius);
+ nv04_surface_2d_takedown(&screen->eng2d);
+
+ nouveau_screen_fini(&screen->base);
FREE(pscreen);
}
@@ -177,7 +180,6 @@ nv10_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
return FALSE;
}
- BIND_RING(chan, screen->celsius, 7);
/* 2D engine setup */
screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c
index 9b38219b996..ad7def53b12 100644
--- a/src/gallium/drivers/nv10/nv10_state.c
+++ b/src/gallium/drivers/nv10/nv10_state.c
@@ -458,7 +458,7 @@ nv10_set_clip_state(struct pipe_context *pipe,
static void
nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv10_context *nv10 = nv10_context(pipe);
struct pipe_screen *pscreen = pipe->screen;
@@ -468,13 +468,13 @@ nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
if (buf) {
void *mapped;
- if (buf->buffer && buf->buffer->size &&
- (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+ if (buf->size &&
+ (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
{
- memcpy(nv10->constbuf[shader], mapped, buf->buffer->size);
+ memcpy(nv10->constbuf[shader], mapped, buf->size);
nv10->constbuf_nr[shader] =
- buf->buffer->size / (4 * sizeof(float));
- pipe_buffer_unmap(pscreen, buf->buffer);
+ buf->size / (4 * sizeof(float));
+ pipe_buffer_unmap(pscreen, buf);
}
}
}
@@ -553,9 +553,9 @@ nv10_init_state_functions(struct nv10_context *nv10)
nv10->pipe.delete_blend_state = nv10_blend_state_delete;
nv10->pipe.create_sampler_state = nv10_sampler_state_create;
- nv10->pipe.bind_sampler_states = nv10_sampler_state_bind;
+ nv10->pipe.bind_fragment_sampler_states = nv10_sampler_state_bind;
nv10->pipe.delete_sampler_state = nv10_sampler_state_delete;
- nv10->pipe.set_sampler_textures = nv10_set_sampler_texture;
+ nv10->pipe.set_fragment_sampler_textures = nv10_set_sampler_texture;
nv10->pipe.create_rasterizer_state = nv10_rasterizer_state_create;
nv10->pipe.bind_rasterizer_state = nv10_rasterizer_state_bind;
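With the interface change in the constant-buffer hunk above, the hook now receives the pipe_buffer directly instead of a pipe_constant_buffer wrapper; the driver maps it and derives the constant count from buf->size. A hedged usage sketch from the caller's side, with pipe and constbuf as hypothetical placeholders:

    /* bind a vertex-shader constant buffer; the driver copies buf->size bytes,
     * i.e. buf->size / (4 * sizeof(float)) vec4 constants, as shown above */
    pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, constbuf);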
diff --git a/src/gallium/drivers/nv10/nv10_state.h b/src/gallium/drivers/nv10/nv10_state.h
index 3a3fd0d4f4f..2524ac02e29 100644
--- a/src/gallium/drivers/nv10/nv10_state.h
+++ b/src/gallium/drivers/nv10/nv10_state.h
@@ -126,6 +126,7 @@ struct nv10_depth_stencil_alpha_state {
struct nv10_miptree {
struct pipe_texture base;
+ struct nouveau_bo *bo;
struct pipe_buffer *buffer;
uint total_size;
diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c
index 2577ab73b56..30a596ca604 100644
--- a/src/gallium/drivers/nv10/nv10_state_emit.c
+++ b/src/gallium/drivers/nv10/nv10_state_emit.c
@@ -4,25 +4,32 @@
static void nv10_state_emit_blend(struct nv10_context* nv10)
{
struct nv10_blend_state *b = nv10->blend;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
- BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1);
- OUT_RING (b->d_enable);
+ BEGIN_RING(chan, celsius, NV10TCL_DITHER_ENABLE, 1);
+ OUT_RING (chan, b->d_enable);
- BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
- OUT_RING (b->b_enable);
- OUT_RING (b->b_srcfunc);
- OUT_RING (b->b_dstfunc);
+ BEGIN_RING(chan, celsius, NV10TCL_BLEND_FUNC_ENABLE, 3);
+ OUT_RING (chan, b->b_enable);
+ OUT_RING (chan, b->b_srcfunc);
+ OUT_RING (chan, b->b_dstfunc);
- BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1);
- OUT_RING (b->c_mask);
+ BEGIN_RING(chan, celsius, NV10TCL_COLOR_MASK, 1);
+ OUT_RING (chan, b->c_mask);
}
static void nv10_state_emit_blend_color(struct nv10_context* nv10)
{
struct pipe_blend_color *c = nv10->blend_color;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
- BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1);
- OUT_RING ((float_to_ubyte(c->color[3]) << 24)|
+ BEGIN_RING(chan, celsius, NV10TCL_BLEND_COLOR, 1);
+ OUT_RING (chan,
+ (float_to_ubyte(c->color[3]) << 24)|
(float_to_ubyte(c->color[0]) << 16)|
(float_to_ubyte(c->color[1]) << 8) |
(float_to_ubyte(c->color[2]) << 0));
@@ -31,60 +38,66 @@ static void nv10_state_emit_blend_color(struct nv10_context* nv10)
static void nv10_state_emit_rast(struct nv10_context* nv10)
{
struct nv10_rasterizer_state *r = nv10->rast;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
- BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2);
- OUT_RING (r->shade_model);
- OUT_RING (r->line_width);
+ BEGIN_RING(chan, celsius, NV10TCL_SHADE_MODEL, 2);
+ OUT_RING (chan, r->shade_model);
+ OUT_RING (chan, r->line_width);
- BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1);
- OUT_RING (r->point_size);
+ BEGIN_RING(chan, celsius, NV10TCL_POINT_SIZE, 1);
+ OUT_RING (chan, r->point_size);
- BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
- OUT_RING (r->poly_mode_front);
- OUT_RING (r->poly_mode_back);
+ BEGIN_RING(chan, celsius, NV10TCL_POLYGON_MODE_FRONT, 2);
+ OUT_RING (chan, r->poly_mode_front);
+ OUT_RING (chan, r->poly_mode_back);
- BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2);
- OUT_RING (r->cull_face);
- OUT_RING (r->front_face);
+ BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE, 2);
+ OUT_RING (chan, r->cull_face);
+ OUT_RING (chan, r->front_face);
- BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
- OUT_RING (r->line_smooth_en);
- OUT_RING (r->poly_smooth_en);
+ BEGIN_RING(chan, celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2);
+ OUT_RING (chan, r->line_smooth_en);
+ OUT_RING (chan, r->poly_smooth_en);
- BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1);
- OUT_RING (r->cull_face_en);
+ BEGIN_RING(chan, celsius, NV10TCL_CULL_FACE_ENABLE, 1);
+ OUT_RING (chan, r->cull_face_en);
}
static void nv10_state_emit_dsa(struct nv10_context* nv10)
{
struct nv10_depth_stencil_alpha_state *d = nv10->dsa;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
- BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1);
- OUT_RING (d->depth.func);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_FUNC, 1);
+ OUT_RING (chan, d->depth.func);
- BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
- OUT_RING (d->depth.write_enable);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1);
+ OUT_RING (chan, d->depth.write_enable);
- BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
- OUT_RING (d->depth.test_enable);
+ BEGIN_RING(chan, celsius, NV10TCL_DEPTH_TEST_ENABLE, 1);
+ OUT_RING (chan, d->depth.test_enable);
#if 0
- BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1);
- OUT_RING (d->stencil.enable);
- BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7);
- OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+ BEGIN_RING(chan, celsius, NV10TCL_STENCIL_ENABLE, 1);
+ OUT_RING (chan, d->stencil.enable);
+ BEGIN_RING(chan, celsius, NV10TCL_STENCIL_MASK, 7);
+ OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
#endif
- BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
- OUT_RING (d->alpha.enabled);
+ BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1);
+ OUT_RING (chan, d->alpha.enabled);
- BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
- OUT_RING (d->alpha.func);
+ BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_FUNC, 1);
+ OUT_RING (chan, d->alpha.func);
- BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1);
- OUT_RING (d->alpha.ref);
+ BEGIN_RING(chan, celsius, NV10TCL_ALPHA_FUNC_REF, 1);
+ OUT_RING (chan, d->alpha.ref);
}
static void nv10_state_emit_viewport(struct nv10_context* nv10)
@@ -108,6 +121,10 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
int colour_format = 0, zeta_format = 0;
struct nv10_miptree *nv10mt = 0;
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
+
w = fb->cbufs[0]->width;
h = fb->cbufs[0]->height;
colour_format = fb->cbufs[0]->format;
@@ -144,11 +161,11 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
}
if (zeta) {
- BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
- OUT_RING (rt->pitch | (zeta->pitch << 16));
+ BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+ OUT_RING (chan, rt->pitch | (zeta->pitch << 16));
} else {
- BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1);
- OUT_RING (rt->pitch | (rt->pitch << 16));
+ BEGIN_RING(chan, celsius, NV10TCL_RT_PITCH, 1);
+ OUT_RING (chan, rt->pitch | (rt->pitch << 16));
}
nv10mt = (struct nv10_miptree *)rt->base.texture;
@@ -160,13 +177,13 @@ static void nv10_state_emit_framebuffer(struct nv10_context* nv10)
nv10->zeta = nv10mt->buffer;
}
- BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3);
- OUT_RING ((w << 16) | 0);
- OUT_RING ((h << 16) | 0);
- OUT_RING (rt_format);
- BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
- OUT_RING (((w - 1) << 16) | 0 | 0x08000800);
- OUT_RING (((h - 1) << 16) | 0 | 0x08000800);
+ BEGIN_RING(chan, celsius, NV10TCL_RT_HORIZ, 3);
+ OUT_RING (chan, (w << 16) | 0);
+ OUT_RING (chan, (h << 16) | 0);
+ OUT_RING (chan, rt_format);
+ BEGIN_RING(chan, celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+ OUT_RING (chan, ((w - 1) << 16) | 0 | 0x08000800);
+ OUT_RING (chan, ((h - 1) << 16) | 0 | 0x08000800);
}
static void nv10_vertex_layout(struct nv10_context *nv10)
@@ -201,6 +218,10 @@ static void nv10_vertex_layout(struct nv10_context *nv10)
void
nv10_emit_hw_state(struct nv10_context *nv10)
{
+ struct nv10_screen *screen = nv10->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *celsius = screen->celsius;
+ struct nouveau_bo *rt_bo;
int i;
if (nv10->dirty & NV10_NEW_VERTPROG) {
@@ -269,38 +290,41 @@ nv10_emit_hw_state(struct nv10_context *nv10)
*/
/* Render target */
+ rt_bo = nouveau_bo(nv10->rt[0]);
// XXX figure out who's who for NV10TCL_DMA_* and fill accordingly
-// BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1);
-// OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
- OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+// BEGIN_RING(chan, celsius, NV10TCL_DMA_COLOR0, 1);
+// OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+ OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
if (nv10->zeta) {
+ struct nouveau_bo *zeta_bo = nouveau_bo(nv10->zeta);
// XXX
-// BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1);
-// OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1);
- OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+// BEGIN_RING(chan, celsius, NV10TCL_DMA_ZETA, 1);
+// OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, celsius, NV10TCL_ZETA_OFFSET, 1);
+ OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
/* XXX for when we allocate LMA on nv17 */
-/* BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
- OUT_RELOCl(nv10->zeta + lma_offset);*/
+/* BEGIN_RING(chan, celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv10->zeta + lma_offset));*/
}
/* Vertex buffer */
- BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1);
- OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1);
- OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, celsius, NV10TCL_DMA_VTXBUF0, 1);
+ OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, celsius, NV10TCL_COLOR_OFFSET, 1);
+ OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
/* Texture images */
for (i = 0; i < 2; i++) {
if (!(nv10->fp_samplers & (1 << i)))
continue;
- BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1);
- OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+ struct nouveau_bo *bo = nouveau_bo(nv10->tex[i].buffer);
+ BEGIN_RING(chan, celsius, NV10TCL_TX_OFFSET(i), 1);
+ OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1);
- OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format,
+ BEGIN_RING(chan, celsius, NV10TCL_TX_FORMAT(i), 1);
+ OUT_RELOCd(chan, bo, nv10->tex[i].format,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0,
NV10TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv10/nv10_transfer.c b/src/gallium/drivers/nv10/nv10_transfer.c
index 8feb85e4bda..eb04af9782e 100644
--- a/src/gallium/drivers/nv10/nv10_transfer.c
+++ b/src/gallium/drivers/nv10/nv10_transfer.c
@@ -1,7 +1,9 @@
#include <pipe/p_state.h>
#include <pipe/p_defines.h>
#include <pipe/p_inlines.h>
+#include <util/u_format.h>
#include <util/u_memory.h>
+#include <util/u_math.h>
#include <nouveau/nouveau_winsys.h>
#include "nv10_context.h"
#include "nv10_screen.h"
@@ -10,22 +12,19 @@
struct nv10_transfer {
struct pipe_transfer base;
struct pipe_surface *surface;
- bool direct;
+ boolean direct;
};
static void
-nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
struct pipe_texture *template)
{
memset(template, 0, sizeof(struct pipe_texture));
template->target = pt->target;
template->format = pt->format;
- template->width[0] = pt->width[level];
- template->height[0] = pt->height[level];
- template->depth[0] = 1;
- template->block = pt->block;
- template->nblocksx[0] = pt->nblocksx[level];
- template->nblocksy[0] = pt->nblocksx[level];
+ template->width0 = width;
+ template->height0 = height;
+ template->depth0 = 1;
template->last_level = 0;
template->nr_samples = pt->nr_samples;
@@ -48,14 +47,10 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
tx->base.x = x;
tx->base.y = y;
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
tx->base.stride = mt->level[level].pitch;
tx->base.usage = usage;
tx->base.face = face;
@@ -76,7 +71,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
tx->direct = false;
- nv10_compatible_transfer_tex(pt, level, &tx_tex_template);
+ nv10_compatible_transfer_tex(pt, w, h, &tx_tex_template);
tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
if (!tx_tex)
@@ -85,6 +80,8 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
}
+ tx->base.stride = ((struct nv10_miptree*)tx_tex)->level[0].pitch;
+
tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
face, level, zslice,
pipe_transfer_buffer_flags(&tx->base));
@@ -110,8 +107,8 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
/* TODO: Check if SIFM can un-swizzle */
nvscreen->eng2d->copy(nvscreen->eng2d,
tx->surface, 0, 0,
- src, 0, 0,
- src->width, src->height);
+ src, x, y,
+ w, h);
pipe_surface_reference(&src, NULL);
}
@@ -135,9 +132,9 @@ nv10_transfer_del(struct pipe_transfer *ptx)
/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
nvscreen->eng2d->copy(nvscreen->eng2d,
- dst, 0, 0,
+ dst, tx->base.x, tx->base.y,
tx->surface, 0, 0,
- dst->width, dst->height);
+ tx->base.width, tx->base.height);
pipe_surface_reference(&dst, NULL);
}
@@ -156,8 +153,10 @@ nv10_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
void *map = pipe_buffer_map(pscreen, mt->buffer,
pipe_transfer_buffer_flags(ptx));
- return map + ns->base.offset +
- ptx->y * ns->pitch + ptx->x * ptx->block.size;
+ if(!tx->direct)
+ return map + ns->base.offset;
+ else
+ return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
}
static void
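The transfer_map hunk above now distinguishes direct transfers, which map the miptree itself and must step to the requested x/y, from staged transfers, whose temporary texture already covers exactly the requested window. A sketch of that pointer arithmetic under the same assumptions (map, tx, ns and ptx as in the code above):

    uint8_t *p = (uint8_t *)map + ns->base.offset;
    if (tx->direct) {
            /* direct: offset to the requested window inside the real surface */
            p += ptx->y * ns->pitch
               + ptx->x * util_format_get_blocksize(ptx->texture->format);
    }
    /* staged: the temporary starts at the requested (x, y), so no extra offset */
    return p;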
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
index 441a4f75f3f..9180c72c9b0 100644
--- a/src/gallium/drivers/nv10/nv10_vbo.c
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -9,7 +9,7 @@
#include "nouveau/nouveau_channel.h"
#include "nouveau/nouveau_pushbuf.h"
-boolean nv10_draw_elements( struct pipe_context *pipe,
+void nv10_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count)
@@ -45,6 +45,7 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
}
draw_set_mapped_constant_buffer(draw,
+ PIPE_SHADER_VERTEX,
nv10->constbuf[PIPE_SHADER_VERTEX],
nv10->constbuf_nr[PIPE_SHADER_VERTEX]);
@@ -64,14 +65,12 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
pipe_buffer_unmap(pscreen, indexBuffer);
draw_set_mapped_element_buffer(draw, 0, NULL);
}
-
- return TRUE;
}
-boolean nv10_draw_arrays( struct pipe_context *pipe,
- unsigned prim, unsigned start, unsigned count)
+void nv10_draw_arrays( struct pipe_context *pipe,
+ unsigned prim, unsigned start, unsigned count)
{
- return nv10_draw_elements(pipe, NULL, 0, prim, start, count);
+ nv10_draw_elements(pipe, NULL, 0, prim, start, count);
}
diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c
index 276db8b57b6..5b80af2d22a 100644
--- a/src/gallium/drivers/nv20/nv20_context.c
+++ b/src/gallium/drivers/nv20/nv20_context.c
@@ -10,10 +10,14 @@ nv20_flush(struct pipe_context *pipe, unsigned flags,
struct pipe_fence_handle **fence)
{
struct nv20_context *nv20 = nv20_context(pipe);
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
draw_flush(nv20->draw);
- FIRE_RING(fence);
+ FIRE_RING(chan);
+ if (fence)
+ *fence = NULL;
}
static void
@@ -31,353 +35,352 @@ static void nv20_init_hwctx(struct nv20_context *nv20)
{
struct nv20_screen *screen = nv20->screen;
struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
int i;
float projectionmatrix[16];
- const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL);
+ const boolean is_nv25tcl = (kelvin->grclass == NV25TCL);
- BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1);
- OUT_RING (screen->sync->handle);
- BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2);
- OUT_RING (chan->vram->handle);
- OUT_RING (chan->gart->handle); /* TEXTURE1 */
- BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2);
- OUT_RING (chan->vram->handle);
- OUT_RING (chan->vram->handle); /* ZETA */
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_NOTIFY, 1);
+ OUT_RING (chan, screen->sync->handle);
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_TEXTURE0, 2);
+ OUT_RING (chan, chan->vram->handle);
+ OUT_RING (chan, chan->gart->handle); /* TEXTURE1 */
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 2);
+ OUT_RING (chan, chan->vram->handle);
+ OUT_RING (chan, chan->vram->handle); /* ZETA */
- BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1);
- OUT_RING (0); /* renouveau: beef0351, unique */
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_QUERY, 1);
+ OUT_RING (chan, 0); /* renouveau: beef0351, unique */
- BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
- OUT_RING (0);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
- OUT_RING ((0xfff << 16) | 0x0);
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
- OUT_RING ((0xfff << 16) | 0x0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1);
+ OUT_RING (chan, (0xfff << 16) | 0x0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1);
+ OUT_RING (chan, (0xfff << 16) | 0x0);
for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) {
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1);
+ OUT_RING (chan, 0);
- BEGIN_RING(kelvin, 0x17e0, 3);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
- OUT_RINGf (1.0);
+ BEGIN_RING(chan, kelvin, 0x17e0, 3);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 1.0);
if (is_nv25tcl) {
- BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
- OUT_RING (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+ OUT_RING (chan, NV20TCL_TX_RCOMP_LEQUAL | 0xdb0);
} else {
- BEGIN_RING(kelvin, 0x1e68, 1);
- OUT_RING (0x4b800000); /* 16777216.000000 */
- BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1);
- OUT_RING (NV20TCL_TX_RCOMP_LEQUAL);
+ BEGIN_RING(chan, kelvin, 0x1e68, 1);
+ OUT_RING (chan, 0x4b800000); /* 16777216.000000 */
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_RCOMP, 1);
+ OUT_RING (chan, NV20TCL_TX_RCOMP_LEQUAL);
}
- BEGIN_RING(kelvin, 0x290, 1);
- OUT_RING ((0x10 << 16) | 1);
- BEGIN_RING(kelvin, 0x9fc, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, 0x1d80, 1);
- OUT_RING (1);
- BEGIN_RING(kelvin, 0x9f8, 1);
- OUT_RING (4);
- BEGIN_RING(kelvin, 0x17ec, 3);
- OUT_RINGf (0.0);
- OUT_RINGf (1.0);
- OUT_RINGf (0.0);
+ BEGIN_RING(chan, kelvin, 0x290, 1);
+ OUT_RING (chan, (0x10 << 16) | 1);
+ BEGIN_RING(chan, kelvin, 0x9fc, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, 0x1d80, 1);
+ OUT_RING (chan, 1);
+ BEGIN_RING(chan, kelvin, 0x9f8, 1);
+ OUT_RING (chan, 4);
+ BEGIN_RING(chan, kelvin, 0x17ec, 3);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 1.0);
+ OUT_RINGf (chan, 0.0);
if (is_nv25tcl) {
- BEGIN_RING(kelvin, 0x1d88, 1);
- OUT_RING (3);
+ BEGIN_RING(chan, kelvin, 0x1d88, 1);
+ OUT_RING (chan, 3);
- BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
- OUT_RING (chan->vram->handle);
- BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
- OUT_RING (chan->vram->handle);
+ BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY9, 1);
+ OUT_RING (chan, chan->vram->handle);
+ BEGIN_RING(chan, kelvin, NV25TCL_DMA_IN_MEMORY8, 1);
+ OUT_RING (chan, chan->vram->handle);
}
- BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1);
- OUT_RING (0); /* renouveau: beef1e10 */
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_FENCE, 1);
+ OUT_RING (chan, 0); /* renouveau: beef1e10 */
- BEGIN_RING(kelvin, 0x1e98, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, 0x1e98, 1);
+ OUT_RING (chan, 0);
#if 0
if (is_nv25tcl) {
- BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
- OUT_RING (NvDmaTT); /* renouveau: beef0202 */
- OUT_RING (NvDmaFB); /* renouveau: beef0201 */
+ BEGIN_RING(chan, NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2);
+ OUT_RING (chan, NvDmaTT); /* renouveau: beef0202 */
+ OUT_RING (chan, NvDmaFB); /* renouveau: beef0201 */
- BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
- OUT_RING (NvDmaTT); /* renouveau: beef0202 */
+ BEGIN_RING(chan, NvSub3D, NV20TCL_DMA_TEXTURE1, 1);
+ OUT_RING (chan, NvDmaTT); /* renouveau: beef0202 */
}
#endif
- BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_NOTIFY, 1);
+ OUT_RING (chan, 0);
- BEGIN_RING(kelvin, 0x120, 3);
- OUT_RING (0);
- OUT_RING (1);
- OUT_RING (2);
+ BEGIN_RING(chan, kelvin, 0x120, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 1);
+ OUT_RING (chan, 2);
/* error: ILLEGAL_MTHD, PROTECTION_FAULT
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
- OUT_RINGf (0.0);
- OUT_RINGf (512.0);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 512.0);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0);
*/
if (is_nv25tcl) {
- BEGIN_RING(kelvin, 0x022c, 2);
- OUT_RING (0x280);
- OUT_RING (0x07d28000);
+ BEGIN_RING(chan, kelvin, 0x022c, 2);
+ OUT_RING (chan, 0x280);
+ OUT_RING (chan, 0x07d28000);
}
/* * illegal method, protection fault
- BEGIN_RING(NvSub3D, 0x1c2c, 1);
- OUT_RING (0); */
+ BEGIN_RING(chan, NvSub3D, 0x1c2c, 1);
+ OUT_RING (chan, 0); */
if (is_nv25tcl) {
- BEGIN_RING(kelvin, 0x1da4, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, 0x1da4, 1);
+ OUT_RING (chan, 0);
}
/* * crashes with illegal method, protection fault
- BEGIN_RING(NvSub3D, 0x1c18, 1);
- OUT_RING (0x200); */
+ BEGIN_RING(chan, NvSub3D, 0x1c18, 1);
+ OUT_RING (chan, 0x200); */
- BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2);
- OUT_RING ((0 << 16) | 0);
- OUT_RING ((0 << 16) | 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 2);
+ OUT_RING (chan, (0 << 16) | 0);
+ OUT_RING (chan, (0 << 16) | 0);
/* *** Set state *** */
- BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
- OUT_RING (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
- OUT_RING (0); /* NV20TCL_ALPHA_FUNC_REF */
+ BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2);
+ OUT_RING (chan, NV20TCL_ALPHA_FUNC_FUNC_ALWAYS);
+ OUT_RING (chan, 0); /* NV20TCL_ALPHA_FUNC_REF */
for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) {
- BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_ENABLE(i), 1);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
- OUT_RING (0x30d410d0);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4);
- OUT_RING (0x00000c00);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1);
- OUT_RING (0x00011101);
- BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2);
- OUT_RING (0x130e0300);
- OUT_RING (0x0c091c80);
- BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
- OUT_RING (0x00000c00);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4);
- OUT_RING (0x20c400c0);
- OUT_RING (0);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_COLOR0, 2);
- OUT_RING (0);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
- OUT_RING (0x035125a0);
- OUT_RING (0);
- OUT_RING (0x40002000);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
- OUT_RING (0xffff0000);
-
- BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
- OUT_RING (NV20TCL_BLEND_FUNC_SRC_ONE);
- OUT_RING (NV20TCL_BLEND_FUNC_DST_ZERO);
- OUT_RING (0); /* NV20TCL_BLEND_COLOR */
- OUT_RING (NV20TCL_BLEND_EQUATION_FUNC_ADD);
- BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
- OUT_RING (0xff);
- OUT_RING (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
- OUT_RING (0); /* NV20TCL_STENCIL_FUNC_REF */
- OUT_RING (0xff); /* NV20TCL_STENCIL_FUNC_MASK */
- OUT_RING (NV20TCL_STENCIL_OP_FAIL_KEEP);
- OUT_RING (NV20TCL_STENCIL_OP_ZFAIL_KEEP);
- OUT_RING (NV20TCL_STENCIL_OP_ZPASS_KEEP);
-
- BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (NV20TCL_COLOR_LOGIC_OP_OP_COPY);
- BEGIN_RING(kelvin, 0x17cc, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_OP, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_ALPHA(0), 4);
+ OUT_RING (chan, 0x30d410d0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_RGB(0), 4);
+ OUT_RING (chan, 0x00000c00);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_ENABLE, 1);
+ OUT_RING (chan, 0x00011101);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_FINAL0, 2);
+ OUT_RING (chan, 0x130e0300);
+ OUT_RING (chan, 0x0c091c80);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_OUT_ALPHA(0), 4);
+ OUT_RING (chan, 0x00000c00);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_IN_RGB(0), 4);
+ OUT_RING (chan, 0x20c400c0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_COLOR0, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4);
+ OUT_RING (chan, 0x035125a0);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0x40002000);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1);
+ OUT_RING (chan, 0xffff0000);
+
+ BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 4);
+ OUT_RING (chan, NV20TCL_BLEND_FUNC_SRC_ONE);
+ OUT_RING (chan, NV20TCL_BLEND_FUNC_DST_ZERO);
+ OUT_RING (chan, 0); /* NV20TCL_BLEND_COLOR */
+ OUT_RING (chan, NV20TCL_BLEND_EQUATION_FUNC_ADD);
+ BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+ OUT_RING (chan, 0xff);
+ OUT_RING (chan, NV20TCL_STENCIL_FUNC_FUNC_ALWAYS);
+ OUT_RING (chan, 0); /* NV20TCL_STENCIL_FUNC_REF */
+ OUT_RING (chan, 0xff); /* NV20TCL_STENCIL_FUNC_MASK */
+ OUT_RING (chan, NV20TCL_STENCIL_OP_FAIL_KEEP);
+ OUT_RING (chan, NV20TCL_STENCIL_OP_ZFAIL_KEEP);
+ OUT_RING (chan, NV20TCL_STENCIL_OP_ZPASS_KEEP);
+
+ BEGIN_RING(chan, kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, NV20TCL_COLOR_LOGIC_OP_OP_COPY);
+ BEGIN_RING(chan, kelvin, 0x17cc, 1);
+ OUT_RING (chan, 0);
if (is_nv25tcl) {
- BEGIN_RING(kelvin, 0x1d84, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, kelvin, 0x1d84, 1);
+ OUT_RING (chan, 1);
}
- BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1);
- OUT_RING (0x00020000);
- BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
+ BEGIN_RING(chan, kelvin, NV20TCL_LIGHTING_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_CONTROL, 1);
+ OUT_RING (chan, 0x00020000);
+ BEGIN_RING(chan, kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_ENABLED_LIGHTS, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_NORMALIZE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0),
NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE);
for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) {
- OUT_RING(0xffffffff);
+ OUT_RING(chan, 0xffffffff);
}
- BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
- OUT_RING (0);
- OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
- OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
- BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
- OUT_RING (NV20TCL_DEPTH_FUNC_LESS);
- BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
- OUT_RINGf (0.0);
- OUT_RINGf (0.0); /* NV20TCL.POLYGON_OFFSET_UNITS */
- BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0); /* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */
+ OUT_RING (chan, 0); /* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+ OUT_RING (chan, NV20TCL_DEPTH_FUNC_LESS);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 0.0); /* NV20TCL.POLYGON_OFFSET_UNITS */
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+ OUT_RING (chan, 1);
if (!is_nv25tcl) {
- BEGIN_RING(kelvin, 0x1d84, 1);
- OUT_RING (3);
+ BEGIN_RING(chan, kelvin, 0x1d84, 1);
+ OUT_RING (chan, 3);
}
- BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
+ BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
if (!is_nv25tcl) {
- OUT_RING (8);
+ OUT_RING (chan, 8);
} else {
- OUT_RINGf (1.0);
+ OUT_RINGf (chan, 1.0);
}
if (!is_nv25tcl) {
- BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (0); /* NV20TCL.POINT_SMOOTH_ENABLE */
+ BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0); /* NV20TCL.POINT_SMOOTH_ENABLE */
} else {
- BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, 0x0a1c, 1);
- OUT_RING (0x800);
+ BEGIN_RING(chan, kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, 0x0a1c, 1);
+ OUT_RING (chan, 0x800);
}
- BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1);
- OUT_RING (8);
- BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
- OUT_RING (NV20TCL_POLYGON_MODE_FRONT_FILL);
- OUT_RING (NV20TCL_POLYGON_MODE_BACK_FILL);
- BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
- OUT_RING (NV20TCL_CULL_FACE_BACK);
- OUT_RING (NV20TCL_FRONT_FACE_CCW);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1);
- OUT_RING (NV20TCL_SHADE_MODEL_SMOOTH);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
- OUT_RING (0);
- BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
+ BEGIN_RING(chan, kelvin, NV20TCL_LINE_WIDTH, 1);
+ OUT_RING (chan, 8);
+ BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+ OUT_RING (chan, NV20TCL_POLYGON_MODE_FRONT_FILL);
+ OUT_RING (chan, NV20TCL_POLYGON_MODE_BACK_FILL);
+ BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+ OUT_RING (chan, NV20TCL_CULL_FACE_BACK);
+ OUT_RING (chan, NV20TCL_FRONT_FACE_CCW);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 1);
+ OUT_RING (chan, NV20TCL_SHADE_MODEL_SMOOTH);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1);
+ OUT_RING (chan, 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE);
for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) {
- OUT_RING(0);
+ OUT_RING(chan, 0);
}
- BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
- OUT_RINGf (1.5);
- OUT_RINGf (-0.090168); /* NV20TCL.FOG_EQUATION_LINEAR */
- OUT_RINGf (0.0); /* NV20TCL.FOG_EQUATION_QUADRATIC */
- BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2);
- OUT_RING (NV20TCL_FOG_MODE_EXP_2);
- OUT_RING (NV20TCL_FOG_COORD_DIST_COORD_FOG);
- BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2);
- OUT_RING (0);
- OUT_RING (0); /* NV20TCL.FOG_COLOR */
- BEGIN_RING(kelvin, NV20TCL_ENGINE, 1);
- OUT_RING (NV20TCL_ENGINE_FIXED);
+ BEGIN_RING(chan, kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3);
+ OUT_RINGf (chan, 1.5);
+ OUT_RINGf (chan, -0.090168); /* NV20TCL.FOG_EQUATION_LINEAR */
+ OUT_RINGf (chan, 0.0); /* NV20TCL.FOG_EQUATION_QUADRATIC */
+ BEGIN_RING(chan, kelvin, NV20TCL_FOG_MODE, 2);
+ OUT_RING (chan, NV20TCL_FOG_MODE_EXP_SIGNED);
+ OUT_RING (chan, NV20TCL_FOG_COORD_FOG);
+ BEGIN_RING(chan, kelvin, NV20TCL_FOG_ENABLE, 2);
+ OUT_RING (chan, 0);
+ OUT_RING (chan, 0); /* NV20TCL.FOG_COLOR */
+ BEGIN_RING(chan, kelvin, NV20TCL_ENGINE, 1);
+ OUT_RING (chan, NV20TCL_ENGINE_FIXED);
for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) {
- BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1);
+ OUT_RING (chan, 0);
}
- BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
- OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0);
- OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
- OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15);
+ OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0);
+ OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 0.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
+ OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0); OUT_RINGf(chan, 1.0);
for (i = 4; i < 16; ++i) {
- OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0);
+ OUT_RINGf(chan, 0.0);
+ OUT_RINGf(chan, 0.0);
+ OUT_RINGf(chan, 0.0);
+ OUT_RINGf(chan, 1.0);
}
- BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
- OUT_RING (1);
- BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
- OUT_RING (0x00010101);
- BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV20TCL_EDGEFLAG_ENABLE, 1);
+ OUT_RING (chan, 1);
+ BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+ OUT_RING (chan, 0x00010101);
+ BEGIN_RING(chan, kelvin, NV20TCL_CLEAR_VALUE, 1);
+ OUT_RING (chan, 0);
memset(projectionmatrix, 0, sizeof(projectionmatrix));
projectionmatrix[0*4+0] = 1.0;
projectionmatrix[1*4+1] = 1.0;
projectionmatrix[2*4+2] = 16777215.0;
projectionmatrix[3*4+3] = 1.0;
- BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
+ BEGIN_RING(chan, kelvin, NV20TCL_PROJECTION_MATRIX(0), 16);
for (i = 0; i < 16; i++) {
- OUT_RINGf (projectionmatrix[i]);
+ OUT_RINGf (chan, projectionmatrix[i]);
}
- BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
- OUT_RINGf (0.0);
- OUT_RINGf (16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
-
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
- OUT_RINGf (0.0); /* x-offset, w/2 + 1.031250 */
- OUT_RINGf (0.0); /* y-offset, h/2 + 0.030762 */
- OUT_RINGf (0.0);
- OUT_RINGf (16777215.0);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2);
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 16777216.0); /* [0, 1] scaled approx to [0, 2^24] */
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
- OUT_RINGf (0.0); /* no effect?, w/2 */
- OUT_RINGf (0.0); /* no effect?, h/2 */
- OUT_RINGf (16777215.0 * 0.5);
- OUT_RINGf (65535.0);
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4);
+ OUT_RINGf (chan, 0.0); /* x-offset, w/2 + 1.031250 */
+ OUT_RINGf (chan, 0.0); /* y-offset, h/2 + 0.030762 */
+ OUT_RINGf (chan, 0.0);
+ OUT_RINGf (chan, 16777215.0);
- FIRE_RING (NULL);
-}
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_SCALE_X, 4);
+ OUT_RINGf (chan, 0.0); /* no effect?, w/2 */
+ OUT_RINGf (chan, 0.0); /* no effect?, h/2 */
+ OUT_RINGf (chan, 16777215.0 * 0.5);
+ OUT_RINGf (chan, 65535.0);
-static void
-nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
+ FIRE_RING (chan);
}
struct pipe_context *
@@ -399,7 +402,6 @@ nv20_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv20->pipe.winsys = ws;
nv20->pipe.screen = pscreen;
nv20->pipe.destroy = nv20_destroy;
- nv20->pipe.set_edgeflags = nv20_set_edgeflags;
nv20->pipe.draw_arrays = nv20_draw_arrays;
nv20->pipe.draw_elements = nv20_draw_elements;
nv20->pipe.clear = nv20_clear;
diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h
index a4eaa956608..c7dfadaa311 100644
--- a/src/gallium/drivers/nv20/nv20_context.h
+++ b/src/gallium/drivers/nv20/nv20_context.h
@@ -15,10 +15,6 @@
#include "nouveau/nouveau_gldefs.h"
#include "nouveau/nouveau_context.h"
-#define NOUVEAU_PUSH_CONTEXT(ctx) \
- struct nv20_screen *ctx = nv20->screen
-#include "nouveau/nouveau_push.h"
-
#include "nv20_state.h"
#define NOUVEAU_ERR(fmt, args...) \
@@ -143,9 +139,9 @@ extern void nv20_emit_hw_state(struct nv20_context *nv20);
extern void nv20_state_tex_update(struct nv20_context *nv20);
/* nv20_vbo.c */
-extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv20_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv20_draw_elements( struct pipe_context *pipe,
+extern void nv20_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count);
diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c
index 495a7be9127..dedbec73f39 100644
--- a/src/gallium/drivers/nv20/nv20_fragtex.c
+++ b/src/gallium/drivers/nv20/nv20_fragtex.c
@@ -52,6 +52,9 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
struct nv20_miptree *nv20mt = nv20->tex_miptree[unit];
struct pipe_texture *pt = &nv20mt->base;
struct nv20_texture_format *tf;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
uint32_t txf, txs, txp;
tf = nv20_fragtex_format(pt->format);
@@ -62,9 +65,9 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
txf = tf->format << 8;
txf |= (pt->last_level + 1) << 16;
- txf |= log2i(pt->width[0]) << 20;
- txf |= log2i(pt->height[0]) << 24;
- txf |= log2i(pt->depth[0]) << 28;
+ txf |= log2i(pt->width0) << 20;
+ txf |= log2i(pt->height0) << 24;
+ txf |= log2i(pt->depth0) << 28;
txf |= 8;
switch (pt->target) {
@@ -82,15 +85,15 @@ nv20_fragtex_build(struct nv20_context *nv20, int unit)
return;
}
- BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8);
- OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
- OUT_RING (ps->wrap);
- OUT_RING (0x40000000); /* enable */
- OUT_RING (txs);
- OUT_RING (ps->filt | 0x2000 /* magic */);
- OUT_RING ((pt->width[0] << 16) | pt->height[0]);
- OUT_RING (ps->bcol);
+ BEGIN_RING(chan, kelvin, NV10TCL_TX_OFFSET(unit), 8);
+ OUT_RELOCl(chan, nouveau_bo(nv20mt->buffer), 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RELOCd(chan, nouveau_bo(nv20mt->buffer),txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
+ OUT_RING (chan, ps->wrap);
+ OUT_RING (chan, 0x40000000); /* enable */
+ OUT_RING (chan, txs);
+ OUT_RING (chan, ps->filt | 0x2000 /* magic */);
+ OUT_RING (chan, (pt->width0 << 16) | pt->height0);
+ OUT_RING (chan, ps->bcol);
#endif
}
@@ -99,6 +102,9 @@ nv20_fragtex_bind(struct nv20_context *nv20)
{
#if 0
struct nv20_fragment_program *fp = nv20->fragprog.active;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
unsigned samplers, unit;
samplers = nv20->fp_samplers & ~fp->samplers;
@@ -106,8 +112,8 @@ nv20_fragtex_bind(struct nv20_context *nv20)
unit = ffs(samplers) - 1;
samplers &= ~(1 << unit);
- BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV10TCL_TX_ENABLE(unit), 1);
+ OUT_RING (chan, 0);
}
samplers = nv20->dirty_samplers & fp->samplers;
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c
index 185fbf53e0f..8f7538e7f57 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -1,15 +1,18 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
#include "nv20_context.h"
#include "nv20_screen.h"
+#include "../nv04/nv04_surface_2d.h"
static void
nv20_miptree_layout(struct nv20_miptree *nv20mt)
{
struct pipe_texture *pt = &nv20mt->base;
- uint width = pt->width[0], height = pt->height[0];
+ uint width = pt->width0;
uint offset = 0;
int nr_faces, l, f;
uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER |
@@ -25,21 +28,15 @@ nv20_miptree_layout(struct nv20_miptree *nv20mt)
}
for (l = 0; l <= pt->last_level; l++) {
- pt->width[l] = width;
- pt->height[l] = height;
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
-
if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR))
- nv20mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64);
+ nv20mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
else
- nv20mt->level[l].pitch = pt->width[l] * pt->block.size;
+ nv20mt->level[l].pitch = util_format_get_stride(pt->format, width);
nv20mt->level[l].image_offset =
CALLOC(nr_faces, sizeof(unsigned));
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
+ width = u_minify(width, 1);
}
for (f = 0; f < nr_faces; f++) {
@@ -47,14 +44,14 @@ nv20_miptree_layout(struct nv20_miptree *nv20mt)
nv20mt->level[l].image_offset[f] = offset;
if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) &&
- pt->width[l + 1] > 1 && pt->height[l + 1] > 1)
- offset += align(nv20mt->level[l].pitch * pt->height[l], 64);
+ u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+ offset += align(nv20mt->level[l].pitch * u_minify(pt->height0, l), 64);
else
- offset += nv20mt->level[l].pitch * pt->height[l];
+ offset += nv20mt->level[l].pitch * u_minify(pt->height0, l);
}
nv20mt->level[l].image_offset[f] = offset;
- offset += nv20mt->level[l].pitch * pt->height[l];
+ offset += nv20mt->level[l].pitch * u_minify(pt->height0, l);
}
nv20mt->total_size = offset;
@@ -68,7 +65,7 @@ nv20_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv20_miptree);
@@ -82,6 +79,7 @@ nv20_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
pipe_buffer_reference(&mt->buffer, pb);
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -100,8 +98,8 @@ nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
mt->base.screen = screen;
/* Swizzled textures must be POT */
- if (pt->width[0] & (pt->width[0] - 1) ||
- pt->height[0] & (pt->height[0] - 1))
+ if (pt->width0 & (pt->width0 - 1) ||
+ pt->height0 & (pt->height0 - 1))
mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
else
if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY |
@@ -130,6 +128,12 @@ nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+ /* Apparently we can't render to swizzled surfaces with a pitch smaller than 64 bytes,
+ * so make them linear. If the user did not ask for a render target, rendering to the
+ * texture still works, but it will cost them an extra copy.
+ * This also happens for small mipmaps of large textures. */
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+ mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
nv20_miptree_layout(mt);
mt->buffer = screen->buffer_create(screen, 256, buf_usage, mt->total_size);
@@ -137,6 +141,7 @@ nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
FREE(mt);
return NULL;
}
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -167,8 +172,8 @@ nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&ns->base.texture, pt);
ns->base.format = pt->format;
- ns->base.width = pt->width[level];
- ns->base.height = pt->height[level];
+ ns->base.width = u_minify(pt->width0, level);
+ ns->base.height = u_minify(pt->height0, level);
ns->base.usage = flags;
pipe_reference_init(&ns->base.reference, 1);
ns->base.face = face;
@@ -185,12 +190,27 @@ nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
ns->base.offset = nv20mt->level[level].image_offset[0];
}
+ /* Create a linear temporary that we can render into if necessary.
+ * Note that ns->pitch is always a multiple of 64 for linear surfaces, and swizzled
+ * surfaces are POT, so (ns->pitch & 63) is equivalent to (ns->pitch < 64 && swizzled). */
+ if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+ return &nv04_surface_wrap_for_render(screen, ((struct nv20_screen*)screen)->eng2d, ns)->base;
+
return &ns->base;
}
static void
nv20_miptree_surface_destroy(struct pipe_surface *ps)
{
+ struct nv04_surface* ns = (struct nv04_surface*)ps;
+ if(ns->backing)
+ {
+ struct nv20_screen* screen = (struct nv20_screen*)ps->texture->screen;
+ if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+ screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+ nv20_miptree_surface_destroy(&ns->backing->base);
+ }
+
pipe_texture_reference(&ps->texture, NULL);
FREE(ps);
}
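The nv20_miptree changes above force a linear layout for two independent reasons: non-power-of-two dimensions, and render targets whose pitch would fall below the 64-byte minimum for rendering to swizzled surfaces. A consolidated sketch of those checks, assuming the same pipe_texture fields used above and a hypothetical helper name:

    #include "util/u_format.h"

    static boolean
    needs_linear_layout(const struct pipe_texture *pt)
    {
            /* swizzled textures must be POT in both dimensions */
            if ((pt->width0 & (pt->width0 - 1)) ||
                (pt->height0 & (pt->height0 - 1)))
                    return TRUE;

            /* rendering to a swizzled surface needs a pitch of at least 64 bytes */
            if ((pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) &&
                util_format_get_stride(pt->format, pt->width0) < 64)
                    return TRUE;

            return FALSE;
    }

The surface-destroy hunk above handles the other side of the trade-off: when a small swizzled level was given a linear backing surface for rendering, the backing is copied back into the swizzled texture before it is freed.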
diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
index ddfcdb8057a..2e145672da1 100644
--- a/src/gallium/drivers/nv20/nv20_prim_vbuf.c
+++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c
@@ -81,12 +81,15 @@ nv20_vbuf_render(struct vbuf_render *render)
void nv20_vtxbuf_bind( struct nv20_context* nv20 )
{
#if 0
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
int i;
for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) {
- BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
- OUT_RING(0/*nv20->vtxbuf*/);
- BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1);
- OUT_RING(0/*XXX*/);
+ BEGIN_RING(chan, kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1);
+ OUT_RING(chan, 0/*nv20->vtxbuf*/);
+ BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(i) ,1);
+ OUT_RING(chan, 0/*XXX*/);
}
#endif
}
@@ -202,6 +205,9 @@ nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type)
static unsigned
nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
{
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
uint32_t hwfmt = 0;
unsigned fields;
@@ -231,8 +237,8 @@ nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr)
return 0;
}
- BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1);
- OUT_RING(hwfmt);
+ BEGIN_RING(chan, kelvin, NV20TCL_VTXFMT(hwattr), 1);
+ OUT_RING(chan, hwfmt);
return fields;
}
@@ -262,6 +268,9 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
uint nr_indices)
{
struct nv20_context *nv20 = nv20_render->nv20;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
struct vertex_info *vinfo = &nv20->vertex_info;
unsigned nr_fields;
int max_push;
@@ -270,29 +279,29 @@ nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render,
nr_fields = nv20__emit_vertex_array_format(nv20);
- BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
- OUT_RING(nv20_render->hwprim);
+ BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING(chan, nv20_render->hwprim);
max_push = 1200 / nr_fields;
while (nr_indices) {
int i;
int push = MIN2(nr_indices, max_push);
- BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
+ BEGIN_RING_NI(chan, kelvin, NV20TCL_VERTEX_DATA, push * nr_fields);
for (i = 0; i < push; i++) {
/* XXX: fixme to handle other than floats? */
int f = nr_fields;
float *attrv = (float*)&data[indices[i] * vsz];
while (f-- > 0)
- OUT_RINGf(*attrv++);
+ OUT_RINGf(chan, *attrv++);
}
nr_indices -= push;
indices += push;
}
- BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
- OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP);
+ BEGIN_RING(chan, kelvin, NV20TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING(chan, NV20TCL_VERTEX_BEGIN_END_STOP);
}
static void
@@ -301,20 +310,23 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
uint nr_indices)
{
struct nv20_context *nv20 = nv20_render->nv20;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
int push, i;
NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n");
- BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
- OUT_RELOCl(nv20_render->pbuffer, 0,
+ BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv20_render->pbuffer), 0,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
- OUT_RING(nv20_render->hwprim);
+ BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+ OUT_RING(chan, nv20_render->hwprim);
if (nr_indices & 1) {
- BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1);
- OUT_RING (indices[0]);
+ BEGIN_RING(chan, kelvin, NV10TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, indices[0]);
indices++; nr_indices--;
}
@@ -322,16 +334,16 @@ nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render,
// XXX too big/small ? check the size
push = MIN2(nr_indices, 1200 * 2);
- BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((indices[i+1] << 16) | indices[i]);
+ OUT_RING(chan, (indices[i+1] << 16) | indices[i]);
nr_indices -= push;
indices += push;
}
- BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1);
+ OUT_RING (chan, 0);
}
static void
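For illustration only (not part of the patch): a standalone sketch of the index-submission scheme in nv20__draw_pbuffer above. An odd leading index goes out alone as a 32-bit element; the remaining indices are packed two 16-bit values per word, low half first, matching the (indices[i+1] << 16) | indices[i] expression.

/* Sketch of the U32/U16 index packing; the index data is a made-up example. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
   const uint16_t indices[] = { 7, 0, 1, 2, 2, 1, 3 };
   unsigned nr = sizeof(indices) / sizeof(indices[0]);
   unsigned i = 0;

   if (nr & 1) {                      /* odd count: emit one U32 element first */
      printf("U32 element: %u\n", indices[0]);
      i = 1;
      nr--;
   }
   for (; nr; nr -= 2, i += 2) {      /* then pairs packed into 32-bit words */
      uint32_t packed = ((uint32_t)indices[i + 1] << 16) | indices[i];
      printf("U16 pair: 0x%08x (%u, %u)\n",
             (unsigned)packed, indices[i], indices[i + 1]);
   }
   return 0;
}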
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index 4eeacd1afd5..d091335063b 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -115,6 +115,9 @@ nv20_screen_destroy(struct pipe_screen *pscreen)
nouveau_notifier_free(&screen->sync);
nouveau_grobj_free(&screen->kelvin);
+ nv04_surface_2d_takedown(&screen->eng2d);
+
+ nouveau_screen_fini(&screen->base);
FREE(pscreen);
}
@@ -173,7 +176,6 @@ nv20_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
return FALSE;
}
- BIND_RING(chan, screen->kelvin, 7);
/* 2D engine setup */
screen->eng2d = nv04_surface_2d_init(&screen->base);
diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c
index ed4084980f2..45697a60efd 100644
--- a/src/gallium/drivers/nv20/nv20_state.c
+++ b/src/gallium/drivers/nv20/nv20_state.c
@@ -451,7 +451,7 @@ nv20_set_clip_state(struct pipe_context *pipe,
static void
nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv20_context *nv20 = nv20_context(pipe);
struct pipe_screen *pscreen = pipe->screen;
@@ -461,13 +461,13 @@ nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
if (buf) {
void *mapped;
- if (buf->buffer && buf->buffer->size &&
- (mapped = pipe_buffer_map(pscreen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)))
+ if (buf->size &&
+ (mapped = pipe_buffer_map(pscreen, buf, PIPE_BUFFER_USAGE_CPU_READ)))
{
- memcpy(nv20->constbuf[shader], mapped, buf->buffer->size);
+ memcpy(nv20->constbuf[shader], mapped, buf->size);
nv20->constbuf_nr[shader] =
- buf->buffer->size / (4 * sizeof(float));
- pipe_buffer_unmap(pscreen, buf->buffer);
+ buf->size / (4 * sizeof(float));
+ pipe_buffer_unmap(pscreen, buf);
}
}
}
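For illustration only (not part of the patch): the constant buffer is treated as an array of vec4 constants, so the count is size / (4 * sizeof(float)), as in the rewritten nv20_set_constant_buffer above. The buffer size below is a made-up example.

/* Sketch: bytes -> number of vec4 constants. */
#include <stdio.h>

int main(void)
{
   unsigned buf_size = 256;   /* bytes, example value */
   unsigned nr_vec4 = buf_size / (4 * sizeof(float));
   printf("%u bytes -> %u vec4 constants\n", buf_size, nr_vec4);
   return 0;
}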
@@ -546,9 +546,9 @@ nv20_init_state_functions(struct nv20_context *nv20)
nv20->pipe.delete_blend_state = nv20_blend_state_delete;
nv20->pipe.create_sampler_state = nv20_sampler_state_create;
- nv20->pipe.bind_sampler_states = nv20_sampler_state_bind;
+ nv20->pipe.bind_fragment_sampler_states = nv20_sampler_state_bind;
nv20->pipe.delete_sampler_state = nv20_sampler_state_delete;
- nv20->pipe.set_sampler_textures = nv20_set_sampler_texture;
+ nv20->pipe.set_fragment_sampler_textures = nv20_set_sampler_texture;
nv20->pipe.create_rasterizer_state = nv20_rasterizer_state_create;
nv20->pipe.bind_rasterizer_state = nv20_rasterizer_state_bind;
diff --git a/src/gallium/drivers/nv20/nv20_state.h b/src/gallium/drivers/nv20/nv20_state.h
index 34f402fdcbf..dde41065685 100644
--- a/src/gallium/drivers/nv20/nv20_state.h
+++ b/src/gallium/drivers/nv20/nv20_state.h
@@ -126,6 +126,7 @@ struct nv20_depth_stencil_alpha_state {
struct nv20_miptree {
struct pipe_texture base;
+ struct nouveau_bo *bo;
struct pipe_buffer *buffer;
uint total_size;
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
index 0122b1c2cdb..6bbd1fdae9d 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -5,27 +5,34 @@
static void nv20_state_emit_blend(struct nv20_context* nv20)
{
struct nv20_blend_state *b = nv20->blend;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
- BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1);
- OUT_RING (b->d_enable);
+ BEGIN_RING(chan, kelvin, NV20TCL_DITHER_ENABLE, 1);
+ OUT_RING (chan, b->d_enable);
- BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
- OUT_RING (b->b_enable);
+ BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1);
+ OUT_RING (chan, b->b_enable);
- BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
- OUT_RING (b->b_srcfunc);
- OUT_RING (b->b_dstfunc);
+ BEGIN_RING(chan, kelvin, NV20TCL_BLEND_FUNC_SRC, 2);
+ OUT_RING (chan, b->b_srcfunc);
+ OUT_RING (chan, b->b_dstfunc);
- BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1);
- OUT_RING (b->c_mask);
+ BEGIN_RING(chan, kelvin, NV20TCL_COLOR_MASK, 1);
+ OUT_RING (chan, b->c_mask);
}
static void nv20_state_emit_blend_color(struct nv20_context* nv20)
{
struct pipe_blend_color *c = nv20->blend_color;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
- BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1);
- OUT_RING ((float_to_ubyte(c->color[3]) << 24)|
+ BEGIN_RING(chan, kelvin, NV20TCL_BLEND_COLOR, 1);
+ OUT_RING (chan,
+ (float_to_ubyte(c->color[3]) << 24)|
(float_to_ubyte(c->color[0]) << 16)|
(float_to_ubyte(c->color[1]) << 8) |
(float_to_ubyte(c->color[2]) << 0));
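For illustration only (not part of the patch): a sketch of the ARGB8888 packing emitted for NV20TCL_BLEND_COLOR above. float_to_ubyte_ish() is a simplified stand-in for Mesa's float_to_ubyte() and may round slightly differently.

/* Sketch: float RGBA blend colour -> one packed ARGB word. */
#include <stdio.h>
#include <stdint.h>

static uint8_t float_to_ubyte_ish(float f)   /* simplified clamp + scale */
{
   if (f <= 0.0f) return 0;
   if (f >= 1.0f) return 255;
   return (uint8_t)(f * 255.0f + 0.5f);
}

int main(void)
{
   float rgba[4] = { 1.0f, 0.5f, 0.25f, 0.75f };   /* r, g, b, a */
   uint32_t word = ((uint32_t)float_to_ubyte_ish(rgba[3]) << 24) |
                   ((uint32_t)float_to_ubyte_ish(rgba[0]) << 16) |
                   ((uint32_t)float_to_ubyte_ish(rgba[1]) <<  8) |
                   ((uint32_t)float_to_ubyte_ish(rgba[2]) <<  0);
   printf("blend colour word: 0x%08x\n", (unsigned)word);
   return 0;
}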
@@ -34,63 +41,69 @@ static void nv20_state_emit_blend_color(struct nv20_context* nv20)
static void nv20_state_emit_rast(struct nv20_context* nv20)
{
struct nv20_rasterizer_state *r = nv20->rast;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
- BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2);
- OUT_RING (r->shade_model);
- OUT_RING (r->line_width);
+ BEGIN_RING(chan, kelvin, NV20TCL_SHADE_MODEL, 2);
+ OUT_RING (chan, r->shade_model);
+ OUT_RING (chan, r->line_width);
- BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1);
- OUT_RING (r->point_size);
+ BEGIN_RING(chan, kelvin, NV20TCL_POINT_SIZE, 1);
+ OUT_RING (chan, r->point_size);
- BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
- OUT_RING (r->poly_mode_front);
- OUT_RING (r->poly_mode_back);
+ BEGIN_RING(chan, kelvin, NV20TCL_POLYGON_MODE_FRONT, 2);
+ OUT_RING (chan, r->poly_mode_front);
+ OUT_RING (chan, r->poly_mode_back);
- BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2);
- OUT_RING (r->cull_face);
- OUT_RING (r->front_face);
+ BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE, 2);
+ OUT_RING (chan, r->cull_face);
+ OUT_RING (chan, r->front_face);
- BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
- OUT_RING (r->line_smooth_en);
- OUT_RING (r->poly_smooth_en);
+ BEGIN_RING(chan, kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2);
+ OUT_RING (chan, r->line_smooth_en);
+ OUT_RING (chan, r->poly_smooth_en);
- BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
- OUT_RING (r->cull_face_en);
+ BEGIN_RING(chan, kelvin, NV20TCL_CULL_FACE_ENABLE, 1);
+ OUT_RING (chan, r->cull_face_en);
}
static void nv20_state_emit_dsa(struct nv20_context* nv20)
{
struct nv20_depth_stencil_alpha_state *d = nv20->dsa;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
- BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1);
- OUT_RING (d->depth.func);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_FUNC, 1);
+ OUT_RING (chan, d->depth.func);
- BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
- OUT_RING (d->depth.write_enable);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1);
+ OUT_RING (chan, d->depth.write_enable);
- BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
- OUT_RING (d->depth.test_enable);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1);
+ OUT_RING (chan, d->depth.test_enable);
- BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, kelvin, NV20TCL_DEPTH_UNK17D8, 1);
+ OUT_RING (chan, 1);
#if 0
- BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1);
- OUT_RING (d->stencil.enable);
- BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7);
- OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7);
+ BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_ENABLE, 1);
+ OUT_RING (chan, d->stencil.enable);
+ BEGIN_RING(chan, kelvin, NV20TCL_STENCIL_MASK, 7);
+ OUT_RINGp (chan, (uint32_t *)&(d->stencil.wmask), 7);
#endif
- BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
- OUT_RING (d->alpha.enabled);
+ BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1);
+ OUT_RING (chan, d->alpha.enabled);
- BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
- OUT_RING (d->alpha.func);
+ BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1);
+ OUT_RING (chan, d->alpha.func);
- BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
- OUT_RING (d->alpha.ref);
+ BEGIN_RING(chan, kelvin, NV20TCL_ALPHA_FUNC_REF, 1);
+ OUT_RING (chan, d->alpha.ref);
}
static void nv20_state_emit_viewport(struct nv20_context* nv20)
@@ -101,9 +114,13 @@ static void nv20_state_emit_scissor(struct nv20_context* nv20)
{
/* NV20TCL_SCISSOR_* is probably a software method */
/* struct pipe_scissor_state *s = nv20->scissor;
- BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2);
- OUT_RING (((s->maxx - s->minx) << 16) | s->minx);
- OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
+
+ BEGIN_RING(chan, kelvin, NV20TCL_SCISSOR_HORIZ, 2);
+ OUT_RING (chan, ((s->maxx - s->minx) << 16) | s->minx);
+ OUT_RING (chan, ((s->maxy - s->miny) << 16) | s->miny);*/
}
static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
@@ -113,6 +130,9 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
uint32_t rt_format, w, h;
int colour_format = 0, zeta_format = 0;
struct nv20_miptree *nv20mt = 0;
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
w = fb->cbufs[0]->width;
h = fb->cbufs[0]->height;
@@ -150,11 +170,11 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
}
if (zeta) {
- BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
- OUT_RING (rt->pitch | (zeta->pitch << 16));
+ BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+ OUT_RING (chan, rt->pitch | (zeta->pitch << 16));
} else {
- BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1);
- OUT_RING (rt->pitch | (rt->pitch << 16));
+ BEGIN_RING(chan, kelvin, NV20TCL_RT_PITCH, 1);
+ OUT_RING (chan, rt->pitch | (rt->pitch << 16));
}
nv20mt = (struct nv20_miptree *)rt->base.texture;
@@ -166,13 +186,13 @@ static void nv20_state_emit_framebuffer(struct nv20_context* nv20)
nv20->zeta = nv20mt->buffer;
}
- BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3);
- OUT_RING ((w << 16) | 0);
- OUT_RING ((h << 16) | 0); /*NV20TCL_RT_VERT */
- OUT_RING (rt_format); /* NV20TCL_RT_FORMAT */
- BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
- OUT_RING (((w - 1) << 16) | 0);
- OUT_RING (((h - 1) << 16) | 0);
+ BEGIN_RING(chan, kelvin, NV20TCL_RT_HORIZ, 3);
+ OUT_RING (chan, (w << 16) | 0);
+ OUT_RING (chan, (h << 16) | 0); /*NV20TCL_RT_VERT */
+ OUT_RING (chan, rt_format); /* NV20TCL_RT_FORMAT */
+ BEGIN_RING(chan, kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+ OUT_RING (chan, ((w - 1) << 16) | 0);
+ OUT_RING (chan, ((h - 1) << 16) | 0);
}
static void nv20_vertex_layout(struct nv20_context *nv20)
@@ -228,7 +248,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
}
/* always do position */ {
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_POSITION, 0);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_POSITION, 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
vinfo->hwfmt[0] |= (1 << 0);
}
@@ -237,19 +257,19 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
for (i = 4; i < 6; i++) {
if (!generics[i])
continue;
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
vinfo->hwfmt[0] |= (1 << (i - 3));
}
if (colors[0]) {
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 0);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_COLOR, 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
vinfo->hwfmt[0] |= (1 << 3);
}
if (colors[1]) {
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 1);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_COLOR, 1);
draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
vinfo->hwfmt[0] |= (1 << 4);
}
@@ -258,7 +278,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
for (i = 6; i < 10; i++) {
if (!generics[i])
continue;
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
vinfo->hwfmt[0] |= (1 << (i - 1));
}
@@ -267,7 +287,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
for (i = 0; i < 4; i++) {
if (!generics[i])
continue;
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
vinfo->hwfmt[0] |= (1 << (i + 9));
}
@@ -276,13 +296,13 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
for (i = 10; i < 12; i++) {
if (!generics[i])
continue;
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
vinfo->hwfmt[0] |= (1 << (i + 3));
}
if (fog) {
- src = draw_find_vs_output(dc, TGSI_SEMANTIC_FOG, 0);
+ src = draw_find_shader_output(dc, TGSI_SEMANTIC_FOG, 0);
draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
vinfo->hwfmt[0] |= (1 << 15);
}
@@ -293,6 +313,10 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
void
nv20_emit_hw_state(struct nv20_context *nv20)
{
+ struct nv20_screen *screen = nv20->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *kelvin = screen->kelvin;
+ struct nouveau_bo *rt_bo;
int i;
if (nv20->dirty & NV20_NEW_VERTPROG) {
@@ -361,36 +385,39 @@ nv20_emit_hw_state(struct nv20_context *nv20)
*/
/* Render target */
- BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 1);
- OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
- OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ rt_bo = nouveau_bo(nv20->rt[0]);
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_COLOR, 1);
+ OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+ OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
if (nv20->zeta) {
- BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1);
- OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1);
- OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ struct nouveau_bo *zeta_bo = nouveau_bo(nv20->zeta);
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_ZETA, 1);
+ OUT_RELOCo(chan, zeta_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, kelvin, NV20TCL_ZETA_OFFSET, 1);
+ OUT_RELOCl(chan, zeta_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
/* XXX for when we allocate LMA on nv17 */
-/* BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
- OUT_RELOCl(nv20->zeta + lma_offset);*/
+/* BEGIN_RING(chan, kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1);
+ OUT_RELOCl(chan, nouveau_bo(nv20->zeta + lma_offset));*/
}
/* Vertex buffer */
- BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1);
- OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1);
- OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, kelvin, NV20TCL_DMA_VTXBUF0, 1);
+ OUT_RELOCo(chan, rt_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ BEGIN_RING(chan, kelvin, NV20TCL_COLOR_OFFSET, 1);
+ OUT_RELOCl(chan, rt_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
/* Texture images */
for (i = 0; i < 2; i++) {
if (!(nv20->fp_samplers & (1 << i)))
continue;
- BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1);
- OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM |
+ struct nouveau_bo *bo = nouveau_bo(nv20->tex[i].buffer);
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_OFFSET(i), 1);
+ OUT_RELOCl(chan, bo, 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_GART | NOUVEAU_BO_RD);
- BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1);
- OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format,
+ BEGIN_RING(chan, kelvin, NV20TCL_TX_FORMAT(i), 1);
+ OUT_RELOCd(chan, bo, nv20->tex[i].format,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0,
NV20TCL_TX_FORMAT_DMA1);
diff --git a/src/gallium/drivers/nv20/nv20_transfer.c b/src/gallium/drivers/nv20/nv20_transfer.c
index 81b4f1a9177..699773e8e6f 100644
--- a/src/gallium/drivers/nv20/nv20_transfer.c
+++ b/src/gallium/drivers/nv20/nv20_transfer.c
@@ -1,7 +1,9 @@
#include <pipe/p_state.h>
#include <pipe/p_defines.h>
#include <pipe/p_inlines.h>
+#include <util/u_format.h>
#include <util/u_memory.h>
+#include <util/u_math.h>
#include <nouveau/nouveau_winsys.h>
#include "nv20_context.h"
#include "nv20_screen.h"
@@ -10,22 +12,19 @@
struct nv20_transfer {
struct pipe_transfer base;
struct pipe_surface *surface;
- bool direct;
+ boolean direct;
};
static void
-nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
struct pipe_texture *template)
{
memset(template, 0, sizeof(struct pipe_texture));
template->target = pt->target;
template->format = pt->format;
- template->width[0] = pt->width[level];
- template->height[0] = pt->height[level];
- template->depth[0] = 1;
- template->block = pt->block;
- template->nblocksx[0] = pt->nblocksx[level];
- template->nblocksy[0] = pt->nblocksx[level];
+ template->width0 = width;
+ template->height0 = height;
+ template->depth0 = 1;
template->last_level = 0;
template->nr_samples = pt->nr_samples;
@@ -48,14 +47,10 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
tx->base.x = x;
tx->base.y = y;
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
tx->base.stride = mt->level[level].pitch;
tx->base.usage = usage;
tx->base.face = face;
@@ -76,7 +71,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
tx->direct = false;
- nv20_compatible_transfer_tex(pt, level, &tx_tex_template);
+ nv20_compatible_transfer_tex(pt, w, h, &tx_tex_template);
tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
if (!tx_tex)
@@ -85,6 +80,8 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
}
+ tx->base.stride = ((struct nv20_miptree*)tx_tex)->level[0].pitch;
+
tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
face, level, zslice,
pipe_transfer_buffer_flags(&tx->base));
@@ -110,8 +107,8 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
/* TODO: Check if SIFM can un-swizzle */
nvscreen->eng2d->copy(nvscreen->eng2d,
tx->surface, 0, 0,
- src, 0, 0,
- src->width, src->height);
+ src, x, y,
+ w, h);
pipe_surface_reference(&src, NULL);
}
@@ -131,13 +128,13 @@ nv20_transfer_del(struct pipe_transfer *ptx)
dst = pscreen->get_tex_surface(pscreen, ptx->texture,
ptx->face, ptx->level, ptx->zslice,
- PIPE_BUFFER_USAGE_GPU_WRITE);
+ PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
nvscreen->eng2d->copy(nvscreen->eng2d,
- dst, 0, 0,
+ dst, tx->base.x, tx->base.y,
tx->surface, 0, 0,
- dst->width, dst->height);
+ tx->base.width, tx->base.height);
pipe_surface_reference(&dst, NULL);
}
@@ -156,8 +153,10 @@ nv20_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
void *map = pipe_buffer_map(pscreen, mt->buffer,
pipe_transfer_buffer_flags(ptx));
- return map + ns->base.offset +
- ptx->y * ns->pitch + ptx->x * ptx->block.size;
+ if(!tx->direct)
+ return map + ns->base.offset;
+ else
+ return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
}
static void
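For illustration only (not part of the patch): the direct-map addressing used in nv20_transfer_map above. A texel at (x, y) in a linear level lives at offset + y * pitch + x * bytes_per_block; the numbers below are made-up examples.

/* Sketch: byte offset of a texel in a linear level. */
#include <stdio.h>

int main(void)
{
   unsigned level_offset = 4096;   /* example values, not from the patch */
   unsigned pitch = 256;           /* bytes per row */
   unsigned blocksize = 4;         /* e.g. 4 bytes for an RGBA8 texel */
   unsigned x = 10, y = 3;

   unsigned byte_offset = level_offset + y * pitch + x * blocksize;
   printf("texel (%u,%u) lives at byte offset %u\n", x, y, byte_offset);
   return 0;
}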
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
index 84d7db6c5e2..52991a0d856 100644
--- a/src/gallium/drivers/nv20/nv20_vbo.c
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -9,7 +9,7 @@
#include "nouveau/nouveau_channel.h"
#include "nouveau/nouveau_pushbuf.h"
-boolean nv20_draw_elements( struct pipe_context *pipe,
+void nv20_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned prim, unsigned start, unsigned count)
@@ -45,7 +45,7 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
draw_set_mapped_element_buffer(draw, 0, NULL);
}
- draw_set_mapped_constant_buffer(draw,
+ draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
nv20->constbuf[PIPE_SHADER_VERTEX],
nv20->constbuf_nr[PIPE_SHADER_VERTEX]);
@@ -67,13 +67,12 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
}
draw_flush(nv20->draw);
- return TRUE;
}
-boolean nv20_draw_arrays( struct pipe_context *pipe,
+void nv20_draw_arrays( struct pipe_context *pipe,
unsigned prim, unsigned start, unsigned count)
{
- return nv20_draw_elements(pipe, NULL, 0, prim, start, count);
+ nv20_draw_elements(pipe, NULL, 0, prim, start, count);
}
diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c
index 388245ecb04..7886c2af7e6 100644
--- a/src/gallium/drivers/nv20/nv20_vertprog.c
+++ b/src/gallium/drivers/nv20/nv20_vertprog.c
@@ -253,32 +253,32 @@ static INLINE struct nv20_sreg
tgsi_src(struct nv20_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
struct nv20_sreg src;
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- src = nv20_sr(NV30SR_INPUT, fsrc->SrcRegister.Index);
+ src = nv20_sr(NV30SR_INPUT, fsrc->Register.Index);
break;
case TGSI_FILE_CONSTANT:
- src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+ src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
break;
case TGSI_FILE_IMMEDIATE:
- src = vpc->imm[fsrc->SrcRegister.Index];
+ src = vpc->imm[fsrc->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- if (vpc->high_temp < fsrc->SrcRegister.Index)
- vpc->high_temp = fsrc->SrcRegister.Index;
- src = nv20_sr(NV30SR_TEMP, fsrc->SrcRegister.Index);
+ if (vpc->high_temp < fsrc->Register.Index)
+ vpc->high_temp = fsrc->Register.Index;
+ src = nv20_sr(NV30SR_TEMP, fsrc->Register.Index);
break;
default:
NOUVEAU_ERR("bad src file\n");
break;
}
- src.abs = fsrc->SrcRegisterExtMod.Absolute;
- src.negate = fsrc->SrcRegister.Negate;
- src.swz[0] = fsrc->SrcRegister.SwizzleX;
- src.swz[1] = fsrc->SrcRegister.SwizzleY;
- src.swz[2] = fsrc->SrcRegister.SwizzleZ;
- src.swz[3] = fsrc->SrcRegister.SwizzleW;
+ src.abs = fsrc->Register.Absolute;
+ src.negate = fsrc->Register.Negate;
+ src.swz[0] = fsrc->Register.SwizzleX;
+ src.swz[1] = fsrc->Register.SwizzleY;
+ src.swz[2] = fsrc->Register.SwizzleZ;
+ src.swz[3] = fsrc->Register.SwizzleW;
return src;
}
@@ -286,14 +286,14 @@ static INLINE struct nv20_sreg
tgsi_dst(struct nv20_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
struct nv20_sreg dst;
- switch (fdst->DstRegister.File) {
+ switch (fdst->Register.File) {
case TGSI_FILE_OUTPUT:
dst = nv20_sr(NV30SR_OUTPUT,
- vpc->output_map[fdst->DstRegister.Index]);
+ vpc->output_map[fdst->Register.Index]);
break;
case TGSI_FILE_TEMPORARY:
- dst = nv20_sr(NV30SR_TEMP, fdst->DstRegister.Index);
+ dst = nv20_sr(NV30SR_TEMP, fdst->Register.Index);
if (vpc->high_temp < dst.index)
vpc->high_temp = dst.index;
break;
@@ -334,8 +334,8 @@ nv20_vertprog_parse_instruction(struct nv20_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
src[i] = tgsi_src(vpc, fsrc);
}
}
@@ -343,11 +343,11 @@ nv20_vertprog_parse_instruction(struct nv20_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- switch (fsrc->SrcRegister.File) {
+ fsrc = &finst->Src[i];
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- if (ai == -1 || ai == fsrc->SrcRegister.Index) {
- ai = fsrc->SrcRegister.Index;
+ if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -360,8 +360,8 @@ nv20_vertprog_parse_instruction(struct nv20_vpc *vpc,
*/
case TGSI_FILE_CONSTANT:
case TGSI_FILE_IMMEDIATE:
- if (ci == -1 || ci == fsrc->SrcRegister.Index) {
- ci = fsrc->SrcRegister.Index;
+ if (ci == -1 || ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -378,8 +378,8 @@ nv20_vertprog_parse_instruction(struct nv20_vpc *vpc,
}
}
- dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
- mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+ dst = tgsi_dst(vpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
switch (finst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
@@ -490,15 +490,15 @@ nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc,
{
int hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = NV30_VP_INST_DEST_POS;
break;
case TGSI_SEMANTIC_COLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV30_VP_INST_DEST_COL0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV30_VP_INST_DEST_COL1;
} else {
NOUVEAU_ERR("bad colour semantic index\n");
@@ -506,10 +506,10 @@ nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc,
}
break;
case TGSI_SEMANTIC_BCOLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV30_VP_INST_DEST_BFC0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV30_VP_INST_DEST_BFC1;
} else {
NOUVEAU_ERR("bad bcolour semantic index\n");
@@ -523,19 +523,22 @@ nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc,
hw = NV30_VP_INST_DEST_PSZ;
break;
case TGSI_SEMANTIC_GENERIC:
- if (fdec->Semantic.SemanticIndex <= 7) {
- hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+ if (fdec->Semantic.Index <= 7) {
+ hw = NV30_VP_INST_DEST_TC(fdec->Semantic.Index);
} else {
NOUVEAU_ERR("bad generic semantic index\n");
return FALSE;
}
break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ NOUVEAU_ERR("cannot handle edgeflag output\n");
+ return FALSE;
default:
NOUVEAU_ERR("bad output semantic\n");
return FALSE;
}
- vpc->output_map[fdec->DeclarationRange.First] = hw;
+ vpc->output_map[fdec->Range.First] = hw;
return TRUE;
}
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
index d8300fd69f6..54572e9ab3a 100644
--- a/src/gallium/drivers/nv30/nv30_context.c
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -10,21 +10,32 @@ nv30_flush(struct pipe_context *pipe, unsigned flags,
struct pipe_fence_handle **fence)
{
struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
- BEGIN_RING(rankine, 0x1fd8, 1);
- OUT_RING (2);
- BEGIN_RING(rankine, 0x1fd8, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, rankine, 0x1fd8, 1);
+ OUT_RING (chan, 2);
+ BEGIN_RING(chan, rankine, 0x1fd8, 1);
+ OUT_RING (chan, 1);
}
- FIRE_RING(fence);
+ FIRE_RING(chan);
+ if (fence)
+ *fence = NULL;
}
static void
nv30_destroy(struct pipe_context *pipe)
{
struct nv30_context *nv30 = nv30_context(pipe);
+ unsigned i;
+
+ for (i = 0; i < NV30_STATE_MAX; i++) {
+ if (nv30->state.hw[i])
+ so_ref(NULL, &nv30->state.hw[i]);
+ }
if (nv30->draw)
draw_destroy(nv30->draw);
@@ -58,6 +69,9 @@ nv30_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv30->pipe.is_texture_referenced = nouveau_is_texture_referenced;
nv30->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
+ screen->base.channel->user_private = nv30;
+ screen->base.channel->flush_notify = nv30_state_flush_notify;
+
nv30_init_query_functions(nv30);
nv30_init_surface_functions(nv30);
nv30_init_state_functions(nv30);
diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h
index 8d49366dfcb..e59449287b5 100644
--- a/src/gallium/drivers/nv30/nv30_context.h
+++ b/src/gallium/drivers/nv30/nv30_context.h
@@ -14,10 +14,6 @@
#include "nouveau/nouveau_winsys.h"
#include "nouveau/nouveau_gldefs.h"
#include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx) \
- struct nv30_screen *ctx = nv30->screen
-#include "nouveau/nouveau_push.h"
#include "nouveau/nouveau_stateobj.h"
#include "nv30_state.h"
@@ -144,7 +140,6 @@ struct nv30_context {
unsigned vtxbuf_nr;
struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
unsigned vtxelt_nr;
- const unsigned *edgeflags;
};
static INLINE struct nv30_context *
@@ -184,6 +179,7 @@ extern void nv30_fragtex_bind(struct nv30_context *);
/* nv30_state.c and friends */
extern boolean nv30_state_validate(struct nv30_context *nv30);
extern void nv30_state_emit(struct nv30_context *nv30);
+extern void nv30_state_flush_notify(struct nouveau_channel *chan);
extern struct nv30_state_entry nv30_state_rasterizer;
extern struct nv30_state_entry nv30_state_scissor;
extern struct nv30_state_entry nv30_state_stipple;
@@ -198,9 +194,9 @@ extern struct nv30_state_entry nv30_state_fragtex;
extern struct nv30_state_entry nv30_state_vbo;
/* nv30_vbo.c */
-extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv30_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv30_draw_elements(struct pipe_context *pipe,
+extern void nv30_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index 0ce702d6f84..2d565cb631b 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -237,20 +237,20 @@ tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc)
{
struct nv30_sreg src;
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
src = nv30_sr(NV30SR_INPUT,
- fpc->attrib_map[fsrc->SrcRegister.Index]);
+ fpc->attrib_map[fsrc->Register.Index]);
break;
case TGSI_FILE_CONSTANT:
- src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+ src = constant(fpc, fsrc->Register.Index, NULL);
break;
case TGSI_FILE_IMMEDIATE:
- assert(fsrc->SrcRegister.Index < fpc->nr_imm);
- src = fpc->imm[fsrc->SrcRegister.Index];
+ assert(fsrc->Register.Index < fpc->nr_imm);
+ src = fpc->imm[fsrc->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1);
+ src = nv30_sr(NV30SR_TEMP, fsrc->Register.Index + 1);
if (fpc->high_temp < src.index)
fpc->high_temp = src.index;
break;
@@ -258,7 +258,7 @@ tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc)
* Luckily fragprog results are just temp regs..
*/
case TGSI_FILE_OUTPUT:
- if (fsrc->SrcRegister.Index == fpc->colour_id)
+ if (fsrc->Register.Index == fpc->colour_id)
return nv30_sr(NV30SR_OUTPUT, 0);
else
return nv30_sr(NV30SR_OUTPUT, 1);
@@ -268,12 +268,12 @@ tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc)
break;
}
- src.abs = fsrc->SrcRegisterExtMod.Absolute;
- src.negate = fsrc->SrcRegister.Negate;
- src.swz[0] = fsrc->SrcRegister.SwizzleX;
- src.swz[1] = fsrc->SrcRegister.SwizzleY;
- src.swz[2] = fsrc->SrcRegister.SwizzleZ;
- src.swz[3] = fsrc->SrcRegister.SwizzleW;
+ src.abs = fsrc->Register.Absolute;
+ src.negate = fsrc->Register.Negate;
+ src.swz[0] = fsrc->Register.SwizzleX;
+ src.swz[1] = fsrc->Register.SwizzleY;
+ src.swz[2] = fsrc->Register.SwizzleZ;
+ src.swz[3] = fsrc->Register.SwizzleW;
return src;
}
@@ -281,22 +281,22 @@ static INLINE struct nv30_sreg
tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
int idx;
- switch (fdst->DstRegister.File) {
+ switch (fdst->Register.File) {
case TGSI_FILE_OUTPUT:
- if (fdst->DstRegister.Index == fpc->colour_id)
+ if (fdst->Register.Index == fpc->colour_id)
return nv30_sr(NV30SR_OUTPUT, 0);
else
return nv30_sr(NV30SR_OUTPUT, 1);
break;
case TGSI_FILE_TEMPORARY:
- idx = fdst->DstRegister.Index + 1;
+ idx = fdst->Register.Index + 1;
if (fpc->high_temp < idx)
fpc->high_temp = idx;
return nv30_sr(NV30SR_TEMP, idx);
case TGSI_FILE_NULL:
return nv30_sr(NV30SR_NONE, 0);
default:
- NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+ NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
return nv30_sr(NV30SR_NONE, 0);
}
}
@@ -363,8 +363,8 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
src[i] = tgsi_src(fpc, fsrc);
}
}
@@ -372,9 +372,9 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
+ fsrc = &finst->Src[i];
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_TEMPORARY:
@@ -385,14 +385,14 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
break;
}
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- if (ai == -1 || ai == fsrc->SrcRegister.Index) {
- ai = fsrc->SrcRegister.Index;
+ if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
src[i] = tgsi_src(fpc, fsrc);
} else {
NOUVEAU_MSG("extra src attr %d\n",
- fsrc->SrcRegister.Index);
+ fsrc->Register.Index);
src[i] = temp(fpc);
arith(fpc, 0, MOV, src[i], MASK_ALL,
tgsi_src(fpc, fsrc), none, none);
@@ -400,8 +400,8 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
break;
case TGSI_FILE_CONSTANT:
case TGSI_FILE_IMMEDIATE:
- if (ci == -1 || ci == fsrc->SrcRegister.Index) {
- ci = fsrc->SrcRegister.Index;
+ if (ci == -1 || ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
src[i] = tgsi_src(fpc, fsrc);
} else {
src[i] = temp(fpc);
@@ -413,7 +413,7 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
/* handled above */
break;
case TGSI_FILE_SAMPLER:
- unit = fsrc->SrcRegister.Index;
+ unit = fsrc->Register.Index;
break;
case TGSI_FILE_OUTPUT:
break;
@@ -423,8 +423,8 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
}
}
- dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
- mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+ dst = tgsi_dst(fpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
switch (finst->Instruction.Opcode) {
@@ -435,10 +435,11 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
break;
case TGSI_OPCODE_CMP:
- tmp = temp(fpc);
- arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+ tmp = nv30_sr(NV30SR_NONE, 0);
tmp.cc_update = 1;
arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+ dst.cc_test = NV30_VP_INST_COND_GE;
+ arith(fpc, sat, MOV, dst, mask, src[2], none, none);
dst.cc_test = NV30_VP_INST_COND_LT;
arith(fpc, sat, MOV, dst, mask, src[1], none, none);
break;
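For illustration only (not part of the patch): TGSI CMP computes, per component, dst = (src0 < 0) ? src1 : src2. The rewritten sequence above sets the condition codes from src0 first and then predicates the two MOVs on GE/LT, so a dst that aliases src0 is never clobbered before the test. A plain C model of the semantics:

/* Sketch of CMP semantics on made-up operand values. */
#include <stdio.h>

int main(void)
{
   float src0[4] = { -1.0f, 0.0f, 2.0f, -0.5f };
   float src1[4] = { 10, 11, 12, 13 };
   float src2[4] = { 20, 21, 22, 23 };
   float dst[4];

   for (int c = 0; c < 4; c++)
      dst[c] = (src0[c] < 0.0f) ? src1[c] : src2[c];

   printf("CMP result: %g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
   return 0;
}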
@@ -517,13 +518,28 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
break;
case TGSI_OPCODE_SCS:
- if (mask & MASK_X) {
- arith(fpc, sat, COS, dst, MASK_X,
- swz(src[0], X, X, X, X), none, none);
+ /* avoid overwriting the source */
+ if(src[0].swz[SWZ_X] != SWZ_X)
+ {
+ if (mask & MASK_X) {
+ arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+ }
+ if (mask & MASK_Y) {
+ arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+ }
}
- if (mask & MASK_Y) {
- arith(fpc, sat, SIN, dst, MASK_Y,
- swz(src[0], X, X, X, X), none, none);
+ else
+ {
+ if (mask & MASK_Y) {
+ arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+ }
+ if (mask & MASK_X) {
+ arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+ }
}
break;
case TGSI_OPCODE_SIN:
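For illustration only (not part of the patch): why the SCS change above orders SIN and COS by aliasing. SCS writes cos(x) to dst.x and sin(x) to dst.y from src.x; if dst aliases src and the cos is written first, the subsequent SIN reads an already-overwritten x. A standalone sketch (uses libm, compile with -lm):

/* Sketch: ordering matters when dst aliases src in SCS. */
#include <stdio.h>
#include <math.h>

int main(void)
{
   float reg_x = 1.0f;                 /* dst and src share this register */

   /* wrong order when dst.x aliases src.x: cos clobbers the input */
   float bad_x = cosf(reg_x);
   float bad_y = sinf(bad_x);          /* reads cos(1), not 1 */

   /* order used by the patch in that case: sin first, then cos */
   float good_y = sinf(reg_x);
   float good_x = cosf(reg_x);

   printf("bad:  cos=%f sin=%f\n", bad_x, bad_y);
   printf("good: cos=%f sin=%f\n", good_x, good_y);
   return 0;
}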
@@ -572,15 +588,15 @@ nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc,
{
int hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = NV30_FP_OP_INPUT_SRC_POSITION;
break;
case TGSI_SEMANTIC_COLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV30_FP_OP_INPUT_SRC_COL0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV30_FP_OP_INPUT_SRC_COL1;
} else {
NOUVEAU_ERR("bad colour semantic index\n");
@@ -591,9 +607,9 @@ nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc,
hw = NV30_FP_OP_INPUT_SRC_FOGC;
break;
case TGSI_SEMANTIC_GENERIC:
- if (fdec->Semantic.SemanticIndex <= 7) {
+ if (fdec->Semantic.Index <= 7) {
hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic.
- SemanticIndex);
+ Index);
} else {
NOUVEAU_ERR("bad generic semantic index\n");
return FALSE;
@@ -604,7 +620,7 @@ nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc,
return FALSE;
}
- fpc->attrib_map[fdec->DeclarationRange.First] = hw;
+ fpc->attrib_map[fdec->Range.First] = hw;
return TRUE;
}
@@ -612,12 +628,12 @@ static boolean
nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc,
const struct tgsi_full_declaration *fdec)
{
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
- fpc->depth_id = fdec->DeclarationRange.First;
+ fpc->depth_id = fdec->Range.First;
break;
case TGSI_SEMANTIC_COLOR:
- fpc->colour_id = fdec->DeclarationRange.First;
+ fpc->colour_id = fdec->Range.First;
break;
default:
NOUVEAU_ERR("bad output semantic\n");
@@ -653,9 +669,9 @@ nv30_fragprog_prepare(struct nv30_fpc *fpc)
goto out_err;
break;
/*case TGSI_FILE_TEMPORARY:
- if (fdec->DeclarationRange.Last > high_temp) {
+ if (fdec->Range.Last > high_temp) {
high_temp =
- fdec->DeclarationRange.Last;
+ fdec->Range.Last;
}
break;*/
default:
@@ -821,7 +837,7 @@ nv30_fragprog_validate(struct nv30_context *nv30)
fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
nv30_fragprog_upload(nv30, fp);
- so = so_new(8, 1);
+ so = so_new(4, 4, 1);
so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1);
so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
@@ -870,6 +886,12 @@ void
nv30_fragprog_destroy(struct nv30_context *nv30,
struct nv30_fragment_program *fp)
{
+ if (fp->buffer)
+ pipe_buffer_reference(&fp->buffer, NULL);
+
+ if (fp->so)
+ so_ref(NULL, &fp->so);
+
if (fp->insn_len)
FREE(fp->insn);
}
diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c
index dca760cae62..0cc3172dcd5 100644
--- a/src/gallium/drivers/nv30/nv30_fragtex.c
+++ b/src/gallium/drivers/nv30/nv30_fragtex.c
@@ -43,7 +43,6 @@ static struct nv30_texture_format *
nv30_fragtex_format(uint pipe_format)
{
struct nv30_texture_format *tf = nv30_texture_formats;
- char fs[128];
while (tf->defined) {
if (tf->pipe == pipe_format)
@@ -65,7 +64,7 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
struct nouveau_bo *bo = nouveau_bo(nv30mt->buffer);
struct nv30_texture_format *tf;
struct nouveau_stateobj *so;
- uint32_t txf, txs , txp;
+ uint32_t txf, txs;
unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
tf = nv30_fragtex_format(pt->format);
@@ -74,9 +73,9 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
txf = tf->format;
txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
- txf |= log2i(pt->width[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
- txf |= log2i(pt->height[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
- txf |= log2i(pt->depth[0]) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+ txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+ txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+ txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
switch (pt->target) {
@@ -97,16 +96,9 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
return NULL;
}
- if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
- txp = 0;
- } else {
- txp = nv30mt->level[0].pitch;
- txf |= (1<<13) /*FIXME: NV34TCL_TX_FORMAT_LINEAR ? */;
- }
-
txs = tf->swizzle;
- so = so_new(16, 2);
+ so = so_new(1, 8, 2);
so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8);
so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -115,8 +107,8 @@ nv30_fragtex_build(struct nv30_context *nv30, int unit)
so_data (so, NV34TCL_TX_ENABLE_ENABLE | ps->en);
so_data (so, txs);
so_data (so, ps->filt | 0x2000 /*voodoo*/);
- so_data (so, (pt->width[0] << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
- pt->height[0]);
+ so_data (so, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+ pt->height0);
so_data (so, ps->bcol);
return so;
@@ -135,7 +127,7 @@ nv30_fragtex_validate(struct nv30_context *nv30)
unit = ffs(samplers) - 1;
samplers &= ~(1 << unit);
- so = so_new(2, 0);
+ so = so_new(1, 1, 0);
so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1);
so_data (so, 0);
so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c
index 280696d4503..8fbba38e78f 100644
--- a/src/gallium/drivers/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nv30/nv30_miptree.c
@@ -1,14 +1,17 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
#include "nv30_context.h"
+#include "../nv04/nv04_surface_2d.h"
static void
nv30_miptree_layout(struct nv30_miptree *nv30mt)
{
struct pipe_texture *pt = &nv30mt->base;
- uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+ uint width = pt->width0;
uint offset = 0;
int nr_faces, l, f;
uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER |
@@ -21,29 +24,21 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
nr_faces = 6;
} else
if (pt->target == PIPE_TEXTURE_3D) {
- nr_faces = pt->depth[0];
+ nr_faces = pt->depth0;
} else {
nr_faces = 1;
}
for (l = 0; l <= pt->last_level; l++) {
- pt->width[l] = width;
- pt->height[l] = height;
- pt->depth[l] = depth;
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
-
if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR))
- nv30mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64);
+ nv30mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
else
- nv30mt->level[l].pitch = pt->width[l] * pt->block.size;
+ nv30mt->level[l].pitch = util_format_get_stride(pt->format, width);
nv30mt->level[l].image_offset =
CALLOC(nr_faces, sizeof(unsigned));
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
- depth = MAX2(1, depth >> 1);
+ width = u_minify(width, 1);
}
for (f = 0; f < nr_faces; f++) {
@@ -51,14 +46,14 @@ nv30_miptree_layout(struct nv30_miptree *nv30mt)
nv30mt->level[l].image_offset[f] = offset;
if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) &&
- pt->width[l + 1] > 1 && pt->height[l + 1] > 1)
- offset += align(nv30mt->level[l].pitch * pt->height[l], 64);
+ u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+ offset += align(nv30mt->level[l].pitch * u_minify(pt->height0, l), 64);
else
- offset += nv30mt->level[l].pitch * pt->height[l];
+ offset += nv30mt->level[l].pitch * u_minify(pt->height0, l);
}
nv30mt->level[l].image_offset[f] = offset;
- offset += nv30mt->level[l].pitch * pt->height[l];
+ offset += nv30mt->level[l].pitch * u_minify(pt->height0, l);
}
nv30mt->total_size = offset;
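For illustration only (not part of the patch): a sketch of the mip-chain walk in nv30_miptree_layout above, assuming a linear 4-bytes-per-pixel texture with no 64-byte pitch alignment. u_minify_ish() mimics Gallium's u_minify(); the dimensions are made-up examples.

/* Sketch: per-level width, pitch and accumulated image offsets. */
#include <stdio.h>

static unsigned u_minify_ish(unsigned v, unsigned l) { v >>= l; return v ? v : 1; }

int main(void)
{
   unsigned width0 = 64, height0 = 32, last_level = 6;
   unsigned offset = 0;

   for (unsigned l = 0; l <= last_level; l++) {
      unsigned w = u_minify_ish(width0, l);
      unsigned h = u_minify_ish(height0, l);
      unsigned pitch = w * 4;
      printf("level %u: %ux%u pitch %u offset %u\n", l, w, h, pitch, offset);
      offset += pitch * h;
   }
   printf("total size: %u bytes\n", offset);
   return 0;
}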
@@ -79,8 +74,8 @@ nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
mt->base.screen = pscreen;
/* Swizzled textures must be POT */
- if (pt->width[0] & (pt->width[0] - 1) ||
- pt->height[0] & (pt->height[0] - 1))
+ if (pt->width0 & (pt->width0 - 1) ||
+ pt->height0 & (pt->height0 - 1))
mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
else
if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY |
@@ -114,6 +109,12 @@ nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+ /* Apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+ * If the user did not ask for a render target, the surface stays swizzled; they can still render to it,
+ * but it will cost them an extra copy. This also happens for small mipmaps of large textures. */
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+ mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
nv30_miptree_layout(mt);
mt->buffer = pscreen->buffer_create(pscreen, 256, buf_usage,
@@ -122,6 +123,7 @@ nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
FREE(mt);
return NULL;
}
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -134,7 +136,7 @@ nv30_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv30_miptree);
@@ -151,6 +153,7 @@ nv30_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
pipe_buffer_reference(&mt->buffer, pb);
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -182,8 +185,8 @@ nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&ns->base.texture, pt);
ns->base.format = pt->format;
- ns->base.width = pt->width[level];
- ns->base.height = pt->height[level];
+ ns->base.width = u_minify(pt->width0, level);
+ ns->base.height = u_minify(pt->height0, level);
ns->base.usage = flags;
pipe_reference_init(&ns->base.reference, 1);
ns->base.face = face;
@@ -200,12 +203,27 @@ nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
ns->base.offset = nv30mt->level[level].image_offset[0];
}
+ /* create a linear temporary that we can render into if necessary.
+ * Note that ns->pitch is always a multiple of 64 for linear surfaces and swizzled surfaces are POT, so
+ * ns->pitch & 63 is equivalent to (ns->pitch < 64 && swizzled). */
+ if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+ return &nv04_surface_wrap_for_render(pscreen, ((struct nv30_screen*)pscreen)->eng2d, ns)->base;
+
return &ns->base;
}
static void
nv30_miptree_surface_del(struct pipe_surface *ps)
{
+ struct nv04_surface* ns = (struct nv04_surface*)ps;
+ if(ns->backing)
+ {
+ struct nv30_screen* screen = (struct nv30_screen*)ps->texture->screen;
+ if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+ screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+ nv30_miptree_surface_del(&ns->backing->base);
+ }
+
pipe_texture_reference(&ps->texture, NULL);
FREE(ps);
}
diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c
index 1d1c8a484e1..e27e9ccbf60 100644
--- a/src/gallium/drivers/nv30/nv30_query.c
+++ b/src/gallium/drivers/nv30/nv30_query.c
@@ -41,6 +41,9 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_query *q = nv30_query(pq);
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
@@ -57,10 +60,10 @@ nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
assert(0);
nouveau_notifier_reset(nv30->screen->query, q->object->start);
- BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1);
- OUT_RING (1);
- BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, rankine, NV34TCL_QUERY_RESET, 1);
+ OUT_RING (chan, 1);
+ BEGIN_RING(chan, rankine, NV34TCL_QUERY_UNK17CC, 1);
+ OUT_RING (chan, 1);
q->ready = FALSE;
}
@@ -69,12 +72,15 @@ static void
nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
struct nv30_query *q = nv30_query(pq);
- BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1);
- OUT_RING ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+ BEGIN_RING(chan, rankine, NV34TCL_QUERY_GET, 1);
+ OUT_RING (chan, (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
- FIRE_RING(NULL);
+ FIRE_RING(chan);
}
static boolean
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index 7cd36902eb4..9ed48178dc2 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -156,6 +156,12 @@ static void
nv30_screen_destroy(struct pipe_screen *pscreen)
{
struct nv30_screen *screen = nv30_screen(pscreen);
+ unsigned i;
+
+ for (i = 0; i < NV30_STATE_MAX; i++) {
+ if (screen->state[i])
+ so_ref(NULL, &screen->state[i]);
+ }
nouveau_resource_free(&screen->vp_exec_heap);
nouveau_resource_free(&screen->vp_data_heap);
@@ -163,6 +169,9 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
nouveau_notifier_free(&screen->query);
nouveau_notifier_free(&screen->sync);
nouveau_grobj_free(&screen->rankine);
+ nv04_surface_2d_takedown(&screen->eng2d);
+
+ nouveau_screen_fini(&screen->base);
FREE(pscreen);
}
@@ -224,7 +233,6 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
return FALSE;
}
- BIND_RING(chan, screen->rankine, 7);
/* 2D engine setup */
screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -261,7 +269,7 @@ nv30_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
}
/* Static rankine initialisation */
- so = so_new(128, 0);
+ so = so_new(36, 60, 0);
so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1);
so_data (so, screen->sync->handle);
so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2);
diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c
index b91e972c123..065c927a10d 100644
--- a/src/gallium/drivers/nv30/nv30_state.c
+++ b/src/gallium/drivers/nv30/nv30_state.c
@@ -14,7 +14,7 @@ nv30_blend_state_create(struct pipe_context *pipe,
struct nv30_context *nv30 = nv30_context(pipe);
struct nouveau_grobj *rankine = nv30->screen->rankine;
struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso));
- struct nouveau_stateobj *so = so_new(16, 0);
+ struct nouveau_stateobj *so = so_new(5, 8, 0);
if (cso->blend_enable) {
so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3);
@@ -300,7 +300,7 @@ nv30_rasterizer_state_create(struct pipe_context *pipe,
{
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
- struct nouveau_stateobj *so = so_new(32, 0);
+ struct nouveau_stateobj *so = so_new(9, 19, 0);
struct nouveau_grobj *rankine = nv30->screen->rankine;
/*XXX: ignored:
@@ -435,7 +435,7 @@ nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe,
{
struct nv30_context *nv30 = nv30_context(pipe);
struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
- struct nouveau_stateobj *so = so_new(32, 0);
+ struct nouveau_stateobj *so = so_new(5, 21, 0);
struct nouveau_grobj *rankine = nv30->screen->rankine;
so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3);
@@ -590,12 +590,12 @@ nv30_set_clip_state(struct pipe_context *pipe,
static void
nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv30_context *nv30 = nv30_context(pipe);
- nv30->constbuf[shader] = buf->buffer;
- nv30->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float));
+ nv30->constbuf[shader] = buf;
+ nv30->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
if (shader == PIPE_SHADER_VERTEX) {
nv30->dirty |= NV30_NEW_VERTPROG;
@@ -672,16 +672,6 @@ nv30_set_vertex_elements(struct pipe_context *pipe, unsigned count,
/*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/
}
-static void
-nv30_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
- struct nv30_context *nv30 = nv30_context(pipe);
-
- nv30->edgeflags = bitfield;
- nv30->dirty |= NV30_NEW_ARRAYS;
- /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/
-}
-
void
nv30_init_state_functions(struct nv30_context *nv30)
{
@@ -690,9 +680,9 @@ nv30_init_state_functions(struct nv30_context *nv30)
nv30->pipe.delete_blend_state = nv30_blend_state_delete;
nv30->pipe.create_sampler_state = nv30_sampler_state_create;
- nv30->pipe.bind_sampler_states = nv30_sampler_state_bind;
+ nv30->pipe.bind_fragment_sampler_states = nv30_sampler_state_bind;
nv30->pipe.delete_sampler_state = nv30_sampler_state_delete;
- nv30->pipe.set_sampler_textures = nv30_set_sampler_texture;
+ nv30->pipe.set_fragment_sampler_textures = nv30_set_sampler_texture;
nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create;
nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind;
@@ -721,7 +711,6 @@ nv30_init_state_functions(struct nv30_context *nv30)
nv30->pipe.set_scissor_state = nv30_set_scissor_state;
nv30->pipe.set_viewport_state = nv30_set_viewport_state;
- nv30->pipe.set_edgeflags = nv30_set_edgeflags;
nv30->pipe.set_vertex_buffers = nv30_set_vertex_buffers;
nv30->pipe.set_vertex_elements = nv30_set_vertex_elements;
}
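
The set_constant_buffer hook above now receives a plain pipe_buffer instead of a pipe_constant_buffer wrapper, so the driver derives the number of vec4 constants directly from the buffer's byte size. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the patch):

    /* One vec4 constant is 4 floats = 16 bytes, so e.g. a 256-byte
     * buffer holds 16 constants. */
    static unsigned
    constbuf_vec4_count(const struct pipe_buffer *buf)
    {
            return buf->size / (4 * sizeof(float));
    }
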
diff --git a/src/gallium/drivers/nv30/nv30_state.h b/src/gallium/drivers/nv30/nv30_state.h
index e6f23bf1667..e42e872de75 100644
--- a/src/gallium/drivers/nv30/nv30_state.h
+++ b/src/gallium/drivers/nv30/nv30_state.h
@@ -72,6 +72,7 @@ struct nv30_fragment_program {
struct nv30_miptree {
struct pipe_texture base;
+ struct nouveau_bo *bo;
struct pipe_buffer *buffer;
uint total_size;
diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c
index 64cf9ae93a0..c36d58c040c 100644
--- a/src/gallium/drivers/nv30/nv30_state_blend.c
+++ b/src/gallium/drivers/nv30/nv30_state_blend.c
@@ -18,7 +18,7 @@ struct nv30_state_entry nv30_state_blend = {
static boolean
nv30_state_blend_colour_validate(struct nv30_context *nv30)
{
- struct nouveau_stateobj *so = so_new(2, 0);
+ struct nouveau_stateobj *so = so_new(1, 1, 0);
struct pipe_blend_color *bcol = &nv30->blend_colour;
so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1);
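
All of the so_new() calls in this patch move from the old two-argument form to what looks like a three-argument form that counts method headers, data words and relocations separately. The blend-colour object above is the smallest example; a sketch of how the counts line up (the packed colour word is a placeholder):

    struct nouveau_stateobj *so = so_new(1, 1, 0); /* 1 header, 1 data word, 0 relocs */
    so_method(so, rankine, NV34TCL_BLEND_COLOR, 1); /* accounts for the header */
    so_data  (so, 0x00000000);                      /* accounts for the data word */
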
diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c
index 621b8846c8e..ac52d946f02 100644
--- a/src/gallium/drivers/nv30/nv30_state_emit.c
+++ b/src/gallium/drivers/nv30/nv30_state_emit.c
@@ -41,7 +41,7 @@ nv30_state_emit(struct nv30_context *nv30)
struct nouveau_channel *chan = nv30->screen->base.channel;
struct nv30_state *state = &nv30->state;
struct nv30_screen *screen = nv30->screen;
- unsigned i, samplers;
+ unsigned i;
uint64_t states;
if (nv30->pctx_id != screen->cur_pctx) {
@@ -63,6 +63,14 @@ nv30_state_emit(struct nv30_context *nv30)
}
state->dirty = 0;
+}
+
+void
+nv30_state_flush_notify(struct nouveau_channel *chan)
+{
+ struct nv30_context *nv30 = chan->user_private;
+ struct nv30_state *state = &nv30->state;
+ unsigned i, samplers;
so_emit_reloc_markers(chan, state->hw[NV30_STATE_FB]);
for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
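
nv30_state_emit() now stops after flagging the hardware state, and the reloc markers for the framebuffer and fragment textures are re-emitted from the new nv30_state_flush_notify() callback instead. Judging from the matching nv40 hunk later in this patch, the callback is hooked up at context-creation time roughly like this (a sketch, assuming nv30 mirrors nv40):

    /* Let the channel call back into the driver whenever its pushbuf is
     * flushed, so buffer relocations can be re-emitted afterwards. */
    screen->base.channel->user_private = nv30;
    screen->base.channel->flush_notify = nv30_state_flush_notify;
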
diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c
index 6f6d1740d6e..2ed2ea55e84 100644
--- a/src/gallium/drivers/nv30/nv30_state_fb.c
+++ b/src/gallium/drivers/nv30/nv30_state_fb.c
@@ -10,7 +10,7 @@ nv30_state_framebuffer_validate(struct nv30_context *nv30)
struct nv04_surface *rt[2], *zeta = NULL;
uint32_t rt_enable = 0, rt_format = 0;
int i, colour_format = 0, zeta_format = 0, depth_only = 0;
- struct nouveau_stateobj *so = so_new(64, 10);
+ struct nouveau_stateobj *so = so_new(12, 18, 10);
unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
unsigned w = fb->width;
unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c
index 3ac7a8471ea..ba61a9e24a4 100644
--- a/src/gallium/drivers/nv30/nv30_state_scissor.c
+++ b/src/gallium/drivers/nv30/nv30_state_scissor.c
@@ -12,7 +12,7 @@ nv30_state_scissor_validate(struct nv30_context *nv30)
return FALSE;
nv30->state.scissor_enabled = rast->scissor;
- so = so_new(3, 0);
+ so = so_new(1, 2, 0);
so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2);
if (nv30->state.scissor_enabled) {
so_data (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c
index d0c791ac082..ed520a4f439 100644
--- a/src/gallium/drivers/nv30/nv30_state_stipple.c
+++ b/src/gallium/drivers/nv30/nv30_state_stipple.c
@@ -14,14 +14,14 @@ nv30_state_stipple_validate(struct nv30_context *nv30)
if (rast->poly_stipple_enable) {
unsigned i;
- so = so_new(35, 0);
+ so = so_new(2, 33, 0);
so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
so_data (so, 1);
so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);
for (i = 0; i < 32; i++)
so_data(so, nv30->stipple[i]);
} else {
- so = so_new(2, 0);
+ so = so_new(1, 1, 0);
so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
so_data (so, 0);
}
diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c
index c3eb413dac6..2d7781292bd 100644
--- a/src/gallium/drivers/nv30/nv30_state_viewport.c
+++ b/src/gallium/drivers/nv30/nv30_state_viewport.c
@@ -19,7 +19,7 @@ nv30_state_viewport_validate(struct nv30_context *nv30)
return FALSE;
nv30->state.viewport_bypass = bypass;
- so = so_new(11, 0);
+ so = so_new(3, 10, 0);
if (!bypass) {
so_method(so, nv30->screen->rankine,
NV34TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv30/nv30_transfer.c b/src/gallium/drivers/nv30/nv30_transfer.c
index 98011decf7c..65598991c68 100644
--- a/src/gallium/drivers/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nv30/nv30_transfer.c
@@ -1,7 +1,9 @@
#include <pipe/p_state.h>
#include <pipe/p_defines.h>
#include <pipe/p_inlines.h>
+#include <util/u_format.h>
#include <util/u_memory.h>
+#include <util/u_math.h>
#include <nouveau/nouveau_winsys.h>
#include "nv30_context.h"
#include "nv30_screen.h"
@@ -10,22 +12,19 @@
struct nv30_transfer {
struct pipe_transfer base;
struct pipe_surface *surface;
- bool direct;
+ boolean direct;
};
static void
-nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
struct pipe_texture *template)
{
memset(template, 0, sizeof(struct pipe_texture));
template->target = pt->target;
template->format = pt->format;
- template->width[0] = pt->width[level];
- template->height[0] = pt->height[level];
- template->depth[0] = 1;
- template->block = pt->block;
- template->nblocksx[0] = pt->nblocksx[level];
- template->nblocksy[0] = pt->nblocksx[level];
+ template->width0 = width;
+ template->height0 = height;
+ template->depth0 = 1;
template->last_level = 0;
template->nr_samples = pt->nr_samples;
@@ -48,14 +47,10 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
tx->base.x = x;
tx->base.y = y;
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
tx->base.stride = mt->level[level].pitch;
tx->base.usage = usage;
tx->base.face = face;
@@ -76,7 +71,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
tx->direct = false;
- nv30_compatible_transfer_tex(pt, level, &tx_tex_template);
+ nv30_compatible_transfer_tex(pt, w, h, &tx_tex_template);
tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
if (!tx_tex)
@@ -85,6 +80,8 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
}
+ tx->base.stride = ((struct nv30_miptree*)tx_tex)->level[0].pitch;
+
tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
0, 0, 0,
pipe_transfer_buffer_flags(&tx->base));
@@ -110,8 +107,8 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
/* TODO: Check if SIFM can un-swizzle */
nvscreen->eng2d->copy(nvscreen->eng2d,
tx->surface, 0, 0,
- src, 0, 0,
- src->width, src->height);
+ src, x, y,
+ w, h);
pipe_surface_reference(&src, NULL);
}
@@ -131,13 +128,13 @@ nv30_transfer_del(struct pipe_transfer *ptx)
dst = pscreen->get_tex_surface(pscreen, ptx->texture,
ptx->face, ptx->level, ptx->zslice,
- PIPE_BUFFER_USAGE_GPU_WRITE);
+ PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
nvscreen->eng2d->copy(nvscreen->eng2d,
- dst, 0, 0,
+ dst, tx->base.x, tx->base.y,
tx->surface, 0, 0,
- dst->width, dst->height);
+ tx->base.width, tx->base.height);
pipe_surface_reference(&dst, NULL);
}
@@ -156,8 +153,10 @@ nv30_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
void *map = pipe_buffer_map(pscreen, mt->buffer,
pipe_transfer_buffer_flags(ptx));
- return map + ns->base.offset +
- ptx->y * ns->pitch + ptx->x * ptx->block.size;
+ if(!tx->direct)
+ return map + ns->base.offset;
+ else
+ return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
}
static void
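
For non-direct transfers the code above now allocates a staging texture sized to the requested window (w x h) rather than to the whole mip level, takes the transfer stride from that staging miptree, and copies only the (x, y, w, h) rectangle in and out. The map offset also switches from the removed ptx->block.size to util_format_get_blocksize(); a sketch of the address calculation for the direct case (helper name and format are illustrative):

    #include "util/u_format.h"

    /* e.g. a 4-byte-per-texel RGBA8 format gives a blocksize of 4 */
    static void *
    texel_ptr(uint8_t *map, unsigned offset, unsigned pitch,
              unsigned x, unsigned y, enum pipe_format format)
    {
            return map + offset + y * pitch +
                   x * util_format_get_blocksize(format);
    }
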
diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c
index 189656ec817..1c5db03ea24 100644
--- a/src/gallium/drivers/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nv30/nv30_vbo.c
@@ -163,19 +163,21 @@ nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so,
return TRUE;
}
-boolean
+void
nv30_draw_arrays(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count)
{
struct nv30_context *nv30 = nv30_context(pipe);
- struct nouveau_channel *chan = nv30->screen->base.channel;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
unsigned restart = 0;
nv30_vbo_set_idxbuf(nv30, NULL, 0);
if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
mode, start, count);*/
- return FALSE;
+ return;
}
while (count) {
@@ -186,17 +188,17 @@ nv30_draw_arrays(struct pipe_context *pipe,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
mode, start, count, &restart);
if (!vc) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
nr = (vc & 0xff);
if (nr) {
- BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1);
- OUT_RING (((nr - 1) << 24) | start);
+ BEGIN_RING(chan, rankine, NV34TCL_VB_VERTEX_BATCH, 1);
+ OUT_RING (chan, ((nr - 1) << 24) | start);
start += nr;
}
@@ -206,15 +208,15 @@ nv30_draw_arrays(struct pipe_context *pipe,
nr -= push;
- BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push);
+ BEGIN_RING_NI(chan, rankine, NV34TCL_VB_VERTEX_BATCH, push);
while (push--) {
- OUT_RING(((0x100 - 1) << 24) | start);
+ OUT_RING(chan, ((0x100 - 1) << 24) | start);
start += 0x100;
}
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, 0);
count -= vc;
start = restart;
@@ -228,7 +230,9 @@ static INLINE void
nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv30->screen->base.channel;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
while (count) {
uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
if (vc & 1) {
- BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
- OUT_RING (elts[0]);
+ BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, elts[0]);
elts++; vc--;
}
@@ -258,16 +262,16 @@ nv30_draw_elements_u08(struct nv30_context *nv30, void *ib,
push = MIN2(vc, 2047 * 2);
- BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((elts[i+1] << 16) | elts[i]);
+ OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
vc -= push;
elts += push;
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
@@ -277,7 +281,9 @@ static INLINE void
nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv30->screen->base.channel;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
while (count) {
uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
if (vc & 1) {
- BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1);
- OUT_RING (elts[0]);
+ BEGIN_RING(chan, rankine, NV34TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, elts[0]);
elts++; vc--;
}
@@ -307,16 +313,16 @@ nv30_draw_elements_u16(struct nv30_context *nv30, void *ib,
push = MIN2(vc, 2047 * 2);
- BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((elts[i+1] << 16) | elts[i]);
+ OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
vc -= push;
elts += push;
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
@@ -326,7 +332,9 @@ static INLINE void
nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv30->screen->base.channel;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
while (count) {
uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv30_draw_elements_u32(struct nv30_context *nv30, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
while (vc) {
push = MIN2(vc, 2047);
- BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push);
- OUT_RINGp (elts, push);
+ BEGIN_RING_NI(chan, rankine, NV34TCL_VB_ELEMENT_U32, push);
+ OUT_RINGp (chan, elts, push);
vc -= push;
elts += push;
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
}
-static boolean
+static void
nv30_draw_elements_inline(struct pipe_context *pipe,
struct pipe_buffer *ib, unsigned ib_size,
unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv30_draw_elements_inline(struct pipe_context *pipe,
}
pipe_buffer_unmap(pscreen, ib);
- return TRUE;
}
-static boolean
+static void
nv30_draw_elements_vbo(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count)
{
struct nv30_context *nv30 = nv30_context(pipe);
- struct nouveau_channel *chan = nv30->screen->base.channel;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
unsigned restart = 0;
while (count) {
@@ -412,17 +421,17 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
mode, start, count, &restart);
if (!vc) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
nr = (vc & 0xff);
if (nr) {
- BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1);
- OUT_RING (((nr - 1) << 24) | start);
+ BEGIN_RING(chan, rankine, NV34TCL_VB_INDEX_BATCH, 1);
+ OUT_RING (chan, ((nr - 1) << 24) | start);
start += nr;
}
@@ -432,24 +441,22 @@ nv30_draw_elements_vbo(struct pipe_context *pipe,
nr -= push;
- BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push);
+ BEGIN_RING_NI(chan, rankine, NV34TCL_VB_INDEX_BATCH, push);
while (push--) {
- OUT_RING(((0x100 - 1) << 24) | start);
+ OUT_RING(chan, ((0x100 - 1) << 24) | start);
start += 0x100;
}
}
- BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
+ OUT_RING (chan, 0);
count -= vc;
start = restart;
}
-
- return TRUE;
}
-boolean
+void
nv30_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer, unsigned indexSize,
unsigned mode, unsigned start, unsigned count)
@@ -461,7 +468,7 @@ nv30_draw_elements(struct pipe_context *pipe,
if (FORCE_SWTNL || !nv30_state_validate(nv30)) {
/*return nv30_draw_elements_swtnl(pipe, NULL, 0,
mode, start, count);*/
- return FALSE;
+ return;
}
if (idxbuf) {
@@ -472,7 +479,6 @@ nv30_draw_elements(struct pipe_context *pipe,
}
pipe->flush(pipe, 0, NULL);
- return TRUE;
}
static boolean
@@ -485,14 +491,9 @@ nv30_vbo_validate(struct nv30_context *nv30)
unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
int hw;
- if (nv30->edgeflags) {
- /*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/
- return FALSE;
- }
-
- vtxbuf = so_new(20, 18);
+ vtxbuf = so_new(3, 17, 18);
so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr);
- vtxfmt = so_new(17, 0);
+ vtxfmt = so_new(1, 16, 0);
so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr);
for (hw = 0; hw < nv30->vtxelt_nr; hw++) {
@@ -505,7 +506,7 @@ nv30_vbo_validate(struct nv30_context *nv30)
if (!vb->stride) {
if (!sattr)
- sattr = so_new(16 * 5, 0);
+ sattr = so_new(16, 16 * 4, 0);
if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) {
so_data(vtxbuf, 0);
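
Every push-buffer macro in this file now names the channel explicitly instead of relying on the implicit push context header removed elsewhere in this patch. The emission pattern, reduced to its core (a sketch using the same macros as the hunks above):

    struct nouveau_channel *chan    = screen->base.channel;
    struct nouveau_grobj   *rankine = screen->rankine;

    BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
    OUT_RING  (chan, nvgl_primitive(mode));
    /* ... vertex or index batches ... */
    BEGIN_RING(chan, rankine, NV34TCL_VERTEX_BEGIN_END, 1);
    OUT_RING  (chan, 0);
    FIRE_RING (chan);   /* flush explicitly when the pushbuf runs short */
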
diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c
index 14a5c0260d0..e77a5be3f23 100644
--- a/src/gallium/drivers/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nv30/nv30_vertprog.c
@@ -253,32 +253,32 @@ static INLINE struct nv30_sreg
tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
struct nv30_sreg src;
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index);
+ src = nv30_sr(NV30SR_INPUT, fsrc->Register.Index);
break;
case TGSI_FILE_CONSTANT:
- src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+ src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
break;
case TGSI_FILE_IMMEDIATE:
- src = vpc->imm[fsrc->SrcRegister.Index];
+ src = vpc->imm[fsrc->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- if (vpc->high_temp < fsrc->SrcRegister.Index)
- vpc->high_temp = fsrc->SrcRegister.Index;
- src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index);
+ if (vpc->high_temp < fsrc->Register.Index)
+ vpc->high_temp = fsrc->Register.Index;
+ src = nv30_sr(NV30SR_TEMP, fsrc->Register.Index);
break;
default:
NOUVEAU_ERR("bad src file\n");
break;
}
- src.abs = fsrc->SrcRegisterExtMod.Absolute;
- src.negate = fsrc->SrcRegister.Negate;
- src.swz[0] = fsrc->SrcRegister.SwizzleX;
- src.swz[1] = fsrc->SrcRegister.SwizzleY;
- src.swz[2] = fsrc->SrcRegister.SwizzleZ;
- src.swz[3] = fsrc->SrcRegister.SwizzleW;
+ src.abs = fsrc->Register.Absolute;
+ src.negate = fsrc->Register.Negate;
+ src.swz[0] = fsrc->Register.SwizzleX;
+ src.swz[1] = fsrc->Register.SwizzleY;
+ src.swz[2] = fsrc->Register.SwizzleZ;
+ src.swz[3] = fsrc->Register.SwizzleW;
return src;
}
@@ -286,14 +286,14 @@ static INLINE struct nv30_sreg
tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
struct nv30_sreg dst;
- switch (fdst->DstRegister.File) {
+ switch (fdst->Register.File) {
case TGSI_FILE_OUTPUT:
dst = nv30_sr(NV30SR_OUTPUT,
- vpc->output_map[fdst->DstRegister.Index]);
+ vpc->output_map[fdst->Register.Index]);
break;
case TGSI_FILE_TEMPORARY:
- dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index);
+ dst = nv30_sr(NV30SR_TEMP, fdst->Register.Index);
if (vpc->high_temp < dst.index)
vpc->high_temp = dst.index;
break;
@@ -334,8 +334,8 @@ nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
src[i] = tgsi_src(vpc, fsrc);
}
}
@@ -343,11 +343,11 @@ nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- switch (fsrc->SrcRegister.File) {
+ fsrc = &finst->Src[i];
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- if (ai == -1 || ai == fsrc->SrcRegister.Index) {
- ai = fsrc->SrcRegister.Index;
+ if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -360,8 +360,8 @@ nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
*/
case TGSI_FILE_CONSTANT:
case TGSI_FILE_IMMEDIATE:
- if (ci == -1 || ci == fsrc->SrcRegister.Index) {
- ci = fsrc->SrcRegister.Index;
+ if (ci == -1 || ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -378,8 +378,8 @@ nv30_vertprog_parse_instruction(struct nv30_vpc *vpc,
}
}
- dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
- mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+ dst = tgsi_dst(vpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
switch (finst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
@@ -490,15 +490,15 @@ nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc,
{
int hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = NV30_VP_INST_DEST_POS;
break;
case TGSI_SEMANTIC_COLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV30_VP_INST_DEST_COL0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV30_VP_INST_DEST_COL1;
} else {
NOUVEAU_ERR("bad colour semantic index\n");
@@ -506,10 +506,10 @@ nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc,
}
break;
case TGSI_SEMANTIC_BCOLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV30_VP_INST_DEST_BFC0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV30_VP_INST_DEST_BFC1;
} else {
NOUVEAU_ERR("bad bcolour semantic index\n");
@@ -523,19 +523,22 @@ nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc,
hw = NV30_VP_INST_DEST_PSZ;
break;
case TGSI_SEMANTIC_GENERIC:
- if (fdec->Semantic.SemanticIndex <= 7) {
- hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+ if (fdec->Semantic.Index <= 7) {
+ hw = NV30_VP_INST_DEST_TC(fdec->Semantic.Index);
} else {
NOUVEAU_ERR("bad generic semantic index\n");
return FALSE;
}
break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ NOUVEAU_ERR("cannot handle edgeflag output\n");
+ return FALSE;
default:
NOUVEAU_ERR("bad output semantic\n");
return FALSE;
}
- vpc->output_map[fdec->DeclarationRange.First] = hw;
+ vpc->output_map[fdec->Range.First] = hw;
return TRUE;
}
@@ -647,7 +650,9 @@ static boolean
nv30_vertprog_validate(struct nv30_context *nv30)
{
struct pipe_screen *pscreen = nv30->pipe.screen;
- struct nouveau_grobj *rankine = nv30->screen->rankine;
+ struct nv30_screen *screen = nv30->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *rankine = screen->rankine;
struct nv30_vertex_program *vp;
struct pipe_buffer *constbuf;
boolean upload_code = FALSE, upload_data = FALSE;
@@ -681,7 +686,7 @@ nv30_vertprog_validate(struct nv30_context *nv30)
assert(0);
}
- so = so_new(2, 0);
+ so = so_new(1, 1, 0);
so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1);
so_data (so, vp->exec->start);
so_ref(so, &vp->so);
@@ -767,9 +772,9 @@ nv30_vertprog_validate(struct nv30_context *nv30)
4 * sizeof(float));
}
- BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
- OUT_RING (i + vp->data->start);
- OUT_RINGp ((uint32_t *)vpd->value, 4);
+ BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+ OUT_RING (chan, i + vp->data->start);
+ OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
}
if (constbuf)
@@ -785,11 +790,11 @@ nv30_vertprog_validate(struct nv30_context *nv30)
vp->insns[i].data[2], vp->insns[i].data[3]);
}
#endif
- BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
- OUT_RING (vp->exec->start);
+ BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+ OUT_RING (chan, vp->exec->start);
for (i = 0; i < vp->nr_insns; i++) {
- BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
- OUT_RINGp (vp->insns[i].data, 4);
+ BEGIN_RING(chan, rankine, NV34TCL_VP_UPLOAD_INST(0), 4);
+ OUT_RINGp (chan, vp->insns[i].data, 4);
}
}
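
The vertex-program translator is otherwise unchanged; the churn above is the TGSI interface rename that this patch tracks throughout the nv30/nv40 compilers. A vertex output declared TGSI_SEMANTIC_EDGEFLAG is also now rejected explicitly, matching the removal of the set_edgeflags() hook earlier in the patch. The field mapping, as it appears in these hunks:

    /* old                                 ->  new
     * finst->FullSrcRegisters[i]          ->  finst->Src[i]
     * finst->FullDstRegisters[0]          ->  finst->Dst[0]
     * fsrc->SrcRegister.File / .Index     ->  fsrc->Register.File / .Index
     * fsrc->SrcRegisterExtMod.Absolute    ->  fsrc->Register.Absolute
     * fdec->Semantic.SemanticName/Index   ->  fdec->Semantic.Name / .Index
     * fdec->DeclarationRange.First        ->  fdec->Range.First
     */
    const struct tgsi_full_src_register *fsrc = &finst->Src[0];
    unsigned file  = fsrc->Register.File;
    unsigned index = fsrc->Register.Index;
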
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index 7f008274a4e..f79ae4db84e 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -10,21 +10,32 @@ nv40_flush(struct pipe_context *pipe, unsigned flags,
struct pipe_fence_handle **fence)
{
struct nv40_context *nv40 = nv40_context(pipe);
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
- BEGIN_RING(curie, 0x1fd8, 1);
- OUT_RING (2);
- BEGIN_RING(curie, 0x1fd8, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, curie, 0x1fd8, 1);
+ OUT_RING (chan, 2);
+ BEGIN_RING(chan, curie, 0x1fd8, 1);
+ OUT_RING (chan, 1);
}
- FIRE_RING(fence);
+ FIRE_RING(chan);
+ if (fence)
+ *fence = NULL;
}
static void
nv40_destroy(struct pipe_context *pipe)
{
struct nv40_context *nv40 = nv40_context(pipe);
+ unsigned i;
+
+ for (i = 0; i < NV40_STATE_MAX; i++) {
+ if (nv40->state.hw[i])
+ so_ref(NULL, &nv40->state.hw[i]);
+ }
if (nv40->draw)
draw_destroy(nv40->draw);
@@ -58,6 +69,9 @@ nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv40->pipe.is_texture_referenced = nouveau_is_texture_referenced;
nv40->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
+ screen->base.channel->user_private = nv40;
+ screen->base.channel->flush_notify = nv40_state_flush_notify;
+
nv40_init_query_functions(nv40);
nv40_init_surface_functions(nv40);
nv40_init_state_functions(nv40);
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
index a3d594167aa..e219bb537ac 100644
--- a/src/gallium/drivers/nv40/nv40_context.h
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -14,10 +14,6 @@
#include "nouveau/nouveau_winsys.h"
#include "nouveau/nouveau_gldefs.h"
#include "nouveau/nouveau_context.h"
-
-#define NOUVEAU_PUSH_CONTEXT(ctx) \
- struct nv40_screen *ctx = nv40->screen
-#include "nouveau/nouveau_push.h"
#include "nouveau/nouveau_stateobj.h"
#include "nv40_state.h"
@@ -159,7 +155,6 @@ struct nv40_context {
unsigned vtxbuf_nr;
struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
unsigned vtxelt_nr;
- const unsigned *edgeflags;
};
static INLINE struct nv40_context *
@@ -184,7 +179,7 @@ extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
/* nv40_draw.c */
extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
-extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+extern void nv40_draw_elements_swtnl(struct pipe_context *pipe,
struct pipe_buffer *idxbuf,
unsigned ib_size, unsigned mode,
unsigned start, unsigned count);
@@ -204,6 +199,7 @@ extern void nv40_fragtex_bind(struct nv40_context *);
extern boolean nv40_state_validate(struct nv40_context *nv40);
extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40);
extern void nv40_state_emit(struct nv40_context *nv40);
+extern void nv40_state_flush_notify(struct nouveau_channel *chan);
extern struct nv40_state_entry nv40_state_rasterizer;
extern struct nv40_state_entry nv40_state_scissor;
extern struct nv40_state_entry nv40_state_stipple;
@@ -219,9 +215,9 @@ extern struct nv40_state_entry nv40_state_vbo;
extern struct nv40_state_entry nv40_state_vtxfmt;
/* nv40_vbo.c */
-extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv40_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv40_draw_elements(struct pipe_context *pipe,
+extern void nv40_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start,
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index b2f19ecb699..d826f8c2f5f 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -31,6 +31,9 @@ nv40_render_stage(struct draw_stage *stage)
static INLINE void
nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
{
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
unsigned i;
for (i = 0; i < nv40->swtnl.nr_attribs; i++) {
@@ -41,30 +44,30 @@ nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
case EMIT_OMIT:
break;
case EMIT_1F:
- BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1);
- OUT_RING (fui(v->data[idx][0]));
+ BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_1F(hw), 1);
+ OUT_RING (chan, fui(v->data[idx][0]));
break;
case EMIT_2F:
- BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
- OUT_RING (fui(v->data[idx][0]));
- OUT_RING (fui(v->data[idx][1]));
+ BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
+ OUT_RING (chan, fui(v->data[idx][0]));
+ OUT_RING (chan, fui(v->data[idx][1]));
break;
case EMIT_3F:
- BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
- OUT_RING (fui(v->data[idx][0]));
- OUT_RING (fui(v->data[idx][1]));
- OUT_RING (fui(v->data[idx][2]));
+ BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
+ OUT_RING (chan, fui(v->data[idx][0]));
+ OUT_RING (chan, fui(v->data[idx][1]));
+ OUT_RING (chan, fui(v->data[idx][2]));
break;
case EMIT_4F:
- BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
- OUT_RING (fui(v->data[idx][0]));
- OUT_RING (fui(v->data[idx][1]));
- OUT_RING (fui(v->data[idx][2]));
- OUT_RING (fui(v->data[idx][3]));
+ BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
+ OUT_RING (chan, fui(v->data[idx][0]));
+ OUT_RING (chan, fui(v->data[idx][1]));
+ OUT_RING (chan, fui(v->data[idx][2]));
+ OUT_RING (chan, fui(v->data[idx][3]));
break;
case EMIT_4UB:
- BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
- OUT_RING (pack_ub4(float_to_ubyte(v->data[idx][0]),
+ BEGIN_RING(chan, curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
+ OUT_RING (chan, pack_ub4(float_to_ubyte(v->data[idx][0]),
float_to_ubyte(v->data[idx][1]),
float_to_ubyte(v->data[idx][2]),
float_to_ubyte(v->data[idx][3])));
@@ -82,7 +85,11 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
{
struct nv40_render_stage *rs = nv40_render_stage(stage);
struct nv40_context *nv40 = rs->nv40;
- struct nouveau_pushbuf *pb = nv40->screen->base.channel->pushbuf;
+
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_pushbuf *pb = chan->pushbuf;
+ struct nouveau_grobj *curie = screen->curie;
unsigned i;
/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
@@ -91,19 +98,19 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
NOUVEAU_ERR("AIII, missed flush\n");
assert(0);
}
- FIRE_RING(NULL);
+ FIRE_RING(chan);
nv40_state_emit(nv40);
}
/* Switch primitive modes if necessary */
if (rs->prim != mode) {
if (rs->prim != NV40TCL_BEGIN_END_STOP) {
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (NV40TCL_BEGIN_END_STOP);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, NV40TCL_BEGIN_END_STOP);
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (mode);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, mode);
rs->prim = mode;
}
@@ -115,8 +122,8 @@ nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
* off the primitive now.
*/
if (pb->remaining < ((count * 20) + 6)) {
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (NV40TCL_BEGIN_END_STOP);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, NV40TCL_BEGIN_END_STOP);
rs->prim = NV40TCL_BEGIN_END_STOP;
}
}
@@ -144,10 +151,13 @@ nv40_render_flush(struct draw_stage *draw, unsigned flags)
{
struct nv40_render_stage *rs = nv40_render_stage(draw);
struct nv40_context *nv40 = rs->nv40;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
if (rs->prim != NV40TCL_BEGIN_END_STOP) {
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (NV40TCL_BEGIN_END_STOP);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, NV40TCL_BEGIN_END_STOP);
rs->prim = NV40TCL_BEGIN_END_STOP;
}
}
@@ -226,7 +236,7 @@ nv40_draw_render_stage(struct nv40_context *nv40)
return &render->stage;
}
-boolean
+void
nv40_draw_elements_swtnl(struct pipe_context *pipe,
struct pipe_buffer *idxbuf, unsigned idxbuf_size,
unsigned mode, unsigned start, unsigned count)
@@ -237,7 +247,7 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
void *map;
if (!nv40_state_validate_swtnl(nv40))
- return FALSE;
+ return;
nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF);
nv40_state_emit(nv40);
@@ -261,7 +271,8 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
map = pipe_buffer_map(pscreen,
nv40->constbuf[PIPE_SHADER_VERTEX],
PIPE_BUFFER_USAGE_CPU_READ);
- draw_set_mapped_constant_buffer(nv40->draw, map, nr);
+ draw_set_mapped_constant_buffer(nv40->draw, PIPE_SHADER_VERTEX,
+ map, nr);
}
draw_arrays(nv40->draw, mode, start, count);
@@ -277,15 +288,13 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
draw_flush(nv40->draw);
pipe->flush(pipe, 0, NULL);
-
- return TRUE;
}
static INLINE void
emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit,
unsigned semantic, unsigned index)
{
- unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index);
+ unsigned draw_out = draw_find_shader_output(nv40->draw, semantic, index);
unsigned a = nv40->swtnl.nr_attribs++;
nv40->swtnl.hw[a] = hw;
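
The software-TnL path tracks two draw-module interface changes visible above: the constant-buffer mapping is now per shader stage, and the output lookup helper was generalised from vertex shaders to any shader. A sketch of the updated calls (the semantic/index values are illustrative):

    /* map constants for the vertex stage only */
    draw_set_mapped_constant_buffer(nv40->draw, PIPE_SHADER_VERTEX, map, nr);

    /* look up which draw output slot carries a given semantic */
    unsigned slot = draw_find_shader_output(nv40->draw, TGSI_SEMANTIC_COLOR, 0);
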
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index 99277506fc2..1237066c398 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -149,7 +149,7 @@ emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src)
sizeof(uint32_t) * 4);
}
- sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);
+ sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);
break;
case NV40SR_NONE:
sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
@@ -255,50 +255,50 @@ tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc)
{
struct nv40_sreg src;
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
src = nv40_sr(NV40SR_INPUT,
- fpc->attrib_map[fsrc->SrcRegister.Index]);
+ fpc->attrib_map[fsrc->Register.Index]);
break;
case TGSI_FILE_CONSTANT:
- src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+ src = constant(fpc, fsrc->Register.Index, NULL);
break;
case TGSI_FILE_IMMEDIATE:
- assert(fsrc->SrcRegister.Index < fpc->nr_imm);
- src = fpc->imm[fsrc->SrcRegister.Index];
+ assert(fsrc->Register.Index < fpc->nr_imm);
+ src = fpc->imm[fsrc->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- src = fpc->r_temp[fsrc->SrcRegister.Index];
+ src = fpc->r_temp[fsrc->Register.Index];
break;
/* NV40 fragprog result regs are just temps, so this is simple */
case TGSI_FILE_OUTPUT:
- src = fpc->r_result[fsrc->SrcRegister.Index];
+ src = fpc->r_result[fsrc->Register.Index];
break;
default:
NOUVEAU_ERR("bad src file\n");
break;
}
- src.abs = fsrc->SrcRegisterExtMod.Absolute;
- src.negate = fsrc->SrcRegister.Negate;
- src.swz[0] = fsrc->SrcRegister.SwizzleX;
- src.swz[1] = fsrc->SrcRegister.SwizzleY;
- src.swz[2] = fsrc->SrcRegister.SwizzleZ;
- src.swz[3] = fsrc->SrcRegister.SwizzleW;
+ src.abs = fsrc->Register.Absolute;
+ src.negate = fsrc->Register.Negate;
+ src.swz[0] = fsrc->Register.SwizzleX;
+ src.swz[1] = fsrc->Register.SwizzleY;
+ src.swz[2] = fsrc->Register.SwizzleZ;
+ src.swz[3] = fsrc->Register.SwizzleW;
return src;
}
static INLINE struct nv40_sreg
tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
- switch (fdst->DstRegister.File) {
+ switch (fdst->Register.File) {
case TGSI_FILE_OUTPUT:
- return fpc->r_result[fdst->DstRegister.Index];
+ return fpc->r_result[fdst->Register.Index];
case TGSI_FILE_TEMPORARY:
- return fpc->r_temp[fdst->DstRegister.Index];
+ return fpc->r_temp[fdst->Register.Index];
case TGSI_FILE_NULL:
return nv40_sr(NV40SR_NONE, 0);
default:
- NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+ NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
return nv40_sr(NV40SR_NONE, 0);
}
}
@@ -364,8 +364,8 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
src[i] = tgsi_src(fpc, fsrc);
}
}
@@ -373,9 +373,9 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
+ fsrc = &finst->Src[i];
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_TEMPORARY:
@@ -386,10 +386,10 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
break;
}
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- if (ai == -1 || ai == fsrc->SrcRegister.Index) {
- ai = fsrc->SrcRegister.Index;
+ if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
src[i] = tgsi_src(fpc, fsrc);
} else {
src[i] = temp(fpc);
@@ -399,8 +399,8 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
break;
case TGSI_FILE_CONSTANT:
if ((ci == -1 && ii == -1) ||
- ci == fsrc->SrcRegister.Index) {
- ci = fsrc->SrcRegister.Index;
+ ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
src[i] = tgsi_src(fpc, fsrc);
} else {
src[i] = temp(fpc);
@@ -410,8 +410,8 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
break;
case TGSI_FILE_IMMEDIATE:
if ((ci == -1 && ii == -1) ||
- ii == fsrc->SrcRegister.Index) {
- ii = fsrc->SrcRegister.Index;
+ ii == fsrc->Register.Index) {
+ ii = fsrc->Register.Index;
src[i] = tgsi_src(fpc, fsrc);
} else {
src[i] = temp(fpc);
@@ -423,7 +423,7 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
/* handled above */
break;
case TGSI_FILE_SAMPLER:
- unit = fsrc->SrcRegister.Index;
+ unit = fsrc->Register.Index;
break;
case TGSI_FILE_OUTPUT:
break;
@@ -433,8 +433,8 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
}
}
- dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
- mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+ dst = tgsi_dst(fpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
switch (finst->Instruction.Opcode) {
@@ -445,10 +445,11 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
break;
case TGSI_OPCODE_CMP:
- tmp = temp(fpc);
- arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+ tmp = nv40_sr(NV40SR_NONE, 0);
tmp.cc_update = 1;
arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+ dst.cc_test = NV40_VP_INST_COND_GE;
+ arith(fpc, sat, MOV, dst, mask, src[2], none, none);
dst.cc_test = NV40_VP_INST_COND_LT;
arith(fpc, sat, MOV, dst, mask, src[1], none, none);
break;
@@ -573,13 +574,28 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
neg(swz(tmp, X, X, X, X)), none, none);
break;
case TGSI_OPCODE_SCS:
- if (mask & MASK_X) {
- arith(fpc, sat, COS, dst, MASK_X,
- swz(src[0], X, X, X, X), none, none);
+ /* avoid overwriting the source */
+ if(src[0].swz[SWZ_X] != SWZ_X)
+ {
+ if (mask & MASK_X) {
+ arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+ }
+ if (mask & MASK_Y) {
+ arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+ }
}
- if (mask & MASK_Y) {
- arith(fpc, sat, SIN, dst, MASK_Y,
- swz(src[0], X, X, X, X), none, none);
+ else
+ {
+ if (mask & MASK_Y) {
+ arith(fpc, sat, SIN, dst, MASK_Y,
+ swz(src[0], X, X, X, X), none, none);
+ }
+ if (mask & MASK_X) {
+ arith(fpc, sat, COS, dst, MASK_X,
+ swz(src[0], X, X, X, X), none, none);
+ }
}
break;
case TGSI_OPCODE_SEQ:
@@ -644,15 +660,15 @@ nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
{
int hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = NV40_FP_OP_INPUT_SRC_POSITION;
break;
case TGSI_SEMANTIC_COLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV40_FP_OP_INPUT_SRC_COL0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV40_FP_OP_INPUT_SRC_COL1;
} else {
NOUVEAU_ERR("bad colour semantic index\n");
@@ -663,9 +679,9 @@ nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
hw = NV40_FP_OP_INPUT_SRC_FOGC;
break;
case TGSI_SEMANTIC_GENERIC:
- if (fdec->Semantic.SemanticIndex <= 7) {
+ if (fdec->Semantic.Index <= 7) {
hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic.
- SemanticIndex);
+ Index);
} else {
NOUVEAU_ERR("bad generic semantic index\n");
return FALSE;
@@ -676,7 +692,7 @@ nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
return FALSE;
}
- fpc->attrib_map[fdec->DeclarationRange.First] = hw;
+ fpc->attrib_map[fdec->Range.First] = hw;
return TRUE;
}
@@ -684,15 +700,15 @@ static boolean
nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc,
const struct tgsi_full_declaration *fdec)
{
- unsigned idx = fdec->DeclarationRange.First;
+ unsigned idx = fdec->Range.First;
unsigned hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = 1;
break;
case TGSI_SEMANTIC_COLOR:
- switch (fdec->Semantic.SemanticIndex) {
+ switch (fdec->Semantic.Index) {
case 0: hw = 0; break;
case 1: hw = 2; break;
case 2: hw = 3; break;
@@ -738,9 +754,9 @@ nv40_fragprog_prepare(struct nv40_fpc *fpc)
goto out_err;
break;
case TGSI_FILE_TEMPORARY:
- if (fdec->DeclarationRange.Last > high_temp) {
+ if (fdec->Range.Last > high_temp) {
high_temp =
- fdec->DeclarationRange.Last;
+ fdec->Range.Last;
}
break;
default:
@@ -752,7 +768,7 @@ nv40_fragprog_prepare(struct nv40_fpc *fpc)
{
struct tgsi_full_immediate *imm;
float vals[4];
-
+
imm = &p.FullToken.FullImmediate;
assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
assert(fpc->nr_imm < MAX_IMM);
@@ -836,7 +852,7 @@ nv40_fragprog_translate(struct nv40_context *nv40,
fp->insn[fpc->inst_offset + 1] = 0x00000000;
fp->insn[fpc->inst_offset + 2] = 0x00000000;
fp->insn[fpc->inst_offset + 3] = 0x00000000;
-
+
fp->translated = TRUE;
out_err:
tgsi_parse_free(&parse);
@@ -903,7 +919,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
nv40_fragprog_upload(nv40, fp);
- so = so_new(4, 1);
+ so = so_new(2, 2, 1);
so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
@@ -917,7 +933,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
update_constants:
if (fp->nr_consts) {
float *map;
-
+
map = pipe_buffer_map(pscreen, constbuf,
PIPE_BUFFER_USAGE_CPU_READ);
for (i = 0; i < fp->nr_consts; i++) {
@@ -948,6 +964,12 @@ void
nv40_fragprog_destroy(struct nv40_context *nv40,
struct nv40_fragment_program *fp)
{
+ if (fp->buffer)
+ pipe_buffer_reference(&fp->buffer, NULL);
+
+ if (fp->so)
+ so_ref(NULL, &fp->so);
+
if (fp->insn_len)
FREE(fp->insn);
}
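
Two translator fixes above are worth calling out. CMP no longer writes dst from src2 before testing src0 (which could clobber a source that aliases dst, and wasted a temp); the condition code is now loaded from src0 once and both moves are predicated on it. SCS similarly orders its COS/SIN writes so that the component still needed as input is written last. The per-component behaviour the new CMP expansion implements, as a plain C model (not driver code):

    /* TGSI CMP: dst = (src0 < 0) ? src1 : src2, per component */
    static float
    tgsi_cmp_component(float s0, float s1, float s2)
    {
            return (s0 < 0.0f) ? s1 : s2;
    }
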
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
index e2ec57564d1..aad9198210f 100644
--- a/src/gallium/drivers/nv40/nv40_fragtex.c
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -108,7 +108,7 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
txs = tf->swizzle;
- so = so_new(16, 2);
+ so = so_new(2, 9, 2);
so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
@@ -117,11 +117,11 @@ nv40_fragtex_build(struct nv40_context *nv40, int unit)
so_data (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
so_data (so, txs);
so_data (so, ps->filt | tf->sign | 0x2000 /*voodoo*/);
- so_data (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
- pt->height[0]);
+ so_data (so, (pt->width0 << NV40TCL_TEX_SIZE0_W_SHIFT) |
+ pt->height0);
so_data (so, ps->bcol);
so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1);
- so_data (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+ so_data (so, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
return so;
}
@@ -139,7 +139,7 @@ nv40_fragtex_validate(struct nv40_context *nv40)
unit = ffs(samplers) - 1;
samplers &= ~(1 << unit);
- so = so_new(2, 0);
+ so = so_new(1, 1, 0);
so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1);
so_data (so, 0);
so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c
index 465dd3b0693..89bd155ff49 100644
--- a/src/gallium/drivers/nv40/nv40_miptree.c
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -1,14 +1,19 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
#include "nv40_context.h"
+#include "../nv04/nv04_surface_2d.h"
+
+
static void
nv40_miptree_layout(struct nv40_miptree *mt)
{
struct pipe_texture *pt = &mt->base;
- uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+ uint width = pt->width0;
uint offset = 0;
int nr_faces, l, f;
uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER |
@@ -21,29 +26,21 @@ nv40_miptree_layout(struct nv40_miptree *mt)
nr_faces = 6;
} else
if (pt->target == PIPE_TEXTURE_3D) {
- nr_faces = pt->depth[0];
+ nr_faces = pt->depth0;
} else {
nr_faces = 1;
}
for (l = 0; l <= pt->last_level; l++) {
- pt->width[l] = width;
- pt->height[l] = height;
- pt->depth[l] = depth;
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
-
if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR))
- mt->level[l].pitch = align(pt->width[0] * pt->block.size, 64);
+ mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
else
- mt->level[l].pitch = pt->width[l] * pt->block.size;
+ mt->level[l].pitch = util_format_get_stride(pt->format, width);
mt->level[l].image_offset =
CALLOC(nr_faces, sizeof(unsigned));
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
- depth = MAX2(1, depth >> 1);
+ width = u_minify(width, 1);
}
for (f = 0; f < nr_faces; f++) {
@@ -51,14 +48,14 @@ nv40_miptree_layout(struct nv40_miptree *mt)
mt->level[l].image_offset[f] = offset;
if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) &&
- pt->width[l + 1] > 1 && pt->height[l + 1] > 1)
- offset += align(mt->level[l].pitch * pt->height[l], 64);
+ u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+ offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
else
- offset += mt->level[l].pitch * pt->height[l];
+ offset += mt->level[l].pitch * u_minify(pt->height0, l);
}
mt->level[l].image_offset[f] = offset;
- offset += mt->level[l].pitch * pt->height[l];
+ offset += mt->level[l].pitch * u_minify(pt->height0, l);
}
mt->total_size = offset;
@@ -79,8 +76,8 @@ nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
mt->base.screen = pscreen;
/* Swizzled textures must be POT */
- if (pt->width[0] & (pt->width[0] - 1) ||
- pt->height[0] & (pt->height[0] - 1))
+ if (pt->width0 & (pt->width0 - 1) ||
+ pt->height0 & (pt->height0 - 1))
mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
else
if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY |
@@ -109,6 +106,12 @@ nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+ /* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+ * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+ * This also happens for small mipmaps of large textures. */
+ if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+ mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
nv40_miptree_layout(mt);
mt->buffer = pscreen->buffer_create(pscreen, 256, buf_usage, mt->total_size);
@@ -116,7 +119,7 @@ nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
FREE(mt);
return NULL;
}
-
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -128,7 +131,7 @@ nv40_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv40_miptree);
@@ -145,6 +148,7 @@ nv40_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
pipe_buffer_reference(&mt->buffer, pb);
+ mt->bo = nouveau_bo(mt->buffer);
return &mt->base;
}
@@ -176,8 +180,8 @@ nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&ns->base.texture, pt);
ns->base.format = pt->format;
- ns->base.width = pt->width[level];
- ns->base.height = pt->height[level];
+ ns->base.width = u_minify(pt->width0, level);
+ ns->base.height = u_minify(pt->height0, level);
ns->base.usage = flags;
pipe_reference_init(&ns->base.reference, 1);
ns->base.face = face;
@@ -194,12 +198,27 @@ nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
ns->base.offset = mt->level[level].image_offset[0];
}
+ /* Create a linear temporary that we can render into if necessary.
+ * Note that ns->pitch is always a multiple of 64 for linear surfaces and
+ * swizzled surfaces are POT, so (ns->pitch & 63) is equivalent to
+ * (ns->pitch < 64 && swizzled). */
+ if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+ return &nv04_surface_wrap_for_render(pscreen, ((struct nv40_screen*)pscreen)->eng2d, ns)->base;
+
return &ns->base;
}
static void
nv40_miptree_surface_del(struct pipe_surface *ps)
{
+ struct nv04_surface* ns = (struct nv04_surface*)ps;
+ if(ns->backing)
+ {
+ struct nv40_screen* screen = (struct nv40_screen*)ps->texture->screen;
+ if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+ screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+ nv40_miptree_surface_del(&ns->backing->base);
+ }
+
pipe_texture_reference(&ps->texture, NULL);
FREE(ps);
}
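
The miptree code above drops the per-level width/height/nblocks arrays from pipe_texture: level sizes are recomputed with u_minify() and pitches with util_format_get_stride(). It also forces small render targets linear, since swizzled surfaces narrower than 64 bytes apparently cannot be rendered to, and wraps such surfaces in a linear backing surface that is copied back when the surface is destroyed. A sketch of the per-level arithmetic for the linear case:

    unsigned level_width  = u_minify(pt->width0, level);   /* max(1, w >> level) */
    unsigned level_height = u_minify(pt->height0, level);
    unsigned pitch        = util_format_get_stride(pt->format, level_width);
    unsigned level_bytes  = pitch * level_height;
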
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
index 7874aedd428..8ed4a67dd03 100644
--- a/src/gallium/drivers/nv40/nv40_query.c
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -41,6 +41,9 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_query *q = nv40_query(pq);
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
@@ -57,10 +60,10 @@ nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
assert(0);
nouveau_notifier_reset(nv40->screen->query, q->object->start);
- BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
- OUT_RING (1);
- BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, curie, NV40TCL_QUERY_RESET, 1);
+ OUT_RING (chan, 1);
+ BEGIN_RING(chan, curie, NV40TCL_QUERY_UNK17CC, 1);
+ OUT_RING (chan, 1);
q->ready = FALSE;
}
@@ -70,11 +73,14 @@ nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq)
{
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_query *q = nv40_query(pq);
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
- BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
- OUT_RING ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+ BEGIN_RING(chan, curie, NV40TCL_QUERY_GET, 1);
+ OUT_RING (chan, (0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
- FIRE_RING(NULL);
+ FIRE_RING(chan);
}
static boolean
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index bd13dfddd1c..9e55e5a089c 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -140,6 +140,12 @@ static void
nv40_screen_destroy(struct pipe_screen *pscreen)
{
struct nv40_screen *screen = nv40_screen(pscreen);
+ unsigned i;
+
+ for (i = 0; i < NV40_STATE_MAX; i++) {
+ if (screen->state[i])
+ so_ref(NULL, &screen->state[i]);
+ }
nouveau_resource_free(&screen->vp_exec_heap);
nouveau_resource_free(&screen->vp_data_heap);
@@ -147,6 +153,7 @@ nv40_screen_destroy(struct pipe_screen *pscreen)
nouveau_notifier_free(&screen->query);
nouveau_notifier_free(&screen->sync);
nouveau_grobj_free(&screen->curie);
+ nv04_surface_2d_takedown(&screen->eng2d);
nouveau_screen_fini(&screen->base);
@@ -208,7 +215,6 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
return FALSE;
}
- BIND_RING(chan, screen->curie, 7);
/* 2D engine setup */
screen->eng2d = nv04_surface_2d_init(&screen->base);
@@ -245,7 +251,7 @@ nv40_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
}
/* Static curie initialisation */
- so = so_new(128, 0);
+ so = so_new(16, 25, 0);
so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
so_data (so, screen->sync->handle);
so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
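
Screen teardown (and the matching context destroy earlier in the patch) now drops every cached state object and takes down the 2D engine. so_ref() appears to follow the usual ref(new, &cur) convention, so passing NULL is the release idiom; a sketch of the pattern added to the destroy paths:

    /* drop the reference held through screen->state[i] and clear it */
    if (screen->state[i])
            so_ref(NULL, &screen->state[i]);
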
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
index c3ee4d23453..7d990f7d567 100644
--- a/src/gallium/drivers/nv40/nv40_state.c
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -16,7 +16,7 @@ nv40_blend_state_create(struct pipe_context *pipe,
struct nv40_context *nv40 = nv40_context(pipe);
struct nouveau_grobj *curie = nv40->screen->curie;
struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso));
- struct nouveau_stateobj *so = so_new(16, 0);
+ struct nouveau_stateobj *so = so_new(5, 8, 0);
if (cso->blend_enable) {
so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
@@ -310,7 +310,7 @@ nv40_rasterizer_state_create(struct pipe_context *pipe,
{
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
- struct nouveau_stateobj *so = so_new(32, 0);
+ struct nouveau_stateobj *so = so_new(8, 18, 0);
struct nouveau_grobj *curie = nv40->screen->curie;
/*XXX: ignored:
@@ -445,7 +445,7 @@ nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
{
struct nv40_context *nv40 = nv40_context(pipe);
struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
- struct nouveau_stateobj *so = so_new(32, 0);
+ struct nouveau_stateobj *so = so_new(4, 21, 0);
struct nouveau_grobj *curie = nv40->screen->curie;
so_method(so, curie, NV40TCL_DEPTH_FUNC, 3);
@@ -605,12 +605,12 @@ nv40_set_clip_state(struct pipe_context *pipe,
static void
nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv40_context *nv40 = nv40_context(pipe);
- nv40->constbuf[shader] = buf->buffer;
- nv40->constbuf_nr[shader] = buf->buffer->size / (4 * sizeof(float));
+ nv40->constbuf[shader] = buf;
+ nv40->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
if (shader == PIPE_SHADER_VERTEX) {
nv40->dirty |= NV40_NEW_VERTPROG;
@@ -687,16 +687,6 @@ nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count,
nv40->draw_dirty |= NV40_NEW_ARRAYS;
}
-static void
-nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
- struct nv40_context *nv40 = nv40_context(pipe);
-
- nv40->edgeflags = bitfield;
- nv40->dirty |= NV40_NEW_ARRAYS;
- nv40->draw_dirty |= NV40_NEW_ARRAYS;
-}
-
void
nv40_init_state_functions(struct nv40_context *nv40)
{
@@ -705,9 +695,9 @@ nv40_init_state_functions(struct nv40_context *nv40)
nv40->pipe.delete_blend_state = nv40_blend_state_delete;
nv40->pipe.create_sampler_state = nv40_sampler_state_create;
- nv40->pipe.bind_sampler_states = nv40_sampler_state_bind;
+ nv40->pipe.bind_fragment_sampler_states = nv40_sampler_state_bind;
nv40->pipe.delete_sampler_state = nv40_sampler_state_delete;
- nv40->pipe.set_sampler_textures = nv40_set_sampler_texture;
+ nv40->pipe.set_fragment_sampler_textures = nv40_set_sampler_texture;
nv40->pipe.create_rasterizer_state = nv40_rasterizer_state_create;
nv40->pipe.bind_rasterizer_state = nv40_rasterizer_state_bind;
@@ -736,7 +726,6 @@ nv40_init_state_functions(struct nv40_context *nv40)
nv40->pipe.set_scissor_state = nv40_set_scissor_state;
nv40->pipe.set_viewport_state = nv40_set_viewport_state;
- nv40->pipe.set_edgeflags = nv40_set_edgeflags;
nv40->pipe.set_vertex_buffers = nv40_set_vertex_buffers;
nv40->pipe.set_vertex_elements = nv40_set_vertex_elements;
}
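nv40_set_constant_buffer above now receives a bare pipe_buffer and derives the number of vec4 constants from its byte size. A minimal, self-contained restatement of that arithmetic (function names here are placeholders):

#include <assert.h>
#include <stddef.h>

/* one shader constant is a vec4 of floats, i.e. 16 bytes */
static unsigned vec4_count_from_bytes(size_t buffer_size)
{
	return (unsigned)(buffer_size / (4 * sizeof(float)));
}

static void example(void)
{
	assert(vec4_count_from_bytes(1024) == 64);  /* 1 KiB -> 64 constants */
}
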
diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h
index 8a9d8c8fdf6..192074e7471 100644
--- a/src/gallium/drivers/nv40/nv40_state.h
+++ b/src/gallium/drivers/nv40/nv40_state.h
@@ -75,6 +75,7 @@ struct nv40_fragment_program {
struct nv40_miptree {
struct pipe_texture base;
+ struct nouveau_bo *bo;
struct pipe_buffer *buffer;
uint total_size;
diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c
index 8cd05ce66ef..3ff00a37f66 100644
--- a/src/gallium/drivers/nv40/nv40_state_blend.c
+++ b/src/gallium/drivers/nv40/nv40_state_blend.c
@@ -18,7 +18,7 @@ struct nv40_state_entry nv40_state_blend = {
static boolean
nv40_state_blend_colour_validate(struct nv40_context *nv40)
{
- struct nouveau_stateobj *so = so_new(2, 0);
+ struct nouveau_stateobj *so = so_new(1, 1, 0);
struct pipe_blend_color *bcol = &nv40->blend_colour;
so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1);
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
index 198692965dc..13fe854915b 100644
--- a/src/gallium/drivers/nv40/nv40_state_emit.c
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -54,10 +54,11 @@ nv40_state_do_validate(struct nv40_context *nv40,
void
nv40_state_emit(struct nv40_context *nv40)
{
- struct nouveau_channel *chan = nv40->screen->base.channel;
struct nv40_state *state = &nv40->state;
struct nv40_screen *screen = nv40->screen;
- unsigned i, samplers;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
+ unsigned i;
uint64_t states;
if (nv40->pctx_id != screen->cur_pctx) {
@@ -80,13 +81,21 @@ nv40_state_emit(struct nv40_context *nv40)
if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
(1ULL << NV40_STATE_FRAGTEX0))) {
- BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
- OUT_RING (2);
- BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
- OUT_RING (1);
+ BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+ OUT_RING (chan, 2);
+ BEGIN_RING(chan, curie, NV40TCL_TEX_CACHE_CTL, 1);
+ OUT_RING (chan, 1);
}
state->dirty = 0;
+}
+
+void
+nv40_state_flush_notify(struct nouveau_channel *chan)
+{
+ struct nv40_context *nv40 = chan->user_private;
+ struct nv40_state *state = &nv40->state;
+ unsigned i, samplers;
so_emit_reloc_markers(chan, state->hw[NV40_STATE_FB]);
for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
@@ -160,7 +169,6 @@ nv40_state_validate_swtnl(struct nv40_context *nv40)
draw_set_viewport_state(draw, &nv40->viewport);
if (nv40->draw_dirty & NV40_NEW_ARRAYS) {
- draw_set_edgeflags(draw, nv40->edgeflags);
draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf);
draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt);
}
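The hunks above split relocation re-emission out of nv40_state_emit() into nv40_state_flush_notify(), which recovers the context from the channel's user_private pointer whenever the push buffer is flushed and then re-emits the framebuffer and texture relocation markers. A small sketch of that callback pattern, with placeholder structs standing in for the nouveau channel and driver context:

struct channel {
	void *user_private;                 /* set to the owning context */
	void (*flush_notify)(struct channel *);
};

struct context {
	struct channel *chan;
	/* ... hardware state that must be re-validated after a flush ... */
};

/* invoked by the winsys whenever the push buffer is flushed */
static void state_flush_notify(struct channel *chan)
{
	struct context *ctx = chan->user_private;
	(void)ctx;
	/* re-emit relocation markers for the framebuffer and bound
	 * textures here, as nv40_state_flush_notify() does above */
}

static void context_bind(struct context *ctx, struct channel *chan)
{
	ctx->chan = chan;
	chan->user_private = ctx;
	chan->flush_notify = state_flush_notify;
}
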
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
index 1c7a7cd64f0..a58fe9ddb19 100644
--- a/src/gallium/drivers/nv40/nv40_state_fb.c
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -19,7 +19,7 @@ nv40_state_framebuffer_validate(struct nv40_context *nv40)
struct nv04_surface *rt[4], *zeta;
uint32_t rt_enable, rt_format;
int i, colour_format = 0, zeta_format = 0;
- struct nouveau_stateobj *so = so_new(64, 10);
+ struct nouveau_stateobj *so = so_new(18, 24, 10);
unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
unsigned w = fb->width;
unsigned h = fb->height;
diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c
index cf58d33906a..753a505e934 100644
--- a/src/gallium/drivers/nv40/nv40_state_scissor.c
+++ b/src/gallium/drivers/nv40/nv40_state_scissor.c
@@ -12,7 +12,7 @@ nv40_state_scissor_validate(struct nv40_context *nv40)
return FALSE;
nv40->state.scissor_enabled = rast->scissor;
- so = so_new(3, 0);
+ so = so_new(1, 2, 0);
so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2);
if (nv40->state.scissor_enabled) {
so_data (so, ((s->maxx - s->minx) << 16) | s->minx);
diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c
index b51024ad9b2..2b371ebfec0 100644
--- a/src/gallium/drivers/nv40/nv40_state_stipple.c
+++ b/src/gallium/drivers/nv40/nv40_state_stipple.c
@@ -14,14 +14,14 @@ nv40_state_stipple_validate(struct nv40_context *nv40)
if (rast->poly_stipple_enable) {
unsigned i;
- so = so_new(35, 0);
+ so = so_new(2, 33, 0);
so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
so_data (so, 1);
so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
for (i = 0; i < 32; i++)
so_data(so, nv40->stipple[i]);
} else {
- so = so_new(2, 0);
+ so = so_new(1, 1, 0);
so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
so_data (so, 0);
}
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
index 665d2d5fcac..9919ba1d0b0 100644
--- a/src/gallium/drivers/nv40/nv40_state_viewport.c
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -19,7 +19,7 @@ nv40_state_viewport_validate(struct nv40_context *nv40)
return FALSE;
nv40->state.viewport_bypass = bypass;
- so = so_new(11, 0);
+ so = so_new(2, 9, 0);
if (!bypass) {
so_method(so, nv40->screen->curie,
NV40TCL_VIEWPORT_TRANSLATE_X, 8);
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index 92caee6f382..791ee6823d3 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -1,7 +1,9 @@
#include <pipe/p_state.h>
#include <pipe/p_defines.h>
#include <pipe/p_inlines.h>
+#include <util/u_format.h>
#include <util/u_memory.h>
+#include <util/u_math.h>
#include <nouveau/nouveau_winsys.h>
#include "nv40_context.h"
#include "nv40_screen.h"
@@ -10,22 +12,19 @@
struct nv40_transfer {
struct pipe_transfer base;
struct pipe_surface *surface;
- bool direct;
+ boolean direct;
};
static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
struct pipe_texture *template)
{
memset(template, 0, sizeof(struct pipe_texture));
template->target = pt->target;
template->format = pt->format;
- template->width[0] = pt->width[level];
- template->height[0] = pt->height[level];
- template->depth[0] = 1;
- template->block = pt->block;
- template->nblocksx[0] = pt->nblocksx[level];
- template->nblocksy[0] = pt->nblocksx[level];
+ template->width0 = width;
+ template->height0 = height;
+ template->depth0 = 1;
template->last_level = 0;
template->nr_samples = pt->nr_samples;
@@ -48,14 +47,10 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
tx->base.x = x;
tx->base.y = y;
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
tx->base.stride = mt->level[level].pitch;
tx->base.usage = usage;
tx->base.face = face;
@@ -76,7 +71,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
tx->direct = false;
- nv40_compatible_transfer_tex(pt, level, &tx_tex_template);
+ nv40_compatible_transfer_tex(pt, w, h, &tx_tex_template);
tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
if (!tx_tex)
@@ -85,6 +80,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
}
+ tx->base.stride = ((struct nv40_miptree*)tx_tex)->level[0].pitch;
+
tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
0, 0, 0,
pipe_transfer_buffer_flags(&tx->base));
@@ -110,8 +107,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
/* TODO: Check if SIFM can un-swizzle */
nvscreen->eng2d->copy(nvscreen->eng2d,
tx->surface, 0, 0,
- src, 0, 0,
- src->width, src->height);
+ src, x, y,
+ w, h);
pipe_surface_reference(&src, NULL);
}
@@ -131,13 +128,13 @@ nv40_transfer_del(struct pipe_transfer *ptx)
dst = pscreen->get_tex_surface(pscreen, ptx->texture,
ptx->face, ptx->level, ptx->zslice,
- PIPE_BUFFER_USAGE_GPU_WRITE);
+ PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
nvscreen->eng2d->copy(nvscreen->eng2d,
- dst, 0, 0,
+ dst, tx->base.x, tx->base.y,
tx->surface, 0, 0,
- dst->width, dst->height);
+ tx->base.width, tx->base.height);
pipe_surface_reference(&dst, NULL);
}
@@ -156,8 +153,10 @@ nv40_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
void *map = pipe_buffer_map(pscreen, mt->buffer,
pipe_transfer_buffer_flags(ptx));
- return map + ns->base.offset +
- ptx->y * ns->pitch + ptx->x * ptx->block.size;
+ if(!tx->direct)
+ return map + ns->base.offset;
+ else
+ return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
}
static void
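The transfer_map hunk above returns different pointers for direct and staging transfers: a staging transfer already begins at (0,0) of a temporary linear texture, so only the surface's base offset applies, while a direct map must add the block offset for (x, y) itself. A self-contained restatement of that rule, with the pitch and block size passed in rather than queried through the util_format helpers:

#include <stdint.h>
#include <stddef.h>

static void *
transfer_map_ptr(void *map, size_t base_offset, int direct,
                 unsigned x, unsigned y, unsigned pitch, unsigned blocksize)
{
	uint8_t *p = (uint8_t *)map + base_offset;

	if (!direct)
		return p;                       /* staging copy starts at (0,0) */
	return p + (size_t)y * pitch + (size_t)x * blocksize;
}
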
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
index b2753b8e2e0..a777898f688 100644
--- a/src/gallium/drivers/nv40/nv40_vbo.c
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -164,18 +164,21 @@ nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
return TRUE;
}
-boolean
+void
nv40_draw_arrays(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count)
{
struct nv40_context *nv40 = nv40_context(pipe);
- struct nouveau_channel *chan = nv40->screen->base.channel;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
unsigned restart;
nv40_vbo_set_idxbuf(nv40, NULL, 0);
if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
- return nv40_draw_elements_swtnl(pipe, NULL, 0,
- mode, start, count);
+ nv40_draw_elements_swtnl(pipe, NULL, 0,
+ mode, start, count);
+ return;
}
while (count) {
@@ -186,17 +189,17 @@ nv40_draw_arrays(struct pipe_context *pipe,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
mode, start, count, &restart);
if (!vc) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
nr = (vc & 0xff);
if (nr) {
- BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
- OUT_RING (((nr - 1) << 24) | start);
+ BEGIN_RING(chan, curie, NV40TCL_VB_VERTEX_BATCH, 1);
+ OUT_RING (chan, ((nr - 1) << 24) | start);
start += nr;
}
@@ -206,29 +209,30 @@ nv40_draw_arrays(struct pipe_context *pipe,
nr -= push;
- BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+ BEGIN_RING_NI(chan, curie, NV40TCL_VB_VERTEX_BATCH, push);
while (push--) {
- OUT_RING(((0x100 - 1) << 24) | start);
+ OUT_RING(chan, ((0x100 - 1) << 24) | start);
start += 0x100;
}
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, 0);
count -= vc;
start = restart;
}
pipe->flush(pipe, 0, NULL);
- return TRUE;
}
static INLINE void
nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv40->screen->base.channel;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
while (count) {
uint8_t *elts = (uint8_t *)ib + start;
@@ -239,17 +243,17 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
if (vc & 1) {
- BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
- OUT_RING (elts[0]);
+ BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, elts[0]);
elts++; vc--;
}
@@ -258,16 +262,16 @@ nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
push = MIN2(vc, 2047 * 2);
- BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((elts[i+1] << 16) | elts[i]);
+ OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
vc -= push;
elts += push;
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
@@ -277,7 +281,9 @@ static INLINE void
nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv40->screen->base.channel;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
while (count) {
uint16_t *elts = (uint16_t *)ib + start;
@@ -288,17 +294,17 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
if (vc & 1) {
- BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
- OUT_RING (elts[0]);
+ BEGIN_RING(chan, curie, NV40TCL_VB_ELEMENT_U32, 1);
+ OUT_RING (chan, elts[0]);
elts++; vc--;
}
@@ -307,16 +313,16 @@ nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
push = MIN2(vc, 2047 * 2);
- BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+ BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
for (i = 0; i < push; i+=2)
- OUT_RING((elts[i+1] << 16) | elts[i]);
+ OUT_RING(chan, (elts[i+1] << 16) | elts[i]);
vc -= push;
elts += push;
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
@@ -326,7 +332,9 @@ static INLINE void
nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
unsigned mode, unsigned start, unsigned count)
{
- struct nouveau_channel *chan = nv40->screen->base.channel;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
while (count) {
uint32_t *elts = (uint32_t *)ib + start;
@@ -337,32 +345,32 @@ nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
mode, start, count, &restart);
if (vc == 0) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
count -= vc;
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
while (vc) {
push = MIN2(vc, 2047);
- BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push);
- OUT_RINGp (elts, push);
+ BEGIN_RING_NI(chan, curie, NV40TCL_VB_ELEMENT_U32, push);
+ OUT_RINGp (chan, elts, push);
vc -= push;
elts += push;
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, 0);
start = restart;
}
}
-static boolean
+static void
nv40_draw_elements_inline(struct pipe_context *pipe,
struct pipe_buffer *ib, unsigned ib_size,
unsigned mode, unsigned start, unsigned count)
@@ -393,15 +401,16 @@ nv40_draw_elements_inline(struct pipe_context *pipe,
}
pipe_buffer_unmap(pscreen, ib);
- return TRUE;
}
-static boolean
+static void
nv40_draw_elements_vbo(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count)
{
struct nv40_context *nv40 = nv40_context(pipe);
- struct nouveau_channel *chan = nv40->screen->base.channel;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
unsigned restart;
while (count) {
@@ -412,17 +421,17 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
mode, start, count, &restart);
if (!vc) {
- FIRE_RING(NULL);
+ FIRE_RING(chan);
continue;
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (nvgl_primitive(mode));
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, nvgl_primitive(mode));
nr = (vc & 0xff);
if (nr) {
- BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
- OUT_RING (((nr - 1) << 24) | start);
+ BEGIN_RING(chan, curie, NV40TCL_VB_INDEX_BATCH, 1);
+ OUT_RING (chan, ((nr - 1) << 24) | start);
start += nr;
}
@@ -432,24 +441,22 @@ nv40_draw_elements_vbo(struct pipe_context *pipe,
nr -= push;
- BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+ BEGIN_RING_NI(chan, curie, NV40TCL_VB_INDEX_BATCH, push);
while (push--) {
- OUT_RING(((0x100 - 1) << 24) | start);
+ OUT_RING(chan, ((0x100 - 1) << 24) | start);
start += 0x100;
}
}
- BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
- OUT_RING (0);
+ BEGIN_RING(chan, curie, NV40TCL_BEGIN_END, 1);
+ OUT_RING (chan, 0);
count -= vc;
start = restart;
}
-
- return TRUE;
}
-boolean
+void
nv40_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer, unsigned indexSize,
unsigned mode, unsigned start, unsigned count)
@@ -459,8 +466,9 @@ nv40_draw_elements(struct pipe_context *pipe,
idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
- return nv40_draw_elements_swtnl(pipe, NULL, 0,
- mode, start, count);
+ nv40_draw_elements_swtnl(pipe, NULL, 0,
+ mode, start, count);
+ return;
}
if (idxbuf) {
@@ -471,7 +479,6 @@ nv40_draw_elements(struct pipe_context *pipe,
}
pipe->flush(pipe, 0, NULL);
- return TRUE;
}
static boolean
@@ -484,14 +491,9 @@ nv40_vbo_validate(struct nv40_context *nv40)
unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
int hw;
- if (nv40->edgeflags) {
- nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
- return FALSE;
- }
-
- vtxbuf = so_new(20, 18);
+ vtxbuf = so_new(3, 17, 18);
so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
- vtxfmt = so_new(17, 0);
+ vtxfmt = so_new(1, 16, 0);
so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
@@ -504,7 +506,7 @@ nv40_vbo_validate(struct nv40_context *nv40)
if (!vb->stride) {
if (!sattr)
- sattr = so_new(16 * 5, 0);
+ sattr = so_new(16, 16 * 4, 0);
if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) {
so_data(vtxbuf, 0);
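All of the draw loops above keep the same VB_VERTEX_BATCH / VB_INDEX_BATCH word layout while switching BEGIN_RING/OUT_RING to take the channel explicitly: the top byte carries (count - 1) and the low 24 bits the starting index, so one word covers at most 0x100 vertices. A standalone sketch of that packing, with an example value worked out:

#include <assert.h>
#include <stdint.h>

static uint32_t vb_batch_word(uint32_t start, uint32_t count)
{
	assert(count >= 1 && count <= 0x100);   /* one batch spans up to 256 vertices */
	assert(start < (1u << 24));
	return ((count - 1) << 24) | start;
}

static void example(void)
{
	/* 256 vertices beginning at index 0x100 -> 0xff000100 */
	assert(vb_batch_word(0x100, 0x100) == 0xff000100u);
}
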
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
index 31dae2457fd..8d80fcad38e 100644
--- a/src/gallium/drivers/nv40/nv40_vertprog.c
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -295,30 +295,30 @@ static INLINE struct nv40_sreg
tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
struct nv40_sreg src;
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
+ src = nv40_sr(NV40SR_INPUT, fsrc->Register.Index);
break;
case TGSI_FILE_CONSTANT:
- src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);
+ src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
break;
case TGSI_FILE_IMMEDIATE:
- src = vpc->imm[fsrc->SrcRegister.Index];
+ src = vpc->imm[fsrc->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- src = vpc->r_temp[fsrc->SrcRegister.Index];
+ src = vpc->r_temp[fsrc->Register.Index];
break;
default:
NOUVEAU_ERR("bad src file\n");
break;
}
- src.abs = fsrc->SrcRegisterExtMod.Absolute;
- src.negate = fsrc->SrcRegister.Negate;
- src.swz[0] = fsrc->SrcRegister.SwizzleX;
- src.swz[1] = fsrc->SrcRegister.SwizzleY;
- src.swz[2] = fsrc->SrcRegister.SwizzleZ;
- src.swz[3] = fsrc->SrcRegister.SwizzleW;
+ src.abs = fsrc->Register.Absolute;
+ src.negate = fsrc->Register.Negate;
+ src.swz[0] = fsrc->Register.SwizzleX;
+ src.swz[1] = fsrc->Register.SwizzleY;
+ src.swz[2] = fsrc->Register.SwizzleZ;
+ src.swz[3] = fsrc->Register.SwizzleW;
return src;
}
@@ -326,15 +326,15 @@ static INLINE struct nv40_sreg
tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
struct nv40_sreg dst;
- switch (fdst->DstRegister.File) {
+ switch (fdst->Register.File) {
case TGSI_FILE_OUTPUT:
- dst = vpc->r_result[fdst->DstRegister.Index];
+ dst = vpc->r_result[fdst->Register.Index];
break;
case TGSI_FILE_TEMPORARY:
- dst = vpc->r_temp[fdst->DstRegister.Index];
+ dst = vpc->r_temp[fdst->Register.Index];
break;
case TGSI_FILE_ADDRESS:
- dst = vpc->r_address[fdst->DstRegister.Index];
+ dst = vpc->r_address[fdst->Register.Index];
break;
default:
NOUVEAU_ERR("bad dst file\n");
@@ -405,8 +405,8 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
- if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
src[i] = tgsi_src(vpc, fsrc);
}
}
@@ -414,9 +414,9 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
const struct tgsi_full_src_register *fsrc;
- fsrc = &finst->FullSrcRegisters[i];
+ fsrc = &finst->Src[i];
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_TEMPORARY:
@@ -427,10 +427,10 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
break;
}
- switch (fsrc->SrcRegister.File) {
+ switch (fsrc->Register.File) {
case TGSI_FILE_INPUT:
- if (ai == -1 || ai == fsrc->SrcRegister.Index) {
- ai = fsrc->SrcRegister.Index;
+ if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -440,8 +440,8 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
break;
case TGSI_FILE_CONSTANT:
if ((ci == -1 && ii == -1) ||
- ci == fsrc->SrcRegister.Index) {
- ci = fsrc->SrcRegister.Index;
+ ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -451,8 +451,8 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
break;
case TGSI_FILE_IMMEDIATE:
if ((ci == -1 && ii == -1) ||
- ii == fsrc->SrcRegister.Index) {
- ii = fsrc->SrcRegister.Index;
+ ii == fsrc->Register.Index) {
+ ii = fsrc->Register.Index;
src[i] = tgsi_src(vpc, fsrc);
} else {
src[i] = temp(vpc);
@@ -469,8 +469,8 @@ nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
}
}
- dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
- mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+ dst = tgsi_dst(vpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
switch (finst->Instruction.Opcode) {
case TGSI_OPCODE_ABS:
@@ -577,19 +577,19 @@ static boolean
nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
const struct tgsi_full_declaration *fdec)
{
- unsigned idx = fdec->DeclarationRange.First;
+ unsigned idx = fdec->Range.First;
int hw;
- switch (fdec->Semantic.SemanticName) {
+ switch (fdec->Semantic.Name) {
case TGSI_SEMANTIC_POSITION:
hw = NV40_VP_INST_DEST_POS;
vpc->hpos_idx = idx;
break;
case TGSI_SEMANTIC_COLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV40_VP_INST_DEST_COL0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV40_VP_INST_DEST_COL1;
} else {
NOUVEAU_ERR("bad colour semantic index\n");
@@ -597,10 +597,10 @@ nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
}
break;
case TGSI_SEMANTIC_BCOLOR:
- if (fdec->Semantic.SemanticIndex == 0) {
+ if (fdec->Semantic.Index == 0) {
hw = NV40_VP_INST_DEST_BFC0;
} else
- if (fdec->Semantic.SemanticIndex == 1) {
+ if (fdec->Semantic.Index == 1) {
hw = NV40_VP_INST_DEST_BFC1;
} else {
NOUVEAU_ERR("bad bcolour semantic index\n");
@@ -614,13 +614,17 @@ nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
hw = NV40_VP_INST_DEST_PSZ;
break;
case TGSI_SEMANTIC_GENERIC:
- if (fdec->Semantic.SemanticIndex <= 7) {
- hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+ if (fdec->Semantic.Index <= 7) {
+ hw = NV40_VP_INST_DEST_TC(fdec->Semantic.Index);
} else {
NOUVEAU_ERR("bad generic semantic index\n");
return FALSE;
}
break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ /* not really an error just a fallback */
+ NOUVEAU_ERR("cannot handle edgeflag output\n");
+ return FALSE;
default:
NOUVEAU_ERR("bad output semantic\n");
return FALSE;
@@ -652,16 +656,16 @@ nv40_vertprog_prepare(struct nv40_vpc *vpc)
fdec = &p.FullToken.FullDeclaration;
switch (fdec->Declaration.File) {
case TGSI_FILE_TEMPORARY:
- if (fdec->DeclarationRange.Last > high_temp) {
+ if (fdec->Range.Last > high_temp) {
high_temp =
- fdec->DeclarationRange.Last;
+ fdec->Range.Last;
}
break;
#if 0 /* this would be nice.. except gallium doesn't track it */
case TGSI_FILE_ADDRESS:
- if (fdec->DeclarationRange.Last > high_addr) {
+ if (fdec->Range.Last > high_addr) {
high_addr =
- fdec->DeclarationRange.Last;
+ fdec->Range.Last;
}
break;
#endif
@@ -681,11 +685,11 @@ nv40_vertprog_prepare(struct nv40_vpc *vpc)
const struct tgsi_full_dst_register *fdst;
finst = &p.FullToken.FullInstruction;
- fdst = &finst->FullDstRegisters[0];
+ fdst = &finst->Dst[0];
- if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
- if (fdst->DstRegister.Index > high_addr)
- high_addr = fdst->DstRegister.Index;
+ if (fdst->Register.File == TGSI_FILE_ADDRESS) {
+ if (fdst->Register.Index > high_addr)
+ high_addr = fdst->Register.Index;
}
}
@@ -830,7 +834,9 @@ static boolean
nv40_vertprog_validate(struct nv40_context *nv40)
{
struct pipe_screen *pscreen = nv40->pipe.screen;
- struct nouveau_grobj *curie = nv40->screen->curie;
+ struct nv40_screen *screen = nv40->screen;
+ struct nouveau_channel *chan = screen->base.channel;
+ struct nouveau_grobj *curie = screen->curie;
struct nv40_vertex_program *vp;
struct pipe_buffer *constbuf;
boolean upload_code = FALSE, upload_data = FALSE;
@@ -880,7 +886,7 @@ check_gpu_resources:
assert(0);
}
- so = so_new(7, 0);
+ so = so_new(3, 4, 0);
so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
so_data (so, vp->exec->start);
so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
@@ -970,9 +976,9 @@ check_gpu_resources:
4 * sizeof(float));
}
- BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
- OUT_RING (i + vp->data->start);
- OUT_RINGp ((uint32_t *)vpd->value, 4);
+ BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+ OUT_RING (chan, i + vp->data->start);
+ OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
}
if (constbuf)
@@ -989,11 +995,11 @@ check_gpu_resources:
NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
}
#endif
- BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
- OUT_RING (vp->exec->start);
+ BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+ OUT_RING (chan, vp->exec->start);
for (i = 0; i < vp->nr_insns; i++) {
- BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
- OUT_RINGp (vp->insns[i].data, 4);
+ BEGIN_RING(chan, curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+ OUT_RINGp (chan, vp->insns[i].data, 4);
}
}
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index 219e7a78623..5c705ccc8f1 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -34,6 +34,11 @@ nv50_flush(struct pipe_context *pipe, unsigned flags,
struct nv50_context *nv50 = nv50_context(pipe);
struct nouveau_channel *chan = nv50->screen->base.channel;
+ if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+ BEGIN_RING(chan, nv50->screen->tesla, 0x1338, 1);
+ OUT_RING (chan, 0x20);
+ }
+
if (flags & PIPE_FLUSH_FRAME)
FIRE_RING(chan);
}
@@ -43,16 +48,52 @@ nv50_destroy(struct pipe_context *pipe)
{
struct nv50_context *nv50 = nv50_context(pipe);
+ if (nv50->state.fb)
+ so_ref(NULL, &nv50->state.fb);
+ if (nv50->state.blend)
+ so_ref(NULL, &nv50->state.blend);
+ if (nv50->state.blend_colour)
+ so_ref(NULL, &nv50->state.blend_colour);
+ if (nv50->state.zsa)
+ so_ref(NULL, &nv50->state.zsa);
+ if (nv50->state.rast)
+ so_ref(NULL, &nv50->state.rast);
+ if (nv50->state.stipple)
+ so_ref(NULL, &nv50->state.stipple);
+ if (nv50->state.scissor)
+ so_ref(NULL, &nv50->state.scissor);
+ if (nv50->state.viewport)
+ so_ref(NULL, &nv50->state.viewport);
+ if (nv50->state.tsc_upload)
+ so_ref(NULL, &nv50->state.tsc_upload);
+ if (nv50->state.tic_upload)
+ so_ref(NULL, &nv50->state.tic_upload);
+ if (nv50->state.vertprog)
+ so_ref(NULL, &nv50->state.vertprog);
+ if (nv50->state.fragprog)
+ so_ref(NULL, &nv50->state.fragprog);
+ if (nv50->state.geomprog)
+ so_ref(NULL, &nv50->state.geomprog);
+ if (nv50->state.fp_linkage)
+ so_ref(NULL, &nv50->state.fp_linkage);
+ if (nv50->state.gp_linkage)
+ so_ref(NULL, &nv50->state.gp_linkage);
+ if (nv50->state.vtxfmt)
+ so_ref(NULL, &nv50->state.vtxfmt);
+ if (nv50->state.vtxbuf)
+ so_ref(NULL, &nv50->state.vtxbuf);
+ if (nv50->state.vtxattr)
+ so_ref(NULL, &nv50->state.vtxattr);
+
draw_destroy(nv50->draw);
- FREE(nv50);
-}
+ if (nv50->screen->cur_ctx == nv50)
+ nv50->screen->cur_ctx = NULL;
-static void
-nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
-{
+ FREE(nv50);
}
+
struct pipe_context *
nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
{
@@ -71,9 +112,10 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id)
nv50->pipe.destroy = nv50_destroy;
- nv50->pipe.set_edgeflags = nv50_set_edgeflags;
nv50->pipe.draw_arrays = nv50_draw_arrays;
+ nv50->pipe.draw_arrays_instanced = nv50_draw_arrays_instanced;
nv50->pipe.draw_elements = nv50_draw_elements;
+ nv50->pipe.draw_elements_instanced = nv50_draw_elements_instanced;
nv50->pipe.clear = nv50_clear;
nv50->pipe.flush = nv50_flush;
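Besides flushing the texture cache when PIPE_FLUSH_TEXTURE_CACHE is set, the destroy path above now releases every cached state object and clears the screen's cur_ctx pointer when the dying context is the current one, so later validation cannot follow a dangling pointer. A minimal sketch of that guard (placeholder structs, not the driver's own):

struct screen  { struct context *cur_ctx; };
struct context { struct screen *screen; };

static void context_destroy(struct context *ctx)
{
	/* drop cached state-object references here ... */

	if (ctx->screen->cur_ctx == ctx)
		ctx->screen->cur_ctx = NULL;    /* avoid a stale "current context" */

	/* ... then free ctx */
}
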
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 4b0f0622953..bebcd95054f 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -29,9 +29,7 @@
#define NV50_CB_PVP 1
#define NV50_CB_PFP 2
#define NV50_CB_PGP 3
-#define NV50_CB_TIC 4
-#define NV50_CB_TSC 5
-#define NV50_CB_PUPLOAD 6
+#define NV50_CB_AUX 4
#define NV50_NEW_BLEND (1 << 0)
#define NV50_NEW_ZSA (1 << 1)
@@ -45,9 +43,11 @@
#define NV50_NEW_VERTPROG_CB (1 << 9)
#define NV50_NEW_FRAGPROG (1 << 10)
#define NV50_NEW_FRAGPROG_CB (1 << 11)
-#define NV50_NEW_ARRAYS (1 << 12)
-#define NV50_NEW_SAMPLER (1 << 13)
-#define NV50_NEW_TEXTURE (1 << 14)
+#define NV50_NEW_GEOMPROG (1 << 12)
+#define NV50_NEW_GEOMPROG_CB (1 << 13)
+#define NV50_NEW_ARRAYS (1 << 14)
+#define NV50_NEW_SAMPLER (1 << 15)
+#define NV50_NEW_TEXTURE (1 << 16)
struct nv50_blend_stateobj {
struct pipe_blend_state pipe;
@@ -65,7 +65,7 @@ struct nv50_rasterizer_stateobj {
};
struct nv50_sampler_stateobj {
- bool normalized;
+ boolean normalized;
unsigned tsc[8];
};
@@ -126,13 +126,16 @@ struct nv50_state {
unsigned viewport_bypass;
struct nouveau_stateobj *tsc_upload;
struct nouveau_stateobj *tic_upload;
- unsigned miptree_nr;
+ unsigned miptree_nr[PIPE_SHADER_TYPES];
struct nouveau_stateobj *vertprog;
struct nouveau_stateobj *fragprog;
- struct nouveau_stateobj *programs;
+ struct nouveau_stateobj *geomprog;
+ struct nouveau_stateobj *fp_linkage;
+ struct nouveau_stateobj *gp_linkage;
struct nouveau_stateobj *vtxfmt;
struct nouveau_stateobj *vtxbuf;
struct nouveau_stateobj *vtxattr;
+ struct nouveau_stateobj *instbuf;
unsigned vtxelt_nr;
};
@@ -157,15 +160,16 @@ struct nv50_context {
struct pipe_framebuffer_state framebuffer;
struct nv50_program *vertprog;
struct nv50_program *fragprog;
+ struct nv50_program *geomprog;
struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
unsigned vtxbuf_nr;
struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
unsigned vtxelt_nr;
- struct nv50_sampler_stateobj *sampler[PIPE_MAX_SAMPLERS];
- unsigned sampler_nr;
- struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS];
- unsigned miptree_nr;
+ struct nv50_sampler_stateobj *sampler[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
+ unsigned sampler_nr[PIPE_SHADER_TYPES];
+ struct nv50_miptree *miptree[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
+ unsigned miptree_nr[PIPE_SHADER_TYPES];
uint16_t vbo_fifo;
};
@@ -191,13 +195,24 @@ nv50_surface_do_copy(struct nv50_screen *screen, struct pipe_surface *dst,
extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50);
/* nv50_vbo.c */
-extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode,
+extern void nv50_draw_arrays(struct pipe_context *, unsigned mode,
unsigned start, unsigned count);
-extern boolean nv50_draw_elements(struct pipe_context *pipe,
+extern void nv50_draw_arrays_instanced(struct pipe_context *, unsigned mode,
+ unsigned start, unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount);
+extern void nv50_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
unsigned mode, unsigned start,
unsigned count);
+extern void nv50_draw_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode, unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount);
extern void nv50_vbo_validate(struct nv50_context *nv50);
/* nv50_clear.c */
@@ -207,7 +222,9 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers,
/* nv50_program.c */
extern void nv50_vertprog_validate(struct nv50_context *nv50);
extern void nv50_fragprog_validate(struct nv50_context *nv50);
-extern void nv50_linkage_validate(struct nv50_context *nv50);
+extern void nv50_geomprog_validate(struct nv50_context *nv50);
+extern void nv50_fp_linkage_validate(struct nv50_context *nv50);
+extern void nv50_gp_linkage_validate(struct nv50_context *nv50);
extern void nv50_program_destroy(struct nv50_context *nv50,
struct nv50_program *p);
@@ -218,7 +235,7 @@ extern void nv50_state_flush_notify(struct nouveau_channel *chan);
extern void nv50_so_init_sifc(struct nv50_context *nv50,
struct nouveau_stateobj *so,
struct nouveau_bo *bo, unsigned reloc,
- unsigned size);
+ unsigned offset, unsigned size);
/* nv50_tex.c */
extern void nv50_tex_validate(struct nv50_context *);
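The header changes above turn the sampler and texture bindings into per-shader-stage arrays (indexed by PIPE_SHADER_TYPES) and add instanced draw entry points. The sketch below only illustrates the two-dimensional bookkeeping that replaces the old fragment-only arrays; the constants and struct are placeholders, not the driver's definitions.

#define SHADER_STAGES  3        /* e.g. vertex, fragment, geometry */
#define MAX_SAMPLERS  16

struct bindings {
	void     *sampler[SHADER_STAGES][MAX_SAMPLERS];
	unsigned  sampler_nr[SHADER_STAGES];
};

static void
bind_samplers(struct bindings *b, unsigned stage, unsigned nr, void **states)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		b->sampler[stage][i] = states[i];
	for (; i < b->sampler_nr[stage]; i++)
		b->sampler[stage][i] = NULL;    /* clear entries left over from the last bind */
	b->sampler_nr[stage] = nr;
}
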
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 9c20c5cc282..dc8364ced7e 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -23,6 +23,7 @@
#include "pipe/p_state.h"
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
#include "nv50_context.h"
@@ -55,14 +56,28 @@ get_tile_mode(unsigned ny, unsigned d)
return tile_mode | 0x10;
}
+static INLINE unsigned
+get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned nb_h)
+{
+ unsigned tile_h = get_tile_height(tile_mode);
+ unsigned tile_d = get_tile_depth(tile_mode);
+
+ /* pitch_2d == to next slice within this volume-tile */
+ /* pitch_3d == size (in bytes) of a volume-tile */
+ unsigned pitch_2d = tile_h * 64;
+ unsigned pitch_3d = tile_d * align(nb_h, tile_h) * pitch;
+
+ return (z % tile_d) * pitch_2d + (z / tile_d) * pitch_3d;
+}
+
static struct pipe_texture *
nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
{
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
struct pipe_texture *pt = &mt->base.base;
- unsigned width = tmp->width[0], height = tmp->height[0];
- unsigned depth = tmp->depth[0], image_alignment;
+ unsigned width = tmp->width0, height = tmp->height0;
+ unsigned depth = tmp->depth0, image_alignment;
uint32_t tile_flags;
int ret, i, l;
@@ -77,12 +92,23 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
case PIPE_FORMAT_Z24S8_UNORM:
tile_flags = 0x1800;
break;
+ case PIPE_FORMAT_Z16_UNORM:
+ tile_flags = 0x6c00;
+ break;
case PIPE_FORMAT_X8Z24_UNORM:
case PIPE_FORMAT_S8Z24_UNORM:
tile_flags = 0x2800;
break;
+ case PIPE_FORMAT_R32G32B32A32_FLOAT:
+ case PIPE_FORMAT_R32G32B32_FLOAT:
+ tile_flags = 0x7400;
+ break;
default:
- tile_flags = 0x7000;
+ if ((pt->tex_usage & PIPE_TEXTURE_USAGE_PRIMARY) &&
+ util_format_get_blocksizebits(pt->format) == 32)
+ tile_flags = 0x7a00;
+ else
+ tile_flags = 0x7000;
break;
}
@@ -91,20 +117,15 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
for (l = 0; l <= pt->last_level; l++) {
struct nv50_miptree_level *lvl = &mt->level[l];
-
- pt->width[l] = width;
- pt->height[l] = height;
- pt->depth[l] = depth;
- pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+ unsigned nblocksy = util_format_get_nblocksy(pt->format, height);
lvl->image_offset = CALLOC(mt->image_nr, sizeof(int));
- lvl->pitch = align(pt->nblocksx[l] * pt->block.size, 64);
- lvl->tile_mode = get_tile_mode(pt->nblocksy[l], depth);
+ lvl->pitch = align(util_format_get_stride(pt->format, width), 64);
+ lvl->tile_mode = get_tile_mode(nblocksy, depth);
- width = MAX2(1, width >> 1);
- height = MAX2(1, height >> 1);
- depth = MAX2(1, depth >> 1);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
image_alignment = get_tile_height(mt->level[0].tile_mode) * 64;
@@ -121,8 +142,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
unsigned tile_d = get_tile_depth(lvl->tile_mode);
size = lvl->pitch;
- size *= align(pt->nblocksy[l], tile_h);
- size *= align(pt->depth[l], tile_d);
+ size *= align(util_format_get_nblocksy(pt->format, u_minify(pt->height0, l)), tile_h);
+ size *= align(u_minify(pt->depth0, l), tile_d);
lvl->image_offset[i] = mt->total_size;
@@ -135,6 +156,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
mt->level[0].tile_mode, tile_flags,
&mt->base.bo);
if (ret) {
+ for (l = 0; l <= pt->last_level; ++l)
+ FREE(mt->level[l].image_offset);
FREE(mt);
return NULL;
}
@@ -151,7 +174,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
/* Only supports 2D, non-mipmapped textures for the moment */
if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
- pt->depth[0] != 1)
+ pt->depth0 != 1)
return NULL;
mt = CALLOC_STRUCT(nv50_miptree);
@@ -174,6 +197,10 @@ static void
nv50_miptree_destroy(struct pipe_texture *pt)
{
struct nv50_miptree *mt = nv50_miptree(pt);
+ unsigned l;
+
+ for (l = 0; l <= pt->last_level; ++l)
+ FREE(mt->level[l].image_offset);
nouveau_bo_ref(NULL, &mt->base.bo);
FREE(mt);
@@ -187,23 +214,18 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
struct nv50_miptree *mt = nv50_miptree(pt);
struct nv50_miptree_level *lvl = &mt->level[level];
struct pipe_surface *ps;
- int img;
+ unsigned img = 0;
if (pt->target == PIPE_TEXTURE_CUBE)
img = face;
- else
- if (pt->target == PIPE_TEXTURE_3D)
- img = zslice;
- else
- img = 0;
ps = CALLOC_STRUCT(pipe_surface);
if (!ps)
return NULL;
pipe_texture_reference(&ps->texture, pt);
ps->format = pt->format;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
+ ps->width = u_minify(pt->width0, level);
+ ps->height = u_minify(pt->height0, level);
ps->usage = flags;
pipe_reference_init(&ps->reference, 1);
ps->face = face;
@@ -211,6 +233,12 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
ps->zslice = zslice;
ps->offset = lvl->image_offset[img];
+ if (pt->target == PIPE_TEXTURE_3D) {
+ unsigned nb_h = util_format_get_nblocksy(pt->format, ps->height);
+ ps->offset += get_zslice_offset(lvl->tile_mode, zslice,
+ lvl->pitch, nb_h);
+ }
+
return ps;
}
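get_zslice_offset() above locates a z-slice inside the tiled 3D layout: within one volume tile, successive slices are tile_h * 64 bytes apart, and whole volume tiles are tile_d * align(nb_h, tile_h) * pitch bytes apart. A self-contained restatement of the same arithmetic, with the tile height/depth passed in explicitly and align_up as the usual round-up helper:

static unsigned align_up(unsigned x, unsigned a)
{
	return (x + a - 1) / a * a;
}

static unsigned
zslice_offset(unsigned z, unsigned pitch, unsigned nb_h,
              unsigned tile_h, unsigned tile_d)
{
	unsigned pitch_2d = tile_h * 64;                             /* next slice within a volume tile */
	unsigned pitch_3d = tile_d * align_up(nb_h, tile_h) * pitch; /* size of one volume tile */

	return (z % tile_d) * pitch_2d + (z / tile_d) * pitch_3d;
}
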
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index bf50982dd16..e74a526c626 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -92,15 +92,32 @@ struct nv50_reg {
int rhw; /* result hw for FP outputs, or interpolant index */
int acc; /* instruction where this reg is last read (first insn == 1) */
+
+ int vtx; /* vertex index, for GP inputs (TGSI Dimension.Index) */
+ int indirect[2]; /* index into pc->addr, or -1 */
+
+ ubyte buf_index; /* c{0 .. 15}[] or g{0 .. 15}[] */
};
#define NV50_MOD_NEG 1
#define NV50_MOD_ABS 2
+#define NV50_MOD_NEG_ABS (NV50_MOD_NEG | NV50_MOD_ABS)
#define NV50_MOD_SAT 4
+#define NV50_MOD_I32 8
+
+/* NV50_MOD_I32 is used to indicate integer mode for neg/abs */
+
+/* STACK: Conditionals and loops have to use the (per warp) stack.
+ * Stack entries consist of an entry type (divergent path, join at),
+ * a mask indicating the active threads of the warp, and an address.
+ * MPs can store 12 stack entries internally, if we need more (and
+ * we probably do), we have to create a stack buffer in VRAM.
+ */
+/* impose low limits for now */
+#define NV50_MAX_COND_NESTING 4
+#define NV50_MAX_LOOP_NESTING 3
-/* arbitrary limits */
-#define MAX_IF_DEPTH 4
-#define MAX_LOOP_DEPTH 4
+#define JOIN_ON(e) e; pc->p->exec_tail->inst[1] |= 2
struct nv50_pc {
struct nv50_program *p;
@@ -119,37 +136,49 @@ struct nv50_pc {
struct nv50_reg *param;
int param_nr;
struct nv50_reg *immd;
- float *immd_buf;
+ uint32_t *immd_buf;
int immd_nr;
struct nv50_reg **addr;
int addr_nr;
+ struct nv50_reg *sysval;
+ int sysval_nr;
struct nv50_reg *temp_temp[16];
+ struct nv50_program_exec *temp_temp_exec[16];
unsigned temp_temp_nr;
/* broadcast and destination replacement regs */
struct nv50_reg *r_brdc;
struct nv50_reg *r_dst[4];
+ struct nv50_reg reg_instances[16];
+ unsigned reg_instance_nr;
+
unsigned interp_mode[32];
/* perspective interpolation registers */
struct nv50_reg *iv_p;
struct nv50_reg *iv_c;
- struct nv50_program_exec *if_cond;
- struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
- struct nv50_program_exec *br_join[MAX_IF_DEPTH];
- struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
+ struct nv50_program_exec *if_insn[NV50_MAX_COND_NESTING];
+ struct nv50_program_exec *if_join[NV50_MAX_COND_NESTING];
+ struct nv50_program_exec *loop_brka[NV50_MAX_LOOP_NESTING];
int if_lvl, loop_lvl;
- unsigned loop_pos[MAX_LOOP_DEPTH];
+ unsigned loop_pos[NV50_MAX_LOOP_NESTING];
+
+ unsigned *insn_pos; /* actual program offset of each TGSI insn */
+ boolean in_subroutine;
/* current instruction and total number of insns */
unsigned insn_cur;
unsigned insn_nr;
boolean allow32;
+
+ uint8_t edgeflag_out;
};
+static struct nv50_reg *get_address_reg(struct nv50_pc *, struct nv50_reg *);
+
static INLINE void
ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
{
@@ -158,7 +187,10 @@ ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
reg->hw = hw;
reg->mod = 0;
reg->rhw = -1;
+ reg->vtx = -1;
reg->acc = 0;
+ reg->indirect[0] = reg->indirect[1] = -1;
+ reg->buf_index = (type == P_CONST) ? 1 : 0;
}
static INLINE unsigned
@@ -177,7 +209,7 @@ terminate_mbb(struct nv50_pc *pc)
/* remove records of temporary address register values */
for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
if (pc->r_addr[i].index < 0)
- pc->r_addr[i].rhw = -1;
+ pc->r_addr[i].acc = 0;
}
static void
@@ -226,7 +258,24 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
}
}
- assert(0);
+ NOUVEAU_ERR("out of registers\n");
+ abort();
+}
+
+static INLINE struct nv50_reg *
+reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+ struct nv50_reg *ri;
+
+ assert(pc->reg_instance_nr < 16);
+ ri = &pc->reg_instances[pc->reg_instance_nr++];
+ if (reg) {
+ alloc_reg(pc, reg);
+ *ri = *reg;
+ reg->indirect[0] = reg->indirect[1] = -1;
+ reg->mod = 0;
+ }
+ return ri;
}
/* XXX: For shaders that aren't executed linearly (e.g. shaders that
@@ -251,26 +300,11 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
}
}
- assert(0);
+ NOUVEAU_ERR("out of registers\n");
+ abort();
return NULL;
}
-/* Assign the hw of the discarded temporary register src
- * to the tgsi register dst and free src.
- */
-static void
-assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
- assert(src->index == -1 && src->hw != -1);
-
- if (dst->hw != -1)
- pc->r_temp[dst->hw] = NULL;
- pc->r_temp[src->hw] = dst;
- dst->hw = src->hw;
-
- FREE(src);
-}
-
/* release the hardware resource held by r */
static void
release_hw(struct nv50_pc *pc, struct nv50_reg *r)
@@ -329,38 +363,53 @@ free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4])
}
static struct nv50_reg *
-temp_temp(struct nv50_pc *pc)
+temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
{
if (pc->temp_temp_nr >= 16)
assert(0);
pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL);
+ pc->temp_temp_exec[pc->temp_temp_nr] = e;
return pc->temp_temp[pc->temp_temp_nr++];
}
+/* This *must* be called for all nv50_program_exec that have been
+ * given as argument to temp_temp, or the temps will be leaked !
+ */
static void
-kill_temp_temp(struct nv50_pc *pc)
+kill_temp_temp(struct nv50_pc *pc, struct nv50_program_exec *e)
{
int i;
-
+
for (i = 0; i < pc->temp_temp_nr; i++)
- free_temp(pc, pc->temp_temp[i]);
- pc->temp_temp_nr = 0;
+ if (pc->temp_temp_exec[i] == e)
+ free_temp(pc, pc->temp_temp[i]);
+ if (!e)
+ pc->temp_temp_nr = 0;
}
static int
-ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w)
+ctor_immd_4u32(struct nv50_pc *pc,
+ uint32_t x, uint32_t y, uint32_t z, uint32_t w)
{
- pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)),
- (pc->immd_nr + 1) * 4 * sizeof(float));
+ unsigned size = pc->immd_nr * 4 * sizeof(uint32_t);
+
+ pc->immd_buf = REALLOC(pc->immd_buf, size, size + 4 * sizeof(uint32_t));
+
pc->immd_buf[(pc->immd_nr * 4) + 0] = x;
pc->immd_buf[(pc->immd_nr * 4) + 1] = y;
pc->immd_buf[(pc->immd_nr * 4) + 2] = z;
pc->immd_buf[(pc->immd_nr * 4) + 3] = w;
-
+
return pc->immd_nr++;
}
+static INLINE int
+ctor_immd_4f32(struct nv50_pc *pc, float x, float y, float z, float w)
+{
+ return ctor_immd_4u32(pc, fui(x), fui(y), fui(z), fui(w));
+}
+
static struct nv50_reg *
alloc_immd(struct nv50_pc *pc, float f)
{
@@ -368,11 +417,11 @@ alloc_immd(struct nv50_pc *pc, float f)
unsigned hw;
for (hw = 0; hw < pc->immd_nr * 4; hw++)
- if (pc->immd_buf[hw] == f)
+ if (pc->immd_buf[hw] == fui(f))
break;
if (hw == pc->immd_nr * 4)
- hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4;
+ hw = ctor_immd_4f32(pc, f, -f, 0.5 * f, 0) * 4;
ctor_reg(r, P_IMMD, -1, hw);
return r;
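The immediate pool above now stores raw 32-bit patterns (uint32_t), and alloc_immd deduplicates by comparing fui(f), i.e. the float's bit pattern, rather than comparing float values. A self-contained sketch of that lookup; f_to_bits stands in for fui(), and the caller appends the usual (f, -f, 0.5*f, 0) vector on a miss:

#include <stdint.h>
#include <string.h>

static uint32_t f_to_bits(float f)
{
	uint32_t u;
	memcpy(&u, &f, sizeof(u));
	return u;
}

/* return the slot of an existing immediate with the same bit pattern,
 * or -1 if the value still has to be added to the pool */
static int find_immd(const uint32_t *pool, unsigned n, float f)
{
	unsigned i;

	for (i = 0; i < n; i++)
		if (pool[i] == f_to_bits(f))
			return (int)i;
	return -1;
}
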
@@ -398,6 +447,8 @@ emit(struct nv50_pc *pc, struct nv50_program_exec *e)
p->exec_head = e;
p->exec_tail = e;
p->exec_size += (e->inst[0] & 1) ? 2 : 1;
+
+ kill_temp_temp(pc, e);
}
static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *);
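Scratch temporaries from temp_temp() are now tagged with the nv50_program_exec they were requested for, and emit() above releases them via kill_temp_temp(pc, e) once that instruction is emitted. A minimal sketch of the ownership bookkeeping, with placeholder types; the real code tracks nv50_reg pointers and hardware register allocation:

#include <assert.h>

#define MAX_SCRATCH 16

struct scratch_pool {
	int      reg[MAX_SCRATCH];    /* hardware register indices */
	void    *owner[MAX_SCRATCH];  /* the instruction that requested each temp */
	unsigned nr;
};

static int scratch_alloc(struct scratch_pool *p, void *insn, int hw_reg)
{
	assert(p->nr < MAX_SCRATCH);
	p->reg[p->nr] = hw_reg;
	p->owner[p->nr] = insn;       /* remember who the temp belongs to */
	return p->reg[p->nr++];
}

/* called when 'insn' is emitted: release only the temps it owns */
static void scratch_release(struct scratch_pool *p, void *insn)
{
	unsigned i;

	for (i = 0; i < p->nr; i++)
		if (p->owner[i] == insn)
			p->owner[i] = NULL;   /* the hw register would be freed here */
}
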
@@ -418,10 +469,25 @@ is_immd(struct nv50_program_exec *e)
return FALSE;
}
+static boolean
+is_join(struct nv50_program_exec *e)
+{
+ if (is_long(e) && (e->inst[1] & 3) == 2)
+ return TRUE;
+ return FALSE;
+}
+
+static INLINE boolean
+is_control_flow(struct nv50_program_exec *e)
+{
+ return (e->inst[0] & 2);
+}
+
static INLINE void
set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
struct nv50_program_exec *e)
{
+ assert(!is_immd(e));
set_long(pc, e);
e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
e->inst[1] |= (pred << 7) | (idx << 12);
@@ -464,32 +530,47 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
static INLINE void
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
{
- unsigned val;
- float f = pc->immd_buf[imm->hw];
-
- if (imm->mod & NV50_MOD_ABS)
- f = fabsf(f);
- val = fui((imm->mod & NV50_MOD_NEG) ? -f : f);
-
set_long(pc, e);
- /*XXX: can't be predicated - bits overlap.. catch cases where both
- * are required and avoid them. */
+ /* XXX: can't be predicated - bits overlap; cases where both
+ * are required should be avoided by using pc->allow32 */
set_pred(pc, 0, 0, e);
set_pred_wr(pc, 0, 0, e);
e->inst[1] |= 0x00000002 | 0x00000001;
- e->inst[0] |= (val & 0x3f) << 16;
- e->inst[1] |= (val >> 6) << 2;
+ e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16;
+ e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2;
}
static INLINE void
set_addr(struct nv50_program_exec *e, struct nv50_reg *a)
{
+ assert(a->type == P_ADDR);
+
assert(!(e->inst[0] & 0x0c000000));
assert(!(e->inst[1] & 0x00000004));
e->inst[0] |= (a->hw & 3) << 26;
- e->inst[1] |= (a->hw >> 2) << 2;
+ e->inst[1] |= a->hw & 4;
+}
+
+static void
+emit_arl(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, uint8_t);
+
+static void
+emit_shl_imm(struct nv50_pc *, struct nv50_reg *, struct nv50_reg *, int);
+
+static void
+emit_mov_from_addr(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[1] = 0x40000000;
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+ set_addr(e, src);
+
+ emit(pc, e);
}
static void
@@ -508,68 +589,6 @@ emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst,
emit(pc, e);
}
-static struct nv50_reg *
-alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref)
-{
- int i;
- struct nv50_reg *a_tgsi = NULL, *a = NULL;
-
- if (!ref) {
- /* allocate for TGSI address reg */
- for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
- if (pc->r_addr[i].index >= 0)
- continue;
- if (pc->r_addr[i].rhw >= 0 &&
- pc->r_addr[i].acc == pc->insn_cur)
- continue;
-
- pc->r_addr[i].rhw = -1;
- pc->r_addr[i].index = i;
- return &pc->r_addr[i];
- }
- assert(0);
- return NULL;
- }
-
- /* Allocate and set an address reg so we can access 'ref'.
- *
- * If and r_addr has index < 0, it is not reserved for TGSI,
- * and index will be the negative of the TGSI addr index the
- * value in rhw is relative to, or -256 if rhw is an offset
- * from 0. If rhw < 0, the reg has not been initialized.
- */
- for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) {
- if (pc->r_addr[i].index >= 0) /* occupied for TGSI */
- continue;
- if (pc->r_addr[i].rhw < 0) { /* unused */
- a = &pc->r_addr[i];
- continue;
- }
- if (!a && pc->r_addr[i].acc != pc->insn_cur)
- a = &pc->r_addr[i];
-
- if (ref->hw - pc->r_addr[i].rhw >= 128)
- continue;
-
- if ((ref->acc >= 0 && pc->r_addr[i].index == -256) ||
- (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) {
- pc->r_addr[i].acc = pc->insn_cur;
- return &pc->r_addr[i];
- }
- }
- assert(a);
-
- if (ref->acc < 0)
- a_tgsi = pc->addr[ref->index];
-
- emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4);
-
- a->rhw = ref->hw & ~0x7f;
- a->acc = pc->insn_cur;
- a->index = a_tgsi ? -ref->index : -256;
- return a;
-}
-
#define INTERP_LINEAR 0
#define INTERP_FLAT 1
#define INTERP_PERSPECTIVE 2
@@ -613,17 +632,18 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
e->param.shift = s;
e->param.mask = m << (s % 32);
- if (src->hw > 127)
- set_addr(e, alloc_addr(pc, src));
+ if (src->hw < 0 || src->hw > 127) /* need (additional) address reg */
+ set_addr(e, get_address_reg(pc, src));
else
if (src->acc < 0) {
assert(src->type == P_CONST);
- set_addr(e, pc->addr[src->index]);
+ set_addr(e, pc->addr[src->indirect[0]]);
}
- e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
+ e->inst[1] |= (src->buf_index << 22);
}
+/* Never apply nv50_reg::mod in emit_mov, or carefully check the code !!! */
static void
emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
@@ -644,11 +664,17 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
if (src->type == P_IMMD || src->type == P_CONST) {
set_long(pc, e);
set_data(pc, src, 0x7f, 9, e);
- e->inst[1] |= 0x20000000; /* src0 const? */
+ e->inst[1] |= 0x20000000; /* mov from c[] */
} else {
if (src->type == P_ATTR) {
set_long(pc, e);
e->inst[1] |= 0x00200000;
+
+ if (src->vtx >= 0) {
+ /* indirect (vertex base + c) load from p[] */
+ e->inst[0] |= 0x01800000;
+ set_addr(e, get_address_reg(pc, src));
+ }
}
alloc_reg(pc, src);
@@ -659,9 +685,9 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
if (is_long(e) && !is_immd(e)) {
e->inst[1] |= 0x04000000; /* 32-bit */
- e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */
+ e->inst[1] |= 0x0000c000; /* 32-bit c[] load / lane mask 0:1 */
if (!(e->inst[1] & 0x20000000))
- e->inst[1] |= 0x00030000; /* "subsubop" 0xf */
+ e->inst[1] |= 0x00030000; /* lane mask 2:3 */
} else
e->inst[0] |= 0x00008000;
@@ -676,6 +702,45 @@ emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
FREE(imm);
}
+/* Assign the hw of the discarded temporary register src
+ * to the tgsi register dst and free src.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ assert(src->index == -1 && src->hw != -1);
+
+ if (pc->if_lvl || pc->loop_lvl ||
+ (dst->type != P_TEMP) ||
+ (src->hw < pc->result_nr * 4 &&
+ pc->p->type == PIPE_SHADER_FRAGMENT) ||
+ pc->p->info.opcode_count[TGSI_OPCODE_CAL] ||
+ pc->p->info.opcode_count[TGSI_OPCODE_BRA]) {
+
+ emit_mov(pc, dst, src);
+ free_temp(pc, src);
+ return;
+ }
+
+ if (dst->hw != -1)
+ pc->r_temp[dst->hw] = NULL;
+ pc->r_temp[src->hw] = dst;
+ dst->hw = src->hw;
+
+ FREE(src);
+}
+
+static void
+emit_nop(struct nv50_pc *pc)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0xf0000000;
+ set_long(pc, e);
+ e->inst[1] = 0xe0000000;
+ emit(pc, e);
+}
+
static boolean
check_swap_src_0_1(struct nv50_pc *pc,
struct nv50_reg **s0, struct nv50_reg **s1)
@@ -707,7 +772,7 @@ set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
struct nv50_reg *temp;
if (src->type != P_TEMP) {
- temp = temp_temp(pc);
+ temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
}
@@ -724,9 +789,14 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
if (src->type == P_ATTR) {
set_long(pc, e);
e->inst[1] |= 0x00200000;
+
+ if (src->vtx >= 0) {
+ e->inst[0] |= 0x01800000; /* src from p[] */
+ set_addr(e, get_address_reg(pc, src));
+ }
} else
if (src->type == P_CONST || src->type == P_IMMD) {
- struct nv50_reg *temp = temp_temp(pc);
+ struct nv50_reg *temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
@@ -742,19 +812,19 @@ static void
set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
{
if (src->type == P_ATTR) {
- struct nv50_reg *temp = temp_temp(pc);
+ struct nv50_reg *temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
} else
if (src->type == P_CONST || src->type == P_IMMD) {
- assert(!(e->inst[0] & 0x00800000));
- if (e->inst[0] & 0x01000000) {
- struct nv50_reg *temp = temp_temp(pc);
+ if (e->inst[0] & 0x01800000) {
+ struct nv50_reg *temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
} else {
+ assert(!(e->inst[0] & 0x00800000));
set_data(pc, src, 0x7f, 16, e);
e->inst[0] |= 0x00800000;
}
@@ -772,19 +842,19 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
set_long(pc, e);
if (src->type == P_ATTR) {
- struct nv50_reg *temp = temp_temp(pc);
+ struct nv50_reg *temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
} else
if (src->type == P_CONST || src->type == P_IMMD) {
- assert(!(e->inst[0] & 0x01000000));
- if (e->inst[0] & 0x00800000) {
- struct nv50_reg *temp = temp_temp(pc);
+ if (e->inst[0] & 0x01800000) {
+ struct nv50_reg *temp = temp_temp(pc, e);
emit_mov(pc, temp, src);
src = temp;
} else {
+ assert(!(e->inst[0] & 0x01000000));
set_data(pc, src, 0x7f, 32+14, e);
e->inst[0] |= 0x01000000;
}
@@ -795,6 +865,53 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
}
static void
+set_half_src(struct nv50_pc *pc, struct nv50_reg *src, int lh,
+ struct nv50_program_exec *e, int pos)
+{
+ struct nv50_reg *r = src;
+
+ alloc_reg(pc, r);
+ if (r->type != P_TEMP) {
+ r = temp_temp(pc, e);
+ emit_mov(pc, r, src);
+ }
+
+ if (r->hw > (NV50_SU_MAX_TEMP / 2)) {
+ NOUVEAU_ERR("out of low GPRs\n");
+ abort();
+ }
+
+ e->inst[pos / 32] |= ((src->hw * 2) + lh) << (pos % 32);
+}
+
+static void
+emit_mov_from_pred(struct nv50_pc *pc, struct nv50_reg *dst, int pred)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ assert(dst->type == P_TEMP);
+ e->inst[1] = 0x20000000 | (pred << 12);
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_mov_to_pred(struct nv50_pc *pc, int pred, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x000001fc;
+ e->inst[1] = 0xa0000008;
+ set_long(pc, e);
+ set_pred_wr(pc, 1, pred, e);
+ set_src_0_restricted(pc, src, e);
+
+ emit(pc, e);
+}
+
+static void
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
struct nv50_reg *src1)
{
@@ -809,7 +926,7 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
set_dst(pc, dst, e);
set_src_0(pc, src0, e);
if (src1->type == P_IMMD && !is_long(e)) {
- if (src0->mod & NV50_MOD_NEG)
+ if (src0->mod ^ src1->mod)
e->inst[0] |= 0x00008000;
set_immd(pc, src1, e);
} else {
@@ -866,11 +983,132 @@ emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
e->inst[0] |= dst->hw << 2;
e->inst[0] |= s << 16; /* shift left */
- set_src_0_restricted(pc, src, e);
+ set_src_0(pc, src, e);
emit(pc, e);
}
+static boolean
+address_reg_suitable(struct nv50_reg *a, struct nv50_reg *r)
+{
+ if (!r)
+ return FALSE;
+
+ if (r->vtx != a->vtx)
+ return FALSE;
+ if (r->vtx >= 0)
+ return (r->indirect[1] == a->indirect[1]);
+
+ if (r->hw < a->rhw || (r->hw - a->rhw) >= 128)
+ return FALSE;
+
+ if (a->index >= 0)
+ return (a->index == r->indirect[0]);
+ return (a->indirect[0] == r->indirect[0]);
+}
+
+static void
+load_vertex_base(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *a, int shift)
+{
+ struct nv50_reg mem, *temp;
+
+ ctor_reg(&mem, P_ATTR, -1, dst->vtx);
+
+ assert(dst->type == P_ADDR);
+ if (!a) {
+ emit_arl(pc, dst, &mem, 0);
+ return;
+ }
+ temp = alloc_temp(pc, NULL);
+
+ if (shift) {
+ emit_mov_from_addr(pc, temp, a);
+ if (shift < 0)
+ emit_shl_imm(pc, temp, temp, shift);
+ emit_arl(pc, dst, temp, MAX2(shift, 0));
+ }
+ emit_mov(pc, temp, &mem);
+ set_addr(pc->p->exec_tail, dst);
+
+ emit_arl(pc, dst, temp, 0);
+ free_temp(pc, temp);
+}
+
+/* case (ref == NULL): allocate address register for TGSI_FILE_ADDRESS
+ * case (vtx >= 0, acc >= 0): load vertex base from a[vtx * 4] to $aX
+ * case (vtx >= 0, acc < 0): load vertex base from s[$aY + vtx * 4] to $aX
+ * case (vtx < 0, acc >= 0): memory address too high to encode
+ * case (vtx < 0, acc < 0): get source register for TGSI_FILE_ADDRESS
+ */
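+/* For instance, set_src_0() above calls get_address_reg(pc, src) for
+ * per-vertex inputs (src->vtx >= 0) so the attribute can be fetched
+ * from p[] through the returned $a register.
+ */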
+static struct nv50_reg *
+get_address_reg(struct nv50_pc *pc, struct nv50_reg *ref)
+{
+ int i;
+ struct nv50_reg *a_ref, *a = NULL;
+
+ for (i = 0; i < NV50_SU_MAX_ADDR; ++i) {
+ if (pc->r_addr[i].acc == 0)
+ a = &pc->r_addr[i]; /* an unused address reg */
+ else
+ if (address_reg_suitable(&pc->r_addr[i], ref)) {
+ pc->r_addr[i].acc = pc->insn_cur;
+ return &pc->r_addr[i];
+ } else
+ if (!a && pc->r_addr[i].index < 0 &&
+ pc->r_addr[i].acc < pc->insn_cur)
+ a = &pc->r_addr[i];
+ }
+ if (!a) {
+ /* We'll be able to spill address regs when this
+ * mess is replaced with a proper compiler ...
+ */
+ NOUVEAU_ERR("out of address regs\n");
+ abort();
+ return NULL;
+ }
+
+ /* initialize and reserve for this TGSI instruction */
+ a->rhw = 0;
+ a->index = a->indirect[0] = a->indirect[1] = -1;
+ a->acc = pc->insn_cur;
+
+ if (!ref) {
+ a->vtx = -1;
+ return a;
+ }
+ a->vtx = ref->vtx;
+
+ /* now put in the correct value ... */
+
+ if (ref->vtx >= 0) {
+ a->indirect[1] = ref->indirect[1];
+
+ /* For an indirect vertex index, we need to shift the address right
+ * by 2: the address register will contain vtx * 16, but we need to
+ * load from a[vtx * 4].
+ */
+ load_vertex_base(pc, a, (ref->acc < 0) ?
+ pc->addr[ref->indirect[1]] : NULL, -2);
+ } else {
+ assert(ref->acc < 0 || ref->indirect[0] < 0);
+
+ a->rhw = ref->hw & ~0x7f;
+ a->indirect[0] = ref->indirect[0];
+ a_ref = (ref->acc < 0) ? pc->addr[ref->indirect[0]] : NULL;
+
+ emit_add_addr_imm(pc, a, a_ref, a->rhw * 4);
+ }
+ return a;
+}
+
+#define NV50_MAX_F32 0x880
+#define NV50_MAX_S32 0x08c
+#define NV50_MAX_U32 0x084
+#define NV50_MIN_F32 0x8a0
+#define NV50_MIN_S32 0x0ac
+#define NV50_MIN_U32 0x0a4
+
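+/* emit_minmax() below splits these values: (sub << 24) supplies the
+ * sub-op/type bits in inst[1], and bit 0x800 is folded into bit 31 of
+ * inst[0], so the f32 variants use opcode 0xb and the integer ones 0x3.
+ */
+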
static void
emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
struct nv50_reg *src0, struct nv50_reg *src1)
@@ -878,8 +1116,8 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst,
struct nv50_program_exec *e = exec(pc);
set_long(pc, e);
- e->inst[0] |= 0xb0000000;
- e->inst[1] |= (sub << 29);
+ e->inst[0] |= 0x30000000 | ((sub & 0x800) << 20);
+ e->inst[1] |= (sub << 24);
check_swap_src_0_1(pc, &src0, &src1);
set_dst(pc, dst, e);
@@ -898,7 +1136,6 @@ static INLINE void
emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
struct nv50_reg *src1)
{
- assert(src0 != src1);
src1->mod ^= NV50_MOD_NEG;
emit_add(pc, dst, src0, src1);
src1->mod ^= NV50_MOD_NEG;
@@ -921,6 +1158,8 @@ emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
op != TGSI_OPCODE_XOR)
assert(!"invalid bit op");
+ assert(!(src0->mod | src1->mod));
+
if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
set_immd(pc, src1, e);
if (op == TGSI_OPCODE_OR)
@@ -942,6 +1181,69 @@ emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
}
static void
+emit_not(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0xd0000000;
+ e->inst[1] = 0x0402c000;
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+ set_src_1(pc, src, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_shift(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *src1, unsigned dir)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x30000000;
+ e->inst[1] = 0xc4000000;
+
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+
+ if (src1->type == P_IMMD) {
+ e->inst[1] |= (1 << 20);
+ e->inst[0] |= (pc->immd_buf[src1->hw] & 0x7f) << 16;
+ } else
+ set_src_1(pc, src1, e);
+
+ if (dir != TGSI_OPCODE_SHL)
+ e->inst[1] |= (1 << 29);
+
+ if (dir == TGSI_OPCODE_ISHR)
+ e->inst[1] |= (1 << 27);
+
+ emit(pc, e);
+}
+
+static void
+emit_shl_imm(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src, int s)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x30000000;
+ e->inst[1] = 0xc4100000;
+ if (s < 0) {
+ e->inst[1] |= 1 << 29;
+ s = -s;
+ }
+ e->inst[1] |= ((s & 0x7f) << 16);
+
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src, e);
+
+ emit(pc, e);
+}
+
+static void
emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
struct nv50_reg *src1, struct nv50_reg *src2)
{
@@ -967,12 +1269,19 @@ static INLINE void
emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
struct nv50_reg *src1, struct nv50_reg *src2)
{
- assert(src2 != src0 && src2 != src1);
src2->mod ^= NV50_MOD_NEG;
emit_mad(pc, dst, src0, src1, src2);
src2->mod ^= NV50_MOD_NEG;
}
+#define NV50_FLOP_RCP 0
+#define NV50_FLOP_RSQ 2
+#define NV50_FLOP_LG2 3
+#define NV50_FLOP_SIN 4
+#define NV50_FLOP_COS 5
+#define NV50_FLOP_EX2 6
+
+/* rcp, rsqrt, lg2 support neg and abs */
static void
emit_flop(struct nv50_pc *pc, unsigned sub,
struct nv50_reg *dst, struct nv50_reg *src)
@@ -980,17 +1289,20 @@ emit_flop(struct nv50_pc *pc, unsigned sub,
struct nv50_program_exec *e = exec(pc);
e->inst[0] |= 0x90000000;
- if (sub) {
+ if (sub || src->mod) {
set_long(pc, e);
e->inst[1] |= (sub << 29);
}
set_dst(pc, dst, e);
+ set_src_0_restricted(pc, src, e);
- if (sub == 0 || sub == 2)
- set_src_0_restricted(pc, src, e);
- else
- set_src_0(pc, src, e);
+ assert(!src->mod || sub < 4);
+
+ if (src->mod & NV50_MOD_NEG)
+ e->inst[1] |= 0x04000000;
+ if (src->mod & NV50_MOD_ABS)
+ e->inst[1] |= 0x00100000;
emit(pc, e);
}
@@ -1007,6 +1319,11 @@ emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
set_long(pc, e);
e->inst[1] |= (6 << 29) | 0x00004000;
+ if (src->mod & NV50_MOD_NEG)
+ e->inst[1] |= 0x04000000;
+ if (src->mod & NV50_MOD_ABS)
+ e->inst[1] |= 0x00100000;
+
emit(pc, e);
}
@@ -1022,39 +1339,49 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
set_long(pc, e);
e->inst[1] |= (6 << 29);
+ if (src->mod & NV50_MOD_NEG)
+ e->inst[1] |= 0x04000000;
+ if (src->mod & NV50_MOD_ABS)
+ e->inst[1] |= 0x00100000;
+
emit(pc, e);
}
-#define CVTOP_RN 0x01
-#define CVTOP_FLOOR 0x03
-#define CVTOP_CEIL 0x05
-#define CVTOP_TRUNC 0x07
-#define CVTOP_SAT 0x08
-#define CVTOP_ABS 0x10
-
-/* 0x04 == 32 bit dst */
-/* 0x40 == dst is float */
-/* 0x80 == src is float */
-#define CVT_F32_F32 0xc4
-#define CVT_F32_S32 0x44
-#define CVT_S32_F32 0x8c
-#define CVT_S32_S32 0x0c
-#define CVT_NEG 0x20
-#define CVT_RI 0x08
+#define CVT_RN (0x00 << 16)
+#define CVT_FLOOR (0x02 << 16)
+#define CVT_CEIL (0x04 << 16)
+#define CVT_TRUNC (0x06 << 16)
+#define CVT_SAT (0x08 << 16)
+#define CVT_ABS (0x10 << 16)
+
+#define CVT_X32_X32 0x04004000
+#define CVT_X32_S32 0x04014000
+#define CVT_F32_F32 ((0xc0 << 24) | CVT_X32_X32)
+#define CVT_S32_F32 ((0x88 << 24) | CVT_X32_X32)
+#define CVT_U32_F32 ((0x80 << 24) | CVT_X32_X32)
+#define CVT_F32_S32 ((0x40 << 24) | CVT_X32_S32)
+#define CVT_F32_U32 ((0x40 << 24) | CVT_X32_X32)
+#define CVT_S32_S32 ((0x08 << 24) | CVT_X32_S32)
+#define CVT_S32_U32 ((0x08 << 24) | CVT_X32_X32)
+#define CVT_U32_S32 ((0x00 << 24) | CVT_X32_S32)
+
+#define CVT_NEG 0x20000000
+#define CVT_RI 0x08000000
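+
+/* These OR together into the cvn word passed to emit_cvt(), e.g.
+ * CVT_FLOOR | CVT_F32_F32 | CVT_RI for FLR or CVT_TRUNC | CVT_S32_F32
+ * for F2I; NV50_MOD_NEG/ABS on the source add CVT_NEG/CVT_ABS below.
+ */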
static void
emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
- int wp, unsigned cvn, unsigned fmt)
+ int wp, uint32_t cvn)
{
struct nv50_program_exec *e;
e = exec(pc);
- set_long(pc, e);
- e->inst[0] |= 0xa0000000;
- e->inst[1] |= 0x00004000; /* 32 bit src */
- e->inst[1] |= (cvn << 16);
- e->inst[1] |= (fmt << 24);
+ if (src->mod & NV50_MOD_NEG) cvn |= CVT_NEG;
+ if (src->mod & NV50_MOD_ABS) cvn |= CVT_ABS;
+
+ e->inst[0] = 0xa0000000;
+ e->inst[1] = cvn;
+ set_long(pc, e);
set_src_0(pc, src, e);
if (wp >= 0)
@@ -1079,10 +1406,12 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
* 0x6 = GE
* 0x7 = set condition code ? (used before bra.lt/le/gt/ge)
* 0x8 = unordered bit (allows NaN)
+ *
+ * mode = 0x04 (u32), 0x0c (s32), 0x80 (f32)
*/
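+/* For instance, map_tgsi_setop_hw() below maps TGSI SLT to ccode 0x1,
+ * mode 0x80, and the opcode handler then calls
+ *   emit_set(pc, 0x1, dst, -1, src0, src1, 0x80);
+ */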
static void
emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
- struct nv50_reg *src0, struct nv50_reg *src1)
+ struct nv50_reg *src0, struct nv50_reg *src1, uint8_t mode)
{
static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
@@ -1097,16 +1426,10 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
if (dst && dst->type != P_TEMP)
dst = alloc_temp(pc, NULL);
- /* set.u32 */
set_long(pc, e);
- e->inst[0] |= 0xb0000000;
+ e->inst[0] |= 0x30000000 | (mode << 24);
e->inst[1] |= 0x60000000 | (ccode << 14);
- /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but
- * that doesn't seem to match what the hw actually does
- e->inst[1] |= 0x04000000; << breaks things, u32 by default ?
- */
-
if (wp >= 0)
set_pred_wr(pc, 1, wp, e);
if (dst)
@@ -1120,35 +1443,147 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
set_src_1(pc, src1, e);
emit(pc, e);
- pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
- /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
- if (rdst)
- emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32);
+ if (rdst && mode == 0x80) /* convert to float ? */
+ emit_cvt(pc, rdst, dst, -1, CVT_ABS | CVT_F32_S32);
if (rdst && rdst != dst)
free_temp(pc, dst);
}
-static INLINE unsigned
-map_tgsi_setop_cc(unsigned op)
+static INLINE void
+map_tgsi_setop_hw(unsigned op, uint8_t *cc, uint8_t *ty)
{
switch (op) {
- case TGSI_OPCODE_SLT: return 0x1;
- case TGSI_OPCODE_SGE: return 0x6;
- case TGSI_OPCODE_SEQ: return 0x2;
- case TGSI_OPCODE_SGT: return 0x4;
- case TGSI_OPCODE_SLE: return 0x3;
- case TGSI_OPCODE_SNE: return 0xd;
+ case TGSI_OPCODE_SLT: *cc = 0x1; *ty = 0x80; break;
+ case TGSI_OPCODE_SGE: *cc = 0x6; *ty = 0x80; break;
+ case TGSI_OPCODE_SEQ: *cc = 0x2; *ty = 0x80; break;
+ case TGSI_OPCODE_SGT: *cc = 0x4; *ty = 0x80; break;
+ case TGSI_OPCODE_SLE: *cc = 0x3; *ty = 0x80; break;
+ case TGSI_OPCODE_SNE: *cc = 0xd; *ty = 0x80; break;
+
+ case TGSI_OPCODE_ISLT: *cc = 0x1; *ty = 0x0c; break;
+ case TGSI_OPCODE_ISGE: *cc = 0x6; *ty = 0x0c; break;
+ case TGSI_OPCODE_USEQ: *cc = 0x2; *ty = 0x04; break;
+ case TGSI_OPCODE_USGE: *cc = 0x6; *ty = 0x04; break;
+ case TGSI_OPCODE_USLT: *cc = 0x1; *ty = 0x04; break;
+ case TGSI_OPCODE_USNE: *cc = 0x5; *ty = 0x04; break;
default:
assert(0);
- return 0;
+ return;
}
}
+static void
+emit_add_b32(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *rsrc1)
+{
+ struct nv50_program_exec *e = exec(pc);
+ struct nv50_reg *src1;
+
+ e->inst[0] = 0x20000000;
+
+ alloc_reg(pc, rsrc1);
+ check_swap_src_0_1(pc, &src0, &rsrc1);
+
+ src1 = rsrc1;
+ if (src0->mod & rsrc1->mod & NV50_MOD_NEG) {
+ src1 = temp_temp(pc, e);
+ emit_cvt(pc, src1, rsrc1, -1, CVT_S32_S32);
+ }
+
+ if (!pc->allow32 || src1->hw > 63 ||
+ (src1->type != P_TEMP && src1->type != P_IMMD))
+ set_long(pc, e);
+
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+
+ if (is_long(e)) {
+ e->inst[1] |= 1 << 26;
+ set_src_2(pc, src1, e);
+ } else {
+ e->inst[0] |= 0x8000;
+ if (src1->type == P_IMMD)
+ set_immd(pc, src1, e);
+ else
+ set_src_1(pc, src1, e);
+ }
+
+ if (src0->mod & NV50_MOD_NEG)
+ e->inst[0] |= 1 << 28;
+ else
+ if (src1->mod & NV50_MOD_NEG)
+ e->inst[0] |= 1 << 22;
+
+ emit(pc, e);
+}
+
+static void
+emit_mad_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1,
+ struct nv50_reg *src2)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x60000000;
+ if (!pc->allow32)
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+
+ set_half_src(pc, src0, lh_0, e, 9);
+ set_half_src(pc, src1, lh_1, e, 16);
+ alloc_reg(pc, src2);
+ if (is_long(e) || (src2->type != P_TEMP) || (src2->hw != dst->hw))
+ set_src_2(pc, src2, e);
+
+ emit(pc, e);
+}
+
+static void
+emit_mul_u16(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, int lh_0, struct nv50_reg *src1, int lh_1)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x40000000;
+ set_long(pc, e);
+ set_dst(pc, dst, e);
+
+ set_half_src(pc, src0, lh_0, e, 9);
+ set_half_src(pc, src1, lh_1, e, 16);
+
+ emit(pc, e);
+}
+
+static void
+emit_sad(struct nv50_pc *pc, struct nv50_reg *dst,
+ struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0x50000000;
+ if (!pc->allow32)
+ set_long(pc, e);
+ check_swap_src_0_1(pc, &src0, &src1);
+ set_dst(pc, dst, e);
+ set_src_0(pc, src0, e);
+ set_src_1(pc, src1, e);
+ alloc_reg(pc, src2);
+ if (is_long(e) || (src2->type != dst->type) || (src2->hw != dst->hw))
+ set_src_2(pc, src2, e);
+
+ if (is_long(e))
+ e->inst[1] |= 0x0c << 24;
+ else
+ e->inst[0] |= 0x81 << 8;
+
+ emit(pc, e);
+}
+
static INLINE void
emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
- emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
+ emit_cvt(pc, dst, src, -1, CVT_FLOOR | CVT_F32_F32 | CVT_RI);
}
static void
@@ -1157,24 +1592,18 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
{
struct nv50_reg *temp = alloc_temp(pc, NULL);
- emit_flop(pc, 3, temp, v);
+ emit_flop(pc, NV50_FLOP_LG2, temp, v);
emit_mul(pc, temp, temp, e);
emit_preex2(pc, temp, temp);
- emit_flop(pc, 6, dst, temp);
+ emit_flop(pc, NV50_FLOP_EX2, dst, temp);
free_temp(pc, temp);
}
static INLINE void
-emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
- emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32);
-}
-
-static INLINE void
emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
- emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32);
+ emit_cvt(pc, dst, src, -1, CVT_SAT | CVT_F32_F32);
}
static void
@@ -1192,18 +1621,18 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
if (mask & (3 << 1)) {
tmp[0] = alloc_temp(pc, NULL);
- emit_minmax(pc, 4, tmp[0], src[0], zero);
+ emit_minmax(pc, NV50_MAX_F32, tmp[0], src[0], zero);
}
if (mask & (1 << 2)) {
set_pred_wr(pc, 1, 0, pc->p->exec_tail);
- tmp[1] = temp_temp(pc);
- emit_minmax(pc, 4, tmp[1], src[1], zero);
+ tmp[1] = temp_temp(pc, NULL);
+ emit_minmax(pc, NV50_MAX_F32, tmp[1], src[1], zero);
- tmp[3] = temp_temp(pc);
- emit_minmax(pc, 4, tmp[3], src[3], neg128);
- emit_minmax(pc, 5, tmp[3], tmp[3], pos128);
+ tmp[3] = temp_temp(pc, NULL);
+ emit_minmax(pc, NV50_MAX_F32, tmp[3], src[3], neg128);
+ emit_minmax(pc, NV50_MIN_F32, tmp[3], tmp[3], pos128);
emit_pow(pc, dst[2], tmp[1], tmp[3]);
emit_mov(pc, dst[2], zero);
@@ -1231,35 +1660,127 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
FREE(one);
}
-static INLINE void
-emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
- emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG);
-}
-
static void
emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
{
struct nv50_program_exec *e;
const int r_pred = 1;
- unsigned cvn = CVT_F32_F32;
-
- if (src->mod & NV50_MOD_NEG)
- cvn |= CVT_NEG;
- /* write predicate reg */
- emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn);
- /* conditional discard */
e = exec(pc);
- e->inst[0] = 0x00000002;
+ e->inst[0] = 0x00000002; /* discard */
+ set_long(pc, e); /* sets cond code to ALWAYS */
+
+ if (src) {
+ set_pred(pc, 0x1 /* cc = LT */, r_pred, e);
+ /* write to predicate reg */
+ emit_cvt(pc, NULL, src, r_pred, CVT_F32_F32);
+ }
+
+ emit(pc, e);
+}
+
+static struct nv50_program_exec *
+emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = (op << 28) | 2;
+ set_long(pc, e);
+ if (pred >= 0)
+ set_pred(pc, cc, pred, e);
+
+ emit(pc, e);
+ return e;
+}
+
+static INLINE struct nv50_program_exec *
+emit_breakaddr(struct nv50_pc *pc)
+{
+ return emit_control_flow(pc, 0x4, -1, 0);
+}
+
+static INLINE void
+emit_break(struct nv50_pc *pc, int pred, unsigned cc)
+{
+ emit_control_flow(pc, 0x5, pred, cc);
+}
+
+static INLINE struct nv50_program_exec *
+emit_joinat(struct nv50_pc *pc)
+{
+ return emit_control_flow(pc, 0xa, -1, 0);
+}
+
+static INLINE struct nv50_program_exec *
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
+{
+ return emit_control_flow(pc, 0x1, pred, cc);
+}
+
+static INLINE struct nv50_program_exec *
+emit_call(struct nv50_pc *pc, int pred, unsigned cc)
+{
+ return emit_control_flow(pc, 0x2, pred, cc);
+}
+
+static INLINE void
+emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
+{
+ emit_control_flow(pc, 0x3, pred, cc);
+}
+
+static void
+emit_prim_cmd(struct nv50_pc *pc, unsigned cmd)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0xf0000000 | (cmd << 9);
+ e->inst[1] = 0xc0000000;
set_long(pc, e);
- set_pred(pc, 0x1 /* LT */, r_pred, e);
+
emit(pc, e);
}
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV_SRC1 3
+
+/* For a quad of threads (top left, top right, bottom left, bottom right
+ * pixels), perform a possibly different operation per lane and take src0
+ * from a specific thread (lane_src0).
+ */
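+/* For example, emit_texlod_sequence() below issues
+ *   emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
+ * where 0x55 packs QOP_SUBR into every 2-bit lane slot, writing only $p0
+ * (dst == NULL, wp == 0); the result is 0 exactly when a lane's LOD
+ * matches that of lane i.
+ */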
+static void
+emit_quadop(struct nv50_pc *pc, struct nv50_reg *dst, int wp, int lane_src0,
+ struct nv50_reg *src0, struct nv50_reg *src1, ubyte qop)
+{
+ struct nv50_program_exec *e = exec(pc);
+
+ e->inst[0] = 0xc0000000;
+ e->inst[1] = 0x80000000;
+ set_long(pc, e);
+ e->inst[0] |= lane_src0 << 16;
+ set_src_0(pc, src0, e);
+ set_src_2(pc, src1, e);
+
+ if (wp >= 0)
+ set_pred_wr(pc, 1, wp, e);
+
+ if (dst)
+ set_dst(pc, dst, e);
+ else {
+ e->inst[0] |= 0x000001fc;
+ e->inst[1] |= 0x00000008;
+ }
+
+ e->inst[0] |= (qop & 3) << 20;
+ e->inst[1] |= (qop >> 2) << 22;
+
+ emit(pc, e);
+}
+
static void
load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
- struct nv50_reg **src, boolean proj)
+ struct nv50_reg **src, unsigned arg, boolean proj)
{
int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod };
@@ -1267,8 +1788,8 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
src[1]->mod |= NV50_MOD_ABS;
src[2]->mod |= NV50_MOD_ABS;
- emit_minmax(pc, 4, t[2], src[0], src[1]);
- emit_minmax(pc, 4, t[2], src[2], t[2]);
+ emit_minmax(pc, NV50_MAX_F32, t[2], src[0], src[1]);
+ emit_minmax(pc, NV50_MAX_F32, t[2], src[2], t[2]);
src[0]->mod = mod[0];
src[1]->mod = mod[1];
@@ -1276,7 +1797,11 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
if (proj && 0 /* looks more correct without this */)
emit_mul(pc, t[2], t[2], src[3]);
- emit_flop(pc, 0, t[2], t[2]);
+ else
+ if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
+ emit_mov(pc, t[3], src[3]);
+
+ emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);
emit_mul(pc, t[0], src[0], t[2]);
emit_mul(pc, t[1], src[1], t[2]);
@@ -1284,89 +1809,221 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
}
static void
-emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
- struct nv50_reg **src, unsigned unit, unsigned type, boolean proj)
+load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
+ struct nv50_reg **src, unsigned dim, unsigned arg)
{
- struct nv50_reg *t[4];
- struct nv50_program_exec *e;
+ unsigned c, mode;
+
+ if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
+ mode = pc->interp_mode[src[0]->index] | INTERP_PERSPECTIVE;
- unsigned c, mode, dim;
+ t[3]->rhw = src[3]->rhw;
+ emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
+ emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);
+
+ for (c = 0; c < dim; ++c) {
+ t[c]->rhw = src[c]->rhw;
+ emit_interp(pc, t[c], t[3], mode);
+ }
+ if (arg != dim) { /* depth reference value */
+ t[dim]->rhw = src[2]->rhw;
+ emit_interp(pc, t[dim], t[3], mode);
+ }
+ } else {
+ /* XXX: for some reason the blob sometimes uses MAD
+ * (mad f32 $rX $rY $rZ neg $r63)
+ */
+ emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
+ for (c = 0; c < dim; ++c)
+ emit_mul(pc, t[c], src[c], t[3]);
+ if (arg != dim) /* depth reference value */
+ emit_mul(pc, t[dim], src[2], t[3]);
+ }
+}
+static INLINE void
+get_tex_dim(unsigned type, unsigned *dim, unsigned *arg)
+{
switch (type) {
case TGSI_TEXTURE_1D:
- dim = 1;
+ *arg = *dim = 1;
+ break;
+ case TGSI_TEXTURE_SHADOW1D:
+ *dim = 1;
+ *arg = 2;
break;
case TGSI_TEXTURE_UNKNOWN:
case TGSI_TEXTURE_2D:
- case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */
case TGSI_TEXTURE_RECT:
- dim = 2;
+ *arg = *dim = 2;
+ break;
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ *dim = 2;
+ *arg = 3;
break;
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
- case TGSI_TEXTURE_SHADOW2D:
- case TGSI_TEXTURE_SHADOWRECT: /* XXX */
- dim = 3;
+ *dim = *arg = 3;
break;
default:
assert(0);
break;
}
+}
- /* some cards need t[0]'s hw index to be a multiple of 4 */
- alloc_temp4(pc, t, 0);
+/* We shouldn't execute TEXLOD if any of the pixels in a quad have
+ * different LOD values, so branch off groups of equal LOD.
+ */
+static void
+emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
+ struct nv50_reg *src, struct nv50_program_exec *tex)
+{
+ struct nv50_program_exec *join_at;
+ unsigned i, target = pc->p->exec_size + 9 * 2;
- if (type == TGSI_TEXTURE_CUBE) {
- load_cube_tex_coords(pc, t, src, proj);
- } else
- if (proj) {
- if (src[0]->type == P_TEMP && src[0]->rhw != -1) {
- mode = pc->interp_mode[src[0]->index];
-
- t[3]->rhw = src[3]->rhw;
- emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
- emit_flop(pc, 0, t[3], t[3]);
-
- for (c = 0; c < dim; c++) {
- t[c]->rhw = src[c]->rhw;
- emit_interp(pc, t[c], t[3],
- (mode | INTERP_PERSPECTIVE));
- }
- } else {
- emit_flop(pc, 0, t[3], src[3]);
- for (c = 0; c < dim; c++)
- emit_mul(pc, t[c], src[c], t[3]);
+ if (pc->p->type != PIPE_SHADER_FRAGMENT) {
+ emit(pc, tex);
+ return;
+ }
+ pc->allow32 = FALSE;
- /* XXX: for some reason the blob sometimes uses MAD:
- * emit_mad(pc, t[c], src[0][c], t[3], t[3])
- * pc->p->exec_tail->inst[1] |= 0x080fc000;
- */
+ /* Subtract each pixel's LOD from the top left pixel's LOD and jump
+ * to the texlod insn if the result is 0, then repeat with the other
+ * pixels as the reference.
+ */
+ join_at = emit_joinat(pc);
+ emit_quadop(pc, NULL, 0, 0, tlod, tlod, 0x55);
+ emit_branch(pc, 0, 2)->param.index = target;
+
+ for (i = 1; i < 4; ++i) {
+ emit_quadop(pc, NULL, 0, i, tlod, tlod, 0x55);
+ emit_branch(pc, 0, 2)->param.index = target;
+ }
+
+ emit_mov(pc, tlod, src); /* target */
+ emit(pc, tex); /* texlod */
+
+ join_at->param.index = target + 2 * 2;
+ JOIN_ON(emit_nop(pc)); /* join _after_ tex */
+}
+
+static void
+emit_texbias_sequence(struct nv50_pc *pc, struct nv50_reg *t[4], unsigned arg,
+ struct nv50_program_exec *tex)
+{
+ struct nv50_program_exec *e;
+ struct nv50_reg imm_1248, *t123[4][4], *r_bits = alloc_temp(pc, NULL);
+ int r_pred = 0;
+ unsigned n, c, i, cc[4] = { 0x0a, 0x13, 0x11, 0x10 };
+
+ pc->allow32 = FALSE;
+ ctor_reg(&imm_1248, P_IMMD, -1, ctor_immd_4u32(pc, 1, 2, 4, 8) * 4);
+
+ /* Subtract bias value of thread i from bias values of each thread,
+ * store result in r_pred, and set bit i in r_bits if result was 0.
+ */
+ assert(arg < 4);
+ for (i = 0; i < 4; ++i, ++imm_1248.hw) {
+ emit_quadop(pc, NULL, r_pred, i, t[arg], t[arg], 0x55);
+ emit_mov(pc, r_bits, &imm_1248);
+ set_pred(pc, 2, r_pred, pc->p->exec_tail);
+ }
+ emit_mov_to_pred(pc, r_pred, r_bits);
+
+ /* The lanes of a quad are now grouped by the bit in r_pred they have
+ * set. Put the input values for TEX into a new register set for each
+ * group and execute TEX only for a specific group.
+ * We cannot use the same register set for each group because we need
+ * the derivatives, which are implicitly calculated, to be correct.
+ */
+ for (i = 1; i < 4; ++i) {
+ alloc_temp4(pc, t123[i], 0);
+
+ for (c = 0; c <= arg; ++c)
+ emit_mov(pc, t123[i][c], t[c]);
+
+ *(e = exec(pc)) = *(tex);
+ e->inst[0] &= ~0x01fc;
+ set_dst(pc, t123[i][0], e);
+ set_pred(pc, cc[i], r_pred, e);
+ emit(pc, e);
+ }
+ /* finally TEX on the original regs (where we kept the input) */
+ set_pred(pc, cc[0], r_pred, tex);
+ emit(pc, tex);
+
+ /* put the 3 * n other results into regs for lane 0 */
+ n = popcnt4(((e->inst[0] >> 25) & 0x3) | ((e->inst[1] >> 12) & 0xc));
+ for (i = 1; i < 4; ++i) {
+ for (c = 0; c < n; ++c) {
+ emit_mov(pc, t[c], t123[i][c]);
+ set_pred(pc, cc[i], r_pred, pc->p->exec_tail);
}
- } else {
- for (c = 0; c < dim; c++)
- emit_mov(pc, t[c], src[c]);
+ free_temp4(pc, t123[i]);
}
+ emit_nop(pc);
+ free_temp(pc, r_bits);
+}
+
+static void
+emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
+ struct nv50_reg **src, unsigned unit, unsigned type,
+ boolean proj, int bias_lod)
+{
+ struct nv50_reg *t[4];
+ struct nv50_program_exec *e;
+ unsigned c, dim, arg;
+
+ /* t[i] must be within a single 128 bit super-reg */
+ alloc_temp4(pc, t, 0);
+
e = exec(pc);
+ e->inst[0] = 0xf0000000;
set_long(pc, e);
- e->inst[0] |= 0xf0000000;
- e->inst[1] |= 0x00000004;
set_dst(pc, t[0], e);
- e->inst[0] |= (unit << 9);
- if (dim == 2)
- e->inst[0] |= 0x00400000;
- else
- if (dim == 3) {
- e->inst[0] |= 0x00800000;
- if (type == TGSI_TEXTURE_CUBE)
- e->inst[0] |= 0x08000000;
+ /* TIC and TSC binding indices (TSC is ignored as TSC_LINKED = TRUE): */
+ e->inst[0] |= (unit << 9) /* | (unit << 17) */;
+
+ /* live flag (don't set if TEX results affect input to another TEX): */
+ /* e->inst[0] |= 0x00000004; */
+
+ get_tex_dim(type, &dim, &arg);
+
+ if (type == TGSI_TEXTURE_CUBE) {
+ e->inst[0] |= 0x08000000;
+ load_cube_tex_coords(pc, t, src, arg, proj);
+ } else
+ if (proj)
+ load_proj_tex_coords(pc, t, src, dim, arg);
+ else {
+ for (c = 0; c < dim; c++)
+ emit_mov(pc, t[c], src[c]);
+ if (arg != dim) /* depth reference value (always src.z here) */
+ emit_mov(pc, t[dim], src[2]);
}
e->inst[0] |= (mask & 0x3) << 25;
e->inst[1] |= (mask & 0xc) << 12;
- emit(pc, e);
+ if (!bias_lod) {
+ e->inst[0] |= (arg - 1) << 22;
+ emit(pc, e);
+ } else
+ if (bias_lod < 0) {
+ assert(pc->p->type == PIPE_SHADER_FRAGMENT);
+ e->inst[0] |= arg << 22;
+ e->inst[1] |= 0x20000000; /* texbias */
+ emit_mov(pc, t[arg], src[3]);
+ emit_texbias_sequence(pc, t, arg, e);
+ } else {
+ e->inst[0] |= arg << 22;
+ e->inst[1] |= 0x40000000; /* texlod */
+ emit_mov(pc, t[arg], src[3]);
+ emit_texlod_sequence(pc, t[arg], src[3], e);
+ }
+
#if 1
c = 0;
if (mask & 1) emit_mov(pc, dst[0], t[c++]);
@@ -1389,46 +2046,14 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
}
static void
-emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
- struct nv50_program_exec **join)
-{
- struct nv50_program_exec *e = exec(pc);
-
- if (join) {
- set_long(pc, e);
- e->inst[0] |= 0xa0000002;
- emit(pc, e);
- *join = e;
- e = exec(pc);
- }
-
- set_long(pc, e);
- e->inst[0] |= 0x10000002;
- if (pred >= 0)
- set_pred(pc, cc, pred, e);
- emit(pc, e);
-}
-
-static void
-emit_nop(struct nv50_pc *pc)
-{
- struct nv50_program_exec *e = exec(pc);
-
- e->inst[0] = 0xf0000000;
- set_long(pc, e);
- e->inst[1] = 0xe0000000;
- emit(pc, e);
-}
-
-static void
emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
{
struct nv50_program_exec *e = exec(pc);
assert(src->type == P_TEMP);
- e->inst[0] = 0xc0140000;
- e->inst[1] = 0x89800000;
+ e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0240000 : 0xc0140000;
+ e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x86400000 : 0x89800000;
set_long(pc, e);
set_dst(pc, dst, e);
set_src_0(pc, src, e);
@@ -1444,11 +2069,8 @@ emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
assert(src->type == P_TEMP);
- if (!(src->mod & NV50_MOD_NEG)) /* ! double negation */
- emit_neg(pc, src, src);
-
- e->inst[0] = 0xc0150000;
- e->inst[1] = 0x8a400000;
+ e->inst[0] = (src->mod & NV50_MOD_NEG) ? 0xc0250000 : 0xc0150000;
+ e->inst[1] = (src->mod & NV50_MOD_NEG) ? 0x85800000 : 0x8a400000;
set_long(pc, e);
set_dst(pc, dst, e);
set_src_0(pc, src, e);
@@ -1470,6 +2092,21 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
q = 0x0403c000;
m = 0xffff7fff;
break;
+ case 0x2:
+ case 0x3:
+ /* ADD, SUB, SUBR b32 */
+ m = ~(0x8000 | (127 << 16));
+ q = ((e->inst[0] & (~m)) >> 2) | (1 << 26);
+ break;
+ case 0x5:
+ /* SAD */
+ m = ~(0x81 << 8);
+ q = (0x0c << 24) | ((e->inst[0] & (0x7f << 2)) << 12);
+ break;
+ case 0x6:
+ /* MAD u16 */
+ q = (e->inst[0] & (0x7f << 2)) << 12;
+ break;
case 0x8:
/* INTERP (move centroid, perspective and flat bits) */
m = ~0x03000100;
@@ -1506,50 +2143,65 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
}
/* Some operations support an optional negation flag. */
-static boolean
-negate_supported(const struct tgsi_full_instruction *insn, int i)
+static int
+get_supported_mods(const struct tgsi_full_instruction *insn, int i)
{
- int s;
-
switch (insn->Instruction.Opcode) {
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_DDX:
case TGSI_OPCODE_DDY:
case TGSI_OPCODE_DP3:
case TGSI_OPCODE_DP4:
- case TGSI_OPCODE_MUL:
+ case TGSI_OPCODE_EX2:
case TGSI_OPCODE_KIL:
- case TGSI_OPCODE_ADD:
- case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_LG2:
case TGSI_OPCODE_MAD:
- break;
+ case TGSI_OPCODE_MUL:
case TGSI_OPCODE_POW:
- if (i == 1)
- break;
- return FALSE;
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
+ case TGSI_OPCODE_SCS:
+ case TGSI_OPCODE_SIN:
+ case TGSI_OPCODE_SUB:
+ return NV50_MOD_NEG;
+ case TGSI_OPCODE_MAX:
+ case TGSI_OPCODE_MIN:
+ case TGSI_OPCODE_INEG: /* tgsi src sign toggle/set would be stupid */
+ return NV50_MOD_ABS;
+ case TGSI_OPCODE_CEIL:
+ case TGSI_OPCODE_FLR:
+ case TGSI_OPCODE_TRUNC:
+ return NV50_MOD_NEG | NV50_MOD_ABS;
+ case TGSI_OPCODE_F2I:
+ case TGSI_OPCODE_F2U:
+ case TGSI_OPCODE_I2F:
+ case TGSI_OPCODE_U2F:
+ return NV50_MOD_NEG | NV50_MOD_ABS | NV50_MOD_I32;
+ case TGSI_OPCODE_UADD:
+ return NV50_MOD_NEG | NV50_MOD_I32;
+ case TGSI_OPCODE_SAD:
+ case TGSI_OPCODE_SHL:
+ case TGSI_OPCODE_IMAX:
+ case TGSI_OPCODE_IMIN:
+ case TGSI_OPCODE_ISHR:
+ case TGSI_OPCODE_NOT:
+ case TGSI_OPCODE_UMAD:
+ case TGSI_OPCODE_UMAX:
+ case TGSI_OPCODE_UMIN:
+ case TGSI_OPCODE_UMUL:
+ case TGSI_OPCODE_USHR:
+ return NV50_MOD_I32;
default:
- return FALSE;
- }
-
- /* Watch out for possible multiple uses of an nv50_reg, we
- * can't use nv50_reg::neg in these cases.
- */
- for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) {
- if (s == i)
- continue;
- if ((insn->FullSrcRegisters[s].SrcRegister.Index ==
- insn->FullSrcRegisters[i].SrcRegister.Index) &&
- (insn->FullSrcRegisters[s].SrcRegister.File ==
- insn->FullSrcRegisters[i].SrcRegister.File))
- return FALSE;
+ return 0;
}
-
- return TRUE;
}
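+/* A modifier the opcode cannot fold is resolved in tgsi_src() with an
+ * explicit cvt into a temporary before the instruction is emitted.
+ */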
/* Return a read mask for source registers deduced from opcode & write mask. */
static unsigned
nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
{
- unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask;
+ unsigned x, mask = insn->Dst[0].Register.WriteMask;
switch (insn->Instruction.Opcode) {
case TGSI_OPCODE_COS:
@@ -1564,30 +2216,40 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
case TGSI_OPCODE_DST:
return mask & (c ? 0xa : 0x6);
case TGSI_OPCODE_EX2:
+ case TGSI_OPCODE_EXP:
case TGSI_OPCODE_LG2:
+ case TGSI_OPCODE_LOG:
case TGSI_OPCODE_POW:
case TGSI_OPCODE_RCP:
case TGSI_OPCODE_RSQ:
case TGSI_OPCODE_SCS:
return 0x1;
+ case TGSI_OPCODE_IF:
+ return 0x1;
case TGSI_OPCODE_LIT:
return 0xb;
case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXL:
case TGSI_OPCODE_TXP:
{
- const struct tgsi_instruction_ext_texture *tex;
+ const struct tgsi_instruction_texture *tex;
- assert(insn->Instruction.Extended);
- tex = &insn->InstructionExtTexture;
+ assert(insn->Instruction.Texture);
+ tex = &insn->Texture;
mask = 0x7;
- if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)
- mask |= 0x8;
+ if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
+ insn->Instruction.Opcode != TGSI_OPCODE_TXD)
+ mask |= 0x8; /* bias, lod or proj */
switch (tex->Texture) {
case TGSI_TEXTURE_1D:
mask &= 0x9;
break;
+ case TGSI_TEXTURE_SHADOW1D:
+ mask &= 0x5;
+ break;
case TGSI_TEXTURE_2D:
mask &= 0xb;
break;
@@ -1612,23 +2274,28 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
static struct nv50_reg *
tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
{
- switch (dst->DstRegister.File) {
+ switch (dst->Register.File) {
case TGSI_FILE_TEMPORARY:
- return &pc->temp[dst->DstRegister.Index * 4 + c];
+ return &pc->temp[dst->Register.Index * 4 + c];
case TGSI_FILE_OUTPUT:
- return &pc->result[dst->DstRegister.Index * 4 + c];
+ return &pc->result[dst->Register.Index * 4 + c];
case TGSI_FILE_ADDRESS:
{
- struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c];
+ struct nv50_reg *r = pc->addr[dst->Register.Index * 4 + c];
if (!r) {
- r = alloc_addr(pc, NULL);
- pc->addr[dst->DstRegister.Index * 4 + c] = r;
+ r = get_address_reg(pc, NULL);
+ r->index = dst->Register.Index * 4 + c;
+ pc->addr[r->index] = r;
}
assert(r);
return r;
}
case TGSI_FILE_NULL:
return NULL;
+ case TGSI_FILE_SYSTEM_VALUE:
+ assert(pc->sysval[dst->Register.Index].type == P_RESULT);
+ assert(c == 0);
+ return &pc->sysval[dst->Register.Index];
default:
break;
}
@@ -1638,14 +2305,14 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
static struct nv50_reg *
tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
- boolean neg)
+ int mod)
{
struct nv50_reg *r = NULL;
- struct nv50_reg *temp;
- unsigned sgn, c, swz;
+ struct nv50_reg *temp = NULL;
+ unsigned sgn, c, swz, cvn;
- if (src->SrcRegister.File != TGSI_FILE_CONSTANT)
- assert(!src->SrcRegister.Indirect);
+ if (src->Register.File != TGSI_FILE_CONSTANT)
+ assert(!src->Register.Indirect);
sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);
@@ -1655,38 +2322,54 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
case TGSI_SWIZZLE_Y:
case TGSI_SWIZZLE_Z:
case TGSI_SWIZZLE_W:
- switch (src->SrcRegister.File) {
+ switch (src->Register.File) {
case TGSI_FILE_INPUT:
- r = &pc->attr[src->SrcRegister.Index * 4 + c];
+ r = &pc->attr[src->Register.Index * 4 + c];
+
+ if (!src->Dimension.Dimension)
+ break;
+ r = reg_instance(pc, r);
+ r->vtx = src->Dimension.Index;
+
+ if (!src->Dimension.Indirect)
+ break;
+ swz = tgsi_util_get_src_register_swizzle(
+ &src->DimIndirect, 0);
+ r->acc = -1;
+ r->indirect[1] = src->DimIndirect.Index * 4 + swz;
break;
case TGSI_FILE_TEMPORARY:
- r = &pc->temp[src->SrcRegister.Index * 4 + c];
+ r = &pc->temp[src->Register.Index * 4 + c];
break;
case TGSI_FILE_CONSTANT:
- if (!src->SrcRegister.Indirect) {
- r = &pc->param[src->SrcRegister.Index * 4 + c];
+ if (!src->Register.Indirect) {
+ r = &pc->param[src->Register.Index * 4 + c];
break;
}
/* Indicate indirection by setting r->acc < 0 and
* use the index field to select the address reg.
*/
- r = MALLOC_STRUCT(nv50_reg);
+ r = reg_instance(pc, NULL);
+ ctor_reg(r, P_CONST, -1, src->Register.Index * 4 + c);
+
swz = tgsi_util_get_src_register_swizzle(
- &src->SrcRegisterInd, 0);
- ctor_reg(r, P_CONST,
- src->SrcRegisterInd.Index * 4 + swz,
- src->SrcRegister.Index * 4 + c);
+ &src->Indirect, 0);
r->acc = -1;
+ r->indirect[0] = src->Indirect.Index * 4 + swz;
break;
case TGSI_FILE_IMMEDIATE:
- r = &pc->immd[src->SrcRegister.Index * 4 + c];
+ r = &pc->immd[src->Register.Index * 4 + c];
break;
case TGSI_FILE_SAMPLER:
- break;
+ return NULL;
case TGSI_FILE_ADDRESS:
- r = pc->addr[src->SrcRegister.Index * 4 + c];
+ r = pc->addr[src->Register.Index * 4 + c];
assert(r);
break;
+ case TGSI_FILE_SYSTEM_VALUE:
+ assert(c == 0);
+ r = &pc->sysval[src->Register.Index];
+ break;
default:
assert(0);
break;
@@ -1697,33 +2380,34 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src,
break;
}
+ cvn = (mod & NV50_MOD_I32) ? CVT_S32_S32 : CVT_F32_F32;
+
switch (sgn) {
- case TGSI_UTIL_SIGN_KEEP:
- break;
case TGSI_UTIL_SIGN_CLEAR:
- temp = temp_temp(pc);
- emit_abs(pc, temp, r);
- r = temp;
- break;
- case TGSI_UTIL_SIGN_TOGGLE:
- if (neg)
- r->mod = NV50_MOD_NEG;
- else {
- temp = temp_temp(pc);
- emit_neg(pc, temp, r);
- r = temp;
- }
+ r->mod = NV50_MOD_ABS;
break;
case TGSI_UTIL_SIGN_SET:
- temp = temp_temp(pc);
- emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | CVT_NEG);
- r = temp;
+ r->mod = NV50_MOD_NEG_ABS;
+ break;
+ case TGSI_UTIL_SIGN_TOGGLE:
+ r->mod = NV50_MOD_NEG;
break;
default:
- assert(0);
+ assert(!r->mod && sgn == TGSI_UTIL_SIGN_KEEP);
break;
}
+ if ((r->mod & mod) != r->mod) {
+ temp = temp_temp(pc, NULL);
+ emit_cvt(pc, temp, r, -1, cvn);
+ r->mod = 0;
+ r = temp;
+ } else
+ r->mod |= mod & NV50_MOD_I32;
+
+ assert(r);
+ if (r->acc >= 0 && r->vtx < 0 && r != temp)
+ return reg_instance(pc, r); /* will clear r->mod */
return r;
}
@@ -1776,9 +2460,13 @@ nv50_tgsi_dst_revdep(unsigned op, int s, int c)
assert(0);
return 0x0;
}
+ case TGSI_OPCODE_EXP:
+ case TGSI_OPCODE_LOG:
case TGSI_OPCODE_LIT:
case TGSI_OPCODE_SCS:
case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXL:
case TGSI_OPCODE_TXP:
/* these take care of dangerous swizzles themselves */
return 0x0;
@@ -1814,34 +2502,51 @@ nv50_kill_branch(struct nv50_pc *pc)
if (pc->if_insn[lvl]->next != pc->p->exec_tail)
return FALSE;
+ if (is_immd(pc->p->exec_tail))
+ return FALSE;
/* if ccode == 'true', the BRA is from an ELSE and the predicate
* reg may no longer be valid, since we currently always use $p0
*/
if (has_pred(pc->if_insn[lvl], 0xf))
return FALSE;
- assert(pc->if_insn[lvl] && pc->br_join[lvl]);
+ assert(pc->if_insn[lvl] && pc->if_join[lvl]);
- /* We'll use the exec allocated for JOIN_AT (as we can't easily
- * update prev's next); if exec_tail is BRK, update the pointer.
+ /* We'll use the exec allocated for JOIN_AT (we can't easily
+ * access nv50_program_exec's prev).
*/
- if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
- pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
-
pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
- *pc->br_join[lvl] = *pc->p->exec_tail;
+ *pc->if_join[lvl] = *pc->p->exec_tail;
FREE(pc->if_insn[lvl]);
FREE(pc->p->exec_tail);
- pc->p->exec_tail = pc->br_join[lvl];
+ pc->p->exec_tail = pc->if_join[lvl];
pc->p->exec_tail->next = NULL;
set_pred(pc, 0xd, 0, pc->p->exec_tail);
return TRUE;
}
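+/* Copy fragment shader results into the hw regs recorded in rhw when
+ * they were computed in different regs; called at END and, outside
+ * subroutines, before RET.
+ */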
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+ struct nv50_reg reg;
+ unsigned i;
+
+ ctor_reg(&reg, P_TEMP, -1, -1);
+
+ for (i = 0; i < pc->result_nr * 4; ++i) {
+ if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
+ continue;
+ if (pc->result[i].rhw != pc->result[i].hw) {
+ reg.hw = pc->result[i].rhw;
+ emit_mov(pc, &reg, &pc->result[i]);
+ }
+ }
+}
+
static boolean
nv50_program_tx_insn(struct nv50_pc *pc,
const struct tgsi_full_instruction *inst)
@@ -1850,38 +2555,38 @@ nv50_program_tx_insn(struct nv50_pc *pc,
unsigned mask, sat, unit;
int i, c;
- mask = inst->FullDstRegisters[0].DstRegister.WriteMask;
+ mask = inst->Dst[0].Register.WriteMask;
sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE;
memset(src, 0, sizeof(src));
for (c = 0; c < 4; c++) {
if ((mask & (1 << c)) && !pc->r_dst[c])
- dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);
+ dst[c] = tgsi_dst(pc, c, &inst->Dst[0]);
else
dst[c] = pc->r_dst[c];
rdst[c] = dst[c];
}
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
- const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i];
+ const struct tgsi_full_src_register *fs = &inst->Src[i];
unsigned src_mask;
- boolean neg_supp;
+ int mod_supp;
src_mask = nv50_tgsi_src_mask(inst, i);
- neg_supp = negate_supported(inst, i);
+ mod_supp = get_supported_mods(inst, i);
- if (fs->SrcRegister.File == TGSI_FILE_SAMPLER)
- unit = fs->SrcRegister.Index;
+ if (fs->Register.File == TGSI_FILE_SAMPLER)
+ unit = fs->Register.Index;
for (c = 0; c < 4; c++)
if (src_mask & (1 << c))
- src[i][c] = tgsi_src(pc, c, fs, neg_supp);
+ src[i][c] = tgsi_src(pc, c, fs, mod_supp);
}
brdc = temp = pc->r_brdc;
if (brdc && brdc->type != P_TEMP) {
- temp = temp_temp(pc);
+ temp = temp_temp(pc, NULL);
if (sat)
brdc = temp;
} else
@@ -1890,7 +2595,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
if (!(mask & (1 << c)) || dst[c]->type == P_TEMP)
continue;
/* rdst[c] = dst[c]; */ /* done above */
- dst[c] = temp_temp(pc);
+ dst[c] = temp_temp(pc, NULL);
}
}
@@ -1901,7 +2606,8 @@ nv50_program_tx_insn(struct nv50_pc *pc,
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
- emit_abs(pc, dst[c], src[0][c]);
+ emit_cvt(pc, dst[c], src[0][c], -1,
+ CVT_ABS | CVT_F32_F32);
}
break;
case TGSI_OPCODE_ADD:
@@ -1922,26 +2628,42 @@ nv50_program_tx_insn(struct nv50_pc *pc,
}
break;
case TGSI_OPCODE_ARL:
- assert(src[0][0]);
- temp = temp_temp(pc);
- emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32);
- emit_arl(pc, dst[0], temp, 4);
+ temp = temp_temp(pc, NULL);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, temp, src[0][c], -1,
+ CVT_FLOOR | CVT_S32_F32);
+ emit_arl(pc, dst[c], temp, 4);
+ }
break;
case TGSI_OPCODE_BGNLOOP:
+ pc->loop_brka[pc->loop_lvl] = emit_breakaddr(pc);
pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
terminate_mbb(pc);
break;
+ case TGSI_OPCODE_BGNSUB:
+ assert(!pc->in_subroutine);
+ pc->in_subroutine = TRUE;
+ /* probably not necessary, but align to 8 byte boundary */
+ if (!is_long(pc->p->exec_tail))
+ convert_to_long(pc, pc->p->exec_tail);
+ break;
case TGSI_OPCODE_BRK:
- emit_branch(pc, -1, 0, NULL);
assert(pc->loop_lvl > 0);
- pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
+ emit_break(pc, -1, 0);
+ break;
+ case TGSI_OPCODE_CAL:
+ assert(inst->Label.Label < pc->insn_nr);
+ emit_call(pc, -1, 0)->param.index = inst->Label.Label;
+ /* replaced by actual offset in nv50_program_fixup_insns */
break;
case TGSI_OPCODE_CEIL:
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
emit_cvt(pc, dst[c], src[0][c], -1,
- CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
+ CVT_CEIL | CVT_F32_F32 | CVT_RI);
}
break;
case TGSI_OPCODE_CMP:
@@ -1949,24 +2671,29 @@ nv50_program_tx_insn(struct nv50_pc *pc,
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
- emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32);
+ emit_cvt(pc, NULL, src[0][c], 1, CVT_F32_F32);
emit_mov(pc, dst[c], src[1][c]);
set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */
emit_mov(pc, dst[c], src[2][c]);
set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
}
break;
+ case TGSI_OPCODE_CONT:
+ assert(pc->loop_lvl > 0);
+ emit_branch(pc, -1, 0)->param.index =
+ pc->loop_pos[pc->loop_lvl - 1];
+ break;
case TGSI_OPCODE_COS:
if (mask & 8) {
emit_precossin(pc, temp, src[0][3]);
- emit_flop(pc, 5, dst[3], temp);
+ emit_flop(pc, NV50_FLOP_COS, dst[3], temp);
if (!(mask &= 7))
break;
if (temp == dst[3])
- temp = brdc = temp_temp(pc);
+ temp = brdc = temp_temp(pc, NULL);
}
emit_precossin(pc, temp, src[0][0]);
- emit_flop(pc, 5, brdc, temp);
+ emit_flop(pc, NV50_FLOP_COS, brdc, temp);
break;
case TGSI_OPCODE_DDX:
for (c = 0; c < 4; c++) {
@@ -2010,11 +2737,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
emit_mov_immdval(pc, dst[0], 1.0f);
break;
case TGSI_OPCODE_ELSE:
- emit_branch(pc, -1, 0, NULL);
+ emit_branch(pc, -1, 0);
pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
terminate_mbb(pc);
break;
+ case TGSI_OPCODE_EMIT:
+ emit_prim_cmd(pc, 1);
+ break;
case TGSI_OPCODE_ENDIF:
pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
@@ -2022,26 +2752,76 @@ nv50_program_tx_insn(struct nv50_pc *pc,
if (nv50_kill_branch(pc) == TRUE)
break;
- if (pc->br_join[pc->if_lvl]) {
- pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
- pc->br_join[pc->if_lvl] = NULL;
+ if (pc->if_join[pc->if_lvl]) {
+ pc->if_join[pc->if_lvl]->param.index = pc->p->exec_size;
+ pc->if_join[pc->if_lvl] = NULL;
}
terminate_mbb(pc);
/* emit a NOP as join point; we could set the join on the next
 * insn instead, but we would have to make sure it is long and !immd
 */
- emit_nop(pc);
- pc->p->exec_tail->inst[1] |= 2;
+ JOIN_ON(emit_nop(pc));
break;
case TGSI_OPCODE_ENDLOOP:
- emit_branch(pc, -1, 0, NULL);
- pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
- pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
+ emit_branch(pc, -1, 0)->param.index =
+ pc->loop_pos[--pc->loop_lvl];
+ pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
+ terminate_mbb(pc);
+ break;
+ case TGSI_OPCODE_ENDPRIM:
+ emit_prim_cmd(pc, 2);
+ break;
+ case TGSI_OPCODE_ENDSUB:
+ assert(pc->in_subroutine);
terminate_mbb(pc);
+ pc->in_subroutine = FALSE;
break;
case TGSI_OPCODE_EX2:
emit_preex2(pc, temp, src[0][0]);
- emit_flop(pc, 6, brdc, temp);
+ emit_flop(pc, NV50_FLOP_EX2, brdc, temp);
+ break;
+ case TGSI_OPCODE_EXP:
+ {
+ struct nv50_reg *t[2];
+
+ assert(!temp);
+ t[0] = temp_temp(pc, NULL);
+ t[1] = temp_temp(pc, NULL);
+
+ if (mask & 0x6)
+ emit_mov(pc, t[0], src[0][0]);
+ if (mask & 0x3)
+ emit_flr(pc, t[1], src[0][0]);
+
+ if (mask & (1 << 1))
+ emit_sub(pc, dst[1], t[0], t[1]);
+ if (mask & (1 << 0)) {
+ emit_preex2(pc, t[1], t[1]);
+ emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]);
+ }
+ if (mask & (1 << 2)) {
+ emit_preex2(pc, t[0], t[0]);
+ emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]);
+ }
+ if (mask & (1 << 3))
+ emit_mov_immdval(pc, dst[3], 1.0f);
+ }
+ break;
+ case TGSI_OPCODE_F2I:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, dst[c], src[0][c], -1,
+ CVT_TRUNC | CVT_S32_F32);
+ }
+ break;
+ case TGSI_OPCODE_F2U:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, dst[c], src[0][c], -1,
+ CVT_TRUNC | CVT_U32_F32);
+ }
break;
case TGSI_OPCODE_FLR:
for (c = 0; c < 4; c++) {
@@ -2051,7 +2831,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
}
break;
case TGSI_OPCODE_FRC:
- temp = temp_temp(pc);
+ temp = temp_temp(pc, NULL);
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
@@ -2059,30 +2839,88 @@ nv50_program_tx_insn(struct nv50_pc *pc,
emit_sub(pc, dst[c], src[0][c], temp);
}
break;
+ case TGSI_OPCODE_I2F:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_S32);
+ }
+ break;
case TGSI_OPCODE_IF:
- /* emitting a join_at may not be necessary */
- assert(pc->if_lvl < MAX_IF_DEPTH);
- /* set_pred_wr(pc, 1, 0, pc->if_cond); */
- emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN,
- CVT_F32_F32);
- emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
- pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+ assert(pc->if_lvl < NV50_MAX_COND_NESTING);
+ emit_cvt(pc, NULL, src[0][0], 0, CVT_ABS | CVT_F32_F32);
+ pc->if_join[pc->if_lvl] = emit_joinat(pc);
+ pc->if_insn[pc->if_lvl++] = emit_branch(pc, 0, 2);
terminate_mbb(pc);
break;
+ case TGSI_OPCODE_IMAX:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 0x08c, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_IMIN:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 0x0ac, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_INEG:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, dst[c], src[0][c], -1,
+ CVT_S32_S32 | CVT_NEG);
+ }
+ break;
case TGSI_OPCODE_KIL:
+ assert(src[0][0] && src[0][1] && src[0][2] && src[0][3]);
emit_kil(pc, src[0][0]);
emit_kil(pc, src[0][1]);
emit_kil(pc, src[0][2]);
emit_kil(pc, src[0][3]);
break;
+ case TGSI_OPCODE_KILP:
+ emit_kil(pc, NULL);
+ break;
case TGSI_OPCODE_LIT:
emit_lit(pc, &dst[0], mask, &src[0][0]);
break;
case TGSI_OPCODE_LG2:
- emit_flop(pc, 3, brdc, src[0][0]);
+ emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]);
+ break;
+ case TGSI_OPCODE_LOG:
+ {
+ struct nv50_reg *t[2];
+
+ t[0] = temp_temp(pc, NULL);
+ if (mask & (1 << 1))
+ t[1] = temp_temp(pc, NULL);
+ else
+ t[1] = t[0];
+
+ emit_cvt(pc, t[0], src[0][0], -1, CVT_ABS | CVT_F32_F32);
+ emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
+ if (mask & (1 << 2))
+ emit_mov(pc, dst[2], t[1]);
+ emit_flr(pc, t[1], t[1]);
+ if (mask & (1 << 0))
+ emit_mov(pc, dst[0], t[1]);
+ if (mask & (1 << 1)) {
+ t[1]->mod = NV50_MOD_NEG;
+ emit_preex2(pc, t[1], t[1]);
+ t[1]->mod = 0;
+ emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]);
+ emit_mul(pc, dst[1], t[0], t[1]);
+ }
+ if (mask & (1 << 3))
+ emit_mov_immdval(pc, dst[3], 1.0f);
+ }
break;
case TGSI_OPCODE_LRP:
- temp = temp_temp(pc);
+ temp = temp_temp(pc, NULL);
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
@@ -2101,14 +2939,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
- emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]);
+ emit_minmax(pc, 0x880, dst[c], src[0][c], src[1][c]);
}
break;
case TGSI_OPCODE_MIN:
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
- emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]);
+ emit_minmax(pc, 0x8a0, dst[c], src[0][c], src[1][c]);
}
break;
case TGSI_OPCODE_MOV:
@@ -2125,39 +2963,73 @@ nv50_program_tx_insn(struct nv50_pc *pc,
emit_mul(pc, dst[c], src[0][c], src[1][c]);
}
break;
+ case TGSI_OPCODE_NOT:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_not(pc, dst[c], src[0][c]);
+ }
+ break;
case TGSI_OPCODE_POW:
emit_pow(pc, brdc, src[0][0], src[1][0]);
break;
case TGSI_OPCODE_RCP:
- emit_flop(pc, 0, brdc, src[0][0]);
+ if (!sat && popcnt4(mask) == 1)
+ brdc = dst[ffs(mask) - 1];
+ emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
+ break;
+ case TGSI_OPCODE_RET:
+ if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine)
+ nv50_fp_move_results(pc);
+ emit_ret(pc, -1, 0);
break;
case TGSI_OPCODE_RSQ:
- emit_flop(pc, 2, brdc, src[0][0]);
+ if (!sat && popcnt4(mask) == 1)
+ brdc = dst[ffs(mask) - 1];
+ src[0][0]->mod |= NV50_MOD_ABS;
+ emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
+ break;
+ case TGSI_OPCODE_SAD:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_sad(pc, dst[c], src[0][c], src[1][c], src[2][c]);
+ }
break;
case TGSI_OPCODE_SCS:
- temp = temp_temp(pc);
+ temp = temp_temp(pc, NULL);
if (mask & 3)
emit_precossin(pc, temp, src[0][0]);
if (mask & (1 << 0))
- emit_flop(pc, 5, dst[0], temp);
+ emit_flop(pc, NV50_FLOP_COS, dst[0], temp);
if (mask & (1 << 1))
- emit_flop(pc, 4, dst[1], temp);
+ emit_flop(pc, NV50_FLOP_SIN, dst[1], temp);
if (mask & (1 << 2))
emit_mov_immdval(pc, dst[2], 0.0);
if (mask & (1 << 3))
emit_mov_immdval(pc, dst[3], 1.0);
break;
+ case TGSI_OPCODE_SHL:
+ case TGSI_OPCODE_ISHR:
+ case TGSI_OPCODE_USHR:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_shift(pc, dst[c], src[0][c], src[1][c],
+ inst->Instruction.Opcode);
+ }
+ break;
case TGSI_OPCODE_SIN:
if (mask & 8) {
emit_precossin(pc, temp, src[0][3]);
- emit_flop(pc, 4, dst[3], temp);
+ emit_flop(pc, NV50_FLOP_SIN, dst[3], temp);
if (!(mask &= 7))
break;
if (temp == dst[3])
- temp = brdc = temp_temp(pc);
+ temp = brdc = temp_temp(pc, NULL);
}
emit_precossin(pc, temp, src[0][0]);
- emit_flop(pc, 4, brdc, temp);
+ emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
break;
case TGSI_OPCODE_SLT:
case TGSI_OPCODE_SGE:
@@ -2165,12 +3037,23 @@ nv50_program_tx_insn(struct nv50_pc *pc,
case TGSI_OPCODE_SGT:
case TGSI_OPCODE_SLE:
case TGSI_OPCODE_SNE:
- i = map_tgsi_setop_cc(inst->Instruction.Opcode);
+ case TGSI_OPCODE_ISLT:
+ case TGSI_OPCODE_ISGE:
+ case TGSI_OPCODE_USEQ:
+ case TGSI_OPCODE_USGE:
+ case TGSI_OPCODE_USLT:
+ case TGSI_OPCODE_USNE:
+ {
+ uint8_t cc, ty;
+
+ map_tgsi_setop_hw(inst->Instruction.Opcode, &cc, &ty);
+
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
- emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]);
+ emit_set(pc, cc, dst[c], -1, src[0][c], src[1][c], ty);
}
+ }
break;
case TGSI_OPCODE_SUB:
for (c = 0; c < 4; c++) {
@@ -2181,22 +3064,91 @@ nv50_program_tx_insn(struct nv50_pc *pc,
break;
case TGSI_OPCODE_TEX:
emit_tex(pc, dst, mask, src[0], unit,
- inst->InstructionExtTexture.Texture, FALSE);
+ inst->Texture.Texture, FALSE, 0);
+ break;
+ case TGSI_OPCODE_TXB:
+ emit_tex(pc, dst, mask, src[0], unit,
+ inst->Texture.Texture, FALSE, -1);
+ break;
+ case TGSI_OPCODE_TXL:
+ emit_tex(pc, dst, mask, src[0], unit,
+ inst->Texture.Texture, FALSE, 1);
break;
case TGSI_OPCODE_TXP:
emit_tex(pc, dst, mask, src[0], unit,
- inst->InstructionExtTexture.Texture, TRUE);
+ inst->Texture.Texture, TRUE, 0);
break;
case TGSI_OPCODE_TRUNC:
for (c = 0; c < 4; c++) {
if (!(mask & (1 << c)))
continue;
emit_cvt(pc, dst[c], src[0][c], -1,
- CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
+ CVT_TRUNC | CVT_F32_F32 | CVT_RI);
+ }
+ break;
+ case TGSI_OPCODE_U2F:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_cvt(pc, dst[c], src[0][c], -1, CVT_F32_U32);
+ }
+ break;
+ case TGSI_OPCODE_UADD:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_add_b32(pc, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_UMAX:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 0x084, dst[c], src[0][c], src[1][c]);
+ }
+ break;
+ case TGSI_OPCODE_UMIN:
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_minmax(pc, 0x0a4, dst[c], src[0][c], src[1][c]);
+ }
+ break;
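+ /* UMAD/UMUL below build the 32-bit multiply from 16-bit halves
+ * (assuming lh = 0 selects the low half):
+ *   a * b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 16)
+ * taken modulo 2^32.
+ */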
+ case TGSI_OPCODE_UMAD:
+ {
+ assert(!temp);
+ temp = temp_temp(pc, NULL);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+ emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+ temp);
+ emit_shl_imm(pc, temp, temp, 16);
+ emit_mad_u16(pc, temp, src[0][c], 0, src[1][c], 0,
+ temp);
+ emit_add_b32(pc, dst[c], temp, src[2][c]);
}
+ }
+ break;
+ case TGSI_OPCODE_UMUL:
+ {
+ assert(!temp);
+ temp = temp_temp(pc, NULL);
+ for (c = 0; c < 4; c++) {
+ if (!(mask & (1 << c)))
+ continue;
+ emit_mul_u16(pc, temp, src[0][c], 0, src[1][c], 1);
+ emit_mad_u16(pc, temp, src[0][c], 1, src[1][c], 0,
+ temp);
+ emit_shl_imm(pc, temp, temp, 16);
+ emit_mad_u16(pc, dst[c], src[0][c], 0, src[1][c], 0,
+ temp);
+ }
+ }
break;
case TGSI_OPCODE_XPD:
- temp = temp_temp(pc);
+ temp = temp_temp(pc, NULL);
if (mask & (1 << 0)) {
emit_mul(pc, temp, src[0][2], src[1][1]);
emit_msb(pc, dst[0], src[0][1], src[1][2], temp);
@@ -2213,6 +3165,21 @@ nv50_program_tx_insn(struct nv50_pc *pc,
emit_mov_immdval(pc, dst[3], 1.0);
break;
case TGSI_OPCODE_END:
+ if (pc->p->type == PIPE_SHADER_FRAGMENT)
+ nv50_fp_move_results(pc);
+
+ /* last insn must be long so it can have the exit bit set */
+ if (!is_long(pc->p->exec_tail))
+ convert_to_long(pc, pc->p->exec_tail);
+ else
+ if (is_immd(pc->p->exec_tail) ||
+ is_join(pc->p->exec_tail) ||
+ is_control_flow(pc->p->exec_tail))
+ emit_nop(pc);
+
+ pc->p->exec_tail->inst[1] |= 1; /* set exit bit */
+
+ terminate_mbb(pc);
break;
default:
NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
@@ -2239,39 +3206,34 @@ nv50_program_tx_insn(struct nv50_pc *pc,
}
}
- for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
- for (c = 0; c < 4; c++) {
- if (!src[i][c])
- continue;
- src[i][c]->mod = 0;
- if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)
- FREE(src[i][c]);
- else
- if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST)
- FREE(src[i][c]); /* indirect constant */
- }
- }
+ kill_temp_temp(pc, NULL);
+ pc->reg_instance_nr = 0;
- kill_temp_temp(pc);
return TRUE;
}
static void
prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
{
- struct nv50_reg *reg = NULL;
+ struct nv50_reg *r, *reg = NULL;
const struct tgsi_full_src_register *src;
const struct tgsi_dst_register *dst;
unsigned i, c, k, mask;
- dst = &insn->FullDstRegisters[0].DstRegister;
+ dst = &insn->Dst[0].Register;
mask = dst->WriteMask;
if (dst->File == TGSI_FILE_TEMPORARY)
- reg = pc->temp;
+ reg = pc->temp;
else
- if (dst->File == TGSI_FILE_OUTPUT)
- reg = pc->result;
+ if (dst->File == TGSI_FILE_OUTPUT) {
+ reg = pc->result;
+
+ if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
+ dst->Index == pc->edgeflag_out &&
+ insn->Src[0].Register.File == TGSI_FILE_INPUT)
+ pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
+ }
if (reg) {
for (c = 0; c < 4; c++) {
@@ -2282,12 +3244,12 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
}
for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
- src = &insn->FullSrcRegisters[i];
+ src = &insn->Src[i];
- if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)
+ if (src->Register.File == TGSI_FILE_TEMPORARY)
reg = pc->temp;
else
- if (src->SrcRegister.File == TGSI_FILE_INPUT)
+ if (src->Register.File == TGSI_FILE_INPUT)
reg = pc->attr;
else
continue;
@@ -2299,7 +3261,15 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
continue;
k = tgsi_util_get_full_src_register_swizzle(src, c);
- reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr;
+ r = &reg[src->Register.Index * 4 + k];
+
+ /* If used before written, pre-allocate the reg,
+ * lest we overwrite results from a subroutine.
+ */
+ if (!r->acc && r->type == P_TEMP)
+ alloc_reg(pc, r);
+
+ r->acc = pc->insn_nr;
}
}
}
@@ -2313,7 +3283,7 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
static unsigned
nv50_revdep_reorder(unsigned m[4], unsigned rdep[4])
{
- unsigned i, c, x, unsafe;
+ unsigned i, c, x, unsafe = 0;
for (c = 0; c < 4; c++)
m[c] = c;
@@ -2359,13 +3329,13 @@ static struct nv50_reg *
tgsi_broadcast_dst(struct nv50_pc *pc,
const struct tgsi_full_dst_register *fd, unsigned mask)
{
- if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) {
- int c = ffs(~mask & fd->DstRegister.WriteMask);
+ if (fd->Register.File == TGSI_FILE_TEMPORARY) {
+ int c = ffs(~mask & fd->Register.WriteMask);
if (c)
return tgsi_dst(pc, c - 1, fd);
} else {
- int c = ffs(fd->DstRegister.WriteMask) - 1;
- if ((1 << c) == fd->DstRegister.WriteMask)
+ int c = ffs(fd->Register.WriteMask) - 1;
+ if ((1 << c) == fd->Register.WriteMask)
return tgsi_dst(pc, c, fd);
}
@@ -2379,7 +3349,7 @@ static unsigned
nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
unsigned rdep[4])
{
- const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0];
+ const struct tgsi_full_dst_register *fd = &insn->Dst[0];
const struct tgsi_full_src_register *fs;
unsigned i, deqs = 0;
@@ -2388,11 +3358,11 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
for (i = 0; i < insn->Instruction.NumSrcRegs; i++) {
unsigned chn, mask = nv50_tgsi_src_mask(insn, i);
- boolean neg_supp = negate_supported(insn, i);
+ int ms = get_supported_mods(insn, i);
- fs = &insn->FullSrcRegisters[i];
- if (fs->SrcRegister.File != fd->DstRegister.File ||
- fs->SrcRegister.Index != fd->DstRegister.Index)
+ fs = &insn->Src[i];
+ if (fs->Register.File != fd->Register.File ||
+ fs->Register.Index != fd->Register.Index)
continue;
for (chn = 0; chn < 4; ++chn) {
@@ -2403,13 +3373,15 @@ nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn,
c = tgsi_util_get_full_src_register_swizzle(fs, chn);
s = tgsi_util_get_full_src_register_sign_mode(fs, chn);
- if (!(fd->DstRegister.WriteMask & (1 << c)))
+ if (!(fd->Register.WriteMask & (1 << c)))
continue;
- /* no danger if src is copied to TEMP first */
- if ((s != TGSI_UTIL_SIGN_KEEP) &&
- (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp))
- continue;
+ if (s == TGSI_UTIL_SIGN_TOGGLE && !(ms & NV50_MOD_NEG))
+ continue;
+ if (s == TGSI_UTIL_SIGN_CLEAR && !(ms & NV50_MOD_ABS))
+ continue;
+ if ((s == TGSI_UTIL_SIGN_SET) && ((ms & 3) != 3))
+ continue;
rdep[c] |= nv50_tgsi_dst_revdep(
insn->Instruction.Opcode, i, chn);
@@ -2427,18 +3399,18 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
const struct tgsi_full_dst_register *fd;
unsigned i, deqs, rdep[4], m[4];
- fd = &tok->FullInstruction.FullDstRegisters[0];
+ fd = &tok->FullInstruction.Dst[0];
deqs = nv50_tgsi_scan_swizzle(&insn, rdep);
if (is_scalar_op(insn.Instruction.Opcode)) {
pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs);
if (!pc->r_brdc)
- pc->r_brdc = temp_temp(pc);
+ pc->r_brdc = temp_temp(pc, NULL);
return nv50_program_tx_insn(pc, &insn);
}
pc->r_brdc = NULL;
- if (!deqs)
+ if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
return nv50_program_tx_insn(pc, &insn);
deqs = nv50_revdep_reorder(m, rdep);
@@ -2446,10 +3418,10 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
for (i = 0; i < 4; ++i) {
assert(pc->r_dst[m[i]] == NULL);
- insn.FullDstRegisters[0].DstRegister.WriteMask =
- fd->DstRegister.WriteMask & (1 << m[i]);
+ insn.Dst[0].Register.WriteMask =
+ fd->Register.WriteMask & (1 << m[i]);
- if (!insn.FullDstRegisters[0].DstRegister.WriteMask)
+ if (!insn.Dst[0].Register.WriteMask)
continue;
if (deqs & (1 << i))
@@ -2489,7 +3461,7 @@ load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
- emit_flop(pc, 0, iv, iv);
+ emit_flop(pc, NV50_FLOP_RCP, iv, iv);
/* XXX: when loading interpolants dynamically, move these
* to the program head, or make sure it can't be skipped.
@@ -2503,17 +3475,53 @@ load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
* value of 0 for back-facing, and 0xffffffff for front-facing.
*/
static void
-load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a)
+load_frontfacing(struct nv50_pc *pc, struct nv50_reg *sv)
{
- struct nv50_reg *one = alloc_immd(pc, 1.0f);
+ struct nv50_reg *temp = alloc_temp(pc, NULL);
+ int r_pred = 0;
- assert(a->rhw == -1);
- alloc_reg(pc, a); /* do this before rhw is set */
- a->rhw = 255;
- load_interpolant(pc, a);
- emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND);
+ temp->rhw = 255;
+ emit_interp(pc, temp, NULL, INTERP_FLAT);
- FREE(one);
+ emit_cvt(pc, sv, temp, r_pred, CVT_ABS | CVT_F32_S32);
+
+ emit_not(pc, temp, temp);
+ set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+ emit_cvt(pc, sv, temp, -1, CVT_F32_S32);
+ set_pred(pc, 0x2, r_pred, pc->p->exec_tail);
+
+ free_temp(pc, temp);
+}
+
+static void
+load_instance_id(struct nv50_pc *pc, unsigned index)
+{
+ struct nv50_reg reg, mem;
+
+ ctor_reg(&reg, P_TEMP, -1, -1);
+ ctor_reg(&mem, P_CONST, -1, 24); /* startInstance */
+ mem.buf_index = 2;
+
+ emit_add_b32(pc, &reg, &pc->sysval[index], &mem);
+ pc->sysval[index] = reg;
+}
+
+static void
+copy_semantic_info(struct nv50_program *p)
+{
+ unsigned i, id;
+
+ for (i = 0; i < p->cfg.in_nr; ++i) {
+ id = p->cfg.in[i].id;
+ p->cfg.in[i].sn = p->info.input_semantic_name[id];
+ p->cfg.in[i].si = p->info.input_semantic_index[id];
+ }
+
+ for (i = 0; i < p->cfg.out_nr; ++i) {
+ id = p->cfg.out[i].id;
+ p->cfg.out[i].sn = p->info.output_semantic_name[id];
+ p->cfg.out[i].si = p->info.output_semantic_index[id];
+ }
}
static boolean
@@ -2522,7 +3530,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)
struct tgsi_parse_context tp;
struct nv50_program *p = pc->p;
boolean ret = FALSE;
- unsigned i, c, flat_nr = 0;
+ unsigned i, c, instance_id, vertex_id, flat_nr = 0;
tgsi_parse_init(&tp, pc->p->pipe.tokens);
while (!tgsi_parse_end_of_tokens(&tp)) {
@@ -2535,10 +3543,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)
const struct tgsi_full_immediate *imm =
&tp.FullToken.FullImmediate;
- ctor_immd(pc, imm->u[0].Float,
- imm->u[1].Float,
- imm->u[2].Float,
- imm->u[3].Float);
+ ctor_immd_4f32(pc, imm->u[0].Float,
+ imm->u[1].Float,
+ imm->u[2].Float,
+ imm->u[3].Float);
}
break;
case TGSI_TOKEN_TYPE_DECLARATION:
@@ -2547,8 +3555,8 @@ nv50_program_tx_prep(struct nv50_pc *pc)
unsigned si, last, first, mode;
d = &tp.FullToken.FullDeclaration;
- first = d->DeclarationRange.First;
- last = d->DeclarationRange.Last;
+ first = d->Range.First;
+ last = d->Range.Last;
switch (d->Declaration.File) {
case TGSI_FILE_TEMPORARY:
@@ -2558,17 +3566,20 @@ nv50_program_tx_prep(struct nv50_pc *pc)
p->type == PIPE_SHADER_FRAGMENT)
break;
- si = d->Semantic.SemanticIndex;
- switch (d->Semantic.SemanticName) {
+ si = d->Semantic.Index;
+ switch (d->Semantic.Name) {
case TGSI_SEMANTIC_BCOLOR:
p->cfg.two_side[si].hw = first;
- if (p->cfg.io_nr > first)
- p->cfg.io_nr = first;
+ if (p->cfg.out_nr > first)
+ p->cfg.out_nr = first;
break;
case TGSI_SEMANTIC_PSIZE:
p->cfg.psiz = first;
- if (p->cfg.io_nr > first)
- p->cfg.io_nr = first;
+ if (p->cfg.out_nr > first)
+ p->cfg.out_nr = first;
+ break;
+ case TGSI_SEMANTIC_EDGEFLAG:
+ pc->edgeflag_out = first;
break;
/*
case TGSI_SEMANTIC_CLIP_DISTANCE:
@@ -2605,6 +3616,37 @@ nv50_program_tx_prep(struct nv50_pc *pc)
pc->interp_mode[i] = mode;
}
break;
+ case TGSI_FILE_SYSTEM_VALUE:
+ assert(d->Declaration.Semantic);
+ switch (d->Semantic.Name) {
+ case TGSI_SEMANTIC_FACE:
+ assert(p->type == PIPE_SHADER_FRAGMENT);
+ load_frontfacing(pc,
+ &pc->sysval[first]);
+ break;
+ case TGSI_SEMANTIC_INSTANCEID:
+ assert(p->type == PIPE_SHADER_VERTEX);
+ instance_id = first;
+ p->cfg.regs[0] |= (1 << 4);
+ break;
+ case TGSI_SEMANTIC_PRIMID:
+ assert(p->type != PIPE_SHADER_VERTEX);
+ p->cfg.prim_id = first;
+ break;
+ /*
+ case TGSI_SEMANTIC_PRIMIDIN:
+ assert(p->type == PIPE_SHADER_GEOMETRY);
+ pc->sysval[first].hw = 6;
+ p->cfg.regs[0] |= (1 << 8);
+ break;
+ case TGSI_SEMANTIC_VERTEXID:
+ assert(p->type == PIPE_SHADER_VERTEX);
+ vertex_id = first;
+ p->cfg.regs[0] |= (1 << 12) | (1 << 0);
+ break;
+ */
+ }
+ break;
case TGSI_FILE_ADDRESS:
case TGSI_FILE_CONSTANT:
case TGSI_FILE_SAMPLER:
@@ -2625,36 +3667,65 @@ nv50_program_tx_prep(struct nv50_pc *pc)
}
}
- if (p->type == PIPE_SHADER_VERTEX) {
+ if (p->type == PIPE_SHADER_VERTEX || p->type == PIPE_SHADER_GEOMETRY) {
int rid = 0;
- for (i = 0; i < pc->attr_nr * 4; ++i) {
- if (pc->attr[i].acc) {
- pc->attr[i].hw = rid++;
- p->cfg.attr[i / 32] |= 1 << (i % 32);
+ if (p->type == PIPE_SHADER_GEOMETRY) {
+ for (i = 0; i < pc->attr_nr; ++i) {
+ p->cfg.in[i].hw = rid;
+ p->cfg.in[i].id = i;
+
+ for (c = 0; c < 4; ++c) {
+ int n = i * 4 + c;
+ if (!pc->attr[n].acc)
+ continue;
+ pc->attr[n].hw = rid++;
+ p->cfg.in[i].mask |= 1 << c;
+ }
+ }
+ } else {
+ for (i = 0; i < pc->attr_nr * 4; ++i) {
+ if (pc->attr[i].acc) {
+ pc->attr[i].hw = rid++;
+ p->cfg.attr[i / 32] |= 1 << (i % 32);
+ }
+ }
+ if (p->cfg.regs[0] & (1 << 0))
+ pc->sysval[vertex_id].hw = rid++;
+ if (p->cfg.regs[0] & (1 << 4)) {
+ pc->sysval[instance_id].hw = rid++;
+ load_instance_id(pc, instance_id);
}
}
for (i = 0, rid = 0; i < pc->result_nr; ++i) {
- p->cfg.io[i].hw = rid;
- p->cfg.io[i].id_vp = i;
+ p->cfg.out[i].hw = rid;
+ p->cfg.out[i].id = i;
for (c = 0; c < 4; ++c) {
int n = i * 4 + c;
if (!pc->result[n].acc)
continue;
pc->result[n].hw = rid++;
- p->cfg.io[i].mask |= 1 << c;
+ p->cfg.out[i].mask |= 1 << c;
}
}
+ if (p->cfg.prim_id < 0x40) {
+ /* GP has to write to PrimitiveID */
+ ctor_reg(&pc->sysval[p->cfg.prim_id],
+ P_RESULT, p->cfg.prim_id, rid);
+ p->cfg.prim_id = rid++;
+ }
for (c = 0; c < 2; ++c)
if (p->cfg.two_side[c].hw < 0x40)
- p->cfg.two_side[c] = p->cfg.io[
+ p->cfg.two_side[c] = p->cfg.out[
p->cfg.two_side[c].hw];
if (p->cfg.psiz < 0x40)
- p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw;
+ p->cfg.psiz = p->cfg.out[p->cfg.psiz].hw;
+
+ copy_semantic_info(p);
} else
if (p->type == PIPE_SHADER_FRAGMENT) {
int rid, aid;
@@ -2662,33 +3733,34 @@ nv50_program_tx_prep(struct nv50_pc *pc)
pc->allow32 = TRUE;
- int base = (TGSI_SEMANTIC_POSITION ==
- p->info.input_semantic_name[0]) ? 0 : 1;
+ /* do we read FragCoord ? */
+ if (pc->attr_nr &&
+ p->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) {
+ /* select FCRD components we want accessible */
+ for (c = 0; c < 4; ++c)
+ if (pc->attr[c].acc)
+ p->cfg.regs[1] |= 1 << (24 + c);
+ aid = 0;
+ } else /* offset by 1 if FCRD.w is needed for pinterp */
+ aid = popcnt4(p->cfg.regs[1] >> 24);
/* non-flat interpolants have to be mapped to
* the lower hardware IDs, so sort them:
*/
for (i = 0; i < pc->attr_nr; i++) {
- if (pc->interp_mode[i] == INTERP_FLAT) {
- p->cfg.io[m].id_vp = i + base;
- p->cfg.io[m++].id_fp = i;
- } else {
+ if (pc->interp_mode[i] == INTERP_FLAT)
+ p->cfg.in[m++].id = i;
+ else {
if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE))
- p->cfg.io[n].linear = TRUE;
- p->cfg.io[n].id_vp = i + base;
- p->cfg.io[n++].id_fp = i;
+ p->cfg.in[n].linear = TRUE;
+ p->cfg.in[n++].id = i;
}
}
-
- if (!base) /* set w-coordinate mask from perspective interp */
- p->cfg.io[0].mask |= p->cfg.regs[1] >> 24;
-
- aid = popcnt4( /* if fcrd isn't contained in cfg.io */
- base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask);
+ copy_semantic_info(p);
for (n = 0; n < pc->attr_nr; ++n) {
- p->cfg.io[n].hw = rid = aid;
- i = p->cfg.io[n].id_fp;
+ p->cfg.in[n].hw = rid = aid;
+ i = p->cfg.in[n].id;
if (p->info.input_semantic_name[n] ==
TGSI_SEMANTIC_FACE) {
@@ -2700,16 +3772,13 @@ nv50_program_tx_prep(struct nv50_pc *pc)
if (!pc->attr[i * 4 + c].acc)
continue;
pc->attr[i * 4 + c].rhw = rid++;
- p->cfg.io[n].mask |= 1 << c;
+ p->cfg.in[n].mask |= 1 << c;
load_interpolant(pc, &pc->attr[i * 4 + c]);
}
- aid += popcnt4(p->cfg.io[n].mask);
+ aid += popcnt4(p->cfg.in[n].mask);
}
- if (!base)
- p->cfg.regs[1] |= p->cfg.io[0].mask << 24;
-
m = popcnt4(p->cfg.regs[1] >> 24);
/* set count of non-position inputs and of non-flat
@@ -2718,32 +3787,33 @@ nv50_program_tx_prep(struct nv50_pc *pc)
p->cfg.regs[1] |= aid - m;
if (flat_nr) {
- i = p->cfg.io[pc->attr_nr - flat_nr].hw;
+ i = p->cfg.in[pc->attr_nr - flat_nr].hw;
p->cfg.regs[1] |= (i - m) << 16;
} else
p->cfg.regs[1] |= p->cfg.regs[1] << 16;
/* mark color semantic for light-twoside */
- n = 0x40;
- for (i = 0; i < pc->attr_nr; i++) {
- ubyte si, sn;
-
- sn = p->info.input_semantic_name[p->cfg.io[i].id_fp];
- si = p->info.input_semantic_index[p->cfg.io[i].id_fp];
-
- if (sn == TGSI_SEMANTIC_COLOR) {
- p->cfg.two_side[si] = p->cfg.io[i];
-
- /* increase colour count */
- p->cfg.regs[0] += popcnt4(
- p->cfg.two_side[si].mask) << 16;
-
- n = MIN2(n, p->cfg.io[i].hw - m);
+ n = 0x80;
+ for (i = 0; i < p->cfg.in_nr; i++) {
+ if (p->cfg.in[i].sn == TGSI_SEMANTIC_COLOR) {
+ n = MIN2(n, p->cfg.in[i].hw - m);
+ p->cfg.two_side[p->cfg.in[i].si] = p->cfg.in[i];
+
+ p->cfg.regs[0] += /* increase colour count */
+ popcnt4(p->cfg.in[i].mask) << 16;
}
}
- if (n < 0x40)
+ if (n < 0x80)
p->cfg.regs[0] += n;
+ if (p->cfg.prim_id < 0x40) {
+ pc->sysval[p->cfg.prim_id].rhw = rid++;
+ emit_interp(pc, &pc->sysval[p->cfg.prim_id], NULL,
+ INTERP_FLAT);
+ /* increase FP_INTERPOLANT_CTRL_COUNT */
+ p->cfg.regs[1] += 1;
+ }
+
/* Initialize FP results:
* FragDepth is always first TGSI and last hw output
*/
@@ -2797,10 +3867,31 @@ free_nv50_pc(struct nv50_pc *pc)
FREE(pc->attr);
if (pc->temp)
FREE(pc->temp);
+ if (pc->sysval)
+ FREE(pc->sysval);
+ if (pc->insn_pos)
+ FREE(pc->insn_pos);
FREE(pc);
}
+static INLINE uint32_t
+nv50_map_gs_output_prim(unsigned pprim)
+{
+ switch (pprim) {
+ case PIPE_PRIM_POINTS:
+ return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
+ case PIPE_PRIM_LINE_STRIP:
+ return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ return NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
+ default:
+ NOUVEAU_ERR("invalid GS_OUTPUT_PRIMITIVE: %u\n", pprim);
+ abort();
+ return 0;
+ }
+}
+
static boolean
ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
{
@@ -2814,23 +3905,55 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1;
pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1;
assert(pc->addr_nr <= 2);
+ pc->sysval_nr = p->info.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
p->cfg.high_temp = 4;
p->cfg.two_side[0].hw = 0x40;
p->cfg.two_side[1].hw = 0x40;
+ p->cfg.prim_id = 0x40;
+
+ p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;
+
+ for (i = 0; i < p->info.num_properties; ++i) {
+ unsigned *data = &p->info.properties[i].data[0];
+
+ switch (p->info.properties[i].name) {
+ case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+ p->cfg.prim_type = nv50_map_gs_output_prim(data[0]);
+ break;
+ case TGSI_PROPERTY_GS_MAX_VERTICES:
+ p->cfg.vert_count = data[0];
+ break;
+ default:
+ break;
+ }
+ }
switch (p->type) {
case PIPE_SHADER_VERTEX:
p->cfg.psiz = 0x40;
p->cfg.clpd = 0x40;
- p->cfg.io_nr = pc->result_nr;
+ p->cfg.out_nr = pc->result_nr;
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ assert(p->cfg.prim_type);
+ assert(p->cfg.vert_count);
+
+ p->cfg.psiz = 0x80;
+ p->cfg.clpd = 0x80;
+ p->cfg.prim_id = 0x80;
+ p->cfg.out_nr = pc->result_nr;
+ p->cfg.in_nr = pc->attr_nr;
+
+ p->cfg.two_side[0].hw = 0x80;
+ p->cfg.two_side[1].hw = 0x80;
break;
case PIPE_SHADER_FRAGMENT:
rtype[0] = rtype[1] = P_TEMP;
p->cfg.regs[0] = 0x01000004;
- p->cfg.io_nr = pc->attr_nr;
+ p->cfg.in_nr = pc->attr_nr;
if (p->info.writes_z) {
p->cfg.regs[2] |= 0x00000100;
@@ -2888,33 +4011,24 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
return FALSE;
}
for (i = 0; i < NV50_SU_MAX_ADDR; ++i)
- ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1);
-
- return TRUE;
-}
+ ctor_reg(&pc->r_addr[i], P_ADDR, -1, i + 1);
-static void
-nv50_fp_move_results(struct nv50_pc *pc)
-{
- struct nv50_reg reg;
- unsigned i;
-
- ctor_reg(&reg, P_TEMP, -1, -1);
-
- for (i = 0; i < pc->result_nr * 4; ++i) {
- if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
- continue;
- if (pc->result[i].rhw != pc->result[i].hw) {
- reg.hw = pc->result[i].rhw;
- emit_mov(pc, &reg, &pc->result[i]);
- }
+ if (pc->sysval_nr) {
+ pc->sysval = CALLOC(pc->sysval_nr, sizeof(struct nv50_reg *));
+ if (!pc->sysval)
+ return FALSE;
+ /* will only ever use SYSTEM_VALUE[i].x (hopefully) */
+ for (i = 0; i < pc->sysval_nr; ++i)
+ ctor_reg(&pc->sysval[i], rtype[0], i, -1);
}
+
+ return TRUE;
}
static void
nv50_program_fixup_insns(struct nv50_pc *pc)
{
- struct nv50_program_exec *e, *prev = NULL, **bra_list;
+ struct nv50_program_exec *e, **bra_list;
unsigned i, n, pos;
bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
@@ -2934,27 +4048,24 @@ nv50_program_fixup_insns(struct nv50_pc *pc)
for (i = 0; i < n; ++i)
if (bra_list[i]->param.index >= pos)
bra_list[i]->param.index += 1;
+ for (i = 0; i < pc->insn_nr; ++i)
+ if (pc->insn_pos[i] >= pos)
+ pc->insn_pos[i] += 1;
convert_to_long(pc, e);
++pos;
}
- if (e->next)
- prev = e;
}
- assert(!is_immd(pc->p->exec_head));
- assert(!is_immd(pc->p->exec_tail));
+ FREE(bra_list);
- /* last instruction must be long so it can have the end bit set */
- if (!is_long(pc->p->exec_tail)) {
- convert_to_long(pc, pc->p->exec_tail);
- if (prev)
- convert_to_long(pc, prev);
- }
- assert(!(pc->p->exec_tail->inst[1] & 2));
- /* set the end-bit */
- pc->p->exec_tail->inst[1] |= 1;
+ if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL])
+ return;
- FREE(bra_list);
+ /* fill in CALL offsets */
+ for (e = pc->p->exec_head; e; e = e->next) {
+ if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2)
+ e->param.index = pc->insn_pos[e->param.index];
+ }
}
static boolean
@@ -2976,19 +4087,20 @@ nv50_program_tx(struct nv50_program *p)
if (ret == FALSE)
goto out_cleanup;
+ pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned));
+
tgsi_parse_init(&parse, pc->p->pipe.tokens);
while (!tgsi_parse_end_of_tokens(&parse)) {
const union tgsi_full_token *tok = &parse.FullToken;
- /* don't allow half insn/immd on first and last instruction */
+ /* previously allow32 was FALSE for first & last instruction */
pc->allow32 = TRUE;
- if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
- pc->allow32 = FALSE;
tgsi_parse_token(&parse);
switch (tok->Token.Type) {
case TGSI_TOKEN_TYPE_INSTRUCTION:
+ pc->insn_pos[pc->insn_cur] = pc->p->exec_size;
++pc->insn_cur;
ret = nv50_tgsi_insn(pc, tok);
if (ret == FALSE)
@@ -2999,9 +4111,6 @@ nv50_program_tx(struct nv50_program *p)
}
}
- if (pc->p->type == PIPE_SHADER_FRAGMENT)
- nv50_fp_move_results(pc);
-
nv50_program_fixup_insns(pc);
p->param_nr = pc->param_nr * 4;
@@ -3025,7 +4134,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)
}
static void
-nv50_program_upload_data(struct nv50_context *nv50, float *map,
+nv50_program_upload_data(struct nv50_context *nv50, uint32_t *map,
unsigned start, unsigned count, unsigned cbuf)
{
struct nouveau_channel *chan = nv50->screen->base.channel;
@@ -3073,13 +4182,17 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)
if (p->param_nr) {
unsigned cb;
- float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type],
- PIPE_BUFFER_USAGE_CPU_READ);
-
- if (p->type == PIPE_SHADER_VERTEX)
+ uint32_t *map = pipe_buffer_map(pscreen,
+ nv50->constbuf[p->type],
+ PIPE_BUFFER_USAGE_CPU_READ);
+ switch (p->type) {
+ case PIPE_SHADER_GEOMETRY: cb = NV50_CB_PGP; break;
+ case PIPE_SHADER_FRAGMENT: cb = NV50_CB_PFP; break;
+ default:
cb = NV50_CB_PVP;
- else
- cb = NV50_CB_PFP;
+ assert(p->type == PIPE_SHADER_VERTEX);
+ break;
+ }
nv50_program_upload_data(nv50, map, 0, p->param_nr, cb);
pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]);
@@ -3173,19 +4286,18 @@ nv50_vertprog_validate(struct nv50_context *nv50)
nv50_program_validate_data(nv50, p);
nv50_program_validate_code(nv50, p);
- so = so_new(13, 2);
+ so = so_new(5, 7, 2);
so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2);
so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
- NOUVEAU_BO_HIGH, 0, 0);
+ NOUVEAU_BO_HIGH, 0, 0);
so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
- NOUVEAU_BO_LOW, 0, 0);
+ NOUVEAU_BO_LOW, 0, 0);
so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2);
so_data (so, p->cfg.attr[0]);
so_data (so, p->cfg.attr[1]);
so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
so_data (so, p->cfg.high_result);
- so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2);
- so_data (so, p->cfg.high_result); //8);
+ so_method(so, tesla, NV50TCL_VP_REG_ALLOC_TEMP, 1);
so_data (so, p->cfg.high_temp);
so_method(so, tesla, NV50TCL_VP_START_ID, 1);
so_data (so, 0); /* program start offset */
@@ -3209,7 +4321,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
nv50_program_validate_data(nv50, p);
nv50_program_validate_code(nv50, p);
- so = so_new(64, 2);
+ so = so_new(6, 7, 2);
so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);
so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
NOUVEAU_BO_HIGH, 0, 0);
@@ -3219,7 +4331,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
so_data (so, p->cfg.high_temp);
so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
so_data (so, p->cfg.high_result);
- so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
+ so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
so_data (so, p->cfg.regs[2]);
so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
so_data (so, p->cfg.regs[3]);
@@ -3229,65 +4341,111 @@ nv50_fragprog_validate(struct nv50_context *nv50)
so_ref(NULL, &so);
}
-static void
+void
+nv50_geomprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_program *p = nv50->geomprog;
+ struct nouveau_stateobj *so;
+
+ if (!p->translated) {
+ nv50_program_validate(nv50, p);
+ if (!p->translated)
+ assert(0);
+ }
+
+ nv50_program_validate_data(nv50, p);
+ nv50_program_validate_code(nv50, p);
+
+ so = so_new(6, 7, 2);
+ so_method(so, tesla, NV50TCL_GP_ADDRESS_HIGH, 2);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
+ NOUVEAU_BO_LOW, 0, 0);
+ so_method(so, tesla, NV50TCL_GP_REG_ALLOC_TEMP, 1);
+ so_data (so, p->cfg.high_temp);
+ so_method(so, tesla, NV50TCL_GP_REG_ALLOC_RESULT, 1);
+ so_data (so, p->cfg.high_result);
+ so_method(so, tesla, NV50TCL_GP_OUTPUT_PRIMITIVE_TYPE, 1);
+ so_data (so, p->cfg.prim_type);
+ so_method(so, tesla, NV50TCL_GP_VERTEX_OUTPUT_COUNT, 1);
+ so_data (so, p->cfg.vert_count);
+ so_method(so, tesla, NV50TCL_GP_START_ID, 1);
+ so_data (so, 0);
+ so_ref(so, &nv50->state.geomprog);
+ so_ref(NULL, &so);
+}
+
+static uint32_t
nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base)
{
+ struct nv50_program *vp;
struct nv50_program *fp = nv50->fragprog;
- struct nv50_program *vp = nv50->vertprog;
unsigned i, c, m = base;
+ uint32_t origin = 0x00000010;
+
+ vp = nv50->geomprog ? nv50->geomprog : nv50->vertprog;
- /* XXX: This can't work correctly in all cases yet, we either
- * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has
- * to be per FP input instead of per VP output
+ /* XXX: this might not work correctly in all cases yet - we'll
+ * just assume that an FP generic input that is not written in
+ * the VP is PointCoord.
*/
memset(pntc, 0, 8 * sizeof(uint32_t));
- for (i = 0; i < fp->cfg.io_nr; i++) {
- uint8_t sn, si;
- uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp;
- unsigned n = popcnt4(fp->cfg.io[i].mask);
+ for (i = 0; i < fp->cfg.in_nr; i++) {
+ unsigned j, n = popcnt4(fp->cfg.in[i].mask);
- if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) {
+ if (fp->cfg.in[i].sn != TGSI_SEMANTIC_GENERIC) {
m += n;
continue;
}
- sn = vp->info.input_semantic_name[j];
- si = vp->info.input_semantic_index[j];
+ for (j = 0; j < vp->cfg.out_nr; ++j)
+ if (vp->cfg.out[j].sn == fp->cfg.in[i].sn &&
+ vp->cfg.out[j].si == fp->cfg.in[i].si)
+ break;
- if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) {
- ubyte mode =
- nv50->rasterizer->pipe.sprite_coord_mode[si];
+ if (j < vp->cfg.out_nr) {
+ ubyte mode = nv50->rasterizer->pipe.sprite_coord_mode[
+ vp->cfg.out[j].si];
if (mode == PIPE_SPRITE_COORD_NONE) {
m += n;
continue;
- }
+ } else
+ if (mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+ origin = 0;
}
/* this is either PointCoord or replaced by sprite coords */
for (c = 0; c < 4; c++) {
- if (!(fp->cfg.io[i].mask & (1 << c)))
+ if (!(fp->cfg.in[i].mask & (1 << c)))
continue;
pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
++m;
}
}
+ return origin;
}
static int
-nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
- struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
+nv50_vec4_map(uint32_t *map32, int mid, uint8_t zval, uint32_t lin[4],
+ struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo)
{
int c;
uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw;
- uint8_t *map = (uint8_t *)p_map;
+ uint8_t *map = (uint8_t *)map32;
for (c = 0; c < 4; ++c) {
if (mf & 1) {
if (fpi->linear == TRUE)
lin[mid / 32] |= 1 << (mid % 32);
- map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40);
+ if (mv & 1)
+ map[mid] = oid;
+ else
+ map[mid] = (c == 3) ? (zval + 1) : zval;
+ ++mid;
}
oid += mv & 1;
@@ -3299,34 +4457,42 @@ nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4],
}
void
-nv50_linkage_validate(struct nv50_context *nv50)
+nv50_fp_linkage_validate(struct nv50_context *nv50)
{
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct nv50_program *vp = nv50->vertprog;
struct nv50_program *fp = nv50->fragprog;
struct nouveau_stateobj *so;
- struct nv50_sreg4 dummy, *vpo;
+ struct nv50_sreg4 dummy;
int i, n, c, m = 0;
- uint32_t map[16], lin[4], reg[5], pcrd[8];
+ uint32_t map[16], lin[4], reg[6], pcrd[8];
+ uint8_t zval = 0x40;
+ if (nv50->geomprog) {
+ vp = nv50->geomprog;
+ zval = 0x80;
+ }
memset(map, 0, sizeof(map));
memset(lin, 0, sizeof(lin));
reg[1] = 0x00000004; /* low and high clip distance map ids */
reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */
reg[3] = 0x00000000; /* point size map id & enable */
+ reg[5] = 0x00000000; /* primitive ID map slot */
reg[0] = fp->cfg.regs[0]; /* colour semantic reg */
reg[4] = fp->cfg.regs[1]; /* interpolant info */
dummy.linear = FALSE;
dummy.mask = 0xf; /* map all components of HPOS */
- m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]);
+ m = nv50_vec4_map(map, m, zval, lin, &dummy, &vp->cfg.out[0]);
dummy.mask = 0x0;
if (vp->cfg.clpd < 0x40) {
- for (c = 0; c < vp->cfg.clpd_nr; ++c)
- map[m++] = vp->cfg.clpd + c;
+ for (c = 0; c < vp->cfg.clpd_nr; ++c) {
+ map[m / 4] |= (vp->cfg.clpd + c) << ((m % 4) * 8);
+ ++m;
+ }
reg[1] = (m << 8);
}
@@ -3334,31 +4500,37 @@ nv50_linkage_validate(struct nv50_context *nv50)
/* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */
if (nv50->rasterizer->pipe.light_twoside) {
- vpo = &vp->cfg.two_side[0];
+ struct nv50_sreg4 *vpo = &vp->cfg.two_side[0];
+ struct nv50_sreg4 *fpi = &fp->cfg.two_side[0];
- m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]);
- m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]);
+ m = nv50_vec4_map(map, m, zval, lin, &fpi[0], &vpo[0]);
+ m = nv50_vec4_map(map, m, zval, lin, &fpi[1], &vpo[1]);
}
reg[0] += m - 4; /* adjust FFC0 id */
reg[4] |= m << 8; /* set mid where 'normal' FP inputs start */
- i = 0;
- if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION)
- i = 1;
- for (; i < fp->cfg.io_nr; i++) {
- ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp];
- ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp];
-
- n = fp->cfg.io[i].id_vp;
- if (n >= vp->cfg.io_nr ||
- vp->info.output_semantic_name[n] != sn ||
- vp->info.output_semantic_index[n] != si)
- vpo = &dummy;
- else
- vpo = &vp->cfg.io[n];
+ for (i = 0; i < fp->cfg.in_nr; i++) {
+ /* maybe even remove these from cfg.io */
+ if (fp->cfg.in[i].sn == TGSI_SEMANTIC_POSITION ||
+ fp->cfg.in[i].sn == TGSI_SEMANTIC_FACE)
+ continue;
- m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo);
+ for (n = 0; n < vp->cfg.out_nr; ++n)
+ if (vp->cfg.out[n].sn == fp->cfg.in[i].sn &&
+ vp->cfg.out[n].si == fp->cfg.in[i].si)
+ break;
+
+ m = nv50_vec4_map(map, m, zval, lin, &fp->cfg.in[i],
+ (n < vp->cfg.out_nr) ?
+ &vp->cfg.out[n] : &dummy);
+ }
+ /* PrimitiveID either is replaced by the system value, or
+ * written by the geometry shader into an output register
+ */
+ if (fp->cfg.prim_id < 0x40) {
+ map[m / 4] |= vp->cfg.prim_id << ((m % 4) * 8);
+ reg[5] = m++;
}
if (nv50->rasterizer->pipe.point_size_per_vertex) {
@@ -3366,14 +4538,28 @@ nv50_linkage_validate(struct nv50_context *nv50)
reg[3] = (m++ << 4) | 1;
}
- /* now fill the stateobj */
- so = so_new(64, 0);
+ /* now fill the stateobj (at most 28 so_data) */
+ so = so_new(10, 54, 0);
n = (m + 3) / 4;
- so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
- so_data (so, m);
- so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
- so_datap (so, map, n);
+ assert(m <= 32);
+ if (vp->type == PIPE_SHADER_GEOMETRY) {
+ so_method(so, tesla, NV50TCL_GP_RESULT_MAP_SIZE, 1);
+ so_data (so, m);
+ so_method(so, tesla, NV50TCL_GP_RESULT_MAP(0), n);
+ so_datap (so, map, n);
+ } else {
+ so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+ so_data (so, vp->cfg.regs[0]);
+
+ so_method(so, tesla, NV50TCL_MAP_SEMANTIC_4, 1);
+ so_data (so, reg[5]);
+
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+ so_data (so, m);
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n);
+ so_datap (so, map, n);
+ }
so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4);
so_datap (so, reg, 4);
@@ -3381,18 +4567,89 @@ nv50_linkage_validate(struct nv50_context *nv50)
so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
so_data (so, reg[4]);
- so_method(so, tesla, 0x1540, 4);
+ so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
so_datap (so, lin, 4);
if (nv50->rasterizer->pipe.point_sprite) {
- nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff);
+ so_method(so, tesla, NV50TCL_POINT_SPRITE_CTRL, 1);
+ so_data (so,
+ nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff));
so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8);
so_datap (so, pcrd, 8);
}
- so_ref(so, &nv50->state.programs);
- so_ref(NULL, &so);
+ so_method(so, tesla, NV50TCL_GP_ENABLE, 1);
+ so_data (so, (vp->type == PIPE_SHADER_GEOMETRY) ? 1 : 0);
+
+ so_ref(so, &nv50->state.fp_linkage);
+ so_ref(NULL, &so);
+}
+
+static int
+construct_vp_gp_mapping(uint32_t *map32, int m,
+ struct nv50_program *vp, struct nv50_program *gp)
+{
+ uint8_t *map = (uint8_t *)map32;
+ int i, j, c;
+
+ for (i = 0; i < gp->cfg.in_nr; ++i) {
+ uint8_t oid, mv = 0, mg = gp->cfg.in[i].mask;
+
+ for (j = 0; j < vp->cfg.out_nr; ++j) {
+ if (vp->cfg.out[j].sn == gp->cfg.in[i].sn &&
+ vp->cfg.out[j].si == gp->cfg.in[i].si) {
+ mv = vp->cfg.out[j].mask;
+ oid = vp->cfg.out[j].hw;
+ break;
+ }
+ }
+
+ for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
+ if (mg & mv & 1)
+ map[m++] = oid;
+ else
+ if (mg & 1)
+ map[m++] = (c == 3) ? 0x41 : 0x40;
+ oid += mv & 1;
+ }
+ }
+ return m;
+}
+
+void
+nv50_gp_linkage_validate(struct nv50_context *nv50)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nouveau_stateobj *so;
+ struct nv50_program *vp = nv50->vertprog;
+ struct nv50_program *gp = nv50->geomprog;
+ uint32_t map[16];
+ int m = 0;
+
+ if (!gp) {
+ so_ref(NULL, &nv50->state.gp_linkage);
+ return;
+ }
+ memset(map, 0, sizeof(map));
+
+ m = construct_vp_gp_mapping(map, m, vp, gp);
+
+ so = so_new(3, 24 - 3, 0);
+
+ so_method(so, tesla, NV50TCL_VP_GP_BUILTIN_ATTR_EN, 1);
+ so_data (so, vp->cfg.regs[0] | gp->cfg.regs[0]);
+
+ assert(m <= 32);
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1);
+ so_data (so, m);
+
+ m = (m + 3) / 4;
+ so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), m);
+ so_datap (so, map, m);
+
+ so_ref(so, &nv50->state.gp_linkage);
+ so_ref(NULL, &so);
}
void
@@ -3409,6 +4666,7 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
nouveau_bo_ref(NULL, &p->bo);
+ FREE(p->immd);
nouveau_resource_free(&p->data[0]);
p->translated = 0;
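The FP linkage path above no longer pairs fragment inputs with vertex/geometry outputs by slot position; it matches them by TGSI semantic name and index (the sn/si fields added to nv50_sreg4 below). A minimal sketch of that matching step, using simplified illustrative types rather than the driver's structures:

struct io_slot { unsigned char sn, si, hw; };	/* semantic name, index, hw slot */

/* Return the VP/GP output that feeds a given FP input, or -1 if nothing
 * writes it and the slot must be filled with the dummy value (0x40/0x80). */
static int
find_vp_output(const struct io_slot *vp_out, int out_nr,
               const struct io_slot *fp_in)
{
	int n;
	for (n = 0; n < out_nr; ++n)
		if (vp_out[n].sn == fp_in->sn && vp_out[n].si == fp_in->si)
			return n;
	return -1;
}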
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index d78dee083f1..1e3ad6bff05 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -16,12 +16,13 @@ struct nv50_program_exec {
};
struct nv50_sreg4 {
- uint8_t hw;
- uint8_t id_vp;
- uint8_t id_fp;
+ uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
+ uint8_t id; /* tgsi index */
uint8_t mask;
boolean linear;
+
+ ubyte sn, si; /* semantic name & index */
};
struct nv50_program {
@@ -38,7 +39,7 @@ struct nv50_program {
struct nouveau_bo *bo;
- float *immd;
+ uint32_t *immd;
unsigned immd_nr;
unsigned param_nr;
@@ -50,15 +51,24 @@ struct nv50_program {
uint32_t regs[4];
/* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */
- unsigned io_nr;
- struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS];
+ unsigned in_nr, out_nr;
+ struct nv50_sreg4 in[PIPE_MAX_SHADER_INPUTS];
+ struct nv50_sreg4 out[PIPE_MAX_SHADER_OUTPUTS];
/* FP colour inputs, VP/GP back colour outputs */
struct nv50_sreg4 two_side[2];
- /* VP only */
+ /* GP only */
+ unsigned vert_count;
+ uint8_t prim_type;
+
+ /* VP & GP only */
uint8_t clpd, clpd_nr;
uint8_t psiz;
+ uint8_t edgeflag_in;
+
+ /* FP & GP only */
+ uint8_t prim_id;
} cfg;
};
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index f605c475aca..9eba4c96115 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -48,7 +48,7 @@ nv50_query_create(struct pipe_context *pipe, unsigned type)
assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER);
q->type = type;
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 256,
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 256,
16, &q->bo);
if (ret) {
FREE(q);
@@ -77,9 +77,9 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct nv50_query *q = nv50_query(pq);
- BEGIN_RING(chan, tesla, 0x1530, 1);
+ BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_RESET, 1);
OUT_RING (chan, 1);
- BEGIN_RING(chan, tesla, 0x1514, 1);
+ BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_ENABLE, 1);
OUT_RING (chan, 1);
q->ready = FALSE;
@@ -93,13 +93,15 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct nv50_query *q = nv50_query(pq);
- WAIT_RING (chan, 5);
+ MARK_RING (chan, 5, 2); /* flush on lack of space or relocs */
BEGIN_RING(chan, tesla, NV50TCL_QUERY_ADDRESS_HIGH, 4);
- OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
- OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+ OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+ OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
OUT_RING (chan, 0x00000000);
OUT_RING (chan, 0x0100f002);
- FIRE_RING (chan);
+
+ BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_ENABLE, 1);
+ OUT_RING (chan, 0);
}
static boolean
@@ -123,6 +125,35 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
return q->ready;
}
+static void
+nv50_render_condition(struct pipe_context *pipe,
+ struct pipe_query *pq, uint mode)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_channel *chan = nv50->screen->base.channel;
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nv50_query *q;
+
+ if (!pq) {
+ BEGIN_RING(chan, tesla, NV50TCL_COND_MODE, 1);
+ OUT_RING (chan, NV50TCL_COND_MODE_ALWAYS);
+ return;
+ }
+ q = nv50_query(pq);
+
+ if (mode == PIPE_RENDER_COND_WAIT ||
+ mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
+ /* XXX: big fence, FIFO semaphore might be better */
+ BEGIN_RING(chan, tesla, 0x0110, 1);
+ OUT_RING (chan, 0);
+ }
+
+ BEGIN_RING(chan, tesla, NV50TCL_COND_ADDRESS_HIGH, 3);
+ OUT_RELOCh(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RELOCl(chan, q->bo, 0, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+ OUT_RING (chan, NV50TCL_COND_MODE_RES);
+}
+
void
nv50_init_query_functions(struct nv50_context *nv50)
{
@@ -131,4 +162,5 @@ nv50_init_query_functions(struct nv50_context *nv50)
nv50->pipe.begin_query = nv50_query_begin;
nv50->pipe.end_query = nv50_query_end;
nv50->pipe.get_query_result = nv50_query_result;
+ nv50->pipe.render_condition = nv50_render_condition;
}
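The new render_condition hook implements conditional rendering: draws issued while a query is bound as the condition are discarded if the occlusion query counted no samples. A rough usage sketch from the state-tracker side, assuming the pipe_context query/draw entry points of this Gallium revision (draw_if_visible and the vertex counts are illustrative only):

/* Draw proxy geometry into an occlusion query, then predicate the
 * real draw on its result. */
static void
draw_if_visible(struct pipe_context *pipe,
		unsigned bbox_verts, unsigned full_verts)
{
	struct pipe_query *q =
		pipe->create_query(pipe, PIPE_QUERY_OCCLUSION_COUNTER);

	pipe->begin_query(pipe, q);
	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, bbox_verts);
	pipe->end_query(pipe, q);

	pipe->render_condition(pipe, q, PIPE_RENDER_COND_WAIT);
	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, full_verts);
	pipe->render_condition(pipe, NULL, 0);	/* unconditional again */

	pipe->destroy_query(pipe, q);
}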
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index e1b2f11239a..9d58f3c9651 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -76,6 +76,7 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
case PIPE_FORMAT_DXT3_RGBA:
case PIPE_FORMAT_DXT5_RGBA:
case PIPE_FORMAT_Z24S8_UNORM:
+ case PIPE_FORMAT_S8Z24_UNORM:
case PIPE_FORMAT_Z32_FLOAT:
case PIPE_FORMAT_R16G16B16A16_SNORM:
case PIPE_FORMAT_R16G16B16A16_UNORM:
@@ -97,6 +98,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
switch (param) {
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return 32;
+ case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+ return 32;
+ case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+ return 64;
case PIPE_CAP_NPOT_TEXTURES:
return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
@@ -122,10 +127,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
return 1;
- case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
- return 0;
case PIPE_CAP_TGSI_CONT_SUPPORTED:
- return 0;
+ return 1;
case PIPE_CAP_BLEND_EQUATION_SEPARATE:
return 1;
case NOUVEAU_CAP_HW_VTXBUF:
@@ -162,6 +165,21 @@ static void
nv50_screen_destroy(struct pipe_screen *pscreen)
{
struct nv50_screen *screen = nv50_screen(pscreen);
+ unsigned i;
+
+ for (i = 0; i < 3; i++) {
+ if (screen->constbuf_parm[i])
+ nouveau_bo_ref(NULL, &screen->constbuf_parm[i]);
+ }
+
+ if (screen->constbuf_misc[0])
+ nouveau_bo_ref(NULL, &screen->constbuf_misc[0]);
+ if (screen->tic)
+ nouveau_bo_ref(NULL, &screen->tic);
+ if (screen->tsc)
+ nouveau_bo_ref(NULL, &screen->tsc);
+ if (screen->static_init)
+ so_ref(NULL, &screen->static_init);
nouveau_notifier_free(&screen->sync);
nouveau_grobj_free(&screen->tesla);
@@ -171,6 +189,28 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
FREE(screen);
}
+static int
+nv50_pre_pipebuffer_map(struct pipe_screen *pscreen, struct pipe_buffer *pb,
+ unsigned usage)
+{
+ struct nv50_screen *screen = nv50_screen(pscreen);
+ struct nv50_context *ctx = screen->cur_ctx;
+
+ if (!(pb->usage & PIPE_BUFFER_USAGE_VERTEX))
+ return 0;
+
+ /* Our vtxbuf got mapped; it can no longer be considered part of the
+ * current state, so remove it to avoid emitting reloc markers.
+ */
+ if (ctx && ctx->state.vtxbuf && so_bo_is_reloc(ctx->state.vtxbuf,
+ nouveau_bo(pb))) {
+ so_ref(NULL, &ctx->state.vtxbuf);
+ ctx->dirty |= NV50_NEW_ARRAYS;
+ }
+
+ return 0;
+}
+
struct pipe_screen *
nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
{
@@ -198,6 +238,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
pscreen->get_param = nv50_screen_get_param;
pscreen->get_paramf = nv50_screen_get_paramf;
pscreen->is_format_supported = nv50_screen_is_format_supported;
+ screen->base.pre_pipebuffer_map_callback = nv50_pre_pipebuffer_map;
nv50_screen_init_miptree_functions(pscreen);
nv50_transfer_init_screen_functions(pscreen);
@@ -210,7 +251,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
nv50_screen_destroy(pscreen);
return NULL;
}
- BIND_RING(chan, screen->m2mf, 1);
/* 2D object */
ret = nouveau_grobj_alloc(chan, 0xbeef502d, NV50_2D, &screen->eng2d);
@@ -219,7 +259,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
nv50_screen_destroy(pscreen);
return NULL;
}
- BIND_RING(chan, screen->eng2d, 2);
/* 3D object */
switch (chipset & 0xf0) {
@@ -228,8 +267,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
break;
case 0x80:
case 0x90:
- /* this stupid name should be corrected. */
- tesla_class = NV54TCL;
+ tesla_class = NV84TCL;
break;
case 0xa0:
switch (chipset) {
@@ -239,7 +277,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
tesla_class = NVA0TCL;
break;
default:
- tesla_class = 0x8597;
+ tesla_class = NVA8TCL;
break;
}
break;
@@ -256,7 +294,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
nv50_screen_destroy(pscreen);
return NULL;
}
- BIND_RING(chan, screen->tesla, 3);
/* Sync notifier */
ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
@@ -267,7 +304,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
}
/* Static M2MF init */
- so = so_new(32, 0);
+ so = so_new(1, 3, 0);
so_method(so, screen->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 3);
so_data (so, screen->sync->handle);
so_data (so, chan->vram->handle);
@@ -276,7 +313,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
so_ref (NULL, &so);
/* Static 2D init */
- so = so_new(64, 0);
+ so = so_new(4, 7, 0);
so_method(so, screen->eng2d, NV50_2D_DMA_NOTIFY, 4);
so_data (so, screen->sync->handle);
so_data (so, chan->vram->handle);
@@ -284,7 +321,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
so_data (so, chan->vram->handle);
so_method(so, screen->eng2d, NV50_2D_OPERATION, 1);
so_data (so, NV50_2D_OPERATION_SRCCOPY);
- so_method(so, screen->eng2d, 0x0290, 1);
+ so_method(so, screen->eng2d, NV50_2D_CLIP_ENABLE, 1);
so_data (so, 0);
so_method(so, screen->eng2d, 0x0888, 1);
so_data (so, 1);
@@ -292,33 +329,36 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
so_ref(NULL, &so);
/* Static tesla init */
- so = so_new(256, 20);
+ so = so_new(47, 95, 24);
- so_method(so, screen->tesla, 0x1558, 1);
- so_data (so, 1);
+ so_method(so, screen->tesla, NV50TCL_COND_MODE, 1);
+ so_data (so, NV50TCL_COND_MODE_ALWAYS);
so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1);
so_data (so, screen->sync->handle);
- so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0),
- NV50TCL_DMA_UNK0__SIZE);
- for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++)
+ so_method(so, screen->tesla, NV50TCL_DMA_ZETA, 11);
+ for (i = 0; i < 11; i++)
so_data(so, chan->vram->handle);
- so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0),
- NV50TCL_DMA_UNK1__SIZE);
- for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++)
+ so_method(so, screen->tesla, NV50TCL_DMA_COLOR(0),
+ NV50TCL_DMA_COLOR__SIZE);
+ for (i = 0; i < NV50TCL_DMA_COLOR__SIZE; i++)
so_data(so, chan->vram->handle);
- so_method(so, screen->tesla, 0x121c, 1);
+ so_method(so, screen->tesla, NV50TCL_RT_CONTROL, 1);
so_data (so, 1);
/* activate all 32 lanes (threads) in a warp */
- so_method(so, screen->tesla, 0x19a0, 1);
+ so_method(so, screen->tesla, NV50TCL_WARP_HALVES, 1);
so_data (so, 0x2);
so_method(so, screen->tesla, 0x1400, 1);
so_data (so, 0xf);
- so_method(so, screen->tesla, 0x13bc, 1);
- so_data (so, 0x54);
+ /* max TIC (bits 4:8) & TSC (ignored) bindings, per program type */
+ for (i = 0; i < 3; ++i) {
+ so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(i), 1);
+ so_data (so, 0x54);
+ }
+
/* origin is top left (set to 1 for bottom left) */
- so_method(so, screen->tesla, 0x13ac, 1);
+ so_method(so, screen->tesla, NV50TCL_Y_ORIGIN_BOTTOM, 1);
so_data (so, 0);
so_method(so, screen->tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
so_data (so, 8);
@@ -331,8 +371,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
return NULL;
}
- for (i = 0; i < 2; i++) {
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (128 * 4) * 4,
+ for (i = 0; i < 3; i++) {
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (256 * 4) * 4,
&screen->constbuf_parm[i]);
if (ret) {
nv50_screen_destroy(pscreen);
@@ -354,7 +394,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
// B = buffer ID (maybe more than 1 byte)
// N = CB index used in shader instruction
// P = program type (0 = VP, 2 = GP, 3 = FP)
- so_method(so, screen->tesla, 0x1694, 1);
+ so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
so_data (so, 0x000BBNP1);
*/
@@ -367,27 +407,51 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
so_data (so, 0x00000001 | (NV50_CB_PMISC << 12));
so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+ so_data (so, 0x00000021 | (NV50_CB_PMISC << 12));
+ so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
so_data (so, 0x00000031 | (NV50_CB_PMISC << 12));
+ /* bind auxiliary constbuf to immediate data bo */
so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
- so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
- NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
- so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |
- NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+ so_reloc (so, screen->constbuf_misc[0], (128 * 4) * 4,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, screen->constbuf_misc[0], (128 * 4) * 4,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+ so_data (so, (NV50_CB_AUX << 16) | 0x00000200);
+ so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+ so_data (so, 0x00000201 | (NV50_CB_AUX << 12));
+ so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+ so_data (so, 0x00000221 | (NV50_CB_AUX << 12));
+
+ so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_VERTEX], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_VERTEX], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
so_data (so, (NV50_CB_PVP << 16) | 0x00000800);
so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
so_data (so, 0x00000101 | (NV50_CB_PVP << 12));
so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
- so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
- NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
- so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |
- NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_GEOMETRY], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_GEOMETRY], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
+ so_data (so, (NV50_CB_PGP << 16) | 0x00000800);
+ so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
+ so_data (so, 0x00000121 | (NV50_CB_PGP << 12));
+
+ so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_FRAGMENT], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, screen->constbuf_parm[PIPE_SHADER_FRAGMENT], 0,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
so_data (so, (NV50_CB_PFP << 16) | 0x00000800);
so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
so_data (so, 0x00000131 | (NV50_CB_PFP << 12));
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tic);
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, PIPE_SHADER_TYPES*32*32,
+ &screen->tic);
if (ret) {
nv50_screen_destroy(pscreen);
return NULL;
@@ -398,9 +462,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
- so_data (so, 0x000007ff);
+ so_data (so, PIPE_SHADER_TYPES * 32 - 1);
- ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tsc);
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, PIPE_SHADER_TYPES*32*32,
+ &screen->tsc);
if (ret) {
nv50_screen_destroy(pscreen);
return NULL;
@@ -411,27 +476,31 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM |
NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
- so_data (so, 0x00000000);
+ so_data (so, 0x00000000); /* ignored if TSC_LINKED (0x1234) = 1 */
/* Vertex array limits - max them out */
for (i = 0; i < 16; i++) {
- so_method(so, screen->tesla, NV50TCL_UNK1080_OFFSET_HIGH(i), 2);
+ so_method(so, screen->tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
so_data (so, 0x000000ff);
so_data (so, 0xffffffff);
}
- so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2);
+ so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR(0), 2);
so_data (so, fui(0.0));
so_data (so, fui(1.0));
- so_method(so, screen->tesla, 0x1234, 1);
+ /* no dynamic combination of TIC & TSC entries => only BIND_TIC used */
+ so_method(so, screen->tesla, NV50TCL_LINKED_TSC, 1);
so_data (so, 1);
/* activate first scissor rectangle */
- so_method(so, screen->tesla, NV50TCL_SCISSOR_ENABLE, 1);
+ so_method(so, screen->tesla, NV50TCL_SCISSOR_ENABLE(0), 1);
so_data (so, 1);
+ so_method(so, screen->tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+ so_data (so, 1); /* default edgeflag to TRUE */
+
so_emit(chan, so);
so_ref (so, &screen->static_init);
so_ref (NULL, &so);
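The SET_PROGRAM_CB immediates emitted above appear to follow the 0x000BBNP1 layout noted in the commented-out snippet earlier in this file: buffer ID in bits 12-19, the constant-buffer index used by shader code in bits 8-11, the program type in bits 4-7, and a valid bit at bit 0. A small hypothetical helper (not part of the driver) showing how those words decompose:

/* B = buffer ID, N = constbuf index seen by the shader,
 * P = program type (0 = VP, 2 = GP, 3 = FP), bit 0 = valid. */
static INLINE uint32_t
nv50_prog_cb_word(unsigned buf_id, unsigned cb_index, unsigned prog_type)
{
	return (buf_id << 12) | (cb_index << 8) | (prog_type << 4) | 1;
}

/* e.g. nv50_prog_cb_word(NV50_CB_PFP, 1, 3)
 *      == 0x00000131 | (NV50_CB_PFP << 12) as emitted above */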
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h
index 61e24a5b571..0d786b0f2e3 100644
--- a/src/gallium/drivers/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nv50/nv50_screen.h
@@ -2,6 +2,7 @@
#define __NV50_SCREEN_H__
#include "nouveau/nouveau_screen.h"
+#include "nv50_context.h"
struct nv50_screen {
struct nouveau_screen base;
@@ -9,6 +10,7 @@ struct nv50_screen {
struct nouveau_winsys *nvws;
unsigned cur_pctx;
+ struct nv50_context *cur_ctx;
struct nouveau_grobj *tesla;
struct nouveau_grobj *eng2d;
@@ -16,10 +18,12 @@ struct nv50_screen {
struct nouveau_notifier *sync;
struct nouveau_bo *constbuf_misc[1];
- struct nouveau_bo *constbuf_parm[2];
+ struct nouveau_bo *constbuf_parm[PIPE_SHADER_TYPES];
struct nouveau_resource *immd_heap[1];
- struct nouveau_resource *parm_heap[2];
+ struct nouveau_resource *parm_heap[PIPE_SHADER_TYPES];
+
+ struct pipe_buffer *strm_vbuf[16];
struct nouveau_bo *tic;
struct nouveau_bo *tsc;
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index ffaa5e29d1c..6ab33be663d 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -35,7 +35,7 @@ static void *
nv50_blend_state_create(struct pipe_context *pipe,
const struct pipe_blend_state *cso)
{
- struct nouveau_stateobj *so = so_new(64, 0);
+ struct nouveau_stateobj *so = so_new(5, 24, 0);
struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj);
unsigned cmask = 0, i;
@@ -146,7 +146,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
(wrap_mode(cso->wrap_r) << 6));
switch (cso->mag_img_filter) {
- case PIPE_TEX_FILTER_ANISO:
case PIPE_TEX_FILTER_LINEAR:
tsc[1] |= NV50TSC_1_1_MAGF_LINEAR;
break;
@@ -157,7 +156,6 @@ nv50_sampler_state_create(struct pipe_context *pipe,
}
switch (cso->min_img_filter) {
- case PIPE_TEX_FILTER_ANISO:
case PIPE_TEX_FILTER_LINEAR:
tsc[1] |= NV50TSC_1_1_MINF_LINEAR;
break;
@@ -196,8 +194,9 @@ nv50_sampler_state_create(struct pipe_context *pipe,
}
if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
- tsc[0] |= (1 << 8);
- tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7);
+ /* XXX: must be deactivated for non-shadow textures */
+ tsc[0] |= (1 << 9);
+ tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7) << 10;
}
limit = CLAMP(cso->lod_bias, -16.0, 15.0);
@@ -215,46 +214,71 @@ nv50_sampler_state_create(struct pipe_context *pipe,
return (void *)sso;
}
-static void
-nv50_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+static INLINE void
+nv50_sampler_state_bind(struct pipe_context *pipe, unsigned type,
+ unsigned nr, void **sampler)
{
struct nv50_context *nv50 = nv50_context(pipe);
- int i;
- nv50->sampler_nr = nr;
- for (i = 0; i < nv50->sampler_nr; i++)
- nv50->sampler[i] = sampler[i];
+ memcpy(nv50->sampler[type], sampler, nr * sizeof(void *));
+ nv50->sampler_nr[type] = nr;
nv50->dirty |= NV50_NEW_SAMPLER;
}
static void
+nv50_vp_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+ nv50_sampler_state_bind(pipe, PIPE_SHADER_VERTEX, nr, s);
+}
+
+static void
+nv50_fp_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+ nv50_sampler_state_bind(pipe, PIPE_SHADER_FRAGMENT, nr, s);
+}
+
+static void
nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
{
FREE(hwcso);
}
-static void
-nv50_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
- struct pipe_texture **pt)
+static INLINE void
+nv50_set_sampler_texture(struct pipe_context *pipe, unsigned type,
+ unsigned nr, struct pipe_texture **pt)
{
struct nv50_context *nv50 = nv50_context(pipe);
- int i;
+ unsigned i;
for (i = 0; i < nr; i++)
- pipe_texture_reference((void *)&nv50->miptree[i], pt[i]);
- for (i = nr; i < nv50->miptree_nr; i++)
- pipe_texture_reference((void *)&nv50->miptree[i], NULL);
+ pipe_texture_reference((void *)&nv50->miptree[type][i], pt[i]);
+ for (i = nr; i < nv50->miptree_nr[type]; i++)
+ pipe_texture_reference((void *)&nv50->miptree[type][i], NULL);
- nv50->miptree_nr = nr;
+ nv50->miptree_nr[type] = nr;
nv50->dirty |= NV50_NEW_TEXTURE;
}
+static void
+nv50_set_vp_sampler_textures(struct pipe_context *pipe,
+ unsigned nr, struct pipe_texture **pt)
+{
+ nv50_set_sampler_texture(pipe, PIPE_SHADER_VERTEX, nr, pt);
+}
+
+static void
+nv50_set_fp_sampler_textures(struct pipe_context *pipe,
+ unsigned nr, struct pipe_texture **pt)
+{
+ nv50_set_sampler_texture(pipe, PIPE_SHADER_FRAGMENT, nr, pt);
+}
+
static void *
nv50_rasterizer_state_create(struct pipe_context *pipe,
const struct pipe_rasterizer_state *cso)
{
- struct nouveau_stateobj *so = so_new(64, 0);
+ struct nouveau_stateobj *so = so_new(15, 21, 0);
struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
struct nv50_rasterizer_stateobj *rso =
CALLOC_STRUCT(nv50_rasterizer_stateobj);
@@ -269,7 +293,7 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
so_method(so, tesla, NV50TCL_SHADE_MODEL, 1);
so_data (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT :
NV50TCL_SHADE_MODEL_SMOOTH);
- so_method(so, tesla, 0x1684, 1);
+ so_method(so, tesla, NV50TCL_PROVOKING_VERTEX_LAST, 1);
so_data (so, cso->flatshade_first ? 0 : 1);
so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
@@ -366,7 +390,7 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1);
so_data (so, fui(cso->offset_scale));
so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1);
- so_data (so, fui(cso->offset_units));
+ so_data (so, fui(cso->offset_units * 2.0f));
}
rso->pipe = *cso;
@@ -399,7 +423,7 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
{
struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla;
struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj);
- struct nouveau_stateobj *so = so_new(64, 0);
+ struct nouveau_stateobj *so = so_new(8, 22, 0);
so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1);
so_data (so, cso->depth.writemask ? 1 : 0);
@@ -413,9 +437,8 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
so_data (so, 0);
}
- /* XXX: keep hex values until header is updated (names reversed) */
if (cso->stencil[0].enabled) {
- so_method(so, tesla, 0x1380, 8);
+ so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8);
so_data (so, 1);
so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op));
so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
@@ -425,23 +448,23 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
so_data (so, cso->stencil[0].writemask);
so_data (so, cso->stencil[0].valuemask);
} else {
- so_method(so, tesla, 0x1380, 1);
+ so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1);
so_data (so, 0);
}
if (cso->stencil[1].enabled) {
- so_method(so, tesla, 0x1594, 5);
+ so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5);
so_data (so, 1);
so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op));
so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
so_data (so, nvgl_comparison_op(cso->stencil[1].func));
- so_method(so, tesla, 0x0f54, 3);
+ so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3);
so_data (so, cso->stencil[1].ref_value);
so_data (so, cso->stencil[1].writemask);
so_data (so, cso->stencil[1].valuemask);
} else {
- so_method(so, tesla, 0x1594, 1);
+ so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1);
so_data (so, 0);
}
@@ -508,7 +531,7 @@ nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso)
struct nv50_program *p = hwcso;
nv50_program_destroy(nv50, p);
- FREE((void*)p->pipe.tokens);
+ FREE((void *)p->pipe.tokens);
FREE(p);
}
@@ -540,7 +563,39 @@ nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso)
struct nv50_program *p = hwcso;
nv50_program_destroy(nv50, p);
- FREE((void*)p->pipe.tokens);
+ FREE((void *)p->pipe.tokens);
+ FREE(p);
+}
+
+static void *
+nv50_gp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso)
+{
+ struct nv50_program *p = CALLOC_STRUCT(nv50_program);
+
+ p->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+ p->type = PIPE_SHADER_GEOMETRY;
+ tgsi_scan_shader(p->pipe.tokens, &p->info);
+ return (void *)p;
+}
+
+static void
+nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+
+ nv50->geomprog = hwcso;
+ nv50->dirty |= NV50_NEW_GEOMPROG;
+}
+
+static void
+nv50_gp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nv50_program *p = hwcso;
+
+ nv50_program_destroy(nv50, p);
+ FREE((void *)p->pipe.tokens);
FREE(p);
}
@@ -562,17 +617,21 @@ nv50_set_clip_state(struct pipe_context *pipe,
static void
nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
- const struct pipe_constant_buffer *buf )
+ struct pipe_buffer *buf )
{
struct nv50_context *nv50 = nv50_context(pipe);
if (shader == PIPE_SHADER_VERTEX) {
- nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer;
+ nv50->constbuf[PIPE_SHADER_VERTEX] = buf;
nv50->dirty |= NV50_NEW_VERTPROG_CB;
} else
if (shader == PIPE_SHADER_FRAGMENT) {
- nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer;
+ nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf;
nv50->dirty |= NV50_NEW_FRAGPROG_CB;
+ } else
+ if (shader == PIPE_SHADER_GEOMETRY) {
+ nv50->constbuf[PIPE_SHADER_GEOMETRY] = buf;
+ nv50->dirty |= NV50_NEW_GEOMPROG_CB;
}
}
@@ -648,9 +707,11 @@ nv50_init_state_functions(struct nv50_context *nv50)
nv50->pipe.delete_blend_state = nv50_blend_state_delete;
nv50->pipe.create_sampler_state = nv50_sampler_state_create;
- nv50->pipe.bind_sampler_states = nv50_sampler_state_bind;
nv50->pipe.delete_sampler_state = nv50_sampler_state_delete;
- nv50->pipe.set_sampler_textures = nv50_set_sampler_texture;
+ nv50->pipe.bind_fragment_sampler_states = nv50_fp_sampler_state_bind;
+ nv50->pipe.bind_vertex_sampler_states = nv50_vp_sampler_state_bind;
+ nv50->pipe.set_fragment_sampler_textures = nv50_set_fp_sampler_textures;
+ nv50->pipe.set_vertex_sampler_textures = nv50_set_vp_sampler_textures;
nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create;
nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind;
@@ -671,6 +732,10 @@ nv50_init_state_functions(struct nv50_context *nv50)
nv50->pipe.bind_fs_state = nv50_fp_state_bind;
nv50->pipe.delete_fs_state = nv50_fp_state_delete;
+ nv50->pipe.create_gs_state = nv50_gp_state_create;
+ nv50->pipe.bind_gs_state = nv50_gp_state_bind;
+ nv50->pipe.delete_gs_state = nv50_gp_state_delete;
+
nv50->pipe.set_blend_color = nv50_set_blend_color;
nv50->pipe.set_clip_state = nv50_set_clip_state;
nv50->pipe.set_constant_buffer = nv50_set_constant_buffer;
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 799d2758fee..956da9b304c 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -33,7 +33,7 @@ static void
nv50_state_validate_fb(struct nv50_context *nv50)
{
struct nouveau_grobj *tesla = nv50->screen->tesla;
- struct nouveau_stateobj *so = so_new(128, 18);
+ struct nouveau_stateobj *so = so_new(32, 79, 18);
struct pipe_framebuffer_state *fb = &nv50->framebuffer;
unsigned i, w, h, gw = 0;
@@ -41,7 +41,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
* FP result 0 always goes to RT[0], bits 4 - 6 are ignored.
* Ambiguous assignment results in no rendering (no DATA_ERROR).
*/
- so_method(so, tesla, 0x121c, 1);
+ so_method(so, tesla, NV50TCL_RT_CONTROL, 1);
so_data (so, fb->nr_cbufs |
(0 << 4) | (1 << 7) | (2 << 10) | (3 << 13) |
(4 << 16) | (5 << 19) | (6 << 22) | (7 << 25));
@@ -87,7 +87,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
level[fb->cbufs[i]->level].tile_mode << 4);
so_data(so, 0x00000000);
- so_method(so, tesla, 0x1224, 1);
+ so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1);
so_data (so, 1);
}
@@ -124,22 +124,22 @@ nv50_state_validate_fb(struct nv50_context *nv50)
level[fb->zsbuf->level].tile_mode << 4);
so_data(so, 0x00000000);
- so_method(so, tesla, 0x1538, 1);
+ so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
so_data (so, 1);
so_method(so, tesla, NV50TCL_ZETA_HORIZ, 3);
so_data (so, fb->zsbuf->width);
so_data (so, fb->zsbuf->height);
so_data (so, 0x00010001);
} else {
- so_method(so, tesla, 0x1538, 1);
+ so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
so_data (so, 0);
}
- so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2);
+ so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ(0), 2);
so_data (so, w << 16);
so_data (so, h << 16);
/* set window lower left corner */
- so_method(so, tesla, NV50TCL_WINDOW_LEFT, 2);
+ so_method(so, tesla, NV50TCL_WINDOW_OFFSET_X, 2);
so_data (so, 0);
so_data (so, 0);
/* set screen scissor rectangle */
@@ -156,11 +156,38 @@ nv50_state_validate_fb(struct nv50_context *nv50)
}
static void
+nv50_validate_samplers(struct nv50_context *nv50, struct nouveau_stateobj *so,
+ unsigned p)
+{
+ struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+ unsigned i, j, dw = nv50->sampler_nr[p] * 8;
+
+ if (!dw)
+ return;
+ nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM,
+ p * (32 * 8 * 4), dw * 4);
+
+ so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), dw);
+
+ for (i = 0; i < nv50->sampler_nr[p]; ++i) {
+ if (nv50->sampler[p][i])
+ so_datap(so, nv50->sampler[p][i]->tsc, 8);
+ else {
+ for (j = 0; j < 8; ++j) /* you get punished */
+ so_data(so, 0); /* ... for leaving holes */
+ }
+ }
+}
+
+static void
nv50_state_emit(struct nv50_context *nv50)
{
struct nv50_screen *screen = nv50->screen;
struct nouveau_channel *chan = screen->base.channel;
+ /* I don't want to copy headers from the winsys. */
+ screen->cur_ctx = nv50;
+
if (nv50->pctx_id != screen->cur_pctx) {
if (nv50->state.fb)
nv50->state.dirty |= NV50_NEW_FRAMEBUFFER;
@@ -172,6 +199,8 @@ nv50_state_emit(struct nv50_context *nv50)
nv50->state.dirty |= NV50_NEW_VERTPROG;
if (nv50->state.fragprog)
nv50->state.dirty |= NV50_NEW_FRAGPROG;
+ if (nv50->state.geomprog)
+ nv50->state.dirty |= NV50_NEW_GEOMPROG;
if (nv50->state.rast)
nv50->state.dirty |= NV50_NEW_RASTERIZER;
if (nv50->state.blend_colour)
@@ -201,8 +230,14 @@ nv50_state_emit(struct nv50_context *nv50)
so_emit(chan, nv50->state.vertprog);
if (nv50->state.dirty & NV50_NEW_FRAGPROG)
so_emit(chan, nv50->state.fragprog);
- if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
- so_emit(chan, nv50->state.programs);
+ if (nv50->state.dirty & NV50_NEW_GEOMPROG && nv50->state.geomprog)
+ so_emit(chan, nv50->state.geomprog);
+ if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
+ NV50_NEW_GEOMPROG | NV50_NEW_RASTERIZER))
+ so_emit(chan, nv50->state.fp_linkage);
+ if ((nv50->state.dirty & (NV50_NEW_VERTPROG | NV50_NEW_GEOMPROG))
+ && nv50->state.gp_linkage)
+ so_emit(chan, nv50->state.gp_linkage);
if (nv50->state.dirty & NV50_NEW_RASTERIZER)
so_emit(chan, nv50->state.rast);
if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR)
@@ -239,13 +274,15 @@ nv50_state_flush_notify(struct nouveau_channel *chan)
so_emit_reloc_markers(chan, nv50->state.fragprog);
so_emit_reloc_markers(chan, nv50->state.vtxbuf);
so_emit_reloc_markers(chan, nv50->screen->static_init);
+
+ if (nv50->state.instbuf)
+ so_emit_reloc_markers(chan, nv50->state.instbuf);
}
boolean
nv50_state_validate(struct nv50_context *nv50)
{
struct nouveau_grobj *tesla = nv50->screen->tesla;
- struct nouveau_grobj *eng2d = nv50->screen->eng2d;
struct nouveau_stateobj *so;
unsigned i;
@@ -264,14 +301,21 @@ nv50_state_validate(struct nv50_context *nv50)
if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB))
nv50_fragprog_validate(nv50);
- if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG))
- nv50_linkage_validate(nv50);
+ if (nv50->dirty & (NV50_NEW_GEOMPROG | NV50_NEW_GEOMPROG_CB))
+ nv50_geomprog_validate(nv50);
+
+ if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
+ NV50_NEW_GEOMPROG | NV50_NEW_RASTERIZER))
+ nv50_fp_linkage_validate(nv50);
+
+ if (nv50->dirty & (NV50_NEW_GEOMPROG | NV50_NEW_VERTPROG))
+ nv50_gp_linkage_validate(nv50);
if (nv50->dirty & NV50_NEW_RASTERIZER)
so_ref(nv50->rasterizer->so, &nv50->state.rast);
if (nv50->dirty & NV50_NEW_BLEND_COLOUR) {
- so = so_new(5, 0);
+ so = so_new(1, 4, 0);
so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4);
so_data (so, fui(nv50->blend_colour.color[0]));
so_data (so, fui(nv50->blend_colour.color[1]));
@@ -282,10 +326,10 @@ nv50_state_validate(struct nv50_context *nv50)
}
if (nv50->dirty & NV50_NEW_STIPPLE) {
- so = so_new(33, 0);
+ so = so_new(1, 32, 0);
so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32);
for (i = 0; i < 32; i++)
- so_data(so, nv50->stipple.stipple[i]);
+ so_data(so, util_bswap32(nv50->stipple.stipple[i]));
so_ref(so, &nv50->state.stipple);
so_ref(NULL, &so);
}
@@ -299,8 +343,8 @@ nv50_state_validate(struct nv50_context *nv50)
goto scissor_uptodate;
nv50->state.scissor_enabled = rast->scissor;
- so = so_new(3, 0);
- so_method(so, tesla, NV50TCL_SCISSOR_HORIZ, 2);
+ so = so_new(1, 2, 0);
+ so_method(so, tesla, NV50TCL_SCISSOR_HORIZ(0), 2);
if (nv50->state.scissor_enabled) {
so_data(so, (s->maxx << 16) | s->minx);
so_data(so, (s->maxy << 16) | s->miny);
@@ -328,13 +372,13 @@ scissor_uptodate:
goto viewport_uptodate;
nv50->state.viewport_bypass = bypass;
- so = so_new(14, 0);
+ so = so_new(5, 9, 0);
if (!bypass) {
- so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
+ so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE_X(0), 3);
so_data (so, fui(nv50->viewport.translate[0]));
so_data (so, fui(nv50->viewport.translate[1]));
so_data (so, fui(nv50->viewport.translate[2]));
- so_method(so, tesla, NV50TCL_VIEWPORT_SCALE(0), 3);
+ so_method(so, tesla, NV50TCL_VIEWPORT_SCALE_X(0), 3);
so_data (so, fui(nv50->viewport.scale[0]));
so_data (so, fui(nv50->viewport.scale[1]));
so_data (so, fui(nv50->viewport.scale[2]));
@@ -367,22 +411,18 @@ scissor_uptodate:
viewport_uptodate:
if (nv50->dirty & NV50_NEW_SAMPLER) {
- unsigned i;
+ unsigned nr = 0;
- so = so_new(nv50->sampler_nr * 9 + 23 + 4, 2);
+ for (i = 0; i < PIPE_SHADER_TYPES; ++i)
+ nr += nv50->sampler_nr[i];
- nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM,
- nv50->sampler_nr * 8 * 4);
+ so = so_new(1 + 5 * PIPE_SHADER_TYPES,
+ 1 + 19 * PIPE_SHADER_TYPES + nr * 8,
+ PIPE_SHADER_TYPES * 2);
- for (i = 0; i < nv50->sampler_nr; i++) {
- if (!nv50->sampler[i])
- continue;
- so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8);
- so_datap (so, nv50->sampler[i]->tsc, 8);
- }
+ nv50_validate_samplers(nv50, so, PIPE_SHADER_VERTEX);
+ nv50_validate_samplers(nv50, so, PIPE_SHADER_FRAGMENT);
- so_method(so, tesla, 0x1440, 1); /* sync SIFC */
- so_data (so, 0);
so_method(so, tesla, 0x1334, 1); /* flush TSC */
so_data (so, 0);
@@ -405,10 +445,13 @@ viewport_uptodate:
void nv50_so_init_sifc(struct nv50_context *nv50,
struct nouveau_stateobj *so,
- struct nouveau_bo *bo, unsigned reloc, unsigned size)
+ struct nouveau_bo *bo, unsigned reloc,
+ unsigned offset, unsigned size)
{
struct nouveau_grobj *eng2d = nv50->screen->eng2d;
+ reloc |= NOUVEAU_BO_WR;
+
so_method(so, eng2d, NV50_2D_DST_FORMAT, 2);
so_data (so, NV50_2D_DST_FORMAT_R8_UNORM);
so_data (so, 1);
@@ -416,9 +459,9 @@ void nv50_so_init_sifc(struct nv50_context *nv50,
so_data (so, 262144);
so_data (so, 65536);
so_data (so, 1);
- so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0);
- so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0);
- so_method(so, eng2d, NV50_2D_SIFC_UNK0800, 2);
+ so_reloc (so, bo, offset, reloc | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, bo, offset, reloc | NOUVEAU_BO_LOW, 0, 0);
+ so_method(so, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
so_data (so, 0);
so_data (so, NV50_2D_SIFC_FORMAT_R8_UNORM);
so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10);
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 6bf6f773b0c..6378132979e 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -62,6 +62,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
return 1;
if (!bo->tile_flags) {
+ MARK_RING (chan, 9, 2); /* flush on lack of space or relocs */
BEGIN_RING(chan, eng2d, mthd, 2);
OUT_RING (chan, format);
OUT_RING (chan, 1);
@@ -72,6 +73,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
OUT_RELOCh(chan, bo, ps->offset, flags);
OUT_RELOCl(chan, bo, ps->offset, flags);
} else {
+ MARK_RING (chan, 11, 2); /* flush on lack of space or relocs */
BEGIN_RING(chan, eng2d, mthd, 5);
OUT_RING (chan, format);
OUT_RING (chan, 0);
@@ -174,11 +176,11 @@ nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
if (ret)
return;
- BEGIN_RING(chan, eng2d, 0x0580, 3);
- OUT_RING (chan, 4);
+ BEGIN_RING(chan, eng2d, NV50_2D_DRAW_SHAPE, 3);
+ OUT_RING (chan, NV50_2D_DRAW_SHAPE_RECTANGLES);
OUT_RING (chan, format);
OUT_RING (chan, value);
- BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4);
+ BEGIN_RING(chan, eng2d, NV50_2D_DRAW_POINT32_X(0), 4);
OUT_RING (chan, destx);
OUT_RING (chan, desty);
OUT_RING (chan, width);
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index 2813f544770..9f1a1713032 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -25,6 +25,8 @@
#include "nouveau/nouveau_stateobj.h"
+#include "util/u_format.h"
+
#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f) \
{ \
PIPE_FORMAT_##pf, \
@@ -68,6 +70,7 @@ static const struct nv50_texture_format nv50_tex_format_list[] =
_(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5),
_MIXED(Z24S8_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8),
+ _MIXED(S8Z24_UNORM, UNORM, UINT, UINT, UINT, C0, C0, C0, ONE, 8_24),
_(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16),
_(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16),
@@ -85,10 +88,11 @@ static const struct nv50_texture_format nv50_tex_format_list[] =
static int
nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
- struct nv50_miptree *mt, int unit)
+ struct nv50_miptree *mt, int unit, unsigned p)
{
unsigned i;
uint32_t mode;
+ const struct util_format_description *desc;
for (i = 0; i < NV50_TEX_FORMAT_LIST_SIZE; i++)
if (nv50_tex_format_list[i].pf == mt->base.base.format)
@@ -96,7 +100,7 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
if (i == NV50_TEX_FORMAT_LIST_SIZE)
return 1;
- if (nv50->sampler[unit]->normalized)
+ if (nv50->sampler[p][unit]->normalized)
mode = 0x50001000 | (1 << 31);
else {
mode = 0x50001000 | (7 << 14);
@@ -106,7 +110,10 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
mode |= ((mt->base.bo->tile_mode & 0x0f) << 22) |
((mt->base.bo->tile_mode & 0xf0) << 21);
- if (pf_type(mt->base.base.format) == PIPE_FORMAT_TYPE_SRGB)
+ desc = util_format_description(mt->base.base.format);
+ assert(desc);
+
+ if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
mode |= 0x0400;
switch (mt->base.base.target) {
@@ -131,64 +138,91 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so,
NOUVEAU_BO_RD, 0, 0);
so_data (so, mode);
so_data (so, 0x00300000);
- so_data (so, mt->base.base.width[0] | (1 << 31));
+ so_data (so, mt->base.base.width0 | (1 << 31));
so_data (so, (mt->base.base.last_level << 28) |
- (mt->base.base.depth[0] << 16) | mt->base.base.height[0]);
+ (mt->base.base.depth0 << 16) | mt->base.base.height0);
so_data (so, 0x03000000);
so_data (so, mt->base.base.last_level << 4);
return 0;
}
-void
-nv50_tex_validate(struct nv50_context *nv50)
+#ifndef NV50TCL_BIND_TIC
+#define NV50TCL_BIND_TIC(n) (0x1448 + 8 * n)
+#endif
+
+static boolean
+nv50_validate_textures(struct nv50_context *nv50, struct nouveau_stateobj *so,
+ unsigned p)
{
+ static const unsigned p_remap[PIPE_SHADER_TYPES] = { 0, 2, 1 };
+
struct nouveau_grobj *eng2d = nv50->screen->eng2d;
struct nouveau_grobj *tesla = nv50->screen->tesla;
- struct nouveau_stateobj *so;
- unsigned i, unit, push;
-
- push = MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2 + 23 + 6;
- so = so_new(nv50->miptree_nr * 9 + push, nv50->miptree_nr * 2 + 2);
+ unsigned unit, j, p_hw = p_remap[p];
nv50_so_init_sifc(nv50, so, nv50->screen->tic, NOUVEAU_BO_VRAM,
- nv50->miptree_nr * 8 * 4);
-
- for (i = 0, unit = 0; unit < nv50->miptree_nr; ++unit) {
- struct nv50_miptree *mt = nv50->miptree[unit];
+ p * (32 * 8 * 4), nv50->miptree_nr[p] * 8 * 4);
- if (!mt)
- continue;
+ for (unit = 0; unit < nv50->miptree_nr[p]; ++unit) {
+ struct nv50_miptree *mt = nv50->miptree[p][unit];
so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8);
- if (nv50_tex_construct(nv50, so, mt, unit)) {
- NOUVEAU_ERR("failed tex validate\n");
- so_ref(NULL, &so);
- return;
+ if (mt) {
+ if (nv50_tex_construct(nv50, so, mt, unit, p))
+ return FALSE;
+ /* Set TEX insn $t src binding $unit in program type p
+ * to TIC, TSC entry (32 * p + unit), mark valid (1).
+ */
+ so_method(so, tesla, NV50TCL_BIND_TIC(p_hw), 1);
+ so_data (so, ((32 * p + unit) << 9) | (unit << 1) | 1);
+ } else {
+ for (j = 0; j < 8; ++j)
+ so_data(so, 0);
+ so_method(so, tesla, NV50TCL_BIND_TIC(p_hw), 1);
+ so_data (so, (unit << 1) | 0);
}
+ }
+
+ for (; unit < nv50->state.miptree_nr[p]; unit++) {
+ /* Make other bindings invalid. */
+ so_method(so, tesla, NV50TCL_BIND_TIC(p_hw), 1);
+ so_data (so, (unit << 1) | 0);
+ }
- so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
- so_data (so, (i++ << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) |
- (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) |
- NV50TCL_SET_SAMPLER_TEX_VALID);
+ nv50->state.miptree_nr[p] = nv50->miptree_nr[p];
+ return TRUE;
+}
+
+void
+nv50_tex_validate(struct nv50_context *nv50)
+{
+ struct nouveau_stateobj *so;
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ unsigned p, start, push, nrlc;
+
+ for (nrlc = 0, start = 0, push = 0, p = 0; p < PIPE_SHADER_TYPES; ++p) {
+ start += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
+ push += MAX2(nv50->miptree_nr[p], nv50->state.miptree_nr[p]);
+ nrlc += nv50->miptree_nr[p];
}
+ start = start * 2 + 4 * PIPE_SHADER_TYPES + 2;
+ push = push * 9 + 19 * PIPE_SHADER_TYPES + 2;
+ nrlc = nrlc * 2 + 2 * PIPE_SHADER_TYPES;
+
+ so = so_new(start, push, nrlc);
- for (; unit < nv50->state.miptree_nr; unit++) {
- so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1);
- so_data (so,
- (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0);
+ if (nv50_validate_textures(nv50, so, PIPE_SHADER_VERTEX) == FALSE ||
+ nv50_validate_textures(nv50, so, PIPE_SHADER_FRAGMENT) == FALSE) {
+ so_ref(NULL, &so);
+
+ NOUVEAU_ERR("failed tex validate\n");
+ return;
}
- /* not sure if the following really do what I think: */
- so_method(so, tesla, 0x1440, 1); /* sync SIFC */
- so_data (so, 0);
so_method(so, tesla, 0x1330, 1); /* flush TIC */
so_data (so, 0);
- so_method(so, tesla, 0x1338, 1); /* flush texture caches */
- so_data (so, 0x20);
so_ref(so, &nv50->state.tic_upload);
so_ref(NULL, &so);
- nv50->state.miptree_nr = nv50->miptree_nr;
}
-
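The comment in nv50_validate_textures above spells out the BIND_TIC payload; the following standalone sketch re-creates that encoding, assuming only the bit positions visible in the so_data() call (TIC entry 32 * p + unit at bit 9, the unit slot at bit 1, valid flag in bit 0). The function name is hypothetical:

#include <stdint.h>

/* Rebuilds the BIND_TIC word emitted above: TIC/TSC entry index at bit 9,
 * texture unit slot at bit 1, valid flag in bit 0. */
static uint32_t
nv50_bind_tic_word(unsigned p, unsigned unit, int valid)
{
	return ((32 * p + unit) << 9) | (unit << 1) | (valid ? 1 : 0);
}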
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
index d531e611327..b870302019a 100644
--- a/src/gallium/drivers/nv50/nv50_texture.h
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -82,6 +82,7 @@
#define NV50TIC_0_0_FMT_RGTC1 0x00000027
#define NV50TIC_0_0_FMT_RGTC2 0x00000028
#define NV50TIC_0_0_FMT_24_8 0x00000029
+#define NV50TIC_0_0_FMT_8_24 0x0000002a
#define NV50TIC_0_0_FMT_32_DEPTH 0x0000002f
#define NV50TIC_0_0_FMT_32_8 0x00000030
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index ea61357aaa6..a2f1db2914c 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -1,6 +1,8 @@
#include "pipe/p_context.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
#include "nv50_context.h"
@@ -15,16 +17,19 @@ struct nv50_transfer {
int level_depth;
int level_x;
int level_y;
+ int level_z;
+ unsigned nblocksx;
+ unsigned nblocksy;
};
static void
nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
struct nouveau_bo *src_bo, unsigned src_offset,
int src_pitch, unsigned src_tile_mode,
- int sx, int sy, int sw, int sh, int sd,
+ int sx, int sy, int sz, int sw, int sh, int sd,
struct nouveau_bo *dst_bo, unsigned dst_offset,
int dst_pitch, unsigned dst_tile_mode,
- int dx, int dy, int dw, int dh, int dd,
+ int dx, int dy, int dz, int dw, int dh, int dd,
int cpp, int width, int height,
unsigned src_reloc, unsigned dst_reloc)
{
@@ -42,7 +47,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 1);
OUT_RING (chan, 1);
BEGIN_RING(chan, m2mf,
- NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
+ NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
OUT_RING (chan, src_pitch);
src_offset += (sy * src_pitch) + (sx * cpp);
} else {
@@ -53,7 +58,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
OUT_RING (chan, sw * cpp);
OUT_RING (chan, sh);
OUT_RING (chan, sd);
- OUT_RING (chan, 0);
+ OUT_RING (chan, sz); /* copying only 1 zslice per call */
}
if (!dst_bo->tile_flags) {
@@ -61,7 +66,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 1);
OUT_RING (chan, 1);
BEGIN_RING(chan, m2mf,
- NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
+ NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
OUT_RING (chan, dst_pitch);
dst_offset += (dy * dst_pitch) + (dx * cpp);
} else {
@@ -72,19 +77,19 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
OUT_RING (chan, dw * cpp);
OUT_RING (chan, dh);
OUT_RING (chan, dd);
- OUT_RING (chan, 0);
+ OUT_RING (chan, dz); /* copying only 1 zslice per call */
}
while (height) {
int line_count = height > 2047 ? 2047 : height;
- WAIT_RING (chan, 15);
+ MARK_RING (chan, 15, 4); /* flush on lack of space or relocs */
BEGIN_RING(chan, m2mf,
NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH, 2);
OUT_RELOCh(chan, src_bo, src_offset, src_reloc);
OUT_RELOCh(chan, dst_bo, dst_offset, dst_reloc);
BEGIN_RING(chan, m2mf,
- NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
+ NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
if (src_bo->tile_flags) {
@@ -102,7 +107,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
dst_offset += (line_count * dst_pitch);
}
BEGIN_RING(chan, m2mf,
- NV50_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
+ NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
OUT_RING (chan, width * cpp);
OUT_RING (chan, line_count);
OUT_RING (chan, 0x00000101);
@@ -115,20 +120,6 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
}
}
-static INLINE unsigned
-get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned ny)
-{
- unsigned tile_h = get_tile_height(tile_mode);
- unsigned tile_d = get_tile_depth(tile_mode);
-
- /* pitch_2d == to next slice within this volume-tile */
- /* pitch_3d == to next slice in next 2D array of blocks */
- unsigned pitch_2d = tile_h * 64;
- unsigned pitch_3d = tile_d * align(ny, tile_h) * pitch;
-
- return (z % tile_d) * pitch_2d + (z / tile_d) * pitch_3d;
-}
-
static struct pipe_transfer *
nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
unsigned face, unsigned level, unsigned zslice,
@@ -150,56 +141,43 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
return NULL;
pipe_texture_reference(&tx->base.texture, pt);
- tx->base.format = pt->format;
+ tx->nblocksx = util_format_get_nblocksx(pt->format, u_minify(pt->width0, level));
+ tx->nblocksy = util_format_get_nblocksy(pt->format, u_minify(pt->height0, level));
tx->base.width = w;
tx->base.height = h;
- tx->base.block = pt->block;
- if (!pt->nblocksx[level]) {
- tx->base.nblocksx = pf_get_nblocksx(&pt->block,
- pt->width[level]);
- tx->base.nblocksy = pf_get_nblocksy(&pt->block,
- pt->height[level]);
- } else {
- tx->base.nblocksx = pt->nblocksx[level];
- tx->base.nblocksy = pt->nblocksy[level];
- }
- tx->base.stride = tx->base.nblocksx * pt->block.size;
+ tx->base.stride = tx->nblocksx * util_format_get_blocksize(pt->format);
tx->base.usage = usage;
tx->level_pitch = lvl->pitch;
- tx->level_width = mt->base.base.width[level];
- tx->level_height = mt->base.base.height[level];
- tx->level_depth = mt->base.base.depth[level];
+ tx->level_width = u_minify(mt->base.base.width0, level);
+ tx->level_height = u_minify(mt->base.base.height0, level);
+ tx->level_depth = u_minify(mt->base.base.depth0, level);
tx->level_offset = lvl->image_offset[image];
tx->level_tiling = lvl->tile_mode;
- tx->level_x = pf_get_nblocksx(&tx->base.block, x);
- tx->level_y = pf_get_nblocksy(&tx->base.block, y);
+ tx->level_z = zslice;
+ tx->level_x = util_format_get_nblocksx(pt->format, x);
+ tx->level_y = util_format_get_nblocksy(pt->format, y);
ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
- tx->base.nblocksy * tx->base.stride, &tx->bo);
+ tx->nblocksy * tx->base.stride, &tx->bo);
if (ret) {
FREE(tx);
return NULL;
}
- if (pt->target == PIPE_TEXTURE_3D)
- tx->level_offset += get_zslice_offset(lvl->tile_mode, zslice,
- lvl->pitch,
- tx->base.nblocksy);
-
if (usage & PIPE_TRANSFER_READ) {
- nx = pf_get_nblocksx(&tx->base.block, tx->base.width);
- ny = pf_get_nblocksy(&tx->base.block, tx->base.height);
+ nx = util_format_get_nblocksx(pt->format, tx->base.width);
+ ny = util_format_get_nblocksy(pt->format, tx->base.height);
nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset,
tx->level_pitch, tx->level_tiling,
- x, y,
- tx->base.nblocksx, tx->base.nblocksy,
+ x, y, zslice,
+ tx->nblocksx, tx->nblocksy,
tx->level_depth,
tx->bo, 0,
tx->base.stride, tx->bo->tile_mode,
- 0, 0,
- tx->base.nblocksx, tx->base.nblocksy, 1,
- tx->base.block.size, nx, ny,
+ 0, 0, 0,
+ tx->nblocksx, tx->nblocksy, 1,
+ util_format_get_blocksize(pt->format), nx, ny,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART,
NOUVEAU_BO_GART);
}
@@ -212,23 +190,24 @@ nv50_transfer_del(struct pipe_transfer *ptx)
{
struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
struct nv50_miptree *mt = nv50_miptree(ptx->texture);
+ struct pipe_texture *pt = ptx->texture;
- unsigned nx = pf_get_nblocksx(&tx->base.block, tx->base.width);
- unsigned ny = pf_get_nblocksy(&tx->base.block, tx->base.height);
+ unsigned nx = util_format_get_nblocksx(pt->format, tx->base.width);
+ unsigned ny = util_format_get_nblocksy(pt->format, tx->base.height);
if (ptx->usage & PIPE_TRANSFER_WRITE) {
- struct pipe_screen *pscreen = ptx->texture->screen;
+ struct pipe_screen *pscreen = pt->screen;
nv50_transfer_rect_m2mf(pscreen, tx->bo, 0,
tx->base.stride, tx->bo->tile_mode,
- 0, 0,
- tx->base.nblocksx, tx->base.nblocksy, 1,
+ 0, 0, 0,
+ tx->nblocksx, tx->nblocksy, 1,
mt->base.bo, tx->level_offset,
tx->level_pitch, tx->level_tiling,
- tx->level_x, tx->level_y,
- tx->base.nblocksx, tx->base.nblocksy,
+ tx->level_x, tx->level_y, tx->level_z,
+ tx->nblocksx, tx->nblocksy,
tx->level_depth,
- tx->base.block.size, nx, ny,
+ util_format_get_blocksize(pt->format), nx, ny,
NOUVEAU_BO_GART, NOUVEAU_BO_VRAM |
NOUVEAU_BO_GART);
}
@@ -287,7 +266,7 @@ nv50_upload_sifc(struct nv50_context *nv50,
reloc |= NOUVEAU_BO_WR;
- WAIT_RING (chan, 32);
+ MARK_RING (chan, 32, 2); /* flush on lack of space or relocs */
if (bo->tile_flags) {
BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5);
@@ -312,7 +291,7 @@ nv50_upload_sifc(struct nv50_context *nv50,
/* NV50_2D_OPERATION_SRCCOPY assumed already set */
- BEGIN_RING(chan, eng2d, NV50_2D_SIFC_UNK0800, 2);
+ BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
OUT_RING (chan, 0);
OUT_RING (chan, src_format);
BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
@@ -355,6 +334,6 @@ nv50_upload_sifc(struct nv50_context *nv50,
src += src_pitch;
}
- BEGIN_RING(chan, tesla, 0x1440, 1);
+ BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1);
OUT_RING (chan, 0);
}
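A minimal sketch of the transfer stride computed in nv50_transfer_new above, using the util_format helpers that replace the old per-level block fields; the wrapper name is hypothetical and the calls mirror those in the patch:

#include "util/u_format.h"
#include "util/u_math.h"

/* Row stride of a transfer of mip level 'level' for a texture of base width
 * width0: blocks per row at that level times bytes per block. */
static unsigned
nv50_transfer_stride(enum pipe_format format, unsigned width0, unsigned level)
{
	unsigned nblocksx = util_format_get_nblocksx(format, u_minify(width0, level));

	return nblocksx * util_format_get_blocksize(format);
}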
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index db54380241f..bfb1b34d27a 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -24,6 +24,8 @@
#include "pipe/p_state.h"
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+
#include "nv50_context.h"
static boolean
@@ -38,6 +40,8 @@ nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned);
static boolean
nv50_push_arrays(struct nv50_context *, unsigned, unsigned);
+#define NV50_USING_LOATHED_EDGEFLAG(ctx) ((ctx)->vertprog->cfg.edgeflag_in < 16)
+
static INLINE unsigned
nv50_prim(unsigned mode)
{
@@ -53,6 +57,14 @@ nv50_prim(unsigned mode)
case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS;
case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP;
case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON;
+ case PIPE_PRIM_LINES_ADJACENCY:
+ return NV50TCL_VERTEX_BEGIN_LINES_ADJACENCY;
+ case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+ return NV50TCL_VERTEX_BEGIN_LINE_STRIP_ADJACENCY;
+ case PIPE_PRIM_TRIANGLES_ADJACENCY:
+ return NV50TCL_VERTEX_BEGIN_TRIANGLES_ADJACENCY;
+ case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+ return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP_ADJACENCY;
default:
break;
}
@@ -62,18 +74,25 @@ nv50_prim(unsigned mode)
}
static INLINE uint32_t
-nv50_vbo_type_to_hw(unsigned type)
+nv50_vbo_type_to_hw(enum pipe_format format)
{
- switch (type) {
- case PIPE_FORMAT_TYPE_FLOAT:
+ const struct util_format_description *desc;
+
+ desc = util_format_description(format);
+ assert(desc);
+
+ switch (desc->channel[0].type) {
+ case UTIL_FORMAT_TYPE_FLOAT:
return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT;
- case PIPE_FORMAT_TYPE_UNORM:
- return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM;
- case PIPE_FORMAT_TYPE_SNORM:
- return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM;
- case PIPE_FORMAT_TYPE_USCALED:
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ if (desc->channel[0].normalized) {
+ return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_UNORM;
+ }
return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_USCALED;
- case PIPE_FORMAT_TYPE_SSCALED:
+ case UTIL_FORMAT_TYPE_SIGNED:
+ if (desc->channel[0].normalized) {
+ return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SNORM;
+ }
return NV50TCL_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED;
/*
case PIPE_FORMAT_TYPE_UINT:
@@ -90,19 +109,19 @@ nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
{
static const uint32_t hw_values[] = {
0, 0, 0, 0,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8_8,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16_16,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16,
0, 0, 0, 0,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32,
- NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32_32 };
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32,
+ NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 };
/* we'd also have R11G11B10 and R10G10B10A2 */
@@ -120,9 +139,15 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
{
uint32_t hw_type, hw_size;
enum pipe_format pf = ve->src_format;
- unsigned size = pf_size_x(pf) << pf_exp2(pf);
+ const struct util_format_description *desc;
+ unsigned size;
+
+ desc = util_format_description(pf);
+ assert(desc);
+
+ size = util_format_get_component_bits(pf, UTIL_FORMAT_COLORSPACE_RGB, 0);
- hw_type = nv50_vbo_type_to_hw(pf_type(pf));
+ hw_type = nv50_vbo_type_to_hw(pf);
hw_size = nv50_vbo_size_to_hw(size, ve->nr_components);
if (!hw_type || !hw_size) {
@@ -131,13 +156,316 @@ nv50_vbo_vtxelt_to_hw(struct pipe_vertex_element *ve)
return 0x24e80000;
}
- if (pf_swizzle_x(pf) == 2) /* BGRA */
+ if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_Z) /* BGRA */
hw_size |= (1 << 31); /* no real swizzle bits :-( */
return (hw_type | hw_size);
}
-boolean
+/* For instanced drawing from user buffers, hitting the FIFO repeatedly
+ * with the same vertex data is probably worse than uploading all data.
+ */
+static boolean
+nv50_upload_vtxbuf(struct nv50_context *nv50, unsigned i)
+{
+ struct nv50_screen *nscreen = nv50->screen;
+ struct pipe_screen *pscreen = &nscreen->base.base;
+ struct pipe_buffer *buf = nscreen->strm_vbuf[i];
+ struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
+ uint8_t *src;
+ unsigned size = align(vb->buffer->size, 4096);
+
+ if (buf && buf->size < size)
+ pipe_buffer_reference(&nscreen->strm_vbuf[i], NULL);
+
+ if (!nscreen->strm_vbuf[i]) {
+ nscreen->strm_vbuf[i] = pipe_buffer_create(
+ pscreen, 0, PIPE_BUFFER_USAGE_VERTEX, size);
+ buf = nscreen->strm_vbuf[i];
+ }
+
+ src = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+ if (!src)
+ return FALSE;
+ src += vb->buffer_offset;
+
+ size = (vb->max_index + 1) * vb->stride + 16; /* + 16 is for stride 0 */
+ if (vb->buffer_offset + size > vb->buffer->size)
+ size = vb->buffer->size - vb->buffer_offset;
+
+ pipe_buffer_write(pscreen, buf, vb->buffer_offset, size, src);
+ pipe_buffer_unmap(pscreen, vb->buffer);
+
+ vb->buffer = buf; /* don't pipe_reference, this is a private copy */
+ return TRUE;
+}
+
+static void
+nv50_upload_user_vbufs(struct nv50_context *nv50)
+{
+ unsigned i;
+
+ if (nv50->vbo_fifo)
+ nv50->dirty |= NV50_NEW_ARRAYS;
+ if (!(nv50->dirty & NV50_NEW_ARRAYS))
+ return;
+
+ for (i = 0; i < nv50->vtxbuf_nr; ++i) {
+ if (nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX)
+ continue;
+ nv50_upload_vtxbuf(nv50, i);
+ }
+}
+
+static void
+nv50_set_static_vtxattr(struct nv50_context *nv50, unsigned i, void *data)
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nouveau_channel *chan = tesla->channel;
+ float v[4];
+
+ util_format_read_4f(nv50->vtxelt[i].src_format,
+ v, 0, data, 0, 0, 0, 1, 1);
+
+ switch (nv50->vtxelt[i].nr_components) {
+ case 4:
+ BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_4F_X(i), 4);
+ OUT_RINGf (chan, v[0]);
+ OUT_RINGf (chan, v[1]);
+ OUT_RINGf (chan, v[2]);
+ OUT_RINGf (chan, v[3]);
+ break;
+ case 3:
+ BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_3F_X(i), 3);
+ OUT_RINGf (chan, v[0]);
+ OUT_RINGf (chan, v[1]);
+ OUT_RINGf (chan, v[2]);
+ break;
+ case 2:
+ BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_2F_X(i), 2);
+ OUT_RINGf (chan, v[0]);
+ OUT_RINGf (chan, v[1]);
+ break;
+ case 1:
+ BEGIN_RING(chan, tesla, NV50TCL_VTX_ATTR_1F(i), 1);
+ OUT_RINGf (chan, v[0]);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+static unsigned
+init_per_instance_arrays_immd(struct nv50_context *nv50,
+ unsigned startInstance,
+ unsigned pos[16], unsigned step[16])
+{
+ struct nouveau_bo *bo;
+ unsigned i, b, count = 0;
+
+ for (i = 0; i < nv50->vtxelt_nr; ++i) {
+ if (!nv50->vtxelt[i].instance_divisor)
+ continue;
+ ++count;
+ b = nv50->vtxelt[i].vertex_buffer_index;
+
+ pos[i] = nv50->vtxelt[i].src_offset +
+ nv50->vtxbuf[b].buffer_offset +
+ startInstance * nv50->vtxbuf[b].stride;
+ step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+ bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+ if (!bo->map)
+ nouveau_bo_map(bo, NOUVEAU_BO_RD);
+
+ nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+ }
+
+ return count;
+}
+
+static unsigned
+init_per_instance_arrays(struct nv50_context *nv50,
+ unsigned startInstance,
+ unsigned pos[16], unsigned step[16])
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nouveau_channel *chan = tesla->channel;
+ struct nouveau_bo *bo;
+ struct nouveau_stateobj *so;
+ unsigned i, b, count = 0;
+ const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+ if (nv50->vbo_fifo)
+ return init_per_instance_arrays_immd(nv50, startInstance,
+ pos, step);
+
+ so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+ for (i = 0; i < nv50->vtxelt_nr; ++i) {
+ if (!nv50->vtxelt[i].instance_divisor)
+ continue;
+ ++count;
+ b = nv50->vtxelt[i].vertex_buffer_index;
+
+ pos[i] = nv50->vtxelt[i].src_offset +
+ nv50->vtxbuf[b].buffer_offset +
+ startInstance * nv50->vtxbuf[b].stride;
+
+ if (!startInstance) {
+ step[i] = 0;
+ continue;
+ }
+ step[i] = startInstance % nv50->vtxelt[i].instance_divisor;
+
+ bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+ so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+ so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+ }
+
+ if (count && startInstance) {
+ so_ref (so, &nv50->state.instbuf); /* for flush notify */
+ so_emit(chan, nv50->state.instbuf);
+ }
+ so_ref (NULL, &so);
+
+ return count;
+}
+
+static void
+step_per_instance_arrays_immd(struct nv50_context *nv50,
+ unsigned pos[16], unsigned step[16])
+{
+ struct nouveau_bo *bo;
+ unsigned i, b;
+
+ for (i = 0; i < nv50->vtxelt_nr; ++i) {
+ if (!nv50->vtxelt[i].instance_divisor)
+ continue;
+ if (++step[i] != nv50->vtxelt[i].instance_divisor)
+ continue;
+ b = nv50->vtxelt[i].vertex_buffer_index;
+ bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+ step[i] = 0;
+ pos[i] += nv50->vtxbuf[b].stride;
+
+ nv50_set_static_vtxattr(nv50, i, (uint8_t *)bo->map + pos[i]);
+ }
+}
+
+static void
+step_per_instance_arrays(struct nv50_context *nv50,
+ unsigned pos[16], unsigned step[16])
+{
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nouveau_channel *chan = tesla->channel;
+ struct nouveau_bo *bo;
+ struct nouveau_stateobj *so;
+ unsigned i, b;
+ const uint32_t rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+ if (nv50->vbo_fifo) {
+ step_per_instance_arrays_immd(nv50, pos, step);
+ return;
+ }
+
+ so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 2, nv50->vtxelt_nr * 2);
+
+ for (i = 0; i < nv50->vtxelt_nr; ++i) {
+ if (!nv50->vtxelt[i].instance_divisor)
+ continue;
+ b = nv50->vtxelt[i].vertex_buffer_index;
+
+ if (++step[i] == nv50->vtxelt[i].instance_divisor) {
+ step[i] = 0;
+ pos[i] += nv50->vtxbuf[b].stride;
+ }
+
+ bo = nouveau_bo(nv50->vtxbuf[b].buffer);
+
+ so_method(so, tesla, NV50TCL_VERTEX_ARRAY_START_HIGH(i), 2);
+ so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_HIGH, 0, 0);
+ so_reloc (so, bo, pos[i], rl | NOUVEAU_BO_LOW, 0, 0);
+ }
+
+ so_ref (so, &nv50->state.instbuf); /* for flush notify */
+ so_ref (NULL, &so);
+
+ so_emit(chan, nv50->state.instbuf);
+}
+
+static INLINE void
+nv50_unmap_vbufs(struct nv50_context *nv50)
+{
+ unsigned i;
+
+ for (i = 0; i < nv50->vtxbuf_nr; ++i)
+ if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
+ nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
+}
+
+void
+nv50_draw_arrays_instanced(struct pipe_context *pipe,
+ unsigned mode, unsigned start, unsigned count,
+ unsigned startInstance, unsigned instanceCount)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_channel *chan = nv50->screen->tesla->channel;
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ unsigned i, nz_divisors;
+ unsigned step[16], pos[16];
+
+ if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+ nv50_upload_user_vbufs(nv50);
+
+ nv50_state_validate(nv50);
+
+ nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+ BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+ OUT_RING (chan, NV50_CB_AUX | (24 << 8));
+ OUT_RING (chan, startInstance);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+ OUT_RING (chan, nv50_prim(mode));
+
+ if (nv50->vbo_fifo)
+ nv50_push_arrays(nv50, start, count);
+ else {
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+ OUT_RING (chan, start);
+ OUT_RING (chan, count);
+ }
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+ OUT_RING (chan, 0);
+
+ for (i = 1; i < instanceCount; i++) {
+ if (nz_divisors) /* any non-zero array divisors ? */
+ step_per_instance_arrays(nv50, pos, step);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+ OUT_RING (chan, nv50_prim(mode) | (1 << 28));
+
+ if (nv50->vbo_fifo)
+ nv50_push_arrays(nv50, start, count);
+ else {
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
+ OUT_RING (chan, start);
+ OUT_RING (chan, count);
+ }
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+ OUT_RING (chan, 0);
+ }
+ nv50_unmap_vbufs(nv50);
+
+ so_ref(NULL, &nv50->state.instbuf);
+}
+
+void
nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
unsigned count)
{
@@ -167,7 +495,11 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
OUT_RING (chan, 0);
- return ret;
+ nv50_unmap_vbufs(nv50);
+
+ /* XXX: not sure what to do if ret != TRUE: flush and retry?
+ */
+ assert(ret);
}
static INLINE boolean
@@ -183,7 +515,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
return nv50_push_elements_u08(nv50, map, count);
if (count & 1) {
- BEGIN_RING(chan, tesla, 0x15e8, 1);
+ BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
OUT_RING (chan, map[0]);
map++;
count--;
@@ -193,7 +525,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
unsigned nr = count > 2046 ? 2046 : count;
int i;
- BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
for (i = 0; i < nr; i += 2)
OUT_RING (chan, (map[i + 1] << 16) | map[i]);
@@ -216,7 +548,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
return nv50_push_elements_u16(nv50, map, count);
if (count & 1) {
- BEGIN_RING(chan, tesla, 0x15e8, 1);
+ BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
OUT_RING (chan, map[0]);
map++;
count--;
@@ -226,7 +558,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
unsigned nr = count > 2046 ? 2046 : count;
int i;
- BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U16, nr >> 1);
for (i = 0; i < nr; i += 2)
OUT_RING (chan, (map[i + 1] << 16) | map[i]);
@@ -251,7 +583,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
while (count) {
unsigned nr = count > 2047 ? 2047 : count;
- BEGIN_RING(chan, tesla, 0x400015e8, nr);
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VB_ELEMENT_U32, nr);
OUT_RINGp (chan, map, nr);
count -= nr;
@@ -260,7 +592,78 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
return TRUE;
}
-boolean
+static INLINE void
+nv50_draw_elements_inline(struct nv50_context *nv50,
+ void *map, unsigned indexSize,
+ unsigned start, unsigned count)
+{
+ switch (indexSize) {
+ case 1:
+ nv50_draw_elements_inline_u08(nv50, map, start, count);
+ break;
+ case 2:
+ nv50_draw_elements_inline_u16(nv50, map, start, count);
+ break;
+ case 4:
+ nv50_draw_elements_inline_u32(nv50, map, start, count);
+ break;
+ }
+}
+
+void
+nv50_draw_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode, unsigned start, unsigned count,
+ unsigned startInstance, unsigned instanceCount)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_grobj *tesla = nv50->screen->tesla;
+ struct nouveau_channel *chan = tesla->channel;
+ struct pipe_screen *pscreen = pipe->screen;
+ void *map;
+ unsigned i, nz_divisors;
+ unsigned step[16], pos[16];
+
+ map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
+
+ if (!NV50_USING_LOATHED_EDGEFLAG(nv50))
+ nv50_upload_user_vbufs(nv50);
+
+ nv50_state_validate(nv50);
+
+ nz_divisors = init_per_instance_arrays(nv50, startInstance, pos, step);
+
+ BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
+ OUT_RING (chan, NV50_CB_AUX | (24 << 8));
+ OUT_RING (chan, startInstance);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+ OUT_RING (chan, nv50_prim(mode));
+
+ nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+ OUT_RING (chan, 0);
+
+ for (i = 1; i < instanceCount; ++i) {
+ if (nz_divisors) /* any non-zero array divisors ? */
+ step_per_instance_arrays(nv50, pos, step);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
+ OUT_RING (chan, nv50_prim(mode) | (1 << 28));
+
+ nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
+ BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
+ OUT_RING (chan, 0);
+ }
+ nv50_unmap_vbufs(nv50);
+
+ so_ref(NULL, &nv50->state.instbuf);
+}
+
+void
nv50_draw_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer, unsigned indexSize,
unsigned mode, unsigned start, unsigned count)
@@ -270,7 +673,6 @@ nv50_draw_elements(struct pipe_context *pipe,
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct pipe_screen *pscreen = pipe->screen;
void *map;
- boolean ret;
map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ);
@@ -283,27 +685,15 @@ nv50_draw_elements(struct pipe_context *pipe,
BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
OUT_RING (chan, nv50_prim(mode));
- switch (indexSize) {
- case 1:
- ret = nv50_draw_elements_inline_u08(nv50, map, start, count);
- break;
- case 2:
- ret = nv50_draw_elements_inline_u16(nv50, map, start, count);
- break;
- case 4:
- ret = nv50_draw_elements_inline_u32(nv50, map, start, count);
- break;
- default:
- assert(0);
- ret = FALSE;
- break;
- }
+
+ nv50_draw_elements_inline(nv50, map, indexSize, start, count);
+
BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
OUT_RING (chan, 0);
- pipe_buffer_unmap(pscreen, indexBuffer);
+ nv50_unmap_vbufs(nv50);
- return ret;
+ pipe_buffer_unmap(pscreen, indexBuffer);
}
static INLINE boolean
@@ -316,22 +706,19 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
struct nouveau_stateobj *so;
struct nouveau_grobj *tesla = nv50->screen->tesla;
struct nouveau_bo *bo = nouveau_bo(vb->buffer);
- float *v;
+ float v[4];
int ret;
- enum pipe_format pf = ve->src_format;
-
- if ((pf_type(pf) != PIPE_FORMAT_TYPE_FLOAT) ||
- (pf_size_x(pf) << pf_exp2(pf)) != 32)
- return FALSE;
ret = nouveau_bo_map(bo, NOUVEAU_BO_RD);
if (ret)
return FALSE;
- v = (float *)(bo->map + (vb->buffer_offset + ve->src_offset));
+ util_format_read_4f(ve->src_format, v, 0, (uint8_t *)bo->map +
+ (vb->buffer_offset + ve->src_offset), 0,
+ 0, 0, 1, 1);
so = *pso;
if (!so)
- *pso = so = so_new(nv50->vtxelt_nr * 5, 0);
+ *pso = so = so_new(nv50->vtxelt_nr, nv50->vtxelt_nr * 4, 0);
switch (ve->nr_components) {
case 4:
@@ -353,6 +740,10 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
so_data (so, fui(v[1]));
break;
case 1:
+ if (attrib == nv50->vertprog->cfg.edgeflag_in) {
+ so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+ so_data (so, v[0] ? 1 : 0);
+ }
so_method(so, tesla, NV50TCL_VTX_ATTR_1F(attrib), 1);
so_data (so, fui(v[0]));
break;
@@ -382,11 +773,14 @@ nv50_vbo_validate(struct nv50_context *nv50)
!(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX))
nv50->vbo_fifo = 0xffff;
+ if (NV50_USING_LOATHED_EDGEFLAG(nv50))
+ nv50->vbo_fifo = 0xffff; /* vertprog can't set edgeflag */
+
n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
vtxattr = NULL;
- vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4);
- vtxfmt = so_new(n_ve + 1, 0);
+ vtxbuf = so_new(n_ve * 2, n_ve * 5, nv50->vtxelt_nr * 4);
+ vtxfmt = so_new(1, n_ve, 0);
so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve);
for (i = 0; i < nv50->vtxelt_nr; i++) {
@@ -407,17 +801,20 @@ nv50_vbo_validate(struct nv50_context *nv50)
nv50->vbo_fifo &= ~(1 << i);
continue;
}
- so_data(vtxfmt, hw | i);
if (nv50->vbo_fifo) {
+ so_data (vtxfmt, hw |
+ (ve->instance_divisor ? (1 << 4) : i));
so_method(vtxbuf, tesla,
NV50TCL_VERTEX_ARRAY_FORMAT(i), 1);
so_data (vtxbuf, 0);
continue;
}
+ so_data(vtxfmt, hw | i);
so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3);
- so_data (vtxbuf, 0x20000000 | vb->stride);
+ so_data (vtxbuf, 0x20000000 |
+ (ve->instance_divisor ? 0 : vb->stride));
so_reloc (vtxbuf, bo, vb->buffer_offset +
ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
@@ -426,7 +823,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
/* vertex array limits */
- so_method(vtxbuf, tesla, 0x1080 + (i * 8), 2);
+ so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
so_reloc (vtxbuf, bo, vb->buffer->size - 1,
NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
NOUVEAU_BO_HIGH, 0, 0);
@@ -455,11 +852,14 @@ typedef void (*pfn_push)(struct nouveau_channel *, void *);
struct nv50_vbo_emitctx
{
pfn_push push[16];
- void *map[16];
+ uint8_t *map[16];
unsigned stride[16];
unsigned nr_ve;
unsigned vtx_dwords;
unsigned vtx_max;
+
+ float edgeflag;
+ unsigned ve_edgeflag;
};
static INLINE void
@@ -490,19 +890,18 @@ nv50_map_vbufs(struct nv50_context *nv50)
for (i = 0; i < nv50->vtxbuf_nr; ++i) {
struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
- unsigned size, delta;
+ unsigned size = vb->stride * (vb->max_index + 1) + 16;
if (nouveau_bo(vb->buffer)->map)
continue;
- size = vb->stride * (vb->max_index + 1);
- delta = vb->buffer_offset;
-
+ size = vb->stride * (vb->max_index + 1) + 16;
+ size = MIN2(size, vb->buffer->size);
if (!size)
- size = vb->buffer->size - vb->buffer_offset;
+ size = vb->buffer->size;
if (nouveau_bo_map_range(nouveau_bo(vb->buffer),
- delta, size, NOUVEAU_BO_RD))
+ 0, size, NOUVEAU_BO_RD))
break;
}
@@ -513,16 +912,6 @@ nv50_map_vbufs(struct nv50_context *nv50)
return FALSE;
}
-static INLINE void
-nv50_unmap_vbufs(struct nv50_context *nv50)
-{
- unsigned i;
-
- for (i = 0; i < nv50->vtxbuf_nr; ++i)
- if (nouveau_bo(nv50->vtxbuf[i].buffer)->map)
- nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer));
-}
-
static void
emit_b32_1(struct nouveau_channel *chan, void *data)
{
@@ -603,26 +992,34 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
if (nv50_map_vbufs(nv50) == FALSE)
return FALSE;
+ emit->ve_edgeflag = nv50->vertprog->cfg.edgeflag_in;
+
+ emit->edgeflag = 0.5f;
emit->nr_ve = 0;
emit->vtx_dwords = 0;
for (i = 0; i < nv50->vtxelt_nr; ++i) {
struct pipe_vertex_element *ve;
struct pipe_vertex_buffer *vb;
- unsigned n, type, size;
+ unsigned n, size;
+ const struct util_format_description *desc;
ve = &nv50->vtxelt[i];
vb = &nv50->vtxbuf[ve->vertex_buffer_index];
- if (!(nv50->vbo_fifo & (1 << i)))
+ if (!(nv50->vbo_fifo & (1 << i)) || ve->instance_divisor)
continue;
n = emit->nr_ve++;
emit->stride[n] = vb->stride;
- emit->map[n] = nouveau_bo(vb->buffer)->map +
+ emit->map[n] = (uint8_t *)nouveau_bo(vb->buffer)->map +
+ vb->buffer_offset +
(start * vb->stride + ve->src_offset);
- type = pf_type(ve->src_format);
- size = pf_size_x(ve->src_format) << pf_exp2(ve->src_format);
+ desc = util_format_description(ve->src_format);
+ assert(desc);
+
+ size = util_format_get_component_bits(
+ ve->src_format, UTIL_FORMAT_COLORSPACE_RGB, 0);
assert(ve->nr_components > 0 && ve->nr_components <= 4);
@@ -664,10 +1061,31 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
}
emit->vtx_max = 512 / emit->vtx_dwords;
+ if (emit->ve_edgeflag < 16)
+ emit->vtx_max = 1;
return TRUE;
}
+static INLINE void
+set_edgeflag(struct nouveau_channel *chan,
+ struct nouveau_grobj *tesla,
+ struct nv50_vbo_emitctx *emit, uint32_t index)
+{
+ unsigned i = emit->ve_edgeflag;
+
+ if (i < 16) {
+ float f = *((float *)(emit->map[i] + index * emit->stride[i]));
+
+ if (emit->edgeflag != f) {
+ emit->edgeflag = f;
+
+ BEGIN_RING(chan, tesla, 0x15e4, 1);
+ OUT_RING (chan, f ? 1 : 0);
+ }
+ }
+}
+
static boolean
nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
{
@@ -682,13 +1100,14 @@ nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
unsigned i, dw, nr = MIN2(count, emit.vtx_max);
dw = nr * emit.vtx_dwords;
- BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+ set_edgeflag(chan, tesla, &emit, 0); /* nr will be 1 */
+
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
for (i = 0; i < nr; ++i)
emit_vtx_next(chan, &emit);
count -= nr;
}
- nv50_unmap_vbufs(nv50);
return TRUE;
}
@@ -707,13 +1126,14 @@ nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count)
unsigned i, dw, nr = MIN2(count, emit.vtx_max);
dw = nr * emit.vtx_dwords;
- BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+ set_edgeflag(chan, tesla, &emit, *map);
+
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
for (i = 0; i < nr; ++i)
emit_vtx(chan, &emit, *map++);
count -= nr;
}
- nv50_unmap_vbufs(nv50);
return TRUE;
}
@@ -732,13 +1152,14 @@ nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count)
unsigned i, dw, nr = MIN2(count, emit.vtx_max);
dw = nr * emit.vtx_dwords;
- BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+ set_edgeflag(chan, tesla, &emit, *map);
+
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
for (i = 0; i < nr; ++i)
emit_vtx(chan, &emit, *map++);
count -= nr;
}
- nv50_unmap_vbufs(nv50);
return TRUE;
}
@@ -757,13 +1178,14 @@ nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count)
unsigned i, dw, nr = MIN2(count, emit.vtx_max);
dw = nr * emit.vtx_dwords;
- BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
+ set_edgeflag(chan, tesla, &emit, *map);
+
+ BEGIN_RING_NI(chan, tesla, NV50TCL_VERTEX_DATA, dw);
for (i = 0; i < nr; ++i)
emit_vtx(chan, &emit, *map++);
count -= nr;
}
- nv50_unmap_vbufs(nv50);
return TRUE;
}
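The instanced-draw path above advances per-instance arrays with a divisor counter; below is a minimal standalone sketch of that stepping rule, matching the logic of step_per_instance_arrays (names are illustrative):

/* Advance one instance: once the counter reaches the element's instance
 * divisor, reset it and move the array position one vertex stride ahead. */
static void
step_instance(unsigned divisor, unsigned stride, unsigned *step, unsigned *pos)
{
	if (++(*step) == divisor) {
		*step = 0;
		*pos += stride;
	}
}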
diff --git a/src/gallium/drivers/r300/Makefile b/src/gallium/drivers/r300/Makefile
index 121b65063f3..afddcb161fa 100644
--- a/src/gallium/drivers/r300/Makefile
+++ b/src/gallium/drivers/r300/Makefile
@@ -4,8 +4,8 @@ include $(TOP)/configs/current
LIBNAME = r300
C_SOURCES = \
+ r300_blit.c \
r300_chipset.c \
- r300_clear.c \
r300_context.c \
r300_debug.c \
r300_emit.c \
@@ -17,13 +17,13 @@ C_SOURCES = \
r300_state.c \
r300_state_derived.c \
r300_state_invariant.c \
- r300_vbo.c \
r300_vs.c \
r300_texture.c \
r300_tgsi_to_rc.c
LIBRARY_INCLUDES = \
- -I$(TOP)/src/mesa/drivers/dri/r300/compiler
+ -I$(TOP)/src/mesa/drivers/dri/r300/compiler \
+ -I$(TOP)/src/gallium/winsys/drm/radeon/core
COMPILER_ARCHIVE = $(TOP)/src/mesa/drivers/dri/r300/compiler/libr300compiler.a
diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript
index 97989040d2e..183aa17f9b3 100644
--- a/src/gallium/drivers/r300/SConscript
+++ b/src/gallium/drivers/r300/SConscript
@@ -4,13 +4,18 @@ r300compiler = SConscript('#/src/mesa/drivers/dri/r300/compiler/SConscript')
env = env.Clone()
# add the paths for r300compiler
-env.Append(CPPPATH = ['#/src/mesa/drivers/dri/r300/compiler', '#/include', '#/src/mesa'])
+env.Append(CPPPATH = [
+ '#/src/mesa/drivers/dri/r300/compiler',
+ '#/src/gallium/winsys/drm/radeon/core',
+ '#/include',
+ '#/src/mesa',
+])
r300 = env.ConvenienceLibrary(
target = 'r300',
source = [
+ 'r300_blit.c',
'r300_chipset.c',
- 'r300_clear.c',
'r300_context.c',
'r300_debug.c',
'r300_emit.c',
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
new file mode 100644
index 00000000000..c14414fff6b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "r300_blit.h"
+#include "r300_context.h"
+
+#include "util/u_rect.h"
+
+static void r300_blitter_save_states(struct r300_context* r300)
+{
+ util_blitter_save_blend(r300->blitter, r300->blend_state.state);
+ util_blitter_save_depth_stencil_alpha(r300->blitter, r300->dsa_state.state);
+ util_blitter_save_rasterizer(r300->blitter, r300->rs_state.state);
+ util_blitter_save_fragment_shader(r300->blitter, r300->fs);
+ util_blitter_save_vertex_shader(r300->blitter, r300->vs);
+}
+
+/* Clear currently bound buffers. */
+void r300_clear(struct pipe_context* pipe,
+ unsigned buffers,
+ const float* rgba,
+ double depth,
+ unsigned stencil)
+{
+ /* XXX Implement fastfill.
+ *
+ * If fastfill is enabled, a few facts should be considered:
+ *
+ * 1) Zbuffer must be micro-tiled and whole microtiles must be
+ * written.
+ *
+ * 2) ZB_DEPTHCLEARVALUE is used to clear a zbuffer and Z Mask must be
+ * equal to 0.
+ *
+ * 3) RB3D_COLOR_CLEAR_VALUE is used to clear a colorbuffer and
+ * RB3D_COLOR_CHANNEL_MASK must be equal to 0.
+ *
+ * 4) ZB_CB_CLEAR can be used to make the ZB units help in clearing
+ * the colorbuffer. The color clear value is supplied through both
+ * RB3D_COLOR_CLEAR_VALUE and ZB_DEPTHCLEARVALUE, and the colorbuffer
+ * must be set in ZB_DEPTHOFFSET and ZB_DEPTHPITCH in addition to
+ * RB3D_COLOROFFSET and RB3D_COLORPITCH. Note that the zbuffer itself
+ * will not be cleared this way, and neither can multiple render
+ * targets.
+ *
+ * 5) For 16-bit integer buffering, compression causes a hang with one or
+ * two samples and should not be used.
+ *
+ * 6) Fastfill must not be used if reading of compressed Z data is disabled
+ * and writing of compressed Z data is enabled (RD/WR_COMP_ENABLE),
+ * i.e. it cannot be used to compress the zbuffer.
+ * (what exactly does that mean, and how does it relate to clearing
+ * the buffers?)
+ *
+ * - Marek
+ */
+
+ struct r300_context* r300 = r300_context(pipe);
+
+ r300_blitter_save_states(r300);
+
+ util_blitter_clear(r300->blitter,
+ r300->framebuffer_state.width,
+ r300->framebuffer_state.height,
+ r300->framebuffer_state.nr_cbufs,
+ buffers, rgba, depth, stencil);
+}
+
+/* Copy a block of pixels from one surface to another. */
+void r300_surface_copy(struct pipe_context* pipe,
+ struct pipe_surface* dst,
+ unsigned dstx, unsigned dsty,
+ struct pipe_surface* src,
+ unsigned srcx, unsigned srcy,
+ unsigned width, unsigned height)
+{
+ struct r300_context* r300 = r300_context(pipe);
+
+ /* We have to save all of this state to ensure the blitter operation is
+ * really transparent. The saved states are restored by the blitter once
+ * copying is done. */
+ r300_blitter_save_states(r300);
+ util_blitter_save_framebuffer(r300->blitter, &r300->framebuffer_state);
+
+ util_blitter_save_fragment_sampler_states(
+ r300->blitter, r300->sampler_count, (void**)r300->sampler_states);
+
+ util_blitter_save_fragment_sampler_textures(
+ r300->blitter, r300->texture_count,
+ (struct pipe_texture**)r300->textures);
+
+ /* Do a copy */
+ util_blitter_copy(r300->blitter,
+ dst, dstx, dsty, src, srcx, srcy, width, height, TRUE);
+}
+
+/* Fill a region of a surface with a constant value. */
+void r300_surface_fill(struct pipe_context* pipe,
+ struct pipe_surface* dst,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height,
+ unsigned value)
+{
+ struct r300_context* r300 = r300_context(pipe);
+
+ r300_blitter_save_states(r300);
+ util_blitter_save_framebuffer(r300->blitter, &r300->framebuffer_state);
+
+ util_blitter_fill(r300->blitter,
+ dst, dstx, dsty, width, height, value);
+}
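
For context, these entry points are only reached through the generic pipe_context hooks that the context wires up later in this patch; nothing calls them directly. A minimal caller-side sketch, assuming the pipe_context clear/surface_copy signatures of this Gallium revision (the function name and literal values are illustrative only):

    /* Hypothetical state-tracker-side usage of the hooks added above. */
    static void clear_then_copy(struct pipe_context *pipe,
                                struct pipe_surface *dst,
                                struct pipe_surface *src)
    {
        static const float black[4] = {0.0f, 0.0f, 0.0f, 1.0f};

        /* Dispatches to r300_clear() once the context hooks it up. */
        pipe->clear(pipe, PIPE_CLEAR_COLOR | PIPE_CLEAR_DEPTHSTENCIL,
                    black, 1.0, 0);

        /* Dispatches to r300_surface_copy(). */
        pipe->surface_copy(pipe, dst, 0, 0, src, 0, 0,
                           dst->width, dst->height);
    }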
diff --git a/src/gallium/drivers/r300/r300_clear.h b/src/gallium/drivers/r300/r300_blit.h
index b8fcdf273c7..029e4f98e7d 100644
--- a/src/gallium/drivers/r300/r300_clear.h
+++ b/src/gallium/drivers/r300/r300_blit.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2008 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -20,10 +20,11 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
-#ifndef R300_CLEAR_H
-#define R300_CLEAR_H
+#ifndef R300_BLIT_H
+#define R300_BLIT_H
struct pipe_context;
+struct pipe_surface;
void r300_clear(struct pipe_context* pipe,
unsigned buffers,
@@ -31,4 +32,17 @@ void r300_clear(struct pipe_context* pipe,
double depth,
unsigned stencil);
-#endif /* R300_CLEAR_H */
+void r300_surface_copy(struct pipe_context* pipe,
+ struct pipe_surface* dst,
+ unsigned dstx, unsigned dsty,
+ struct pipe_surface* src,
+ unsigned srcx, unsigned srcy,
+ unsigned width, unsigned height);
+
+void r300_surface_fill(struct pipe_context* pipe,
+ struct pipe_surface* dst,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height,
+ unsigned value);
+
+#endif /* R300_BLIT_H */
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 51fdb82ff34..92de297ef1d 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -33,6 +33,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
/* Reasonable defaults */
caps->num_vert_fpus = 4;
caps->has_tcl = debug_get_bool_option("RADEON_NO_TCL", FALSE) ? FALSE : TRUE;
+ caps->is_r400 = FALSE;
caps->is_r500 = FALSE;
caps->high_second_pipe = FALSE;
@@ -123,6 +124,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x4A54:
caps->family = CHIP_FAMILY_R420;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x5548:
@@ -136,6 +138,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x5D57:
caps->family = CHIP_FAMILY_R423;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x554C:
@@ -147,6 +150,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x5D4A:
caps->family = CHIP_FAMILY_R430;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x5D4C:
@@ -157,6 +161,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x5D52:
caps->family = CHIP_FAMILY_R480;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x4B48:
@@ -166,6 +171,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x4B4C:
caps->family = CHIP_FAMILY_R481;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x5E4C:
@@ -182,6 +188,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x5E4D:
caps->family = CHIP_FAMILY_RV410;
caps->num_vert_fpus = 6;
+ caps->is_r400 = TRUE;
break;
case 0x5954:
@@ -212,6 +219,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x791F:
caps->family = CHIP_FAMILY_RS690;
caps->has_tcl = FALSE;
+ caps->is_r400 = TRUE;
break;
case 0x793F:
@@ -219,6 +227,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x7942:
caps->family = CHIP_FAMILY_RS600;
caps->has_tcl = FALSE;
+ caps->is_r400 = TRUE;
break;
case 0x796C:
@@ -227,6 +236,7 @@ void r300_parse_chipset(struct r300_capabilities* caps)
case 0x796F:
caps->family = CHIP_FAMILY_RS740;
caps->has_tcl = FALSE;
+ caps->is_r400 = TRUE;
break;
case 0x7100:
diff --git a/src/gallium/drivers/r300/r300_chipset.h b/src/gallium/drivers/r300/r300_chipset.h
index 0633a8b8a72..28084864929 100644
--- a/src/gallium/drivers/r300/r300_chipset.h
+++ b/src/gallium/drivers/r300/r300_chipset.h
@@ -40,11 +40,18 @@ struct r300_capabilities {
unsigned num_z_pipes;
/* Whether or not TCL is physically present */
boolean has_tcl;
+ /* Whether or not this is an R400; the differences compared to its R3xx
+ * cousins are:
+ * - Extended fragment shader registers
+ * - Blend LTE/GTE thresholds */
+ boolean is_r400;
/* Whether or not this is an RV515 or newer; R500s have many differences
* that require extra consideration, compared to their R3xx cousins:
* - Extra bit of width and height on texture sizes
* - Blend color is split across two registers
- * - Universal Shader (US) block used for fragment shaders */
+ * - Blend LTE/GTE thresholds
+ * - Universal Shader (US) block used for fragment shaders
+ * - FP16 blending and multisampling */
boolean is_r500;
/* Whether or not the second pixel pipe is accessed with the high bit */
boolean high_second_pipe;
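
The new capability bit is meant to be consumed with simple branches wherever R4xx-only features matter. A hedged sketch, using only the fields declared in this struct; the enum and helper name are hypothetical:

    enum fragprog_backend { FRAGPROG_R300, FRAGPROG_R400, FRAGPROG_R500 };

    /* Hypothetical helper: pick a fragment-program backend from the caps. */
    static enum fragprog_backend
    pick_fragprog_backend(const struct r300_capabilities *caps)
    {
        if (caps->is_r500)
            return FRAGPROG_R500;  /* US block, FP16 blending, ...       */
        if (caps->is_r400)
            return FRAGPROG_R400;  /* R3xx-style ISA, extended registers */
        return FRAGPROG_R300;
    }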
diff --git a/src/gallium/drivers/r300/r300_clear.c b/src/gallium/drivers/r300/r300_clear.c
deleted file mode 100644
index 02d6d504fc0..00000000000
--- a/src/gallium/drivers/r300/r300_clear.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE. */
-
-#include "r300_clear.h"
-#include "r300_context.h"
-
-#include "util/u_clear.h"
-
-/* Clears currently bound buffers. */
-void r300_clear(struct pipe_context* pipe,
- unsigned buffers,
- const float* rgba,
- double depth,
- unsigned stencil)
-{
- /* XXX we can and should do one clear if both color and zs are set */
- util_clear(pipe, &r300_context(pipe)->framebuffer_state,
- buffers, rgba, depth, stencil);
-}
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index ae23329b83f..3bf9c8b18dc 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -28,14 +28,16 @@
#include "util/u_memory.h"
#include "util/u_simple_list.h"
-#include "r300_clear.h"
+#include "r300_blit.h"
#include "r300_context.h"
+#include "r300_emit.h"
#include "r300_flush.h"
#include "r300_query.h"
#include "r300_render.h"
#include "r300_screen.h"
#include "r300_state_derived.h"
#include "r300_state_invariant.h"
+#include "r300_texture.h"
#include "r300_winsys.h"
static enum pipe_error r300_clear_hash_table(void* key, void* value,
@@ -51,6 +53,8 @@ static void r300_destroy_context(struct pipe_context* context)
struct r300_context* r300 = r300_context(context);
struct r300_query* query, * temp;
+ util_blitter_destroy(r300->blitter);
+
util_hash_table_foreach(r300->shader_hash_table, r300_clear_hash_table,
NULL);
util_hash_table_destroy(r300->shader_hash_table);
@@ -66,10 +70,13 @@ static void r300_destroy_context(struct pipe_context* context)
FREE(query);
}
- FREE(r300->blend_color_state);
+ FREE(r300->blend_color_state.state);
+ FREE(r300->clip_state.state);
FREE(r300->rs_block);
- FREE(r300->scissor_state);
- FREE(r300->viewport_state);
+ FREE(r300->scissor_state.state);
+ FREE(r300->vertex_info);
+ FREE(r300->viewport_state.state);
+ FREE(r300->ztop_state.state);
FREE(r300);
}
@@ -80,7 +87,7 @@ r300_is_texture_referenced(struct pipe_context *pipe,
{
struct pipe_buffer* buf = 0;
- r300_get_texture_buffer(texture, &buf, NULL);
+ r300_get_texture_buffer(pipe->screen, texture, &buf, NULL);
return pipe->is_buffer_referenced(pipe, buf);
}
@@ -103,8 +110,38 @@ static void r300_flush_cb(void *data)
cs_context_copy->context.flush(&cs_context_copy->context, 0, NULL);
}
+#define R300_INIT_ATOM(atomname, atomsize) \
+ r300->atomname##_state.name = #atomname; \
+ r300->atomname##_state.state = NULL; \
+ r300->atomname##_state.size = atomsize; \
+ r300->atomname##_state.emit = r300_emit_##atomname##_state; \
+ r300->atomname##_state.dirty = FALSE; \
+ insert_at_tail(&r300->atom_list, &r300->atomname##_state);
+
+static void r300_setup_atoms(struct r300_context* r300)
+{
+ /* Create the actual atom list.
+ *
+ * Each atom is examined and emitted in the order it appears here, which
+ * can affect performance and conformance if not handled with care.
+ *
+ * Some atoms never change size, others change every emit. This is just
+ * an upper bound on each atom, to keep the emission machinery from
+ * underallocating space. */
+ make_empty_list(&r300->atom_list);
+ R300_INIT_ATOM(invariant, 71);
+ R300_INIT_ATOM(ztop, 2);
+ R300_INIT_ATOM(blend, 8);
+ R300_INIT_ATOM(blend_color, 3);
+ R300_INIT_ATOM(clip, 29);
+ R300_INIT_ATOM(dsa, 8);
+ R300_INIT_ATOM(rs, 25);
+ R300_INIT_ATOM(scissor, 3);
+ R300_INIT_ATOM(viewport, 9);
+}
+
struct pipe_context* r300_create_context(struct pipe_screen* screen,
- struct r300_winsys* r300_winsys)
+ struct radeon_winsys* radeon_winsys)
{
struct r300_context* r300 = CALLOC_STRUCT(r300_context);
struct r300_screen* r300screen = r300_screen(screen);
@@ -112,26 +149,35 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
if (!r300)
return NULL;
- r300->winsys = r300_winsys;
+ r300->winsys = radeon_winsys;
- r300->context.winsys = (struct pipe_winsys*)r300_winsys;
+ r300->context.winsys = (struct pipe_winsys*)radeon_winsys;
r300->context.screen = screen;
- r300_init_debug(r300);
-
r300->context.destroy = r300_destroy_context;
r300->context.clear = r300_clear;
+ r300->context.surface_copy = r300_surface_copy;
+ r300->context.surface_fill = r300_surface_fill;
- if (r300screen->caps->has_tcl)
- {
+ if (r300screen->caps->has_tcl) {
r300->context.draw_arrays = r300_draw_arrays;
r300->context.draw_elements = r300_draw_elements;
r300->context.draw_range_elements = r300_draw_range_elements;
- }
- else
- {
- assert(0);
+ } else {
+ r300->context.draw_arrays = r300_swtcl_draw_arrays;
+ r300->context.draw_elements = r300_draw_elements;
+ r300->context.draw_range_elements = r300_swtcl_draw_range_elements;
+
+ /* Create a Draw. This is used for SW TCL. */
+ r300->draw = draw_create();
+ /* Enable our renderer. */
+ draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
+ /* Enable Draw's clipping. */
+ draw_set_driver_clipping(r300->draw, FALSE);
+ /* Force Draw to never do viewport transform, since we can do
+ * transform in hardware, always. */
+ draw_set_viewport_state(r300->draw, &r300_viewport_identity);
}
r300->context.is_texture_referenced = r300_is_texture_referenced;
@@ -140,20 +186,15 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
r300->shader_hash_table = util_hash_table_create(r300_shader_key_hash,
r300_shader_key_compare);
- r300->blend_color_state = CALLOC_STRUCT(r300_blend_color_state);
+ r300_setup_atoms(r300);
+
+ r300->blend_color_state.state = CALLOC_STRUCT(r300_blend_color_state);
+ r300->clip_state.state = CALLOC_STRUCT(pipe_clip_state);
r300->rs_block = CALLOC_STRUCT(r300_rs_block);
- r300->scissor_state = CALLOC_STRUCT(r300_scissor_state);
- r300->viewport_state = CALLOC_STRUCT(r300_viewport_state);
-
- /* Create a Draw. This is used for vert collation and SW TCL. */
- r300->draw = draw_create();
- /* Enable our renderer. */
- draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300));
- /* Disable Draw's clipping if TCL is present. */
- draw_set_driver_clipping(r300->draw, r300_screen(screen)->caps->has_tcl);
- /* Force Draw to never do viewport transform, since (again) we can do
- * transform in hardware, always. */
- draw_set_viewport_state(r300->draw, &r300_viewport_identity);
+ r300->scissor_state.state = CALLOC_STRUCT(pipe_scissor_state);
+ r300->vertex_info = CALLOC_STRUCT(r300_vertex_info);
+ r300->viewport_state.state = CALLOC_STRUCT(r300_viewport_state);
+ r300->ztop_state.state = CALLOC_STRUCT(r300_ztop_state);
/* Open up the OQ BO. */
r300->oqbo = screen->buffer_create(screen, 4096,
@@ -168,10 +209,13 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
r300_init_state_functions(r300);
- r300_emit_invariant_state(r300);
+ r300->invariant_state.dirty = TRUE;
r300->winsys->set_flush_cb(r300->winsys, r300_flush_cb, r300);
r300->dirty_state = R300_NEW_KITCHEN_SINK;
r300->dirty_hw++;
+
+ r300->blitter = util_blitter_create(&r300->context);
+
return &r300->context;
}
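
For reference, R300_INIT_ATOM is pure token pasting; expanding one invocation by hand shows exactly which r300_atom fields it fills in. This is just the preprocessor output of the macro above, nothing new:

    /* R300_INIT_ATOM(blend, 8) expands roughly to: */
    r300->blend_state.name  = "blend";
    r300->blend_state.state = NULL;
    r300->blend_state.size  = 8;                      /* dword upper bound    */
    r300->blend_state.emit  = r300_emit_blend_state;  /* void (*)(ctx, void*) */
    r300->blend_state.dirty = FALSE;
    insert_at_tail(&r300->atom_list, &r300->blend_state);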
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f954ba7f9aa..48c86fdad7a 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -25,12 +25,35 @@
#include "draw/draw_vertex.h"
+#include "util/u_blitter.h"
+
#include "pipe/p_context.h"
#include "pipe/p_inlines.h"
+#include "r300_screen.h"
+
+struct r300_context;
+
struct r300_fragment_shader;
struct r300_vertex_shader;
+struct r300_atom {
+ /* List pointers. */
+ struct r300_atom *prev, *next;
+ /* Name, for debugging. */
+ const char* name;
+ /* Opaque state. */
+ void* state;
+ /* Emit the state to the context. */
+ void (*emit)(struct r300_context*, void*);
+ /* Upper bound on number of dwords to emit. */
+ unsigned size;
+ /* Whether this atom should be emitted. */
+ boolean dirty;
+ /* Another dirty flag that is never automatically cleared. */
+ boolean always_dirty;
+};
+
struct r300_blend_state {
uint32_t blend_control; /* R300_RB3D_CBLEND: 0x4e04 */
uint32_t alpha_blend_control; /* R300_RB3D_ABLEND: 0x4e08 */
@@ -60,19 +83,15 @@ struct r300_rs_state {
/* Draw-specific rasterizer state */
struct pipe_rasterizer_state rs;
- /* Whether or not to enable the VTE. This is referenced at the very
- * last moment during emission of VTE state, to decide whether or not
- * the VTE should be used for transformation. */
- boolean enable_vte;
-
uint32_t vap_control_status; /* R300_VAP_CNTL_STATUS: 0x2140 */
+ uint32_t antialiasing_config; /* R300_GB_AA_CONFIG: 0x4020 */
uint32_t point_size; /* R300_GA_POINT_SIZE: 0x421c */
uint32_t point_minmax; /* R300_GA_POINT_MINMAX: 0x4230 */
uint32_t line_control; /* R300_GA_LINE_CNTL: 0x4234 */
- uint32_t depth_scale_front; /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
- uint32_t depth_offset_front;/* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */
- uint32_t depth_scale_back; /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
- uint32_t depth_offset_back; /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */
+ float depth_scale; /* R300_SU_POLY_OFFSET_FRONT_SCALE: 0x42a4 */
+ /* R300_SU_POLY_OFFSET_BACK_SCALE: 0x42ac */
+ float depth_offset; /* R300_SU_POLY_OFFSET_FRONT_OFFSET: 0x42a8 */
+ /* R300_SU_POLY_OFFSET_BACK_OFFSET: 0x42b0 */
uint32_t polygon_offset_enable; /* R300_SU_POLY_OFFSET_ENABLE: 0x42b4 */
uint32_t cull_mode; /* R300_SU_CULL_MODE: 0x42b8 */
uint32_t line_stipple_config; /* R300_GA_LINE_STIPPLE_CONFIG: 0x4328 */
@@ -89,14 +108,15 @@ struct r300_rs_block {
};
struct r300_sampler_state {
+ struct pipe_sampler_state state;
+
uint32_t filter0; /* R300_TX_FILTER0: 0x4400 */
uint32_t filter1; /* R300_TX_FILTER1: 0x4440 */
uint32_t border_color; /* R300_TX_BORDER_COLOR: 0x45c0 */
-};
-struct r300_scissor_state {
- uint32_t scissor_top_left; /* R300_SC_SCISSORS_TL: 0x43e0 */
- uint32_t scissor_bottom_right; /* R300_SC_SCISSORS_BR: 0x43e4 */
+ /* Min/max LOD must be clamped to [0, last_level], so it depends on
+ * the currently bound texture. */
+ unsigned min_lod, max_lod;
};
struct r300_texture_state {
@@ -119,24 +139,17 @@ struct r300_ztop_state {
uint32_t z_buffer_top; /* R300_ZB_ZTOP: 0x4f14 */
};
-#define R300_NEW_BLEND 0x00000001
-#define R300_NEW_BLEND_COLOR 0x00000002
-#define R300_NEW_CLIP 0x00000004
-#define R300_NEW_DSA 0x00000008
#define R300_NEW_FRAMEBUFFERS 0x00000010
#define R300_NEW_FRAGMENT_SHADER 0x00000020
#define R300_NEW_FRAGMENT_SHADER_CONSTANTS 0x00000040
-#define R300_NEW_RASTERIZER 0x00000080
#define R300_NEW_RS_BLOCK 0x00000100
#define R300_NEW_SAMPLER 0x00000200
#define R300_ANY_NEW_SAMPLERS 0x0001fe00
-#define R300_NEW_SCISSOR 0x00020000
#define R300_NEW_TEXTURE 0x00040000
#define R300_ANY_NEW_TEXTURES 0x03fc0000
#define R300_NEW_VERTEX_FORMAT 0x04000000
#define R300_NEW_VERTEX_SHADER 0x08000000
#define R300_NEW_VERTEX_SHADER_CONSTANTS 0x10000000
-#define R300_NEW_VIEWPORT 0x20000000
#define R300_NEW_QUERY 0x40000000
#define R300_NEW_KITCHEN_SINK 0x7fffffff
@@ -178,6 +191,12 @@ struct r300_query {
struct r300_query* next;
};
+enum r300_buffer_tiling {
+ R300_BUFFER_LINEAR = 0,
+ R300_BUFFER_TILED,
+ R300_BUFFER_SQUARETILED
+};
+
struct r300_texture {
/* Parent class */
struct pipe_texture tex;
@@ -214,16 +233,14 @@ struct r300_texture {
/* Registers carrying texture format data. */
struct r300_texture_state state;
+
+ /* Buffer tiling */
+ enum r300_buffer_tiling microtile, macrotile;
};
struct r300_vertex_info {
/* Parent class */
struct vertex_info vinfo;
- /* Map of vertex attributes into PVS memory for HW TCL,
- * or GA memory for SW TCL. */
- int vs_tab[16];
- /* Map of rasterizer attributes from GB through RS to US. */
- int fs_tab[16];
/* R300_VAP_PROG_STREAK_CNTL_[0-7] */
uint32_t vap_prog_stream_cntl[8];
@@ -238,9 +255,11 @@ struct r300_context {
struct pipe_context context;
/* The interface to the windowing system, etc. */
- struct r300_winsys* winsys;
+ struct radeon_winsys* winsys;
/* Draw module. Used mostly for SW TCL. */
struct draw_context* draw;
+ /* Accelerated blit support. */
+ struct blitter_context* blitter;
/* Vertex buffer for rendering. */
struct pipe_buffer* vbo;
@@ -260,38 +279,43 @@ struct r300_context {
struct r300_vertex_info* vertex_info;
/* Various CSO state objects. */
+ /* Beginning of atom list. */
+ struct r300_atom atom_list;
/* Blend state. */
- struct r300_blend_state* blend_state;
+ struct r300_atom blend_state;
/* Blend color state. */
- struct r300_blend_color_state* blend_color_state;
+ struct r300_atom blend_color_state;
/* User clip planes. */
- struct pipe_clip_state clip_state;
+ struct r300_atom clip_state;
/* Shader constants. */
struct r300_constant_buffer shader_constants[PIPE_SHADER_TYPES];
/* Depth, stencil, and alpha state. */
- struct r300_dsa_state* dsa_state;
+ struct r300_atom dsa_state;
/* Fragment shader. */
struct r300_fragment_shader* fs;
/* Framebuffer state. We currently don't need our own version of this. */
struct pipe_framebuffer_state framebuffer_state;
/* Rasterizer state. */
- struct r300_rs_state* rs_state;
+ struct r300_atom rs_state;
/* RS block state. */
struct r300_rs_block* rs_block;
/* Sampler states. */
struct r300_sampler_state* sampler_states[8];
int sampler_count;
/* Scissor state. */
- struct r300_scissor_state* scissor_state;
+ struct r300_atom scissor_state;
/* Texture states. */
struct r300_texture* textures[8];
int texture_count;
/* Vertex shader. */
struct r300_vertex_shader* vs;
/* Viewport state. */
- struct r300_viewport_state* viewport_state;
+ struct r300_atom viewport_state;
/* ZTOP state. */
- struct r300_ztop_state ztop_state;
+ struct r300_atom ztop_state;
+
+ /* Invariant state. This must be emitted to get the engine started. */
+ struct r300_atom invariant_state;
/* Vertex buffers for Gallium. */
struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -304,9 +328,12 @@ struct r300_context {
uint32_t dirty_state;
/* Flag indicating whether or not the HW is dirty. */
uint32_t dirty_hw;
-
- /** Combination of DBG_xxx flags */
- unsigned debug;
+ /* Whether the TCL engine should be in bypass mode. */
+ boolean tcl_bypass;
+ /* Whether polygon offset is enabled. */
+ boolean polygon_offset_enabled;
+ /* Z buffer bit depth. */
+ uint32_t zbuffer_bpp;
};
/* Convenience cast wrapper. */
@@ -320,35 +347,15 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300);
void r300_init_state_functions(struct r300_context* r300);
void r300_init_surface_functions(struct r300_context* r300);
-/* Debug functionality. */
-
-/**
- * Debug flags to disable/enable certain groups of debugging outputs.
- *
- * \note These may be rather coarse, and the grouping may be impractical.
- * If you find, while debugging the driver, that a different grouping
- * of these flags would be beneficial, just feel free to change them
- * but make sure to update the documentation in r300_debug.c to reflect
- * those changes.
- */
-/*@{*/
-#define DBG_HELP 0x0000001
-#define DBG_FP 0x0000002
-#define DBG_VP 0x0000004
-#define DBG_CS 0x0000008
-#define DBG_DRAW 0x0000010
-#define DBG_TEX 0x0000020
-#define DBG_FALL 0x0000040
-/*@}*/
-
-static INLINE boolean DBG_ON(struct r300_context * ctx, unsigned flags)
+static INLINE boolean CTX_DBG_ON(struct r300_context * ctx, unsigned flags)
{
- return (ctx->debug & flags) ? TRUE : FALSE;
+ return SCREEN_DBG_ON(r300_screen(ctx->context.screen), flags);
}
-static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * fmt, ...)
+static INLINE void CTX_DBG(struct r300_context * ctx, unsigned flags,
+ const char * fmt, ...)
{
- if (DBG_ON(ctx, flags)) {
+ if (CTX_DBG_ON(ctx, flags)) {
va_list va;
va_start(va, fmt);
debug_vprintf(fmt, va);
@@ -356,6 +363,8 @@ static INLINE void DBG(struct r300_context * ctx, unsigned flags, const char * f
}
}
-void r300_init_debug(struct r300_context * ctx);
+#define DBG_ON CTX_DBG_ON
+#define DBG CTX_DBG
#endif /* R300_CONTEXT_H */
+
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 5342488d0d4..151f72b0fe4 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -26,7 +26,8 @@
#include "util/u_math.h"
#include "r300_reg.h"
-#include "r300_winsys.h"
+
+#include "radeon_winsys.h"
/* Yes, I know macros are ugly. However, they are much prettier than the code
* that they neatly hide away, and don't have the cost of function setup, so
@@ -50,11 +51,11 @@
#define CS_LOCALS(context) \
struct r300_context* const cs_context_copy = (context); \
- struct r300_winsys* cs_winsys = cs_context_copy->winsys; \
- int cs_count = 0
+ struct radeon_winsys* cs_winsys = cs_context_copy->winsys; \
+ int cs_count = 0; (void) cs_count;
#define CHECK_CS(size) \
- cs_winsys->check_cs(cs_winsys, (size))
+ assert(cs_winsys->check_cs(cs_winsys, (size)))
#define BEGIN_CS(size) do { \
CHECK_CS(size); \
@@ -114,6 +115,15 @@
cs_count -= 3; \
} while (0)
+#define OUT_CS_RELOC_NO_OFFSET(bo, rd, wd, flags) do { \
+ DBG(cs_context_copy, DBG_CS, "r300: writing relocation for buffer %p, " \
+ "domains (%d, %d, %d)\n", \
+ bo, rd, wd, flags); \
+ assert(bo); \
+ cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
+ cs_count -= 2; \
+} while (0)
+
#define END_CS do { \
if (VERY_VERBOSE_CS) { \
DBG(cs_context_copy, DBG_CS, "r300: END_CS in %s (%s:%d)\n", __FUNCTION__, \
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index 2a6ed54ac9b..00d4f31c2b6 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -46,7 +46,7 @@ static struct debug_option debug_options[] = {
{ 0, 0, 0 }
};
-void r300_init_debug(struct r300_context * ctx)
+void r300_init_debug(struct r300_screen * screen)
{
const char * options = debug_get_option("RADEON_DEBUG", 0);
boolean printhint = FALSE;
@@ -64,7 +64,7 @@ void r300_init_debug(struct r300_context * ctx)
for(opt = debug_options; opt->name; ++opt) {
if (!strncmp(options, opt->name, length)) {
- ctx->debug |= opt->flag;
+ screen->debug |= opt->flag;
break;
}
}
@@ -77,11 +77,11 @@ void r300_init_debug(struct r300_context * ctx)
options += length;
}
- if (!ctx->debug)
+ if (!screen->debug)
printhint = TRUE;
}
- if (printhint || ctx->debug & DBG_HELP) {
+ if (printhint || screen->debug & DBG_HELP) {
debug_printf("You can enable debug output by setting the RADEON_DEBUG environment variable\n"
"to a comma-separated list of debug options. Available options are:\n");
for(opt = debug_options; opt->name; ++opt) {
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index b44c7bdbb3d..badbf3715c7 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -1,5 +1,6 @@
/*
* Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -22,7 +23,9 @@
/* r300_emit: Functions for emitting state. */
+#include "util/u_format.h"
#include "util/u_math.h"
+#include "util/u_simple_list.h"
#include "r300_context.h"
#include "r300_cs.h"
@@ -34,23 +37,31 @@
#include "r300_texture.h"
#include "r300_vs.h"
-void r300_emit_blend_state(struct r300_context* r300,
- struct r300_blend_state* blend)
+void r300_emit_blend_state(struct r300_context* r300, void* state)
{
+ struct r300_blend_state* blend = (struct r300_blend_state*)state;
CS_LOCALS(r300);
+
BEGIN_CS(8);
- OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
- OUT_CS(blend->blend_control);
- OUT_CS(blend->alpha_blend_control);
- OUT_CS(blend->color_channel_mask);
OUT_CS_REG(R300_RB3D_ROPCNTL, blend->rop);
+ OUT_CS_REG_SEQ(R300_RB3D_CBLEND, 3);
+ if (r300->framebuffer_state.nr_cbufs) {
+ OUT_CS(blend->blend_control);
+ OUT_CS(blend->alpha_blend_control);
+ OUT_CS(blend->color_channel_mask);
+ } else {
+ OUT_CS(0);
+ OUT_CS(0);
+ OUT_CS(0);
+ /* XXX also disable fastfill here once it's supported */
+ }
OUT_CS_REG(R300_RB3D_DITHER_CTL, blend->dither);
END_CS;
}
-void r300_emit_blend_color_state(struct r300_context* r300,
- struct r300_blend_color_state* bc)
+void r300_emit_blend_color_state(struct r300_context* r300, void* state)
{
+ struct r300_blend_color_state* bc = (struct r300_blend_color_state*)state;
struct r300_screen* r300screen = r300_screen(r300->context.screen);
CS_LOCALS(r300);
@@ -67,9 +78,9 @@ void r300_emit_blend_color_state(struct r300_context* r300,
}
}
-void r300_emit_clip_state(struct r300_context* r300,
- struct pipe_clip_state* clip)
+void r300_emit_clip_state(struct r300_context* r300, void* state)
{
+ struct pipe_clip_state* clip = (struct pipe_clip_state*)state;
int i;
struct r300_screen* r300screen = r300_screen(r300->context.screen);
CS_LOCALS(r300);
@@ -97,13 +108,13 @@ void r300_emit_clip_state(struct r300_context* r300,
}
-void r300_emit_dsa_state(struct r300_context* r300,
- struct r300_dsa_state* dsa)
+void r300_emit_dsa_state(struct r300_context* r300, void* state)
{
+ struct r300_dsa_state* dsa = (struct r300_dsa_state*)state;
struct r300_screen* r300screen = r300_screen(r300->context.screen);
CS_LOCALS(r300);
- BEGIN_CS(r300screen->caps->is_r500 ? 10 : 8);
+ BEGIN_CS(r300screen->caps->is_r500 ? 8 : 6);
OUT_CS_REG(R300_FG_ALPHA_FUNC, dsa->alpha_function);
/* not needed since we use the 8bit alpha ref */
@@ -112,10 +123,16 @@ void r300_emit_dsa_state(struct r300_context* r300,
}*/
OUT_CS_REG_SEQ(R300_ZB_CNTL, 3);
- OUT_CS(dsa->z_buffer_control);
- OUT_CS(dsa->z_stencil_control);
+
+ if (r300->framebuffer_state.zsbuf) {
+ OUT_CS(dsa->z_buffer_control);
+ OUT_CS(dsa->z_stencil_control);
+ } else {
+ OUT_CS(0);
+ OUT_CS(0);
+ }
+
OUT_CS(dsa->stencil_ref_mask);
- OUT_CS_REG(R300_ZB_ZTOP, r300->ztop_state.z_buffer_top);
/* XXX it seems r3xx doesn't support STENCILREFMASK_BF */
if (r300screen->caps->is_r500) {
@@ -129,7 +146,11 @@ static const float * get_shader_constant(
struct rc_constant * constant,
struct r300_constant_buffer * externals)
{
- static const float zero[4] = { 0.0, 0.0, 0.0, 0.0 };
+ struct r300_viewport_state* viewport =
+ (struct r300_viewport_state*)r300->viewport_state.state;
+ static float vec[4] = { 0.0, 0.0, 0.0, 1.0 };
+ struct pipe_texture *tex;
+
switch(constant->Type) {
case RC_CONSTANT_EXTERNAL:
return externals->constants[constant->u.External];
@@ -137,11 +158,58 @@ static const float * get_shader_constant(
case RC_CONSTANT_IMMEDIATE:
return constant->u.Immediate;
+ case RC_CONSTANT_STATE:
+ switch (constant->u.State[0]) {
+ /* Factor for converting rectangle coords to
+ * normalized coords. Should only show up on non-r500. */
+ case RC_STATE_R300_TEXRECT_FACTOR:
+ tex = &r300->textures[constant->u.State[1]]->tex;
+ vec[0] = 1.0 / tex->width0;
+ vec[1] = 1.0 / tex->height0;
+ break;
+
+ /* Texture compare-fail value. */
+ /* XXX Since Gallium doesn't support GL_ARB_shadow_ambient,
+ * this is always (0,0,0,0), right? */
+ case RC_STATE_SHADOW_AMBIENT:
+ vec[3] = 0;
+ break;
+
+ case RC_STATE_R300_VIEWPORT_SCALE:
+ if (r300->tcl_bypass) {
+ vec[0] = 1;
+ vec[1] = 1;
+ vec[2] = 1;
+ } else {
+ vec[0] = viewport->xscale;
+ vec[1] = viewport->yscale;
+ vec[2] = viewport->zscale;
+ }
+ break;
+
+ case RC_STATE_R300_VIEWPORT_OFFSET:
+ if (!r300->tcl_bypass) {
+ vec[0] = viewport->xoffset;
+ vec[1] = viewport->yoffset;
+ vec[2] = viewport->zoffset;
+ }
+ break;
+
+ default:
+ debug_printf("r300: Implementation error: "
+ "Unknown RC_CONSTANT type %d\n", constant->u.State[0]);
+ }
+ break;
+
default:
- debug_printf("r300: Implementation error: Unhandled constant type %i\n",
- constant->Type);
- return zero;
+ debug_printf("r300: Implementation error: "
+ "Unhandled constant type %d\n", constant->Type);
}
+
+ /* This should either be (0, 0, 0, 1), which is a relatively safe
+ * RGBA or STRQ value, or one of the RC_CONSTANT_STATE state factors. */
+ return vec;
}
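
To make the RC_STATE_R300_TEXRECT_FACTOR case above concrete: the constant is simply the reciprocal of the texture dimensions, so the fragment program can scale unnormalized rectangle coordinates into [0, 1]. A standalone sketch with an illustrative 200x128 texture (the helper name is hypothetical):

    /* Worked example of the TEXRECT factor computed above. */
    static void texrect_factor(float width0, float height0, float vec[4])
    {
        vec[0] = 1.0f / width0;   /* e.g. 1/200 = 0.005      */
        vec[1] = 1.0f / height0;  /* e.g. 1/128 = 0.0078125  */
        vec[2] = 0.0f;
        vec[3] = 1.0f;            /* matches the default vec initializer */
    }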
/* Convert a normal single-precision float into the 7.16 format
@@ -245,6 +313,22 @@ void r300_emit_fs_constant_buffer(struct r300_context* r300,
END_CS;
}
+static void r300_emit_fragment_depth_config(struct r300_context* r300,
+ struct r300_fragment_shader* fs)
+{
+ CS_LOCALS(r300);
+
+ BEGIN_CS(4);
+ if (r300_fragment_shader_writes_depth(fs)) {
+ OUT_CS_REG(R300_FG_DEPTH_SRC, R300_FG_DEPTH_SRC_SHADER);
+ OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W24 | R300_W_SRC_US);
+ } else {
+ OUT_CS_REG(R300_FG_DEPTH_SRC, R300_FG_DEPTH_SRC_SCAN);
+ OUT_CS_REG(R300_US_W_FMT, R300_W_FMT_W0 | R300_W_SRC_US);
+ }
+ END_CS;
+}
+
void r500_emit_fragment_program_code(struct r300_context* r300,
struct rX00_fragment_program_code* generic_code)
{
@@ -254,7 +338,7 @@ void r500_emit_fragment_program_code(struct r300_context* r300,
BEGIN_CS(13 +
((code->inst_end + 1) * 6));
- OUT_CS_REG(R500_US_CONFIG, 0);
+ OUT_CS_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
OUT_CS_REG(R500_US_PIXSIZE, code->max_temp_idx);
OUT_CS_REG(R500_US_CODE_RANGE,
R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end));
@@ -308,7 +392,13 @@ void r300_emit_fb_state(struct r300_context* r300,
int i;
CS_LOCALS(r300);
- BEGIN_CS((10 * fb->nr_cbufs) + (fb->zsbuf ? 10 : 0) + 4);
+ /* Shouldn't fail unless there is a bug in the state tracker. */
+ assert(fb->nr_cbufs <= 4);
+
+ BEGIN_CS((10 * fb->nr_cbufs) + (2 * (4 - fb->nr_cbufs)) +
+ (fb->zsbuf ? 10 : 0) + 6);
+
+ /* Flush and free renderbuffer caches. */
OUT_CS_REG(R300_RB3D_DSTCACHE_CTLSTAT,
R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
@@ -316,6 +406,10 @@ void r300_emit_fb_state(struct r300_context* r300,
R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+ /* Set the number of colorbuffers. */
+ OUT_CS_REG(R300_RB3D_CCTL, R300_RB3D_CCTL_NUM_MULTIWRITES(fb->nr_cbufs));
+
+ /* Set up colorbuffers. */
for (i = 0; i < fb->nr_cbufs; i++) {
surf = fb->cbufs[i];
tex = (struct r300_texture*)surf->texture;
@@ -326,13 +420,21 @@ void r300_emit_fb_state(struct r300_context* r300,
OUT_CS_REG_SEQ(R300_RB3D_COLORPITCH0 + (4 * i), 1);
OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
- r300_translate_colorformat(tex->tex.format), 0,
- RADEON_GEM_DOMAIN_VRAM, 0);
+ r300_translate_colorformat(tex->tex.format) |
+ R300_COLOR_TILE(tex->macrotile) |
+ R300_COLOR_MICROTILE(tex->microtile),
+ 0, RADEON_GEM_DOMAIN_VRAM, 0);
OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i),
r300_translate_out_fmt(surf->format));
}
+ /* Disable unused colorbuffers. */
+ for (; i < 4; i++) {
+ OUT_CS_REG(R300_US_OUT_FMT_0 + (4 * i), R300_US_OUT_FMT_UNUSED);
+ }
+
+ /* Set up a zbuffer. */
if (fb->zsbuf) {
surf = fb->zsbuf;
tex = (struct r300_texture*)surf->texture;
@@ -344,8 +446,10 @@ void r300_emit_fb_state(struct r300_context* r300,
OUT_CS_REG(R300_ZB_FORMAT, r300_translate_zsformat(tex->tex.format));
OUT_CS_REG_SEQ(R300_ZB_DEPTHPITCH, 1);
- OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level], 0,
- RADEON_GEM_DOMAIN_VRAM, 0);
+ OUT_CS_RELOC(tex->buffer, tex->pitch[surf->level] |
+ R300_DEPTHMACROTILE(tex->macrotile) |
+ R300_DEPTHMICROTILE(tex->microtile),
+ 0, RADEON_GEM_DOMAIN_VRAM, 0);
}
END_CS;
@@ -360,8 +464,6 @@ static void r300_emit_query_start(struct r300_context *r300)
if (!query)
return;
- /* XXX This will almost certainly not return good results
- * for overlapping queries. */
BEGIN_CS(4);
if (caps->family == CHIP_FAMILY_RV530) {
OUT_CS_REG(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
@@ -479,21 +581,43 @@ void r300_emit_query_end(struct r300_context* r300)
r300_emit_query_finish(r300, query);
}
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs)
+void r300_emit_rs_state(struct r300_context* r300, void* state)
{
+ struct r300_rs_state* rs = (struct r300_rs_state*)state;
+ float scale, offset;
CS_LOCALS(r300);
- BEGIN_CS(22);
+ BEGIN_CS(20 + (rs->polygon_offset_enable ? 5 : 0));
OUT_CS_REG(R300_VAP_CNTL_STATUS, rs->vap_control_status);
+
+ OUT_CS_REG(R300_GB_AA_CONFIG, rs->antialiasing_config);
+
OUT_CS_REG(R300_GA_POINT_SIZE, rs->point_size);
OUT_CS_REG_SEQ(R300_GA_POINT_MINMAX, 2);
OUT_CS(rs->point_minmax);
OUT_CS(rs->line_control);
- OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 6);
- OUT_CS(rs->depth_scale_front);
- OUT_CS(rs->depth_offset_front);
- OUT_CS(rs->depth_scale_back);
- OUT_CS(rs->depth_offset_back);
+
+ if (rs->polygon_offset_enable) {
+ scale = rs->depth_scale * 12;
+ offset = rs->depth_offset;
+
+ switch (r300->zbuffer_bpp) {
+ case 16:
+ offset *= 4;
+ break;
+ case 24:
+ offset *= 2;
+ break;
+ }
+
+ OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+ OUT_CS_32F(scale);
+ OUT_CS_32F(offset);
+ OUT_CS_32F(scale);
+ OUT_CS_32F(offset);
+ }
+
+ OUT_CS_REG_SEQ(R300_SU_POLY_OFFSET_ENABLE, 2);
OUT_CS(rs->polygon_offset_enable);
OUT_CS(rs->cull_mode);
OUT_CS_REG(R300_GA_LINE_STIPPLE_CONFIG, rs->line_stipple_config);
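
The polygon-offset math above folds the unit conversion into the emit path: the scale is pre-multiplied by 12 for the SU block, and the offset is scaled by a per-format factor (x4 for 16-bit Z, x2 for 24-bit Z). A standalone restatement of that arithmetic; the helper name is illustrative and the constants come straight from the code above:

    /* Standalone sketch of the polygon-offset scaling done above. */
    static void polygon_offset_hw(float depth_scale, float depth_offset,
                                  unsigned zbuffer_bpp,
                                  float *hw_scale, float *hw_offset)
    {
        *hw_scale  = depth_scale * 12.0f;  /* SU wants the scale x12 */
        *hw_offset = depth_offset;

        switch (zbuffer_bpp) {
        case 16:
            *hw_offset *= 4.0f;   /* e.g. offset 2.0 -> 8.0 */
            break;
        case 24:
            *hw_offset *= 2.0f;   /* e.g. offset 2.0 -> 4.0 */
            break;
        }
    }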
@@ -543,15 +667,62 @@ void r300_emit_rs_block_state(struct r300_context* r300,
END_CS;
}
-void r300_emit_scissor_state(struct r300_context* r300,
- struct r300_scissor_state* scissor)
+void r300_emit_scissor_state(struct r300_context* r300, void* state)
{
+ unsigned minx, miny, maxx, maxy;
+ uint32_t top_left, bottom_right;
+ struct r300_screen* r300screen = r300_screen(r300->context.screen);
+ struct pipe_scissor_state* scissor = (struct pipe_scissor_state*)state;
CS_LOCALS(r300);
+ minx = miny = 0;
+ maxx = r300->framebuffer_state.width;
+ maxy = r300->framebuffer_state.height;
+
+ if (((struct r300_rs_state*)r300->rs_state.state)->rs.scissor) {
+ minx = MAX2(minx, scissor->minx);
+ miny = MAX2(miny, scissor->miny);
+ maxx = MIN2(maxx, scissor->maxx);
+ maxy = MIN2(maxy, scissor->maxy);
+ }
+
+ /* Special case for a zero-area scissor.
+ *
+ * We can't allow maxx and maxy to be zero, because they have 1
+ * subtracted from them later in the code, which would make us emit
+ * ~0 and upset the kernel CS checker.
+ *
+ * Instead, change maxx and maxy to 1, which is effectively a one-pixel
+ * area, and then set minx and miny to a number greater than 1 so the
+ * resulting area is zero again. */
+ if (!maxx || !maxy) {
+ minx = 2;
+ miny = 2;
+ maxx = 1;
+ maxy = 1;
+ }
+
+ if (r300screen->caps->is_r500) {
+ top_left =
+ (minx << R300_SCISSORS_X_SHIFT) |
+ (miny << R300_SCISSORS_Y_SHIFT);
+ bottom_right =
+ ((maxx - 1) << R300_SCISSORS_X_SHIFT) |
+ ((maxy - 1) << R300_SCISSORS_Y_SHIFT);
+ } else {
+ /* Offset of 1440 in non-R500 chipsets. */
+ top_left =
+ ((minx + 1440) << R300_SCISSORS_X_SHIFT) |
+ ((miny + 1440) << R300_SCISSORS_Y_SHIFT);
+ bottom_right =
+ (((maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
+ (((maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
+ }
+
BEGIN_CS(3);
OUT_CS_REG_SEQ(R300_SC_SCISSORS_TL, 2);
- OUT_CS(scissor->scissor_top_left);
- OUT_CS(scissor->scissor_bottom_right);
+ OUT_CS(top_left);
+ OUT_CS(bottom_right);
END_CS;
}
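
A standalone sketch of the scissor packing above, showing both the zero-area trick and the +1440 bias applied on non-R500 parts; it assumes r300_reg.h for the R300_SCISSORS_*_SHIFT macros and is otherwise equivalent to the emit code:

    /* Sketch of the scissor register packing done above. */
    static void pack_scissors(unsigned minx, unsigned miny,
                              unsigned maxx, unsigned maxy, boolean is_r500,
                              uint32_t *top_left, uint32_t *bottom_right)
    {
        /* maxx/maxy == 0 would underflow below, so emit an "empty" window
         * (min > max) that covers no pixels instead. */
        if (!maxx || !maxy) {
            minx = miny = 2;
            maxx = maxy = 1;
        }

        if (!is_r500) {
            /* Non-R500 chipsets bias scissor coordinates by 1440. */
            minx += 1440; miny += 1440;
            maxx += 1440; maxy += 1440;
        }

        *top_left     = (minx << R300_SCISSORS_X_SHIFT) |
                        (miny << R300_SCISSORS_Y_SHIFT);
        *bottom_right = ((maxx - 1) << R300_SCISSORS_X_SHIFT) |
                        ((maxy - 1) << R300_SCISSORS_Y_SHIFT);
    }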
@@ -561,6 +732,8 @@ void r300_emit_texture(struct r300_context* r300,
unsigned offset)
{
uint32_t filter0 = sampler->filter0;
+ uint32_t format0 = tex->state.format0;
+ unsigned min_level, max_level;
CS_LOCALS(r300);
/* to emulate 1D textures through 2D ones correctly */
@@ -569,90 +742,75 @@ void r300_emit_texture(struct r300_context* r300,
filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE);
}
+ if (tex->is_npot) {
+ /* NPOT textures don't support the mip filter, unfortunately;
+ * masking it off here prevents incorrect rendering. */
+ filter0 &= ~R300_TX_MIN_FILTER_MIP_MASK;
+ } else {
+ /* determine min/max levels */
+ /* the MAX_MIP level is the largest (finest) one */
+ max_level = MIN2(sampler->max_lod, tex->tex.last_level);
+ min_level = MIN2(sampler->min_lod, max_level);
+ format0 |= R300_TX_NUM_LEVELS(max_level);
+ filter0 |= R300_TX_MAX_MIP_LEVEL(min_level);
+ }
+
BEGIN_CS(16);
OUT_CS_REG(R300_TX_FILTER0_0 + (offset * 4), filter0 |
(offset << 28));
OUT_CS_REG(R300_TX_FILTER1_0 + (offset * 4), sampler->filter1);
OUT_CS_REG(R300_TX_BORDER_COLOR_0 + (offset * 4), sampler->border_color);
- OUT_CS_REG(R300_TX_FORMAT0_0 + (offset * 4), tex->state.format0);
+ OUT_CS_REG(R300_TX_FORMAT0_0 + (offset * 4), format0);
OUT_CS_REG(R300_TX_FORMAT1_0 + (offset * 4), tex->state.format1);
OUT_CS_REG(R300_TX_FORMAT2_0 + (offset * 4), tex->state.format2);
OUT_CS_REG_SEQ(R300_TX_OFFSET_0 + (offset * 4), 1);
- OUT_CS_RELOC(tex->buffer, 0,
- RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
+ OUT_CS_RELOC(tex->buffer,
+ R300_TXO_MACRO_TILE(tex->macrotile) |
+ R300_TXO_MICRO_TILE(tex->microtile),
+ RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
END_CS;
}
-/* XXX I can't read this and that's not good */
void r300_emit_aos(struct r300_context* r300, unsigned offset)
{
- struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+ struct pipe_vertex_buffer *vb1, *vb2, *vbuf = r300->vertex_buffer;
struct pipe_vertex_element *velem = r300->vertex_element;
- CS_LOCALS(r300);
int i;
- unsigned aos_count = r300->vertex_element_count;
-
+ unsigned size1, size2, aos_count = r300->vertex_element_count;
unsigned packet_size = (aos_count * 3 + 1) / 2;
+ CS_LOCALS(r300);
+
BEGIN_CS(2 + packet_size + aos_count * 2);
OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, packet_size);
OUT_CS(aos_count);
+
for (i = 0; i < aos_count - 1; i += 2) {
- int buf_num1 = velem[i].vertex_buffer_index;
- int buf_num2 = velem[i+1].vertex_buffer_index;
- assert(vbuf[buf_num1].stride % 4 == 0 && pf_get_size(velem[i].src_format) % 4 == 0);
- assert(vbuf[buf_num2].stride % 4 == 0 && pf_get_size(velem[i+1].src_format) % 4 == 0);
- OUT_CS((pf_get_size(velem[i].src_format) >> 2) | (vbuf[buf_num1].stride << 6) |
- (pf_get_size(velem[i+1].src_format) << 14) | (vbuf[buf_num2].stride << 22));
- OUT_CS(vbuf[buf_num1].buffer_offset + velem[i].src_offset +
- offset * vbuf[buf_num1].stride);
- OUT_CS(vbuf[buf_num2].buffer_offset + velem[i+1].src_offset +
- offset * vbuf[buf_num2].stride);
+ vb1 = &vbuf[velem[i].vertex_buffer_index];
+ vb2 = &vbuf[velem[i+1].vertex_buffer_index];
+ size1 = util_format_get_blocksize(velem[i].src_format);
+ size2 = util_format_get_blocksize(velem[i+1].src_format);
+
+ OUT_CS(R300_VBPNTR_SIZE0(size1) | R300_VBPNTR_STRIDE0(vb1->stride) |
+ R300_VBPNTR_SIZE1(size2) | R300_VBPNTR_STRIDE1(vb2->stride));
+ OUT_CS(vb1->buffer_offset + velem[i].src_offset + offset * vb1->stride);
+ OUT_CS(vb2->buffer_offset + velem[i+1].src_offset + offset * vb2->stride);
}
+
if (aos_count & 1) {
- int buf_num = velem[i].vertex_buffer_index;
- assert(vbuf[buf_num].stride % 4 == 0 && pf_get_size(velem[i].src_format) % 4 == 0);
- OUT_CS((pf_get_size(velem[i].src_format) >> 2) | (vbuf[buf_num].stride << 6));
- OUT_CS(vbuf[buf_num].buffer_offset + velem[i].src_offset +
- offset * vbuf[buf_num].stride);
+ vb1 = &vbuf[velem[i].vertex_buffer_index];
+ size1 = util_format_get_blocksize(velem[i].src_format);
+
+ OUT_CS(R300_VBPNTR_SIZE0(size1) | R300_VBPNTR_STRIDE0(vb1->stride));
+ OUT_CS(vb1->buffer_offset + velem[i].src_offset + offset * vb1->stride);
}
- /* XXX bare CS reloc */
for (i = 0; i < aos_count; i++) {
- cs_winsys->write_cs_reloc(cs_winsys,
- vbuf[velem[i].vertex_buffer_index].buffer,
- RADEON_GEM_DOMAIN_GTT,
- 0,
- 0);
- cs_count -= 2;
+ OUT_CS_RELOC_NO_OFFSET(vbuf[velem[i].vertex_buffer_index].buffer,
+ RADEON_GEM_DOMAIN_GTT, 0, 0);
}
END_CS;
}
-#if 0
-void r300_emit_draw_packet(struct r300_context* r300)
-{
- CS_LOCALS(r300);
-
- DBG(r300, DBG_DRAW, "r300: Preparing vertex buffer %p for render, "
- "vertex size %d\n", r300->vbo,
- r300->vertex_info->vinfo.size);
- /* Set the pointer to our vertex buffer. The emitted values are this:
- * PACKET3 [3D_LOAD_VBPNTR]
- * COUNT [1]
- * FORMAT [size | stride << 8]
- * OFFSET [offset into BO]
- * VBPNTR [relocated BO]
- */
- BEGIN_CS(7);
- OUT_CS_PKT3(R300_PACKET3_3D_LOAD_VBPNTR, 3);
- OUT_CS(1);
- OUT_CS(r300->vertex_info->vinfo.size |
- (r300->vertex_info->vinfo.size << 8));
- OUT_CS(r300->vbo_offset);
- OUT_CS_RELOC(r300->vbo, 0, RADEON_GEM_DOMAIN_GTT, 0, 0);
- END_CS;
-}
-#endif
void r300_emit_vertex_format_state(struct r300_context* r300)
{
@@ -690,12 +848,22 @@ void r300_emit_vertex_format_state(struct r300_context* r300)
END_CS;
}
+
void r300_emit_vertex_program_code(struct r300_context* r300,
struct r300_vertex_program_code* code)
{
int i;
struct r300_screen* r300screen = r300_screen(r300->context.screen);
unsigned instruction_count = code->length / 4;
+
+ int vtx_mem_size = r300screen->caps->is_r500 ? 128 : 72;
+ int input_count = MAX2(util_bitcount(code->InputsRead), 1);
+ int output_count = MAX2(util_bitcount(code->OutputsWritten), 1);
+ int temp_count = MAX2(code->num_temporaries, 1);
+ int pvs_num_slots = MIN3(vtx_mem_size / input_count,
+ vtx_mem_size / output_count, 10);
+ int pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);
+
CS_LOCALS(r300);
if (!r300screen->caps->has_tcl) {
@@ -708,8 +876,7 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
/* R300_VAP_PVS_CODE_CNTL_0
* R300_VAP_PVS_CONST_CNTL
* R300_VAP_PVS_CODE_CNTL_1
- * See the r5xx docs for instructions on how to use these.
- * XXX these could be optimized to select better values... */
+ * See the r5xx docs for instructions on how to use these. */
OUT_CS_REG_SEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
OUT_CS(R300_PVS_FIRST_INST(0) |
R300_PVS_XYZW_VALID_INST(instruction_count - 1) |
@@ -722,10 +889,11 @@ void r300_emit_vertex_program_code(struct r300_context* r300,
for (i = 0; i < code->length; i++)
OUT_CS(code->body.d[i]);
- OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(10) |
- R300_PVS_NUM_CNTLRS(5) |
+ OUT_CS_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(pvs_num_slots) |
+ R300_PVS_NUM_CNTLRS(pvs_num_controllers) |
R300_PVS_NUM_FPUS(r300screen->caps->num_vert_fpus) |
- R300_PVS_VF_MAX_VTX_NUM(12));
+ R300_PVS_VF_MAX_VTX_NUM(12) |
+ (r300screen->caps->is_r500 ? R500_TCL_STATE_OPTIMIZATION : 0));
END_CS;
}
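
Plugging numbers into the new PVS sizing: on a non-R500 chip the vertex memory holds 72 entries, so a shader with 4 inputs, 4 outputs and 8 temporaries still gets the maximum of 10 slots but now 6 controllers instead of the old hard-coded 5. A standalone restatement of the arithmetic (the example counts are illustrative):

    #include <stdio.h>

    #define MIN2(a, b)    ((a) < (b) ? (a) : (b))
    #define MIN3(a, b, c) MIN2(MIN2(a, b), (c))
    #define MAX2(a, b)    ((a) > (b) ? (a) : (b))

    int main(void)
    {
        int is_r500 = 0;
        int vtx_mem_size = is_r500 ? 128 : 72;
        int input_count  = MAX2(4, 1);  /* e.g. 4 vertex inputs  */
        int output_count = MAX2(4, 1);  /* e.g. 4 vertex outputs */
        int temp_count   = MAX2(8, 1);  /* e.g. 8 temporaries    */

        int pvs_num_slots = MIN3(vtx_mem_size / input_count,
                                 vtx_mem_size / output_count, 10);
        int pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6);

        /* Prints "slots=10 controllers=6" for these counts. */
        printf("slots=%d controllers=%d\n", pvs_num_slots, pvs_num_controllers);
        return 0;
    }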
@@ -768,25 +936,57 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
END_CS;
}
-void r300_emit_viewport_state(struct r300_context* r300,
- struct r300_viewport_state* viewport)
+void r300_emit_viewport_state(struct r300_context* r300, void* state)
{
+ struct r300_viewport_state* viewport = (struct r300_viewport_state*)state;
CS_LOCALS(r300);
- BEGIN_CS(9);
- OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
- OUT_CS_32F(viewport->xscale);
- OUT_CS_32F(viewport->xoffset);
- OUT_CS_32F(viewport->yscale);
- OUT_CS_32F(viewport->yoffset);
- OUT_CS_32F(viewport->zscale);
- OUT_CS_32F(viewport->zoffset);
-
- if (r300->rs_state->enable_vte) {
- OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
- } else {
+ if (r300->tcl_bypass) {
+ BEGIN_CS(2);
OUT_CS_REG(R300_VAP_VTE_CNTL, 0);
+ END_CS;
+ } else {
+ BEGIN_CS(9);
+ OUT_CS_REG_SEQ(R300_SE_VPORT_XSCALE, 6);
+ OUT_CS_32F(viewport->xscale);
+ OUT_CS_32F(viewport->xoffset);
+ OUT_CS_32F(viewport->yscale);
+ OUT_CS_32F(viewport->yoffset);
+ OUT_CS_32F(viewport->zscale);
+ OUT_CS_32F(viewport->zoffset);
+ OUT_CS_REG(R300_VAP_VTE_CNTL, viewport->vte_control);
+ END_CS;
}
+}
+
+void r300_emit_texture_count(struct r300_context* r300)
+{
+ uint32_t tx_enable = 0;
+ int i;
+ CS_LOCALS(r300);
+
+ /* Note that texture_count and sampler_count are just the sizes of
+ * the respective arrays; we still have to check each individual
+ * element. */
+ for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
+ if (r300->textures[i]) {
+ tx_enable |= 1 << i;
+ }
+ }
+
+ BEGIN_CS(2);
+ OUT_CS_REG(R300_TX_ENABLE, tx_enable);
+ END_CS;
+}
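
A quick worked example of the TX_ENABLE mask built above: with sampler_count = 3 and textures bound only in slots 0 and 2, the loop produces 0x5 (bits 0 and 2). A standalone restatement; the helper name is illustrative:

    #include <stdint.h>

    /* Standalone restatement of the TX_ENABLE computation above. */
    static uint32_t compute_tx_enable(const void *const textures[],
                                      int texture_count, int sampler_count)
    {
        uint32_t tx_enable = 0;
        int i, n = texture_count < sampler_count ? texture_count : sampler_count;

        for (i = 0; i < n; i++) {
            if (textures[i])
                tx_enable |= 1u << i;  /* slots 0 and 2 bound -> 0x5 */
        }
        return tx_enable;
    }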
+
+void r300_emit_ztop_state(struct r300_context* r300, void* state)
+{
+ struct r300_ztop_state* ztop = (struct r300_ztop_state*)state;
+ CS_LOCALS(r300);
+
+ BEGIN_CS(2);
+ OUT_CS_REG(R300_ZB_ZTOP, ztop->z_buffer_top);
END_CS;
}
@@ -794,9 +994,8 @@ void r300_flush_textures(struct r300_context* r300)
{
CS_LOCALS(r300);
- BEGIN_CS(4);
+ BEGIN_CS(2);
OUT_CS_REG(R300_TX_INVALTAGS, 0);
- OUT_CS_REG(R300_TX_ENABLE, (1 << r300->texture_count) - 1);
END_CS;
}
@@ -809,22 +1008,15 @@ static void r300_flush_pvs(struct r300_context* r300)
END_CS;
}
-/* Emit all dirty state. */
-void r300_emit_dirty_state(struct r300_context* r300)
+void r300_emit_buffer_validate(struct r300_context *r300)
{
- struct r300_screen* r300screen = r300_screen(r300->context.screen);
struct r300_texture* tex;
- int i, dirty_tex = 0;
+ unsigned i;
boolean invalid = FALSE;
- if (!(r300->dirty_state)) {
- return;
- }
-
/* Clean out BOs. */
r300->winsys->reset_bos(r300->winsys);
- /* XXX check size */
validate:
/* Color buffers... */
for (i = 0; i < r300->framebuffer_state.nr_cbufs; i++) {
@@ -858,10 +1050,12 @@ validate:
}
}
/* ...occlusion query buffer... */
- if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
- 0, RADEON_GEM_DOMAIN_GTT)) {
- r300->context.flush(&r300->context, 0, NULL);
- goto validate;
+ if (r300->dirty_state & R300_NEW_QUERY) {
+ if (!r300->winsys->add_buffer(r300->winsys, r300->oqbo,
+ 0, RADEON_GEM_DOMAIN_GTT)) {
+ r300->context.flush(&r300->context, 0, NULL);
+ goto validate;
+ }
}
/* ...and vertex buffer. */
if (r300->vbo) {
@@ -883,46 +1077,61 @@ validate:
invalid = TRUE;
goto validate;
}
+}
- if (r300->dirty_state & R300_NEW_QUERY) {
- r300_emit_query_start(r300);
- r300->dirty_state &= ~R300_NEW_QUERY;
- }
+/* Emit all dirty state. */
+void r300_emit_dirty_state(struct r300_context* r300)
+{
+ struct r300_screen* r300screen = r300_screen(r300->context.screen);
+ struct r300_atom* atom;
+ unsigned i, dwords = 1024;
+ int dirty_tex = 0;
- if (r300->dirty_state & R300_NEW_BLEND) {
- r300_emit_blend_state(r300, r300->blend_state);
- r300->dirty_state &= ~R300_NEW_BLEND;
+ /* Check the required number of dwords against the space remaining in the
+ * current CS object. If we need more, then flush. */
+
+ foreach(atom, &r300->atom_list) {
+ if (atom->dirty || atom->always_dirty) {
+ dwords += atom->size;
+ }
}
- if (r300->dirty_state & R300_NEW_BLEND_COLOR) {
- r300_emit_blend_color_state(r300, r300->blend_color_state);
- r300->dirty_state &= ~R300_NEW_BLEND_COLOR;
+ /* Make sure we have at least 2*1024 spare dwords. */
+ /* XXX It would be nice to know the number of dwords we really need to
+ * XXX emit. */
+ while (!r300->winsys->check_cs(r300->winsys, dwords)) {
+ r300->context.flush(&r300->context, 0, NULL);
}
- if (r300->dirty_state & R300_NEW_CLIP) {
- r300_emit_clip_state(r300, &r300->clip_state);
- r300->dirty_state &= ~R300_NEW_CLIP;
+ if (r300->dirty_state & R300_NEW_QUERY) {
+ r300_emit_query_start(r300);
+ r300->dirty_state &= ~R300_NEW_QUERY;
}
- if (r300->dirty_state & R300_NEW_DSA) {
- r300_emit_dsa_state(r300, r300->dsa_state);
- r300->dirty_state &= ~R300_NEW_DSA;
+ foreach(atom, &r300->atom_list) {
+ if (atom->dirty || atom->always_dirty) {
+ atom->emit(r300, atom->state);
+ atom->dirty = FALSE;
+ }
}
if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER) {
+ r300_emit_fragment_depth_config(r300, r300->fs);
if (r300screen->caps->is_r500) {
- r500_emit_fragment_program_code(r300, &r300->fs->code);
+ r500_emit_fragment_program_code(r300, &r300->fs->shader->code);
} else {
- r300_emit_fragment_program_code(r300, &r300->fs->code);
+ r300_emit_fragment_program_code(r300, &r300->fs->shader->code);
}
r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER;
}
if (r300->dirty_state & R300_NEW_FRAGMENT_SHADER_CONSTANTS) {
if (r300screen->caps->is_r500) {
- r500_emit_fs_constant_buffer(r300, &r300->fs->code.constants);
+ r500_emit_fs_constant_buffer(r300,
+ &r300->fs->shader->code.constants);
} else {
- r300_emit_fs_constant_buffer(r300, &r300->fs->code.constants);
+ r300_emit_fs_constant_buffer(r300,
+ &r300->fs->shader->code.constants);
}
r300->dirty_state &= ~R300_NEW_FRAGMENT_SHADER_CONSTANTS;
}
@@ -932,24 +1141,16 @@ validate:
r300->dirty_state &= ~R300_NEW_FRAMEBUFFERS;
}
- if (r300->dirty_state & R300_NEW_RASTERIZER) {
- r300_emit_rs_state(r300, r300->rs_state);
- r300->dirty_state &= ~R300_NEW_RASTERIZER;
- }
-
if (r300->dirty_state & R300_NEW_RS_BLOCK) {
r300_emit_rs_block_state(r300, r300->rs_block);
r300->dirty_state &= ~R300_NEW_RS_BLOCK;
}
- if (r300->dirty_state & R300_NEW_SCISSOR) {
- r300_emit_scissor_state(r300, r300->scissor_state);
- r300->dirty_state &= ~R300_NEW_SCISSOR;
- }
-
/* Samplers and textures are tracked separately but emitted together. */
if (r300->dirty_state &
(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES)) {
+ r300_emit_texture_count(r300);
+
for (i = 0; i < MIN2(r300->sampler_count, r300->texture_count); i++) {
if (r300->dirty_state &
((R300_NEW_SAMPLER << i) | (R300_NEW_TEXTURE << i))) {
@@ -966,11 +1167,6 @@ validate:
r300->dirty_state &= ~(R300_ANY_NEW_SAMPLERS | R300_ANY_NEW_TEXTURES);
}
- if (r300->dirty_state & R300_NEW_VIEWPORT) {
- r300_emit_viewport_state(r300, r300->viewport_state);
- r300->dirty_state &= ~R300_NEW_VIEWPORT;
- }
-
if (dirty_tex) {
r300_flush_textures(r300);
}
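
For orientation: buffer validation and state emission are now separate steps, and the draw path (not shown in this hunk) is expected to validate BOs first and emit state second. A hedged sketch of the intended call order; the wrapper name is an assumption, not code from this patch:

    /* Assumed call order in the draw path. */
    static void prepare_for_draw(struct r300_context *r300)
    {
        /* Make sure every referenced BO fits into the CS; may flush. */
        r300_emit_buffer_validate(r300);

        /* Emit dirty atoms plus the remaining legacy dirty_state bits. */
        r300_emit_dirty_state(r300);
    }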
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 7c83c5166de..2f3d013f5e0 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -31,17 +31,13 @@ struct r300_vertex_program_code;
void r300_emit_aos(struct r300_context* r300, unsigned offset);
-void r300_emit_blend_state(struct r300_context* r300,
- struct r300_blend_state* blend);
+void r300_emit_blend_state(struct r300_context* r300, void* state);
-void r300_emit_blend_color_state(struct r300_context* r300,
- struct r300_blend_color_state* bc);
+void r300_emit_blend_color_state(struct r300_context* r300, void* state);
-void r300_emit_clip_state(struct r300_context* r300,
- struct pipe_clip_state* clip);
+void r300_emit_clip_state(struct r300_context* r300, void* state);
-void r300_emit_dsa_state(struct r300_context* r300,
- struct r300_dsa_state* dsa);
+void r300_emit_dsa_state(struct r300_context* r300, void* state);
void r300_emit_fragment_program_code(struct r300_context* r300,
struct rX00_fragment_program_code* generic_code);
@@ -63,13 +59,12 @@ void r300_emit_query_begin(struct r300_context* r300,
void r300_emit_query_end(struct r300_context* r300);
-void r300_emit_rs_state(struct r300_context* r300, struct r300_rs_state* rs);
+void r300_emit_rs_state(struct r300_context* r300, void* state);
void r300_emit_rs_block_state(struct r300_context* r300,
struct r300_rs_block* rs);
-void r300_emit_scissor_state(struct r300_context* r300,
- struct r300_scissor_state* scissor);
+void r300_emit_scissor_state(struct r300_context* r300, void* state);
void r300_emit_texture(struct r300_context* r300,
struct r300_sampler_state* sampler,
@@ -89,12 +84,17 @@ void r300_emit_vs_constant_buffer(struct r300_context* r300,
void r300_emit_vertex_shader(struct r300_context* r300,
struct r300_vertex_shader* vs);
-void r300_emit_viewport_state(struct r300_context* r300,
- struct r300_viewport_state* viewport);
+void r300_emit_viewport_state(struct r300_context* r300, void* state);
+
+void r300_emit_texture_count(struct r300_context* r300);
+
+void r300_emit_ztop_state(struct r300_context* r300, void* state);
void r300_flush_textures(struct r300_context* r300);
/* Emit all dirty state. */
void r300_emit_dirty_state(struct r300_context* r300);
+void r300_emit_buffer_validate(struct r300_context *r300);
+
#endif /* R300_EMIT_H */
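The switch to "void* state" parameters above gives every emit function the same
shape, which is what a table of generic state atoms needs. A minimal sketch of
such an atom, assuming only the members the rest of this patch touches (state,
size, dirty, the atom_list links); the emit hook and exact layout are guesses,
not the driver's actual definition:

    struct r300_atom {
        struct r300_atom* prev, *next;             /* links for r300->atom_list */
        void* state;                               /* bound CSO or derived state */
        void (*emit)(struct r300_context*, void*); /* e.g. r300_emit_blend_state */
        unsigned size;                             /* CS dwords this atom emits */
        boolean dirty;                             /* re-emit on the next draw? */
    };

With that shape, an emit loop can simply walk the list and call
atom->emit(r300, atom->state) for every dirty atom instead of testing the
individual R300_NEW_* bits.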
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 4282357521b..15e612d8a6b 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -37,6 +37,7 @@ static void r300_flush(struct pipe_context* pipe,
{
struct r300_context *r300 = r300_context(pipe);
struct r300_query *query;
+ struct r300_atom *atom;
CS_LOCALS(r300);
(void) cs_count;
@@ -52,10 +53,17 @@ static void r300_flush(struct pipe_context* pipe,
if (r300->dirty_hw) {
FLUSH_CS;
- r300_emit_invariant_state(r300);
r300->dirty_state = R300_NEW_KITCHEN_SINK;
r300->dirty_hw = 0;
+
+ /* New kitchen sink, baby. */
+ foreach(atom, &r300->atom_list) {
+ if (atom->state) {
+ atom->dirty = TRUE;
+ }
+ }
}
+
/* reset flushed query */
foreach(query, &r300->query_list) {
query->flushed = TRUE;
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 29ddc84c411..60ea9c171d5 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -1,6 +1,7 @@
/*
* Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
* Joakim Sindholt <opensource@zhasha.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,6 +22,9 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
#include "tgsi/tgsi_dump.h"
#include "r300_context.h"
@@ -31,6 +35,45 @@
#include "radeon_code.h"
#include "radeon_compiler.h"
+/* Convert info about FS input semantics to r300_shader_semantics. */
+void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
+ struct r300_shader_semantics* fs_inputs)
+{
+ int i;
+ unsigned index;
+
+ r300_shader_semantics_reset(fs_inputs);
+
+ for (i = 0; i < info->num_inputs; i++) {
+ index = info->input_semantic_index[i];
+
+ switch (info->input_semantic_name[i]) {
+ case TGSI_SEMANTIC_COLOR:
+ assert(index < ATTR_COLOR_COUNT);
+ fs_inputs->color[index] = i;
+ break;
+
+ case TGSI_SEMANTIC_GENERIC:
+ assert(index < ATTR_GENERIC_COUNT);
+ fs_inputs->generic[index] = i;
+ break;
+
+ case TGSI_SEMANTIC_FOG:
+ assert(index == 0);
+ fs_inputs->fog = i;
+ break;
+
+ case TGSI_SEMANTIC_POSITION:
+ assert(index == 0);
+ fs_inputs->wpos = i;
+ break;
+
+ default:
+ assert(0);
+ }
+ }
+}
+
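A hypothetical call site for the helper above; the create-shader hook itself is
not part of this hunk, so everything around the two calls is an assumption:

    static void* example_create_fs_state(struct pipe_context* pipe,
                                         const struct pipe_shader_state* shader)
    {
        struct r300_fragment_shader* fs = CALLOC_STRUCT(r300_fragment_shader);

        fs->state = *shader; /* real code would also duplicate shader->tokens */
        tgsi_scan_shader(shader->tokens, &fs->info);        /* fill tgsi_shader_info */
        r300_shader_read_fs_inputs(&fs->info, &fs->inputs); /* COLOR/GENERIC/FOG/WPOS */

        return fs;
    }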
static void find_output_registers(struct r300_fragment_program_compiler * compiler,
struct r300_fragment_shader * fs)
{
@@ -58,61 +101,69 @@ static void allocate_hardware_inputs(
void (*allocate)(void * data, unsigned input, unsigned hwreg),
void * mydata)
{
- struct tgsi_shader_info* info = &((struct r300_fragment_shader*)c->UserData)->info;
- int total_colors = 0;
- int colors = 0;
- int total_generic = 0;
- int generic = 0;
- int i;
-
- for (i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- total_colors++;
- break;
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_GENERIC:
- total_generic++;
- break;
+ struct r300_shader_semantics* inputs =
+ (struct r300_shader_semantics*)c->UserData;
+ int i, reg = 0;
+
+ /* Allocate input registers. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (inputs->color[i] != ATTR_UNUSED) {
+ allocate(mydata, inputs->color[i], reg++);
+ }
+ }
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (inputs->generic[i] != ATTR_UNUSED) {
+ allocate(mydata, inputs->generic[i], reg++);
}
}
+ if (inputs->fog != ATTR_UNUSED) {
+ allocate(mydata, inputs->fog, reg++);
+ }
+ if (inputs->wpos != ATTR_UNUSED) {
+ allocate(mydata, inputs->wpos, reg++);
+ }
+}
- for(i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- allocate(mydata, i, colors);
- colors++;
- break;
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_GENERIC:
- allocate(mydata, i, total_colors + generic);
- generic++;
- break;
+static void get_compare_state(
+ struct r300_context* r300,
+ struct r300_fragment_program_external_state* state,
+ unsigned shadow_samplers)
+{
+ memset(state, 0, sizeof(*state));
+
+ for (int i = 0; i < r300->sampler_count; i++) {
+ struct r300_sampler_state* s = r300->sampler_states[i];
+
+ if (s && s->state.compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+ /* XXX Gallium doesn't provide us with any information regarding
+ * this mode, so we are screwed. I'm setting 0 = LUMINANCE. */
+ state->unit[i].depth_texture_mode = 0;
+
+ /* Fortunately, no need to translate this. */
+ state->unit[i].texture_compare_func = s->state.compare_func;
}
}
}
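For illustration, a Gallium sampler that would make get_compare_state() record
a shadow unit looks roughly like this (the surrounding function and the chosen
compare func are arbitrary, only the two assignments matter):

    static void example_shadow_sampler(struct pipe_context* pipe)
    {
        struct pipe_sampler_state shadow;

        memset(&shadow, 0, sizeof(shadow));
        shadow.compare_mode = PIPE_TEX_COMPARE_R_TO_TEXTURE; /* marks the unit as shadow */
        shadow.compare_func = PIPE_FUNC_LEQUAL; /* copied verbatim into texture_compare_func */

        /* ... pipe->create_sampler_state(pipe, &shadow) and bind as usual. */
    }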
-void r300_translate_fragment_shader(struct r300_context* r300,
- struct r300_fragment_shader* fs)
+static void r300_translate_fragment_shader(
+ struct r300_context* r300,
+ struct r300_fragment_shader_code* shader)
{
+ struct r300_fragment_shader* fs = r300->fs;
struct r300_fragment_program_compiler compiler;
struct tgsi_to_rc ttr;
+ int wpos = fs->inputs.wpos;
+ /* Setup the compiler. */
memset(&compiler, 0, sizeof(compiler));
rc_init(&compiler.Base);
compiler.Base.Debug = DBG_ON(r300, DBG_FP);
- compiler.code = &fs->code;
+ compiler.code = &shader->code;
+ compiler.state = shader->compare_state;
compiler.is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
compiler.AllocateHwInputs = &allocate_hardware_inputs;
- compiler.UserData = fs;
-
- /* TODO: Program compilation depends on texture compare modes,
- * which are sampler state. Therefore, programs need to be recompiled
- * depending on this state as in the classic Mesa driver.
- *
- * This is not yet handled correctly.
- */
+ compiler.UserData = &fs->inputs;
find_output_registers(&compiler, fs);
@@ -127,15 +178,76 @@ void r300_translate_fragment_shader(struct r300_context* r300,
r300_tgsi_to_rc(&ttr, fs->state.tokens);
+ fs->shadow_samplers = compiler.Base.Program.ShadowSamplers;
+
+ /**
+ * Transform the program to support WPOS.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary. */
+ if (wpos != ATTR_UNUSED) {
+ /* Moving the input to some other reg is not really necessary. */
+ rc_transform_fragment_wpos(&compiler.Base, wpos, wpos, TRUE);
+ }
+
/* Invoke the compiler */
r3xx_compile_fragment_program(&compiler);
if (compiler.Base.Error) {
/* XXX failover maybe? */
DBG(r300, DBG_FP, "r300: Error compiling fragment program: %s\n",
compiler.Base.ErrorMsg);
+ assert(0);
}
/* And, finally... */
rc_destroy(&compiler.Base);
- fs->translated = TRUE;
+}
+
+boolean r300_pick_fragment_shader(struct r300_context* r300)
+{
+ struct r300_fragment_shader* fs = r300->fs;
+ struct r300_fragment_program_external_state state;
+ struct r300_fragment_shader_code* ptr;
+
+ if (!fs->first) {
+ /* Build the fragment shader for the first time. */
+ fs->first = fs->shader = CALLOC_STRUCT(r300_fragment_shader_code);
+
+ /* The shadow samplers are only known after the first translation,
+ * so we pass ~0 here, meaning all sampler states should be examined.
+ * This choice has no impact on correctness. */
+ get_compare_state(r300, &fs->shader->compare_state, ~0);
+ r300_translate_fragment_shader(r300, fs->shader);
+ return TRUE;
+
+ } else if (fs->shadow_samplers) {
+ get_compare_state(r300, &state, fs->shadow_samplers);
+
+ /* Check if the currently-bound shader has been compiled
+ * with the texture-compare state we need. */
+ if (memcmp(&fs->shader->compare_state, &state, sizeof(state)) != 0) {
+ /* Search for the right shader. */
+ ptr = fs->first;
+ while (ptr) {
+ if (memcmp(&ptr->compare_state, &state, sizeof(state)) == 0) {
+ fs->shader = ptr;
+ return TRUE;
+ }
+ ptr = ptr->next;
+ }
+
+ /* Not found, gotta compile a new one. */
+ ptr = CALLOC_STRUCT(r300_fragment_shader_code);
+ ptr->next = fs->first;
+ fs->first = fs->shader = ptr;
+
+ ptr->compare_state = state;
+ r300_translate_fragment_shader(r300, ptr);
+ return TRUE;
+ }
+ }
+
+ return FALSE;
}
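The boolean return value is presumably consumed by the state-update path; a
sketch of that usage (the call site and the exact dirty flags are assumptions,
only R300_NEW_FRAGMENT_SHADER_CONSTANTS appears earlier in this patch):

    if (r300_pick_fragment_shader(r300)) {
        /* A different variant was selected or compiled: re-emit it. */
        r300->dirty_state |= R300_NEW_FRAGMENT_SHADER |
                             R300_NEW_FRAGMENT_SHADER_CONSTANTS;
    }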
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index e831c30301b..40ce874353c 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -1,6 +1,7 @@
/*
* Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
* Joakim Sindholt <opensource@zhasha.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,31 +26,46 @@
#define R300_FS_H
#include "pipe/p_state.h"
-
#include "tgsi/tgsi_scan.h"
-
#include "radeon_code.h"
+#include "r300_shader_semantics.h"
+
+struct r300_fragment_shader_code {
+ struct r300_fragment_program_external_state compare_state;
+ struct rX00_fragment_program_code code;
+
+ struct r300_fragment_shader_code* next;
+};
struct r300_fragment_shader {
/* Parent class */
struct pipe_shader_state state;
+
struct tgsi_shader_info info;
+ struct r300_shader_semantics inputs;
- /* Has this shader been translated yet? */
- boolean translated;
+ /* Bits 0-15: TRUE if it's a shadow sampler, FALSE otherwise. */
+ unsigned shadow_samplers;
- /* Compiled code */
- struct rX00_fragment_program_code code;
+ /* Currently-bound fragment shader. */
+ struct r300_fragment_shader_code* shader;
+
+ /* List of the same shaders compiled with different texture-compare
+ * states. */
+ struct r300_fragment_shader_code* first;
};
+void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
+ struct r300_shader_semantics* fs_inputs);
-void r300_translate_fragment_shader(struct r300_context* r300,
- struct r300_fragment_shader* fs);
+/* Return TRUE if the shader was switched and should be re-emitted. */
+boolean r300_pick_fragment_shader(struct r300_context* r300);
-static inline boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
+static INLINE boolean r300_fragment_shader_writes_depth(struct r300_fragment_shader *fs)
{
if (!fs)
- return FALSE;
- return (fs->code.writes_depth) ? TRUE : FALSE;
+ return FALSE;
+ return (fs->shader->code.writes_depth) ? TRUE : FALSE;
}
+
#endif /* R300_FS_H */
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 744ea6568d3..361813891fb 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -661,20 +661,20 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_GB_SUPER_TILE_B (1 << 15)
# define R300_GB_SUBPIXEL_1_12 (0 << 16)
# define R300_GB_SUBPIXEL_1_16 (1 << 16)
-# define GB_TILE_CONFIG_QUADS_PER_RAS_4 (0 << 17)
-# define GB_TILE_CONFIG_QUADS_PER_RAS_8 (1 << 17)
-# define GB_TILE_CONFIG_QUADS_PER_RAS_16 (2 << 17)
-# define GB_TILE_CONFIG_QUADS_PER_RAS_32 (3 << 17)
-# define GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19)
-# define GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19)
-# define GB_TILE_CONFIG_ALT_SCAN_EN_LR (0 << 20)
-# define GB_TILE_CONFIG_ALT_SCAN_EN_LRL (1 << 20)
-# define GB_TILE_CONFIG_ALT_OFFSET (0 << 21)
-# define GB_TILE_CONFIG_SUBPRECISION (0 << 22)
-# define GB_TILE_CONFIG_ALT_TILING_DEF (0 << 23)
-# define GB_TILE_CONFIG_ALT_TILING_3_2 (1 << 23)
-# define GB_TILE_CONFIG_Z_EXTENDED_24_1 (0 << 24)
-# define GB_TILE_CONFIG_Z_EXTENDED_S25_1 (1 << 24)
+# define R300_GB_TILE_CONFIG_QUADS_PER_RAS_4 (0 << 17)
+# define R300_GB_TILE_CONFIG_QUADS_PER_RAS_8 (1 << 17)
+# define R300_GB_TILE_CONFIG_QUADS_PER_RAS_16 (2 << 17)
+# define R300_GB_TILE_CONFIG_QUADS_PER_RAS_32 (3 << 17)
+# define R300_GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19)
+# define R300_GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19)
+# define R300_GB_TILE_CONFIG_ALT_SCAN_EN_LR (0 << 20)
+# define R300_GB_TILE_CONFIG_ALT_SCAN_EN_LRL (1 << 20)
+# define R300_GB_TILE_CONFIG_ALT_OFFSET (0 << 21)
+# define R300_GB_TILE_CONFIG_SUBPRECISION (0 << 22)
+# define R300_GB_TILE_CONFIG_ALT_TILING_DEF (0 << 23)
+# define R300_GB_TILE_CONFIG_ALT_TILING_3_2 (1 << 23)
+# define R300_GB_TILE_CONFIG_Z_EXTENDED_24_1 (0 << 24)
+# define R300_GB_TILE_CONFIG_Z_EXTENDED_S25_1 (1 << 24)
/* Specifies the sizes of the various FIFO`s in the sc/rs/us. This register must be the first one written */
#define R300_GB_FIFO_SIZE 0x4024
@@ -700,9 +700,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_OFIFO_HIGHWATER_SHIFT 22 /* two bits only */
# define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT 24
-#define GB_Z_PEQ_CONFIG 0x4028
-# define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4 (0 << 0)
-# define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8 (1 << 0)
+#define R300_GB_Z_PEQ_CONFIG 0x4028
+# define R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4 (0 << 0)
+# define R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8 (1 << 0)
/* Specifies various polygon specific selects (fog, depth, perspective). */
#define R300_GB_SELECT 0x401c
@@ -725,39 +725,39 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
/* Specifies the graphics pipeline configuration for antialiasing. */
#define R300_GB_AA_CONFIG 0x4020
-# define GB_AA_CONFIG_AA_DISABLE (0 << 0)
-# define GB_AA_CONFIG_AA_ENABLE (1 << 0)
-# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2 (0 << 1)
-# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3 (1 << 1)
-# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4 (2 << 1)
-# define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6 (3 << 1)
+# define R300_GB_AA_CONFIG_AA_DISABLE (0 << 0)
+# define R300_GB_AA_CONFIG_AA_ENABLE (1 << 0)
+# define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2 (0 << 1)
+# define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3 (1 << 1)
+# define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4 (2 << 1)
+# define R300_GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6 (3 << 1)
/* Selects which of 4 pipes are active. */
-#define GB_PIPE_SELECT 0x402c
-# define GB_PIPE_SELECT_PIPE0_ID_SHIFT 0
-# define GB_PIPE_SELECT_PIPE1_ID_SHIFT 2
-# define GB_PIPE_SELECT_PIPE2_ID_SHIFT 4
-# define GB_PIPE_SELECT_PIPE3_ID_SHIFT 6
-# define GB_PIPE_SELECT_PIPE_MASK_SHIFT 8
-# define GB_PIPE_SELECT_MAX_PIPE 12
-# define GB_PIPE_SELECT_BAD_PIPES 14
-# define GB_PIPE_SELECT_CONFIG_PIPES 18
+#define R300_GB_PIPE_SELECT 0x402c
+# define R300_GB_PIPE_SELECT_PIPE0_ID_SHIFT 0
+# define R300_GB_PIPE_SELECT_PIPE1_ID_SHIFT 2
+# define R300_GB_PIPE_SELECT_PIPE2_ID_SHIFT 4
+# define R300_GB_PIPE_SELECT_PIPE3_ID_SHIFT 6
+# define R300_GB_PIPE_SELECT_PIPE_MASK_SHIFT 8
+# define R300_GB_PIPE_SELECT_MAX_PIPE 12
+# define R300_GB_PIPE_SELECT_BAD_PIPES 14
+# define R300_GB_PIPE_SELECT_CONFIG_PIPES 18
/* Specifies the sizes of the various FIFO`s in the sc/rs. */
-#define GB_FIFO_SIZE1 0x4070
+#define R300_GB_FIFO_SIZE1 0x4070
/* High water mark for SC input fifo */
-# define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0
-# define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK 0x0000003f
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK 0x0000003f
/* High water mark for SC input fifo (B) */
-# define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6
-# define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK 0x00000fc0
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK 0x00000fc0
/* High water mark for RS colors' fifo */
-# define GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT 12
-# define GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK 0x0003f000
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT 12
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK 0x0003f000
/* High water mark for RS textures' fifo */
-# define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT 18
-# define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK 0x00fc0000
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT 18
+# define R300_GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK 0x00fc0000
/* This table specifies the source location and format for up to 16 texture
* addresses (i[0]:i[15]) and four colors (c[0]:c[3])
@@ -1293,7 +1293,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R500_RS_INST_TEX_ID(x) ((x) << 0)
#define R500_RS_INST_TEX_CN_WRITE (1 << 4)
#define R500_RS_INST_TEX_ADDR_SHIFT 5
-# define R500_RS_INST_TEX_ADDR(x) ((x) << 0)
+# define R500_RS_INST_TEX_ADDR(x) ((x) << 5)
#define R500_RS_INST_COL_ID_SHIFT 12
# define R500_RS_INST_COL_ID(x) ((x) << 12)
#define R500_RS_INST_COL_CN_NO_WRITE (0 << 16)
@@ -1463,6 +1463,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_MIN_FILTER_MIP_NEAREST (1 << 13)
# define R300_TX_MIN_FILTER_MIP_LINEAR (2 << 13)
# define R300_TX_MIN_FILTER_MIP_MASK (3 << 13)
+# define R300_TX_MAX_MIP_LEVEL_SHIFT 17
+# define R300_TX_MAX_MIP_LEVEL_MASK (0xf << 17)
# define R300_TX_MAX_ANISO_1_TO_1 (0 << 21)
# define R300_TX_MAX_ANISO_2_TO_1 (1 << 21)
# define R300_TX_MAX_ANISO_4_TO_1 (2 << 21)
@@ -1471,6 +1473,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_MAX_ANISO_MASK (7 << 21)
# define R300_TX_WRAP_S(x) ((x) << 0)
# define R300_TX_WRAP_T(x) ((x) << 3)
+# define R300_TX_MAX_MIP_LEVEL(x) ((x) << 17)
#define R300_TX_FILTER1_0 0x4440
# define R300_CHROMA_KEY_MODE_DISABLE 0
@@ -1500,8 +1503,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_TX_HEIGHTMASK_MASK (2047 << 11)
# define R300_TX_DEPTHMASK_SHIFT 22
# define R300_TX_DEPTHMASK_MASK (0xf << 22)
-# define R300_TX_MAX_MIP_LEVEL_SHIFT 26
-# define R300_TX_MAX_MIP_LEVEL_MASK (0xf << 26)
# define R300_TX_SIZE_PROJECTED (1 << 30)
# define R300_TX_PITCH_EN (1 << 31)
# define R300_TX_WIDTH(x) ((x) << 0)
@@ -1618,18 +1619,20 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#define R300_TX_OFFSET_5 0x4554
#define R300_TX_OFFSET_6 0x4558
#define R300_TX_OFFSET_7 0x455C
- /* BEGIN: Guess from R200 */
+
# define R300_TXO_ENDIAN_NO_SWAP (0 << 0)
# define R300_TXO_ENDIAN_BYTE_SWAP (1 << 0)
# define R300_TXO_ENDIAN_WORD_SWAP (2 << 0)
# define R300_TXO_ENDIAN_HALFDW_SWAP (3 << 0)
-# define R300_TXO_MACRO_TILE (1 << 2)
+# define R300_TXO_MACRO_TILE_LINEAR (0 << 2)
+# define R300_TXO_MACRO_TILE_TILED (1 << 2)
+# define R300_TXO_MACRO_TILE(x) ((x) << 2)
# define R300_TXO_MICRO_TILE_LINEAR (0 << 3)
-# define R300_TXO_MICRO_TILE (1 << 3)
-# define R300_TXO_MICRO_TILE_SQUARE (2 << 3)
+# define R300_TXO_MICRO_TILE_TILED (1 << 3)
+# define R300_TXO_MICRO_TILE_TILED_SQUARE (2 << 3)
+# define R300_TXO_MICRO_TILE(x) ((x) << 3)
# define R300_TXO_OFFSET_MASK 0xffffffe0
# define R300_TXO_OFFSET_SHIFT 5
- /* END: Guess from R200 */
/* 32 bit chroma key */
#define R300_TX_CHROMA_KEY_0 0x4580
@@ -2144,6 +2147,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
/* Unpipelined. */
#define R300_RB3D_CCTL 0x4e00
+# define R300_RB3D_CCTL_NUM_MULTIWRITES(x) (MAX2(((x)-1), 0) << 5)
# define R300_RB3D_CCTL_NUM_MULTIWRITES_1_BUFFER (0 << 5)
# define R300_RB3D_CCTL_NUM_MULTIWRITES_2_BUFFERS (1 << 5)
# define R300_RB3D_CCTL_NUM_MULTIWRITES_3_BUFFERS (2 << 5)
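The new parameterized form is equivalent to the older named values, for example:

    /* R300_RB3D_CCTL_NUM_MULTIWRITES(1) == (0 << 5) == ..._1_BUFFER
     * R300_RB3D_CCTL_NUM_MULTIWRITES(3) == (2 << 5) == ..._3_BUFFERS
     * MAX2((x) - 1, 0) keeps a colorbuffer count of 0 from underflowing. */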
@@ -2184,6 +2188,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1 (4 << 3)
# define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1 (5 << 3)
# define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1 (6 << 3)
+# define R500_SRC_ALPHA_0_NO_READ (1 << 30)
+# define R500_SRC_ALPHA_1_NO_READ (1 << 31)
/* the following are shared between CBLEND and ABLEND */
# define R300_FCN_MASK (3 << 12)
@@ -2279,9 +2285,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_COLORPITCH_MASK 0x00003FFE
# define R300_COLOR_TILE_DISABLE (0 << 16)
# define R300_COLOR_TILE_ENABLE (1 << 16)
+# define R300_COLOR_TILE(x) ((x) << 16)
# define R300_COLOR_MICROTILE_DISABLE (0 << 17)
# define R300_COLOR_MICROTILE_ENABLE (1 << 17)
# define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */
+# define R300_COLOR_MICROTILE(x) ((x) << 17)
# define R300_COLOR_ENDIAN_NO_SWAP (0 << 19)
# define R300_COLOR_ENDIAN_WORD_SWAP (1 << 19)
# define R300_COLOR_ENDIAN_DWORD_SWAP (2 << 19)
@@ -2540,9 +2548,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
# define R300_DEPTHPITCH_MASK 0x00003FFC
# define R300_DEPTHMACROTILE_DISABLE (0 << 16)
# define R300_DEPTHMACROTILE_ENABLE (1 << 16)
+# define R300_DEPTHMACROTILE(x) ((x) << 16)
# define R300_DEPTHMICROTILE_LINEAR (0 << 17)
# define R300_DEPTHMICROTILE_TILED (1 << 17)
# define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17)
+# define R300_DEPTHMICROTILE(x) ((x) << 17)
# define R300_DEPTHENDIAN_NO_SWAP (0 << 18)
# define R300_DEPTHENDIAN_WORD_SWAP (1 << 18)
# define R300_DEPTHENDIAN_DWORD_SWAP (2 << 18)
@@ -3292,6 +3302,11 @@ enum {
*/
#define R300_PACKET3_3D_LOAD_VBPNTR 0x00002F00
+# define R300_VBPNTR_SIZE0(x) ((x) >> 2)
+# define R300_VBPNTR_STRIDE0(x) (((x) >> 2) << 8)
+# define R300_VBPNTR_SIZE1(x) (((x) >> 2) << 16)
+# define R300_VBPNTR_STRIDE1(x) (((x) >> 2) << 24)
+
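These helpers take byte sizes and strides and shift them into dword units
themselves; packing one 3D_LOAD_VBPNTR payload dword for a pair of vertex
arrays might look like this (sizes chosen arbitrarily):

    uint32_t dword = R300_VBPNTR_SIZE0(12) |   /* array 0: 12 bytes (3 floats) per vertex */
                     R300_VBPNTR_STRIDE0(32) | /* array 0: 32-byte stride */
                     R300_VBPNTR_SIZE1(16) |   /* array 1: 16 bytes (4 floats) per vertex */
                     R300_VBPNTR_STRIDE1(32);  /* array 1: 32-byte stride */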
#define R300_PACKET3_INDX_BUFFER 0x00003300
# define R300_INDX_BUFFER_DST_SHIFT 0
# define R300_INDX_BUFFER_SKIP_SHIFT 16
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 62e1456ed36..dcd0761944f 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -28,6 +28,7 @@
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
@@ -37,7 +38,6 @@
#include "r300_reg.h"
#include "r300_render.h"
#include "r300_state_derived.h"
-#include "r300_vbo.h"
/* r300_render: Vertex and index buffer primitive emission. */
#define R300_MAX_VBO_SIZE (1024 * 1024)
@@ -70,14 +70,151 @@ uint32_t r300_translate_primitive(unsigned prim)
}
}
+static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300,
+ unsigned mode)
+{
+ struct r300_rs_state* rs = (struct r300_rs_state*)r300->rs_state.state;
+ uint32_t color_control = rs->color_control;
+
+ /* By default (see r300_state.c:r300_create_rs_state) color_control is
+ * initialized to provoking the first vertex.
+ *
+ * Triangle fans must be reduced to the second vertex, not the first, in
+ * Gallium flatshade-first mode, as per the GL spec.
+ * (http://www.opengl.org/registry/specs/ARB/provoking_vertex.txt)
+ *
+ * Quads never provoke correctly in flatshade-first mode. The first
+ * vertex is never considered as provoking, so only the second, third,
+ * and fourth vertices can be selected, and both "third" and "last" modes
+ * select the fourth vertex. This is probably due to D3D lacking quads.
+ *
+ * Similarly, polygons reduce to the first, not the last, vertex, when in
+ * "last" mode, and all other modes start from the second vertex.
+ *
+ * ~ C.
+ */
+
+ if (rs->rs.flatshade_first) {
+ switch (mode) {
+ case PIPE_PRIM_TRIANGLE_FAN:
+ color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND;
+ break;
+ case PIPE_PRIM_QUADS:
+ case PIPE_PRIM_QUAD_STRIP:
+ case PIPE_PRIM_POLYGON:
+ color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+ break;
+ default:
+ color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST;
+ break;
+ }
+ } else {
+ color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+ }
+
+ return color_control;
+}
+
+static boolean immd_is_good_idea(struct r300_context *r300,
+ unsigned count)
+{
+ return count <= 4;
+}
+
+static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
+ unsigned mode,
+ unsigned start,
+ unsigned count)
+{
+ struct pipe_vertex_element* velem;
+ struct pipe_vertex_buffer* vbuf;
+ unsigned vertex_element_count = r300->vertex_element_count;
+ unsigned i, v, vbi, dw, elem_offset;
+
+ /* Size of the vertex, in dwords. */
+ unsigned vertex_size = 0;
+
+ /* Offsets of the attribute, in dwords, from the start of the vertex. */
+ unsigned offset[PIPE_MAX_ATTRIBS];
+
+ /* Size of the vertex element, in dwords. */
+ unsigned size[PIPE_MAX_ATTRIBS];
+
+ /* Stride to the same attrib in the next vertex in the vertex buffer,
+ * in dwords. */
+ unsigned stride[PIPE_MAX_ATTRIBS];
+
+ /* Mapped vertex buffers. */
+ uint32_t* map[PIPE_MAX_ATTRIBS] = {0};
+
+ CS_LOCALS(r300);
+
+ /* Calculate the vertex size, offsets, strides etc. and map the buffers. */
+ for (i = 0; i < vertex_element_count; i++) {
+ velem = &r300->vertex_element[i];
+ offset[i] = velem->src_offset / 4;
+ size[i] = util_format_get_blocksize(velem->src_format) / 4;
+ vertex_size += size[i];
+ vbi = velem->vertex_buffer_index;
+
+ /* Map the buffer. */
+ if (!map[vbi]) {
+ vbuf = &r300->vertex_buffer[vbi];
+ map[vbi] = (uint32_t*)pipe_buffer_map(r300->context.screen,
+ vbuf->buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ map[vbi] += vbuf->buffer_offset / 4;
+ stride[vbi] = vbuf->stride / 4;
+ }
+ }
+
+ BEGIN_CS(10 + count * vertex_size);
+ OUT_CS_REG(R300_GA_COLOR_CONTROL,
+ r300_provoking_vertex_fixes(r300, mode));
+ OUT_CS_REG(R300_VAP_VTX_SIZE, vertex_size);
+ OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0);
+ OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count - 1);
+ OUT_CS_PKT3(R300_PACKET3_3D_DRAW_IMMD_2, count * vertex_size);
+ OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (count << 16) |
+ r300_translate_primitive(mode));
+
+ /* Emit vertices. */
+ for (v = 0; v < count; v++) {
+ for (i = 0; i < vertex_element_count; i++) {
+ velem = &r300->vertex_element[i];
+ vbi = velem->vertex_buffer_index;
+ elem_offset = offset[i] + stride[vbi] * (v + start);
+
+ for (dw = 0; dw < size[i]; dw++) {
+ OUT_CS(map[vbi][elem_offset + dw]);
+ }
+ }
+ }
+ END_CS;
+
+ /* Unmap buffers. */
+ for (i = 0; i < vertex_element_count; i++) {
+ vbi = r300->vertex_element[i].vertex_buffer_index;
+
+ if (map[vbi]) {
+ vbuf = &r300->vertex_buffer[vbi];
+ pipe_buffer_unmap(r300->context.screen, vbuf->buffer);
+ map[vbi] = NULL;
+ }
+ }
+}
+
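A quick accounting of the BEGIN_CS(10 + count * vertex_size) budget above,
assuming the usual r300 CS helpers where OUT_CS_REG costs a packet0 header plus
a value dword and OUT_CS_PKT3 costs one header dword:

    /*   4 x OUT_CS_REG                  ->  8 dwords
     *   1 x OUT_CS_PKT3 header          ->  1 dword
     *   1 x OUT_CS (VAP_VF_CNTL value)  ->  1 dword
     *   embedded vertices               ->  count * vertex_size dwords
     * e.g. 4 vertices of XYZW position + RGBA color (vertex_size = 8):
     *      10 + 4 * 8 = 42 dwords. */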
static void r300_emit_draw_arrays(struct r300_context *r300,
unsigned mode,
unsigned count)
{
CS_LOCALS(r300);
- BEGIN_CS(4);
- OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count);
+ BEGIN_CS(8);
+ OUT_CS_REG(R300_GA_COLOR_CONTROL,
+ r300_provoking_vertex_fixes(r300, mode));
+ OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0);
+ OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, count - 1);
OUT_CS_PKT3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (count << 16) |
r300_translate_primitive(mode));
@@ -102,7 +239,10 @@ static void r300_emit_draw_elements(struct r300_context *r300,
assert((start * indexSize) % 4 == 0);
assert(offset_dwords == 0);
- BEGIN_CS(10);
+ BEGIN_CS(14);
+ OUT_CS_REG(R300_GA_COLOR_CONTROL,
+ r300_provoking_vertex_fixes(r300, mode));
+ OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, minIndex);
OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, maxIndex);
OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, 0);
if (indexSize == 4) {
@@ -133,17 +273,18 @@ static void r300_emit_draw_elements(struct r300_context *r300,
END_CS;
}
-
static boolean r300_setup_vertex_buffers(struct r300_context *r300)
{
struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
struct pipe_vertex_element *velem = r300->vertex_element;
+ struct pipe_buffer *pbuf;
validate:
for (int i = 0; i < r300->vertex_element_count; i++) {
- if (!r300->winsys->add_buffer(r300->winsys,
- vbuf[velem[i].vertex_buffer_index].buffer,
- RADEON_GEM_DOMAIN_GTT, 0)) {
+ pbuf = vbuf[velem[i].vertex_buffer_index].buffer;
+
+ if (!r300->winsys->add_buffer(r300->winsys, pbuf,
+ RADEON_GEM_DOMAIN_GTT, 0)) {
r300->context.flush(&r300->context, 0, NULL);
goto validate;
}
@@ -157,35 +298,82 @@ validate:
return TRUE;
}
+static void r300_shorten_ubyte_elts(struct r300_context* r300,
+ struct pipe_buffer** elts,
+ unsigned count)
+{
+ struct pipe_screen* screen = r300->context.screen;
+ struct pipe_buffer* new_elts;
+ unsigned char *in_map;
+ unsigned short *out_map;
+ unsigned i;
+
+ new_elts = screen->buffer_create(screen, 32,
+ PIPE_BUFFER_USAGE_INDEX |
+ PIPE_BUFFER_USAGE_CPU_WRITE |
+ PIPE_BUFFER_USAGE_GPU_READ,
+ 2 * count);
+
+ in_map = pipe_buffer_map(screen, *elts, PIPE_BUFFER_USAGE_CPU_READ);
+ out_map = pipe_buffer_map(screen, new_elts, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+ for (i = 0; i < count; i++) {
+ *out_map = (unsigned short)*in_map;
+ in_map++;
+ out_map++;
+ }
+
+ pipe_buffer_unmap(screen, *elts);
+ pipe_buffer_unmap(screen, new_elts);
+
+ *elts = new_elts;
+}
+
/* This is the fast-path drawing & emission for HW TCL. */
-boolean r300_draw_range_elements(struct pipe_context* pipe,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize,
- unsigned minIndex,
- unsigned maxIndex,
- unsigned mode,
- unsigned start,
- unsigned count)
+void r300_draw_range_elements(struct pipe_context* pipe,
+ struct pipe_buffer* indexBuffer,
+ unsigned indexSize,
+ unsigned minIndex,
+ unsigned maxIndex,
+ unsigned mode,
+ unsigned start,
+ unsigned count)
{
struct r300_context* r300 = r300_context(pipe);
+ struct pipe_buffer* orgIndexBuffer = indexBuffer;
if (!u_trim_pipe_prim(mode, &count)) {
- return FALSE;
+ return;
}
if (count > 65535) {
- return FALSE;
+ /* XXX: use aux/indices functions to split this into smaller
+ * primitives.
+ */
+ return;
}
r300_update_derived_state(r300);
+ r300_emit_buffer_validate(r300);
+
if (!r300_setup_vertex_buffers(r300)) {
- return FALSE;
+ return;
}
- setup_vertex_attributes(r300);
+ if (indexSize == 1) {
+ r300_shorten_ubyte_elts(r300, &indexBuffer, count);
+ indexSize = 2;
+ }
- setup_index_buffer(r300, indexBuffer, indexSize);
+ if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
+ RADEON_GEM_DOMAIN_GTT, 0)) {
+ goto cleanup;
+ }
+
+ if (!r300->winsys->validate(r300->winsys)) {
+ goto cleanup;
+ }
r300_emit_dirty_state(r300);
@@ -194,47 +382,54 @@ boolean r300_draw_range_elements(struct pipe_context* pipe,
r300_emit_draw_elements(r300, indexBuffer, indexSize, minIndex, maxIndex,
mode, start, count);
- return TRUE;
+cleanup:
+ if (indexBuffer != orgIndexBuffer) {
+ pipe->screen->buffer_destroy(indexBuffer);
+ }
}
/* Simple helpers for context setup. Should probably be moved to util. */
-boolean r300_draw_elements(struct pipe_context* pipe,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize, unsigned mode,
- unsigned start, unsigned count)
+void r300_draw_elements(struct pipe_context* pipe,
+ struct pipe_buffer* indexBuffer,
+ unsigned indexSize, unsigned mode,
+ unsigned start, unsigned count)
{
- return pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
- mode, start, count);
+ pipe->draw_range_elements(pipe, indexBuffer, indexSize, 0, ~0,
+ mode, start, count);
}
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
- unsigned start, unsigned count)
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+ unsigned start, unsigned count)
{
struct r300_context* r300 = r300_context(pipe);
if (!u_trim_pipe_prim(mode, &count)) {
- return FALSE;
+ return;
}
if (count > 65535) {
- return FALSE;
+ /* XXX: driver needs to handle this -- use the functions in
+ * aux/indices to split this into several smaller primitives.
+ */
+ return;
}
r300_update_derived_state(r300);
- if (!r300_setup_vertex_buffers(r300)) {
- return FALSE;
- }
+ r300_emit_buffer_validate(r300);
- setup_vertex_attributes(r300);
-
- r300_emit_dirty_state(r300);
-
- r300_emit_aos(r300, start);
-
- r300_emit_draw_arrays(r300, mode, count);
+ if (immd_is_good_idea(r300, count)) {
+ r300_emit_dirty_state(r300);
+ r300_emit_draw_arrays_immediate(r300, mode, start, count);
+ } else {
+ if (!r300_setup_vertex_buffers(r300)) {
+ return;
+ }
- return TRUE;
+ r300_emit_dirty_state(r300);
+ r300_emit_aos(r300, start);
+ r300_emit_draw_arrays(r300, mode, count);
+ }
}
/****************************************************************************
@@ -243,7 +438,7 @@ boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
***************************************************************************/
/* SW TCL arrays, using Draw. */
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
unsigned mode,
unsigned start,
unsigned count)
@@ -252,7 +447,7 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
int i;
if (!u_trim_pipe_prim(mode, &count)) {
- return FALSE;
+ return;
}
for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -265,8 +460,9 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
draw_set_mapped_element_buffer(r300->draw, 0, NULL);
draw_set_mapped_constant_buffer(r300->draw,
- r300->shader_constants[PIPE_SHADER_VERTEX].constants,
- r300->shader_constants[PIPE_SHADER_VERTEX].count *
+ PIPE_SHADER_VERTEX,
+ r300->shader_constants[PIPE_SHADER_VERTEX].constants,
+ r300->shader_constants[PIPE_SHADER_VERTEX].count *
(sizeof(float) * 4));
draw_arrays(r300->draw, mode, start, count);
@@ -275,12 +471,10 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
pipe_buffer_unmap(pipe->screen, r300->vertex_buffer[i].buffer);
draw_set_mapped_vertex_buffer(r300->draw, i, NULL);
}
-
- return TRUE;
}
/* SW TCL elements, using Draw. */
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
struct pipe_buffer* indexBuffer,
unsigned indexSize,
unsigned minIndex,
@@ -291,9 +485,10 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
{
struct r300_context* r300 = r300_context(pipe);
int i;
+ void* indices;
if (!u_trim_pipe_prim(mode, &count)) {
- return FALSE;
+ return;
}
for (i = 0; i < r300->vertex_buffer_count; i++) {
@@ -303,12 +498,13 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
draw_set_mapped_vertex_buffer(r300->draw, i, buf);
}
- void* indices = pipe_buffer_map(pipe->screen, indexBuffer,
- PIPE_BUFFER_USAGE_CPU_READ);
+ indices = pipe_buffer_map(pipe->screen, indexBuffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
draw_set_mapped_element_buffer_range(r300->draw, indexSize,
minIndex, maxIndex, indices);
draw_set_mapped_constant_buffer(r300->draw,
+ PIPE_SHADER_VERTEX,
r300->shader_constants[PIPE_SHADER_VERTEX].constants,
r300->shader_constants[PIPE_SHADER_VERTEX].count *
(sizeof(float) * 4));
@@ -323,8 +519,6 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
pipe_buffer_unmap(pipe->screen, indexBuffer);
draw_set_mapped_element_buffer_range(r300->draw, 0, start,
start + count - 1, NULL);
-
- return TRUE;
}
/* Object for rendering using Draw. */
@@ -400,7 +594,7 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
PIPE_BUFFER_USAGE_CPU_WRITE);
- return (r300render->vbo_ptr + r300render->vbo_offset);
+ return ((uint8_t*)r300render->vbo_ptr + r300render->vbo_offset);
}
static void r300_render_unmap_vertices(struct vbuf_render* render,
diff --git a/src/gallium/drivers/r300/r300_render.h b/src/gallium/drivers/r300/r300_render.h
index da83069083d..27b5e6a9630 100644
--- a/src/gallium/drivers/r300/r300_render.h
+++ b/src/gallium/drivers/r300/r300_render.h
@@ -25,35 +25,35 @@
uint32_t r300_translate_primitive(unsigned prim);
-boolean r300_draw_range_elements(struct pipe_context* pipe,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize,
- unsigned minIndex,
- unsigned maxIndex,
- unsigned mode,
- unsigned start,
- unsigned count);
-
-boolean r300_draw_elements(struct pipe_context* pipe,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize, unsigned mode,
- unsigned start, unsigned count);
-
-boolean r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
- unsigned start, unsigned count);
-
-boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
- unsigned mode,
- unsigned start,
- unsigned count);
-
-boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize,
- unsigned minIndex,
- unsigned maxIndex,
- unsigned mode,
- unsigned start,
- unsigned count);
+void r300_draw_range_elements(struct pipe_context* pipe,
+ struct pipe_buffer* indexBuffer,
+ unsigned indexSize,
+ unsigned minIndex,
+ unsigned maxIndex,
+ unsigned mode,
+ unsigned start,
+ unsigned count);
+
+void r300_draw_elements(struct pipe_context* pipe,
+ struct pipe_buffer* indexBuffer,
+ unsigned indexSize, unsigned mode,
+ unsigned start, unsigned count);
+
+void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
+ unsigned start, unsigned count);
+
+void r300_swtcl_draw_arrays(struct pipe_context* pipe,
+ unsigned mode,
+ unsigned start,
+ unsigned count);
+
+void r300_swtcl_draw_range_elements(struct pipe_context* pipe,
+ struct pipe_buffer* indexBuffer,
+ unsigned indexSize,
+ unsigned minIndex,
+ unsigned maxIndex,
+ unsigned mode,
+ unsigned start,
+ unsigned count);
#endif /* R300_RENDER_H */
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 390b63007e5..67325c6b80a 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -21,13 +21,15 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_simple_screen.h"
#include "r300_context.h"
#include "r300_screen.h"
#include "r300_texture.h"
-#include "r300_winsys.h"
+
+#include "radeon_winsys.h"
/* Return the identifier behind whom the brave coders responsible for this
* amalgamation of code, sweat, and duct tape, routinely obscure their names.
@@ -81,6 +83,7 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
switch (param) {
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+ case PIPE_CAP_MAX_COMBINED_SAMPLERS:
/* XXX I'm told this goes up to 16 */
return 8;
case PIPE_CAP_NPOT_TEXTURES:
@@ -140,6 +143,12 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
return 0;
case PIPE_CAP_BLEND_EQUATION_SEPARATE:
return 1;
+ case PIPE_CAP_SM3:
+ if (r300screen->caps->is_r500) {
+ return 1;
+ } else {
+ return 0;
+ }
default:
debug_printf("r300: Implementation error: Bad param %d\n",
param);
@@ -174,10 +183,20 @@ static float r300_get_paramf(struct pipe_screen* pscreen, int param)
}
}
-static boolean check_tex_format(enum pipe_format format, uint32_t usage,
- boolean is_r500)
+static boolean r300_is_format_supported(struct pipe_screen* screen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned usage,
+ unsigned geom_flags)
{
uint32_t retval = 0;
+ boolean is_r500 = r300_screen(screen)->caps->is_r500;
+
+ if (target >= PIPE_MAX_TEXTURE_TYPES) {
+ debug_printf("r300: Implementation error: Received bogus texture "
+ "target %d in %s\n", target, __FUNCTION__);
+ return FALSE;
+ }
switch (format) {
/* Supported formats. */
@@ -219,12 +238,18 @@ static boolean check_tex_format(enum pipe_format format, uint32_t usage,
/* Z buffer or texture */
case PIPE_FORMAT_Z16_UNORM:
+ retval = usage &
+ (PIPE_TEXTURE_USAGE_DEPTH_STENCIL |
+ PIPE_TEXTURE_USAGE_SAMPLER);
+ break;
+
+ /* 24bit Z buffer can only be used as a texture on R500. */
case PIPE_FORMAT_Z24X8_UNORM:
/* Z buffer with stencil or texture */
case PIPE_FORMAT_Z24S8_UNORM:
retval = usage &
(PIPE_TEXTURE_USAGE_DEPTH_STENCIL |
- PIPE_TEXTURE_USAGE_SAMPLER);
+ (is_r500 ? PIPE_TEXTURE_USAGE_SAMPLER : 0));
break;
/* Definitely unsupported formats. */
@@ -232,28 +257,13 @@ static boolean check_tex_format(enum pipe_format format, uint32_t usage,
case PIPE_FORMAT_Z32_UNORM:
case PIPE_FORMAT_S8Z24_UNORM:
case PIPE_FORMAT_X8Z24_UNORM:
- debug_printf("r300: Note: Got unsupported format: %s in %s\n",
- pf_name(format), __FUNCTION__);
+ SCREEN_DBG(r300_screen(screen), DBG_TEX,
+ "r300: Note: Got unsupported format: %s in %s\n",
+ pf_name(format), __FUNCTION__);
return FALSE;
- /* XXX These don't even exist
- case PIPE_FORMAT_A32R32G32B32:
- case PIPE_FORMAT_A16R16G16B16: */
- /* XXX What the deuce is UV88? (r3xx accel page 14)
- debug_printf("r300: Warning: Got unimplemented format: %s in %s\n",
- pf_name(format), __FUNCTION__);
- return FALSE; */
-
- /* XXX Supported yet unimplemented r5xx formats: */
- /* XXX Again, what is UV1010 this time? (r5xx accel page 148) */
- /* XXX Even more that don't exist
- case PIPE_FORMAT_A10R10G10B10_UNORM:
- case PIPE_FORMAT_A2R10G10B10_UNORM:
- case PIPE_FORMAT_I10_UNORM:
- debug_printf(
- "r300: Warning: Got unimplemented r500 format: %s in %s\n",
- pf_name(format), __FUNCTION__);
- return FALSE; */
+ /* XXX Add all remaining gallium-supported formats,
+ * see util/u_format.csv. */
default:
/* Unknown format... */
@@ -271,30 +281,6 @@ static boolean check_tex_format(enum pipe_format format, uint32_t usage,
return (retval >= usage);
}
-static boolean r300_is_format_supported(struct pipe_screen* pscreen,
- enum pipe_format format,
- enum pipe_texture_target target,
- unsigned tex_usage,
- unsigned geom_flags)
-{
- switch (target) {
- case PIPE_TEXTURE_1D: /* handle 1D textures as 2D ones */
- case PIPE_TEXTURE_2D:
- case PIPE_TEXTURE_3D:
- case PIPE_TEXTURE_CUBE:
- return check_tex_format(format, tex_usage,
- r300_screen(pscreen)->caps->is_r500);
-
- default:
- debug_printf("r300: Fatal: This is not a format target: %d\n",
- target);
- assert(0);
- break;
- }
-
- return FALSE;
-}
-
static struct pipe_transfer*
r300_get_tex_transfer(struct pipe_screen *screen,
struct pipe_texture *texture,
@@ -304,6 +290,7 @@ r300_get_tex_transfer(struct pipe_screen *screen,
{
struct r300_texture *tex = (struct r300_texture *)texture;
struct r300_transfer *trans;
+ struct r300_screen *rscreen = r300_screen(screen);
unsigned offset;
offset = r300_texture_get_offset(tex, level, zslice, face); /* in bytes */
@@ -311,19 +298,12 @@ r300_get_tex_transfer(struct pipe_screen *screen,
trans = CALLOC_STRUCT(r300_transfer);
if (trans) {
pipe_texture_reference(&trans->transfer.texture, texture);
- trans->transfer.format = texture->format;
trans->transfer.x = x;
trans->transfer.y = y;
trans->transfer.width = w;
trans->transfer.height = h;
- trans->transfer.block = texture->block;
- trans->transfer.nblocksx = texture->nblocksx[level];
- trans->transfer.nblocksy = texture->nblocksy[level];
- trans->transfer.stride = r300_texture_get_stride(tex, level);
+ trans->transfer.stride = r300_texture_get_stride(rscreen, tex, level);
trans->transfer.usage = usage;
-
- /* XXX not sure whether it's required to set these two,
- the driver doesn't use them */
trans->transfer.zslice = zslice;
trans->transfer.face = face;
@@ -344,6 +324,7 @@ static void* r300_transfer_map(struct pipe_screen* screen,
{
struct r300_texture* tex = (struct r300_texture*)transfer->texture;
char* map;
+ enum pipe_format format = tex->tex.format;
map = pipe_buffer_map(screen, tex->buffer,
pipe_transfer_buffer_flags(transfer));
@@ -353,8 +334,8 @@ static void* r300_transfer_map(struct pipe_screen* screen,
}
return map + r300_transfer(transfer)->offset +
- transfer->y / transfer->block.height * transfer->stride +
- transfer->x / transfer->block.width * transfer->block.size;
+ transfer->y / util_format_get_blockheight(format) * transfer->stride +
+ transfer->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
}
static void r300_transfer_unmap(struct pipe_screen* screen,
@@ -372,7 +353,7 @@ static void r300_destroy_screen(struct pipe_screen* pscreen)
FREE(r300screen);
}
-struct pipe_screen* r300_create_screen(struct r300_winsys* r300_winsys)
+struct pipe_screen* r300_create_screen(struct radeon_winsys* radeon_winsys)
{
struct r300_screen* r300screen = CALLOC_STRUCT(r300_screen);
struct r300_capabilities* caps = CALLOC_STRUCT(r300_capabilities);
@@ -380,14 +361,15 @@ struct pipe_screen* r300_create_screen(struct r300_winsys* r300_winsys)
if (!r300screen || !caps)
return NULL;
- caps->pci_id = r300_winsys->pci_id;
- caps->num_frag_pipes = r300_winsys->gb_pipes;
- caps->num_z_pipes = r300_winsys->z_pipes;
+ caps->pci_id = radeon_winsys->pci_id;
+ caps->num_frag_pipes = radeon_winsys->gb_pipes;
+ caps->num_z_pipes = radeon_winsys->z_pipes;
+ r300_init_debug(r300screen);
r300_parse_chipset(caps);
r300screen->caps = caps;
- r300screen->screen.winsys = (struct pipe_winsys*)r300_winsys;
+ r300screen->screen.winsys = (struct pipe_winsys*)radeon_winsys;
r300screen->screen.destroy = r300_destroy_screen;
r300screen->screen.get_name = r300_get_name;
r300screen->screen.get_vendor = r300_get_vendor;
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index 41df31f670f..580fda3984e 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -27,12 +27,17 @@
#include "r300_chipset.h"
+struct radeon_winsys;
+
struct r300_screen {
/* Parent class */
struct pipe_screen screen;
/* Chipset capabilities */
struct r300_capabilities* caps;
+
+ /** Combination of DBG_xxx flags */
+ unsigned debug;
};
struct r300_transfer {
@@ -56,6 +61,46 @@ r300_transfer(struct pipe_transfer* transfer)
}
/* Creates a new r300 screen. */
-struct pipe_screen* r300_create_screen(struct r300_winsys* r300_winsys);
+struct pipe_screen* r300_create_screen(struct radeon_winsys* radeon_winsys);
+
+/* Debug functionality. */
+
+/**
+ * Debug flags to disable/enable certain groups of debugging outputs.
+ *
+ * \note These may be rather coarse, and the grouping may be impractical.
+ * If, while debugging the driver, you find that a different grouping
+ * of these flags would be beneficial, feel free to change them, but
+ * make sure to update the documentation in r300_debug.c to reflect
+ * those changes.
+ */
+/*@{*/
+#define DBG_HELP 0x0000001
+#define DBG_FP 0x0000002
+#define DBG_VP 0x0000004
+#define DBG_CS 0x0000008
+#define DBG_DRAW 0x0000010
+#define DBG_TEX 0x0000020
+#define DBG_FALL 0x0000040
+/*@}*/
+
+static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
+{
+ return (screen->debug & flags) ? TRUE : FALSE;
+}
+
+static INLINE void SCREEN_DBG(struct r300_screen * screen, unsigned flags,
+ const char * fmt, ...)
+{
+ if (SCREEN_DBG_ON(screen, flags)) {
+ va_list va;
+ va_start(va, fmt);
+ debug_vprintf(fmt, va);
+ va_end(va);
+ }
+}
+
+void r300_init_debug(struct r300_screen* ctx);
#endif /* R300_SCREEN_H */
+
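Example use of the new debug helpers; this snippet is only illustrative, the
in-tree user added by this patch is the DBG_TEX message in r300_screen.c:

    static void example_report_fallback(struct r300_screen* screen, unsigned prim)
    {
        if (SCREEN_DBG_ON(screen, DBG_FALL)) {
            /* Do expensive diagnostics only when the flag is enabled. */
        }
        SCREEN_DBG(screen, DBG_FALL, "r300: Falling back for primitive %u\n", prim);
    }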
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
new file mode 100644
index 00000000000..6796841b29b
--- /dev/null
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef R300_SHADER_SEMANTICS_H
+#define R300_SHADER_SEMANTICS_H
+
+#define ATTR_UNUSED (-1)
+#define ATTR_COLOR_COUNT 2
+#define ATTR_GENERIC_COUNT 16
+
+/* This structure describes which attributes are written by the VS or read by
+ * the FS (but not both). It's much easier to work with than tgsi_shader_info.
+ *
+ * The members hold indices into the tgsi_shader_info semantics, and those
+ * indices are simply the input/output register numbers. */
+struct r300_shader_semantics {
+ int pos;
+ int psize;
+ int color[ATTR_COLOR_COUNT];
+ int bcolor[ATTR_COLOR_COUNT];
+ int generic[ATTR_GENERIC_COUNT];
+ int fog;
+ int wpos;
+};
+
+static INLINE void r300_shader_semantics_reset(
+ struct r300_shader_semantics* info)
+{
+ int i;
+
+ info->pos = ATTR_UNUSED;
+ info->psize = ATTR_UNUSED;
+ info->fog = ATTR_UNUSED;
+ info->wpos = ATTR_UNUSED;
+
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ info->color[i] = ATTR_UNUSED;
+ info->bcolor[i] = ATTR_UNUSED;
+ }
+
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ info->generic[i] = ATTR_UNUSED;
+ }
+}
+
+#endif
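A hypothetical helper, not part of the patch, showing how the structure is
meant to be walked (the same order allocate_hardware_inputs() in r300_fs.c
uses):

    static INLINE int r300_count_fs_inputs(const struct r300_shader_semantics* s)
    {
        int i, n = 0;

        for (i = 0; i < ATTR_COLOR_COUNT; i++)
            n += (s->color[i] != ATTR_UNUSED);
        for (i = 0; i < ATTR_GENERIC_COUNT; i++)
            n += (s->generic[i] != ATTR_UNUSED);
        n += (s->fog != ATTR_UNUSED);
        n += (s->wpos != ATTR_UNUSED);

        return n;
    }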
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index d1eced61db1..641e95e7fca 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -1,5 +1,6 @@
/*
* Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -41,6 +42,120 @@
/* r300_state: Functions used to initialize the state context by translating
* Gallium state objects into semi-native r300 state objects. */
+static boolean blend_discard_if_src_alpha_0(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_ALPHA == 0, and the following state is set, the colorbuffer
+ * will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+ srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_1(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_ALPHA == 1, and the following state is set, the colorbuffer
+ * will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_0(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_COLOR == (0,0,0), and the following state is set, the colorbuffer
+ * will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_color_1(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_COLOR == (1,1,1), and the following state is set, the colorbuffer
+ * will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_0(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_ALPHA_COLOR == (0,0,0,0), and the following state is set,
+ * the colorbuffer will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+ srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_SRC_COLOR ||
+ srcA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ srcA == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
+ srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ONE);
+}
+
+static boolean blend_discard_if_src_alpha_color_1(unsigned srcRGB, unsigned srcA,
+ unsigned dstRGB, unsigned dstA)
+{
+ /* If the blend equation is ADD or REVERSE_SUBTRACT,
+ * SRC_ALPHA_COLOR == (1,1,1,1), and the following state is set,
+ * the colorbuffer will not be changed.
+ * Notice that the dst factors are the src factors inverted. */
+ return (srcRGB == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (srcA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ srcA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ srcA == PIPE_BLENDFACTOR_ZERO) &&
+ (dstRGB == PIPE_BLENDFACTOR_SRC_COLOR ||
+ dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ONE) &&
+ (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ONE);
+}
+
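A worked instance of what these predicates encode, for the common GL blend pair
(SRC_ALPHA, INV_SRC_ALPHA) with equation ADD:

    /* dst' = src * src_alpha + dst * (1 - src_alpha)
     *
     * If src_alpha == 0, then dst' == dst and the pixel cannot change the
     * colorbuffer. blend_discard_if_src_alpha_0() matches exactly this factor
     * combination (srcRGB = srcA = SRC_ALPHA, dstRGB = dstA = INV_SRC_ALPHA),
     * which is what R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0 enables in hardware. */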
/* Create a new blend state based on the CSO blend state.
*
* This encompasses alpha blending, logic/raster ops, and blend dithering. */
@@ -66,7 +181,11 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
( r300_translate_blend_factor(srcRGB) << R300_SRC_BLEND_SHIFT) |
( r300_translate_blend_factor(dstRGB) << R300_DST_BLEND_SHIFT);
- /* optimization: some operations do not require the destination color */
+ /* Optimization: some operations do not require the destination color.
+ *
+ * When SRC_ALPHA_SATURATE is used, colorbuffer reads must be enabled,
+ * otherwise blending gives incorrect results. It seems to be
+ * a hardware bug. */
if (eqRGB == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MIN ||
eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MAX ||
dstRGB != PIPE_BLENDFACTOR_ZERO ||
@@ -78,11 +197,81 @@ static void* r300_create_blend_state(struct pipe_context* pipe,
srcA == PIPE_BLENDFACTOR_DST_COLOR ||
srcA == PIPE_BLENDFACTOR_DST_ALPHA ||
srcA == PIPE_BLENDFACTOR_INV_DST_COLOR ||
- srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA)
+ srcA == PIPE_BLENDFACTOR_INV_DST_ALPHA ||
+ srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) {
+ /* Enable reading from the colorbuffer. */
blend->blend_control |= R300_READ_ENABLE;
- /* XXX implement the optimization with DISCARD_SRC_PIXELS*/
- /* XXX implement the optimization with SRC_ALPHA_?_NO_READ */
+ if (r300_screen(r300_context(pipe)->context.screen)->caps->is_r500) {
+ /* Optimization: Depending on incoming pixels, we can
+ * conditionally disable the reading in hardware... */
+ if (eqRGB != PIPE_BLEND_MIN && eqA != PIPE_BLEND_MIN &&
+ eqRGB != PIPE_BLEND_MAX && eqA != PIPE_BLEND_MAX) {
+ /* Disable reading if SRC_ALPHA == 0. */
+ if ((dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (dstA == PIPE_BLENDFACTOR_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ZERO)) {
+ blend->blend_control |= R500_SRC_ALPHA_0_NO_READ;
+ }
+
+ /* Disable reading if SRC_ALPHA == 1. */
+ if ((dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstRGB == PIPE_BLENDFACTOR_ZERO) &&
+ (dstA == PIPE_BLENDFACTOR_INV_SRC_COLOR ||
+ dstA == PIPE_BLENDFACTOR_INV_SRC_ALPHA ||
+ dstA == PIPE_BLENDFACTOR_ZERO)) {
+ blend->blend_control |= R500_SRC_ALPHA_1_NO_READ;
+ }
+ }
+ }
+ }
+
+ /* Optimization: discard pixels which don't change the colorbuffer.
+ *
+ * The code below is non-trivial and some math is involved.
+ *
+ * Discarding pixels must be disabled when FP16 AA is enabled.
+ * This is a hardware bug. Also, this implementation wouldn't work
+ * with FP blending enabled and equation clamping disabled.
+ *
+ * Equations other than ADD are rarely used and therefore won't be
+ * optimized. */
+ if ((eqRGB == PIPE_BLEND_ADD || eqRGB == PIPE_BLEND_REVERSE_SUBTRACT) &&
+ (eqA == PIPE_BLEND_ADD || eqA == PIPE_BLEND_REVERSE_SUBTRACT)) {
+ /* ADD: X+Y
+ * REVERSE_SUBTRACT: Y-X
+ *
+ * The idea is:
+ * If X = src*srcFactor = 0 and dstFactor = 1 (so Y = dst),
+ * then the colorbuffer will not be changed.
+ *
+ * Given the srcFactor and dstFactor variables, we can derive
+ * what src and dst should be equal to and discard appropriate
+ * pixels.
+ */
+ if (blend_discard_if_src_alpha_0(srcRGB, srcA, dstRGB, dstA)) {
+ blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+ } else if (blend_discard_if_src_alpha_1(srcRGB, srcA,
+ dstRGB, dstA)) {
+ blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1;
+ } else if (blend_discard_if_src_color_0(srcRGB, srcA,
+ dstRGB, dstA)) {
+ blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_0;
+ } else if (blend_discard_if_src_color_1(srcRGB, srcA,
+ dstRGB, dstA)) {
+ blend->blend_control |= R300_DISCARD_SRC_PIXELS_SRC_COLOR_1;
+ } else if (blend_discard_if_src_alpha_color_0(srcRGB, srcA,
+ dstRGB, dstA)) {
+ blend->blend_control |=
+ R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0;
+ } else if (blend_discard_if_src_alpha_color_1(srcRGB, srcA,
+ dstRGB, dstA)) {
+ blend->blend_control |=
+ R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1;
+ }
+ }
/* separate alpha */
if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
@@ -128,8 +317,8 @@ static void r300_bind_blend_state(struct pipe_context* pipe,
{
struct r300_context* r300 = r300_context(pipe);
- r300->blend_state = (struct r300_blend_state*)state;
- r300->dirty_state |= R300_NEW_BLEND;
+ r300->blend_state.state = state;
+ r300->blend_state.dirty = TRUE;
}
/* Free blend state. */
@@ -151,19 +340,24 @@ static void r300_set_blend_color(struct pipe_context* pipe,
const struct pipe_blend_color* color)
{
struct r300_context* r300 = r300_context(pipe);
+ struct r300_screen* r300screen = r300_screen(pipe->screen);
+ struct r300_blend_color_state* state =
+ (struct r300_blend_color_state*)r300->blend_color_state.state;
+ union util_color uc;
- util_pack_color(color->color, PIPE_FORMAT_A8R8G8B8_UNORM,
- &r300->blend_color_state->blend_color);
+ util_pack_color(color->color, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
+ state->blend_color = uc.ui;
/* XXX if FP16 blending is enabled, we should use the FP16 format */
- r300->blend_color_state->blend_color_red_alpha =
+ state->blend_color_red_alpha =
float_to_fixed10(color->color[0]) |
(float_to_fixed10(color->color[3]) << 16);
- r300->blend_color_state->blend_color_green_blue =
+ state->blend_color_green_blue =
float_to_fixed10(color->color[2]) |
(float_to_fixed10(color->color[1]) << 16);
- r300->dirty_state |= R300_NEW_BLEND_COLOR;
+ r300->blend_color_state.size = r300screen->caps->is_r500 ? 3 : 2;
+ r300->blend_color_state.dirty = TRUE;
}
static void r300_set_clip_state(struct pipe_context* pipe,
@@ -172,12 +366,15 @@ static void r300_set_clip_state(struct pipe_context* pipe,
struct r300_context* r300 = r300_context(pipe);
if (r300_screen(pipe->screen)->caps->has_tcl) {
- r300->clip_state = *state;
- r300->dirty_state |= R300_NEW_CLIP;
+ memcpy(r300->clip_state.state, state, sizeof(struct pipe_clip_state));
+ r300->clip_state.size = 29;
} else {
draw_flush(r300->draw);
draw_set_clip_state(r300->draw, state);
+ r300->clip_state.size = 2;
}
+
+ r300->clip_state.dirty = TRUE;
}
/* Create a new depth, stencil, and alpha state based on the CSO dsa state.
@@ -270,9 +467,11 @@ static void r300_bind_dsa_state(struct pipe_context* pipe,
void* state)
{
struct r300_context* r300 = r300_context(pipe);
+ struct r300_screen* r300screen = r300_screen(pipe->screen);
- r300->dsa_state = (struct r300_dsa_state*)state;
- r300->dirty_state |= R300_NEW_DSA;
+ r300->dsa_state.state = state;
+ r300->dsa_state.size = r300screen->caps->is_r500 ? 8 : 6;
+ r300->dsa_state.dirty = TRUE;
}
/* Free DSA state. */
@@ -282,18 +481,12 @@ static void r300_delete_dsa_state(struct pipe_context* pipe,
FREE(state);
}
-static void r300_set_edgeflags(struct pipe_context* pipe,
- const unsigned* bitfield)
-{
- /* XXX you know it's bad when i915 has this blank too */
- /* XXX and even worse, I have no idea WTF the bitfield is */
-}
-
static void
r300_set_framebuffer_state(struct pipe_context* pipe,
const struct pipe_framebuffer_state* state)
{
struct r300_context* r300 = r300_context(pipe);
+ uint32_t zbuffer_bpp = 0;
if (r300->draw) {
draw_flush(r300->draw);
@@ -301,7 +494,29 @@ static void
r300->framebuffer_state = *state;
+ /* Don't rely on the order of states being set for the first time. */
r300->dirty_state |= R300_NEW_FRAMEBUFFERS;
+
+ r300->blend_state.dirty = TRUE;
+ r300->dsa_state.dirty = TRUE;
+ r300->scissor_state.dirty = TRUE;
+
+ /* Polygon offset depends on the zbuffer bit depth. */
+ if (state->zsbuf && r300->polygon_offset_enabled) {
+ switch (util_format_get_blocksize(state->zsbuf->texture->format)) {
+ case 2:
+ zbuffer_bpp = 16;
+ break;
+ case 4:
+ zbuffer_bpp = 24;
+ break;
+ }
+
+ if (r300->zbuffer_bpp != zbuffer_bpp) {
+ r300->zbuffer_bpp = zbuffer_bpp;
+ r300->rs_state.dirty = TRUE;
+ }
+ }
}
/* Create fragment shader state. */
@@ -317,6 +532,7 @@ static void* r300_create_fs_state(struct pipe_context* pipe,
fs->state.tokens = tgsi_dup_tokens(shader->tokens);
tgsi_scan_shader(shader->tokens, &fs->info);
+ r300_shader_read_fs_inputs(&fs->info, &fs->inputs);
return (void*)fs;
}
@@ -330,11 +546,14 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
if (fs == NULL) {
r300->fs = NULL;
return;
- } else if (!fs->translated) {
- r300_translate_fragment_shader(r300, fs);
}
r300->fs = fs;
+ r300_pick_fragment_shader(r300);
+
+ if (r300->vs && r300_vertex_shader_setup_wpos(r300)) {
+ r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
+ }
r300->dirty_state |= R300_NEW_FRAGMENT_SHADER | R300_NEW_FRAGMENT_SHADER_CONSTANTS;
}
@@ -343,7 +562,14 @@ static void r300_bind_fs_state(struct pipe_context* pipe, void* shader)
static void r300_delete_fs_state(struct pipe_context* pipe, void* shader)
{
struct r300_fragment_shader* fs = (struct r300_fragment_shader*)shader;
- rc_constants_destroy(&fs->code.constants);
+ struct r300_fragment_shader_code *tmp, *ptr = fs->first;
+
+ while (ptr) {
+ tmp = ptr;
+ ptr = ptr->next;
+ rc_constants_destroy(&tmp->code.constants);
+ FREE(tmp);
+ }
FREE((void*)fs->state.tokens);
FREE(shader);
}
@@ -369,8 +595,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
/* Copy rasterizer state for Draw. */
rs->rs = *state;
- rs->enable_vte = !state->bypass_vs_clip_and_viewport;
-
#ifdef PIPE_ARCH_LITTLE_ENDIAN
rs->vap_control_status = R300_VC_NO_SWAP;
#else
@@ -382,8 +606,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
if (state->bypass_vs_clip_and_viewport ||
!r300_screen(pipe->screen)->caps->has_tcl) {
rs->vap_control_status |= R300_VAP_TCL_BYPASS;
- } else {
- rs->rs.bypass_vs_clip_and_viewport = TRUE;
}
rs->point_size = pack_float_16_6x(state->point_size) |
@@ -398,9 +620,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
rs->line_control = pack_float_16_6x(state->line_width) |
R300_GA_LINE_CNTL_END_TYPE_COMP;
- /* XXX I think there is something wrong with the polygon mode,
- * XXX re-test when r300g is in a better shape */
-
/* Enable polygon mode */
if (state->fill_cw != PIPE_POLYGON_MODE_FILL ||
state->fill_ccw != PIPE_POLYGON_MODE_FILL) {
@@ -453,10 +672,8 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
}
if (rs->polygon_offset_enable) {
- rs->depth_offset_front = rs->depth_offset_back =
- fui(state->offset_units);
- rs->depth_scale_front = rs->depth_scale_back =
- fui(state->offset_scale);
+ rs->depth_offset = state->offset_units;
+ rs->depth_scale = state->offset_scale;
}
if (state->line_stipple_enable) {
@@ -474,10 +691,6 @@ static void* r300_create_rs_state(struct pipe_context* pipe,
rs->color_control = R300_SHADE_MODEL_SMOOTH;
}
- if (!state->flatshade_first) {
- rs->color_control |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
- }
-
return (void*)rs;
}
@@ -492,12 +705,25 @@ static void r300_bind_rs_state(struct pipe_context* pipe, void* state)
draw_set_rasterizer_state(r300->draw, &rs->rs);
}
- r300->rs_state = rs;
+ if (rs) {
+ r300->tcl_bypass = rs->rs.bypass_vs_clip_and_viewport;
+ r300->polygon_offset_enabled = rs->rs.offset_cw || rs->rs.offset_ccw;
+ } else {
+ r300->tcl_bypass = FALSE;
+ r300->polygon_offset_enabled = FALSE;
+ }
+
+ r300->rs_state.state = rs;
+ r300->rs_state.dirty = TRUE;
+ /* XXX Why is this still needed, dammit!? */
+ r300->scissor_state.dirty = TRUE;
+ r300->viewport_state.dirty = TRUE;
+
/* XXX Clean these up when we move to atom emits */
- r300->dirty_state |= R300_NEW_RASTERIZER;
r300->dirty_state |= R300_NEW_RS_BLOCK;
- r300->dirty_state |= R300_NEW_SCISSOR;
- r300->dirty_state |= R300_NEW_VIEWPORT;
+ if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+ r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+ }
}
/* Free rasterizer state. */
@@ -513,6 +739,9 @@ static void*
struct r300_context* r300 = r300_context(pipe);
struct r300_sampler_state* sampler = CALLOC_STRUCT(r300_sampler_state);
int lod_bias;
+ union util_color uc;
+
+ sampler->state = *state;
sampler->filter0 |=
(r300_translate_wrap(state->wrap_s) << R300_TX_WRAP_S_SHIFT) |
@@ -521,7 +750,13 @@ static void*
sampler->filter0 |= r300_translate_tex_filters(state->min_img_filter,
state->mag_img_filter,
- state->min_mip_filter);
+ state->min_mip_filter,
+ state->max_anisotropy > 1.0);
+
+ /* Unfortunately, r300-r500 don't support floating-point mipmap LODs.
+ * We must pass these to the emit function to clamp them properly. */
+ sampler->min_lod = MAX2((unsigned)state->min_lod, 0);
+ sampler->max_lod = MAX2((unsigned)ceilf(state->max_lod), 0);
lod_bias = CLAMP((int)(state->lod_bias * 32), -(1 << 9), (1 << 9) - 1);
@@ -529,8 +764,8 @@ static void*
sampler->filter1 |= r300_anisotropy(state->max_anisotropy);
- util_pack_color(state->border_color, PIPE_FORMAT_A8R8G8B8_UNORM,
- &sampler->border_color);
+ util_pack_color(state->border_color, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
+ sampler->border_color = uc.ui;
/* R500-specific fixups and optimizations */
if (r300_screen(r300->context.screen)->caps->is_r500) {
@@ -559,6 +794,20 @@ static void r300_bind_sampler_states(struct pipe_context* pipe,
}
r300->sampler_count = count;
+
+ /* Pick a fragment shader based on the texture compare state. */
+ if (r300->fs && (r300->dirty_state & R300_ANY_NEW_SAMPLERS)) {
+ if (r300_pick_fragment_shader(r300)) {
+ r300->dirty_state |= R300_NEW_FRAGMENT_SHADER |
+ R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+ }
+ }
+}
+
+static void r300_lacks_vertex_textures(struct pipe_context* pipe,
+ unsigned count,
+ void** states)
+{
}
static void r300_delete_sampler_state(struct pipe_context* pipe, void* state)
@@ -571,6 +820,7 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
struct pipe_texture** texture)
{
struct r300_context* r300 = r300_context(pipe);
+ boolean is_r500 = r300_screen(r300->context.screen)->caps->is_r500;
int i;
/* XXX magic num */
@@ -578,13 +828,18 @@ static void r300_set_sampler_textures(struct pipe_context* pipe,
return;
}
- r300->context.flush(&r300->context, 0, NULL);
-
for (i = 0; i < count; i++) {
if (r300->textures[i] != (struct r300_texture*)texture[i]) {
pipe_texture_reference((struct pipe_texture**)&r300->textures[i],
texture[i]);
r300->dirty_state |= (R300_NEW_TEXTURE << i);
+
+ /* R300-specific - set the texrect factor in a fragment shader */
+ if (!is_r500 && r300->textures[i]->is_npot) {
+ /* XXX It would be nice to re-emit just 1 constant,
+ * XXX not all of them */
+ r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+ }
}
}
@@ -604,60 +859,51 @@ static void r300_set_scissor_state(struct pipe_context* pipe,
{
struct r300_context* r300 = r300_context(pipe);
- if (r300_screen(r300->context.screen)->caps->is_r500) {
- r300->scissor_state->scissor_top_left =
- (state->minx << R300_SCISSORS_X_SHIFT) |
- (state->miny << R300_SCISSORS_Y_SHIFT);
- r300->scissor_state->scissor_bottom_right =
- ((state->maxx - 1) << R300_SCISSORS_X_SHIFT) |
- ((state->maxy - 1) << R300_SCISSORS_Y_SHIFT);
- } else {
- /* Offset of 1440 in non-R500 chipsets. */
- r300->scissor_state->scissor_top_left =
- ((state->minx + 1440) << R300_SCISSORS_X_SHIFT) |
- ((state->miny + 1440) << R300_SCISSORS_Y_SHIFT);
- r300->scissor_state->scissor_bottom_right =
- (((state->maxx - 1) + 1440) << R300_SCISSORS_X_SHIFT) |
- (((state->maxy - 1) + 1440) << R300_SCISSORS_Y_SHIFT);
- }
+ memcpy(r300->scissor_state.state, state,
+ sizeof(struct pipe_scissor_state));
- r300->dirty_state |= R300_NEW_SCISSOR;
+ r300->scissor_state.dirty = TRUE;
}
static void r300_set_viewport_state(struct pipe_context* pipe,
const struct pipe_viewport_state* state)
{
struct r300_context* r300 = r300_context(pipe);
+ struct r300_viewport_state* viewport =
+ (struct r300_viewport_state*)r300->viewport_state.state;
/* Do the transform in HW. */
- r300->viewport_state->vte_control = R300_VTX_W0_FMT;
+ viewport->vte_control = R300_VTX_W0_FMT;
if (state->scale[0] != 1.0f) {
- r300->viewport_state->xscale = state->scale[0];
- r300->viewport_state->vte_control |= R300_VPORT_X_SCALE_ENA;
+ viewport->xscale = state->scale[0];
+ viewport->vte_control |= R300_VPORT_X_SCALE_ENA;
}
if (state->scale[1] != 1.0f) {
- r300->viewport_state->yscale = state->scale[1];
- r300->viewport_state->vte_control |= R300_VPORT_Y_SCALE_ENA;
+ viewport->yscale = state->scale[1];
+ viewport->vte_control |= R300_VPORT_Y_SCALE_ENA;
}
if (state->scale[2] != 1.0f) {
- r300->viewport_state->zscale = state->scale[2];
- r300->viewport_state->vte_control |= R300_VPORT_Z_SCALE_ENA;
+ viewport->zscale = state->scale[2];
+ viewport->vte_control |= R300_VPORT_Z_SCALE_ENA;
}
if (state->translate[0] != 0.0f) {
- r300->viewport_state->xoffset = state->translate[0];
- r300->viewport_state->vte_control |= R300_VPORT_X_OFFSET_ENA;
+ viewport->xoffset = state->translate[0];
+ viewport->vte_control |= R300_VPORT_X_OFFSET_ENA;
}
if (state->translate[1] != 0.0f) {
- r300->viewport_state->yoffset = state->translate[1];
- r300->viewport_state->vte_control |= R300_VPORT_Y_OFFSET_ENA;
+ viewport->yoffset = state->translate[1];
+ viewport->vte_control |= R300_VPORT_Y_OFFSET_ENA;
}
if (state->translate[2] != 0.0f) {
- r300->viewport_state->zoffset = state->translate[2];
- r300->viewport_state->vte_control |= R300_VPORT_Z_OFFSET_ENA;
+ viewport->zoffset = state->translate[2];
+ viewport->vte_control |= R300_VPORT_Z_OFFSET_ENA;
}
- r300->dirty_state |= R300_NEW_VIEWPORT;
+ r300->viewport_state.dirty = TRUE;
+ if (r300->fs && r300->fs->inputs.wpos != ATTR_UNUSED) {
+ r300->dirty_state |= R300_NEW_FRAGMENT_SHADER_CONSTANTS;
+ }
}
static void r300_set_vertex_buffers(struct pipe_context* pipe,
@@ -674,6 +920,24 @@ static void r300_set_vertex_buffers(struct pipe_context* pipe,
draw_flush(r300->draw);
draw_set_vertex_buffers(r300->draw, count, buffers);
}
+
+ r300->dirty_state |= R300_NEW_VERTEX_FORMAT;
+}
+
+static boolean r300_validate_aos(struct r300_context *r300)
+{
+ struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
+ struct pipe_vertex_element *velem = r300->vertex_element;
+ int i;
+
+ /* Check if formats and strides are aligned to the size of DWORD. */
+ for (i = 0; i < r300->vertex_element_count; i++) {
+ if (vbuf[velem[i].vertex_buffer_index].stride % 4 != 0 ||
+ util_format_get_blocksize(velem[i].src_format) % 4 != 0) {
+ return FALSE;
+ }
+ }
+ return TRUE;
}
static void r300_set_vertex_elements(struct pipe_context* pipe,
@@ -691,6 +955,12 @@ static void r300_set_vertex_elements(struct pipe_context* pipe,
draw_flush(r300->draw);
draw_set_vertex_elements(r300->draw, count, elements);
}
+
+ if (!r300_validate_aos(r300)) {
+ /* XXX We should fall back to using Draw. */
+ assert(0);
+ abort();
+ }
}
static void* r300_create_vs_state(struct pipe_context* pipe,
@@ -706,9 +976,6 @@ static void* r300_create_vs_state(struct pipe_context* pipe,
tgsi_scan_shader(shader->tokens, &vs->info);
- /* Appease Draw. */
- vs->draw = draw_create_vertex_shader(r300->draw, shader);
-
return (void*)vs;
} else {
return draw_create_vertex_shader(r300->draw, shader);
@@ -719,8 +986,6 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
{
struct r300_context* r300 = r300_context(pipe);
- draw_flush(r300->draw);
-
if (r300_screen(pipe->screen)->caps->has_tcl) {
struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
@@ -731,10 +996,16 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
r300_translate_vertex_shader(r300, vs);
}
- draw_bind_vertex_shader(r300->draw, vs->draw);
r300->vs = vs;
- r300->dirty_state |= R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS;
+ if (r300->fs) {
+ r300_vertex_shader_setup_wpos(r300);
+ }
+
+ r300->dirty_state |=
+ R300_NEW_VERTEX_SHADER | R300_NEW_VERTEX_SHADER_CONSTANTS |
+ R300_NEW_VERTEX_FORMAT;
} else {
+ draw_flush(r300->draw);
draw_bind_vertex_shader(r300->draw,
(struct draw_vertex_shader*)shader);
}
@@ -748,7 +1019,6 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
struct r300_vertex_shader* vs = (struct r300_vertex_shader*)shader;
rc_constants_destroy(&vs->code.constants);
- draw_delete_vertex_shader(r300->draw, vs->draw);
FREE((void*)vs->state.tokens);
FREE(shader);
} else {
@@ -759,22 +1029,22 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader)
static void r300_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct r300_context* r300 = r300_context(pipe);
void *mapped;
- if (buf == NULL || buf->buffer->size == 0 ||
- (mapped = pipe_buffer_map(pipe->screen, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ)) == NULL)
+ if (buf == NULL || buf->size == 0 ||
+ (mapped = pipe_buffer_map(pipe->screen, buf, PIPE_BUFFER_USAGE_CPU_READ)) == NULL)
{
r300->shader_constants[shader].count = 0;
return;
}
- assert((buf->buffer->size % 4 * sizeof(float)) == 0);
- memcpy(r300->shader_constants[shader].constants, mapped, buf->buffer->size);
- r300->shader_constants[shader].count = buf->buffer->size / (4 * sizeof(float));
- pipe_buffer_unmap(pipe->screen, buf->buffer);
+ assert((buf->size % (4 * sizeof(float))) == 0);
+ memcpy(r300->shader_constants[shader].constants, mapped, buf->size);
+ r300->shader_constants[shader].count = buf->size / (4 * sizeof(float));
+ pipe_buffer_unmap(pipe->screen, buf);
if (shader == PIPE_SHADER_VERTEX)
r300->dirty_state |= R300_NEW_VERTEX_SHADER_CONSTANTS;
@@ -798,8 +1068,6 @@ void r300_init_state_functions(struct r300_context* r300)
r300->context.bind_depth_stencil_alpha_state = r300_bind_dsa_state;
r300->context.delete_depth_stencil_alpha_state = r300_delete_dsa_state;
- r300->context.set_edgeflags = r300_set_edgeflags;
-
r300->context.set_framebuffer_state = r300_set_framebuffer_state;
r300->context.create_fs_state = r300_create_fs_state;
@@ -813,10 +1081,11 @@ void r300_init_state_functions(struct r300_context* r300)
r300->context.delete_rasterizer_state = r300_delete_rs_state;
r300->context.create_sampler_state = r300_create_sampler_state;
- r300->context.bind_sampler_states = r300_bind_sampler_states;
+ r300->context.bind_fragment_sampler_states = r300_bind_sampler_states;
+ r300->context.bind_vertex_sampler_states = r300_lacks_vertex_textures;
r300->context.delete_sampler_state = r300_delete_sampler_state;
- r300->context.set_sampler_textures = r300_set_sampler_textures;
+ r300->context.set_fragment_sampler_textures = r300_set_sampler_textures;
r300->context.set_scissor_state = r300_set_scissor_state;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 55430a9ad67..99c2720897f 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -1,5 +1,6 @@
/*
* Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -28,6 +29,7 @@
#include "r300_context.h"
#include "r300_fs.h"
#include "r300_screen.h"
+#include "r300_shader_semantics.h"
#include "r300_state_derived.h"
#include "r300_state_inlines.h"
#include "r300_vs.h"
@@ -47,8 +49,8 @@ struct r300_shader_derived_value {
unsigned r300_shader_key_hash(void* key) {
struct r300_shader_key* shader_key = (struct r300_shader_key*)key;
- unsigned vs = (unsigned)shader_key->vs;
- unsigned fs = (unsigned)shader_key->fs;
+ unsigned vs = (intptr_t)shader_key->vs;
+ unsigned fs = (intptr_t)shader_key->fs;
return (vs << 16) | (fs & 0xffff);
}
@@ -61,209 +63,152 @@ int r300_shader_key_compare(void* key1, void* key2) {
(shader_key1->fs == shader_key2->fs);
}
-/* Set up the vs_tab and routes. */
-static void r300_vs_tab_routes(struct r300_context* r300,
- struct r300_vertex_info* vformat)
+static void r300_draw_emit_attrib(struct r300_context* r300,
+ enum attrib_emit emit,
+ enum interp_mode interp,
+ int index)
{
- struct r300_screen* r300screen = r300_screen(r300->context.screen);
- struct vertex_info* vinfo = &vformat->vinfo;
- int* tab = vformat->vs_tab;
- boolean pos = FALSE, psize = FALSE, fog = FALSE;
- int i, texs = 0, cols = 0;
- struct tgsi_shader_info* info;
+ struct tgsi_shader_info* info = &r300->vs->info;
+ int output;
- if (r300screen->caps->has_tcl) {
- /* Use vertex shader to determine required routes. */
- info = &r300->vs->info;
+ output = draw_find_shader_output(r300->draw,
+ info->output_semantic_name[index],
+ info->output_semantic_index[index]);
+ draw_emit_vertex_attr(&r300->vertex_info->vinfo, emit, interp, output);
+}
+
+static void r300_draw_emit_all_attribs(struct r300_context* r300)
+{
+ struct r300_shader_semantics* vs_outputs = &r300->vs->outputs;
+ int i, gen_count;
+
+ /* Position. */
+ if (vs_outputs->pos != ATTR_UNUSED) {
+ r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
+ vs_outputs->pos);
} else {
- /* Use fragment shader to determine required routes. */
- info = &r300->fs->info;
+ assert(0);
}
- assert(info->num_inputs <= 16);
-
- if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
- {
- for (i = 0; i < info->num_inputs; i++) {
- switch (r300->vs->code.inputs[i]) {
- case TGSI_SEMANTIC_POSITION:
- pos = TRUE;
- tab[i] = 0;
- break;
- case TGSI_SEMANTIC_COLOR:
- tab[i] = 2 + cols;
- cols++;
- break;
- case TGSI_SEMANTIC_PSIZE:
- assert(psize == FALSE);
- psize = TRUE;
- tab[i] = 15;
- break;
- case TGSI_SEMANTIC_FOG:
- assert(fog == FALSE);
- fog = TRUE;
- /* Fall through */
- case TGSI_SEMANTIC_GENERIC:
- tab[i] = 6 + texs;
- texs++;
- break;
- default:
- debug_printf("r300: Unknown vertex input %d\n",
- info->input_semantic_name[i]);
- break;
- }
- }
+ /* Point size. */
+ if (vs_outputs->psize != ATTR_UNUSED) {
+ r300_draw_emit_attrib(r300, EMIT_1F_PSIZE, INTERP_POS,
+ vs_outputs->psize);
}
- else
- {
- /* Just copy vert attribs over as-is. */
- for (i = 0; i < info->num_inputs; i++) {
- tab[i] = i;
- }
- for (i = 0; i < info->num_outputs; i++) {
- switch (info->output_semantic_name[i]) {
- case TGSI_SEMANTIC_POSITION:
- pos = TRUE;
- break;
- case TGSI_SEMANTIC_COLOR:
- cols++;
- break;
- case TGSI_SEMANTIC_PSIZE:
- psize = TRUE;
- break;
- case TGSI_SEMANTIC_FOG:
- fog = TRUE;
- /* Fall through */
- case TGSI_SEMANTIC_GENERIC:
- texs++;
- break;
- default:
- debug_printf("r300: Unknown vertex output %d\n",
- info->output_semantic_name[i]);
- break;
- }
+ /* Colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (vs_outputs->color[i] != ATTR_UNUSED) {
+ r300_draw_emit_attrib(r300, EMIT_4F, INTERP_LINEAR,
+ vs_outputs->color[i]);
}
}
- /* XXX magic */
- assert(texs <= 8);
+ /* XXX Back-face colors. */
- /* Do the actual vertex_info setup.
- *
- * vertex_info has four uints of hardware-specific data in it.
- * vinfo.hwfmt[0] is R300_VAP_VTX_STATE_CNTL
- * vinfo.hwfmt[1] is R300_VAP_VSM_VTX_ASSM
- * vinfo.hwfmt[2] is R300_VAP_OUTPUT_VTX_FMT_0
- * vinfo.hwfmt[3] is R300_VAP_OUTPUT_VTX_FMT_1 */
-
- vinfo->hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */
-
- /* We need to add vertex position attribute only for SW TCL case,
- * for HW TCL case it could be generated by vertex shader */
- if (!pos && !r300screen->caps->has_tcl) {
- debug_printf("r300: Forcing vertex position attribute emit...\n");
- /* Make room for the position attribute
- * at the beginning of the tab. */
- for (i = 15; i > 0; i--) {
- tab[i] = tab[i-1];
+ /* Texture coordinates. */
+ gen_count = 0;
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (vs_outputs->generic[i] != ATTR_UNUSED) {
+ r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
+ vs_outputs->generic[i]);
+ gen_count++;
}
- tab[0] = 0;
}
- /* Position. */
- if (r300->draw) {
- draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
- draw_find_vs_output(r300->draw, TGSI_SEMANTIC_POSITION, 0));
+ /* Fog coordinates. */
+ if (vs_outputs->fog != ATTR_UNUSED) {
+ r300_draw_emit_attrib(r300, EMIT_4F, INTERP_PERSPECTIVE,
+ vs_outputs->fog);
+ gen_count++;
}
- vinfo->hwfmt[1] |= R300_INPUT_CNTL_POS;
- vinfo->hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
- /* Point size. */
- if (psize) {
- if (r300->draw) {
- draw_emit_vertex_attr(vinfo, EMIT_1F_PSIZE, INTERP_POS,
- draw_find_vs_output(r300->draw, TGSI_SEMANTIC_PSIZE, 0));
- }
- vinfo->hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
- }
+ /* XXX magic */
+ assert(gen_count <= 8);
+}
- /* Colors. */
- for (i = 0; i < cols; i++) {
- if (r300->draw) {
- draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR,
- draw_find_vs_output(r300->draw, TGSI_SEMANTIC_COLOR, i));
- }
- vinfo->hwfmt[1] |= R300_INPUT_CNTL_COLOR;
- vinfo->hwfmt[2] |= (R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i);
+/* Update the PSC tables. */
+static void r300_vertex_psc(struct r300_context* r300)
+{
+ struct r300_vertex_info *vformat = r300->vertex_info;
+ uint16_t type, swizzle;
+ enum pipe_format format;
+ unsigned i;
+ int identity[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+ int* stream_tab;
+
+ /* If TCL is bypassed, map vertex streams to equivalent VS output
+ * locations. */
+ if (r300->tcl_bypass) {
+ stream_tab = r300->vs->stream_loc_notcl;
+ } else {
+ stream_tab = identity;
}
- /* Init i right here, increment it if fog is enabled.
- * This gets around a double-increment problem. */
- i = 0;
+ /* Vertex shaders have no semantics on their inputs,
+ * so PSC should just route stuff based on the vertex elements,
+ * and not on attrib information. */
+ DBG(r300, DBG_DRAW, "r300: vs expects %d attribs, routing %d elements"
+ " in psc\n",
+ r300->vs->info.num_inputs,
+ r300->vertex_element_count);
- /* Fog. This is a special-cased texcoord. */
- if (fog) {
- i++;
- if (r300->draw) {
- draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
- draw_find_vs_output(r300->draw, TGSI_SEMANTIC_FOG, 0));
- }
- vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
- vinfo->hwfmt[3] |= (4 << (3 * i));
- }
+ for (i = 0; i < r300->vertex_element_count; i++) {
+ format = r300->vertex_element[i].src_format;
+
+ type = r300_translate_vertex_data_type(format) |
+ (stream_tab[i] << R300_DST_VEC_LOC_SHIFT);
+ swizzle = r300_translate_vertex_data_swizzle(format);
- /* Texcoords. */
- for (; i < texs; i++) {
- if (r300->draw) {
- draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
- draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
+ if (i & 1) {
+ vformat->vap_prog_stream_cntl[i >> 1] |= type << 16;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16;
+ } else {
+ vformat->vap_prog_stream_cntl[i >> 1] |= type;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle;
}
- vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
- vinfo->hwfmt[3] |= (4 << (3 * i));
}
- draw_compute_vertex_size(vinfo);
+ assert(i <= 15);
+
+ /* Set the last vector in the PSC. */
+ if (i) {
+ i -= 1;
+ }
+ vformat->vap_prog_stream_cntl[i >> 1] |=
+ (R300_LAST_VEC << (i & 1 ? 16 : 0));
}
-/* Update the PSC tables. */
-static void r300_vertex_psc(struct r300_context* r300,
- struct r300_vertex_info* vformat)
+/* Update the PSC tables for SW TCL, using Draw. */
+static void r300_swtcl_vertex_psc(struct r300_context* r300)
{
- struct r300_screen* r300screen = r300_screen(r300->context.screen);
+ struct r300_vertex_info *vformat = r300->vertex_info;
struct vertex_info* vinfo = &vformat->vinfo;
- int* tab = vformat->vs_tab;
uint16_t type, swizzle;
enum pipe_format format;
unsigned i, attrib_count;
+ int* vs_output_tab = r300->vs->stream_loc_notcl;
- /* Vertex shaders have no semantics on their inputs,
- * so PSC should just route stuff based on their info,
- * and not on attrib information. */
- if (r300screen->caps->has_tcl) {
- attrib_count = r300->vs->info.num_inputs;
- DBG(r300, DBG_DRAW, "r300: routing %d attribs in psc for vs\n",
- attrib_count);
- } else {
- attrib_count = vinfo->num_attribs;
- DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
- for (i = 0; i < attrib_count; i++) {
- DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
- " tab %d\n", vinfo->attrib[i].src_index,
- vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
- tab[i]);
- }
+ /* For each Draw attribute, route it to the fragment shader according
+ * to the vs_output_tab. */
+ attrib_count = vinfo->num_attribs;
+ DBG(r300, DBG_DRAW, "r300: attrib count: %d\n", attrib_count);
+ for (i = 0; i < attrib_count; i++) {
+ DBG(r300, DBG_DRAW, "r300: attrib: offset %d, interp %d, size %d,"
+ " vs_output_tab %d\n", vinfo->attrib[i].src_index,
+ vinfo->attrib[i].interp_mode, vinfo->attrib[i].emit,
+ vs_output_tab[i]);
}
for (i = 0; i < attrib_count; i++) {
/* Make sure we have a proper destination for our attribute. */
- assert(tab[i] != -1);
+ assert(vs_output_tab[i] != -1);
format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
/* Obtain the type of data in this attribute. */
type = r300_translate_vertex_data_type(format) |
- tab[i] << R300_DST_VEC_LOC_SHIFT;
+ vs_output_tab[i] << R300_DST_VEC_LOC_SHIFT;
/* Obtain the swizzle for this attribute. Note that the default
* swizzle in the hardware is not XYZW! */
@@ -272,12 +217,10 @@ static void r300_vertex_psc(struct r300_context* r300,
/* Add the attribute to the PSC table. */
if (i & 1) {
vformat->vap_prog_stream_cntl[i >> 1] |= type << 16;
-
vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 16;
} else {
- vformat->vap_prog_stream_cntl[i >> 1] |= type << 0;
-
- vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle << 0;
+ vformat->vap_prog_stream_cntl[i >> 1] |= type;
+ vformat->vap_prog_stream_cntl_ext[i >> 1] |= swizzle;
}
}
@@ -289,185 +232,213 @@ static void r300_vertex_psc(struct r300_context* r300,
(R300_LAST_VEC << (i & 1 ? 16 : 0));
}
-/* Set up the mappings from GB to US, for RS block. */
-static void r300_update_fs_tab(struct r300_context* r300,
- struct r300_vertex_info* vformat)
+static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr,
+ boolean swizzle_0001)
+{
+ rs->ip[id] |= R300_RS_COL_PTR(ptr);
+ if (swizzle_0001) {
+ rs->ip[id] |= R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+ } else {
+ rs->ip[id] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
+ }
+ rs->inst[id] |= R300_RS_INST_COL_ID(id);
+}
+
+static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
{
- struct tgsi_shader_info* info = &r300->fs->info;
- int i, cols = 0, texs = 0, cols_emitted = 0;
- int* tab = vformat->fs_tab;
+ rs->inst[id] |= R300_RS_INST_COL_CN_WRITE |
+ R300_RS_INST_COL_ADDR(fp_offset);
+}
- for (i = 0; i < 16; i++) {
- tab[i] = -1;
+static void r300_rs_tex(struct r300_rs_block* rs, int id, int ptr,
+ boolean swizzle_X001)
+{
+ if (swizzle_X001) {
+ rs->ip[id] |= R300_RS_TEX_PTR(ptr*4) |
+ R300_RS_SEL_S(R300_RS_SEL_C0) |
+ R300_RS_SEL_T(R300_RS_SEL_K0) |
+ R300_RS_SEL_R(R300_RS_SEL_K0) |
+ R300_RS_SEL_Q(R300_RS_SEL_K1);
+ } else {
+ rs->ip[id] |= R300_RS_TEX_PTR(ptr*4) |
+ R300_RS_SEL_S(R300_RS_SEL_C0) |
+ R300_RS_SEL_T(R300_RS_SEL_C1) |
+ R300_RS_SEL_R(R300_RS_SEL_C2) |
+ R300_RS_SEL_Q(R300_RS_SEL_C3);
}
+ rs->inst[id] |= R300_RS_INST_TEX_ID(id);
+}
- assert(info->num_inputs <= 16);
- for (i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- tab[i] = INTERP_LINEAR;
- cols++;
- break;
- case TGSI_SEMANTIC_POSITION:
- case TGSI_SEMANTIC_PSIZE:
- debug_printf("r300: Implementation error: Can't use "
- "pos attribs in fragshader yet!\n");
- /* Pass through for now */
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_GENERIC:
- tab[i] = INTERP_PERSPECTIVE;
- break;
- default:
- debug_printf("r300: Unknown vertex input %d\n",
- info->input_semantic_name[i]);
- break;
- }
+static void r300_rs_tex_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+ rs->inst[id] |= R300_RS_INST_TEX_CN_WRITE |
+ R300_RS_INST_TEX_ADDR(fp_offset);
+}
+
+static void r500_rs_col(struct r300_rs_block* rs, int id, int ptr,
+ boolean swizzle_0001)
+{
+ rs->ip[id] |= R500_RS_COL_PTR(ptr);
+ if (swizzle_0001) {
+ rs->ip[id] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+ } else {
+ rs->ip[id] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
}
+ rs->inst[id] |= R500_RS_INST_COL_ID(id);
+}
- /* Now that we know where everything is... */
- DBG(r300, DBG_DRAW, "r300: fp input count: %d\n", info->num_inputs);
- for (i = 0; i < info->num_inputs; i++) {
- switch (tab[i]) {
- case INTERP_LINEAR:
- DBG(r300, DBG_DRAW, "r300: attrib: "
- "stack offset %d, color, tab %d\n",
- i, cols_emitted);
- tab[i] = cols_emitted;
- cols_emitted++;
- break;
- case INTERP_PERSPECTIVE:
- DBG(r300, DBG_DRAW, "r300: attrib: "
- "stack offset %d, texcoord, tab %d\n",
- i, cols + texs);
- tab[i] = cols + texs;
- texs++;
- break;
- case -1:
- debug_printf("r300: Implementation error: Bad fp interp!\n");
- default:
- break;
- }
+static void r500_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+ rs->inst[id] |= R500_RS_INST_COL_CN_WRITE |
+ R500_RS_INST_COL_ADDR(fp_offset);
+}
+
+static void r500_rs_tex(struct r300_rs_block* rs, int id, int ptr,
+ boolean swizzle_X001)
+{
+ int rs_tex_comp = ptr*4;
+
+ if (swizzle_X001) {
+ rs->ip[id] |= R500_RS_SEL_S(rs_tex_comp) |
+ R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
+ R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
+ R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
+ } else {
+ rs->ip[id] |= R500_RS_SEL_S(rs_tex_comp) |
+ R500_RS_SEL_T(rs_tex_comp + 1) |
+ R500_RS_SEL_R(rs_tex_comp + 2) |
+ R500_RS_SEL_Q(rs_tex_comp + 3);
}
+ rs->inst[id] |= R500_RS_INST_TEX_ID(id);
+}
+static void r500_rs_tex_write(struct r300_rs_block* rs, int id, int fp_offset)
+{
+ rs->inst[id] |= R500_RS_INST_TEX_CN_WRITE |
+ R500_RS_INST_TEX_ADDR(fp_offset);
}
-/* Set up the RS block. This is the part of the chipset that actually does
- * the rasterization of vertices into fragments. This is also the part of the
- * chipset that locks up if any part of it is even slightly wrong. */
+/* Set up the RS block.
+ *
+ * This is the part of the chipset that actually does the rasterization
+ * of vertices into fragments. This is also the part of the chipset that
+ * locks up if any part of it is even slightly wrong. */
static void r300_update_rs_block(struct r300_context* r300,
- struct r300_rs_block* rs)
+ struct r300_shader_semantics* vs_outputs,
+ struct r300_shader_semantics* fs_inputs)
{
- struct tgsi_shader_info* info = &r300->fs->info;
- int col_count = 0, fp_offset = 0, i, tex_count = 0;
- int rs_tex_comp = 0;
+ struct r300_rs_block* rs = r300->rs_block;
+ int i, col_count = 0, tex_count = 0, fp_offset = 0;
+ void (*rX00_rs_col)(struct r300_rs_block*, int, int, boolean);
+ void (*rX00_rs_col_write)(struct r300_rs_block*, int, int);
+ void (*rX00_rs_tex)(struct r300_rs_block*, int, int, boolean);
+ void (*rX00_rs_tex_write)(struct r300_rs_block*, int, int);
+ boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+ vs_outputs->bcolor[1] != ATTR_UNUSED;
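+
+ /* fp_offset below is the fragment shader input register index
+ * (RS_INST_*_ADDR) that the next rasterized attribute is written to. */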
if (r300_screen(r300->context.screen)->caps->is_r500) {
- for (i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- rs->ip[col_count] |=
- R500_RS_COL_PTR(col_count) |
- R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
- col_count++;
- break;
- case TGSI_SEMANTIC_GENERIC:
- rs->ip[tex_count] |=
- R500_RS_SEL_S(rs_tex_comp) |
- R500_RS_SEL_T(rs_tex_comp + 1) |
- R500_RS_SEL_R(rs_tex_comp + 2) |
- R500_RS_SEL_Q(rs_tex_comp + 3);
- tex_count++;
- rs_tex_comp += 4;
- break;
- default:
- break;
- }
- }
+ rX00_rs_col = r500_rs_col;
+ rX00_rs_col_write = r500_rs_col_write;
+ rX00_rs_tex = r500_rs_tex;
+ rX00_rs_tex_write = r500_rs_tex_write;
+ } else {
+ rX00_rs_col = r300_rs_col;
+ rX00_rs_col_write = r300_rs_col_write;
+ rX00_rs_tex = r300_rs_tex;
+ rX00_rs_tex_write = r300_rs_tex_write;
+ }
- /* Rasterize at least one color, or bad things happen. */
- if ((col_count == 0) && (tex_count == 0)) {
- rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+ /* Rasterize colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+ vs_outputs->color[1] != ATTR_UNUSED) {
+ /* Always rasterize if it's written by the VS,
+ * otherwise it locks up. */
+ rX00_rs_col(rs, col_count, i, FALSE);
+
+ /* Write it to the FS input register if it's used by the FS. */
+ if (fs_inputs->color[i] != ATTR_UNUSED) {
+ rX00_rs_col_write(rs, col_count, fp_offset);
+ fp_offset++;
+ }
col_count++;
+ } else {
+ /* Skip the FS input register, leave it uninitialized. */
+ /* If we try to set it to (0,0,0,1), it will lock up. */
+ if (fs_inputs->color[i] != ATTR_UNUSED) {
+ fp_offset++;
+ }
}
+ }
- for (i = 0; i < tex_count; i++) {
- rs->inst[i] |= R500_RS_INST_TEX_ID(i) |
- R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_offset);
- fp_offset++;
+ /* Rasterize texture coordinates. */
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (vs_outputs->generic[i] != ATTR_UNUSED) {
+ /* Always rasterize if it's written by the VS,
+ * otherwise it locks up. */
+ rX00_rs_tex(rs, tex_count, tex_count, FALSE);
+
+ /* Write it to the FS input register if it's used by the FS. */
+ if (fs_inputs->generic[i] != ATTR_UNUSED) {
+ rX00_rs_tex_write(rs, tex_count, fp_offset);
+ fp_offset++;
+ }
+ tex_count++;
+ } else {
+ /* Skip the FS input register, leave it uninitialized. */
+ /* If we try to set it to (0,0,0,1), it will lock up. */
+ if (fs_inputs->generic[i] != ATTR_UNUSED) {
+ fp_offset++;
+ }
}
+ }
- for (i = 0; i < col_count; i++) {
- rs->inst[i] |= R500_RS_INST_COL_ID(i) |
- R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_offset);
+ /* Rasterize fog coordinates. */
+ if (vs_outputs->fog != ATTR_UNUSED) {
+ /* Always rasterize if it's written by the VS,
+ * otherwise it locks up. */
+ rX00_rs_tex(rs, tex_count, tex_count, TRUE);
+
+ /* Write it to the FS input register if it's used by the FS. */
+ if (fs_inputs->fog != ATTR_UNUSED) {
+ rX00_rs_tex_write(rs, tex_count, fp_offset);
fp_offset++;
}
+ tex_count++;
} else {
- for (i = 0; i < info->num_inputs; i++) {
- switch (info->input_semantic_name[i]) {
- case TGSI_SEMANTIC_COLOR:
- rs->ip[col_count] |=
- R300_RS_COL_PTR(col_count) |
- R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
- col_count++;
- break;
- case TGSI_SEMANTIC_GENERIC:
- rs->ip[tex_count] |=
- R300_RS_TEX_PTR(rs_tex_comp) |
- R300_RS_SEL_S(R300_RS_SEL_C0) |
- R300_RS_SEL_T(R300_RS_SEL_C1) |
- R300_RS_SEL_R(R300_RS_SEL_C2) |
- R300_RS_SEL_Q(R300_RS_SEL_C3);
- tex_count++;
- rs_tex_comp+=4;
- break;
- default:
- break;
- }
- }
-
- if (col_count == 0) {
- rs->ip[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
- }
-
- if (tex_count == 0) {
- rs->ip[0] |=
- R300_RS_SEL_S(R300_RS_SEL_K0) |
- R300_RS_SEL_T(R300_RS_SEL_K0) |
- R300_RS_SEL_R(R300_RS_SEL_K0) |
- R300_RS_SEL_Q(R300_RS_SEL_K1);
+ /* Skip the FS input register, leave it uninitialized. */
+ /* If we try to set it to (0,0,0,1), it will lock up. */
+ if (fs_inputs->fog != ATTR_UNUSED) {
+ fp_offset++;
}
+ }
- /* Rasterize at least one color, or bad things happen. */
- if ((col_count == 0) && (tex_count == 0)) {
- col_count++;
- }
+ /* Rasterize WPOS. */
+ /* If the FS doesn't need it, it's not written by the VS. */
+ if (fs_inputs->wpos != ATTR_UNUSED) {
+ rX00_rs_tex(rs, tex_count, tex_count, FALSE);
+ rX00_rs_tex_write(rs, tex_count, fp_offset);
- for (i = 0; i < tex_count; i++) {
- rs->inst[i] |= R300_RS_INST_TEX_ID(i) |
- R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_offset);
- fp_offset++;
- }
+ fp_offset++;
+ tex_count++;
+ }
- for (i = 0; i < col_count; i++) {
- rs->inst[i] |= R300_RS_INST_COL_ID(i) |
- R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_offset);
- fp_offset++;
- }
+ /* Rasterize at least one color, or bad things happen. */
+ if (col_count == 0 && tex_count == 0) {
+ rX00_rs_col(rs, 0, 0, TRUE);
+ col_count++;
}
- rs->count = (rs_tex_comp) | (col_count << R300_IC_COUNT_SHIFT) |
+ rs->count = (tex_count*4) | (col_count << R300_IC_COUNT_SHIFT) |
R300_HIRES_EN;
- rs->inst_count = MAX2(MAX2(col_count - 1, tex_count - 1), 0);
+ rs->inst_count = MAX3(col_count - 1, tex_count - 1, 0);
}
/* Update the vertex format. */
static void r300_update_derived_shader_state(struct r300_context* r300)
{
- /* struct r300_screen* r300screen = r300_screen(r300->context.screen); */
- struct r300_vertex_info* vformat;
- struct r300_rs_block* rs_block;
- int i;
+ struct r300_screen* r300screen = r300_screen(r300->context.screen);
/*
struct r300_shader_key* key;
@@ -495,32 +466,51 @@ static void r300_update_derived_shader_state(struct r300_context* r300)
(void*)key, (void*)value);
} */
- /* XXX This will be refactored ASAP. */
- vformat = CALLOC_STRUCT(r300_vertex_info);
- rs_block = CALLOC_STRUCT(r300_rs_block);
+ /* Reset structures */
+ memset(r300->rs_block, 0, sizeof(struct r300_rs_block));
+ memset(r300->vertex_info, 0, sizeof(struct r300_vertex_info));
+ memcpy(r300->vertex_info->vinfo.hwfmt, r300->vs->hwfmt, sizeof(uint)*4);
+
+ r300_update_rs_block(r300, &r300->vs->outputs, &r300->fs->inputs);
- for (i = 0; i < 16; i++) {
- vformat->vs_tab[i] = -1;
- vformat->fs_tab[i] = -1;
+ if (r300screen->caps->has_tcl) {
+ r300_vertex_psc(r300);
+ } else {
+ r300_draw_emit_all_attribs(r300);
+ draw_compute_vertex_size(&r300->vertex_info->vinfo);
+ r300_swtcl_vertex_psc(r300);
}
- r300_vs_tab_routes(r300, vformat);
- r300_vertex_psc(r300, vformat);
- r300_update_fs_tab(r300, vformat);
+ r300->dirty_state |= R300_NEW_RS_BLOCK;
+}
- r300_update_rs_block(r300, rs_block);
+static boolean r300_dsa_writes_depth_stencil(struct r300_dsa_state* dsa)
+{
+ /* We are interested only in the cases when a new depth or stencil value
+ * can be written and changed. */
+
+ /* We might optionally check for [Z func: never] and inspect the stencil
+ * state in a similar fashion, but it's not terribly important. */
+ return (dsa->z_buffer_control & R300_Z_WRITE_ENABLE) ||
+ (dsa->stencil_ref_mask & R300_STENCILWRITEMASK_MASK) ||
+ ((dsa->z_buffer_control & R500_STENCIL_REFMASK_FRONT_BACK) &&
+ (dsa->stencil_ref_bf & R300_STENCILWRITEMASK_MASK));
+}
- FREE(r300->vertex_info);
- FREE(r300->rs_block);
+static boolean r300_dsa_alpha_test_enabled(struct r300_dsa_state* dsa)
+{
+ /* We are interested only in the cases when alpha testing can kill
+ * a fragment. */
+ uint32_t af = dsa->alpha_function;
- r300->vertex_info = vformat;
- r300->rs_block = rs_block;
- r300->dirty_state |= (R300_NEW_VERTEX_FORMAT | R300_NEW_RS_BLOCK);
+ return (af & R300_FG_ALPHA_FUNC_ENABLE) &&
+ (af & R300_FG_ALPHA_FUNC_ALWAYS) != R300_FG_ALPHA_FUNC_ALWAYS;
}
static void r300_update_ztop(struct r300_context* r300)
{
- r300->ztop_state.z_buffer_top = R300_ZTOP_ENABLE;
+ struct r300_ztop_state* ztop_state =
+ (struct r300_ztop_state*)r300->ztop_state.state;
/* This is important enough that I felt it warranted a comment.
*
@@ -534,33 +524,45 @@ static void r300_update_ztop(struct r300_context* r300)
* The docs claim that for the first three cases, if no ZS writes happen,
* then ZTOP can be used.
*
+ * (3) will never apply since we do not support chroma-keyed operations.
+ * (4) will need to be re-examined (and this comment updated) if/when
+ * Hyper-Z becomes supported.
+ *
* Additionally, the following conditions require disabled ZTOP:
- * ~) Depth writes in fragment shader
- * ~) Outstanding occlusion queries
+ * 5) Depth writes in fragment shader
+ * 6) Outstanding occlusion queries
+ *
+ * This register causes stalls all the way from SC to CB when changed,
+ * but it is buffered on-chip so it does not hurt to write it if it has
+ * not changed.
*
* ~C.
*/
- if (r300->dsa_state->alpha_function) {
- r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
- } else if (r300->fs->info.uses_kill) {
- r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
- } else if (r300_fragment_shader_writes_depth(r300->fs)) {
- r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
- } else if (r300->query_current) {
- r300->ztop_state.z_buffer_top = R300_ZTOP_DISABLE;
+
+ /* ZS writes */
+ if (r300_dsa_writes_depth_stencil(r300->dsa_state.state) &&
+ (r300_dsa_alpha_test_enabled(r300->dsa_state.state) ||/* (1) */
+ r300->fs->info.uses_kill)) { /* (2) */
+ ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+ } else if (r300_fragment_shader_writes_depth(r300->fs)) { /* (5) */
+ ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+ } else if (r300->query_current) { /* (6) */
+ ztop_state->z_buffer_top = R300_ZTOP_DISABLE;
+ } else {
+ ztop_state->z_buffer_top = R300_ZTOP_ENABLE;
}
+
+ r300->ztop_state.dirty = TRUE;
}
void r300_update_derived_state(struct r300_context* r300)
{
/* XXX */
- if (TRUE || r300->dirty_state &
- (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER)) {
+ if (r300->dirty_state &
+ (R300_NEW_FRAGMENT_SHADER | R300_NEW_VERTEX_SHADER |
+ R300_NEW_VERTEX_FORMAT) || r300->rs_state.dirty) {
r300_update_derived_shader_state(r300);
}
- if (r300->dirty_state &
- (R300_NEW_DSA | R300_NEW_FRAGMENT_SHADER | R300_NEW_QUERY)) {
- r300_update_ztop(r300);
- }
+ r300_update_ztop(r300);
}
diff --git a/src/gallium/drivers/r300/r300_state_inlines.h b/src/gallium/drivers/r300/r300_state_inlines.h
index e6c1cb54dac..e2180b33b77 100644
--- a/src/gallium/drivers/r300/r300_state_inlines.h
+++ b/src/gallium/drivers/r300/r300_state_inlines.h
@@ -28,6 +28,8 @@
#include "pipe/p_format.h"
+#include "util/u_format.h"
+
#include "r300_reg.h"
/* Some maths. These should probably find their way to u_math, if needed. */
@@ -255,38 +257,37 @@ static INLINE uint32_t r300_translate_wrap(int wrap)
}
}
-static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip)
+static INLINE uint32_t r300_translate_tex_filters(int min, int mag, int mip,
+ int is_anisotropic)
{
uint32_t retval = 0;
- switch (min) {
+ if (is_anisotropic)
+ retval |= R300_TX_MIN_FILTER_ANISO | R300_TX_MAG_FILTER_ANISO;
+ else {
+ switch (min) {
case PIPE_TEX_FILTER_NEAREST:
retval |= R300_TX_MIN_FILTER_NEAREST;
break;
case PIPE_TEX_FILTER_LINEAR:
retval |= R300_TX_MIN_FILTER_LINEAR;
break;
- case PIPE_TEX_FILTER_ANISO:
- retval |= R300_TX_MIN_FILTER_ANISO;
- break;
default:
debug_printf("r300: Unknown texture filter %d\n", min);
assert(0);
break;
- }
- switch (mag) {
+ }
+ switch (mag) {
case PIPE_TEX_FILTER_NEAREST:
retval |= R300_TX_MAG_FILTER_NEAREST;
break;
case PIPE_TEX_FILTER_LINEAR:
retval |= R300_TX_MAG_FILTER_LINEAR;
break;
- case PIPE_TEX_FILTER_ANISO:
- retval |= R300_TX_MAG_FILTER_ANISO;
- break;
default:
debug_printf("r300: Unknown texture filter %d\n", mag);
assert(0);
break;
+ }
}
switch (mip) {
case PIPE_TEX_MIPFILTER_NONE:
@@ -443,20 +444,22 @@ static INLINE uint32_t r300_translate_gb_pipes(int pipe_count)
static INLINE unsigned pf_component_count(enum pipe_format format) {
unsigned count = 0;
- if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
- return count;
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0)) {
+ count++;
}
-
- if (pf_size_x(format)) {
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 1)) {
+ count++;
+ }
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 2)) {
count++;
}
- if (pf_size_y(format)) {
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 3)) {
count++;
}
- if (pf_size_z(format)) {
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_ZS, 0)) {
count++;
}
- if (pf_size_w(format)) {
+ if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_ZS, 1)) {
count++;
}
@@ -467,19 +470,23 @@ static INLINE unsigned pf_component_count(enum pipe_format format) {
static INLINE uint16_t
r300_translate_vertex_data_type(enum pipe_format format) {
uint32_t result = 0;
+ const struct util_format_description *desc;
unsigned components = pf_component_count(format);
- if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
+ desc = util_format_description(format);
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+ desc->layout != UTIL_FORMAT_LAYOUT_ARRAY) {
debug_printf("r300: Bad format %s in %s:%d\n", pf_name(format),
__FUNCTION__, __LINE__);
assert(0);
}
- switch (pf_type(format)) {
+ switch (desc->channel[0].type) {
/* Half-floats, floats, doubles */
- case PIPE_FORMAT_TYPE_FLOAT:
- switch (pf_size_x(format)) {
- case 4:
+ case UTIL_FORMAT_TYPE_FLOAT:
+ switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0)) {
+ case 32:
result = R300_DATA_TYPE_FLOAT_1 + (components - 1);
break;
default:
@@ -488,19 +495,15 @@ r300_translate_vertex_data_type(enum pipe_format format) {
assert(0);
}
break;
- /* Normalized unsigned ints */
- case PIPE_FORMAT_TYPE_UNORM:
- /* Normalized signed ints */
- case PIPE_FORMAT_TYPE_SNORM:
- /* Non-normalized unsigned ints */
- case PIPE_FORMAT_TYPE_USCALED:
- /* Non-normalized signed ints */
- case PIPE_FORMAT_TYPE_SSCALED:
- switch (pf_size_x(format)) {
- case 1:
+ /* Unsigned ints */
+ case UTIL_FORMAT_TYPE_UNSIGNED:
+ /* Signed ints */
+ case UTIL_FORMAT_TYPE_SIGNED:
+ switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0)) {
+ case 8:
result = R300_DATA_TYPE_BYTE;
break;
- case 2:
+ case 16:
if (components > 2) {
result = R300_DATA_TYPE_SHORT_4;
} else {
@@ -510,8 +513,8 @@ r300_translate_vertex_data_type(enum pipe_format format) {
default:
debug_printf("r300: Bad format %s in %s:%d\n",
pf_name(format), __FUNCTION__, __LINE__);
- debug_printf("r300: pf_size_x(format) == %d\n",
- pf_size_x(format));
+ debug_printf("r300: util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == %d\n",
+ util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0));
assert(0);
}
break;
@@ -521,12 +524,11 @@ r300_translate_vertex_data_type(enum pipe_format format) {
assert(0);
}
- if (pf_type(format) == PIPE_FORMAT_TYPE_SSCALED) {
+ if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
result |= R300_SIGNED;
- } else if (pf_type(format) == PIPE_FORMAT_TYPE_UNORM) {
+ }
+ if (desc->channel[0].normalized) {
result |= R300_NORMALIZE;
- } else if (pf_type(format) == PIPE_FORMAT_TYPE_SNORM) {
- result |= (R300_SIGNED | R300_NORMALIZE);
}
return result;
@@ -534,18 +536,38 @@ r300_translate_vertex_data_type(enum pipe_format format) {
static INLINE uint16_t
r300_translate_vertex_data_swizzle(enum pipe_format format) {
+ const struct util_format_description *desc = util_format_description(format);
+ unsigned swizzle[4], i;
- if (pf_layout(format) != PIPE_FORMAT_LAYOUT_RGBAZS) {
+ assert(format);
+
+ if (desc->layout != UTIL_FORMAT_LAYOUT_ARITH &&
+ desc->layout != UTIL_FORMAT_LAYOUT_ARRAY) {
debug_printf("r300: Bad format %s in %s:%d\n",
pf_name(format), __FUNCTION__, __LINE__);
return 0;
}
- return ((pf_swizzle_x(format) << R300_SWIZZLE_SELECT_X_SHIFT) |
- (pf_swizzle_y(format) << R300_SWIZZLE_SELECT_Y_SHIFT) |
- (pf_swizzle_z(format) << R300_SWIZZLE_SELECT_Z_SHIFT) |
- (pf_swizzle_w(format) << R300_SWIZZLE_SELECT_W_SHIFT) |
- (0xf << R300_WRITE_ENA_SHIFT));
+ /* Swizzles for 8-bit formats are in reversed order, not sure why. */
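+ /* The "3 - swizzle" below mirrors component selects 0..3 (X<->W,
+ * Y<->Z); constant selects (0/1/NONE) are passed through unchanged. */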
+ if (desc->channel[0].size == 8) {
+ for (i = 0; i < 4; i++) {
+ if (desc->swizzle[i] <= 3) {
+ swizzle[i] = 3 - desc->swizzle[i];
+ } else {
+ swizzle[i] = desc->swizzle[i];
+ }
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ swizzle[i] = desc->swizzle[i];
+ }
+ }
+
+ return ((swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
+ (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
+ (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
+ (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT) |
+ (0xf << R300_WRITE_ENA_SHIFT));
}
#endif /* R300_STATE_INLINES_H */
diff --git a/src/gallium/drivers/r300/r300_state_invariant.c b/src/gallium/drivers/r300/r300_state_invariant.c
index c07e6ae676d..97927acf1b4 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.c
+++ b/src/gallium/drivers/r300/r300_state_invariant.c
@@ -38,12 +38,12 @@ struct pipe_viewport_state r300_viewport_identity = {
*
* Note that eventually this should be empty, but it's useful for development
* and general unduplication of code. */
-void r300_emit_invariant_state(struct r300_context* r300)
+void r300_emit_invariant_state(struct r300_context* r300, void* state)
{
struct r300_capabilities* caps = r300_screen(r300->context.screen)->caps;
CS_LOCALS(r300);
- BEGIN_CS(24 + (caps->has_tcl ? 2: 0));
+ BEGIN_CS(14 + (caps->has_tcl ? 2: 0));
/*** Graphics Backend (GB) ***/
/* Various GB enables */
@@ -58,21 +58,14 @@ void r300_emit_invariant_state(struct r300_context* r300)
*/
/* Source of fog depth */
OUT_CS_REG(R300_GB_SELECT, R300_GB_FOG_SELECT_1_1_W);
- /* AA enable */
- OUT_CS_REG(R300_GB_AA_CONFIG, 0x0);
/*** Fog (FG) ***/
OUT_CS_REG(R300_FG_FOG_BLEND, 0x0);
OUT_CS_REG(R300_FG_FOG_COLOR_R, 0x0);
OUT_CS_REG(R300_FG_FOG_COLOR_G, 0x0);
OUT_CS_REG(R300_FG_FOG_COLOR_B, 0x0);
- OUT_CS_REG(R300_FG_DEPTH_SRC, 0x0);
- OUT_CS_REG(R300_US_W_FMT, 0x0);
/*** VAP ***/
- /* Max and min vertex index clamp. */
- OUT_CS_REG(R300_VAP_VF_MIN_VTX_INDX, 0x0);
- OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, 0xffffff);
/* Sign/normalize control */
OUT_CS_REG(R300_VAP_PSC_SGN_NORM_CNTL, R300_SGN_NORM_NO_ZERO);
/* TCL-only stuff */
@@ -84,15 +77,12 @@ void r300_emit_invariant_state(struct r300_context* r300)
END_CS;
/* XXX unsorted stuff from surface_fill */
- BEGIN_CS(60 + (caps->has_tcl ? 5 : 0) + (caps->is_r500 ? 4 : 0));
- /* Flush PVS. */
- OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0x0);
+ BEGIN_CS(44 + (caps->has_tcl ? 7 : 0) +
+ (caps->family >= CHIP_FAMILY_RV350 ? 4 : 0));
- OUT_CS_REG(R300_SE_VTE_CNTL, R300_VPORT_X_SCALE_ENA |
- R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
- R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
- R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT);
if (caps->has_tcl) {
+ /* Flushing PVS is required before the VAP_GB registers can be changed. */
+ OUT_CS_REG(R300_VAP_PVS_STATE_FLUSH_REG, 0);
OUT_CS_REG_SEQ(R300_VAP_GB_VERT_CLIP_ADJ, 4);
OUT_CS_32F(1.0);
OUT_CS_32F(1.0);
@@ -123,21 +113,17 @@ void r300_emit_invariant_state(struct r300_context* r300)
OUT_CS_REG(R300_SU_DEPTH_OFFSET, 0x00000000);
OUT_CS_REG(R300_SC_HYPERZ, 0x0000001C);
OUT_CS_REG(R300_SC_EDGERULE, 0x2DA49525);
- OUT_CS_REG(R300_RB3D_CCTL, 0x00000000);
OUT_CS_REG(R300_RB3D_AARESOLVE_CTL, 0x00000000);
- if (caps->is_r500) {
- OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x00000000);
- OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFFFFFFFF);
+
+ if (caps->family >= CHIP_FAMILY_RV350) {
+ OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
+ OUT_CS_REG(R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD, 0xFEFEFEFE);
}
- OUT_CS_REG(R300_ZB_FORMAT, 0x00000002);
- OUT_CS_REG(R300_ZB_ZCACHE_CTLSTAT, 0x00000003);
+
OUT_CS_REG(R300_ZB_BW_CNTL, 0x00000000);
OUT_CS_REG(R300_ZB_DEPTHCLEARVALUE, 0x00000000);
OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0x00000000);
OUT_CS_REG(R300_ZB_HIZ_PITCH, 0x00000000);
- OUT_CS_REG(R300_VAP_VTX_STATE_CNTL, 0x1);
- OUT_CS_REG(R300_VAP_VSM_VTX_ASSM, 0x405);
- OUT_CS_REG(R300_SE_VTE_CNTL, 0x0000043F);
/* XXX */
OUT_CS_REG(R300_SC_CLIP_RULE, 0xaaaa);
diff --git a/src/gallium/drivers/r300/r300_state_invariant.h b/src/gallium/drivers/r300/r300_state_invariant.h
index 05cff0d6dfe..5d1a9636545 100644
--- a/src/gallium/drivers/r300/r300_state_invariant.h
+++ b/src/gallium/drivers/r300/r300_state_invariant.h
@@ -25,6 +25,6 @@
struct r300_context;
-void r300_emit_invariant_state(struct r300_context* r300);
+void r300_emit_invariant_state(struct r300_context* r300, void* state);
#endif /* R300_STATE_INVARIANT_H */
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index aea25cf71dd..67bf8ce13fd 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -22,6 +22,7 @@
#include "pipe/p_screen.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -29,13 +30,28 @@
#include "r300_texture.h"
#include "r300_screen.h"
-static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
+#include "radeon_winsys.h"
+
+#define TILE_WIDTH 0
+#define TILE_HEIGHT 1
+
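+/* Micro-tile dimensions in pixels, indexed by log2 of the pixel size in
+ * bytes, by the micro-tiling mode and by TILE_WIDTH/TILE_HEIGHT. Zero
+ * entries mark unsupported combinations; macro-tiling multiplies the
+ * tile size by 8 (see r300_texture_get_tile_size). */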
+static const unsigned microblock_table[5][3][2] = {
+ /*linear tiled square-tiled */
+ {{32, 1}, {8, 4}, {0, 0}}, /* 8 bits per pixel */
+ {{16, 1}, {8, 2}, {4, 4}}, /* 16 bits per pixel */
+ {{ 8, 1}, {4, 2}, {0, 0}}, /* 32 bits per pixel */
+ {{ 4, 1}, {0, 0}, {2, 2}}, /* 64 bits per pixel */
+ {{ 2, 1}, {0, 0}, {0, 0}} /* 128 bits per pixel */
+};
+
+static void r300_setup_texture_state(struct r300_screen* screen, struct r300_texture* tex)
{
struct r300_texture_state* state = &tex->state;
struct pipe_texture *pt = &tex->tex;
+ boolean is_r500 = screen->caps->is_r500;
- state->format0 = R300_TX_WIDTH((pt->width[0] - 1) & 0x7ff) |
- R300_TX_HEIGHT((pt->height[0] - 1) & 0x7ff);
+ state->format0 = R300_TX_WIDTH((pt->width0 - 1) & 0x7ff) |
+ R300_TX_HEIGHT((pt->height0 - 1) & 0x7ff);
if (tex->is_npot) {
/* rectangles love this */
@@ -43,8 +59,7 @@ static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
state->format2 = (tex->pitch[0] - 1) & 0x1fff;
} else {
/* power of two textures (3D, mipmaps, and no pitch) */
- state->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf) |
- R300_TX_NUM_LEVELS(pt->last_level & 0xf);
+ state->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth0) & 0xf);
}
state->format1 = r300_translate_texformat(pt->format);
@@ -58,17 +73,17 @@ static void r300_setup_texture_state(struct r300_texture* tex, boolean is_r500)
/* large textures on r500 */
if (is_r500)
{
- if (pt->width[0] > 2048) {
+ if (pt->width0 > 2048) {
state->format2 |= R500_TXWIDTH_BIT11;
}
- if (pt->height[0] > 2048) {
+ if (pt->height0 > 2048) {
state->format2 |= R500_TXHEIGHT_BIT11;
}
}
- assert(is_r500 || (pt->width[0] <= 2048 && pt->height[0] <= 2048));
+ assert(is_r500 || (pt->width0 <= 2048 && pt->height0 <= 2048));
- debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
- pt->width[0], pt->height[0], pt->last_level);
+ SCREEN_DBG(screen, DBG_TEX, "r300: Set texture state (%dx%d, %d levels)\n",
+ pt->width0, pt->height0, pt->last_level);
}
unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
@@ -92,63 +107,101 @@ unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
}
/**
+ * Return the width (dim==TILE_WIDTH) or height (dim==TILE_HEIGHT) of one tile
+ * of the given texture.
+ */
+static unsigned r300_texture_get_tile_size(struct r300_texture* tex, int dim)
+{
+ unsigned pixsize, tile_size;
+
+ pixsize = util_format_get_blocksize(tex->tex.format);
+ tile_size = microblock_table[util_logbase2(pixsize)][tex->microtile][dim] *
+ (tex->macrotile == R300_BUFFER_TILED ? 8 : 1);
+
+ assert(tile_size);
+ return tile_size;
+}
+
+/**
* Return the stride, in bytes, of the texture images of the given texture
* at the given level.
*/
-unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+ struct r300_texture* tex, unsigned level)
{
+ unsigned tile_width, width;
+
if (tex->stride_override)
return tex->stride_override;
+ /* Check the level. */
if (level > tex->tex.last_level) {
- debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__,
- level, tex->tex.last_level);
+ SCREEN_DBG(screen, DBG_TEX, "%s: level (%u) > last_level (%u)\n",
+ __FUNCTION__, level, tex->tex.last_level);
return 0;
}
- return align(pf_get_stride(&tex->tex.block, tex->tex.width[level]), 32);
+ width = u_minify(tex->tex.width0, level);
+
+ if (!util_format_is_compressed(tex->tex.format)) {
+ tile_width = r300_texture_get_tile_size(tex, TILE_WIDTH);
+ width = align(width, tile_width);
+ return util_format_get_stride(tex->tex.format, width);
+ } else {
+ return align(util_format_get_stride(tex->tex.format, width), 32);
+ }
}
-static void r300_setup_miptree(struct r300_texture* tex)
+static unsigned r300_texture_get_nblocksy(struct r300_texture* tex,
+ unsigned level)
{
- struct pipe_texture* base = &tex->tex;
- int stride, size, layer_size;
- int i;
+ unsigned height, tile_height;
- for (i = 0; i <= base->last_level; i++) {
- if (i > 0) {
- base->width[i] = minify(base->width[i-1]);
- base->height[i] = minify(base->height[i-1]);
- base->depth[i] = minify(base->depth[i-1]);
- }
+ height = u_minify(tex->tex.height0, level);
+
+ if (!util_format_is_compressed(tex->tex.format)) {
+ tile_height = r300_texture_get_tile_size(tex, TILE_HEIGHT);
+ height = align(height, tile_height);
+ }
- base->nblocksx[i] = pf_get_nblocksx(&base->block, base->width[i]);
- base->nblocksy[i] = pf_get_nblocksy(&base->block, base->height[i]);
+ return util_format_get_nblocksy(tex->tex.format, height);
+}
+
+static void r300_setup_miptree(struct r300_screen* screen,
+ struct r300_texture* tex)
+{
+ struct pipe_texture* base = &tex->tex;
+ unsigned stride, size, layer_size, nblocksy, i;
+
+ SCREEN_DBG(screen, DBG_TEX, "r300: Making miptree for texture, format %s\n",
+ pf_name(base->format));
- stride = r300_texture_get_stride(tex, i);
- layer_size = stride * base->nblocksy[i];
+ for (i = 0; i <= base->last_level; i++) {
+ stride = r300_texture_get_stride(screen, tex, i);
+ nblocksy = r300_texture_get_nblocksy(tex, i);
+ layer_size = stride * nblocksy;
if (base->target == PIPE_TEXTURE_CUBE)
size = layer_size * 6;
else
- size = layer_size * base->depth[i];
+ size = layer_size * u_minify(base->depth0, i);
tex->offset[i] = align(tex->size, 32);
tex->size = tex->offset[i] + size;
tex->layer_size[i] = layer_size;
- tex->pitch[i] = stride / base->block.size;
+ tex->pitch[i] = stride / util_format_get_blocksize(base->format);
- debug_printf("r300: Texture miptree: Level %d "
- "(%dx%dx%d px, pitch %d bytes)\n",
- i, base->width[i], base->height[i], base->depth[i],
- stride);
+ SCREEN_DBG(screen, DBG_TEX, "r300: Texture miptree: Level %d "
+ "(%dx%dx%d px, pitch %d bytes) %d bytes total\n",
+ i, u_minify(base->width0, i), u_minify(base->height0, i),
+ u_minify(base->depth0, i), stride, tex->size);
}
}
static void r300_setup_flags(struct r300_texture* tex)
{
- tex->is_npot = !util_is_power_of_two(tex->tex.width[0]) ||
- !util_is_power_of_two(tex->tex.height[0]);
+ tex->is_npot = !util_is_power_of_two(tex->tex.width0) ||
+ !util_is_power_of_two(tex->tex.height0);
}
/* Create a new texture. */
@@ -157,6 +210,8 @@ static struct pipe_texture*
const struct pipe_texture* template)
{
struct r300_texture* tex = CALLOC_STRUCT(r300_texture);
+ struct r300_screen* rscreen = r300_screen(screen);
+ struct radeon_winsys* winsys = (struct radeon_winsys*)screen->winsys;
if (!tex) {
return NULL;
@@ -167,12 +222,16 @@ static struct pipe_texture*
tex->tex.screen = screen;
r300_setup_flags(tex);
- r300_setup_miptree(tex);
- r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
+ r300_setup_miptree(rscreen, tex);
+ r300_setup_texture_state(rscreen, tex);
- tex->buffer = screen->buffer_create(screen, 1024,
+ tex->buffer = screen->buffer_create(screen, 2048,
PIPE_BUFFER_USAGE_PIXEL,
tex->size);
+ winsys->buffer_set_tiling(winsys, tex->buffer,
+ tex->pitch[0],
+ tex->microtile != R300_BUFFER_LINEAR,
+ tex->macrotile != R300_BUFFER_LINEAR);
if (!tex->buffer) {
FREE(tex);
@@ -208,8 +267,8 @@ static struct pipe_surface* r300_get_tex_surface(struct pipe_screen* screen,
pipe_reference_init(&surface->reference, 1);
pipe_texture_reference(&surface->texture, texture);
surface->format = texture->format;
- surface->width = texture->width[level];
- surface->height = texture->height[level];
+ surface->width = u_minify(texture->width0, level);
+ surface->height = u_minify(texture->height0, level);
surface->offset = offset;
surface->usage = flags;
surface->zslice = zslice;
@@ -234,10 +293,11 @@ static struct pipe_texture*
struct pipe_buffer* buffer)
{
struct r300_texture* tex;
+ struct r300_screen* rscreen = r300_screen(screen);
/* Support only 2D textures without mipmaps */
if (base->target != PIPE_TEXTURE_2D ||
- base->depth[0] != 1 ||
+ base->depth0 != 1 ||
base->last_level != 0) {
return NULL;
}
@@ -252,10 +312,10 @@ static struct pipe_texture*
tex->tex.screen = screen;
tex->stride_override = *stride;
- tex->pitch[0] = *stride / base->block.size;
+ tex->pitch[0] = *stride / util_format_get_blocksize(base->format);
r300_setup_flags(tex);
- r300_setup_texture_state(tex, r300_screen(screen)->caps->is_r500);
+ r300_setup_texture_state(rscreen, tex);
pipe_buffer_reference(&tex->buffer, buffer);
@@ -287,10 +347,9 @@ r300_video_surface_create(struct pipe_screen *screen,
template.target = PIPE_TEXTURE_2D;
template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
template.last_level = 0;
- template.width[0] = util_next_power_of_two(width);
- template.height[0] = util_next_power_of_two(height);
- template.depth[0] = 1;
- pf_get_block(template.format, &template.block);
+ template.width0 = util_next_power_of_two(width);
+ template.height0 = util_next_power_of_two(height);
+ template.depth0 = 1;
template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER |
PIPE_TEXTURE_USAGE_RENDER_TARGET;
@@ -323,7 +382,8 @@ void r300_init_screen_texture_functions(struct pipe_screen* screen)
screen->video_surface_destroy= r300_video_surface_destroy;
}
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+ struct pipe_texture* texture,
struct pipe_buffer** buffer,
unsigned* stride)
{
@@ -335,7 +395,7 @@ boolean r300_get_texture_buffer(struct pipe_texture* texture,
pipe_buffer_reference(buffer, tex->buffer);
if (stride) {
- *stride = r300_texture_get_stride(tex, 0);
+ *stride = r300_texture_get_stride(r300_screen(screen), tex, 0);
}
return TRUE;
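
To make the stride arithmetic above easier to follow, here is a self-contained sketch of the uncompressed path of r300_texture_get_stride. The table values and the 8x macrotile multiplier are copied from microblock_table and r300_texture_get_tile_size above; the helper names (stride_bytes, align_to, log2u) are illustrative and not part of the driver.

    #include <assert.h>

    enum microtiling { LINEAR = 0, TILED = 1, SQUARE_TILED = 2 };

    /* Width (index 0) and height (index 1) of one microtile in pixels,
     * indexed by log2(bytes per pixel) and the microtiling mode. */
    static const unsigned microblock[5][3][2] = {
        {{32, 1}, {8, 4}, {0, 0}},  /*   8 bpp */
        {{16, 1}, {8, 2}, {4, 4}},  /*  16 bpp */
        {{ 8, 1}, {4, 2}, {0, 0}},  /*  32 bpp */
        {{ 4, 1}, {0, 0}, {2, 2}},  /*  64 bpp */
        {{ 2, 1}, {0, 0}, {0, 0}},  /* 128 bpp */
    };

    static unsigned log2u(unsigned x)
    {
        unsigned r = 0;
        while (x >>= 1)
            r++;
        return r;
    }

    static unsigned align_to(unsigned v, unsigned a)
    {
        return (v + a - 1) / a * a;
    }

    /* Stride in bytes of one level of an uncompressed, possibly tiled texture. */
    static unsigned stride_bytes(unsigned width_px, unsigned bytes_per_pixel,
                                 enum microtiling micro, int macrotiled)
    {
        unsigned tile_w = microblock[log2u(bytes_per_pixel)][micro][0] *
                          (macrotiled ? 8 : 1);

        assert(tile_w);
        return align_to(width_px, tile_w) * bytes_per_pixel;
    }

For example, a 300-pixel-wide, 32 bpp, microtiled and macrotiled level gives tile_w = 4 * 8 = 32, the width aligns up to 320, and the stride is 1280 bytes.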
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 55ceb1a5136..1be1e6843c2 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -31,7 +31,8 @@ struct r300_texture;
void r300_init_screen_texture_functions(struct pipe_screen* screen);
-unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level);
+unsigned r300_texture_get_stride(struct r300_screen* screen,
+ struct r300_texture* tex, unsigned level);
unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
unsigned zslice, unsigned face);
@@ -115,7 +116,8 @@ r300_video_surface(struct pipe_video_surface *pvs)
#ifndef R300_WINSYS_H
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+ struct pipe_texture* texture,
struct pipe_buffer** buffer,
unsigned* stride);
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 589f1984ee3..a792c2cf989 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -120,7 +120,7 @@ static unsigned translate_opcode(unsigned opcode)
/* case TGSI_OPCODE_NOT: return RC_OPCODE_NOT; */
/* case TGSI_OPCODE_TRUNC: return RC_OPCODE_TRUNC; */
/* case TGSI_OPCODE_SHL: return RC_OPCODE_SHL; */
- /* case TGSI_OPCODE_SHR: return RC_OPCODE_SHR; */
+ /* case TGSI_OPCODE_ISHR: return RC_OPCODE_SHR; */
/* case TGSI_OPCODE_AND: return RC_OPCODE_AND; */
/* case TGSI_OPCODE_OR: return RC_OPCODE_OR; */
/* case TGSI_OPCODE_MOD: return RC_OPCODE_MOD; */
@@ -190,10 +190,10 @@ static void transform_dstreg(
struct rc_dst_register * dst,
struct tgsi_full_dst_register * src)
{
- dst->File = translate_register_file(src->DstRegister.File);
- dst->Index = translate_register_index(ttr, src->DstRegister.File, src->DstRegister.Index);
- dst->WriteMask = src->DstRegister.WriteMask;
- dst->RelAddr = src->DstRegister.Indirect;
+ dst->File = translate_register_file(src->Register.File);
+ dst->Index = translate_register_index(ttr, src->Register.File, src->Register.Index);
+ dst->WriteMask = src->Register.WriteMask;
+ dst->RelAddr = src->Register.Indirect;
}
static void transform_srcreg(
@@ -201,18 +201,19 @@ static void transform_srcreg(
struct rc_src_register * dst,
struct tgsi_full_src_register * src)
{
- dst->File = translate_register_file(src->SrcRegister.File);
- dst->Index = translate_register_index(ttr, src->SrcRegister.File, src->SrcRegister.Index);
- dst->RelAddr = src->SrcRegister.Indirect;
+ dst->File = translate_register_file(src->Register.File);
+ dst->Index = translate_register_index(ttr, src->Register.File, src->Register.Index);
+ dst->RelAddr = src->Register.Indirect;
dst->Swizzle = tgsi_util_get_full_src_register_swizzle(src, 0);
dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 1) << 3;
dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 2) << 6;
dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 3) << 9;
- dst->Abs = src->SrcRegisterExtMod.Absolute;
- dst->Negate = src->SrcRegister.Negate ? RC_MASK_XYZW : 0;
+ dst->Abs = src->Register.Absolute;
+ dst->Negate = src->Register.Negate ? RC_MASK_XYZW : 0;
}
-static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_ext_texture src)
+static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_texture src,
+ uint32_t *shadowSamplers)
{
switch(src.Texture) {
case TGSI_TEXTURE_1D:
@@ -233,14 +234,17 @@ static void transform_texture(struct rc_instruction * dst, struct tgsi_instructi
case TGSI_TEXTURE_SHADOW1D:
dst->U.I.TexSrcTarget = RC_TEXTURE_1D;
dst->U.I.TexShadow = 1;
+ *shadowSamplers |= 1 << dst->U.I.TexSrcUnit;
break;
case TGSI_TEXTURE_SHADOW2D:
dst->U.I.TexSrcTarget = RC_TEXTURE_2D;
dst->U.I.TexShadow = 1;
+ *shadowSamplers |= 1 << dst->U.I.TexSrcUnit;
break;
case TGSI_TEXTURE_SHADOWRECT:
dst->U.I.TexSrcTarget = RC_TEXTURE_RECT;
dst->U.I.TexShadow = 1;
+ *shadowSamplers |= 1 << dst->U.I.TexSrcUnit;
break;
}
}
@@ -258,17 +262,19 @@ static void transform_instruction(struct tgsi_to_rc * ttr, struct tgsi_full_inst
dst->U.I.SaturateMode = translate_saturate(src->Instruction.Saturate);
if (src->Instruction.NumDstRegs)
- transform_dstreg(ttr, &dst->U.I.DstReg, &src->FullDstRegisters[0]);
+ transform_dstreg(ttr, &dst->U.I.DstReg, &src->Dst[0]);
for(i = 0; i < src->Instruction.NumSrcRegs; ++i) {
- if (src->FullSrcRegisters[i].SrcRegister.File == TGSI_FILE_SAMPLER)
- dst->U.I.TexSrcUnit = src->FullSrcRegisters[i].SrcRegister.Index;
+ if (src->Src[i].Register.File == TGSI_FILE_SAMPLER)
+ dst->U.I.TexSrcUnit = src->Src[i].Register.Index;
else
- transform_srcreg(ttr, &dst->U.I.SrcReg[i], &src->FullSrcRegisters[i]);
+ transform_srcreg(ttr, &dst->U.I.SrcReg[i], &src->Src[i]);
}
/* Texturing. */
- transform_texture(dst, src->InstructionExtTexture);
+ if (src->Instruction.Texture)
+ transform_texture(dst, src->Texture,
+ &ttr->compiler->Program.ShadowSamplers);
}
static void handle_immediate(struct tgsi_to_rc * ttr, struct tgsi_full_immediate * imm)
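
The shadowSamplers updates above accumulate a per-unit bitmask so later passes know which texture units need depth-compare handling. A minimal standalone sketch of the same pattern (the names here are illustrative, not the compiler's API):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t shadow_samplers = 0;

        /* Mark units 1 and 3 as shadow samplers, the way the translator
         * does when it sees a TGSI_TEXTURE_SHADOW* target. */
        shadow_samplers |= 1u << 1;
        shadow_samplers |= 1u << 3;

        /* Query a unit later, e.g. when emitting sampler state. */
        for (unsigned unit = 0; unit < 4; unit++)
            printf("unit %u: %s\n", unit,
                   (shadow_samplers & (1u << unit)) ? "shadow" : "plain");
        return 0;
    }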
diff --git a/src/gallium/drivers/r300/r300_vbo.c b/src/gallium/drivers/r300/r300_vbo.c
deleted file mode 100644
index a6a159667a3..00000000000
--- a/src/gallium/drivers/r300/r300_vbo.c
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* r300_vbo: Various helpers for emitting vertex buffers. Needs cleanup,
- * refactoring, etc. */
-
-#include "r300_vbo.h"
-
-#include "pipe/p_format.h"
-
-#include "r300_cs.h"
-#include "r300_context.h"
-#include "r300_state_inlines.h"
-#include "r300_reg.h"
-#include "r300_winsys.h"
-
-static INLINE void setup_vertex_attribute(struct r300_vertex_info *vinfo,
- struct pipe_vertex_element *vert_elem,
- unsigned attr_num)
-{
- uint16_t hw_fmt1, hw_fmt2;
-
- hw_fmt1 = r300_translate_vertex_data_type(vert_elem->src_format) |
- (attr_num << R300_DST_VEC_LOC_SHIFT);
- hw_fmt2 = r300_translate_vertex_data_swizzle(vert_elem->src_format);
-
- if (attr_num % 2 == 0)
- {
- vinfo->vap_prog_stream_cntl[attr_num >> 1] = hw_fmt1;
- vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] = hw_fmt2;
- }
- else
- {
- vinfo->vap_prog_stream_cntl[attr_num >> 1] |= hw_fmt1 << 16;
- vinfo->vap_prog_stream_cntl_ext[attr_num >> 1] |= hw_fmt2 << 16;
- }
-}
-
-static void finish_vertex_attribs_setup(struct r300_vertex_info *vinfo,
- unsigned attribs_num)
-{
- uint32_t last_vec_bit = (attribs_num % 2 == 0) ?
- (R300_LAST_VEC << 16) : R300_LAST_VEC;
-
- assert(attribs_num > 0 && attribs_num <= 16);
- vinfo->vap_prog_stream_cntl[(attribs_num - 1) >> 1] |= last_vec_bit;
-}
-
-void setup_vertex_attributes(struct r300_context *r300)
-{
- struct pipe_vertex_element *vert_elem;
- int i;
-
- for (i = 0; i < r300->vertex_element_count; i++) {
- vert_elem = &r300->vertex_element[i];
- setup_vertex_attribute(r300->vertex_info, vert_elem, i);
- }
-
- finish_vertex_attribs_setup(r300->vertex_info,
- r300->vertex_element_count);
-}
-
-static INLINE int get_buffer_offset(struct r300_context *r300,
- unsigned int buf_nr,
- unsigned int elem_offset)
-{
- return r300->vertex_buffer[buf_nr].buffer_offset + elem_offset;
-}
-#if 0
-/* XXX not called at all */
-static void setup_vertex_buffers(struct r300_context *r300)
-{
- struct pipe_vertex_element *vert_elem;
- int i;
-
- for (i = 0; i < r300->aos_count; i++)
- {
- vert_elem = &r300->vertex_element[i];
- /* XXX use translate module to convert the data */
- if (!format_is_supported(vert_elem->src_format,
- vert_elem->nr_components)) {
- assert(0);
- /*
- struct pipe_buffer *buf;
- const unsigned int max_index = r300->vertex_buffers[vert_elem->vertex_buffer_index].max_index;
- buf = pipe_buffer_create(r300->context.screen, 4, usage, vert_elem->nr_components * max_index * sizeof(float));
- */
- }
-
- if (get_buffer_offset(r300,
- vert_elem->vertex_buffer_index,
- vert_elem->src_offset) % 4) {
- /* XXX need to align buffer */
- assert(0);
- }
- }
-}
-#endif
-/* XXX these shouldn't be asserts since we can work around bad indexbufs */
-void setup_index_buffer(struct r300_context *r300,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize)
-{
- if (!r300->winsys->add_buffer(r300->winsys, indexBuffer,
- RADEON_GEM_DOMAIN_GTT, 0)) {
- assert(0);
- }
-
- if (!r300->winsys->validate(r300->winsys)) {
- assert(0);
- }
-}
diff --git a/src/gallium/drivers/r300/r300_vbo.h b/src/gallium/drivers/r300/r300_vbo.h
deleted file mode 100644
index 7afa75899cf..00000000000
--- a/src/gallium/drivers/r300/r300_vbo.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef R300_VBO_H
-#define R300_VBO_H
-
-struct r300_context;
-struct pipe_buffer;
-
-void setup_vertex_attributes(struct r300_context *r300);
-
-void setup_index_buffer(struct r300_context *r300,
- struct pipe_buffer* indexBuffer,
- unsigned indexSize);
-
-#endif
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index 74ef416dc14..9fbb830047f 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -1,5 +1,6 @@
/*
* Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -21,91 +22,299 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. */
#include "r300_vs.h"
+#include "r300_fs.h"
#include "r300_context.h"
+#include "r300_screen.h"
#include "r300_tgsi_to_rc.h"
+#include "r300_reg.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "radeon_compiler.h"
+#include "util/u_math.h"
-static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
+/* Convert info about VS output semantics into r300_shader_semantics. */
+static void r300_shader_read_vs_outputs(
+ struct tgsi_shader_info* info,
+ struct r300_shader_semantics* vs_outputs)
{
- struct r300_vertex_shader * vs = c->UserData;
- struct tgsi_shader_info* info = &vs->info;
- struct tgsi_parse_context parser;
- struct tgsi_full_declaration * decl;
- boolean pointsize = FALSE;
- int out_colors = 0;
- int colors = 0;
- int out_generic = 0;
- int generic = 0;
int i;
+ unsigned index;
- /* Fill in the input mapping */
- for (i = 0; i < info->num_inputs; i++)
- c->code->inputs[i] = i;
+ r300_shader_semantics_reset(vs_outputs);
- /* Fill in the output mapping */
for (i = 0; i < info->num_outputs; i++) {
+ index = info->output_semantic_index[i];
+
switch (info->output_semantic_name[i]) {
+ case TGSI_SEMANTIC_POSITION:
+ assert(index == 0);
+ vs_outputs->pos = i;
+ break;
+
case TGSI_SEMANTIC_PSIZE:
- pointsize = TRUE;
+ assert(index == 0);
+ vs_outputs->psize = i;
break;
+
case TGSI_SEMANTIC_COLOR:
- out_colors++;
+ assert(index < ATTR_COLOR_COUNT);
+ vs_outputs->color[index] = i;
break;
- case TGSI_SEMANTIC_FOG:
+
+ case TGSI_SEMANTIC_BCOLOR:
+ assert(index < ATTR_COLOR_COUNT);
+ vs_outputs->bcolor[index] = i;
+ break;
+
case TGSI_SEMANTIC_GENERIC:
- out_generic++;
+ assert(index < ATTR_GENERIC_COUNT);
+ vs_outputs->generic[index] = i;
+ break;
+
+ case TGSI_SEMANTIC_FOG:
+ assert(index == 0);
+ vs_outputs->fog = i;
break;
+
+ case TGSI_SEMANTIC_EDGEFLAG:
+ assert(index == 0);
+ fprintf(stderr, "r300 VP: cannot handle edgeflag output\n");
+ assert(0);
+ break;
+ default:
+ assert(0);
}
}
+}
- tgsi_parse_init(&parser, vs->state.tokens);
+static void r300_shader_vap_output_fmt(struct r300_vertex_shader* vs)
+{
+ struct r300_shader_semantics* vs_outputs = &vs->outputs;
+ uint32_t* hwfmt = vs->hwfmt;
+ int i, gen_count;
+ boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+ vs_outputs->bcolor[1] != ATTR_UNUSED;
- while (!tgsi_parse_end_of_tokens(&parser)) {
- tgsi_parse_token(&parser);
+ /* Do the actual VAP output format setup.
+ *
+ * vs->hwfmt holds four uints of hardware-specific data:
+ * hwfmt[0] is R300_VAP_VTX_STATE_CNTL
+ * hwfmt[1] is R300_VAP_VSM_VTX_ASSM
+ * hwfmt[2] is R300_VAP_OUTPUT_VTX_FMT_0
+ * hwfmt[3] is R300_VAP_OUTPUT_VTX_FMT_1 */
- if (parser.FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
- continue;
+ hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */
- decl = &parser.FullToken.FullDeclaration;
+ /* Position. */
+ if (vs_outputs->pos != ATTR_UNUSED) {
+ hwfmt[1] |= R300_INPUT_CNTL_POS;
+ hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
+ } else {
+ assert(0);
+ }
- if (decl->Declaration.File != TGSI_FILE_OUTPUT)
- continue;
+ /* Point size. */
+ if (vs_outputs->psize != ATTR_UNUSED) {
+ hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+ }
- switch (decl->Semantic.SemanticName) {
- case TGSI_SEMANTIC_POSITION:
- c->code->outputs[decl->DeclarationRange.First] = 0;
- break;
- case TGSI_SEMANTIC_PSIZE:
- c->code->outputs[decl->DeclarationRange.First] = 1;
- break;
- case TGSI_SEMANTIC_COLOR:
- c->code->outputs[decl->DeclarationRange.First] = 1 +
- (pointsize ? 1 : 0) +
- colors++;
- break;
- case TGSI_SEMANTIC_FOG:
- case TGSI_SEMANTIC_GENERIC:
- c->code->outputs[decl->DeclarationRange.First] = 1 +
- (pointsize ? 1 : 0) +
- out_colors +
- generic++;
- break;
- default:
- debug_printf("r300: vs: Bad semantic declaration %d\n",
- decl->Semantic.SemanticName);
- break;
+ /* Colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+ vs_outputs->color[1] != ATTR_UNUSED) {
+ hwfmt[1] |= R300_INPUT_CNTL_COLOR;
+ hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << i;
+ }
+ }
+
+ /* Back-face colors. */
+ if (any_bcolor_used) {
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ hwfmt[1] |= R300_INPUT_CNTL_COLOR;
+ hwfmt[2] |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT << (2+i);
+ }
+ }
+
+ /* Texture coordinates. */
+ gen_count = 0;
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (vs_outputs->generic[i] != ATTR_UNUSED) {
+ hwfmt[1] |= (R300_INPUT_CNTL_TC0 << gen_count);
+ hwfmt[3] |= (4 << (3 * gen_count));
+ gen_count++;
}
}
- tgsi_parse_free(&parser);
+ /* Fog coordinates. */
+ if (vs_outputs->fog != ATTR_UNUSED) {
+ hwfmt[1] |= (R300_INPUT_CNTL_TC0 << gen_count);
+ hwfmt[3] |= (4 << (3 * gen_count));
+ gen_count++;
+ }
+
+ /* XXX magic */
+ assert(gen_count <= 8);
+
+ /* WPOS. */
+ vs->wpos_tex_output = gen_count;
}
+/* Sets up stream mapping to equivalent VS outputs if TCL is bypassed
+ * or isn't present. */
+static void r300_stream_locations_notcl(
+ struct r300_shader_semantics* vs_outputs,
+ int* stream_loc)
+{
+ int i, tabi = 0, gen_count;
+ boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
+ vs_outputs->bcolor[1] != ATTR_UNUSED;
+
+ /* Position. */
+ stream_loc[tabi++] = 0;
+
+ /* Point size. */
+ if (vs_outputs->psize != ATTR_UNUSED) {
+ stream_loc[tabi++] = 1;
+ }
+
+ /* Colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (vs_outputs->color[i] != ATTR_UNUSED || any_bcolor_used ||
+ vs_outputs->color[1] != ATTR_UNUSED) {
+ stream_loc[tabi++] = 2 + i;
+ }
+ }
+
+ /* Back-face colors. */
+ if (any_bcolor_used) {
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ stream_loc[tabi++] = 4 + i;
+ }
+ }
+
+ /* Texture coordinates. */
+ gen_count = 0;
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (vs_outputs->generic[i] != ATTR_UNUSED) {
+ assert(tabi < 16);
+ stream_loc[tabi++] = 6 + gen_count;
+ gen_count++;
+ }
+ }
+
+ /* Fog coordinates. */
+ if (vs_outputs->fog != ATTR_UNUSED) {
+ assert(tabi < 16);
+ stream_loc[tabi++] = 6 + gen_count;
+ gen_count++;
+ }
+
+ /* WPOS. */
+ if (vs_outputs->wpos != ATTR_UNUSED) {
+ assert(tabi < 16);
+ stream_loc[tabi++] = 6 + gen_count;
+ gen_count++;
+ }
+
+ for (; tabi < 16;) {
+ stream_loc[tabi++] = -1;
+ }
+}
+
+static void set_vertex_inputs_outputs(struct r300_vertex_program_compiler * c)
+{
+ struct r300_vertex_shader * vs = c->UserData;
+ struct r300_shader_semantics* outputs = &vs->outputs;
+ struct tgsi_shader_info* info = &vs->info;
+ int i, reg = 0;
+ boolean any_bcolor_used = outputs->bcolor[0] != ATTR_UNUSED ||
+ outputs->bcolor[1] != ATTR_UNUSED;
+
+ /* Fill in the input mapping */
+ for (i = 0; i < info->num_inputs; i++)
+ c->code->inputs[i] = i;
+
+ /* Position. */
+ if (outputs->pos != ATTR_UNUSED) {
+ c->code->outputs[outputs->pos] = reg++;
+ } else {
+ assert(0);
+ }
+
+ /* Point size. */
+ if (outputs->psize != ATTR_UNUSED) {
+ c->code->outputs[outputs->psize] = reg++;
+ }
+
+ /* If we're writing back-facing colors, we need to send
+ * four colors to make front/back face color selection work.
+ * If the vertex program doesn't write all 4 colors, let's
+ * pretend it does by skipping an output index register so the
+ * colors get written into the appropriate output vectors.
+ */
+
+ /* Colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (outputs->color[i] != ATTR_UNUSED) {
+ c->code->outputs[outputs->color[i]] = reg++;
+ } else if (any_bcolor_used ||
+ outputs->color[1] != ATTR_UNUSED) {
+ reg++;
+ }
+ }
+
+ /* Back-face colors. */
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ if (outputs->bcolor[i] != ATTR_UNUSED) {
+ c->code->outputs[outputs->bcolor[i]] = reg++;
+ } else if (any_bcolor_used) {
+ reg++;
+ }
+ }
+
+ /* Texture coordinates. */
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ if (outputs->generic[i] != ATTR_UNUSED) {
+ c->code->outputs[outputs->generic[i]] = reg++;
+ }
+ }
+
+ /* Fog coordinates. */
+ if (outputs->fog != ATTR_UNUSED) {
+ c->code->outputs[outputs->fog] = reg++;
+ }
+
+ /* WPOS. */
+ if (outputs->wpos != ATTR_UNUSED) {
+ c->code->outputs[outputs->wpos] = reg++;
+ }
+}
+
+static void r300_insert_wpos(struct r300_vertex_program_compiler* c,
+ struct r300_shader_semantics* outputs)
+{
+ int i, lastOutput = 0;
+
+ /* Find the max output index. */
+ lastOutput = MAX2(lastOutput, outputs->psize);
+ for (i = 0; i < ATTR_COLOR_COUNT; i++) {
+ lastOutput = MAX2(lastOutput, outputs->color[i]);
+ lastOutput = MAX2(lastOutput, outputs->bcolor[i]);
+ }
+ for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
+ lastOutput = MAX2(lastOutput, outputs->generic[i]);
+ }
+ lastOutput = MAX2(lastOutput, outputs->fog);
+
+ /* Set WPOS after the last output. */
+ lastOutput++;
+ rc_copy_output(&c->Base, 0, lastOutput); /* out[lastOutput] = out[0]; */
+ outputs->wpos = lastOutput;
+}
void r300_translate_vertex_shader(struct r300_context* r300,
struct r300_vertex_shader* vs)
@@ -113,6 +322,9 @@ void r300_translate_vertex_shader(struct r300_context* r300,
struct r300_vertex_program_compiler compiler;
struct tgsi_to_rc ttr;
+ /* Initialize. */
+ r300_shader_read_vs_outputs(&vs->info, &vs->outputs);
+
/* Setup the compiler */
rc_init(&compiler.Base);
@@ -131,13 +343,19 @@ void r300_translate_vertex_shader(struct r300_context* r300,
r300_tgsi_to_rc(&ttr, vs->state.tokens);
- compiler.RequiredOutputs = ~(~0 << vs->info.num_outputs);
+ compiler.RequiredOutputs = ~(~0 << (vs->info.num_outputs+1));
compiler.SetHwInputOutput = &set_vertex_inputs_outputs;
+ /* Insert the WPOS output. */
+ r300_insert_wpos(&compiler, &vs->outputs);
+
+ r300_shader_vap_output_fmt(vs);
+ r300_stream_locations_notcl(&vs->outputs, vs->stream_loc_notcl);
+
/* Invoke the compiler */
r3xx_compile_vertex_program(&compiler);
if (compiler.Base.Error) {
- /* Todo: Fail gracefully */
+ /* XXX We should fall back to using Draw. */
fprintf(stderr, "r300 VP: Compiler error\n");
abort();
}
@@ -146,3 +364,30 @@ void r300_translate_vertex_shader(struct r300_context* r300,
rc_destroy(&compiler.Base);
vs->translated = TRUE;
}
+
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300)
+{
+ struct r300_vertex_shader* vs = r300->vs;
+ int tex_output = r300->vs->wpos_tex_output;
+ uint32_t tex_fmt = R300_INPUT_CNTL_TC0 << tex_output;
+ uint32_t* hwfmt = vs->hwfmt;
+
+ if (r300->fs->inputs.wpos != ATTR_UNUSED) {
+ /* Enable WPOS in VAP. */
+ if (!(hwfmt[1] & tex_fmt)) {
+ hwfmt[1] |= tex_fmt;
+ hwfmt[3] |= (4 << (3 * tex_output));
+
+ assert(tex_output < 8);
+ return TRUE;
+ }
+ } else {
+ /* Disable WPOS in VAP. */
+ if (hwfmt[1] & tex_fmt) {
+ hwfmt[1] &= ~tex_fmt;
+ hwfmt[3] &= ~(4 << (3 * tex_output));
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
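
The hwfmt[1]/hwfmt[3] updates above follow a fixed packing: each routed texture coordinate (generic, fog, or WPOS) sets the next R300_INPUT_CNTL_TCn bit in VAP_VSM_VTX_ASSM and a 3-bit field in VAP_OUTPUT_VTX_FMT_1, where the value 4 means a 4-component vector. A hedged sketch of just that packing; the INPUT_CNTL_TC0 value is assumed to match R300_INPUT_CNTL_TC0 from r300_reg.h:

    #include <stdint.h>

    #define INPUT_CNTL_TC0 0x00000400   /* assumed value of R300_INPUT_CNTL_TC0 */

    /* Route num_texcoords 4-component vectors through the VAP, the way
     * r300_shader_vap_output_fmt() does for generics, fog and WPOS. */
    static void pack_texcoords(uint32_t hwfmt[4], unsigned num_texcoords)
    {
        for (unsigned i = 0; i < num_texcoords && i < 8; i++) {
            hwfmt[1] |= INPUT_CNTL_TC0 << i;   /* enable texcoord input i */
            hwfmt[3] |= 4u << (3 * i);         /* texcoord i has 4 components */
        }
    }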
diff --git a/src/gallium/drivers/r300/r300_vs.h b/src/gallium/drivers/r300/r300_vs.h
index 2a4ce315e32..18cfeee3cd4 100644
--- a/src/gallium/drivers/r300/r300_vs.h
+++ b/src/gallium/drivers/r300/r300_vs.h
@@ -1,5 +1,6 @@
/*
* Copyright 2009 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Copyright 2009 Marek Olšák <maraeo@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -25,18 +26,25 @@
#include "pipe/p_state.h"
#include "tgsi/tgsi_scan.h"
-
#include "radeon_code.h"
+#include "r300_shader_semantics.h"
+
struct r300_context;
struct r300_vertex_shader {
/* Parent class */
struct pipe_shader_state state;
+
struct tgsi_shader_info info;
+ struct r300_shader_semantics outputs;
+ uint hwfmt[4];
+
+ /* Stream locations for SWTCL or if TCL is bypassed. */
+ int stream_loc_notcl[16];
- /* Fallback shader, because Draw has issues */
- struct draw_vertex_shader* draw;
+ /* Output stream location for WPOS. */
+ int wpos_tex_output;
/* Has this shader been translated yet? */
boolean translated;
@@ -45,10 +53,10 @@ struct r300_vertex_shader {
struct r300_vertex_program_code code;
};
-
-extern struct r300_vertex_program_code r300_passthrough_vertex_shader;
-
void r300_translate_vertex_shader(struct r300_context* r300,
struct r300_vertex_shader* vs);
+/* Return TRUE if VAP (hwfmt) needs to be re-emitted. */
+boolean r300_vertex_shader_setup_wpos(struct r300_context* r300);
+
#endif /* R300_VS_H */
diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h
index 864a6146b22..bdb8b54bab6 100644
--- a/src/gallium/drivers/r300/r300_winsys.h
+++ b/src/gallium/drivers/r300/r300_winsys.h
@@ -35,78 +35,13 @@ extern "C" {
#include "pipe/p_state.h"
#include "pipe/internal/p_winsys_screen.h"
-struct r300_winsys {
- /* Parent class */
- struct pipe_winsys base;
-
- /* Opaque Radeon-specific winsys object. */
- void* radeon_winsys;
-
- /* PCI ID */
- uint32_t pci_id;
-
- /* GB pipe count */
- uint32_t gb_pipes;
-
- /* Z pipe count (rv530 only) */
- uint32_t z_pipes;
-
- /* GART size. */
- uint32_t gart_size;
-
- /* VRAM size. */
- uint32_t vram_size;
-
- /* Add a pipe_buffer to the list of buffer objects to validate. */
- boolean (*add_buffer)(struct r300_winsys* winsys,
- struct pipe_buffer* pbuffer,
- uint32_t rd,
- uint32_t wd);
-
- /* Revalidate all currently setup pipe_buffers.
- * Returns TRUE if a flush is required. */
- boolean (*validate)(struct r300_winsys* winsys);
-
- /* Check to see if there's room for commands. */
- boolean (*check_cs)(struct r300_winsys* winsys, int size);
-
- /* Start a command emit. */
- void (*begin_cs)(struct r300_winsys* winsys,
- int size,
- const char* file,
- const char* function,
- int line);
-
- /* Write a dword to the command buffer. */
- void (*write_cs_dword)(struct r300_winsys* winsys, uint32_t dword);
-
- /* Write a relocated dword to the command buffer. */
- void (*write_cs_reloc)(struct r300_winsys* winsys,
- struct pipe_buffer* bo,
- uint32_t rd,
- uint32_t wd,
- uint32_t flags);
-
- /* Finish a command emit. */
- void (*end_cs)(struct r300_winsys* winsys,
- const char* file,
- const char* function,
- int line);
-
- /* Flush the CS. */
- void (*flush_cs)(struct r300_winsys* winsys);
-
- /* winsys flush - callback from winsys when flush required */
- void (*set_flush_cb)(struct r300_winsys *winsys,
- void (*flush_cb)(void *), void *data);
-
- void (*reset_bos)(struct r300_winsys *winsys);
-};
+#include "radeon_winsys.h"
struct pipe_context* r300_create_context(struct pipe_screen* screen,
- struct r300_winsys* r300_winsys);
+ struct radeon_winsys* radeon_winsys);
-boolean r300_get_texture_buffer(struct pipe_texture* texture,
+boolean r300_get_texture_buffer(struct pipe_screen* screen,
+ struct pipe_texture* texture,
struct pipe_buffer** buffer,
unsigned* stride);
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index bcb887a0b26..e4ac49fa85f 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -32,6 +32,7 @@ C_SOURCES = \
sp_tex_tile_cache.c \
sp_tile_cache.c \
sp_surface.c \
- sp_video_context.c
+ sp_video_context.c \
+ sp_winsys.c
include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index aac9edf44e6..3042e556c64 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -34,6 +34,7 @@ softpipe = env.ConvenienceLibrary(
'sp_texture.c',
'sp_tile_cache.c',
'sp_video_context.c',
+ 'sp_winsys.c'
])
Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index 8fac8e6e05f..5f130453c39 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,6 +36,7 @@
#include "util/u_pack_color.h"
#include "sp_clear.h"
#include "sp_context.h"
+#include "sp_query.h"
#include "sp_tile_cache.h"
@@ -48,12 +49,16 @@ softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
double depth, unsigned stencil)
{
struct softpipe_context *softpipe = softpipe_context(pipe);
+ union util_color uc;
unsigned cv;
uint i;
if (softpipe->no_rast)
return;
+ if (!softpipe_check_render_cond(softpipe))
+ return;
+
#if 0
softpipe_update_derived(softpipe); /* not needed?? */
#endif
@@ -62,12 +67,12 @@ softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
struct pipe_surface *ps = softpipe->framebuffer.cbufs[i];
- util_pack_color(rgba, ps->format, &cv);
- sp_tile_cache_clear(softpipe->cbuf_cache[i], rgba, cv);
+ util_pack_color(rgba, ps->format, &uc);
+ sp_tile_cache_clear(softpipe->cbuf_cache[i], rgba, uc.ui);
#if !TILE_CLEAR_OPTIMIZATION
/* non-cached surface */
- pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, cv);
+ pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui);
#endif
}
}
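
The clear path above switches from a raw unsigned to union util_color because util_pack_color() writes into that union; the packed 32-bit clear value is then read back as uc.ui. A minimal sketch of that usage, assuming the u_pack_color.h interface of this era:

    #include "pipe/p_format.h"
    #include "util/u_pack_color.h"

    static unsigned pack_clear_color(const float rgba[4], enum pipe_format fmt)
    {
        union util_color uc;

        /* Convert the float color into the surface's packed format;
         * for formats up to 32 bits the result is available as uc.ui. */
        util_pack_color(rgba, fmt, &uc);
        return uc.ui;
    }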
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index beeb6f6b459..73e075f0d88 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -89,14 +89,15 @@ softpipe_destroy( struct pipe_context *pipe )
if (softpipe->draw)
draw_destroy( softpipe->draw );
- softpipe->quad.shade->destroy( softpipe->quad.shade );
- softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
- softpipe->quad.blend->destroy( softpipe->quad.blend );
+ softpipe->quad.shade->destroy( softpipe->quad.shade );
+ softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+ softpipe->quad.blend->destroy( softpipe->quad.blend );
for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
pipe_surface_reference(&softpipe->framebuffer.cbufs[i], NULL);
}
+
sp_destroy_tile_cache(softpipe->zsbuf_cache);
pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
@@ -105,9 +106,14 @@ softpipe_destroy( struct pipe_context *pipe )
pipe_texture_reference(&softpipe->texture[i], NULL);
}
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ sp_destroy_tex_tile_cache(softpipe->vertex_tex_cache[i]);
+ pipe_texture_reference(&softpipe->vertex_textures[i], NULL);
+ }
+
for (i = 0; i < Elements(softpipe->constants); i++) {
- if (softpipe->constants[i].buffer) {
- pipe_buffer_reference(&softpipe->constants[i].buffer, NULL);
+ if (softpipe->constants[i]) {
+ pipe_buffer_reference(&softpipe->constants[i], NULL);
}
}
@@ -151,6 +157,11 @@ softpipe_is_texture_referenced( struct pipe_context *pipe,
softpipe->tex_cache[i]->texture == texture)
return PIPE_REFERENCED_FOR_READ;
}
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ if (softpipe->vertex_tex_cache[i] &&
+ softpipe->vertex_tex_cache[i]->texture == texture)
+ return PIPE_REFERENCED_FOR_READ;
+ }
return PIPE_UNREFERENCED;
}
@@ -164,6 +175,19 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
}
+static void
+softpipe_render_condition( struct pipe_context *pipe,
+ struct pipe_query *query,
+ uint mode )
+{
+ struct softpipe_context *softpipe = softpipe_context( pipe );
+
+ softpipe->render_cond_query = query;
+ softpipe->render_cond_mode = mode;
+}
+
+
+
struct pipe_context *
softpipe_create( struct pipe_screen *screen )
{
@@ -179,6 +203,7 @@ softpipe_create( struct pipe_screen *screen )
#endif
softpipe->dump_fs = debug_get_bool_option( "GALLIUM_DUMP_FS", FALSE );
+ softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
softpipe->pipe.winsys = screen->winsys;
softpipe->pipe.screen = screen;
@@ -190,7 +215,8 @@ softpipe_create( struct pipe_screen *screen )
softpipe->pipe.delete_blend_state = softpipe_delete_blend_state;
softpipe->pipe.create_sampler_state = softpipe_create_sampler_state;
- softpipe->pipe.bind_sampler_states = softpipe_bind_sampler_states;
+ softpipe->pipe.bind_fragment_sampler_states = softpipe_bind_sampler_states;
+ softpipe->pipe.bind_vertex_sampler_states = softpipe_bind_vertex_sampler_states;
softpipe->pipe.delete_sampler_state = softpipe_delete_sampler_state;
softpipe->pipe.create_depth_stencil_alpha_state = softpipe_create_depth_stencil_state;
@@ -209,13 +235,18 @@ softpipe_create( struct pipe_screen *screen )
softpipe->pipe.bind_vs_state = softpipe_bind_vs_state;
softpipe->pipe.delete_vs_state = softpipe_delete_vs_state;
+ softpipe->pipe.create_gs_state = softpipe_create_gs_state;
+ softpipe->pipe.bind_gs_state = softpipe_bind_gs_state;
+ softpipe->pipe.delete_gs_state = softpipe_delete_gs_state;
+
softpipe->pipe.set_blend_color = softpipe_set_blend_color;
softpipe->pipe.set_clip_state = softpipe_set_clip_state;
softpipe->pipe.set_constant_buffer = softpipe_set_constant_buffer;
softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;
softpipe->pipe.set_polygon_stipple = softpipe_set_polygon_stipple;
softpipe->pipe.set_scissor_state = softpipe_set_scissor_state;
- softpipe->pipe.set_sampler_textures = softpipe_set_sampler_textures;
+ softpipe->pipe.set_fragment_sampler_textures = softpipe_set_sampler_textures;
+ softpipe->pipe.set_vertex_sampler_textures = softpipe_set_vertex_sampler_textures;
softpipe->pipe.set_viewport_state = softpipe_set_viewport_state;
softpipe->pipe.set_vertex_buffers = softpipe_set_vertex_buffers;
@@ -224,8 +255,8 @@ softpipe_create( struct pipe_screen *screen )
softpipe->pipe.draw_arrays = softpipe_draw_arrays;
softpipe->pipe.draw_elements = softpipe_draw_elements;
softpipe->pipe.draw_range_elements = softpipe_draw_range_elements;
- softpipe->pipe.set_edgeflags = softpipe_set_edgeflags;
-
+ softpipe->pipe.draw_arrays_instanced = softpipe_draw_arrays_instanced;
+ softpipe->pipe.draw_elements_instanced = softpipe_draw_elements_instanced;
softpipe->pipe.clear = softpipe_clear;
softpipe->pipe.flush = softpipe_flush;
@@ -235,6 +266,8 @@ softpipe_create( struct pipe_screen *screen )
softpipe_init_query_funcs( softpipe );
+ softpipe->pipe.render_condition = softpipe_render_condition;
+
/*
* Alloc caches for accessing drawing surfaces and textures.
* Must be before quad stage setup!
@@ -245,7 +278,9 @@ softpipe_create( struct pipe_screen *screen )
for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
softpipe->tex_cache[i] = sp_create_tex_tile_cache( screen );
-
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ softpipe->vertex_tex_cache[i] = sp_create_tex_tile_cache(screen);
+ }
/* setup quad rendering stages */
softpipe->quad.shade = sp_quad_shade_stage(softpipe);
@@ -261,7 +296,7 @@ softpipe_create( struct pipe_screen *screen )
goto fail;
draw_texture_samplers(softpipe->draw,
- PIPE_MAX_SAMPLERS,
+ PIPE_MAX_VERTEX_SAMPLERS,
(struct tgsi_sampler **)
softpipe->tgsi.vert_samplers_list);
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index a735573d6fb..da673c57ada 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -53,25 +53,30 @@ struct softpipe_context {
/** Constant state objects */
struct pipe_blend_state *blend;
struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+ struct pipe_sampler_state *vertex_samplers[PIPE_MAX_VERTEX_SAMPLERS];
struct pipe_depth_stencil_alpha_state *depth_stencil;
struct pipe_rasterizer_state *rasterizer;
struct sp_fragment_shader *fs;
struct sp_vertex_shader *vs;
+ struct sp_geometry_shader *gs;
/** Other rendering state */
struct pipe_blend_color blend_color;
struct pipe_clip_state clip;
- struct pipe_constant_buffer constants[PIPE_SHADER_TYPES];
+ struct pipe_buffer *constants[PIPE_SHADER_TYPES];
struct pipe_framebuffer_state framebuffer;
struct pipe_poly_stipple poly_stipple;
struct pipe_scissor_state scissor;
struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+ struct pipe_texture *vertex_textures[PIPE_MAX_VERTEX_SAMPLERS];
struct pipe_viewport_state viewport;
struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
unsigned num_samplers;
unsigned num_textures;
+ unsigned num_vertex_samplers;
+ unsigned num_vertex_textures;
unsigned num_vertex_elements;
unsigned num_vertex_buffers;
@@ -111,6 +116,10 @@ struct softpipe_context {
unsigned line_stipple_counter;
+ /** Conditional query object and mode */
+ struct pipe_query *render_cond_query;
+ uint render_cond_mode;
+
/** Software quad rendering pipeline */
struct {
struct quad_stage *shade;
@@ -121,7 +130,7 @@ struct softpipe_context {
/** TGSI exec things */
struct {
- struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_SAMPLERS];
+ struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_VERTEX_SAMPLERS];
struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
} tgsi;
@@ -139,9 +148,11 @@ struct softpipe_context {
unsigned tex_timestamp;
struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+ struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
unsigned use_sse : 1;
unsigned dump_fs : 1;
+ unsigned dump_gs : 1;
unsigned no_rast : 1;
};
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index d4045816d03..03b58d2fb72 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -38,6 +38,7 @@
#include "util/u_prim.h"
#include "sp_context.h"
+#include "sp_query.h"
#include "sp_state.h"
#include "draw/draw_context.h"
@@ -48,22 +49,30 @@ static void
softpipe_map_constant_buffers(struct softpipe_context *sp)
{
struct pipe_winsys *ws = sp->pipe.winsys;
- uint i, size;
+ uint i, vssize, gssize;
for (i = 0; i < PIPE_SHADER_TYPES; i++) {
- if (sp->constants[i].buffer && sp->constants[i].buffer->size)
- sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer,
+ if (sp->constants[i] && sp->constants[i]->size)
+ sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i],
PIPE_BUFFER_USAGE_CPU_READ);
}
- if (sp->constants[PIPE_SHADER_VERTEX].buffer)
- size = sp->constants[PIPE_SHADER_VERTEX].buffer->size;
+ if (sp->constants[PIPE_SHADER_VERTEX])
+ vssize = sp->constants[PIPE_SHADER_VERTEX]->size;
else
- size = 0;
+ vssize = 0;
- draw_set_mapped_constant_buffer(sp->draw,
+ if (sp->constants[PIPE_SHADER_GEOMETRY])
+ gssize = sp->constants[PIPE_SHADER_GEOMETRY]->size;
+ else
+ gssize = 0;
+
+ draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX,
sp->mapped_constants[PIPE_SHADER_VERTEX],
- size);
+ vssize);
+ draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_GEOMETRY,
+ sp->mapped_constants[PIPE_SHADER_GEOMETRY],
+ gssize);
}
@@ -78,30 +87,53 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp)
*/
draw_flush(sp->draw);
- draw_set_mapped_constant_buffer(sp->draw, NULL, 0);
+ draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX, NULL, 0);
+ draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_GEOMETRY, NULL, 0);
- for (i = 0; i < 2; i++) {
- if (sp->constants[i].buffer && sp->constants[i].buffer->size)
- ws->buffer_unmap(ws, sp->constants[i].buffer);
+ for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+ if (sp->constants[i] && sp->constants[i]->size)
+ ws->buffer_unmap(ws, sp->constants[i]);
sp->mapped_constants[i] = NULL;
}
}
-boolean
+/**
+ * Draw vertex arrays, with optional indexing.
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off
+ * the drawing to the 'draw' module.
+ */
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned minIndex,
+ unsigned maxIndex,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount);
+
+
+void
softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
unsigned start, unsigned count)
{
- return softpipe_draw_elements(pipe, NULL, 0, mode, start, count);
+ softpipe_draw_range_elements_instanced(pipe,
+ NULL,
+ 0,
+ 0,
+ 0xffffffff,
+ mode,
+ start,
+ count,
+ 0,
+ 1);
}
-/**
- * Draw vertex arrays, with optional indexing.
- * Basically, map the vertex buffers (and drawing surfaces), then hand off
- * the drawing to the 'draw' module.
- */
-boolean
+void
softpipe_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -109,51 +141,142 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
unsigned max_index,
unsigned mode, unsigned start, unsigned count)
{
+ softpipe_draw_range_elements_instanced(pipe,
+ indexBuffer,
+ indexSize,
+ min_index,
+ max_index,
+ mode,
+ start,
+ count,
+ 0,
+ 1);
+}
+
+
+void
+softpipe_draw_elements(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode, unsigned start, unsigned count)
+{
+ softpipe_draw_range_elements_instanced(pipe,
+ indexBuffer,
+ indexSize,
+ 0,
+ 0xffffffff,
+ mode,
+ start,
+ count,
+ 0,
+ 1);
+}
+
+void
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount)
+{
+ softpipe_draw_range_elements_instanced(pipe,
+ NULL,
+ 0,
+ 0,
+ 0xffffffff,
+ mode,
+ start,
+ count,
+ startInstance,
+ instanceCount);
+}
+
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount)
+{
+ softpipe_draw_range_elements_instanced(pipe,
+ indexBuffer,
+ indexSize,
+ 0,
+ 0xffffffff,
+ mode,
+ start,
+ count,
+ startInstance,
+ instanceCount);
+}
+
+static void
+softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned minIndex,
+ unsigned maxIndex,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount)
+{
struct softpipe_context *sp = softpipe_context(pipe);
struct draw_context *draw = sp->draw;
unsigned i;
+ if (!softpipe_check_render_cond(sp))
+ return;
+
sp->reduced_api_prim = u_reduced_prim(mode);
- if (sp->dirty)
- softpipe_update_derived( sp );
+ if (sp->dirty) {
+ softpipe_update_derived(sp);
+ }
softpipe_map_transfers(sp);
softpipe_map_constant_buffers(sp);
- /*
- * Map vertex buffers
- */
+ /* Map vertex buffers */
for (i = 0; i < sp->num_vertex_buffers; i++) {
- void *buf
- = pipe_buffer_map(pipe->screen,
- sp->vertex_buffer[i].buffer,
- PIPE_BUFFER_USAGE_CPU_READ);
+ void *buf;
+
+ buf = pipe_buffer_map(pipe->screen,
+ sp->vertex_buffer[i].buffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
draw_set_mapped_vertex_buffer(draw, i, buf);
}
/* Map index buffer, if present */
if (indexBuffer) {
- void *mapped_indexes
- = pipe_buffer_map(pipe->screen, indexBuffer,
- PIPE_BUFFER_USAGE_CPU_READ);
- draw_set_mapped_element_buffer_range(draw, indexSize,
- min_index,
- max_index,
+ void *mapped_indexes;
+
+ mapped_indexes = pipe_buffer_map(pipe->screen,
+ indexBuffer,
+ PIPE_BUFFER_USAGE_CPU_READ);
+ draw_set_mapped_element_buffer_range(draw,
+ indexSize,
+ minIndex,
+ maxIndex,
mapped_indexes);
- }
- else {
+ } else {
/* no index/element buffer */
- draw_set_mapped_element_buffer_range(draw, 0, start,
- start + count - 1, NULL);
+ draw_set_mapped_element_buffer_range(draw,
+ 0,
+ start,
+ start + count - 1,
+ NULL);
}
/* draw! */
- draw_arrays(draw, mode, start, count);
+ draw_arrays_instanced(draw, mode, start, count, startInstance, instanceCount);
- /*
- * unmap vertex/index buffers - will cause draw module to flush
- */
+ /* unmap vertex/index buffers - will cause draw module to flush */
for (i = 0; i < sp->num_vertex_buffers; i++) {
draw_set_mapped_vertex_buffer(draw, i, NULL);
pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
@@ -163,32 +286,8 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
pipe_buffer_unmap(pipe->screen, indexBuffer);
}
-
/* Note: leave drawing surfaces mapped */
softpipe_unmap_constant_buffers(sp);
sp->dirty_render_cache = TRUE;
-
- return TRUE;
-}
-
-
-boolean
-softpipe_draw_elements(struct pipe_context *pipe,
- struct pipe_buffer *indexBuffer,
- unsigned indexSize,
- unsigned mode, unsigned start, unsigned count)
-{
- return softpipe_draw_range_elements( pipe, indexBuffer,
- indexSize,
- 0, 0xffffffff,
- mode, start, count );
-}
-
-
-void
-softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
-{
- struct softpipe_context *sp = softpipe_context(pipe);
- draw_set_edgeflags(sp->draw, edgeflags);
}
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index db293a51a9f..e8952bf4fb8 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -53,6 +53,9 @@ softpipe_flush( struct pipe_context *pipe,
for (i = 0; i < softpipe->num_textures; i++) {
sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
}
+ for (i = 0; i < softpipe->num_vertex_textures; i++) {
+ sp_flush_tex_tile_cache(softpipe->vertex_tex_cache[i]);
+ }
}
if (flags & PIPE_FLUSH_SWAPBUFFERS) {
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 4076114d392..27fa126b7c3 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -126,7 +126,10 @@ exec_run( const struct sp_fragment_shader *base,
setup_pos_vector(quad->posCoef,
(float)quad->input.x0, (float)quad->input.y0,
&machine->QuadPos);
-
+
+ /* convert 0 to 1.0 and 1 to -1.0 */
+ machine->Face = (float) (quad->input.facing * -2 + 1);
+
quad->inout.mask &= tgsi_exec_machine_run( machine );
if (quad->inout.mask == 0)
return FALSE;
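
The facing conversion above maps the rasterizer's front/back flag onto the value the TGSI FACE input expects: facing = 0 (front-facing) gives (float)(0 * -2 + 1) = 1.0f, and facing = 1 (back-facing) gives (float)(1 * -2 + 1) = -1.0f.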
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 43a0bd4415d..98c08eaffaf 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -543,7 +543,6 @@ sp_create_vbuf_backend(struct softpipe_context *sp)
assert(sp->draw);
-
cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 5cb17c5ae73..a981775cbd3 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -30,6 +30,7 @@
*/
#include "pipe/p_defines.h"
+#include "util/u_format.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_scan.h"
#include "sp_context.h"
@@ -650,6 +651,20 @@ static unsigned mask_count[16] =
+/** helper to get number of Z buffer bits */
+static unsigned
+get_depth_bits(struct quad_stage *qs)
+{
+ struct pipe_surface *zsurf = qs->softpipe->framebuffer.zsbuf;
+ if (zsurf)
+ return util_format_get_component_bits(zsurf->format,
+ UTIL_FORMAT_COLORSPACE_ZS, 0);
+ else
+ return 0;
+}
+
+
+
static void
depth_test_quads_fallback(struct quad_stage *qs,
struct quad_header *quads[],
@@ -665,8 +680,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
nr = alpha_test_quads(qs, quads, nr);
}
- if (qs->softpipe->framebuffer.zsbuf &&
- pf_get_component_bits(qs->softpipe->framebuffer.zsbuf->format, PIPE_FORMAT_COMP_Z) &&
+ if (get_depth_bits(qs) > 0 &&
(qs->softpipe->depth_stencil->depth.enabled ||
qs->softpipe->depth_stencil->stencil[0].enabled)) {
@@ -884,8 +898,7 @@ choose_depth_test(struct quad_stage *qs,
boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
- boolean depth = (qs->softpipe->framebuffer.zsbuf &&
- pf_get_component_bits(qs->softpipe->framebuffer.zsbuf->format, PIPE_FORMAT_COMP_Z) &&
+ boolean depth = (get_depth_bits(qs) > 0 &&
qs->softpipe->depth_stencil->depth.enabled);
unsigned depthfunc = qs->softpipe->depth_stencil->depth.func;
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 379cf4ad064..4ef5d9f7b1d 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -99,6 +99,32 @@ softpipe_get_query_result(struct pipe_context *pipe,
}
+/**
+ * Called by rendering functions to check whether the current render condition allows drawing.
+ * \return TRUE if we should render, FALSE if we should skip rendering
+ */
+boolean
+softpipe_check_render_cond(struct softpipe_context *sp)
+{
+ struct pipe_context *pipe = &sp->pipe;
+ boolean b, wait;
+ uint64_t result;
+
+ if (!sp->render_cond_query) {
+ return TRUE; /* no query predicate, draw normally */
+ }
+
+ wait = (sp->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+ sp->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT);
+
+ b = pipe->get_query_result(pipe, sp->render_cond_query, wait, &result);
+ if (b)
+ return result > 0;
+ else
+ return TRUE;
+}
+
+
void softpipe_init_query_funcs(struct softpipe_context *softpipe )
{
softpipe->pipe.create_query = softpipe_create_query;
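
softpipe_check_render_cond() above is the single gate for conditional rendering: the new early returns in sp_clear.c and sp_draw_arrays.c call it before doing any work and bail out when the bound occlusion query reports nothing visible. A shortened sketch of the calling pattern, assuming sp_context.h and sp_query.h are included:

    static void example_draw_path(struct softpipe_context *sp)
    {
        /* Skip all work if a render condition is bound and its query
         * result says the geometry is not visible. */
        if (!softpipe_check_render_cond(sp))
            return;

        /* ... map buffers, run the draw module, unmap, etc. ... */
    }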
diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h
index 05060a45759..736c033897e 100644
--- a/src/gallium/drivers/softpipe/sp_query.h
+++ b/src/gallium/drivers/softpipe/sp_query.h
@@ -32,6 +32,10 @@
#ifndef SP_QUERY_H
#define SP_QUERY_H
+extern boolean
+softpipe_check_render_cond(struct softpipe_context *sp);
+
+
struct softpipe_context;
extern void softpipe_init_query_funcs(struct softpipe_context * );
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 81fb7aa20c6..bd3532de4f4 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -58,7 +58,9 @@ softpipe_get_param(struct pipe_screen *screen, int param)
case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
return PIPE_MAX_SAMPLERS;
case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
- return PIPE_MAX_SAMPLERS;
+ return PIPE_MAX_VERTEX_SAMPLERS;
+ case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+ return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
case PIPE_CAP_NPOT_TEXTURES:
return 1;
case PIPE_CAP_TWO_SIDED_STENCIL:
@@ -143,6 +145,11 @@ softpipe_is_format_supported( struct pipe_screen *screen,
case PIPE_FORMAT_DXT3_RGBA:
case PIPE_FORMAT_DXT5_RGBA:
case PIPE_FORMAT_Z32_FLOAT:
+ case PIPE_FORMAT_R8G8_SNORM:
+ case PIPE_FORMAT_B6UG5SR5S_NORM:
+ case PIPE_FORMAT_X8UB8UG8SR8S_NORM:
+ case PIPE_FORMAT_A8B8G8R8_SNORM:
+ case PIPE_FORMAT_NONE:
return FALSE;
default:
return TRUE;
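
The screen now reports separate fragment and vertex sampler limits plus a combined total. A sketch of how a state tracker might query them through the standard get_param hook (print_sampler_limits() is illustrative only):

#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "util/u_debug.h"

/* Sketch: query the three sampler limits from any pipe_screen. */
static void
print_sampler_limits(struct pipe_screen *screen)
{
   int frag = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS);
   int vert = screen->get_param(screen, PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS);
   int both = screen->get_param(screen, PIPE_CAP_MAX_COMBINED_SAMPLERS);

   debug_printf("samplers: %d fragment + %d vertex, %d combined\n",
                frag, vert, both);
}
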
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 478d8efd1b2..f6c3a2b38b9 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -1267,7 +1267,7 @@ void sp_setup_prepare( struct setup_context *setup )
}
/* Note: nr_attrs is only used for debugging (vertex printing) */
- setup->nr_vertex_attrs = draw_num_vs_outputs(sp->draw);
+ setup->nr_vertex_attrs = draw_num_shader_outputs(sp->draw);
sp->quad.first->begin( sp->quad.first );
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 77ee3c1136b..7f244c4fd49 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -50,6 +50,7 @@
#define SP_NEW_VERTEX 0x1000
#define SP_NEW_VS 0x2000
#define SP_NEW_QUERY 0x4000
+#define SP_NEW_GS 0x8000
struct tgsi_sampler;
@@ -90,6 +91,11 @@ struct sp_vertex_shader {
int max_sampler; /* -1 if no samplers */
};
+/** Subclass of pipe_shader_state */
+struct sp_geometry_shader {
+ struct pipe_shader_state shader;
+ struct draw_geometry_shader *draw_data;
+};
void *
@@ -104,6 +110,10 @@ void *
softpipe_create_sampler_state(struct pipe_context *,
const struct pipe_sampler_state *);
void softpipe_bind_sampler_states(struct pipe_context *, unsigned, void **);
+void
+softpipe_bind_vertex_sampler_states(struct pipe_context *,
+ unsigned num_samplers,
+ void **samplers);
void softpipe_delete_sampler_state(struct pipe_context *, void *);
void *
@@ -129,7 +139,7 @@ void softpipe_set_clip_state( struct pipe_context *,
void softpipe_set_constant_buffer(struct pipe_context *,
uint shader, uint index,
- const struct pipe_constant_buffer *buf);
+ struct pipe_buffer *buf);
void *softpipe_create_fs_state(struct pipe_context *,
const struct pipe_shader_state *);
@@ -139,6 +149,10 @@ void *softpipe_create_vs_state(struct pipe_context *,
const struct pipe_shader_state *);
void softpipe_bind_vs_state(struct pipe_context *, void *);
void softpipe_delete_vs_state(struct pipe_context *, void *);
+void *softpipe_create_gs_state(struct pipe_context *,
+ const struct pipe_shader_state *);
+void softpipe_bind_gs_state(struct pipe_context *, void *);
+void softpipe_delete_gs_state(struct pipe_context *, void *);
void softpipe_set_polygon_stipple( struct pipe_context *,
const struct pipe_poly_stipple * );
@@ -150,6 +164,11 @@ void softpipe_set_sampler_textures( struct pipe_context *,
unsigned num,
struct pipe_texture ** );
+void
+softpipe_set_vertex_sampler_textures(struct pipe_context *,
+ unsigned num_textures,
+ struct pipe_texture **);
+
void softpipe_set_viewport_state( struct pipe_context *,
const struct pipe_viewport_state * );
@@ -165,14 +184,14 @@ void softpipe_set_vertex_buffers(struct pipe_context *,
void softpipe_update_derived( struct softpipe_context *softpipe );
-boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
- unsigned start, unsigned count);
+void softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode,
+ unsigned start, unsigned count);
-boolean softpipe_draw_elements(struct pipe_context *pipe,
- struct pipe_buffer *indexBuffer,
- unsigned indexSize,
- unsigned mode, unsigned start, unsigned count);
-boolean
+void softpipe_draw_elements(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode, unsigned start, unsigned count);
+void
softpipe_draw_range_elements(struct pipe_context *pipe,
struct pipe_buffer *indexBuffer,
unsigned indexSize,
@@ -181,8 +200,22 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
unsigned mode, unsigned start, unsigned count);
void
-softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
+softpipe_draw_arrays_instanced(struct pipe_context *pipe,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount);
+void
+softpipe_draw_elements_instanced(struct pipe_context *pipe,
+ struct pipe_buffer *indexBuffer,
+ unsigned indexSize,
+ unsigned mode,
+ unsigned start,
+ unsigned count,
+ unsigned startInstance,
+ unsigned instanceCount);
void
softpipe_map_transfers(struct softpipe_context *sp);
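
The header now declares instanced draw entry points. Their expected semantics are simply "run the draw instanceCount times, starting at startInstance"; a minimal sketch of that contract, with draw_one_instance() as a hypothetical per-instance helper rather than a real softpipe function:

/* Hypothetical helper that draws a single instance. */
extern void draw_one_instance(struct pipe_context *pipe,
                              unsigned mode, unsigned start, unsigned count,
                              unsigned instance_id);

/* Sketch of the instanced-draw semantics declared above. */
static void
draw_arrays_instanced_sketch(struct pipe_context *pipe,
                             unsigned mode, unsigned start, unsigned count,
                             unsigned start_instance, unsigned instance_count)
{
   unsigned i;
   for (i = 0; i < instance_count; i++)
      draw_one_instance(pipe, mode, start, count, start_instance + i);
}
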
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index efed082f823..95ab3234337 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -29,6 +29,7 @@
*/
#include "util/u_memory.h"
+#include "draw/draw_context.h"
#include "sp_context.h"
#include "sp_state.h"
@@ -45,6 +46,8 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
{
struct softpipe_context *softpipe = softpipe_context(pipe);
+ draw_flush(softpipe->draw);
+
softpipe->blend = (struct pipe_blend_state *)blend;
softpipe->dirty |= SP_NEW_BLEND;
@@ -62,6 +65,8 @@ void softpipe_set_blend_color( struct pipe_context *pipe,
{
struct softpipe_context *softpipe = softpipe_context(pipe);
+ draw_flush(softpipe->draw);
+
softpipe->blend_color = *blend_color;
softpipe->dirty |= SP_NEW_BLEND;
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 3bc96b95385..f6856a5f691 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -67,7 +67,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
/* compute vertex layout now */
const struct sp_fragment_shader *spfs = softpipe->fs;
struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
- const uint num = draw_num_vs_outputs(softpipe->draw);
+ const uint num = draw_current_shader_outputs(softpipe->draw);
uint i;
/* Tell draw_vbuf to simply emit the whole post-xform vertex
@@ -117,13 +117,13 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
}
/* this includes texcoords and varying vars */
- src = draw_find_vs_output(softpipe->draw,
- spfs->info.input_semantic_name[i],
- spfs->info.input_semantic_index[i]);
+ src = draw_find_shader_output(softpipe->draw,
+ spfs->info.input_semantic_name[i],
+ spfs->info.input_semantic_index[i]);
draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
}
- softpipe->psize_slot = draw_find_vs_output(softpipe->draw,
+ softpipe->psize_slot = draw_find_shader_output(softpipe->draw,
TGSI_SEMANTIC_PSIZE, 0);
if (softpipe->psize_slot > 0) {
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
@@ -213,6 +213,19 @@ update_tgsi_samplers( struct softpipe_context *softpipe )
}
}
}
+
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ struct softpipe_tex_tile_cache *tc = softpipe->vertex_tex_cache[i];
+
+ if (tc->texture) {
+ struct softpipe_texture *spt = softpipe_texture(tc->texture);
+
+ if (spt->timestamp != tc->timestamp) {
+ sp_tex_tile_cache_validate_texture(tc);
+ tc->timestamp = spt->timestamp;
+ }
+ }
+ }
}
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index b41f7e8ab72..b7ed4441b43 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -69,7 +69,12 @@ softpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
{
struct softpipe_context *softpipe = softpipe_context(pipe);
- softpipe->fs = (struct sp_fragment_shader *) fs;
+ if (softpipe->fs == fs)
+ return;
+
+ draw_flush(softpipe->draw);
+
+ softpipe->fs = fs;
softpipe->dirty |= SP_NEW_FS;
}
@@ -152,16 +159,81 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
void
softpipe_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct softpipe_context *softpipe = softpipe_context(pipe);
assert(shader < PIPE_SHADER_TYPES);
assert(index == 0);
+ draw_flush(softpipe->draw);
+
/* note: reference counting */
- pipe_buffer_reference(&softpipe->constants[shader].buffer,
- buf ? buf->buffer : NULL);
+ pipe_buffer_reference(&softpipe->constants[shader], buf);
softpipe->dirty |= SP_NEW_CONSTANTS;
}
+
+void *
+softpipe_create_gs_state(struct pipe_context *pipe,
+ const struct pipe_shader_state *templ)
+{
+ struct softpipe_context *softpipe = softpipe_context(pipe);
+ struct sp_geometry_shader *state;
+
+ state = CALLOC_STRUCT(sp_geometry_shader);
+ if (state == NULL )
+ goto fail;
+
+ /* debug */
+ if (softpipe->dump_gs)
+ tgsi_dump(templ->tokens, 0);
+
+   /* Copy the shader tokens; the ones passed in will go away.
+    */
+ state->shader.tokens = tgsi_dup_tokens(templ->tokens);
+ if (state->shader.tokens == NULL)
+ goto fail;
+
+ state->draw_data = draw_create_geometry_shader(softpipe->draw, templ);
+ if (state->draw_data == NULL)
+ goto fail;
+
+ return state;
+
+fail:
+ if (state) {
+ FREE( (void *)state->shader.tokens );
+ FREE( state->draw_data );
+ FREE( state );
+ }
+ return NULL;
+}
+
+
+void
+softpipe_bind_gs_state(struct pipe_context *pipe, void *gs)
+{
+ struct softpipe_context *softpipe = softpipe_context(pipe);
+
+ softpipe->gs = (struct sp_geometry_shader *)gs;
+
+ draw_bind_geometry_shader(softpipe->draw,
+ (softpipe->gs ? softpipe->gs->draw_data : NULL));
+
+ softpipe->dirty |= SP_NEW_GS;
+}
+
+
+void
+softpipe_delete_gs_state(struct pipe_context *pipe, void *gs)
+{
+ struct softpipe_context *softpipe = softpipe_context(pipe);
+
+ struct sp_geometry_shader *state =
+ (struct sp_geometry_shader *)gs;
+
+ draw_delete_geometry_shader(softpipe->draw,
+ (state) ? state->draw_data : 0);
+ FREE(state);
+}
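
Like the existing FS/VS hooks, the new geometry-shader entry points follow the usual create/bind/delete CSO pattern. A sketch of how a state tracker would drive them, assuming the gs hooks are installed on pipe_context as create_gs_state/bind_gs_state/delete_gs_state (which this series wires up elsewhere):

#include "pipe/p_context.h"
#include "pipe/p_state.h"

/* Sketch: typical CSO lifecycle for a geometry shader. */
static void *
setup_gs(struct pipe_context *pipe, const struct pipe_shader_state *templ)
{
   void *gs = pipe->create_gs_state(pipe, templ);  /* copies the tokens */
   pipe->bind_gs_state(pipe, gs);                  /* make it current */
   return gs;   /* caller eventually unbinds and calls delete_gs_state() */
}
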
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 87b72196838..a5b00336d44 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -41,14 +41,17 @@ softpipe_create_rasterizer_state(struct pipe_context *pipe,
}
void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
- void *setup)
+ void *rasterizer)
{
struct softpipe_context *softpipe = softpipe_context(pipe);
+ if (softpipe->rasterizer == rasterizer)
+ return;
+
/* pass-through to draw module */
- draw_set_rasterizer_state(softpipe->draw, setup);
+ draw_set_rasterizer_state(softpipe->draw, rasterizer);
- softpipe->rasterizer = (struct pipe_rasterizer_state *)setup;
+ softpipe->rasterizer = rasterizer;
softpipe->dirty |= SP_NEW_RASTERIZER;
}
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index db0b8ab76b1..ceb4e338f1a 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -94,6 +94,34 @@ softpipe_bind_sampler_states(struct pipe_context *pipe,
void
+softpipe_bind_vertex_sampler_states(struct pipe_context *pipe,
+ unsigned num_samplers,
+ void **samplers)
+{
+ struct softpipe_context *softpipe = softpipe_context(pipe);
+ unsigned i;
+
+ assert(num_samplers <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_samplers == softpipe->num_vertex_samplers &&
+ !memcmp(softpipe->vertex_samplers, samplers, num_samplers * sizeof(void *)))
+ return;
+
+ draw_flush(softpipe->draw);
+
+ for (i = 0; i < num_samplers; ++i)
+ softpipe->vertex_samplers[i] = samplers[i];
+ for (i = num_samplers; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+ softpipe->vertex_samplers[i] = NULL;
+
+ softpipe->num_vertex_samplers = num_samplers;
+
+ softpipe->dirty |= SP_NEW_SAMPLER;
+}
+
+
+void
softpipe_set_sampler_textures(struct pipe_context *pipe,
unsigned num, struct pipe_texture **texture)
{
@@ -122,6 +150,37 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
}
+void
+softpipe_set_vertex_sampler_textures(struct pipe_context *pipe,
+ unsigned num_textures,
+ struct pipe_texture **textures)
+{
+ struct softpipe_context *softpipe = softpipe_context(pipe);
+ uint i;
+
+ assert(num_textures <= PIPE_MAX_VERTEX_SAMPLERS);
+
+ /* Check for no-op */
+ if (num_textures == softpipe->num_vertex_textures &&
+ !memcmp(softpipe->vertex_textures, textures, num_textures * sizeof(struct pipe_texture *))) {
+ return;
+ }
+
+ draw_flush(softpipe->draw);
+
+ for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
+ struct pipe_texture *tex = i < num_textures ? textures[i] : NULL;
+
+ pipe_texture_reference(&softpipe->vertex_textures[i], tex);
+ sp_tex_tile_cache_set_texture(softpipe->vertex_tex_cache[i], tex);
+ }
+
+ softpipe->num_vertex_textures = num_textures;
+
+ softpipe->dirty |= SP_NEW_TEXTURE;
+}
+
+
/**
* Find/create an sp_sampler_varient object for sampling the given texture,
* sampler and tex unit.
@@ -185,16 +244,16 @@ softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
* fragment programs.
*/
for (i = 0; i <= softpipe->vs->max_sampler; i++) {
- if (softpipe->sampler[i]) {
+ if (softpipe->vertex_samplers[i]) {
softpipe->tgsi.vert_samplers_list[i] =
get_sampler_varient( i,
- sp_sampler(softpipe->sampler[i]),
- softpipe->texture[i],
+ sp_sampler(softpipe->vertex_samplers[i]),
+ softpipe->vertex_textures[i],
TGSI_PROCESSOR_VERTEX );
sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[i],
- softpipe->tex_cache[i],
- softpipe->texture[i] );
+ softpipe->vertex_tex_cache[i],
+ softpipe->vertex_textures[i] );
}
}
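
Both vertex-sampler entry points above use the same idiom: compare the incoming set against what is already bound and only flush the draw module and set a dirty bit when something actually changed. A generic sketch of that redundant-bind check:

#include <string.h>

/* Sketch: does the incoming sampler set differ from the bound one? */
static int
samplers_changed(void * const *cur, unsigned cur_num,
                 void * const *incoming, unsigned incoming_num)
{
   return cur_num != incoming_num ||
          memcmp(cur, incoming, incoming_num * sizeof(void *)) != 0;
}

Skipping the flush on a no-op bind keeps cheap state-tracker rebinding from draining the draw pipeline.
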
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index 794a9225b86..39466782195 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -34,6 +34,8 @@
#include "draw/draw_context.h"
+#include "util/u_format.h"
+
/**
* XXX this might get moved someday
@@ -48,6 +50,8 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
struct softpipe_context *sp = softpipe_context(pipe);
uint i;
+ draw_flush(sp->draw);
+
for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
/* check if changing cbuf */
if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
@@ -79,8 +83,9 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
if (sp->framebuffer.zsbuf) {
int depth_bits;
double mrd;
- depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format,
- PIPE_FORMAT_COMP_Z);
+ depth_bits = util_format_get_component_bits(sp->framebuffer.zsbuf->format,
+ UTIL_FORMAT_COLORSPACE_ZS,
+ 0);
if (depth_bits > 16) {
mrd = 0.0000001;
}
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index c22ee86b66c..1ae8fecacf7 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -2,7 +2,7 @@
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
- * Copyright 2008 VMware, Inc. All rights reserved.
+ * Copyright 2008-2010 VMware, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
@@ -514,21 +514,15 @@ static float
compute_lambda_1d(const struct sp_sampler_varient *samp,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias)
+ const float p[QUAD_SIZE])
{
const struct pipe_texture *texture = samp->texture;
const struct pipe_sampler_state *sampler = samp->sampler;
float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
float dsdy = fabsf(s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]);
- float rho = MAX2(dsdx, dsdy) * texture->width[0];
- float lambda;
-
- lambda = util_fast_log2(rho);
- lambda += lodbias + sampler->lod_bias;
- lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+ float rho = MAX2(dsdx, dsdy) * texture->width0;
- return lambda;
+ return util_fast_log2(rho);
}
@@ -536,8 +530,7 @@ static float
compute_lambda_2d(const struct sp_sampler_varient *samp,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias)
+ const float p[QUAD_SIZE])
{
const struct pipe_texture *texture = samp->texture;
const struct pipe_sampler_state *sampler = samp->sampler;
@@ -545,16 +538,11 @@ compute_lambda_2d(const struct sp_sampler_varient *samp,
float dsdy = fabsf(s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]);
float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
float dtdy = fabsf(t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]);
- float maxx = MAX2(dsdx, dsdy) * texture->width[0];
- float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+ float maxx = MAX2(dsdx, dsdy) * texture->width0;
+ float maxy = MAX2(dtdx, dtdy) * texture->height0;
float rho = MAX2(maxx, maxy);
- float lambda;
- lambda = util_fast_log2(rho);
- lambda += lodbias + sampler->lod_bias;
- lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
- return lambda;
+ return util_fast_log2(rho);
}
@@ -562,8 +550,7 @@ static float
compute_lambda_3d(const struct sp_sampler_varient *samp,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias)
+ const float p[QUAD_SIZE])
{
const struct pipe_texture *texture = samp->texture;
const struct pipe_sampler_state *sampler = samp->sampler;
@@ -573,34 +560,29 @@ compute_lambda_3d(const struct sp_sampler_varient *samp,
float dtdy = fabsf(t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]);
float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
float dpdy = fabsf(p[QUAD_TOP_LEFT] - p[QUAD_BOTTOM_LEFT]);
- float maxx = MAX2(dsdx, dsdy) * texture->width[0];
- float maxy = MAX2(dtdx, dtdy) * texture->height[0];
- float maxz = MAX2(dpdx, dpdy) * texture->depth[0];
- float rho, lambda;
+ float maxx = MAX2(dsdx, dsdy) * texture->width0;
+ float maxy = MAX2(dtdx, dtdy) * texture->height0;
+ float maxz = MAX2(dpdx, dpdy) * texture->depth0;
+ float rho;
rho = MAX2(maxx, maxy);
rho = MAX2(rho, maxz);
- lambda = util_fast_log2(rho);
- lambda += lodbias + sampler->lod_bias;
- lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
-
- return lambda;
+ return util_fast_log2(rho);
}
/**
* Compute lambda for a vertex texture sampler.
- * Since there aren't derivatives to use, just return the LOD bias.
+ * Since there aren't derivatives to use, just return 0.
*/
static float
compute_lambda_vert(const struct sp_sampler_varient *samp,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias)
+ const float p[QUAD_SIZE])
{
- return lodbias;
+ return 0.0f;
}
@@ -644,8 +626,8 @@ get_texel_2d(const struct sp_sampler_varient *samp,
const struct pipe_texture *texture = samp->texture;
unsigned level = addr.bits.level;
- if (x < 0 || x >= (int) texture->width[level] ||
- y < 0 || y >= (int) texture->height[level]) {
+ if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
+ y < 0 || y >= (int) u_minify(texture->height0, level)) {
return samp->sampler->border_color;
}
else {
@@ -737,9 +719,9 @@ get_texel_3d(const struct sp_sampler_varient *samp,
const struct pipe_texture *texture = samp->texture;
unsigned level = addr.bits.level;
- if (x < 0 || x >= (int) texture->width[level] ||
- y < 0 || y >= (int) texture->height[level] ||
- z < 0 || z >= (int) texture->depth[level]) {
+ if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
+ y < 0 || y >= (int) u_minify(texture->height0, level) ||
+ z < 0 || z >= (int) u_minify(texture->depth0, level)) {
return samp->sampler->border_color;
}
else {
@@ -769,7 +751,8 @@ img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -827,7 +810,8 @@ img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -866,7 +850,8 @@ img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -914,7 +899,8 @@ img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -925,7 +911,7 @@ img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
+ width = u_minify(texture->width0, level0);
assert(width > 0);
@@ -949,7 +935,8 @@ img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -961,8 +948,8 @@ img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
assert(width > 0);
assert(height > 0);
@@ -996,7 +983,8 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1008,8 +996,8 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
assert(width > 0);
assert(height > 0);
@@ -1035,7 +1023,8 @@ img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1046,9 +1035,9 @@ img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
- depth = texture->depth[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
+ depth = u_minify(texture->depth0, level0);
assert(width > 0);
assert(height > 0);
@@ -1076,7 +1065,8 @@ img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1088,7 +1078,7 @@ img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
+ width = u_minify(texture->width0, level0);
assert(width > 0);
@@ -1115,7 +1105,8 @@ img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1127,8 +1118,8 @@ img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
assert(width > 0);
assert(height > 0);
@@ -1161,7 +1152,8 @@ img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1174,8 +1166,8 @@ img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
assert(width > 0);
assert(height > 0);
@@ -1209,7 +1201,8 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1221,9 +1214,9 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
union tex_tile_address addr;
level0 = samp->level;
- width = texture->width[level0];
- height = texture->height[level0];
- depth = texture->depth[level0];
+ width = u_minify(texture->width0, level0);
+ height = u_minify(texture->height0, level0);
+ depth = u_minify(texture->depth0, level0);
addr.value = 0;
addr.bits.level = level0;
@@ -1261,29 +1254,60 @@ img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
}
+/* Calculate level of detail for every fragment.
+ * Note that lambda has already been biased by global LOD bias.
+ */
+static INLINE void
+compute_lod(const struct pipe_sampler_state *sampler,
+ const float biased_lambda,
+ const float lodbias[QUAD_SIZE],
+ float lod[QUAD_SIZE])
+{
+ uint i;
+
+ for (i = 0; i < QUAD_SIZE; i++) {
+ lod[i] = biased_lambda + lodbias[i];
+ lod[i] = CLAMP(lod[i], sampler->min_lod, sampler->max_lod);
+ }
+}
+
+
static void
mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
const struct pipe_texture *texture = samp->texture;
int level0;
float lambda;
+ float lod[QUAD_SIZE];
+
+ if (control == tgsi_sampler_lod_bias) {
+ lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+ compute_lod(samp->sampler, lambda, c0, lod);
+ } else {
+ assert(control == tgsi_sampler_lod_explicit);
- lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+ memcpy(lod, c0, sizeof(lod));
+ }
+
+ /* XXX: Take into account all lod values.
+ */
+ lambda = lod[0];
level0 = (int)lambda;
if (lambda < 0.0) {
samp->level = 0;
- samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
else if (level0 >= texture->last_level) {
samp->level = texture->last_level;
- samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
else {
float levelBlend = lambda - level0;
@@ -1292,10 +1316,10 @@ mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
int c,j;
samp->level = level0;
- samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+ samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
samp->level = level0+1;
- samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+ samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
for (j = 0; j < QUAD_SIZE; j++) {
for (c = 0; c < 4; c++) {
@@ -1311,23 +1335,36 @@ mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
const struct pipe_texture *texture = samp->texture;
float lambda;
+ float lod[QUAD_SIZE];
- lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+ if (control == tgsi_sampler_lod_bias) {
+ lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+ compute_lod(samp->sampler, lambda, c0, lod);
+ } else {
+ assert(control == tgsi_sampler_lod_explicit);
+
+ memcpy(lod, c0, sizeof(lod));
+ }
+
+ /* XXX: Take into account all lod values.
+ */
+ lambda = lod[0];
if (lambda < 0.0) {
samp->level = 0;
- samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
else {
samp->level = (int)(lambda + 0.5) ;
samp->level = MIN2(samp->level, (int)texture->last_level);
- samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
#if 0
@@ -1345,17 +1382,32 @@ mip_filter_none(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
- float lambda = samp->compute_lambda(samp, s, t, p, lodbias);
+ float lambda;
+ float lod[QUAD_SIZE];
+
+ if (control == tgsi_sampler_lod_bias) {
+ lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+ compute_lod(samp->sampler, lambda, c0, lod);
+ } else {
+ assert(control == tgsi_sampler_lod_explicit);
+
+ memcpy(lod, c0, sizeof(lod));
+ }
+
+ /* XXX: Take into account all lod values.
+ */
+ lambda = lod[0];
if (lambda < 0.0) {
- samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->mag_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
else {
- samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+ samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
}
@@ -1371,15 +1423,28 @@ mip_filter_linear_2d_linear_repeat_POT(
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
const struct pipe_texture *texture = samp->texture;
int level0;
float lambda;
+ float lod[QUAD_SIZE];
- lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+ if (control == tgsi_sampler_lod_bias) {
+ lambda = samp->compute_lambda(samp, s, t, p) + samp->sampler->lod_bias;
+ compute_lod(samp->sampler, lambda, c0, lod);
+ } else {
+ assert(control == tgsi_sampler_lod_explicit);
+
+ memcpy(lod, c0, sizeof(lod));
+ }
+
+ /* XXX: Take into account all lod values.
+ */
+ lambda = lod[0];
level0 = (int)lambda;
/* Catches both negative and large values of level0:
@@ -1390,7 +1455,7 @@ mip_filter_linear_2d_linear_repeat_POT(
else
samp->level = texture->last_level;
- img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
+ img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
}
else {
float levelBlend = lambda - level0;
@@ -1399,10 +1464,10 @@ mip_filter_linear_2d_linear_repeat_POT(
int c,j;
samp->level = level0;
- img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+ img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba0);
samp->level = level0+1;
- img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+ img_filter_2d_linear_repeat_POT(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba1);
for (j = 0; j < QUAD_SIZE; j++) {
for (c = 0; c < 4; c++) {
@@ -1422,7 +1487,8 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1430,7 +1496,7 @@ sample_compare(struct tgsi_sampler *tgsi_sampler,
int j, k0, k1, k2, k3;
float val;
- samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+ samp->mip_filter(tgsi_sampler, s, t, p, c0, control, rgba);
/**
* Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
@@ -1508,7 +1574,8 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE])
{
struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
@@ -1589,7 +1656,7 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
* is not active, this will point somewhere deeper into the
* pipeline, eg. to mip_filter or even img_filter.
*/
- samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);
+ samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
}
@@ -1778,8 +1845,8 @@ sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
samp->texture = texture;
samp->cache = tex_cache;
- samp->xpot = util_unsigned_logbase2( texture->width[0] );
- samp->ypot = util_unsigned_logbase2( texture->height[0] );
+ samp->xpot = util_unsigned_logbase2( texture->width0 );
+ samp->ypot = util_unsigned_logbase2( texture->height0 );
samp->level = CLAMP((int) sampler->min_lod, 0, (int) texture->last_level);
}
@@ -1862,7 +1929,7 @@ sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
break;
}
- if (sampler->compare_mode != FALSE) {
+ if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
samp->compare = sample_compare;
}
else {
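
With the per-fragment LOD path, compute_lambda() returns an unbiased scalar; the sampler's global lod_bias is added once, then each fragment's shader-supplied bias is applied and the result clamped, as compute_lod() does above. A standalone sketch of that arithmetic:

#include <math.h>

/* Sketch of the per-fragment LOD computation: biased_lambda already
 * includes the sampler's global lod_bias; the shader-supplied bias is
 * added per fragment and the result clamped to [min_lod, max_lod]. */
static void
compute_lod_sketch(float biased_lambda, const float frag_bias[4],
                   float min_lod, float max_lod, float lod[4])
{
   int i;
   for (i = 0; i < 4; i++) {
      float l = biased_lambda + frag_bias[i];
      lod[i] = fminf(fmaxf(l, min_lod), max_lod);
   }
}
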
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index b0797711d37..b6e66c998ae 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -2,6 +2,7 @@
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
+ * Copyright 2010 VMware, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
@@ -46,14 +47,14 @@ typedef void (*wrap_linear_func)(const float s[4],
typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
- const float p[QUAD_SIZE],
- float lodbias);
+ const float p[QUAD_SIZE]);
typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
- float lodbias,
+ const float c0[QUAD_SIZE],
+ enum tgsi_sampler_control control,
float rgba[NUM_CHANNELS][QUAD_SIZE]);
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index 0312a31460b..50242d5bd69 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -35,6 +35,7 @@
#include "pipe/p_inlines.h"
#include "util/u_memory.h"
#include "util/u_tile.h"
+#include "util/u_math.h"
#include "sp_context.h"
#include "sp_texture.h"
#include "sp_tex_tile_cache.h"
@@ -245,9 +246,9 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
addr.bits.level,
addr.bits.z,
PIPE_TRANSFER_READ, 0, 0,
- tc->texture->width[addr.bits.level],
- tc->texture->height[addr.bits.level]);
-
+ u_minify(tc->texture->width0, addr.bits.level),
+ u_minify(tc->texture->height0, addr.bits.level));
+
tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
tc->tex_face = addr.bits.face;
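
The per-level width[]/height[] arrays are gone; mip sizes are now derived on demand with u_minify(), which halves the base size per level but never drops below one texel. A sketch of the same computation (minify_sketch() is illustrative, not the actual macro):

#include <assert.h>

/* Size of mip level 'level' given the base (level 0) size. */
static unsigned
minify_sketch(unsigned base_size, unsigned level)
{
   unsigned size = base_size >> level;
   return size ? size : 1;   /* never smaller than one texel */
}

int main(void)
{
   assert(minify_sketch(256, 0) == 256);
   assert(minify_sketch(256, 3) == 32);
   assert(minify_sketch(256, 9) == 1);   /* clamped to 1 */
   return 0;
}
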
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 64a70bf44b1..a5fff915077 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -32,6 +32,8 @@
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
+
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -51,29 +53,23 @@ softpipe_texture_layout(struct pipe_screen *screen,
{
struct pipe_texture *pt = &spt->base;
unsigned level;
- unsigned width = pt->width[0];
- unsigned height = pt->height[0];
- unsigned depth = pt->depth[0];
-
+ unsigned width = pt->width0;
+ unsigned height = pt->height0;
+ unsigned depth = pt->depth0;
unsigned buffer_size = 0;
for (level = 0; level <= pt->last_level; level++) {
- pt->width[level] = width;
- pt->height[level] = height;
- pt->depth[level] = depth;
- pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);
- pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);
- spt->stride[level] = pt->nblocksx[level]*pt->block.size;
+ spt->stride[level] = util_format_get_stride(pt->format, width);
spt->level_offset[level] = buffer_size;
- buffer_size += (pt->nblocksy[level] *
+ buffer_size += (util_format_get_nblocksy(pt->format, height) *
((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
spt->stride[level]);
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
spt->buffer = screen->buffer_create(screen, 32,
@@ -95,12 +91,9 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
PIPE_BUFFER_USAGE_GPU_READ_WRITE);
unsigned tex_usage = spt->base.tex_usage;
- spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]);
- spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);
-
spt->buffer = screen->surface_buffer_create( screen,
- spt->base.width[0],
- spt->base.height[0],
+ spt->base.width0,
+ spt->base.height0,
spt->base.format,
usage,
tex_usage,
@@ -125,9 +118,9 @@ softpipe_texture_create(struct pipe_screen *screen,
pipe_reference_init(&spt->base.reference, 1);
spt->base.screen = screen;
- spt->pot = (util_is_power_of_two(template->width[0]) &&
- util_is_power_of_two(template->height[0]) &&
- util_is_power_of_two(template->depth[0]));
+ spt->pot = (util_is_power_of_two(template->width0) &&
+ util_is_power_of_two(template->height0) &&
+ util_is_power_of_two(template->depth0));
if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
PIPE_TEXTURE_USAGE_PRIMARY)) {
@@ -162,7 +155,7 @@ softpipe_texture_blanket(struct pipe_screen * screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -173,8 +166,6 @@ softpipe_texture_blanket(struct pipe_screen * screen,
spt->base = *base;
pipe_reference_init(&spt->base.reference, 1);
spt->base.screen = screen;
- spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]);
- spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);
spt->stride[0] = stride[0];
pipe_buffer_reference(&spt->buffer, buffer);
@@ -212,8 +203,8 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
pipe_reference_init(&ps->reference, 1);
pipe_texture_reference(&ps->texture, pt);
ps->format = pt->format;
- ps->width = pt->width[level];
- ps->height = pt->height[level];
+ ps->width = u_minify(pt->width0, level);
+ ps->height = u_minify(pt->height0, level);
ps->offset = spt->level_offset[level];
ps->usage = usage;
@@ -242,10 +233,12 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
ps->zslice = zslice;
if (pt->target == PIPE_TEXTURE_CUBE) {
- ps->offset += face * pt->nblocksy[level] * spt->stride[level];
+ ps->offset += face * util_format_get_nblocksy(pt->format, u_minify(pt->height0, level)) *
+ spt->stride[level];
}
else if (pt->target == PIPE_TEXTURE_3D) {
- ps->offset += zslice * pt->nblocksy[level] * spt->stride[level];
+ ps->offset += zslice * util_format_get_nblocksy(pt->format, u_minify(pt->height0, level)) *
+ spt->stride[level];
}
else {
assert(face == 0);
@@ -297,18 +290,19 @@ softpipe_get_tex_transfer(struct pipe_screen *screen,
assert(texture);
assert(level <= texture->last_level);
+ /* make sure the requested region is in the image bounds */
+ assert(x + w <= u_minify(texture->width0, level));
+ assert(y + h <= u_minify(texture->height0, level));
+
spt = CALLOC_STRUCT(softpipe_transfer);
if (spt) {
struct pipe_transfer *pt = &spt->base;
+ int nblocksy = util_format_get_nblocksy(texture->format, u_minify(texture->height0, level));
pipe_texture_reference(&pt->texture, texture);
- pt->format = texture->format;
- pt->block = texture->block;
pt->x = x;
pt->y = y;
pt->width = w;
pt->height = h;
- pt->nblocksx = texture->nblocksx[level];
- pt->nblocksy = texture->nblocksy[level];
pt->stride = sptex->stride[level];
pt->usage = usage;
pt->face = face;
@@ -318,10 +312,10 @@ softpipe_get_tex_transfer(struct pipe_screen *screen,
spt->offset = sptex->level_offset[level];
if (texture->target == PIPE_TEXTURE_CUBE) {
- spt->offset += face * pt->nblocksy * pt->stride;
+ spt->offset += face * nblocksy * pt->stride;
}
else if (texture->target == PIPE_TEXTURE_3D) {
- spt->offset += zslice * pt->nblocksy * pt->stride;
+ spt->offset += zslice * nblocksy * pt->stride;
}
else {
assert(face == 0);
@@ -359,9 +353,11 @@ softpipe_transfer_map( struct pipe_screen *screen,
{
ubyte *map, *xfer_map;
struct softpipe_texture *spt;
+ enum pipe_format format;
assert(transfer->texture);
spt = softpipe_texture(transfer->texture);
+ format = transfer->texture->format;
map = pipe_buffer_map(screen, spt->buffer, pipe_transfer_buffer_flags(transfer));
if (map == NULL)
@@ -378,8 +374,8 @@ softpipe_transfer_map( struct pipe_screen *screen,
}
xfer_map = map + softpipe_transfer(transfer)->offset +
- transfer->y / transfer->block.height * transfer->stride +
- transfer->x / transfer->block.width * transfer->block.size;
+ transfer->y / util_format_get_blockheight(format) * transfer->stride +
+ transfer->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
/*printf("map = %p xfer map = %p\n", map, xfer_map);*/
return xfer_map;
}
@@ -433,10 +429,9 @@ softpipe_video_surface_create(struct pipe_screen *screen,
template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
template.last_level = 0;
/* vl_mpeg12_mc_renderer expects this when it's initialized with pot_buffers=true */
- template.width[0] = util_next_power_of_two(width);
- template.height[0] = util_next_power_of_two(height);
- template.depth[0] = 1;
- pf_get_block(template.format, &template.block);
+ template.width0 = util_next_power_of_two(width);
+ template.height0 = util_next_power_of_two(height);
+ template.depth0 = 1;
template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
sp_vsfc->tex = screen->texture_create(screen, &template);
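
The layout code now derives stride and level size from util_format_get_stride() and util_format_get_nblocksy(), which handle block-compressed formats correctly. A sketch of sizing one 2D mip level with the same helpers (level_size_in_bytes() is an illustrative name):

#include "util/u_format.h"
#include "util/u_math.h"

/* Sketch: bytes needed for one mip level of a 2D texture. */
static unsigned
level_size_in_bytes(enum pipe_format format,
                    unsigned width0, unsigned height0, unsigned level)
{
   unsigned w = u_minify(width0, level);
   unsigned h = u_minify(height0, level);
   unsigned stride = util_format_get_stride(format, w);   /* bytes per block row */
   return util_format_get_nblocksy(format, h) * stride;   /* block rows * stride */
}
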
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 65872cecc4f..112a6fe0cf3 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -33,6 +33,7 @@
*/
#include "pipe/p_inlines.h"
+#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_tile.h"
#include "sp_tile_cache.h"
@@ -238,7 +239,7 @@ clear_tile(struct softpipe_cached_tile *tile,
{
uint i, j;
- switch (pf_get_size(format)) {
+ switch (util_format_get_blocksize(format)) {
case 1:
memset(tile->data.any, clear_value, TILE_SIZE * TILE_SIZE);
break;
@@ -284,8 +285,9 @@ sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
uint x, y;
uint numCleared = 0;
+ assert(pt->texture);
/* clear the scratch tile to the clear value */
- clear_tile(&tc->tile, pt->format, tc->clear_val);
+ clear_tile(&tc->tile, pt->texture->format, tc->clear_val);
/* push the tile to all positions marked as clear */
for (y = 0; y < h; y += TILE_SIZE) {
@@ -372,6 +374,7 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc,
if (addr.value != tile->addr.value) {
+ assert(pt->texture);
if (tile->addr.bits.invalid == 0) {
/* put dirty tile back in framebuffer */
if (tc->depth_stencil) {
@@ -395,10 +398,10 @@ sp_find_cached_tile(struct softpipe_tile_cache *tc,
if (is_clear_flag_set(tc->clear_flags, addr)) {
/* don't get tile from framebuffer, just clear it */
if (tc->depth_stencil) {
- clear_tile(tile, pt->format, tc->clear_val);
+ clear_tile(tile, pt->texture->format, tc->clear_val);
}
else {
- clear_tile_rgba(tile, pt->format, tc->clear_color);
+ clear_tile_rgba(tile, pt->texture->format, tc->clear_color);
}
clear_clear_flag(tc->clear_flags, addr);
}
diff --git a/src/gallium/drivers/softpipe/sp_winsys.c b/src/gallium/drivers/softpipe/sp_winsys.c
new file mode 100644
index 00000000000..8169071dc9f
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_winsys.c
@@ -0,0 +1,245 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Malloc softpipe winsys. Uses malloc for all memory allocations.
+ *
+ * @author Keith Whitwell
+ * @author Brian Paul
+ * @author Jose Fonseca
+ */
+
+
+#include "pipe/internal/p_winsys_screen.h"/* port to just p_screen */
+#include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "softpipe/sp_winsys.h"
+
+
+struct st_softpipe_buffer
+{
+ struct pipe_buffer base;
+ boolean userBuffer; /** Is this a user-space buffer? */
+ void *data;
+ void *mapped;
+};
+
+
+/** Cast wrapper */
+static INLINE struct st_softpipe_buffer *
+st_softpipe_buffer( struct pipe_buffer *buf )
+{
+ return (struct st_softpipe_buffer *)buf;
+}
+
+
+static void *
+st_softpipe_buffer_map(struct pipe_winsys *winsys,
+ struct pipe_buffer *buf,
+ unsigned flags)
+{
+ struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
+ st_softpipe_buf->mapped = st_softpipe_buf->data;
+ return st_softpipe_buf->mapped;
+}
+
+
+static void
+st_softpipe_buffer_unmap(struct pipe_winsys *winsys,
+ struct pipe_buffer *buf)
+{
+ struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf);
+ st_softpipe_buf->mapped = NULL;
+}
+
+
+static void
+st_softpipe_buffer_destroy(struct pipe_buffer *buf)
+{
+ struct st_softpipe_buffer *oldBuf = st_softpipe_buffer(buf);
+
+ if (oldBuf->data) {
+ if (!oldBuf->userBuffer)
+ align_free(oldBuf->data);
+
+ oldBuf->data = NULL;
+ }
+
+ FREE(oldBuf);
+}
+
+
+static void
+st_softpipe_flush_frontbuffer(struct pipe_winsys *winsys,
+ struct pipe_surface *surf,
+ void *context_private)
+{
+}
+
+
+
+static const char *
+st_softpipe_get_name(struct pipe_winsys *winsys)
+{
+ return "softpipe";
+}
+
+
+static struct pipe_buffer *
+st_softpipe_buffer_create(struct pipe_winsys *winsys,
+ unsigned alignment,
+ unsigned usage,
+ unsigned size)
+{
+ struct st_softpipe_buffer *buffer = CALLOC_STRUCT(st_softpipe_buffer);
+
+ pipe_reference_init(&buffer->base.reference, 1);
+ buffer->base.alignment = alignment;
+ buffer->base.usage = usage;
+ buffer->base.size = size;
+
+ buffer->data = align_malloc(size, alignment);
+
+ return &buffer->base;
+}
+
+
+/**
+ * Create a buffer which wraps user-space data.
+ */
+static struct pipe_buffer *
+st_softpipe_user_buffer_create(struct pipe_winsys *winsys,
+ void *ptr,
+ unsigned bytes)
+{
+ struct st_softpipe_buffer *buffer;
+
+ buffer = CALLOC_STRUCT(st_softpipe_buffer);
+ if(!buffer)
+ return NULL;
+
+ pipe_reference_init(&buffer->base.reference, 1);
+ buffer->base.size = bytes;
+ buffer->userBuffer = TRUE;
+ buffer->data = ptr;
+
+ return &buffer->base;
+}
+
+
+static struct pipe_buffer *
+st_softpipe_surface_buffer_create(struct pipe_winsys *winsys,
+ unsigned width, unsigned height,
+ enum pipe_format format,
+ unsigned usage,
+ unsigned tex_usage,
+ unsigned *stride)
+{
+ const unsigned alignment = 64;
+ unsigned nblocksy;
+
+ nblocksy = util_format_get_nblocksy(format, height);
+ *stride = align(util_format_get_stride(format, width), alignment);
+
+ return winsys->buffer_create(winsys, alignment,
+ usage,
+ *stride * nblocksy);
+}
+
+
+static void
+st_softpipe_fence_reference(struct pipe_winsys *winsys,
+ struct pipe_fence_handle **ptr,
+ struct pipe_fence_handle *fence)
+{
+}
+
+
+static int
+st_softpipe_fence_signalled(struct pipe_winsys *winsys,
+ struct pipe_fence_handle *fence,
+ unsigned flag)
+{
+ return 0;
+}
+
+
+static int
+st_softpipe_fence_finish(struct pipe_winsys *winsys,
+ struct pipe_fence_handle *fence,
+ unsigned flag)
+{
+ return 0;
+}
+
+
+static void
+st_softpipe_destroy(struct pipe_winsys *winsys)
+{
+ FREE(winsys);
+}
+
+
+struct pipe_screen *
+softpipe_create_screen_malloc(void)
+{
+ static struct pipe_winsys *winsys;
+ struct pipe_screen *screen;
+
+ winsys = CALLOC_STRUCT(pipe_winsys);
+ if(!winsys)
+ return NULL;
+
+ winsys->destroy = st_softpipe_destroy;
+
+ winsys->buffer_create = st_softpipe_buffer_create;
+ winsys->user_buffer_create = st_softpipe_user_buffer_create;
+ winsys->buffer_map = st_softpipe_buffer_map;
+ winsys->buffer_unmap = st_softpipe_buffer_unmap;
+ winsys->buffer_destroy = st_softpipe_buffer_destroy;
+
+ winsys->surface_buffer_create = st_softpipe_surface_buffer_create;
+
+ winsys->fence_reference = st_softpipe_fence_reference;
+ winsys->fence_signalled = st_softpipe_fence_signalled;
+ winsys->fence_finish = st_softpipe_fence_finish;
+
+ winsys->flush_frontbuffer = st_softpipe_flush_frontbuffer;
+ winsys->get_name = st_softpipe_get_name;
+
+ screen = softpipe_create_screen(winsys);
+ if(!screen)
+ st_softpipe_destroy(winsys);
+
+ return screen;
+}
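
With the malloc-backed winsys in-tree, a softpipe screen and context can be stood up without any window-system code. A sketch using only the two entry points declared in sp_winsys.h:

#include "softpipe/sp_winsys.h"

/* Sketch: create a purely malloc-backed softpipe context. */
static struct pipe_context *
create_malloc_softpipe(void)
{
   struct pipe_screen *screen = softpipe_create_screen_malloc();
   if (!screen)
      return NULL;
   return softpipe_create(screen);
}
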
diff --git a/src/gallium/drivers/softpipe/sp_winsys.h b/src/gallium/drivers/softpipe/sp_winsys.h
index 9e571862b75..3042e01a05c 100644
--- a/src/gallium/drivers/softpipe/sp_winsys.h
+++ b/src/gallium/drivers/softpipe/sp_winsys.h
@@ -34,23 +34,32 @@
#ifndef SP_WINSYS_H
#define SP_WINSYS_H
-
#ifdef __cplusplus
extern "C" {
#endif
+#include "pipe/p_defines.h"
struct pipe_screen;
struct pipe_winsys;
struct pipe_context;
+struct pipe_texture;
+struct pipe_buffer;
struct pipe_context *softpipe_create( struct pipe_screen * );
+/**
+ * Create a softpipe screen that uses the
+ * given winsys for allocating buffers.
+ */
+struct pipe_screen *softpipe_create_screen( struct pipe_winsys * );
-struct pipe_screen *
-softpipe_create_screen(struct pipe_winsys *);
-
+/**
+ * Create a softpipe screen that uses
+ * regular malloc to create all its buffers.
+ */
+struct pipe_screen *softpipe_create_screen_malloc(void);
boolean
softpipe_get_texture_buffer( struct pipe_texture *texture,
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 32e9304f819..66259fd0103 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -203,8 +203,6 @@ struct svga_state
struct pipe_clip_state clip;
struct pipe_viewport_state viewport;
- const unsigned *edgeflags;
-
unsigned num_samplers;
unsigned num_textures;
unsigned num_vertex_elements;
@@ -370,7 +368,7 @@ struct svga_context
#define SVGA_NEW_FRAME_BUFFER 0x800
#define SVGA_NEW_STIPPLE 0x1000
#define SVGA_NEW_SCISSOR 0x2000
-#define SVGA_NEW_BLEND_COLOR 0x5000
+#define SVGA_NEW_BLEND_COLOR 0x4000
#define SVGA_NEW_CLIP 0x8000
#define SVGA_NEW_VIEWPORT 0x10000
#define SVGA_NEW_PRESCALE 0x20000
@@ -381,9 +379,8 @@ struct svga_context
#define SVGA_NEW_NEED_SWTNL 0x400000
#define SVGA_NEW_FS_RESULT 0x800000
#define SVGA_NEW_VS_RESULT 0x1000000
-#define SVGA_NEW_EDGEFLAGS 0x2000000
-#define SVGA_NEW_ZERO_STRIDE 0x4000000
-#define SVGA_NEW_TEXTURE_FLAGS 0x8000000
+#define SVGA_NEW_ZERO_STRIDE 0x2000000
+#define SVGA_NEW_TEXTURE_FLAGS 0x4000000
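
The old SVGA_NEW_BLEND_COLOR value 0x5000 was not a single bit (0x4000 | 0x1000), so setting it also set SVGA_NEW_STIPPLE; dirty flags must each be a distinct power of two for mask tests to stay independent. A small sketch of that property:

#include <assert.h>

/* A dirty flag must be exactly one bit so testing it never observes
 * another flag; 0x5000 == 0x4000 | 0x1000 violated that. */
static int
is_single_bit(unsigned flag)
{
   return flag != 0 && (flag & (flag - 1)) == 0;
}

int main(void)
{
   assert(!is_single_bit(0x5000));   /* old, overlapping value */
   assert(is_single_bit(0x4000));    /* new value */
   return 0;
}
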
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
index 6195c3897ed..409b3b41cbc 100644
--- a/src/gallium/drivers/svga/svga_pipe_clear.c
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -46,7 +46,7 @@ try_clear(struct svga_context *svga,
boolean restore_viewport = FALSE;
SVGA3dClearFlag flags = 0;
struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
- unsigned color = 0;
+ union util_color uc;
ret = svga_update_state(svga, SVGA_STATE_HW_CLEAR);
if (ret)
@@ -54,7 +54,7 @@ try_clear(struct svga_context *svga,
if ((buffers & PIPE_CLEAR_COLOR) && fb->cbufs[0]) {
flags |= SVGA3D_CLEAR_COLOR;
- util_pack_color(rgba, PIPE_FORMAT_A8R8G8B8_UNORM, &color);
+ util_pack_color(rgba, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
rect.w = fb->cbufs[0]->width;
rect.h = fb->cbufs[0]->height;
@@ -77,7 +77,7 @@ try_clear(struct svga_context *svga,
return ret;
}
- ret = SVGA3D_ClearRect(svga->swc, flags, color, depth, stencil,
+ ret = SVGA3D_ClearRect(svga->swc, flags, uc.ui, depth, stencil,
rect.x, rect.y, rect.w, rect.h);
if (ret != PIPE_OK)
return ret;
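
util_pack_color() now writes into a union util_color instead of a bare uint; the caller picks the member that matches the destination format's size (uc.ui for the 32-bit format used here). A sketch of that usage, mirroring the clear path above (pack_clear_color() is illustrative only):

#include "util/u_pack_color.h"

/* Sketch: pack a float clear color for a 32-bit ARGB surface. */
static unsigned
pack_clear_color(const float rgba[4])
{
   union util_color uc;
   util_pack_color(rgba, PIPE_FORMAT_A8R8G8B8_UNORM, &uc);
   return uc.ui;   /* packed 32-bit value */
}
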
diff --git a/src/gallium/drivers/svga/svga_pipe_constants.c b/src/gallium/drivers/svga/svga_pipe_constants.c
index a6dc344bdd3..93022f3cc51 100644
--- a/src/gallium/drivers/svga/svga_pipe_constants.c
+++ b/src/gallium/drivers/svga/svga_pipe_constants.c
@@ -46,7 +46,7 @@ struct svga_constbuf
static void svga_set_constant_buffer(struct pipe_context *pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buf)
+ struct pipe_buffer *buf)
{
struct svga_context *svga = svga_context(pipe);
@@ -54,7 +54,7 @@ static void svga_set_constant_buffer(struct pipe_context *pipe,
assert(index == 0);
pipe_buffer_reference( &svga->curr.cb[shader],
- buf->buffer );
+ buf );
if (shader == PIPE_SHADER_FRAGMENT)
svga->dirty |= SVGA_NEW_FS_CONST_BUFFER;
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 719b3419f86..4e0c499dc3e 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -148,7 +148,7 @@ retry:
-static boolean
+static void
svga_draw_range_elements( struct pipe_context *pipe,
struct pipe_buffer *index_buffer,
unsigned index_size,
@@ -161,7 +161,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
enum pipe_error ret = 0;
if (!u_trim_pipe_prim( prim, &count ))
- return TRUE;
+ return;
/*
* Mark currently bound target surfaces as dirty
@@ -182,7 +182,7 @@ svga_draw_range_elements( struct pipe_context *pipe,
#ifdef DEBUG
if (svga->curr.vs->base.id == svga->debug.disable_shader ||
svga->curr.fs->base.id == svga->debug.disable_shader)
- return 0;
+ return;
#endif
if (svga->state.sw.need_swtnl)
@@ -224,31 +224,29 @@ svga_draw_range_elements( struct pipe_context *pipe,
svga_hwtnl_flush_retry( svga );
svga_context_flush(svga, NULL);
}
-
- return ret == PIPE_OK;
}
-static boolean
+static void
svga_draw_elements( struct pipe_context *pipe,
struct pipe_buffer *index_buffer,
unsigned index_size,
unsigned prim, unsigned start, unsigned count)
{
- return svga_draw_range_elements( pipe, index_buffer,
- index_size,
- 0, 0xffffffff,
- prim, start, count );
+ svga_draw_range_elements( pipe, index_buffer,
+ index_size,
+ 0, 0xffffffff,
+ prim, start, count );
}
-static boolean
+static void
svga_draw_arrays( struct pipe_context *pipe,
unsigned prim, unsigned start, unsigned count)
{
- return svga_draw_range_elements(pipe, NULL, 0,
- start, start + count - 1,
- prim,
- start, count);
+ svga_draw_range_elements(pipe, NULL, 0,
+ start, start + count - 1,
+ prim,
+ start, count);
}
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index 9cc69c8266b..161c66dd4ff 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -73,7 +73,6 @@ static INLINE unsigned translate_img_filter( unsigned filter )
switch (filter) {
case PIPE_TEX_FILTER_NEAREST: return SVGA3D_TEX_FILTER_NEAREST;
case PIPE_TEX_FILTER_LINEAR: return SVGA3D_TEX_FILTER_LINEAR;
- case PIPE_TEX_FILTER_ANISO: return SVGA3D_TEX_FILTER_ANISOTROPIC;
default:
assert(0);
return SVGA3D_TEX_FILTER_NEAREST;
@@ -98,11 +97,14 @@ svga_create_sampler_state(struct pipe_context *pipe,
{
struct svga_context *svga = svga_context(pipe);
struct svga_sampler_state *cso = CALLOC_STRUCT( svga_sampler_state );
+ union util_color uc;
cso->mipfilter = translate_mip_filter(sampler->min_mip_filter);
cso->magfilter = translate_img_filter( sampler->mag_img_filter );
cso->minfilter = translate_img_filter( sampler->min_img_filter );
cso->aniso_level = MAX2( (unsigned) sampler->max_anisotropy, 1 );
+ if(cso->aniso_level != 1)
+ cso->magfilter = cso->minfilter = SVGA3D_TEX_FILTER_ANISOTROPIC;
cso->lod_bias = sampler->lod_bias;
cso->addressu = translate_wrap_mode(sampler->wrap_s);
cso->addressv = translate_wrap_mode(sampler->wrap_t);
@@ -118,8 +120,8 @@ svga_create_sampler_state(struct pipe_context *pipe,
ubyte a = float_to_ubyte(sampler->border_color[3]);
util_pack_color_ub( r, g, b, a,
- PIPE_FORMAT_B8G8R8A8_UNORM,
- &cso->bordercolor );
+ PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+ cso->bordercolor = uc.ui;
}
/* No SVGA3D support for:
@@ -231,9 +233,9 @@ static void svga_set_sampler_textures(struct pipe_context *pipe,
void svga_init_sampler_functions( struct svga_context *svga )
{
svga->pipe.create_sampler_state = svga_create_sampler_state;
- svga->pipe.bind_sampler_states = svga_bind_sampler_states;
+ svga->pipe.bind_fragment_sampler_states = svga_bind_sampler_states;
svga->pipe.delete_sampler_state = svga_delete_sampler_state;
- svga->pipe.set_sampler_textures = svga_set_sampler_textures;
+ svga->pipe.set_fragment_sampler_textures = svga_set_sampler_textures;
}
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
index 87ac9231d57..0bf43fa9008 100644
--- a/src/gallium/drivers/svga/svga_pipe_vertex.c
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -80,18 +80,6 @@ static void svga_set_vertex_elements(struct pipe_context *pipe,
}
-static void svga_set_edgeflags(struct pipe_context *pipe,
- const unsigned *bitfield)
-{
- struct svga_context *svga = svga_context(pipe);
-
- if (bitfield != NULL || svga->curr.edgeflags != NULL) {
- svga->curr.edgeflags = bitfield;
- svga->dirty |= SVGA_NEW_EDGEFLAGS;
- }
-}
-
-
void svga_cleanup_vertex_state( struct svga_context *svga )
{
unsigned i;
@@ -105,7 +93,6 @@ void svga_init_vertex_functions( struct svga_context *svga )
{
svga->pipe.set_vertex_buffers = svga_set_vertex_buffers;
svga->pipe.set_vertex_elements = svga_set_vertex_elements;
- svga->pipe.set_edgeflags = svga_set_edgeflags;
}
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
index 7f2b2bce79a..c4ac5304ac6 100644
--- a/src/gallium/drivers/svga/svga_pipe_vs.c
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -48,7 +48,7 @@ static const struct tgsi_token *substitute_vs(
static struct tgsi_token tokens[300];
const char *text =
- "VERT1.1\n"
+ "VERT\n"
"DCL IN[0]\n"
"DCL IN[1]\n"
"DCL IN[2]\n"
diff --git a/src/gallium/drivers/svga/svga_screen_buffer.c b/src/gallium/drivers/svga/svga_screen_buffer.c
index 719adde27ef..430a6970fde 100644
--- a/src/gallium/drivers/svga/svga_screen_buffer.c
+++ b/src/gallium/drivers/svga/svga_screen_buffer.c
@@ -291,7 +291,8 @@ svga_buffer_upload_flush(struct svga_context *svga,
sbuf->host_written = TRUE;
/* Decrement reference count */
- pipe_buffer_reference((struct pipe_buffer **)&sbuf, NULL);
+ pipe_reference(&(sbuf->base.reference), NULL);
+ sbuf = NULL;
}
diff --git a/src/gallium/drivers/svga/svga_screen_texture.c b/src/gallium/drivers/svga/svga_screen_texture.c
index 9ad4edafef2..0d69007fd87 100644
--- a/src/gallium/drivers/svga/svga_screen_texture.c
+++ b/src/gallium/drivers/svga/svga_screen_texture.c
@@ -29,6 +29,7 @@
#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"
#include "pipe/p_thread.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
@@ -158,7 +159,8 @@ svga_transfer_dma_band(struct svga_transfer *st,
st->base.x + st->base.width,
y + h,
st->base.zslice + 1,
- texture->base.block.size*8/(texture->base.block.width*texture->base.block.height));
+ util_format_get_blocksize(texture->base.format)*8/
+ (util_format_get_blockwidth(texture->base.format)*util_format_get_blockheight(texture->base.format)));
box.x = st->base.x;
box.y = y;
@@ -208,7 +210,8 @@ svga_transfer_dma(struct svga_transfer *st,
}
else {
unsigned y, h, srcy;
- h = st->hw_nblocksy * st->base.block.height;
+ unsigned blockheight = util_format_get_blockheight(st->base.texture->format);
+ h = st->hw_nblocksy * blockheight;
srcy = 0;
for(y = 0; y < st->base.height; y += h) {
unsigned offset, length;
@@ -218,11 +221,11 @@ svga_transfer_dma(struct svga_transfer *st,
h = st->base.height - y;
/* Transfer band must be aligned to pixel block boundaries */
- assert(y % st->base.block.height == 0);
- assert(h % st->base.block.height == 0);
+ assert(y % blockheight == 0);
+ assert(h % blockheight == 0);
- offset = y * st->base.stride / st->base.block.height;
- length = h * st->base.stride / st->base.block.height;
+ offset = y * st->base.stride / blockheight;
+ length = h * st->base.stride / blockheight;
sw = (uint8_t *)st->swbuf + offset;
@@ -281,24 +284,19 @@ svga_texture_create(struct pipe_screen *screen,
if(templat->last_level >= SVGA_MAX_TEXTURE_LEVELS)
goto error2;
- width = templat->width[0];
- height = templat->height[0];
- depth = templat->depth[0];
+ width = templat->width0;
+ height = templat->height0;
+ depth = templat->depth0;
for(level = 0; level <= templat->last_level; ++level) {
- tex->base.width[level] = width;
- tex->base.height[level] = height;
- tex->base.depth[level] = depth;
- tex->base.nblocksx[level] = pf_get_nblocksx(&tex->base.block, width);
- tex->base.nblocksy[level] = pf_get_nblocksy(&tex->base.block, height);
- width = minify(width);
- height = minify(height);
- depth = minify(depth);
+ width = u_minify(width, 1);
+ height = u_minify(height, 1);
+ depth = u_minify(depth, 1);
}
tex->key.flags = 0;
- tex->key.size.width = templat->width[0];
- tex->key.size.height = templat->height[0];
- tex->key.size.depth = templat->depth[0];
+ tex->key.size.width = templat->width0;
+ tex->key.size.height = templat->height0;
+ tex->key.size.depth = templat->depth0;
if(templat->target == PIPE_TEXTURE_CUBE) {
tex->key.flags |= SVGA3D_SURFACE_CUBEMAP;
@@ -330,7 +328,7 @@ svga_texture_create(struct pipe_screen *screen,
*/
#if 0
if((templat->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) &&
- !pf_is_compressed(templat->format))
+ !util_format_is_compressed(templat->format))
tex->key.flags |= SVGA3D_SURFACE_HINT_RENDERTARGET;
#endif
@@ -371,7 +369,7 @@ svga_texture_blanket(struct pipe_screen * screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -436,7 +434,7 @@ svga_screen_texture_wrap_surface(struct pipe_screen *screen,
/* Only supports one type */
if (base->target != PIPE_TEXTURE_2D ||
base->last_level != 0 ||
- base->depth[0] != 1) {
+ base->depth0 != 1) {
return NULL;
}
@@ -600,9 +598,9 @@ svga_texture_view_surface(struct pipe_context *pipe,
key->flags = 0;
key->format = format;
key->numMipLevels = num_mip;
- key->size.width = tex->base.width[start_mip];
- key->size.height = tex->base.height[start_mip];
- key->size.depth = zslice_pick < 0 ? tex->base.depth[start_mip] : 1;
+ key->size.width = u_minify(tex->base.width0, start_mip);
+ key->size.height = u_minify(tex->base.height0, start_mip);
+ key->size.depth = zslice_pick < 0 ? u_minify(tex->base.depth0, start_mip) : 1;
key->cachable = 1;
assert(key->size.depth == 1);
@@ -636,7 +634,10 @@ svga_texture_view_surface(struct pipe_context *pipe,
for (i = 0; i < key->numMipLevels; i++) {
for (j = 0; j < key->numFaces; j++) {
if(tex->defined[j + face_pick][i + start_mip]) {
- unsigned depth = zslice_pick < 0 ? tex->base.depth[i + start_mip] : 1;
+ unsigned depth = (zslice_pick < 0 ?
+ u_minify(tex->base.depth0, i + start_mip) :
+ 1);
+
svga_texture_copy_handle(svga_context(pipe),
ss,
tex->handle,
@@ -644,8 +645,8 @@ svga_texture_view_surface(struct pipe_context *pipe,
i + start_mip,
j + face_pick,
handle, 0, 0, 0, i, j,
- tex->base.width[i + start_mip],
- tex->base.height[i + start_mip],
+ u_minify(tex->base.width0, i + start_mip),
+ u_minify(tex->base.height0, i + start_mip),
depth);
}
}
@@ -674,8 +675,8 @@ svga_get_tex_surface(struct pipe_screen *screen,
pipe_reference_init(&s->base.reference, 1);
pipe_texture_reference(&s->base.texture, pt);
s->base.format = pt->format;
- s->base.width = pt->width[level];
- s->base.height = pt->height[level];
+ s->base.width = u_minify(pt->width0, level);
+ s->base.height = u_minify(pt->height0, level);
s->base.usage = flags;
s->base.level = level;
s->base.face = face;
@@ -804,7 +805,8 @@ svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf)
svga_texture_copy_handle(svga_context(pipe), ss,
s->handle, 0, 0, 0, s->real_level, s->real_face,
tex->handle, 0, 0, surf->zslice, surf->level, surf->face,
- tex->base.width[surf->level], tex->base.height[surf->level], 1);
+ u_minify(tex->base.width0, surf->level),
+ u_minify(tex->base.height0, surf->level), 1);
tex->defined[surf->face][surf->level] = TRUE;
}
}
@@ -832,6 +834,8 @@ svga_get_tex_transfer(struct pipe_screen *screen,
struct svga_screen *ss = svga_screen(screen);
struct svga_winsys_screen *sws = ss->sws;
struct svga_transfer *st;
+ unsigned nblocksx = util_format_get_nblocksx(texture->format, w);
+ unsigned nblocksy = util_format_get_nblocksy(texture->format, h);
/* We can't map texture storage directly */
if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
@@ -841,21 +845,17 @@ svga_get_tex_transfer(struct pipe_screen *screen,
if (!st)
return NULL;
- st->base.format = texture->format;
- st->base.block = texture->block;
st->base.x = x;
st->base.y = y;
st->base.width = w;
st->base.height = h;
- st->base.nblocksx = pf_get_nblocksx(&texture->block, w);
- st->base.nblocksy = pf_get_nblocksy(&texture->block, h);
- st->base.stride = st->base.nblocksx*st->base.block.size;
+ st->base.stride = nblocksx*util_format_get_blocksize(texture->format);
st->base.usage = usage;
st->base.face = face;
st->base.level = level;
st->base.zslice = zslice;
- st->hw_nblocksy = st->base.nblocksy;
+ st->hw_nblocksy = nblocksy;
st->hwbuf = svga_winsys_buffer_create(ss,
1,
@@ -871,15 +871,15 @@ svga_get_tex_transfer(struct pipe_screen *screen,
if(!st->hwbuf)
goto no_hwbuf;
- if(st->hw_nblocksy < st->base.nblocksy) {
+ if(st->hw_nblocksy < nblocksy) {
/* We couldn't allocate a hardware buffer big enough for the transfer,
* so allocate regular malloc memory instead */
debug_printf("%s: failed to allocate %u KB of DMA, splitting into %u x %u KB DMA transfers\n",
__FUNCTION__,
- (st->base.nblocksy*st->base.stride + 1023)/1024,
- (st->base.nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
+ (nblocksy*st->base.stride + 1023)/1024,
+ (nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
(st->hw_nblocksy*st->base.stride + 1023)/1024);
- st->swbuf = MALLOC(st->base.nblocksy*st->base.stride);
+ st->swbuf = MALLOC(nblocksy*st->base.stride);
if(!st->swbuf)
goto no_swbuf;
}
@@ -994,7 +994,7 @@ svga_get_tex_sampler_view(struct pipe_context *pipe, struct pipe_texture *pt,
if (min_lod == 0 && max_lod >= pt->last_level)
view = FALSE;
- if (pf_is_compressed(pt->format) && view) {
+ if (util_format_is_compressed(pt->format) && view) {
format = svga_translate_format_render(pt->format);
}
@@ -1033,9 +1033,9 @@ svga_get_tex_sampler_view(struct pipe_context *pipe, struct pipe_texture *pt,
"svga: Sampler view: no %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
pt, min_lod, max_lod,
max_lod - min_lod + 1,
- pt->width[0],
- pt->height[0],
- pt->depth[0],
+ pt->width0,
+ pt->height0,
+ pt->depth0,
pt->last_level);
sv->key.cachable = 0;
sv->handle = tex->handle;
@@ -1046,9 +1046,9 @@ svga_get_tex_sampler_view(struct pipe_context *pipe, struct pipe_texture *pt,
"svga: Sampler view: yes %p, mips %u..%u, nr %u, size (%ux%ux%u), last %u\n",
pt, min_lod, max_lod,
max_lod - min_lod + 1,
- pt->width[0],
- pt->height[0],
- pt->depth[0],
+ pt->width0,
+ pt->height0,
+ pt->depth0,
pt->last_level);
sv->age = tex->age;
@@ -1098,9 +1098,9 @@ svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *
svga_texture_copy_handle(svga, NULL,
tex->handle, 0, 0, 0, i, k,
v->handle, 0, 0, 0, i - v->min_lod, k,
- tex->base.width[i],
- tex->base.height[i],
- tex->base.depth[i]);
+ u_minify(tex->base.width0, i),
+ u_minify(tex->base.height0, i),
+ u_minify(tex->base.depth0, i));
}
}
@@ -1133,8 +1133,7 @@ svga_screen_buffer_from_texture(struct pipe_texture *texture,
svga_translate_format(texture->format),
stex->handle);
- *stride = pf_get_nblocksx(&texture->block, texture->width[0]) *
- texture->block.size;
+ *stride = util_format_get_stride(texture->format, texture->width0);
return *buffer != NULL;
}
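Most of the texture changes above replace the per-level width[]/height[]/block fields with the level-0 sizes plus the u_format/u_math helpers. A short sketch of how those helpers recover the same quantities (the function names transfer_stride and level_width are illustrative only):

    #include "util/u_format.h"
    #include "util/u_math.h"

    /* Bytes per row of a transfer: blocks per row times bytes per block. */
    static unsigned
    transfer_stride(enum pipe_format format, unsigned width)
    {
       return util_format_get_nblocksx(format, width) *
              util_format_get_blocksize(format);
    }

    /* Width of mipmap level 'level', derived from the base width. */
    static unsigned
    level_width(unsigned width0, unsigned level)
    {
       return u_minify(width0, level);   /* halves per level, clamps at 1 */
    }

util_format_get_stride(), used at the end of the hunk, is the same nblocksx-times-blocksize product in a single call.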
diff --git a/src/gallium/drivers/svga/svga_screen_texture.h b/src/gallium/drivers/svga/svga_screen_texture.h
index 8cfdfea6937..89ae24219fd 100644
--- a/src/gallium/drivers/svga/svga_screen_texture.h
+++ b/src/gallium/drivers/svga/svga_screen_texture.h
@@ -171,8 +171,9 @@ svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_
{
struct svga_sampler_view *old = *ptr;
- if (pipe_reference((struct pipe_reference **)ptr, &v->reference))
+ if (pipe_reference(&(*ptr)->reference, &v->reference))
svga_destroy_sampler_view_priv(old);
+ *ptr = v;
}
extern void
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index a5777d4fbdc..6b0e511cec1 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -140,8 +140,8 @@ static int emit_fs_consts( struct svga_context *svga,
struct pipe_texture *tex = svga->curr.texture[i];
float data[4];
- data[0] = 1.0 / (float)tex->width[0];
- data[1] = 1.0 / (float)tex->height[0];
+ data[0] = 1.0 / (float)tex->width0;
+ data[1] = 1.0 / (float)tex->height0;
data[2] = 1.0;
data[3] = 1.0;
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 4fe91416778..ec2886348b9 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -40,8 +40,13 @@
static INLINE int compare_fs_keys( const struct svga_fs_compile_key *a,
const struct svga_fs_compile_key *b )
{
- unsigned keysize = svga_fs_key_size( a );
- return memcmp( a, b, keysize );
+ unsigned keysize_a = svga_fs_key_size( a );
+ unsigned keysize_b = svga_fs_key_size( b );
+
+ if (keysize_a != keysize_b) {
+ return (int)(keysize_a - keysize_b);
+ }
+ return memcmp( a, b, keysize_a );
}
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
index 00201b8091d..3c35a8579f7 100644
--- a/src/gallium/drivers/svga/svga_state_need_swtnl.c
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -108,6 +108,7 @@ static int update_need_pipeline( struct svga_context *svga,
{
boolean need_pipeline = FALSE;
+ struct svga_vertex_shader *vs = svga->curr.vs;
/* SVGA_NEW_RAST, SVGA_NEW_REDUCED_PRIMITIVE
*/
@@ -119,11 +120,9 @@ static int update_need_pipeline( struct svga_context *svga,
need_pipeline = TRUE;
}
- /* SVGA_NEW_EDGEFLAGS
+ /* EDGEFLAGS
*/
- if (svga->curr.rast->hw_unfilled != PIPE_POLYGON_MODE_FILL &&
- svga->curr.reduced_prim == PIPE_PRIM_TRIANGLES &&
- svga->curr.edgeflags != NULL) {
+ if (vs->base.info.writes_edgeflag) {
SVGA_DBG(DEBUG_SWTNL, "%s: edgeflags\n", __FUNCTION__);
need_pipeline = TRUE;
}
@@ -150,6 +149,7 @@ struct svga_tracked_state svga_update_need_pipeline =
"need pipeline",
(SVGA_NEW_RAST |
SVGA_NEW_CLIP |
+ SVGA_NEW_VS |
SVGA_NEW_REDUCED_PRIMITIVE),
update_need_pipeline
};
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 5e33c127d92..e7e6c084321 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -25,6 +25,7 @@
#include "pipe/p_inlines.h"
#include "pipe/p_defines.h"
+#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_bitmask.h"
#include "translate/translate.h"
@@ -201,10 +202,12 @@ static int update_zero_stride( struct svga_context *svga,
key.output_stride = 4 * sizeof(float);
key.nr_elements = 1;
+ key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
key.element[0].input_format = vel->src_format;
key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
key.element[0].input_buffer = vel->vertex_buffer_index;
key.element[0].input_offset = vel->src_offset;
+ key.element[0].instance_divisor = vel->instance_divisor;
key.element[0].output_offset = const_idx * 4 * sizeof(float);
translate_key_sanitize(&key);
@@ -218,12 +221,12 @@ static int update_zero_stride( struct svga_context *svga,
mapped_buffer = pipe_buffer_map_range(svga->pipe.screen,
vbuffer->buffer,
vel->src_offset,
- pf_get_size(vel->src_format),
+ util_format_get_blocksize(vel->src_format),
PIPE_BUFFER_USAGE_CPU_READ);
translate->set_buffer(translate, vel->vertex_buffer_index,
mapped_buffer,
vbuffer->stride);
- translate->run(translate, 0, 1,
+ translate->run(translate, 0, 1, 0,
svga->curr.zero_stride_constants);
pipe_buffer_unmap(svga->pipe.screen,
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
index 6729cc73817..0ae58c7d9de 100644
--- a/src/gallium/drivers/svga/svga_swtnl_draw.c
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -89,7 +89,7 @@ svga_swtnl_draw_range_elements(struct svga_context *svga,
PIPE_BUFFER_USAGE_CPU_READ);
assert(map);
draw_set_mapped_constant_buffer(
- draw,
+ draw, PIPE_SHADER_VERTEX,
map,
svga->curr.cb[PIPE_SHADER_VERTEX]->size);
}
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index d57ed82866b..fe03e207fff 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -119,10 +119,6 @@ static int update_swtnl_draw( struct svga_context *svga,
draw_set_mrd(svga->swtnl.draw,
svga->curr.depthscale);
- if (dirty & SVGA_NEW_EDGEFLAGS)
- draw_set_edgeflags( svga->swtnl.draw,
- svga->curr.edgeflags );
-
return 0;
}
@@ -137,8 +133,7 @@ struct svga_tracked_state svga_update_swtnl_draw =
SVGA_NEW_VIEWPORT |
SVGA_NEW_RAST |
SVGA_NEW_FRAME_BUFFER |
- SVGA_NEW_REDUCED_PRIMITIVE |
- SVGA_NEW_EDGEFLAGS),
+ SVGA_NEW_REDUCED_PRIMITIVE),
update_swtnl_draw
};
@@ -160,7 +155,7 @@ int svga_swtnl_update_vdecl( struct svga_context *svga )
memset(vdecl, 0, sizeof(vdecl));
/* always add position */
- src = draw_find_vs_output(draw, TGSI_SEMANTIC_POSITION, 0);
+ src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
vinfo->attrib[0].emit = EMIT_4F;
vdecl[0].array.offset = offset;
@@ -173,7 +168,7 @@ int svga_swtnl_update_vdecl( struct svga_context *svga )
for (i = 0; i < fs->base.info.num_inputs; i++) {
unsigned name = fs->base.info.input_semantic_name[i];
unsigned index = fs->base.info.input_semantic_index[i];
- src = draw_find_vs_output(draw, name, index);
+ src = draw_find_shader_output(draw, name, index);
vdecl[nr_decls].array.offset = offset;
vdecl[nr_decls].identity.usageIndex = fs->base.info.input_semantic_index[i];
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index 896c90a89ae..737a2213af5 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -39,26 +39,24 @@ struct tgsi_token;
struct svga_vs_compile_key
{
- ubyte need_prescale:1;
- ubyte allow_psiz:1;
unsigned zero_stride_vertex_elements;
- ubyte num_zero_stride_vertex_elements:6;
+ unsigned need_prescale:1;
+ unsigned allow_psiz:1;
+ unsigned num_zero_stride_vertex_elements:6;
};
struct svga_fs_compile_key
{
- boolean light_twoside:1;
- boolean front_cw:1;
- ubyte num_textures;
- ubyte num_unnormalized_coords;
+ unsigned light_twoside:1;
+ unsigned front_cw:1;
+ unsigned num_textures:8;
+ unsigned num_unnormalized_coords:8;
struct {
- ubyte compare_mode : 1;
- ubyte compare_func : 3;
- ubyte unnormalized : 1;
-
- ubyte width_height_idx : 7;
-
- ubyte texture_target;
+ unsigned compare_mode:1;
+ unsigned compare_func:3;
+ unsigned unnormalized:1;
+ unsigned width_height_idx:7;
+ unsigned texture_target:8;
} tex[PIPE_MAX_SAMPLERS];
};
@@ -121,8 +119,7 @@ static INLINE unsigned svga_vs_key_size( const struct svga_vs_compile_key *key )
static INLINE unsigned svga_fs_key_size( const struct svga_fs_compile_key *key )
{
- return (const char *)&key->tex[key->num_textures].texture_target -
- (const char *)key;
+ return (const char *)&key->tex[key->num_textures] - (const char *)key;
}
struct svga_shader_result *
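The svga_tgsi.h hunks above matter because the compile keys are variable-length: svga_fs_key_size() measures only up to the last used tex[] entry, so keys with different num_textures have different sizes. A small worked example (variable names illustrative):

    struct svga_fs_compile_key key;
    unsigned size;

    key.num_textures = 2;
    /* Size of the comparable prefix: the header plus two tex[] entries. */
    size = (const char *)&key.tex[key.num_textures] - (const char *)&key;

This is also why compare_fs_keys() in svga_state_fs.c now compares the two sizes before calling memcmp(): a memcmp() over the larger size would compare bytes beyond the shorter key's meaningful contents.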
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
index 0c3430127aa..1ae99067610 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm20.c
@@ -43,7 +43,7 @@ static boolean ps20_input( struct svga_shader_emitter *emit,
dcl.values[0] = 0;
dcl.values[1] = 0;
- switch (semantic.SemanticName) {
+ switch (semantic.Name) {
case TGSI_SEMANTIC_POSITION:
/* Special case:
*/
@@ -52,15 +52,15 @@ static boolean ps20_input( struct svga_shader_emitter *emit,
break;
case TGSI_SEMANTIC_COLOR:
reg = src_register( SVGA3DREG_INPUT,
- semantic.SemanticIndex );
+ semantic.Index );
break;
case TGSI_SEMANTIC_FOG:
- assert(semantic.SemanticIndex == 0);
+ assert(semantic.Index == 0);
reg = src_register( SVGA3DREG_TEXTURE, 0 );
break;
case TGSI_SEMANTIC_GENERIC:
reg = src_register( SVGA3DREG_TEXTURE,
- semantic.SemanticIndex + 1 );
+ semantic.Index + 1 );
break;
default:
assert(0);
@@ -87,16 +87,16 @@ static boolean ps20_output( struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken reg;
- switch (semantic.SemanticName) {
+ switch (semantic.Name) {
case TGSI_SEMANTIC_COLOR:
- if (semantic.SemanticIndex < PIPE_MAX_COLOR_BUFS) {
- unsigned cbuf = semantic.SemanticIndex;
+ if (semantic.Index < PIPE_MAX_COLOR_BUFS) {
+ unsigned cbuf = semantic.Index;
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_col[cbuf] = emit->output_map[idx];
emit->true_col[cbuf] = dst_register( SVGA3DREG_COLOROUT,
- semantic.SemanticIndex );
+ semantic.Index );
}
else {
assert(0);
@@ -108,7 +108,7 @@ static boolean ps20_output( struct svga_shader_emitter *emit,
emit->nr_hw_temp++ );
emit->temp_pos = emit->output_map[idx];
emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT,
- semantic.SemanticIndex );
+ semantic.Index );
break;
default:
assert(0);
@@ -166,9 +166,9 @@ static boolean vs20_output( struct svga_shader_emitter *emit,
/* Just build the register map table:
*/
- switch (semantic.SemanticName) {
+ switch (semantic.Name) {
case TGSI_SEMANTIC_POSITION:
- assert(semantic.SemanticIndex == 0);
+ assert(semantic.Index == 0);
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_pos = emit->output_map[idx];
@@ -176,7 +176,7 @@ static boolean vs20_output( struct svga_shader_emitter *emit,
SVGA3DRASTOUT_POSITION);
break;
case TGSI_SEMANTIC_PSIZE:
- assert(semantic.SemanticIndex == 0);
+ assert(semantic.Index == 0);
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_psiz = emit->output_map[idx];
@@ -184,17 +184,17 @@ static boolean vs20_output( struct svga_shader_emitter *emit,
SVGA3DRASTOUT_PSIZE );
break;
case TGSI_SEMANTIC_FOG:
- assert(semantic.SemanticIndex == 0);
+ assert(semantic.Index == 0);
emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT, 0 );
break;
case TGSI_SEMANTIC_COLOR:
/* oD0 */
emit->output_map[idx] = dst_register( SVGA3DREG_ATTROUT,
- semantic.SemanticIndex );
+ semantic.Index );
break;
case TGSI_SEMANTIC_GENERIC:
emit->output_map[idx] = dst_register( SVGA3DREG_TEXCRDOUT,
- semantic.SemanticIndex + 1 );
+ semantic.Index + 1 );
break;
default:
assert(0);
@@ -227,15 +227,15 @@ static boolean ps20_sampler( struct svga_shader_emitter *emit,
boolean svga_translate_decl_sm20( struct svga_shader_emitter *emit,
const struct tgsi_full_declaration *decl )
{
- unsigned first = decl->DeclarationRange.First;
- unsigned last = decl->DeclarationRange.Last;
+ unsigned first = decl->Range.First;
+ unsigned last = decl->Range.Last;
unsigned semantic = 0;
unsigned semantic_idx = 0;
unsigned idx;
if (decl->Declaration.Semantic) {
- semantic = decl->Semantic.SemanticName;
- semantic_idx = decl->Semantic.SemanticIndex;
+ semantic = decl->Semantic.Name;
+ semantic_idx = decl->Semantic.Index;
}
for( idx = first; idx <= last; idx++ ) {
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 2291cf116dd..43fc0d32359 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -34,35 +34,35 @@ static boolean translate_vs_ps_semantic( struct tgsi_declaration_semantic semant
unsigned *usage,
unsigned *idx )
{
- switch (semantic.SemanticName) {
+ switch (semantic.Name) {
case TGSI_SEMANTIC_POSITION:
- *idx = semantic.SemanticIndex;
+ *idx = semantic.Index;
*usage = SVGA3D_DECLUSAGE_POSITION;
break;
case TGSI_SEMANTIC_COLOR:
- *idx = semantic.SemanticIndex;
+ *idx = semantic.Index;
*usage = SVGA3D_DECLUSAGE_COLOR;
break;
case TGSI_SEMANTIC_BCOLOR:
- *idx = semantic.SemanticIndex + 2; /* sharing with COLOR */
+ *idx = semantic.Index + 2; /* sharing with COLOR */
*usage = SVGA3D_DECLUSAGE_COLOR;
break;
case TGSI_SEMANTIC_FOG:
*idx = 0;
- assert(semantic.SemanticIndex == 0);
+ assert(semantic.Index == 0);
*usage = SVGA3D_DECLUSAGE_TEXCOORD;
break;
case TGSI_SEMANTIC_PSIZE:
- *idx = semantic.SemanticIndex;
+ *idx = semantic.Index;
*usage = SVGA3D_DECLUSAGE_PSIZE;
break;
case TGSI_SEMANTIC_GENERIC:
- *idx = semantic.SemanticIndex + 1; /* texcoord[0] is reserved for fog */
+ *idx = semantic.Index + 1; /* texcoord[0] is reserved for fog */
*usage = SVGA3D_DECLUSAGE_TEXCOORD;
break;
case TGSI_SEMANTIC_NORMAL:
- *idx = semantic.SemanticIndex;
+ *idx = semantic.Index;
*usage = SVGA3D_DECLUSAGE_NORMAL;
break;
default:
@@ -119,7 +119,7 @@ static boolean ps30_input( struct svga_shader_emitter *emit,
unsigned usage, index;
SVGA3dShaderDestToken reg;
- if (semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+ if (semantic.Name == TGSI_SEMANTIC_POSITION) {
emit->input_map[idx] = src_register( SVGA3DREG_MISCTYPE,
SVGA3DMISCREG_POSITION );
@@ -134,7 +134,7 @@ static boolean ps30_input( struct svga_shader_emitter *emit,
return emit_decl( emit, reg, 0, 0 );
}
else if (emit->key.fkey.light_twoside &&
- (semantic.SemanticName == TGSI_SEMANTIC_COLOR)) {
+ (semantic.Name == TGSI_SEMANTIC_COLOR)) {
if (!translate_vs_ps_semantic( semantic, &usage, &index ))
return FALSE;
@@ -149,7 +149,7 @@ static boolean ps30_input( struct svga_shader_emitter *emit,
if (!emit_decl( emit, reg, usage, index ))
return FALSE;
- semantic.SemanticName = TGSI_SEMANTIC_BCOLOR;
+ semantic.Name = TGSI_SEMANTIC_BCOLOR;
if (!translate_vs_ps_semantic( semantic, &usage, &index ))
return FALSE;
@@ -163,7 +163,7 @@ static boolean ps30_input( struct svga_shader_emitter *emit,
return TRUE;
}
- else if (semantic.SemanticName == TGSI_SEMANTIC_FACE) {
+ else if (semantic.Name == TGSI_SEMANTIC_FACE) {
if (!emit_vface_decl( emit ))
return FALSE;
emit->emit_frontface = TRUE;
@@ -192,17 +192,17 @@ static boolean ps30_output( struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken reg;
- switch (semantic.SemanticName) {
+ switch (semantic.Name) {
case TGSI_SEMANTIC_COLOR:
emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT,
- semantic.SemanticIndex );
+ semantic.Index );
break;
case TGSI_SEMANTIC_POSITION:
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_pos = emit->output_map[idx];
emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT,
- semantic.SemanticIndex );
+ semantic.Index );
break;
default:
assert(0);
@@ -282,14 +282,14 @@ static boolean vs30_output( struct svga_shader_emitter *emit,
dcl.index = index;
dcl.values[0] |= 1<<31;
- if (semantic.SemanticName == TGSI_SEMANTIC_POSITION) {
+ if (semantic.Name == TGSI_SEMANTIC_POSITION) {
assert(idx == 0);
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_pos = emit->output_map[idx];
emit->true_pos = dcl.dst;
}
- else if (semantic.SemanticName == TGSI_SEMANTIC_PSIZE) {
+ else if (semantic.Name == TGSI_SEMANTIC_PSIZE) {
emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
emit->nr_hw_temp++ );
emit->temp_psiz = emit->output_map[idx];
@@ -334,15 +334,15 @@ static boolean ps30_sampler( struct svga_shader_emitter *emit,
boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit,
const struct tgsi_full_declaration *decl )
{
- unsigned first = decl->DeclarationRange.First;
- unsigned last = decl->DeclarationRange.Last;
+ unsigned first = decl->Range.First;
+ unsigned last = decl->Range.Last;
unsigned semantic = 0;
unsigned semantic_idx = 0;
unsigned idx;
if (decl->Declaration.Semantic) {
- semantic = decl->Semantic.SemanticName;
- semantic_idx = decl->Semantic.SemanticIndex;
+ semantic = decl->Semantic.Name;
+ semantic_idx = decl->Semantic.Index;
}
for( idx = first; idx <= last; idx++ ) {
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index ea409b7e165..dc5eb8fc606 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -96,24 +96,24 @@ translate_dst_register( struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn,
unsigned idx )
{
- const struct tgsi_full_dst_register *reg = &insn->FullDstRegisters[idx];
+ const struct tgsi_full_dst_register *reg = &insn->Dst[idx];
SVGA3dShaderDestToken dest;
- switch (reg->DstRegister.File) {
+ switch (reg->Register.File) {
case TGSI_FILE_OUTPUT:
/* Output registers encode semantic information in their name.
* Need to lookup a table built at decl time:
*/
- dest = emit->output_map[reg->DstRegister.Index];
+ dest = emit->output_map[reg->Register.Index];
break;
default:
- dest = dst_register( translate_file( reg->DstRegister.File ),
- reg->DstRegister.Index );
+ dest = dst_register( translate_file( reg->Register.File ),
+ reg->Register.Index );
break;
}
- dest.mask = reg->DstRegister.WriteMask;
+ dest.mask = reg->Register.WriteMask;
if (insn->Instruction.Saturate)
dest.dstMod = SVGA3DDSTMOD_SATURATE;
@@ -176,33 +176,33 @@ translate_src_register( const struct svga_shader_emitter *emit,
{
struct src_register src;
- switch (reg->SrcRegister.File) {
+ switch (reg->Register.File) {
case TGSI_FILE_INPUT:
/* Input registers are referred to by their semantic name rather
* than by index. Use the mapping built up from the decls:
*/
- src = emit->input_map[reg->SrcRegister.Index];
+ src = emit->input_map[reg->Register.Index];
break;
case TGSI_FILE_IMMEDIATE:
/* Immediates are appended after TGSI constants in the D3D
* constant buffer.
*/
- src = src_register( translate_file( reg->SrcRegister.File ),
- reg->SrcRegister.Index +
+ src = src_register( translate_file( reg->Register.File ),
+ reg->Register.Index +
emit->imm_start );
break;
default:
- src = src_register( translate_file( reg->SrcRegister.File ),
- reg->SrcRegister.Index );
+ src = src_register( translate_file( reg->Register.File ),
+ reg->Register.Index );
break;
}
/* Indirect addressing (for constant buffer lookups only)
*/
- if (reg->SrcRegister.Indirect)
+ if (reg->Register.Indirect)
{
/* we shift the offset towards the minimum */
if (svga_arl_needs_adjustment( emit )) {
@@ -213,28 +213,28 @@ translate_src_register( const struct svga_shader_emitter *emit,
/* Not really sure what should go in the second token:
*/
src.indirect = src_token( SVGA3DREG_ADDR,
- reg->SrcRegisterInd.Index );
+ reg->Indirect.Index );
src.indirect.swizzle = SWIZZLE_XXXX;
}
src = swizzle( src,
- reg->SrcRegister.SwizzleX,
- reg->SrcRegister.SwizzleY,
- reg->SrcRegister.SwizzleZ,
- reg->SrcRegister.SwizzleW );
+ reg->Register.SwizzleX,
+ reg->Register.SwizzleY,
+ reg->Register.SwizzleZ,
+ reg->Register.SwizzleW );
/* src.mod isn't a bitfield, unfortunately:
* See tgsi_util_get_full_src_register_sign_mode for implementation details.
*/
- if (reg->SrcRegisterExtMod.Absolute) {
- if (reg->SrcRegisterExtMod.Negate)
+ if (reg->Register.Absolute) {
+ if (reg->Register.Negate)
src.base.srcMod = SVGA3DSRCMOD_ABSNEG;
else
src.base.srcMod = SVGA3DSRCMOD_ABS;
}
else {
- if (reg->SrcRegister.Negate != reg->SrcRegisterExtMod.Negate)
+ if (reg->Register.Negate)
src.base.srcMod = SVGA3DSRCMOD_NEG;
else
src.base.srcMod = SVGA3DSRCMOD_NONE;
@@ -629,7 +629,7 @@ static boolean emit_fake_arl(struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn)
{
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register src1 = get_fake_arl_const( emit );
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
SVGA3dShaderDestToken tmp = get_temp( emit );
@@ -653,7 +653,7 @@ static boolean emit_if(struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn)
{
const struct src_register src = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register zero = get_zero_immediate( emit );
SVGA3dShaderInstToken if_token = inst_token( SVGA3DOP_IFC );
@@ -690,7 +690,7 @@ static boolean emit_floor(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* FRC TMP, SRC */
@@ -716,11 +716,11 @@ static boolean emit_cmp(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
const struct src_register src2 = translate_src_register(
- emit, &insn->FullSrcRegisters[2] );
+ emit, &insn->Src[2] );
/* CMP DST, SRC0, SRC2, SRC1 */
return submit_op3( emit, inst_token( SVGA3DOP_CMP ), dst, src0, src2, src1);
@@ -740,9 +740,9 @@ static boolean emit_div(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
SVGA3dShaderDestToken temp = get_temp( emit );
int i;
@@ -782,9 +782,9 @@ static boolean emit_dp2(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
SVGA3dShaderDestToken temp = get_temp( emit );
struct src_register temp_src0, temp_src1;
@@ -815,9 +815,9 @@ static boolean emit_dph(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* DP3 TMP, SRC1, SRC2 */
@@ -846,7 +846,7 @@ static boolean emit_nrm(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* DP3 TMP, SRC, SRC */
@@ -889,7 +889,7 @@ static boolean emit_sincos(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* SCS TMP SRC */
@@ -912,7 +912,7 @@ static boolean emit_sin(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* SCS TMP SRC */
@@ -937,7 +937,7 @@ static boolean emit_cos(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
SVGA3dShaderDestToken temp = get_temp( emit );
/* SCS TMP SRC */
@@ -962,9 +962,9 @@ static boolean emit_sub(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
src1 = negate(src1);
@@ -980,19 +980,19 @@ static boolean emit_kil(struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn )
{
SVGA3dShaderInstToken inst;
- const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[0];
+ const struct tgsi_full_src_register *reg = &insn->Src[0];
struct src_register src0;
inst = inst_token( SVGA3DOP_TEXKILL );
src0 = translate_src_register( emit, reg );
- if (reg->SrcRegisterExtMod.Absolute ||
- reg->SrcRegister.Negate != reg->SrcRegisterExtMod.Negate ||
- reg->SrcRegister.Indirect ||
- reg->SrcRegister.SwizzleX != 0 ||
- reg->SrcRegister.SwizzleY != 1 ||
- reg->SrcRegister.SwizzleZ != 2 ||
- reg->SrcRegister.File != TGSI_FILE_TEMPORARY)
+ if (reg->Register.Absolute ||
+ reg->Register.Negate ||
+ reg->Register.Indirect ||
+ reg->Register.SwizzleX != 0 ||
+ reg->Register.SwizzleY != 1 ||
+ reg->Register.SwizzleZ != 2 ||
+ reg->Register.File != TGSI_FILE_TEMPORARY)
{
SVGA3dShaderDestToken temp = get_temp( emit );
@@ -1154,9 +1154,9 @@ static boolean emit_select_op(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
return emit_select( emit, compare, dst, src0, src1 );
}
@@ -1189,8 +1189,8 @@ static boolean emit_tex2(struct svga_shader_emitter *emit,
return FALSE;
}
- src0 = translate_src_register( emit, &insn->FullSrcRegisters[0] );
- src1 = translate_src_register( emit, &insn->FullSrcRegisters[1] );
+ src0 = translate_src_register( emit, &insn->Src[0] );
+ src1 = translate_src_register( emit, &insn->Src[1] );
if (emit->key.fkey.tex[src1.base.num].unnormalized) {
struct src_register wh = get_tex_dimensions( emit, src1.base.num );
@@ -1231,9 +1231,9 @@ static boolean emit_tex3(struct svga_shader_emitter *emit,
break;
}
- src0 = translate_src_register( emit, &insn->FullSrcRegisters[0] );
- src1 = translate_src_register( emit, &insn->FullSrcRegisters[1] );
- src2 = translate_src_register( emit, &insn->FullSrcRegisters[2] );
+ src0 = translate_src_register( emit, &insn->Src[0] );
+ src1 = translate_src_register( emit, &insn->Src[1] );
+ src2 = translate_src_register( emit, &insn->Src[2] );
return submit_op3( emit, inst, dst, src0, src1, src2 );
}
@@ -1245,9 +1245,9 @@ static boolean emit_tex(struct svga_shader_emitter *emit,
SVGA3dShaderDestToken dst =
translate_dst_register( emit, insn, 0 );
struct src_register src0 =
- translate_src_register( emit, &insn->FullSrcRegisters[0] );
+ translate_src_register( emit, &insn->Src[0] );
struct src_register src1 =
- translate_src_register( emit, &insn->FullSrcRegisters[1] );
+ translate_src_register( emit, &insn->Src[1] );
SVGA3dShaderDestToken tex_result;
@@ -1359,7 +1359,7 @@ static boolean emit_scalar_op1( struct svga_shader_emitter *emit,
inst = inst_token( opcode );
dst = translate_dst_register( emit, insn, 0 );
- src = translate_src_register( emit, &insn->FullSrcRegisters[0] );
+ src = translate_src_register( emit, &insn->Src[0] );
src = scalar( src, TGSI_SWIZZLE_X );
return submit_op1( emit, inst, dst, src );
@@ -1370,7 +1370,7 @@ static boolean emit_simple_instruction(struct svga_shader_emitter *emit,
unsigned opcode,
const struct tgsi_full_instruction *insn )
{
- const struct tgsi_full_src_register *src = insn->FullSrcRegisters;
+ const struct tgsi_full_src_register *src = insn->Src;
SVGA3dShaderInstToken inst;
SVGA3dShaderDestToken dst;
@@ -1428,13 +1428,13 @@ static boolean emit_pow(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
boolean need_tmp = FALSE;
/* POW can only output to a temporary */
- if (insn->FullDstRegisters[0].DstRegister.File != TGSI_FILE_TEMPORARY)
+ if (insn->Dst[0].Register.File != TGSI_FILE_TEMPORARY)
need_tmp = TRUE;
/* POW src1 must not be the same register as dst */
@@ -1463,9 +1463,9 @@ static boolean emit_xpd(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
boolean need_dst_tmp = FALSE;
/* XPD can only output to a temporary */
@@ -1517,11 +1517,11 @@ static boolean emit_lrp(struct svga_shader_emitter *emit,
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
SVGA3dShaderDestToken tmp;
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
const struct src_register src2 = translate_src_register(
- emit, &insn->FullSrcRegisters[2] );
+ emit, &insn->Src[2] );
boolean need_dst_tmp = FALSE;
/* The dst reg must not be the same as src0 or src2 */
@@ -1568,9 +1568,9 @@ static boolean emit_dst_insn(struct svga_shader_emitter *emit,
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
SVGA3dShaderDestToken tmp;
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
const struct src_register src1 = translate_src_register(
- emit, &insn->FullSrcRegisters[1] );
+ emit, &insn->Src[1] );
struct src_register zero = get_zero_immediate( emit );
boolean need_tmp = FALSE;
@@ -1633,7 +1633,7 @@ static boolean emit_exp(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 =
- translate_src_register( emit, &insn->FullSrcRegisters[0] );
+ translate_src_register( emit, &insn->Src[0] );
struct src_register zero = get_zero_immediate( emit );
SVGA3dShaderDestToken fraction;
@@ -1723,7 +1723,7 @@ static boolean emit_lit(struct svga_shader_emitter *emit,
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
SVGA3dShaderDestToken tmp = get_temp( emit );
const struct src_register src0 = translate_src_register(
- emit, &insn->FullSrcRegisters[0] );
+ emit, &insn->Src[0] );
struct src_register zero = get_zero_immediate( emit );
/* tmp = pow(src.y, src.w)
@@ -1806,7 +1806,7 @@ static boolean emit_ex2( struct svga_shader_emitter *emit,
inst = inst_token( SVGA3DOP_EXP );
dst = translate_dst_register( emit, insn, 0 );
- src0 = translate_src_register( emit, &insn->FullSrcRegisters[0] );
+ src0 = translate_src_register( emit, &insn->Src[0] );
src0 = scalar( src0, TGSI_SWIZZLE_X );
if (dst.mask != TGSI_WRITEMASK_XYZW) {
@@ -1829,7 +1829,7 @@ static boolean emit_log(struct svga_shader_emitter *emit,
{
SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
struct src_register src0 =
- translate_src_register( emit, &insn->FullSrcRegisters[0] );
+ translate_src_register( emit, &insn->Src[0] );
struct src_register zero = get_zero_immediate( emit );
SVGA3dShaderDestToken abs_tmp;
struct src_register abs_src0;
@@ -1953,7 +1953,7 @@ static boolean emit_bgnsub( struct svga_shader_emitter *emit,
static boolean emit_call( struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn )
{
- unsigned position = insn->InstructionExtLabel.Label;
+ unsigned position = insn->Label.Label;
unsigned i;
for (i = 0; i < emit->nr_labels; i++) {
@@ -2109,7 +2109,7 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
case TGSI_OPCODE_I2F:
case TGSI_OPCODE_NOT:
case TGSI_OPCODE_SHL:
- case TGSI_OPCODE_SHR:
+ case TGSI_OPCODE_ISHR:
case TGSI_OPCODE_XOR:
return FALSE;
@@ -2543,27 +2543,27 @@ pre_parse_instruction( struct svga_shader_emitter *emit,
const struct tgsi_full_instruction *insn,
int current_arl)
{
- if (insn->FullSrcRegisters[0].SrcRegister.Indirect &&
- insn->FullSrcRegisters[0].SrcRegisterInd.File == TGSI_FILE_ADDRESS) {
- const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[0];
- if (reg->SrcRegister.Index < 0) {
- pre_parse_add_indirect(emit, reg->SrcRegister.Index, current_arl);
+ if (insn->Src[0].Register.Indirect &&
+ insn->Src[0].Indirect.File == TGSI_FILE_ADDRESS) {
+ const struct tgsi_full_src_register *reg = &insn->Src[0];
+ if (reg->Register.Index < 0) {
+ pre_parse_add_indirect(emit, reg->Register.Index, current_arl);
}
}
- if (insn->FullSrcRegisters[1].SrcRegister.Indirect &&
- insn->FullSrcRegisters[1].SrcRegisterInd.File == TGSI_FILE_ADDRESS) {
- const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[1];
- if (reg->SrcRegister.Index < 0) {
- pre_parse_add_indirect(emit, reg->SrcRegister.Index, current_arl);
+ if (insn->Src[1].Register.Indirect &&
+ insn->Src[1].Indirect.File == TGSI_FILE_ADDRESS) {
+ const struct tgsi_full_src_register *reg = &insn->Src[1];
+ if (reg->Register.Index < 0) {
+ pre_parse_add_indirect(emit, reg->Register.Index, current_arl);
}
}
- if (insn->FullSrcRegisters[2].SrcRegister.Indirect &&
- insn->FullSrcRegisters[2].SrcRegisterInd.File == TGSI_FILE_ADDRESS) {
- const struct tgsi_full_src_register *reg = &insn->FullSrcRegisters[2];
- if (reg->SrcRegister.Index < 0) {
- pre_parse_add_indirect(emit, reg->SrcRegister.Index, current_arl);
+ if (insn->Src[2].Register.Indirect &&
+ insn->Src[2].Indirect.File == TGSI_FILE_ADDRESS) {
+ const struct tgsi_full_src_register *reg = &insn->Src[2];
+ if (reg->Register.Index < 0) {
+ pre_parse_add_indirect(emit, reg->Register.Index, current_arl);
}
}
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.c b/src/gallium/drivers/svga/svgadump/svga_dump.c
index 910afa25287..d59fb89a58c 100644
--- a/src/gallium/drivers/svga/svgadump/svga_dump.c
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.c
@@ -42,554 +42,554 @@ dump_SVGA3dVertexDecl(const SVGA3dVertexDecl *cmd)
{
switch((*cmd).identity.type) {
case SVGA3D_DECLTYPE_FLOAT1:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT1\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT1\n");
break;
case SVGA3D_DECLTYPE_FLOAT2:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT2\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT2\n");
break;
case SVGA3D_DECLTYPE_FLOAT3:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT3\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT3\n");
break;
case SVGA3D_DECLTYPE_FLOAT4:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT4\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT4\n");
break;
case SVGA3D_DECLTYPE_D3DCOLOR:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_D3DCOLOR\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_D3DCOLOR\n");
break;
case SVGA3D_DECLTYPE_UBYTE4:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4\n");
break;
case SVGA3D_DECLTYPE_SHORT2:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2\n");
break;
case SVGA3D_DECLTYPE_SHORT4:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4\n");
break;
case SVGA3D_DECLTYPE_UBYTE4N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UBYTE4N\n");
break;
case SVGA3D_DECLTYPE_SHORT2N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT2N\n");
break;
case SVGA3D_DECLTYPE_SHORT4N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_SHORT4N\n");
break;
case SVGA3D_DECLTYPE_USHORT2N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT2N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT2N\n");
break;
case SVGA3D_DECLTYPE_USHORT4N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT4N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_USHORT4N\n");
break;
case SVGA3D_DECLTYPE_UDEC3:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UDEC3\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_UDEC3\n");
break;
case SVGA3D_DECLTYPE_DEC3N:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_DEC3N\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_DEC3N\n");
break;
case SVGA3D_DECLTYPE_FLOAT16_2:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_2\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_2\n");
break;
case SVGA3D_DECLTYPE_FLOAT16_4:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_4\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_FLOAT16_4\n");
break;
case SVGA3D_DECLTYPE_MAX:
- debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_MAX\n");
+ _debug_printf("\t\t.identity.type = SVGA3D_DECLTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.identity.type = %i\n", (*cmd).identity.type);
+ _debug_printf("\t\t.identity.type = %i\n", (*cmd).identity.type);
break;
}
switch((*cmd).identity.method) {
case SVGA3D_DECLMETHOD_DEFAULT:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_DEFAULT\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_DEFAULT\n");
break;
case SVGA3D_DECLMETHOD_PARTIALU:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALU\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALU\n");
break;
case SVGA3D_DECLMETHOD_PARTIALV:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALV\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_PARTIALV\n");
break;
case SVGA3D_DECLMETHOD_CROSSUV:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_CROSSUV\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_CROSSUV\n");
break;
case SVGA3D_DECLMETHOD_UV:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_UV\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_UV\n");
break;
case SVGA3D_DECLMETHOD_LOOKUP:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUP\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUP\n");
break;
case SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED:
- debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED\n");
+ _debug_printf("\t\t.identity.method = SVGA3D_DECLMETHOD_LOOKUPPRESAMPLED\n");
break;
default:
- debug_printf("\t\t.identity.method = %i\n", (*cmd).identity.method);
+ _debug_printf("\t\t.identity.method = %i\n", (*cmd).identity.method);
break;
}
switch((*cmd).identity.usage) {
case SVGA3D_DECLUSAGE_POSITION:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITION\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITION\n");
break;
case SVGA3D_DECLUSAGE_BLENDWEIGHT:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDWEIGHT\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDWEIGHT\n");
break;
case SVGA3D_DECLUSAGE_BLENDINDICES:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDINDICES\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BLENDINDICES\n");
break;
case SVGA3D_DECLUSAGE_NORMAL:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_NORMAL\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_NORMAL\n");
break;
case SVGA3D_DECLUSAGE_PSIZE:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_PSIZE\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_PSIZE\n");
break;
case SVGA3D_DECLUSAGE_TEXCOORD:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TEXCOORD\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TEXCOORD\n");
break;
case SVGA3D_DECLUSAGE_TANGENT:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TANGENT\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TANGENT\n");
break;
case SVGA3D_DECLUSAGE_BINORMAL:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BINORMAL\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_BINORMAL\n");
break;
case SVGA3D_DECLUSAGE_TESSFACTOR:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TESSFACTOR\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_TESSFACTOR\n");
break;
case SVGA3D_DECLUSAGE_POSITIONT:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITIONT\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_POSITIONT\n");
break;
case SVGA3D_DECLUSAGE_COLOR:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_COLOR\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_COLOR\n");
break;
case SVGA3D_DECLUSAGE_FOG:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_FOG\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_FOG\n");
break;
case SVGA3D_DECLUSAGE_DEPTH:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_DEPTH\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_DEPTH\n");
break;
case SVGA3D_DECLUSAGE_SAMPLE:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_SAMPLE\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_SAMPLE\n");
break;
case SVGA3D_DECLUSAGE_MAX:
- debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_MAX\n");
+ _debug_printf("\t\t.identity.usage = SVGA3D_DECLUSAGE_MAX\n");
break;
default:
- debug_printf("\t\t.identity.usage = %i\n", (*cmd).identity.usage);
+ _debug_printf("\t\t.identity.usage = %i\n", (*cmd).identity.usage);
break;
}
- debug_printf("\t\t.identity.usageIndex = %u\n", (*cmd).identity.usageIndex);
- debug_printf("\t\t.array.surfaceId = %u\n", (*cmd).array.surfaceId);
- debug_printf("\t\t.array.offset = %u\n", (*cmd).array.offset);
- debug_printf("\t\t.array.stride = %u\n", (*cmd).array.stride);
- debug_printf("\t\t.rangeHint.first = %u\n", (*cmd).rangeHint.first);
- debug_printf("\t\t.rangeHint.last = %u\n", (*cmd).rangeHint.last);
+ _debug_printf("\t\t.identity.usageIndex = %u\n", (*cmd).identity.usageIndex);
+ _debug_printf("\t\t.array.surfaceId = %u\n", (*cmd).array.surfaceId);
+ _debug_printf("\t\t.array.offset = %u\n", (*cmd).array.offset);
+ _debug_printf("\t\t.array.stride = %u\n", (*cmd).array.stride);
+ _debug_printf("\t\t.rangeHint.first = %u\n", (*cmd).rangeHint.first);
+ _debug_printf("\t\t.rangeHint.last = %u\n", (*cmd).rangeHint.last);
}
static void
dump_SVGA3dTextureState(const SVGA3dTextureState *cmd)
{
- debug_printf("\t\t.stage = %u\n", (*cmd).stage);
+ _debug_printf("\t\t.stage = %u\n", (*cmd).stage);
switch((*cmd).name) {
case SVGA3D_TS_INVALID:
- debug_printf("\t\t.name = SVGA3D_TS_INVALID\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_INVALID\n");
break;
case SVGA3D_TS_BIND_TEXTURE:
- debug_printf("\t\t.name = SVGA3D_TS_BIND_TEXTURE\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BIND_TEXTURE\n");
break;
case SVGA3D_TS_COLOROP:
- debug_printf("\t\t.name = SVGA3D_TS_COLOROP\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_COLOROP\n");
break;
case SVGA3D_TS_COLORARG1:
- debug_printf("\t\t.name = SVGA3D_TS_COLORARG1\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_COLORARG1\n");
break;
case SVGA3D_TS_COLORARG2:
- debug_printf("\t\t.name = SVGA3D_TS_COLORARG2\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_COLORARG2\n");
break;
case SVGA3D_TS_ALPHAOP:
- debug_printf("\t\t.name = SVGA3D_TS_ALPHAOP\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ALPHAOP\n");
break;
case SVGA3D_TS_ALPHAARG1:
- debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG1\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG1\n");
break;
case SVGA3D_TS_ALPHAARG2:
- debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG2\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG2\n");
break;
case SVGA3D_TS_ADDRESSU:
- debug_printf("\t\t.name = SVGA3D_TS_ADDRESSU\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSU\n");
break;
case SVGA3D_TS_ADDRESSV:
- debug_printf("\t\t.name = SVGA3D_TS_ADDRESSV\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSV\n");
break;
case SVGA3D_TS_MIPFILTER:
- debug_printf("\t\t.name = SVGA3D_TS_MIPFILTER\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_MIPFILTER\n");
break;
case SVGA3D_TS_MAGFILTER:
- debug_printf("\t\t.name = SVGA3D_TS_MAGFILTER\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_MAGFILTER\n");
break;
case SVGA3D_TS_MINFILTER:
- debug_printf("\t\t.name = SVGA3D_TS_MINFILTER\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_MINFILTER\n");
break;
case SVGA3D_TS_BORDERCOLOR:
- debug_printf("\t\t.name = SVGA3D_TS_BORDERCOLOR\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BORDERCOLOR\n");
break;
case SVGA3D_TS_TEXCOORDINDEX:
- debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDINDEX\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDINDEX\n");
break;
case SVGA3D_TS_TEXTURETRANSFORMFLAGS:
- debug_printf("\t\t.name = SVGA3D_TS_TEXTURETRANSFORMFLAGS\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXTURETRANSFORMFLAGS\n");
break;
case SVGA3D_TS_TEXCOORDGEN:
- debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDGEN\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXCOORDGEN\n");
break;
case SVGA3D_TS_BUMPENVMAT00:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT00\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT00\n");
break;
case SVGA3D_TS_BUMPENVMAT01:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT01\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT01\n");
break;
case SVGA3D_TS_BUMPENVMAT10:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT10\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT10\n");
break;
case SVGA3D_TS_BUMPENVMAT11:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT11\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVMAT11\n");
break;
case SVGA3D_TS_TEXTURE_MIPMAP_LEVEL:
- debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_MIPMAP_LEVEL\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_MIPMAP_LEVEL\n");
break;
case SVGA3D_TS_TEXTURE_LOD_BIAS:
- debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_LOD_BIAS\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_LOD_BIAS\n");
break;
case SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL:
- debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_TEXTURE_ANISOTROPIC_LEVEL\n");
break;
case SVGA3D_TS_ADDRESSW:
- debug_printf("\t\t.name = SVGA3D_TS_ADDRESSW\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ADDRESSW\n");
break;
case SVGA3D_TS_GAMMA:
- debug_printf("\t\t.name = SVGA3D_TS_GAMMA\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_GAMMA\n");
break;
case SVGA3D_TS_BUMPENVLSCALE:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLSCALE\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLSCALE\n");
break;
case SVGA3D_TS_BUMPENVLOFFSET:
- debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLOFFSET\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_BUMPENVLOFFSET\n");
break;
case SVGA3D_TS_COLORARG0:
- debug_printf("\t\t.name = SVGA3D_TS_COLORARG0\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_COLORARG0\n");
break;
case SVGA3D_TS_ALPHAARG0:
- debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG0\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_ALPHAARG0\n");
break;
case SVGA3D_TS_MAX:
- debug_printf("\t\t.name = SVGA3D_TS_MAX\n");
+ _debug_printf("\t\t.name = SVGA3D_TS_MAX\n");
break;
default:
- debug_printf("\t\t.name = %i\n", (*cmd).name);
+ _debug_printf("\t\t.name = %i\n", (*cmd).name);
break;
}
- debug_printf("\t\t.value = %u\n", (*cmd).value);
- debug_printf("\t\t.floatValue = %f\n", (*cmd).floatValue);
+ _debug_printf("\t\t.value = %u\n", (*cmd).value);
+ _debug_printf("\t\t.floatValue = %f\n", (*cmd).floatValue);
}
static void
dump_SVGA3dCopyBox(const SVGA3dCopyBox *cmd)
{
- debug_printf("\t\t.x = %u\n", (*cmd).x);
- debug_printf("\t\t.y = %u\n", (*cmd).y);
- debug_printf("\t\t.z = %u\n", (*cmd).z);
- debug_printf("\t\t.w = %u\n", (*cmd).w);
- debug_printf("\t\t.h = %u\n", (*cmd).h);
- debug_printf("\t\t.d = %u\n", (*cmd).d);
- debug_printf("\t\t.srcx = %u\n", (*cmd).srcx);
- debug_printf("\t\t.srcy = %u\n", (*cmd).srcy);
- debug_printf("\t\t.srcz = %u\n", (*cmd).srcz);
+ _debug_printf("\t\t.x = %u\n", (*cmd).x);
+ _debug_printf("\t\t.y = %u\n", (*cmd).y);
+ _debug_printf("\t\t.z = %u\n", (*cmd).z);
+ _debug_printf("\t\t.w = %u\n", (*cmd).w);
+ _debug_printf("\t\t.h = %u\n", (*cmd).h);
+ _debug_printf("\t\t.d = %u\n", (*cmd).d);
+ _debug_printf("\t\t.srcx = %u\n", (*cmd).srcx);
+ _debug_printf("\t\t.srcy = %u\n", (*cmd).srcy);
+ _debug_printf("\t\t.srcz = %u\n", (*cmd).srcz);
}
static void
dump_SVGA3dCmdSetClipPlane(const SVGA3dCmdSetClipPlane *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.index = %u\n", (*cmd).index);
- debug_printf("\t\t.plane[0] = %f\n", (*cmd).plane[0]);
- debug_printf("\t\t.plane[1] = %f\n", (*cmd).plane[1]);
- debug_printf("\t\t.plane[2] = %f\n", (*cmd).plane[2]);
- debug_printf("\t\t.plane[3] = %f\n", (*cmd).plane[3]);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.index = %u\n", (*cmd).index);
+ _debug_printf("\t\t.plane[0] = %f\n", (*cmd).plane[0]);
+ _debug_printf("\t\t.plane[1] = %f\n", (*cmd).plane[1]);
+ _debug_printf("\t\t.plane[2] = %f\n", (*cmd).plane[2]);
+ _debug_printf("\t\t.plane[3] = %f\n", (*cmd).plane[3]);
}
static void
dump_SVGA3dCmdWaitForQuery(const SVGA3dCmdWaitForQuery *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_QUERYTYPE_OCCLUSION:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
break;
case SVGA3D_QUERYTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
- debug_printf("\t\t.guestResult.gmrId = %u\n", (*cmd).guestResult.gmrId);
- debug_printf("\t\t.guestResult.offset = %u\n", (*cmd).guestResult.offset);
+ _debug_printf("\t\t.guestResult.gmrId = %u\n", (*cmd).guestResult.gmrId);
+ _debug_printf("\t\t.guestResult.offset = %u\n", (*cmd).guestResult.offset);
}
static void
dump_SVGA3dCmdSetRenderTarget(const SVGA3dCmdSetRenderTarget *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_RT_DEPTH:
- debug_printf("\t\t.type = SVGA3D_RT_DEPTH\n");
+ _debug_printf("\t\t.type = SVGA3D_RT_DEPTH\n");
break;
case SVGA3D_RT_STENCIL:
- debug_printf("\t\t.type = SVGA3D_RT_STENCIL\n");
+ _debug_printf("\t\t.type = SVGA3D_RT_STENCIL\n");
break;
default:
- debug_printf("\t\t.type = SVGA3D_RT_COLOR%u\n", (*cmd).type - SVGA3D_RT_COLOR0);
+ _debug_printf("\t\t.type = SVGA3D_RT_COLOR%u\n", (*cmd).type - SVGA3D_RT_COLOR0);
break;
}
- debug_printf("\t\t.target.sid = %u\n", (*cmd).target.sid);
- debug_printf("\t\t.target.face = %u\n", (*cmd).target.face);
- debug_printf("\t\t.target.mipmap = %u\n", (*cmd).target.mipmap);
+ _debug_printf("\t\t.target.sid = %u\n", (*cmd).target.sid);
+ _debug_printf("\t\t.target.face = %u\n", (*cmd).target.face);
+ _debug_printf("\t\t.target.mipmap = %u\n", (*cmd).target.mipmap);
}
static void
dump_SVGA3dCmdSetTextureState(const SVGA3dCmdSetTextureState *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
}
static void
dump_SVGA3dCmdSurfaceCopy(const SVGA3dCmdSurfaceCopy *cmd)
{
- debug_printf("\t\t.src.sid = %u\n", (*cmd).src.sid);
- debug_printf("\t\t.src.face = %u\n", (*cmd).src.face);
- debug_printf("\t\t.src.mipmap = %u\n", (*cmd).src.mipmap);
- debug_printf("\t\t.dest.sid = %u\n", (*cmd).dest.sid);
- debug_printf("\t\t.dest.face = %u\n", (*cmd).dest.face);
- debug_printf("\t\t.dest.mipmap = %u\n", (*cmd).dest.mipmap);
+ _debug_printf("\t\t.src.sid = %u\n", (*cmd).src.sid);
+ _debug_printf("\t\t.src.face = %u\n", (*cmd).src.face);
+ _debug_printf("\t\t.src.mipmap = %u\n", (*cmd).src.mipmap);
+ _debug_printf("\t\t.dest.sid = %u\n", (*cmd).dest.sid);
+ _debug_printf("\t\t.dest.face = %u\n", (*cmd).dest.face);
+ _debug_printf("\t\t.dest.mipmap = %u\n", (*cmd).dest.mipmap);
}
static void
dump_SVGA3dCmdSetMaterial(const SVGA3dCmdSetMaterial *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).face) {
case SVGA3D_FACE_INVALID:
- debug_printf("\t\t.face = SVGA3D_FACE_INVALID\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_INVALID\n");
break;
case SVGA3D_FACE_NONE:
- debug_printf("\t\t.face = SVGA3D_FACE_NONE\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_NONE\n");
break;
case SVGA3D_FACE_FRONT:
- debug_printf("\t\t.face = SVGA3D_FACE_FRONT\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_FRONT\n");
break;
case SVGA3D_FACE_BACK:
- debug_printf("\t\t.face = SVGA3D_FACE_BACK\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_BACK\n");
break;
case SVGA3D_FACE_FRONT_BACK:
- debug_printf("\t\t.face = SVGA3D_FACE_FRONT_BACK\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_FRONT_BACK\n");
break;
case SVGA3D_FACE_MAX:
- debug_printf("\t\t.face = SVGA3D_FACE_MAX\n");
+ _debug_printf("\t\t.face = SVGA3D_FACE_MAX\n");
break;
default:
- debug_printf("\t\t.face = %i\n", (*cmd).face);
+ _debug_printf("\t\t.face = %i\n", (*cmd).face);
break;
}
- debug_printf("\t\t.material.diffuse[0] = %f\n", (*cmd).material.diffuse[0]);
- debug_printf("\t\t.material.diffuse[1] = %f\n", (*cmd).material.diffuse[1]);
- debug_printf("\t\t.material.diffuse[2] = %f\n", (*cmd).material.diffuse[2]);
- debug_printf("\t\t.material.diffuse[3] = %f\n", (*cmd).material.diffuse[3]);
- debug_printf("\t\t.material.ambient[0] = %f\n", (*cmd).material.ambient[0]);
- debug_printf("\t\t.material.ambient[1] = %f\n", (*cmd).material.ambient[1]);
- debug_printf("\t\t.material.ambient[2] = %f\n", (*cmd).material.ambient[2]);
- debug_printf("\t\t.material.ambient[3] = %f\n", (*cmd).material.ambient[3]);
- debug_printf("\t\t.material.specular[0] = %f\n", (*cmd).material.specular[0]);
- debug_printf("\t\t.material.specular[1] = %f\n", (*cmd).material.specular[1]);
- debug_printf("\t\t.material.specular[2] = %f\n", (*cmd).material.specular[2]);
- debug_printf("\t\t.material.specular[3] = %f\n", (*cmd).material.specular[3]);
- debug_printf("\t\t.material.emissive[0] = %f\n", (*cmd).material.emissive[0]);
- debug_printf("\t\t.material.emissive[1] = %f\n", (*cmd).material.emissive[1]);
- debug_printf("\t\t.material.emissive[2] = %f\n", (*cmd).material.emissive[2]);
- debug_printf("\t\t.material.emissive[3] = %f\n", (*cmd).material.emissive[3]);
- debug_printf("\t\t.material.shininess = %f\n", (*cmd).material.shininess);
+ _debug_printf("\t\t.material.diffuse[0] = %f\n", (*cmd).material.diffuse[0]);
+ _debug_printf("\t\t.material.diffuse[1] = %f\n", (*cmd).material.diffuse[1]);
+ _debug_printf("\t\t.material.diffuse[2] = %f\n", (*cmd).material.diffuse[2]);
+ _debug_printf("\t\t.material.diffuse[3] = %f\n", (*cmd).material.diffuse[3]);
+ _debug_printf("\t\t.material.ambient[0] = %f\n", (*cmd).material.ambient[0]);
+ _debug_printf("\t\t.material.ambient[1] = %f\n", (*cmd).material.ambient[1]);
+ _debug_printf("\t\t.material.ambient[2] = %f\n", (*cmd).material.ambient[2]);
+ _debug_printf("\t\t.material.ambient[3] = %f\n", (*cmd).material.ambient[3]);
+ _debug_printf("\t\t.material.specular[0] = %f\n", (*cmd).material.specular[0]);
+ _debug_printf("\t\t.material.specular[1] = %f\n", (*cmd).material.specular[1]);
+ _debug_printf("\t\t.material.specular[2] = %f\n", (*cmd).material.specular[2]);
+ _debug_printf("\t\t.material.specular[3] = %f\n", (*cmd).material.specular[3]);
+ _debug_printf("\t\t.material.emissive[0] = %f\n", (*cmd).material.emissive[0]);
+ _debug_printf("\t\t.material.emissive[1] = %f\n", (*cmd).material.emissive[1]);
+ _debug_printf("\t\t.material.emissive[2] = %f\n", (*cmd).material.emissive[2]);
+ _debug_printf("\t\t.material.emissive[3] = %f\n", (*cmd).material.emissive[3]);
+ _debug_printf("\t\t.material.shininess = %f\n", (*cmd).material.shininess);
}
static void
dump_SVGA3dCmdSetLightData(const SVGA3dCmdSetLightData *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.index = %u\n", (*cmd).index);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.index = %u\n", (*cmd).index);
switch((*cmd).data.type) {
case SVGA3D_LIGHTTYPE_INVALID:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_INVALID\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_INVALID\n");
break;
case SVGA3D_LIGHTTYPE_POINT:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_POINT\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_POINT\n");
break;
case SVGA3D_LIGHTTYPE_SPOT1:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT1\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT1\n");
break;
case SVGA3D_LIGHTTYPE_SPOT2:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT2\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_SPOT2\n");
break;
case SVGA3D_LIGHTTYPE_DIRECTIONAL:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_DIRECTIONAL\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_DIRECTIONAL\n");
break;
case SVGA3D_LIGHTTYPE_MAX:
- debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_MAX\n");
+ _debug_printf("\t\t.data.type = SVGA3D_LIGHTTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.data.type = %i\n", (*cmd).data.type);
+ _debug_printf("\t\t.data.type = %i\n", (*cmd).data.type);
break;
}
- debug_printf("\t\t.data.inWorldSpace = %u\n", (*cmd).data.inWorldSpace);
- debug_printf("\t\t.data.diffuse[0] = %f\n", (*cmd).data.diffuse[0]);
- debug_printf("\t\t.data.diffuse[1] = %f\n", (*cmd).data.diffuse[1]);
- debug_printf("\t\t.data.diffuse[2] = %f\n", (*cmd).data.diffuse[2]);
- debug_printf("\t\t.data.diffuse[3] = %f\n", (*cmd).data.diffuse[3]);
- debug_printf("\t\t.data.specular[0] = %f\n", (*cmd).data.specular[0]);
- debug_printf("\t\t.data.specular[1] = %f\n", (*cmd).data.specular[1]);
- debug_printf("\t\t.data.specular[2] = %f\n", (*cmd).data.specular[2]);
- debug_printf("\t\t.data.specular[3] = %f\n", (*cmd).data.specular[3]);
- debug_printf("\t\t.data.ambient[0] = %f\n", (*cmd).data.ambient[0]);
- debug_printf("\t\t.data.ambient[1] = %f\n", (*cmd).data.ambient[1]);
- debug_printf("\t\t.data.ambient[2] = %f\n", (*cmd).data.ambient[2]);
- debug_printf("\t\t.data.ambient[3] = %f\n", (*cmd).data.ambient[3]);
- debug_printf("\t\t.data.position[0] = %f\n", (*cmd).data.position[0]);
- debug_printf("\t\t.data.position[1] = %f\n", (*cmd).data.position[1]);
- debug_printf("\t\t.data.position[2] = %f\n", (*cmd).data.position[2]);
- debug_printf("\t\t.data.position[3] = %f\n", (*cmd).data.position[3]);
- debug_printf("\t\t.data.direction[0] = %f\n", (*cmd).data.direction[0]);
- debug_printf("\t\t.data.direction[1] = %f\n", (*cmd).data.direction[1]);
- debug_printf("\t\t.data.direction[2] = %f\n", (*cmd).data.direction[2]);
- debug_printf("\t\t.data.direction[3] = %f\n", (*cmd).data.direction[3]);
- debug_printf("\t\t.data.range = %f\n", (*cmd).data.range);
- debug_printf("\t\t.data.falloff = %f\n", (*cmd).data.falloff);
- debug_printf("\t\t.data.attenuation0 = %f\n", (*cmd).data.attenuation0);
- debug_printf("\t\t.data.attenuation1 = %f\n", (*cmd).data.attenuation1);
- debug_printf("\t\t.data.attenuation2 = %f\n", (*cmd).data.attenuation2);
- debug_printf("\t\t.data.theta = %f\n", (*cmd).data.theta);
- debug_printf("\t\t.data.phi = %f\n", (*cmd).data.phi);
+ _debug_printf("\t\t.data.inWorldSpace = %u\n", (*cmd).data.inWorldSpace);
+ _debug_printf("\t\t.data.diffuse[0] = %f\n", (*cmd).data.diffuse[0]);
+ _debug_printf("\t\t.data.diffuse[1] = %f\n", (*cmd).data.diffuse[1]);
+ _debug_printf("\t\t.data.diffuse[2] = %f\n", (*cmd).data.diffuse[2]);
+ _debug_printf("\t\t.data.diffuse[3] = %f\n", (*cmd).data.diffuse[3]);
+ _debug_printf("\t\t.data.specular[0] = %f\n", (*cmd).data.specular[0]);
+ _debug_printf("\t\t.data.specular[1] = %f\n", (*cmd).data.specular[1]);
+ _debug_printf("\t\t.data.specular[2] = %f\n", (*cmd).data.specular[2]);
+ _debug_printf("\t\t.data.specular[3] = %f\n", (*cmd).data.specular[3]);
+ _debug_printf("\t\t.data.ambient[0] = %f\n", (*cmd).data.ambient[0]);
+ _debug_printf("\t\t.data.ambient[1] = %f\n", (*cmd).data.ambient[1]);
+ _debug_printf("\t\t.data.ambient[2] = %f\n", (*cmd).data.ambient[2]);
+ _debug_printf("\t\t.data.ambient[3] = %f\n", (*cmd).data.ambient[3]);
+ _debug_printf("\t\t.data.position[0] = %f\n", (*cmd).data.position[0]);
+ _debug_printf("\t\t.data.position[1] = %f\n", (*cmd).data.position[1]);
+ _debug_printf("\t\t.data.position[2] = %f\n", (*cmd).data.position[2]);
+ _debug_printf("\t\t.data.position[3] = %f\n", (*cmd).data.position[3]);
+ _debug_printf("\t\t.data.direction[0] = %f\n", (*cmd).data.direction[0]);
+ _debug_printf("\t\t.data.direction[1] = %f\n", (*cmd).data.direction[1]);
+ _debug_printf("\t\t.data.direction[2] = %f\n", (*cmd).data.direction[2]);
+ _debug_printf("\t\t.data.direction[3] = %f\n", (*cmd).data.direction[3]);
+ _debug_printf("\t\t.data.range = %f\n", (*cmd).data.range);
+ _debug_printf("\t\t.data.falloff = %f\n", (*cmd).data.falloff);
+ _debug_printf("\t\t.data.attenuation0 = %f\n", (*cmd).data.attenuation0);
+ _debug_printf("\t\t.data.attenuation1 = %f\n", (*cmd).data.attenuation1);
+ _debug_printf("\t\t.data.attenuation2 = %f\n", (*cmd).data.attenuation2);
+ _debug_printf("\t\t.data.theta = %f\n", (*cmd).data.theta);
+ _debug_printf("\t\t.data.phi = %f\n", (*cmd).data.phi);
}
static void
dump_SVGA3dCmdSetViewport(const SVGA3dCmdSetViewport *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.rect.x = %u\n", (*cmd).rect.x);
- debug_printf("\t\t.rect.y = %u\n", (*cmd).rect.y);
- debug_printf("\t\t.rect.w = %u\n", (*cmd).rect.w);
- debug_printf("\t\t.rect.h = %u\n", (*cmd).rect.h);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.rect.x = %u\n", (*cmd).rect.x);
+ _debug_printf("\t\t.rect.y = %u\n", (*cmd).rect.y);
+ _debug_printf("\t\t.rect.w = %u\n", (*cmd).rect.w);
+ _debug_printf("\t\t.rect.h = %u\n", (*cmd).rect.h);
}
static void
dump_SVGA3dCmdSetScissorRect(const SVGA3dCmdSetScissorRect *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.rect.x = %u\n", (*cmd).rect.x);
- debug_printf("\t\t.rect.y = %u\n", (*cmd).rect.y);
- debug_printf("\t\t.rect.w = %u\n", (*cmd).rect.w);
- debug_printf("\t\t.rect.h = %u\n", (*cmd).rect.h);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.rect.x = %u\n", (*cmd).rect.x);
+ _debug_printf("\t\t.rect.y = %u\n", (*cmd).rect.y);
+ _debug_printf("\t\t.rect.w = %u\n", (*cmd).rect.w);
+ _debug_printf("\t\t.rect.h = %u\n", (*cmd).rect.h);
}
static void
dump_SVGA3dCopyRect(const SVGA3dCopyRect *cmd)
{
- debug_printf("\t\t.x = %u\n", (*cmd).x);
- debug_printf("\t\t.y = %u\n", (*cmd).y);
- debug_printf("\t\t.w = %u\n", (*cmd).w);
- debug_printf("\t\t.h = %u\n", (*cmd).h);
- debug_printf("\t\t.srcx = %u\n", (*cmd).srcx);
- debug_printf("\t\t.srcy = %u\n", (*cmd).srcy);
+ _debug_printf("\t\t.x = %u\n", (*cmd).x);
+ _debug_printf("\t\t.y = %u\n", (*cmd).y);
+ _debug_printf("\t\t.w = %u\n", (*cmd).w);
+ _debug_printf("\t\t.h = %u\n", (*cmd).h);
+ _debug_printf("\t\t.srcx = %u\n", (*cmd).srcx);
+ _debug_printf("\t\t.srcy = %u\n", (*cmd).srcy);
}
static void
dump_SVGA3dCmdSetShader(const SVGA3dCmdSetShader *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_SHADERTYPE_COMPILED_DX8:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
break;
case SVGA3D_SHADERTYPE_VS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
break;
case SVGA3D_SHADERTYPE_PS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
break;
case SVGA3D_SHADERTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
- debug_printf("\t\t.shid = %u\n", (*cmd).shid);
+ _debug_printf("\t\t.shid = %u\n", (*cmd).shid);
}
static void
dump_SVGA3dCmdEndQuery(const SVGA3dCmdEndQuery *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_QUERYTYPE_OCCLUSION:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
break;
case SVGA3D_QUERYTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
- debug_printf("\t\t.guestResult.gmrId = %u\n", (*cmd).guestResult.gmrId);
- debug_printf("\t\t.guestResult.offset = %u\n", (*cmd).guestResult.offset);
+ _debug_printf("\t\t.guestResult.gmrId = %u\n", (*cmd).guestResult.gmrId);
+ _debug_printf("\t\t.guestResult.offset = %u\n", (*cmd).guestResult.offset);
}
static void
dump_SVGA3dSize(const SVGA3dSize *cmd)
{
- debug_printf("\t\t.width = %u\n", (*cmd).width);
- debug_printf("\t\t.height = %u\n", (*cmd).height);
- debug_printf("\t\t.depth = %u\n", (*cmd).depth);
+ _debug_printf("\t\t.width = %u\n", (*cmd).width);
+ _debug_printf("\t\t.height = %u\n", (*cmd).height);
+ _debug_printf("\t\t.depth = %u\n", (*cmd).depth);
}
static void
dump_SVGA3dCmdDestroySurface(const SVGA3dCmdDestroySurface *cmd)
{
- debug_printf("\t\t.sid = %u\n", (*cmd).sid);
+ _debug_printf("\t\t.sid = %u\n", (*cmd).sid);
}
static void
dump_SVGA3dCmdDefineContext(const SVGA3dCmdDefineContext *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
}
static void
dump_SVGA3dRect(const SVGA3dRect *cmd)
{
- debug_printf("\t\t.x = %u\n", (*cmd).x);
- debug_printf("\t\t.y = %u\n", (*cmd).y);
- debug_printf("\t\t.w = %u\n", (*cmd).w);
- debug_printf("\t\t.h = %u\n", (*cmd).h);
+ _debug_printf("\t\t.x = %u\n", (*cmd).x);
+ _debug_printf("\t\t.y = %u\n", (*cmd).y);
+ _debug_printf("\t\t.w = %u\n", (*cmd).w);
+ _debug_printf("\t\t.h = %u\n", (*cmd).h);
}
static void
dump_SVGA3dCmdBeginQuery(const SVGA3dCmdBeginQuery *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_QUERYTYPE_OCCLUSION:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_OCCLUSION\n");
break;
case SVGA3D_QUERYTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_QUERYTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
}
@@ -599,336 +599,336 @@ dump_SVGA3dRenderState(const SVGA3dRenderState *cmd)
{
switch((*cmd).state) {
case SVGA3D_RS_INVALID:
- debug_printf("\t\t.state = SVGA3D_RS_INVALID\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_INVALID\n");
break;
case SVGA3D_RS_ZENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_ZENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ZENABLE\n");
break;
case SVGA3D_RS_ZWRITEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_ZWRITEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ZWRITEENABLE\n");
break;
case SVGA3D_RS_ALPHATESTENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_ALPHATESTENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ALPHATESTENABLE\n");
break;
case SVGA3D_RS_DITHERENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_DITHERENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_DITHERENABLE\n");
break;
case SVGA3D_RS_BLENDENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_BLENDENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_BLENDENABLE\n");
break;
case SVGA3D_RS_FOGENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_FOGENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGENABLE\n");
break;
case SVGA3D_RS_SPECULARENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_SPECULARENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SPECULARENABLE\n");
break;
case SVGA3D_RS_STENCILENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE\n");
break;
case SVGA3D_RS_LIGHTINGENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_LIGHTINGENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_LIGHTINGENABLE\n");
break;
case SVGA3D_RS_NORMALIZENORMALS:
- debug_printf("\t\t.state = SVGA3D_RS_NORMALIZENORMALS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_NORMALIZENORMALS\n");
break;
case SVGA3D_RS_POINTSPRITEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSPRITEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSPRITEENABLE\n");
break;
case SVGA3D_RS_POINTSCALEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSCALEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSCALEENABLE\n");
break;
case SVGA3D_RS_STENCILREF:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILREF\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILREF\n");
break;
case SVGA3D_RS_STENCILMASK:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILMASK\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILMASK\n");
break;
case SVGA3D_RS_STENCILWRITEMASK:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILWRITEMASK\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILWRITEMASK\n");
break;
case SVGA3D_RS_FOGSTART:
- debug_printf("\t\t.state = SVGA3D_RS_FOGSTART\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGSTART\n");
break;
case SVGA3D_RS_FOGEND:
- debug_printf("\t\t.state = SVGA3D_RS_FOGEND\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGEND\n");
break;
case SVGA3D_RS_FOGDENSITY:
- debug_printf("\t\t.state = SVGA3D_RS_FOGDENSITY\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGDENSITY\n");
break;
case SVGA3D_RS_POINTSIZE:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSIZE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSIZE\n");
break;
case SVGA3D_RS_POINTSIZEMIN:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMIN\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMIN\n");
break;
case SVGA3D_RS_POINTSIZEMAX:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMAX\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSIZEMAX\n");
break;
case SVGA3D_RS_POINTSCALE_A:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_A\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_A\n");
break;
case SVGA3D_RS_POINTSCALE_B:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_B\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_B\n");
break;
case SVGA3D_RS_POINTSCALE_C:
- debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_C\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_POINTSCALE_C\n");
break;
case SVGA3D_RS_FOGCOLOR:
- debug_printf("\t\t.state = SVGA3D_RS_FOGCOLOR\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGCOLOR\n");
break;
case SVGA3D_RS_AMBIENT:
- debug_printf("\t\t.state = SVGA3D_RS_AMBIENT\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_AMBIENT\n");
break;
case SVGA3D_RS_CLIPPLANEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_CLIPPLANEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CLIPPLANEENABLE\n");
break;
case SVGA3D_RS_FOGMODE:
- debug_printf("\t\t.state = SVGA3D_RS_FOGMODE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FOGMODE\n");
break;
case SVGA3D_RS_FILLMODE:
- debug_printf("\t\t.state = SVGA3D_RS_FILLMODE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FILLMODE\n");
break;
case SVGA3D_RS_SHADEMODE:
- debug_printf("\t\t.state = SVGA3D_RS_SHADEMODE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SHADEMODE\n");
break;
case SVGA3D_RS_LINEPATTERN:
- debug_printf("\t\t.state = SVGA3D_RS_LINEPATTERN\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_LINEPATTERN\n");
break;
case SVGA3D_RS_SRCBLEND:
- debug_printf("\t\t.state = SVGA3D_RS_SRCBLEND\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SRCBLEND\n");
break;
case SVGA3D_RS_DSTBLEND:
- debug_printf("\t\t.state = SVGA3D_RS_DSTBLEND\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_DSTBLEND\n");
break;
case SVGA3D_RS_BLENDEQUATION:
- debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATION\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATION\n");
break;
case SVGA3D_RS_CULLMODE:
- debug_printf("\t\t.state = SVGA3D_RS_CULLMODE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CULLMODE\n");
break;
case SVGA3D_RS_ZFUNC:
- debug_printf("\t\t.state = SVGA3D_RS_ZFUNC\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ZFUNC\n");
break;
case SVGA3D_RS_ALPHAFUNC:
- debug_printf("\t\t.state = SVGA3D_RS_ALPHAFUNC\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ALPHAFUNC\n");
break;
case SVGA3D_RS_STENCILFUNC:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILFUNC\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILFUNC\n");
break;
case SVGA3D_RS_STENCILFAIL:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILFAIL\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILFAIL\n");
break;
case SVGA3D_RS_STENCILZFAIL:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILZFAIL\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILZFAIL\n");
break;
case SVGA3D_RS_STENCILPASS:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILPASS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILPASS\n");
break;
case SVGA3D_RS_ALPHAREF:
- debug_printf("\t\t.state = SVGA3D_RS_ALPHAREF\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ALPHAREF\n");
break;
case SVGA3D_RS_FRONTWINDING:
- debug_printf("\t\t.state = SVGA3D_RS_FRONTWINDING\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_FRONTWINDING\n");
break;
case SVGA3D_RS_COORDINATETYPE:
- debug_printf("\t\t.state = SVGA3D_RS_COORDINATETYPE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_COORDINATETYPE\n");
break;
case SVGA3D_RS_ZBIAS:
- debug_printf("\t\t.state = SVGA3D_RS_ZBIAS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ZBIAS\n");
break;
case SVGA3D_RS_RANGEFOGENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_RANGEFOGENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_RANGEFOGENABLE\n");
break;
case SVGA3D_RS_COLORWRITEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE\n");
break;
case SVGA3D_RS_VERTEXMATERIALENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_VERTEXMATERIALENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_VERTEXMATERIALENABLE\n");
break;
case SVGA3D_RS_DIFFUSEMATERIALSOURCE:
- debug_printf("\t\t.state = SVGA3D_RS_DIFFUSEMATERIALSOURCE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_DIFFUSEMATERIALSOURCE\n");
break;
case SVGA3D_RS_SPECULARMATERIALSOURCE:
- debug_printf("\t\t.state = SVGA3D_RS_SPECULARMATERIALSOURCE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SPECULARMATERIALSOURCE\n");
break;
case SVGA3D_RS_AMBIENTMATERIALSOURCE:
- debug_printf("\t\t.state = SVGA3D_RS_AMBIENTMATERIALSOURCE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_AMBIENTMATERIALSOURCE\n");
break;
case SVGA3D_RS_EMISSIVEMATERIALSOURCE:
- debug_printf("\t\t.state = SVGA3D_RS_EMISSIVEMATERIALSOURCE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_EMISSIVEMATERIALSOURCE\n");
break;
case SVGA3D_RS_TEXTUREFACTOR:
- debug_printf("\t\t.state = SVGA3D_RS_TEXTUREFACTOR\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_TEXTUREFACTOR\n");
break;
case SVGA3D_RS_LOCALVIEWER:
- debug_printf("\t\t.state = SVGA3D_RS_LOCALVIEWER\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_LOCALVIEWER\n");
break;
case SVGA3D_RS_SCISSORTESTENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_SCISSORTESTENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SCISSORTESTENABLE\n");
break;
case SVGA3D_RS_BLENDCOLOR:
- debug_printf("\t\t.state = SVGA3D_RS_BLENDCOLOR\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_BLENDCOLOR\n");
break;
case SVGA3D_RS_STENCILENABLE2SIDED:
- debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE2SIDED\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_STENCILENABLE2SIDED\n");
break;
case SVGA3D_RS_CCWSTENCILFUNC:
- debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFUNC\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFUNC\n");
break;
case SVGA3D_RS_CCWSTENCILFAIL:
- debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFAIL\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILFAIL\n");
break;
case SVGA3D_RS_CCWSTENCILZFAIL:
- debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILZFAIL\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILZFAIL\n");
break;
case SVGA3D_RS_CCWSTENCILPASS:
- debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILPASS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CCWSTENCILPASS\n");
break;
case SVGA3D_RS_VERTEXBLEND:
- debug_printf("\t\t.state = SVGA3D_RS_VERTEXBLEND\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_VERTEXBLEND\n");
break;
case SVGA3D_RS_SLOPESCALEDEPTHBIAS:
- debug_printf("\t\t.state = SVGA3D_RS_SLOPESCALEDEPTHBIAS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SLOPESCALEDEPTHBIAS\n");
break;
case SVGA3D_RS_DEPTHBIAS:
- debug_printf("\t\t.state = SVGA3D_RS_DEPTHBIAS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_DEPTHBIAS\n");
break;
case SVGA3D_RS_OUTPUTGAMMA:
- debug_printf("\t\t.state = SVGA3D_RS_OUTPUTGAMMA\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_OUTPUTGAMMA\n");
break;
case SVGA3D_RS_ZVISIBLE:
- debug_printf("\t\t.state = SVGA3D_RS_ZVISIBLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ZVISIBLE\n");
break;
case SVGA3D_RS_LASTPIXEL:
- debug_printf("\t\t.state = SVGA3D_RS_LASTPIXEL\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_LASTPIXEL\n");
break;
case SVGA3D_RS_CLIPPING:
- debug_printf("\t\t.state = SVGA3D_RS_CLIPPING\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_CLIPPING\n");
break;
case SVGA3D_RS_WRAP0:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP0\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP0\n");
break;
case SVGA3D_RS_WRAP1:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP1\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP1\n");
break;
case SVGA3D_RS_WRAP2:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP2\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP2\n");
break;
case SVGA3D_RS_WRAP3:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP3\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP3\n");
break;
case SVGA3D_RS_WRAP4:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP4\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP4\n");
break;
case SVGA3D_RS_WRAP5:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP5\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP5\n");
break;
case SVGA3D_RS_WRAP6:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP6\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP6\n");
break;
case SVGA3D_RS_WRAP7:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP7\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP7\n");
break;
case SVGA3D_RS_WRAP8:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP8\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP8\n");
break;
case SVGA3D_RS_WRAP9:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP9\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP9\n");
break;
case SVGA3D_RS_WRAP10:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP10\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP10\n");
break;
case SVGA3D_RS_WRAP11:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP11\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP11\n");
break;
case SVGA3D_RS_WRAP12:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP12\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP12\n");
break;
case SVGA3D_RS_WRAP13:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP13\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP13\n");
break;
case SVGA3D_RS_WRAP14:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP14\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP14\n");
break;
case SVGA3D_RS_WRAP15:
- debug_printf("\t\t.state = SVGA3D_RS_WRAP15\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_WRAP15\n");
break;
case SVGA3D_RS_MULTISAMPLEANTIALIAS:
- debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEANTIALIAS\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEANTIALIAS\n");
break;
case SVGA3D_RS_MULTISAMPLEMASK:
- debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEMASK\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_MULTISAMPLEMASK\n");
break;
case SVGA3D_RS_INDEXEDVERTEXBLENDENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_INDEXEDVERTEXBLENDENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_INDEXEDVERTEXBLENDENABLE\n");
break;
case SVGA3D_RS_TWEENFACTOR:
- debug_printf("\t\t.state = SVGA3D_RS_TWEENFACTOR\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_TWEENFACTOR\n");
break;
case SVGA3D_RS_ANTIALIASEDLINEENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_ANTIALIASEDLINEENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_ANTIALIASEDLINEENABLE\n");
break;
case SVGA3D_RS_COLORWRITEENABLE1:
- debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE1\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE1\n");
break;
case SVGA3D_RS_COLORWRITEENABLE2:
- debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE2\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE2\n");
break;
case SVGA3D_RS_COLORWRITEENABLE3:
- debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE3\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_COLORWRITEENABLE3\n");
break;
case SVGA3D_RS_SEPARATEALPHABLENDENABLE:
- debug_printf("\t\t.state = SVGA3D_RS_SEPARATEALPHABLENDENABLE\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SEPARATEALPHABLENDENABLE\n");
break;
case SVGA3D_RS_SRCBLENDALPHA:
- debug_printf("\t\t.state = SVGA3D_RS_SRCBLENDALPHA\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_SRCBLENDALPHA\n");
break;
case SVGA3D_RS_DSTBLENDALPHA:
- debug_printf("\t\t.state = SVGA3D_RS_DSTBLENDALPHA\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_DSTBLENDALPHA\n");
break;
case SVGA3D_RS_BLENDEQUATIONALPHA:
- debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATIONALPHA\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_BLENDEQUATIONALPHA\n");
break;
case SVGA3D_RS_MAX:
- debug_printf("\t\t.state = SVGA3D_RS_MAX\n");
+ _debug_printf("\t\t.state = SVGA3D_RS_MAX\n");
break;
default:
- debug_printf("\t\t.state = %i\n", (*cmd).state);
+ _debug_printf("\t\t.state = %i\n", (*cmd).state);
break;
}
- debug_printf("\t\t.uintValue = %u\n", (*cmd).uintValue);
- debug_printf("\t\t.floatValue = %f\n", (*cmd).floatValue);
+ _debug_printf("\t\t.uintValue = %u\n", (*cmd).uintValue);
+ _debug_printf("\t\t.floatValue = %f\n", (*cmd).floatValue);
}
static void
dump_SVGA3dVertexDivisor(const SVGA3dVertexDivisor *cmd)
{
- debug_printf("\t\t.value = %u\n", (*cmd).value);
- debug_printf("\t\t.count = %u\n", (*cmd).count);
- debug_printf("\t\t.indexedData = %u\n", (*cmd).indexedData);
- debug_printf("\t\t.instanceData = %u\n", (*cmd).instanceData);
+ _debug_printf("\t\t.value = %u\n", (*cmd).value);
+ _debug_printf("\t\t.count = %u\n", (*cmd).count);
+ _debug_printf("\t\t.indexedData = %u\n", (*cmd).indexedData);
+ _debug_printf("\t\t.instanceData = %u\n", (*cmd).instanceData);
}
static void
dump_SVGA3dCmdDefineShader(const SVGA3dCmdDefineShader *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.shid = %u\n", (*cmd).shid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.shid = %u\n", (*cmd).shid);
switch((*cmd).type) {
case SVGA3D_SHADERTYPE_COMPILED_DX8:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
break;
case SVGA3D_SHADERTYPE_VS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
break;
case SVGA3D_SHADERTYPE_PS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
break;
case SVGA3D_SHADERTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
}
@@ -936,53 +936,53 @@ dump_SVGA3dCmdDefineShader(const SVGA3dCmdDefineShader *cmd)
static void
dump_SVGA3dCmdSetShaderConst(const SVGA3dCmdSetShaderConst *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.reg = %u\n", (*cmd).reg);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.reg = %u\n", (*cmd).reg);
switch((*cmd).type) {
case SVGA3D_SHADERTYPE_COMPILED_DX8:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
break;
case SVGA3D_SHADERTYPE_VS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
break;
case SVGA3D_SHADERTYPE_PS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
break;
case SVGA3D_SHADERTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
switch((*cmd).ctype) {
case SVGA3D_CONST_TYPE_FLOAT:
- debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_FLOAT\n");
- debug_printf("\t\t.values[0] = %f\n", *(const float *)&(*cmd).values[0]);
- debug_printf("\t\t.values[1] = %f\n", *(const float *)&(*cmd).values[1]);
- debug_printf("\t\t.values[2] = %f\n", *(const float *)&(*cmd).values[2]);
- debug_printf("\t\t.values[3] = %f\n", *(const float *)&(*cmd).values[3]);
+ _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_FLOAT\n");
+ _debug_printf("\t\t.values[0] = %f\n", *(const float *)&(*cmd).values[0]);
+ _debug_printf("\t\t.values[1] = %f\n", *(const float *)&(*cmd).values[1]);
+ _debug_printf("\t\t.values[2] = %f\n", *(const float *)&(*cmd).values[2]);
+ _debug_printf("\t\t.values[3] = %f\n", *(const float *)&(*cmd).values[3]);
break;
case SVGA3D_CONST_TYPE_INT:
- debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_INT\n");
- debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
- debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
- debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
- debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
+ _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_INT\n");
+ _debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
+ _debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
+ _debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
+ _debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
break;
case SVGA3D_CONST_TYPE_BOOL:
- debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_BOOL\n");
- debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
- debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
- debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
- debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
+ _debug_printf("\t\t.ctype = SVGA3D_CONST_TYPE_BOOL\n");
+ _debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
+ _debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
+ _debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
+ _debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
break;
default:
- debug_printf("\t\t.ctype = %i\n", (*cmd).ctype);
- debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
- debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
- debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
- debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
+ _debug_printf("\t\t.ctype = %i\n", (*cmd).ctype);
+ _debug_printf("\t\t.values[0] = %u\n", (*cmd).values[0]);
+ _debug_printf("\t\t.values[1] = %u\n", (*cmd).values[1]);
+ _debug_printf("\t\t.values[2] = %u\n", (*cmd).values[2]);
+ _debug_printf("\t\t.values[3] = %u\n", (*cmd).values[3]);
break;
}
}
@@ -990,25 +990,25 @@ dump_SVGA3dCmdSetShaderConst(const SVGA3dCmdSetShaderConst *cmd)
static void
dump_SVGA3dCmdSetZRange(const SVGA3dCmdSetZRange *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.zRange.min = %f\n", (*cmd).zRange.min);
- debug_printf("\t\t.zRange.max = %f\n", (*cmd).zRange.max);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.zRange.min = %f\n", (*cmd).zRange.min);
+ _debug_printf("\t\t.zRange.max = %f\n", (*cmd).zRange.max);
}
static void
dump_SVGA3dCmdDrawPrimitives(const SVGA3dCmdDrawPrimitives *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.numVertexDecls = %u\n", (*cmd).numVertexDecls);
- debug_printf("\t\t.numRanges = %u\n", (*cmd).numRanges);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.numVertexDecls = %u\n", (*cmd).numVertexDecls);
+ _debug_printf("\t\t.numRanges = %u\n", (*cmd).numRanges);
}
static void
dump_SVGA3dCmdSetLightEnabled(const SVGA3dCmdSetLightEnabled *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.index = %u\n", (*cmd).index);
- debug_printf("\t\t.enabled = %u\n", (*cmd).enabled);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.index = %u\n", (*cmd).index);
+ _debug_printf("\t\t.enabled = %u\n", (*cmd).enabled);
}
static void
@@ -1016,86 +1016,86 @@ dump_SVGA3dPrimitiveRange(const SVGA3dPrimitiveRange *cmd)
{
switch((*cmd).primType) {
case SVGA3D_PRIMITIVE_INVALID:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_INVALID\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_INVALID\n");
break;
case SVGA3D_PRIMITIVE_TRIANGLELIST:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLELIST\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLELIST\n");
break;
case SVGA3D_PRIMITIVE_POINTLIST:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_POINTLIST\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_POINTLIST\n");
break;
case SVGA3D_PRIMITIVE_LINELIST:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINELIST\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINELIST\n");
break;
case SVGA3D_PRIMITIVE_LINESTRIP:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINESTRIP\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_LINESTRIP\n");
break;
case SVGA3D_PRIMITIVE_TRIANGLESTRIP:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLESTRIP\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLESTRIP\n");
break;
case SVGA3D_PRIMITIVE_TRIANGLEFAN:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLEFAN\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_TRIANGLEFAN\n");
break;
case SVGA3D_PRIMITIVE_MAX:
- debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_MAX\n");
+ _debug_printf("\t\t.primType = SVGA3D_PRIMITIVE_MAX\n");
break;
default:
- debug_printf("\t\t.primType = %i\n", (*cmd).primType);
+ _debug_printf("\t\t.primType = %i\n", (*cmd).primType);
break;
}
- debug_printf("\t\t.primitiveCount = %u\n", (*cmd).primitiveCount);
- debug_printf("\t\t.indexArray.surfaceId = %u\n", (*cmd).indexArray.surfaceId);
- debug_printf("\t\t.indexArray.offset = %u\n", (*cmd).indexArray.offset);
- debug_printf("\t\t.indexArray.stride = %u\n", (*cmd).indexArray.stride);
- debug_printf("\t\t.indexWidth = %u\n", (*cmd).indexWidth);
- debug_printf("\t\t.indexBias = %i\n", (*cmd).indexBias);
+ _debug_printf("\t\t.primitiveCount = %u\n", (*cmd).primitiveCount);
+ _debug_printf("\t\t.indexArray.surfaceId = %u\n", (*cmd).indexArray.surfaceId);
+ _debug_printf("\t\t.indexArray.offset = %u\n", (*cmd).indexArray.offset);
+ _debug_printf("\t\t.indexArray.stride = %u\n", (*cmd).indexArray.stride);
+ _debug_printf("\t\t.indexWidth = %u\n", (*cmd).indexWidth);
+ _debug_printf("\t\t.indexBias = %i\n", (*cmd).indexBias);
}
static void
dump_SVGA3dCmdPresent(const SVGA3dCmdPresent *cmd)
{
- debug_printf("\t\t.sid = %u\n", (*cmd).sid);
+ _debug_printf("\t\t.sid = %u\n", (*cmd).sid);
}
static void
dump_SVGA3dCmdSetRenderState(const SVGA3dCmdSetRenderState *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
}
static void
dump_SVGA3dCmdSurfaceStretchBlt(const SVGA3dCmdSurfaceStretchBlt *cmd)
{
- debug_printf("\t\t.src.sid = %u\n", (*cmd).src.sid);
- debug_printf("\t\t.src.face = %u\n", (*cmd).src.face);
- debug_printf("\t\t.src.mipmap = %u\n", (*cmd).src.mipmap);
- debug_printf("\t\t.dest.sid = %u\n", (*cmd).dest.sid);
- debug_printf("\t\t.dest.face = %u\n", (*cmd).dest.face);
- debug_printf("\t\t.dest.mipmap = %u\n", (*cmd).dest.mipmap);
- debug_printf("\t\t.boxSrc.x = %u\n", (*cmd).boxSrc.x);
- debug_printf("\t\t.boxSrc.y = %u\n", (*cmd).boxSrc.y);
- debug_printf("\t\t.boxSrc.z = %u\n", (*cmd).boxSrc.z);
- debug_printf("\t\t.boxSrc.w = %u\n", (*cmd).boxSrc.w);
- debug_printf("\t\t.boxSrc.h = %u\n", (*cmd).boxSrc.h);
- debug_printf("\t\t.boxSrc.d = %u\n", (*cmd).boxSrc.d);
- debug_printf("\t\t.boxDest.x = %u\n", (*cmd).boxDest.x);
- debug_printf("\t\t.boxDest.y = %u\n", (*cmd).boxDest.y);
- debug_printf("\t\t.boxDest.z = %u\n", (*cmd).boxDest.z);
- debug_printf("\t\t.boxDest.w = %u\n", (*cmd).boxDest.w);
- debug_printf("\t\t.boxDest.h = %u\n", (*cmd).boxDest.h);
- debug_printf("\t\t.boxDest.d = %u\n", (*cmd).boxDest.d);
+ _debug_printf("\t\t.src.sid = %u\n", (*cmd).src.sid);
+ _debug_printf("\t\t.src.face = %u\n", (*cmd).src.face);
+ _debug_printf("\t\t.src.mipmap = %u\n", (*cmd).src.mipmap);
+ _debug_printf("\t\t.dest.sid = %u\n", (*cmd).dest.sid);
+ _debug_printf("\t\t.dest.face = %u\n", (*cmd).dest.face);
+ _debug_printf("\t\t.dest.mipmap = %u\n", (*cmd).dest.mipmap);
+ _debug_printf("\t\t.boxSrc.x = %u\n", (*cmd).boxSrc.x);
+ _debug_printf("\t\t.boxSrc.y = %u\n", (*cmd).boxSrc.y);
+ _debug_printf("\t\t.boxSrc.z = %u\n", (*cmd).boxSrc.z);
+ _debug_printf("\t\t.boxSrc.w = %u\n", (*cmd).boxSrc.w);
+ _debug_printf("\t\t.boxSrc.h = %u\n", (*cmd).boxSrc.h);
+ _debug_printf("\t\t.boxSrc.d = %u\n", (*cmd).boxSrc.d);
+ _debug_printf("\t\t.boxDest.x = %u\n", (*cmd).boxDest.x);
+ _debug_printf("\t\t.boxDest.y = %u\n", (*cmd).boxDest.y);
+ _debug_printf("\t\t.boxDest.z = %u\n", (*cmd).boxDest.z);
+ _debug_printf("\t\t.boxDest.w = %u\n", (*cmd).boxDest.w);
+ _debug_printf("\t\t.boxDest.h = %u\n", (*cmd).boxDest.h);
+ _debug_printf("\t\t.boxDest.d = %u\n", (*cmd).boxDest.d);
switch((*cmd).mode) {
case SVGA3D_STRETCH_BLT_POINT:
- debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_POINT\n");
+ _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_POINT\n");
break;
case SVGA3D_STRETCH_BLT_LINEAR:
- debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_LINEAR\n");
+ _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_LINEAR\n");
break;
case SVGA3D_STRETCH_BLT_MAX:
- debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_MAX\n");
+ _debug_printf("\t\t.mode = SVGA3D_STRETCH_BLT_MAX\n");
break;
default:
- debug_printf("\t\t.mode = %i\n", (*cmd).mode);
+ _debug_printf("\t\t.mode = %i\n", (*cmd).mode);
break;
}
}
@@ -1103,21 +1103,21 @@ dump_SVGA3dCmdSurfaceStretchBlt(const SVGA3dCmdSurfaceStretchBlt *cmd)
static void
dump_SVGA3dCmdSurfaceDMA(const SVGA3dCmdSurfaceDMA *cmd)
{
- debug_printf("\t\t.guest.ptr.gmrId = %u\n", (*cmd).guest.ptr.gmrId);
- debug_printf("\t\t.guest.ptr.offset = %u\n", (*cmd).guest.ptr.offset);
- debug_printf("\t\t.guest.pitch = %u\n", (*cmd).guest.pitch);
- debug_printf("\t\t.host.sid = %u\n", (*cmd).host.sid);
- debug_printf("\t\t.host.face = %u\n", (*cmd).host.face);
- debug_printf("\t\t.host.mipmap = %u\n", (*cmd).host.mipmap);
+ _debug_printf("\t\t.guest.ptr.gmrId = %u\n", (*cmd).guest.ptr.gmrId);
+ _debug_printf("\t\t.guest.ptr.offset = %u\n", (*cmd).guest.ptr.offset);
+ _debug_printf("\t\t.guest.pitch = %u\n", (*cmd).guest.pitch);
+ _debug_printf("\t\t.host.sid = %u\n", (*cmd).host.sid);
+ _debug_printf("\t\t.host.face = %u\n", (*cmd).host.face);
+ _debug_printf("\t\t.host.mipmap = %u\n", (*cmd).host.mipmap);
switch((*cmd).transfer) {
case SVGA3D_WRITE_HOST_VRAM:
- debug_printf("\t\t.transfer = SVGA3D_WRITE_HOST_VRAM\n");
+ _debug_printf("\t\t.transfer = SVGA3D_WRITE_HOST_VRAM\n");
break;
case SVGA3D_READ_HOST_VRAM:
- debug_printf("\t\t.transfer = SVGA3D_READ_HOST_VRAM\n");
+ _debug_printf("\t\t.transfer = SVGA3D_READ_HOST_VRAM\n");
break;
default:
- debug_printf("\t\t.transfer = %i\n", (*cmd).transfer);
+ _debug_printf("\t\t.transfer = %i\n", (*cmd).transfer);
break;
}
}
@@ -1125,107 +1125,107 @@ dump_SVGA3dCmdSurfaceDMA(const SVGA3dCmdSurfaceDMA *cmd)
static void
dump_SVGA3dCmdSurfaceDMASuffix(const SVGA3dCmdSurfaceDMASuffix *cmd)
{
- debug_printf("\t\t.suffixSize = %u\n", (*cmd).suffixSize);
- debug_printf("\t\t.maximumOffset = %u\n", (*cmd).maximumOffset);
- debug_printf("\t\t.flags.discard = %u\n", (*cmd).flags.discard);
- debug_printf("\t\t.flags.unsynchronized = %u\n", (*cmd).flags.unsynchronized);
+ _debug_printf("\t\t.suffixSize = %u\n", (*cmd).suffixSize);
+ _debug_printf("\t\t.maximumOffset = %u\n", (*cmd).maximumOffset);
+ _debug_printf("\t\t.flags.discard = %u\n", (*cmd).flags.discard);
+ _debug_printf("\t\t.flags.unsynchronized = %u\n", (*cmd).flags.unsynchronized);
}
static void
dump_SVGA3dCmdSetTransform(const SVGA3dCmdSetTransform *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).type) {
case SVGA3D_TRANSFORM_INVALID:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_INVALID\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_INVALID\n");
break;
case SVGA3D_TRANSFORM_WORLD:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD\n");
break;
case SVGA3D_TRANSFORM_VIEW:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_VIEW\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_VIEW\n");
break;
case SVGA3D_TRANSFORM_PROJECTION:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_PROJECTION\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_PROJECTION\n");
break;
case SVGA3D_TRANSFORM_TEXTURE0:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE0\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE0\n");
break;
case SVGA3D_TRANSFORM_TEXTURE1:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE1\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE1\n");
break;
case SVGA3D_TRANSFORM_TEXTURE2:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE2\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE2\n");
break;
case SVGA3D_TRANSFORM_TEXTURE3:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE3\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE3\n");
break;
case SVGA3D_TRANSFORM_TEXTURE4:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE4\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE4\n");
break;
case SVGA3D_TRANSFORM_TEXTURE5:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE5\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE5\n");
break;
case SVGA3D_TRANSFORM_TEXTURE6:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE6\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE6\n");
break;
case SVGA3D_TRANSFORM_TEXTURE7:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE7\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_TEXTURE7\n");
break;
case SVGA3D_TRANSFORM_WORLD1:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD1\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD1\n");
break;
case SVGA3D_TRANSFORM_WORLD2:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD2\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD2\n");
break;
case SVGA3D_TRANSFORM_WORLD3:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD3\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_WORLD3\n");
break;
case SVGA3D_TRANSFORM_MAX:
- debug_printf("\t\t.type = SVGA3D_TRANSFORM_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_TRANSFORM_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
- debug_printf("\t\t.matrix[0] = %f\n", (*cmd).matrix[0]);
- debug_printf("\t\t.matrix[1] = %f\n", (*cmd).matrix[1]);
- debug_printf("\t\t.matrix[2] = %f\n", (*cmd).matrix[2]);
- debug_printf("\t\t.matrix[3] = %f\n", (*cmd).matrix[3]);
- debug_printf("\t\t.matrix[4] = %f\n", (*cmd).matrix[4]);
- debug_printf("\t\t.matrix[5] = %f\n", (*cmd).matrix[5]);
- debug_printf("\t\t.matrix[6] = %f\n", (*cmd).matrix[6]);
- debug_printf("\t\t.matrix[7] = %f\n", (*cmd).matrix[7]);
- debug_printf("\t\t.matrix[8] = %f\n", (*cmd).matrix[8]);
- debug_printf("\t\t.matrix[9] = %f\n", (*cmd).matrix[9]);
- debug_printf("\t\t.matrix[10] = %f\n", (*cmd).matrix[10]);
- debug_printf("\t\t.matrix[11] = %f\n", (*cmd).matrix[11]);
- debug_printf("\t\t.matrix[12] = %f\n", (*cmd).matrix[12]);
- debug_printf("\t\t.matrix[13] = %f\n", (*cmd).matrix[13]);
- debug_printf("\t\t.matrix[14] = %f\n", (*cmd).matrix[14]);
- debug_printf("\t\t.matrix[15] = %f\n", (*cmd).matrix[15]);
+ _debug_printf("\t\t.matrix[0] = %f\n", (*cmd).matrix[0]);
+ _debug_printf("\t\t.matrix[1] = %f\n", (*cmd).matrix[1]);
+ _debug_printf("\t\t.matrix[2] = %f\n", (*cmd).matrix[2]);
+ _debug_printf("\t\t.matrix[3] = %f\n", (*cmd).matrix[3]);
+ _debug_printf("\t\t.matrix[4] = %f\n", (*cmd).matrix[4]);
+ _debug_printf("\t\t.matrix[5] = %f\n", (*cmd).matrix[5]);
+ _debug_printf("\t\t.matrix[6] = %f\n", (*cmd).matrix[6]);
+ _debug_printf("\t\t.matrix[7] = %f\n", (*cmd).matrix[7]);
+ _debug_printf("\t\t.matrix[8] = %f\n", (*cmd).matrix[8]);
+ _debug_printf("\t\t.matrix[9] = %f\n", (*cmd).matrix[9]);
+ _debug_printf("\t\t.matrix[10] = %f\n", (*cmd).matrix[10]);
+ _debug_printf("\t\t.matrix[11] = %f\n", (*cmd).matrix[11]);
+ _debug_printf("\t\t.matrix[12] = %f\n", (*cmd).matrix[12]);
+ _debug_printf("\t\t.matrix[13] = %f\n", (*cmd).matrix[13]);
+ _debug_printf("\t\t.matrix[14] = %f\n", (*cmd).matrix[14]);
+ _debug_printf("\t\t.matrix[15] = %f\n", (*cmd).matrix[15]);
}
static void
dump_SVGA3dCmdDestroyShader(const SVGA3dCmdDestroyShader *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
- debug_printf("\t\t.shid = %u\n", (*cmd).shid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.shid = %u\n", (*cmd).shid);
switch((*cmd).type) {
case SVGA3D_SHADERTYPE_COMPILED_DX8:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_COMPILED_DX8\n");
break;
case SVGA3D_SHADERTYPE_VS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_VS\n");
break;
case SVGA3D_SHADERTYPE_PS:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_PS\n");
break;
case SVGA3D_SHADERTYPE_MAX:
- debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
+ _debug_printf("\t\t.type = SVGA3D_SHADERTYPE_MAX\n");
break;
default:
- debug_printf("\t\t.type = %i\n", (*cmd).type);
+ _debug_printf("\t\t.type = %i\n", (*cmd).type);
break;
}
}
@@ -1233,187 +1233,519 @@ dump_SVGA3dCmdDestroyShader(const SVGA3dCmdDestroyShader *cmd)
static void
dump_SVGA3dCmdDestroyContext(const SVGA3dCmdDestroyContext *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
}
static void
dump_SVGA3dCmdClear(const SVGA3dCmdClear *cmd)
{
- debug_printf("\t\t.cid = %u\n", (*cmd).cid);
+ _debug_printf("\t\t.cid = %u\n", (*cmd).cid);
switch((*cmd).clearFlag) {
case SVGA3D_CLEAR_COLOR:
- debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_COLOR\n");
+ _debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_COLOR\n");
break;
case SVGA3D_CLEAR_DEPTH:
- debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_DEPTH\n");
+ _debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_DEPTH\n");
break;
case SVGA3D_CLEAR_STENCIL:
- debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_STENCIL\n");
+ _debug_printf("\t\t.clearFlag = SVGA3D_CLEAR_STENCIL\n");
break;
default:
- debug_printf("\t\t.clearFlag = %i\n", (*cmd).clearFlag);
+ _debug_printf("\t\t.clearFlag = %i\n", (*cmd).clearFlag);
break;
}
- debug_printf("\t\t.color = %u\n", (*cmd).color);
- debug_printf("\t\t.depth = %f\n", (*cmd).depth);
- debug_printf("\t\t.stencil = %u\n", (*cmd).stencil);
+ _debug_printf("\t\t.color = %u\n", (*cmd).color);
+ _debug_printf("\t\t.depth = %f\n", (*cmd).depth);
+ _debug_printf("\t\t.stencil = %u\n", (*cmd).stencil);
}
static void
dump_SVGA3dCmdDefineSurface(const SVGA3dCmdDefineSurface *cmd)
{
- debug_printf("\t\t.sid = %u\n", (*cmd).sid);
+ _debug_printf("\t\t.sid = %u\n", (*cmd).sid);
switch((*cmd).surfaceFlags) {
case SVGA3D_SURFACE_CUBEMAP:
- debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_CUBEMAP\n");
+ _debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_CUBEMAP\n");
break;
case SVGA3D_SURFACE_HINT_STATIC:
- debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_STATIC\n");
+ _debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_STATIC\n");
break;
case SVGA3D_SURFACE_HINT_DYNAMIC:
- debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_DYNAMIC\n");
+ _debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_DYNAMIC\n");
break;
case SVGA3D_SURFACE_HINT_INDEXBUFFER:
- debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_INDEXBUFFER\n");
+ _debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_INDEXBUFFER\n");
break;
case SVGA3D_SURFACE_HINT_VERTEXBUFFER:
- debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_VERTEXBUFFER\n");
+ _debug_printf("\t\t.surfaceFlags = SVGA3D_SURFACE_HINT_VERTEXBUFFER\n");
break;
default:
- debug_printf("\t\t.surfaceFlags = %i\n", (*cmd).surfaceFlags);
+ _debug_printf("\t\t.surfaceFlags = %i\n", (*cmd).surfaceFlags);
break;
}
switch((*cmd).format) {
case SVGA3D_FORMAT_INVALID:
- debug_printf("\t\t.format = SVGA3D_FORMAT_INVALID\n");
+ _debug_printf("\t\t.format = SVGA3D_FORMAT_INVALID\n");
break;
case SVGA3D_X8R8G8B8:
- debug_printf("\t\t.format = SVGA3D_X8R8G8B8\n");
+ _debug_printf("\t\t.format = SVGA3D_X8R8G8B8\n");
break;
case SVGA3D_A8R8G8B8:
- debug_printf("\t\t.format = SVGA3D_A8R8G8B8\n");
+ _debug_printf("\t\t.format = SVGA3D_A8R8G8B8\n");
break;
case SVGA3D_R5G6B5:
- debug_printf("\t\t.format = SVGA3D_R5G6B5\n");
+ _debug_printf("\t\t.format = SVGA3D_R5G6B5\n");
break;
case SVGA3D_X1R5G5B5:
- debug_printf("\t\t.format = SVGA3D_X1R5G5B5\n");
+ _debug_printf("\t\t.format = SVGA3D_X1R5G5B5\n");
break;
case SVGA3D_A1R5G5B5:
- debug_printf("\t\t.format = SVGA3D_A1R5G5B5\n");
+ _debug_printf("\t\t.format = SVGA3D_A1R5G5B5\n");
break;
case SVGA3D_A4R4G4B4:
- debug_printf("\t\t.format = SVGA3D_A4R4G4B4\n");
+ _debug_printf("\t\t.format = SVGA3D_A4R4G4B4\n");
break;
case SVGA3D_Z_D32:
- debug_printf("\t\t.format = SVGA3D_Z_D32\n");
+ _debug_printf("\t\t.format = SVGA3D_Z_D32\n");
break;
case SVGA3D_Z_D16:
- debug_printf("\t\t.format = SVGA3D_Z_D16\n");
+ _debug_printf("\t\t.format = SVGA3D_Z_D16\n");
break;
case SVGA3D_Z_D24S8:
- debug_printf("\t\t.format = SVGA3D_Z_D24S8\n");
+ _debug_printf("\t\t.format = SVGA3D_Z_D24S8\n");
break;
case SVGA3D_Z_D15S1:
- debug_printf("\t\t.format = SVGA3D_Z_D15S1\n");
+ _debug_printf("\t\t.format = SVGA3D_Z_D15S1\n");
break;
case SVGA3D_LUMINANCE8:
- debug_printf("\t\t.format = SVGA3D_LUMINANCE8\n");
+ _debug_printf("\t\t.format = SVGA3D_LUMINANCE8\n");
break;
case SVGA3D_LUMINANCE4_ALPHA4:
- debug_printf("\t\t.format = SVGA3D_LUMINANCE4_ALPHA4\n");
+ _debug_printf("\t\t.format = SVGA3D_LUMINANCE4_ALPHA4\n");
break;
case SVGA3D_LUMINANCE16:
- debug_printf("\t\t.format = SVGA3D_LUMINANCE16\n");
+ _debug_printf("\t\t.format = SVGA3D_LUMINANCE16\n");
break;
case SVGA3D_LUMINANCE8_ALPHA8:
- debug_printf("\t\t.format = SVGA3D_LUMINANCE8_ALPHA8\n");
+ _debug_printf("\t\t.format = SVGA3D_LUMINANCE8_ALPHA8\n");
break;
case SVGA3D_DXT1:
- debug_printf("\t\t.format = SVGA3D_DXT1\n");
+ _debug_printf("\t\t.format = SVGA3D_DXT1\n");
break;
case SVGA3D_DXT2:
- debug_printf("\t\t.format = SVGA3D_DXT2\n");
+ _debug_printf("\t\t.format = SVGA3D_DXT2\n");
break;
case SVGA3D_DXT3:
- debug_printf("\t\t.format = SVGA3D_DXT3\n");
+ _debug_printf("\t\t.format = SVGA3D_DXT3\n");
break;
case SVGA3D_DXT4:
- debug_printf("\t\t.format = SVGA3D_DXT4\n");
+ _debug_printf("\t\t.format = SVGA3D_DXT4\n");
break;
case SVGA3D_DXT5:
- debug_printf("\t\t.format = SVGA3D_DXT5\n");
+ _debug_printf("\t\t.format = SVGA3D_DXT5\n");
break;
case SVGA3D_BUMPU8V8:
- debug_printf("\t\t.format = SVGA3D_BUMPU8V8\n");
+ _debug_printf("\t\t.format = SVGA3D_BUMPU8V8\n");
break;
case SVGA3D_BUMPL6V5U5:
- debug_printf("\t\t.format = SVGA3D_BUMPL6V5U5\n");
+ _debug_printf("\t\t.format = SVGA3D_BUMPL6V5U5\n");
break;
case SVGA3D_BUMPX8L8V8U8:
- debug_printf("\t\t.format = SVGA3D_BUMPX8L8V8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_BUMPX8L8V8U8\n");
break;
case SVGA3D_BUMPL8V8U8:
- debug_printf("\t\t.format = SVGA3D_BUMPL8V8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_BUMPL8V8U8\n");
break;
case SVGA3D_ARGB_S10E5:
- debug_printf("\t\t.format = SVGA3D_ARGB_S10E5\n");
+ _debug_printf("\t\t.format = SVGA3D_ARGB_S10E5\n");
break;
case SVGA3D_ARGB_S23E8:
- debug_printf("\t\t.format = SVGA3D_ARGB_S23E8\n");
+ _debug_printf("\t\t.format = SVGA3D_ARGB_S23E8\n");
break;
case SVGA3D_A2R10G10B10:
- debug_printf("\t\t.format = SVGA3D_A2R10G10B10\n");
+ _debug_printf("\t\t.format = SVGA3D_A2R10G10B10\n");
break;
case SVGA3D_V8U8:
- debug_printf("\t\t.format = SVGA3D_V8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_V8U8\n");
break;
case SVGA3D_Q8W8V8U8:
- debug_printf("\t\t.format = SVGA3D_Q8W8V8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_Q8W8V8U8\n");
break;
case SVGA3D_CxV8U8:
- debug_printf("\t\t.format = SVGA3D_CxV8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_CxV8U8\n");
break;
case SVGA3D_X8L8V8U8:
- debug_printf("\t\t.format = SVGA3D_X8L8V8U8\n");
+ _debug_printf("\t\t.format = SVGA3D_X8L8V8U8\n");
break;
case SVGA3D_A2W10V10U10:
- debug_printf("\t\t.format = SVGA3D_A2W10V10U10\n");
+ _debug_printf("\t\t.format = SVGA3D_A2W10V10U10\n");
break;
case SVGA3D_ALPHA8:
- debug_printf("\t\t.format = SVGA3D_ALPHA8\n");
+ _debug_printf("\t\t.format = SVGA3D_ALPHA8\n");
break;
case SVGA3D_R_S10E5:
- debug_printf("\t\t.format = SVGA3D_R_S10E5\n");
+ _debug_printf("\t\t.format = SVGA3D_R_S10E5\n");
break;
case SVGA3D_R_S23E8:
- debug_printf("\t\t.format = SVGA3D_R_S23E8\n");
+ _debug_printf("\t\t.format = SVGA3D_R_S23E8\n");
break;
case SVGA3D_RG_S10E5:
- debug_printf("\t\t.format = SVGA3D_RG_S10E5\n");
+ _debug_printf("\t\t.format = SVGA3D_RG_S10E5\n");
break;
case SVGA3D_RG_S23E8:
- debug_printf("\t\t.format = SVGA3D_RG_S23E8\n");
+ _debug_printf("\t\t.format = SVGA3D_RG_S23E8\n");
break;
case SVGA3D_BUFFER:
- debug_printf("\t\t.format = SVGA3D_BUFFER\n");
+ _debug_printf("\t\t.format = SVGA3D_BUFFER\n");
break;
case SVGA3D_Z_D24X8:
- debug_printf("\t\t.format = SVGA3D_Z_D24X8\n");
+ _debug_printf("\t\t.format = SVGA3D_Z_D24X8\n");
break;
case SVGA3D_FORMAT_MAX:
- debug_printf("\t\t.format = SVGA3D_FORMAT_MAX\n");
+ _debug_printf("\t\t.format = SVGA3D_FORMAT_MAX\n");
break;
default:
- debug_printf("\t\t.format = %i\n", (*cmd).format);
+ _debug_printf("\t\t.format = %i\n", (*cmd).format);
break;
}
- debug_printf("\t\t.face[0].numMipLevels = %u\n", (*cmd).face[0].numMipLevels);
- debug_printf("\t\t.face[1].numMipLevels = %u\n", (*cmd).face[1].numMipLevels);
- debug_printf("\t\t.face[2].numMipLevels = %u\n", (*cmd).face[2].numMipLevels);
- debug_printf("\t\t.face[3].numMipLevels = %u\n", (*cmd).face[3].numMipLevels);
- debug_printf("\t\t.face[4].numMipLevels = %u\n", (*cmd).face[4].numMipLevels);
- debug_printf("\t\t.face[5].numMipLevels = %u\n", (*cmd).face[5].numMipLevels);
+ _debug_printf("\t\t.face[0].numMipLevels = %u\n", (*cmd).face[0].numMipLevels);
+ _debug_printf("\t\t.face[1].numMipLevels = %u\n", (*cmd).face[1].numMipLevels);
+ _debug_printf("\t\t.face[2].numMipLevels = %u\n", (*cmd).face[2].numMipLevels);
+ _debug_printf("\t\t.face[3].numMipLevels = %u\n", (*cmd).face[3].numMipLevels);
+ _debug_printf("\t\t.face[4].numMipLevels = %u\n", (*cmd).face[4].numMipLevels);
+ _debug_printf("\t\t.face[5].numMipLevels = %u\n", (*cmd).face[5].numMipLevels);
+}
+
+static void
+dump_SVGASignedRect(const SVGASignedRect *cmd)
+{
+ _debug_printf("\t\t.left = %i\n", (*cmd).left);
+ _debug_printf("\t\t.top = %i\n", (*cmd).top);
+ _debug_printf("\t\t.right = %i\n", (*cmd).right);
+ _debug_printf("\t\t.bottom = %i\n", (*cmd).bottom);
+}
+
+static void
+dump_SVGA3dCmdBlitSurfaceToScreen(const SVGA3dCmdBlitSurfaceToScreen *cmd)
+{
+ _debug_printf("\t\t.srcImage.sid = %u\n", (*cmd).srcImage.sid);
+ _debug_printf("\t\t.srcImage.face = %u\n", (*cmd).srcImage.face);
+ _debug_printf("\t\t.srcImage.mipmap = %u\n", (*cmd).srcImage.mipmap);
+ _debug_printf("\t\t.srcRect.left = %i\n", (*cmd).srcRect.left);
+ _debug_printf("\t\t.srcRect.top = %i\n", (*cmd).srcRect.top);
+ _debug_printf("\t\t.srcRect.right = %i\n", (*cmd).srcRect.right);
+ _debug_printf("\t\t.srcRect.bottom = %i\n", (*cmd).srcRect.bottom);
+ _debug_printf("\t\t.destScreenId = %u\n", (*cmd).destScreenId);
+ _debug_printf("\t\t.destRect.left = %i\n", (*cmd).destRect.left);
+ _debug_printf("\t\t.destRect.top = %i\n", (*cmd).destRect.top);
+ _debug_printf("\t\t.destRect.right = %i\n", (*cmd).destRect.right);
+ _debug_printf("\t\t.destRect.bottom = %i\n", (*cmd).destRect.bottom);
+}
+
+
+void
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+ const uint8_t *body = (const uint8_t *)data;
+ const uint8_t *next = body + size;
+
+ switch(cmd_id) {
+ case SVGA_3D_CMD_SURFACE_DEFINE:
+ _debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
+ {
+ const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
+ dump_SVGA3dCmdDefineSurface(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dSize) <= next) {
+ dump_SVGA3dSize((const SVGA3dSize *)body);
+ body += sizeof(SVGA3dSize);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SURFACE_DESTROY:
+ _debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
+ {
+ const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
+ dump_SVGA3dCmdDestroySurface(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SURFACE_COPY:
+ _debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
+ {
+ const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
+ dump_SVGA3dCmdSurfaceCopy(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dCopyBox) <= next) {
+ dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+ body += sizeof(SVGA3dCopyBox);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SURFACE_STRETCHBLT:
+ _debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
+ {
+ const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
+ dump_SVGA3dCmdSurfaceStretchBlt(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SURFACE_DMA:
+ _debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
+ {
+ const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
+ dump_SVGA3dCmdSurfaceDMA(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dCopyBox) <= next) {
+ dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+ body += sizeof(SVGA3dCopyBox);
+ }
+ while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
+ dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
+ body += sizeof(SVGA3dCmdSurfaceDMASuffix);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_CONTEXT_DEFINE:
+ _debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
+ {
+ const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
+ dump_SVGA3dCmdDefineContext(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_CONTEXT_DESTROY:
+ _debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
+ {
+ const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
+ dump_SVGA3dCmdDestroyContext(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETTRANSFORM:
+ _debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
+ {
+ const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
+ dump_SVGA3dCmdSetTransform(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETZRANGE:
+ _debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
+ {
+ const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
+ dump_SVGA3dCmdSetZRange(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETRENDERSTATE:
+ _debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
+ {
+ const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
+ dump_SVGA3dCmdSetRenderState(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dRenderState) <= next) {
+ dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
+ body += sizeof(SVGA3dRenderState);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SETRENDERTARGET:
+ _debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
+ {
+ const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
+ dump_SVGA3dCmdSetRenderTarget(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETTEXTURESTATE:
+ _debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
+ {
+ const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
+ dump_SVGA3dCmdSetTextureState(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dTextureState) <= next) {
+ dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
+ body += sizeof(SVGA3dTextureState);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SETMATERIAL:
+ _debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
+ {
+ const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
+ dump_SVGA3dCmdSetMaterial(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETLIGHTDATA:
+ _debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
+ {
+ const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
+ dump_SVGA3dCmdSetLightData(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETLIGHTENABLED:
+ _debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
+ {
+ const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
+ dump_SVGA3dCmdSetLightEnabled(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETVIEWPORT:
+ _debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
+ {
+ const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
+ dump_SVGA3dCmdSetViewport(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SETCLIPPLANE:
+ _debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
+ {
+ const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
+ dump_SVGA3dCmdSetClipPlane(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_CLEAR:
+ _debug_printf("\tSVGA_3D_CMD_CLEAR\n");
+ {
+ const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
+ dump_SVGA3dCmdClear(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dRect) <= next) {
+ dump_SVGA3dRect((const SVGA3dRect *)body);
+ body += sizeof(SVGA3dRect);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_PRESENT:
+ _debug_printf("\tSVGA_3D_CMD_PRESENT\n");
+ {
+ const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
+ dump_SVGA3dCmdPresent(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGA3dCopyRect) <= next) {
+ dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
+ body += sizeof(SVGA3dCopyRect);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SHADER_DEFINE:
+ _debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
+ {
+ const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
+ dump_SVGA3dCmdDefineShader(cmd);
+ body = (const uint8_t *)&cmd[1];
+ svga_shader_dump((const uint32_t *)body,
+ (unsigned)(next - body)/sizeof(uint32_t),
+ FALSE );
+ body = next;
+ }
+ break;
+ case SVGA_3D_CMD_SHADER_DESTROY:
+ _debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
+ {
+ const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
+ dump_SVGA3dCmdDestroyShader(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SET_SHADER:
+ _debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
+ {
+ const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
+ dump_SVGA3dCmdSetShader(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_SET_SHADER_CONST:
+ _debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
+ {
+ const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
+ dump_SVGA3dCmdSetShaderConst(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_DRAW_PRIMITIVES:
+ _debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
+ {
+ const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
+ unsigned i, j;
+ dump_SVGA3dCmdDrawPrimitives(cmd);
+ body = (const uint8_t *)&cmd[1];
+ for(i = 0; i < cmd->numVertexDecls; ++i) {
+ dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
+ body += sizeof(SVGA3dVertexDecl);
+ }
+ for(j = 0; j < cmd->numRanges; ++j) {
+ dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
+ body += sizeof(SVGA3dPrimitiveRange);
+ }
+ while(body + sizeof(SVGA3dVertexDivisor) <= next) {
+ dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
+ body += sizeof(SVGA3dVertexDivisor);
+ }
+ }
+ break;
+ case SVGA_3D_CMD_SETSCISSORRECT:
+ _debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
+ {
+ const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
+ dump_SVGA3dCmdSetScissorRect(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_BEGIN_QUERY:
+ _debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
+ {
+ const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
+ dump_SVGA3dCmdBeginQuery(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_END_QUERY:
+ _debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
+ {
+ const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
+ dump_SVGA3dCmdEndQuery(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_WAIT_FOR_QUERY:
+ _debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
+ {
+ const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
+ dump_SVGA3dCmdWaitForQuery(cmd);
+ body = (const uint8_t *)&cmd[1];
+ }
+ break;
+ case SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN:
+ _debug_printf("\tSVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN\n");
+ {
+ const SVGA3dCmdBlitSurfaceToScreen *cmd = (const SVGA3dCmdBlitSurfaceToScreen *)body;
+ dump_SVGA3dCmdBlitSurfaceToScreen(cmd);
+ body = (const uint8_t *)&cmd[1];
+ while(body + sizeof(SVGASignedRect) <= next) {
+ dump_SVGASignedRect((const SVGASignedRect *)body);
+ body += sizeof(SVGASignedRect);
+ }
+ }
+ break;
+ default:
+ _debug_printf("\t0x%08x\n", cmd_id);
+ break;
+ }
+
+ while(body + sizeof(uint32_t) <= next) {
+ _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+ body += sizeof(uint32_t);
+ }
+ while(body + sizeof(uint32_t) <= next)
+ _debug_printf("\t\t0x%02x\n", *body++);
}
@@ -1432,303 +1764,19 @@ svga_dump_commands(const void *commands, uint32_t size)
const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
const uint8_t *body = (const uint8_t *)&header[1];
- next = (const uint8_t *)body + header->size;
+ next = body + header->size;
if(next > last)
break;
- switch(cmd_id) {
- case SVGA_3D_CMD_SURFACE_DEFINE:
- debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
- {
- const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
- dump_SVGA3dCmdDefineSurface(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dSize) <= next) {
- dump_SVGA3dSize((const SVGA3dSize *)body);
- body += sizeof(SVGA3dSize);
- }
- }
- break;
- case SVGA_3D_CMD_SURFACE_DESTROY:
- debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
- {
- const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
- dump_SVGA3dCmdDestroySurface(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SURFACE_COPY:
- debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
- {
- const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
- dump_SVGA3dCmdSurfaceCopy(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dCopyBox) <= next) {
- dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
- body += sizeof(SVGA3dCopyBox);
- }
- }
- break;
- case SVGA_3D_CMD_SURFACE_STRETCHBLT:
- debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
- {
- const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
- dump_SVGA3dCmdSurfaceStretchBlt(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SURFACE_DMA:
- debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
- {
- const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
- dump_SVGA3dCmdSurfaceDMA(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dCopyBox) <= next) {
- dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
- body += sizeof(SVGA3dCopyBox);
- }
- while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
- dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
- body += sizeof(SVGA3dCmdSurfaceDMASuffix);
- }
- }
- break;
- case SVGA_3D_CMD_CONTEXT_DEFINE:
- debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
- {
- const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
- dump_SVGA3dCmdDefineContext(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_CONTEXT_DESTROY:
- debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
- {
- const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
- dump_SVGA3dCmdDestroyContext(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETTRANSFORM:
- debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
- {
- const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
- dump_SVGA3dCmdSetTransform(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETZRANGE:
- debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
- {
- const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
- dump_SVGA3dCmdSetZRange(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETRENDERSTATE:
- debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
- {
- const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
- dump_SVGA3dCmdSetRenderState(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dRenderState) <= next) {
- dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
- body += sizeof(SVGA3dRenderState);
- }
- }
- break;
- case SVGA_3D_CMD_SETRENDERTARGET:
- debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
- {
- const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
- dump_SVGA3dCmdSetRenderTarget(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETTEXTURESTATE:
- debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
- {
- const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
- dump_SVGA3dCmdSetTextureState(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dTextureState) <= next) {
- dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
- body += sizeof(SVGA3dTextureState);
- }
- }
- break;
- case SVGA_3D_CMD_SETMATERIAL:
- debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
- {
- const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
- dump_SVGA3dCmdSetMaterial(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETLIGHTDATA:
- debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
- {
- const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
- dump_SVGA3dCmdSetLightData(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETLIGHTENABLED:
- debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
- {
- const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
- dump_SVGA3dCmdSetLightEnabled(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETVIEWPORT:
- debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
- {
- const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
- dump_SVGA3dCmdSetViewport(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SETCLIPPLANE:
- debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
- {
- const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
- dump_SVGA3dCmdSetClipPlane(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_CLEAR:
- debug_printf("\tSVGA_3D_CMD_CLEAR\n");
- {
- const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
- dump_SVGA3dCmdClear(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dRect) <= next) {
- dump_SVGA3dRect((const SVGA3dRect *)body);
- body += sizeof(SVGA3dRect);
- }
- }
- break;
- case SVGA_3D_CMD_PRESENT:
- debug_printf("\tSVGA_3D_CMD_PRESENT\n");
- {
- const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
- dump_SVGA3dCmdPresent(cmd);
- body = (const uint8_t *)&cmd[1];
- while(body + sizeof(SVGA3dCopyRect) <= next) {
- dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
- body += sizeof(SVGA3dCopyRect);
- }
- }
- break;
- case SVGA_3D_CMD_SHADER_DEFINE:
- debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
- {
- const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
- dump_SVGA3dCmdDefineShader(cmd);
- body = (const uint8_t *)&cmd[1];
- svga_shader_dump((const uint32_t *)body,
- (unsigned)(next - body)/sizeof(uint32_t),
- FALSE );
- body = next;
- }
- break;
- case SVGA_3D_CMD_SHADER_DESTROY:
- debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
- {
- const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
- dump_SVGA3dCmdDestroyShader(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SET_SHADER:
- debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
- {
- const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
- dump_SVGA3dCmdSetShader(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_SET_SHADER_CONST:
- debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
- {
- const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
- dump_SVGA3dCmdSetShaderConst(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_DRAW_PRIMITIVES:
- debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
- {
- const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
- unsigned i, j;
- dump_SVGA3dCmdDrawPrimitives(cmd);
- body = (const uint8_t *)&cmd[1];
- for(i = 0; i < cmd->numVertexDecls; ++i) {
- dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
- body += sizeof(SVGA3dVertexDecl);
- }
- for(j = 0; j < cmd->numRanges; ++j) {
- dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
- body += sizeof(SVGA3dPrimitiveRange);
- }
- while(body + sizeof(SVGA3dVertexDivisor) <= next) {
- dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
- body += sizeof(SVGA3dVertexDivisor);
- }
- }
- break;
- case SVGA_3D_CMD_SETSCISSORRECT:
- debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
- {
- const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
- dump_SVGA3dCmdSetScissorRect(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_BEGIN_QUERY:
- debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
- {
- const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
- dump_SVGA3dCmdBeginQuery(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_END_QUERY:
- debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
- {
- const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
- dump_SVGA3dCmdEndQuery(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- case SVGA_3D_CMD_WAIT_FOR_QUERY:
- debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
- {
- const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
- dump_SVGA3dCmdWaitForQuery(cmd);
- body = (const uint8_t *)&cmd[1];
- }
- break;
- default:
- debug_printf("\t0x%08x\n", cmd_id);
- break;
- }
-
- while(body + sizeof(uint32_t) <= next) {
- debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
- body += sizeof(uint32_t);
- }
- while(body + sizeof(uint32_t) <= next)
- debug_printf("\t\t0x%02x\n", *body++);
+ svga_dump_command(cmd_id, body, header->size);
}
else if(cmd_id == SVGA_CMD_FENCE) {
- debug_printf("\tSVGA_CMD_FENCE\n");
- debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+ _debug_printf("\tSVGA_CMD_FENCE\n");
+ _debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
next += 2*sizeof(uint32_t);
}
else {
- debug_printf("\t0x%08x\n", cmd_id);
+ _debug_printf("\t0x%08x\n", cmd_id);
next += sizeof(uint32_t);
}
}
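
The hunks above split the per-command pretty-printing out of svga_dump_commands() into a standalone svga_dump_command(cmd_id, data, size), which the header change below then exports. A minimal sketch of how a caller might walk a raw command buffer and dump each 3D command through the new entry point; the dump_buffer() wrapper, the include paths and the id/size layout assumed for SVGA3dCmdHeader are mine, not part of the patch:

    /* Minimal sketch, not part of the patch: dump every 3D command in a
     * buffer through the newly exported entry point.  Include paths and
     * the id/size fields of SVGA3dCmdHeader are assumed from svga3d_reg.h. */
    #include "svga3d_reg.h"
    #include "svgadump/svga_dump.h"

    static void
    dump_buffer(const void *commands, uint32_t size)
    {
       const uint8_t *next = (const uint8_t *)commands;
       const uint8_t *last = next + size;

       while (next + sizeof(SVGA3dCmdHeader) <= last) {
          const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
          const uint8_t *body = (const uint8_t *)&header[1];

          if (body + header->size > last)
             break;

          svga_dump_command(header->id, body, header->size);
          next = body + header->size;
       }
    }
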
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.h b/src/gallium/drivers/svga/svgadump/svga_dump.h
index 69a87020875..ca0154361cc 100644
--- a/src/gallium/drivers/svga/svgadump/svga_dump.h
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.h
@@ -28,6 +28,9 @@
#include "pipe/p_compiler.h"
+void
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size);
+
void
svga_dump_commands(const void *commands, uint32_t size);
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.py b/src/gallium/drivers/svga/svgadump/svga_dump.py
index 288e753296e..0bc0b3ae317 100755
--- a/src/gallium/drivers/svga/svgadump/svga_dump.py
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.py
@@ -71,14 +71,14 @@ class decl_dumper_t(decl_visitor.decl_visitor_t):
print ' switch(%s) {' % ("(*cmd)" + self._instance,)
for name, value in self.decl.values:
print ' case %s:' % (name,)
- print ' debug_printf("\\t\\t%s = %s\\n");' % (self._instance, name)
+ print ' _debug_printf("\\t\\t%s = %s\\n");' % (self._instance, name)
print ' break;'
print ' default:'
- print ' debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+ print ' _debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
print ' break;'
print ' }'
else:
- print ' debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
+ print ' _debug_printf("\\t\\t%s = %%i\\n", %s);' % (self._instance, "(*cmd)" + self._instance)
def dump_decl(instance, decl):
@@ -154,7 +154,7 @@ class type_dumper_t(type_visitor.type_visitor_t):
dump_decl(self.instance, decl)
def print_instance(self, format):
- print ' debug_printf("\\t\\t%s = %s\\n", %s);' % (self.instance, format, "(*cmd)" + self.instance)
+ print ' _debug_printf("\\t\\t%s = %s\\n", %s);' % (self.instance, format, "(*cmd)" + self.instance)
def dump_type(instance, type_):
@@ -202,11 +202,62 @@ cmds = [
('SVGA_3D_CMD_END_QUERY', 'SVGA3dCmdEndQuery', (), None),
('SVGA_3D_CMD_WAIT_FOR_QUERY', 'SVGA3dCmdWaitForQuery', (), None),
#('SVGA_3D_CMD_PRESENT_READBACK', None, (), None),
+ ('SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN', 'SVGA3dCmdBlitSurfaceToScreen', (), 'SVGASignedRect'),
]
def dump_cmds():
print r'''
void
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+ const uint8_t *body = (const uint8_t *)data;
+ const uint8_t *next = body + size;
+'''
+ print ' switch(cmd_id) {'
+ indexes = 'ijklmn'
+ for id, header, body, footer in cmds:
+ print ' case %s:' % id
+ print ' _debug_printf("\\t%s\\n");' % id
+ print ' {'
+ print ' const %s *cmd = (const %s *)body;' % (header, header)
+ if len(body):
+ print ' unsigned ' + ', '.join(indexes[:len(body)]) + ';'
+ print ' dump_%s(cmd);' % header
+ print ' body = (const uint8_t *)&cmd[1];'
+ for i in range(len(body)):
+ struct, count = body[i]
+ idx = indexes[i]
+ print ' for(%s = 0; %s < cmd->%s; ++%s) {' % (idx, idx, count, idx)
+ print ' dump_%s((const %s *)body);' % (struct, struct)
+ print ' body += sizeof(%s);' % struct
+ print ' }'
+ if footer is not None:
+ print ' while(body + sizeof(%s) <= next) {' % footer
+ print ' dump_%s((const %s *)body);' % (footer, footer)
+ print ' body += sizeof(%s);' % footer
+ print ' }'
+ if id == 'SVGA_3D_CMD_SHADER_DEFINE':
+ print ' svga_shader_dump((const uint32_t *)body,'
+ print ' (unsigned)(next - body)/sizeof(uint32_t),'
+ print ' FALSE);'
+ print ' body = next;'
+ print ' }'
+ print ' break;'
+ print ' default:'
+ print ' _debug_printf("\\t0x%08x\\n", cmd_id);'
+ print ' break;'
+ print ' }'
+ print r'''
+ while(body + sizeof(uint32_t) <= next) {
+ _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+ body += sizeof(uint32_t);
+ }
+ while(body + sizeof(uint32_t) <= next)
+ _debug_printf("\t\t0x%02x\n", *body++);
+}
+'''
+ print r'''
+void
svga_dump_commands(const void *commands, uint32_t size)
{
const uint8_t *next = commands;
@@ -221,59 +272,19 @@ svga_dump_commands(const void *commands, uint32_t size)
const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
const uint8_t *body = (const uint8_t *)&header[1];
- next = (const uint8_t *)body + header->size;
+ next = body + header->size;
if(next > last)
break;
-'''
- print ' switch(cmd_id) {'
- indexes = 'ijklmn'
- for id, header, body, footer in cmds:
- print ' case %s:' % id
- print ' debug_printf("\\t%s\\n");' % id
- print ' {'
- print ' const %s *cmd = (const %s *)body;' % (header, header)
- if len(body):
- print ' unsigned ' + ', '.join(indexes[:len(body)]) + ';'
- print ' dump_%s(cmd);' % header
- print ' body = (const uint8_t *)&cmd[1];'
- for i in range(len(body)):
- struct, count = body[i]
- idx = indexes[i]
- print ' for(%s = 0; %s < cmd->%s; ++%s) {' % (idx, idx, count, idx)
- print ' dump_%s((const %s *)body);' % (struct, struct)
- print ' body += sizeof(%s);' % struct
- print ' }'
- if footer is not None:
- print ' while(body + sizeof(%s) <= next) {' % footer
- print ' dump_%s((const %s *)body);' % (footer, footer)
- print ' body += sizeof(%s);' % footer
- print ' }'
- if id == 'SVGA_3D_CMD_SHADER_DEFINE':
- print ' sh_svga_dump((const uint32_t *)body, (unsigned)(next - body)/sizeof(uint32_t));'
- print ' body = next;'
- print ' }'
- print ' break;'
- print ' default:'
- print ' debug_printf("\\t0x%08x\\n", cmd_id);'
- print ' break;'
- print ' }'
-
- print r'''
- while(body + sizeof(uint32_t) <= next) {
- debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
- body += sizeof(uint32_t);
- }
- while(body + sizeof(uint32_t) <= next)
- debug_printf("\t\t0x%02x\n", *body++);
+ svga_dump_command(cmd_id, body, header->size);
}
else if(cmd_id == SVGA_CMD_FENCE) {
- debug_printf("\tSVGA_CMD_FENCE\n");
- debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
+ _debug_printf("\tSVGA_CMD_FENCE\n");
+ _debug_printf("\t\t0x%08x\n", ((const uint32_t *)next)[1]);
next += 2*sizeof(uint32_t);
}
else {
- debug_printf("\t0x%08x\n", cmd_id);
+ _debug_printf("\t0x%08x\n", cmd_id);
next += sizeof(uint32_t);
}
}
@@ -294,18 +305,18 @@ def main():
print '#include "svga_shader_dump.h"'
print '#include "svga3d_reg.h"'
print
- print '#include "pipe/p_debug.h"'
+ print '#include "util/u_debug.h"'
print '#include "svga_dump.h"'
print
config = parser.config_t(
- include_paths = ['include'],
+ include_paths = ['../../../include', '../include'],
compiler = 'gcc',
)
headers = [
- 'include/svga_types.h',
- 'include/svga3d_reg.h',
+ 'svga_types.h',
+ 'svga3d_reg.h',
]
decls = parser.parse(headers, config, parser.COMPILATION_MODE.ALL_AT_ONCE)
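
As the generator loop above shows, each entry in the cmds table is a tuple of (command id, header struct, a sequence of (body struct, count field) pairs, optional footer struct); the emitted case dumps the header, then each counted body array, then a trailing loop over the footer struct until the command size is exhausted. A rough sketch of the C the script would emit for a made-up entry ('SVGA_3D_CMD_FOO', 'SVGA3dCmdFoo', (('SVGA3dBar', 'numBars'),), 'SVGA3dBaz'), where all of these names are hypothetical, slotted into svga_dump_command's switch where body and next are in scope:

    /* Hypothetical generated case; FOO/Bar/Baz are made up for illustration. */
    case SVGA_3D_CMD_FOO:
       _debug_printf("\tSVGA_3D_CMD_FOO\n");
       {
          const SVGA3dCmdFoo *cmd = (const SVGA3dCmdFoo *)body;
          unsigned i;
          dump_SVGA3dCmdFoo(cmd);
          body = (const uint8_t *)&cmd[1];
          for(i = 0; i < cmd->numBars; ++i) {
             dump_SVGA3dBar((const SVGA3dBar *)body);
             body += sizeof(SVGA3dBar);
          }
          while(body + sizeof(SVGA3dBaz) <= next) {
             dump_SVGA3dBaz((const SVGA3dBaz *)body);
             body += sizeof(SVGA3dBaz);
          }
       }
       break;
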
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_dump.c b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
index b0e7fdf378a..70e27d86d30 100644
--- a/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_dump.c
@@ -50,16 +50,16 @@ static void dump_op( struct sh_op op, const char *mnemonic )
assert( op.is_reg == 0 );
if (op.coissue)
- debug_printf( "+" );
- debug_printf( "%s", mnemonic );
+ _debug_printf( "+" );
+ _debug_printf( "%s", mnemonic );
switch (op.control) {
case 0:
break;
case SVGA3DOPCONT_PROJECT:
- debug_printf( "p" );
+ _debug_printf( "p" );
break;
case SVGA3DOPCONT_BIAS:
- debug_printf( "b" );
+ _debug_printf( "b" );
break;
default:
assert( 0 );
@@ -72,28 +72,28 @@ static void dump_comp_op( struct sh_op op, const char *mnemonic )
assert( op.is_reg == 0 );
if (op.coissue)
- debug_printf( "+" );
- debug_printf( "%s", mnemonic );
+ _debug_printf( "+" );
+ _debug_printf( "%s", mnemonic );
switch (op.control) {
case SVGA3DOPCOMP_RESERVED0:
break;
case SVGA3DOPCOMP_GT:
- debug_printf("_gt");
+ _debug_printf("_gt");
break;
case SVGA3DOPCOMP_EQ:
- debug_printf("_eq");
+ _debug_printf("_eq");
break;
case SVGA3DOPCOMP_GE:
- debug_printf("_ge");
+ _debug_printf("_ge");
break;
case SVGA3DOPCOMP_LT:
- debug_printf("_lt");
+ _debug_printf("_lt");
break;
case SVGA3DOPCOMPC_NE:
- debug_printf("_ne");
+ _debug_printf("_ne");
break;
case SVGA3DOPCOMP_LE:
- debug_printf("_le");
+ _debug_printf("_le");
break;
case SVGA3DOPCOMP_RESERVED1:
default:
@@ -109,93 +109,93 @@ static void dump_reg( struct sh_reg reg, struct sh_srcreg *indreg, const struct
switch (sh_reg_type( reg )) {
case SVGA3DREG_TEMP:
- debug_printf( "r%u", reg.number );
+ _debug_printf( "r%u", reg.number );
break;
case SVGA3DREG_INPUT:
- debug_printf( "v%u", reg.number );
+ _debug_printf( "v%u", reg.number );
break;
case SVGA3DREG_CONST:
if (reg.relative) {
if (sh_srcreg_type( *indreg ) == SVGA3DREG_LOOP)
- debug_printf( "c[aL+%u]", reg.number );
+ _debug_printf( "c[aL+%u]", reg.number );
else
- debug_printf( "c[a%u.x+%u]", indreg->number, reg.number );
+ _debug_printf( "c[a%u.x+%u]", indreg->number, reg.number );
}
else
- debug_printf( "c%u", reg.number );
+ _debug_printf( "c%u", reg.number );
break;
case SVGA3DREG_ADDR: /* VS */
/* SVGA3DREG_TEXTURE */ /* PS */
if (di->is_ps)
- debug_printf( "t%u", reg.number );
+ _debug_printf( "t%u", reg.number );
else
- debug_printf( "a%u", reg.number );
+ _debug_printf( "a%u", reg.number );
break;
case SVGA3DREG_RASTOUT:
switch (reg.number) {
case 0 /*POSITION*/:
- debug_printf( "oPos" );
+ _debug_printf( "oPos" );
break;
case 1 /*FOG*/:
- debug_printf( "oFog" );
+ _debug_printf( "oFog" );
break;
case 2 /*POINT_SIZE*/:
- debug_printf( "oPts" );
+ _debug_printf( "oPts" );
break;
default:
assert( 0 );
- debug_printf( "???" );
+ _debug_printf( "???" );
}
break;
case SVGA3DREG_ATTROUT:
assert( reg.number < 2 );
- debug_printf( "oD%u", reg.number );
+ _debug_printf( "oD%u", reg.number );
break;
case SVGA3DREG_TEXCRDOUT:
/* SVGA3DREG_OUTPUT */
- debug_printf( "oT%u", reg.number );
+ _debug_printf( "oT%u", reg.number );
break;
case SVGA3DREG_COLOROUT:
- debug_printf( "oC%u", reg.number );
+ _debug_printf( "oC%u", reg.number );
break;
case SVGA3DREG_DEPTHOUT:
- debug_printf( "oD%u", reg.number );
+ _debug_printf( "oD%u", reg.number );
break;
case SVGA3DREG_SAMPLER:
- debug_printf( "s%u", reg.number );
+ _debug_printf( "s%u", reg.number );
break;
case SVGA3DREG_CONSTBOOL:
assert( !reg.relative );
- debug_printf( "b%u", reg.number );
+ _debug_printf( "b%u", reg.number );
break;
case SVGA3DREG_CONSTINT:
assert( !reg.relative );
- debug_printf( "i%u", reg.number );
+ _debug_printf( "i%u", reg.number );
break;
case SVGA3DREG_LOOP:
assert( reg.number == 0 );
- debug_printf( "aL" );
+ _debug_printf( "aL" );
break;
case SVGA3DREG_MISCTYPE:
switch (reg.number) {
case SVGA3DMISCREG_POSITION:
- debug_printf( "vPos" );
+ _debug_printf( "vPos" );
break;
case SVGA3DMISCREG_FACE:
- debug_printf( "vFace" );
+ _debug_printf( "vFace" );
break;
default:
assert(0);
@@ -204,46 +204,46 @@ static void dump_reg( struct sh_reg reg, struct sh_srcreg *indreg, const struct
break;
case SVGA3DREG_LABEL:
- debug_printf( "l%u", reg.number );
+ _debug_printf( "l%u", reg.number );
break;
case SVGA3DREG_PREDICATE:
- debug_printf( "p%u", reg.number );
+ _debug_printf( "p%u", reg.number );
break;
default:
assert( 0 );
- debug_printf( "???" );
+ _debug_printf( "???" );
}
}
static void dump_cdata( struct sh_cdata cdata )
{
- debug_printf( "%f, %f, %f, %f", cdata.xyzw[0], cdata.xyzw[1], cdata.xyzw[2], cdata.xyzw[3] );
+ _debug_printf( "%f, %f, %f, %f", cdata.xyzw[0], cdata.xyzw[1], cdata.xyzw[2], cdata.xyzw[3] );
}
static void dump_idata( struct sh_idata idata )
{
- debug_printf( "%d, %d, %d, %d", idata.xyzw[0], idata.xyzw[1], idata.xyzw[2], idata.xyzw[3] );
+ _debug_printf( "%d, %d, %d, %d", idata.xyzw[0], idata.xyzw[1], idata.xyzw[2], idata.xyzw[3] );
}
static void dump_bdata( boolean bdata )
{
- debug_printf( bdata ? "TRUE" : "FALSE" );
+ _debug_printf( bdata ? "TRUE" : "FALSE" );
}
static void dump_sampleinfo( struct ps_sampleinfo sampleinfo )
{
switch (sampleinfo.texture_type) {
case SVGA3DSAMP_2D:
- debug_printf( "_2d" );
+ _debug_printf( "_2d" );
break;
case SVGA3DSAMP_CUBE:
- debug_printf( "_cube" );
+ _debug_printf( "_cube" );
break;
case SVGA3DSAMP_VOLUME:
- debug_printf( "_volume" );
+ _debug_printf( "_volume" );
break;
default:
assert( 0 );
@@ -255,46 +255,46 @@ static void dump_usageinfo( struct vs_semantic semantic )
{
switch (semantic.usage) {
case SVGA3D_DECLUSAGE_POSITION:
- debug_printf("_position" );
+ _debug_printf("_position" );
break;
case SVGA3D_DECLUSAGE_BLENDWEIGHT:
- debug_printf("_blendweight" );
+ _debug_printf("_blendweight" );
break;
case SVGA3D_DECLUSAGE_BLENDINDICES:
- debug_printf("_blendindices" );
+ _debug_printf("_blendindices" );
break;
case SVGA3D_DECLUSAGE_NORMAL:
- debug_printf("_normal" );
+ _debug_printf("_normal" );
break;
case SVGA3D_DECLUSAGE_PSIZE:
- debug_printf("_psize" );
+ _debug_printf("_psize" );
break;
case SVGA3D_DECLUSAGE_TEXCOORD:
- debug_printf("_texcoord");
+ _debug_printf("_texcoord");
break;
case SVGA3D_DECLUSAGE_TANGENT:
- debug_printf("_tangent" );
+ _debug_printf("_tangent" );
break;
case SVGA3D_DECLUSAGE_BINORMAL:
- debug_printf("_binormal" );
+ _debug_printf("_binormal" );
break;
case SVGA3D_DECLUSAGE_TESSFACTOR:
- debug_printf("_tessfactor" );
+ _debug_printf("_tessfactor" );
break;
case SVGA3D_DECLUSAGE_POSITIONT:
- debug_printf("_positiont" );
+ _debug_printf("_positiont" );
break;
case SVGA3D_DECLUSAGE_COLOR:
- debug_printf("_color" );
+ _debug_printf("_color" );
break;
case SVGA3D_DECLUSAGE_FOG:
- debug_printf("_fog" );
+ _debug_printf("_fog" );
break;
case SVGA3D_DECLUSAGE_DEPTH:
- debug_printf("_depth" );
+ _debug_printf("_depth" );
break;
case SVGA3D_DECLUSAGE_SAMPLE:
- debug_printf("_sample");
+ _debug_printf("_sample");
break;
default:
assert( 0 );
@@ -302,7 +302,7 @@ static void dump_usageinfo( struct vs_semantic semantic )
}
if (semantic.usage_index != 0) {
- debug_printf("%d", semantic.usage_index );
+ _debug_printf("%d", semantic.usage_index );
}
}
@@ -316,47 +316,47 @@ static void dump_dstreg( struct sh_dstreg dstreg, const struct dump_info *di )
assert( (dstreg.modifier & (SVGA3DDSTMOD_SATURATE | SVGA3DDSTMOD_PARTIALPRECISION)) == dstreg.modifier );
if (dstreg.modifier & SVGA3DDSTMOD_SATURATE)
- debug_printf( "_sat" );
+ _debug_printf( "_sat" );
if (dstreg.modifier & SVGA3DDSTMOD_PARTIALPRECISION)
- debug_printf( "_pp" );
+ _debug_printf( "_pp" );
switch (dstreg.shift_scale) {
case 0:
break;
case 1:
- debug_printf( "_x2" );
+ _debug_printf( "_x2" );
break;
case 2:
- debug_printf( "_x4" );
+ _debug_printf( "_x4" );
break;
case 3:
- debug_printf( "_x8" );
+ _debug_printf( "_x8" );
break;
case 13:
- debug_printf( "_d8" );
+ _debug_printf( "_d8" );
break;
case 14:
- debug_printf( "_d4" );
+ _debug_printf( "_d4" );
break;
case 15:
- debug_printf( "_d2" );
+ _debug_printf( "_d2" );
break;
default:
assert( 0 );
}
- debug_printf( " " );
+ _debug_printf( " " );
u.dstreg = dstreg;
dump_reg( u.reg, NULL, di );
if (dstreg.write_mask != SVGA3DWRITEMASK_ALL) {
- debug_printf( "." );
+ _debug_printf( "." );
if (dstreg.write_mask & SVGA3DWRITEMASK_0)
- debug_printf( "x" );
+ _debug_printf( "x" );
if (dstreg.write_mask & SVGA3DWRITEMASK_1)
- debug_printf( "y" );
+ _debug_printf( "y" );
if (dstreg.write_mask & SVGA3DWRITEMASK_2)
- debug_printf( "z" );
+ _debug_printf( "z" );
if (dstreg.write_mask & SVGA3DWRITEMASK_3)
- debug_printf( "w" );
+ _debug_printf( "w" );
}
}
@@ -372,19 +372,19 @@ static void dump_srcreg( struct sh_srcreg srcreg, struct sh_srcreg *indreg, cons
case SVGA3DSRCMOD_BIASNEG:
case SVGA3DSRCMOD_SIGNNEG:
case SVGA3DSRCMOD_X2NEG:
- debug_printf( "-" );
+ _debug_printf( "-" );
break;
case SVGA3DSRCMOD_ABS:
- debug_printf( "|" );
+ _debug_printf( "|" );
break;
case SVGA3DSRCMOD_ABSNEG:
- debug_printf( "-|" );
+ _debug_printf( "-|" );
break;
case SVGA3DSRCMOD_COMP:
- debug_printf( "1-" );
+ _debug_printf( "1-" );
break;
case SVGA3DSRCMOD_NOT:
- debug_printf( "!" );
+ _debug_printf( "!" );
}
u.srcreg = srcreg;
@@ -397,39 +397,39 @@ static void dump_srcreg( struct sh_srcreg srcreg, struct sh_srcreg *indreg, cons
break;
case SVGA3DSRCMOD_ABS:
case SVGA3DSRCMOD_ABSNEG:
- debug_printf( "|" );
+ _debug_printf( "|" );
break;
case SVGA3DSRCMOD_BIAS:
case SVGA3DSRCMOD_BIASNEG:
- debug_printf( "_bias" );
+ _debug_printf( "_bias" );
break;
case SVGA3DSRCMOD_SIGN:
case SVGA3DSRCMOD_SIGNNEG:
- debug_printf( "_bx2" );
+ _debug_printf( "_bx2" );
break;
case SVGA3DSRCMOD_X2:
case SVGA3DSRCMOD_X2NEG:
- debug_printf( "_x2" );
+ _debug_printf( "_x2" );
break;
case SVGA3DSRCMOD_DZ:
- debug_printf( "_dz" );
+ _debug_printf( "_dz" );
break;
case SVGA3DSRCMOD_DW:
- debug_printf( "_dw" );
+ _debug_printf( "_dw" );
break;
default:
assert( 0 );
}
if (srcreg.swizzle_x != 0 || srcreg.swizzle_y != 1 || srcreg.swizzle_z != 2 || srcreg.swizzle_w != 3) {
- debug_printf( "." );
+ _debug_printf( "." );
if (srcreg.swizzle_x == srcreg.swizzle_y && srcreg.swizzle_y == srcreg.swizzle_z && srcreg.swizzle_z == srcreg.swizzle_w) {
- debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
+ _debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
}
else {
- debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
- debug_printf( "%c", "xyzw"[srcreg.swizzle_y] );
- debug_printf( "%c", "xyzw"[srcreg.swizzle_z] );
- debug_printf( "%c", "xyzw"[srcreg.swizzle_w] );
+ _debug_printf( "%c", "xyzw"[srcreg.swizzle_x] );
+ _debug_printf( "%c", "xyzw"[srcreg.swizzle_y] );
+ _debug_printf( "%c", "xyzw"[srcreg.swizzle_z] );
+ _debug_printf( "%c", "xyzw"[srcreg.swizzle_w] );
}
}
}
@@ -447,15 +447,15 @@ svga_shader_dump(
if (do_binary) {
for (i = 0; i < dwords; i++)
- debug_printf(" 0x%08x,\n", assem[i]);
+ _debug_printf(" 0x%08x,\n", assem[i]);
- debug_printf("\n\n");
+ _debug_printf("\n\n");
}
di.version.value = *assem++;
di.is_ps = (di.version.type == SVGA3D_PS_TYPE);
- debug_printf(
+ _debug_printf(
"%s_%u_%u\n",
di.is_ps ? "ps" : "vs",
di.version.major,
@@ -465,7 +465,7 @@ svga_shader_dump(
struct sh_op op = *(struct sh_op *) assem;
if (assem - start >= dwords) {
- debug_printf("... ran off end of buffer\n");
+ _debug_printf("... ran off end of buffer\n");
assert(0);
return;
}
@@ -475,7 +475,7 @@ svga_shader_dump(
{
struct sh_dcl dcl = *(struct sh_dcl *) assem;
- debug_printf( "dcl" );
+ _debug_printf( "dcl" );
if (sh_dstreg_type( dcl.reg ) == SVGA3DREG_SAMPLER)
dump_sampleinfo( dcl.u.ps.sampleinfo );
else if (di.is_ps) {
@@ -486,7 +486,7 @@ svga_shader_dump(
else
dump_usageinfo( dcl.u.vs.semantic );
dump_dstreg( dcl.reg, &di );
- debug_printf( "\n" );
+ _debug_printf( "\n" );
assem += sizeof( struct sh_dcl ) / sizeof( unsigned );
}
break;
@@ -495,11 +495,11 @@ svga_shader_dump(
{
struct sh_defb defb = *(struct sh_defb *) assem;
- debug_printf( "defb " );
+ _debug_printf( "defb " );
dump_reg( defb.reg, NULL, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_bdata( defb.data );
- debug_printf( "\n" );
+ _debug_printf( "\n" );
assem += sizeof( struct sh_defb ) / sizeof( unsigned );
}
break;
@@ -508,11 +508,11 @@ svga_shader_dump(
{
struct sh_defi defi = *(struct sh_defi *) assem;
- debug_printf( "defi " );
+ _debug_printf( "defi " );
dump_reg( defi.reg, NULL, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_idata( defi.idata );
- debug_printf( "\n" );
+ _debug_printf( "\n" );
assem += sizeof( struct sh_defi ) / sizeof( unsigned );
}
break;
@@ -528,11 +528,11 @@ svga_shader_dump(
else {
struct sh_unaryop unaryop = *(struct sh_unaryop *) assem;
dump_dstreg( unaryop.dst, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_srcreg( unaryop.src, NULL, &di );
assem += sizeof( struct sh_unaryop ) / sizeof( unsigned );
}
- debug_printf( "\n" );
+ _debug_printf( "\n" );
break;
case SVGA3DOP_TEX:
@@ -549,7 +549,7 @@ svga_shader_dump(
struct sh_unaryop unaryop = *(struct sh_unaryop *) assem;
dump_dstreg( unaryop.dst, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_srcreg( unaryop.src, NULL, &di );
assem += sizeof( struct sh_unaryop ) / sizeof( unsigned );
}
@@ -559,30 +559,30 @@ svga_shader_dump(
dump_op( op, "texld" );
dump_dstreg( binaryop.dst, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_srcreg( binaryop.src0, NULL, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_srcreg( binaryop.src1, NULL, &di );
assem += sizeof( struct sh_binaryop ) / sizeof( unsigned );
}
- debug_printf( "\n" );
+ _debug_printf( "\n" );
break;
case SVGA3DOP_DEF:
{
struct sh_def def = *(struct sh_def *) assem;
- debug_printf( "def " );
+ _debug_printf( "def " );
dump_reg( def.reg, NULL, &di );
- debug_printf( ", " );
+ _debug_printf( ", " );
dump_cdata( def.cdata );
- debug_printf( "\n" );
+ _debug_printf( "\n" );
assem += sizeof( struct sh_def ) / sizeof( unsigned );
}
break;
case SVGA3DOP_PHASE:
- debug_printf( "phase\n" );
+ _debug_printf( "phase\n" );
assem += sizeof( struct sh_op ) / sizeof( unsigned );
break;
@@ -596,12 +596,12 @@ svga_shader_dump(
break;
case SVGA3DOP_RET:
- debug_printf( "ret\n" );
+ _debug_printf( "ret\n" );
assem += sizeof( struct sh_op ) / sizeof( unsigned );
break;
case SVGA3DOP_END:
- debug_printf( "end\n" );
+ _debug_printf( "end\n" );
finished = TRUE;
break;
@@ -640,14 +640,14 @@ svga_shader_dump(
}
if (not_first_arg)
- debug_printf( ", " );
+ _debug_printf( ", " );
else
- debug_printf( " " );
+ _debug_printf( " " );
dump_srcreg( srcreg, &indreg, &di );
not_first_arg = TRUE;
}
- debug_printf( "\n" );
+ _debug_printf( "\n" );
}
}
}
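
With the disassembler now writing through _debug_printf(), its output lands in the same sink as the command dump above. A minimal sketch of invoking it directly on a raw byte-code blob, mirroring the SVGA_3D_CMD_SHADER_DEFINE path in svga_dump.c; the wrapper function, its arguments and the include paths are assumptions, not part of the patch:

    /* Minimal sketch, not part of the patch: disassemble a shader token
     * stream the same way the SVGA_3D_CMD_SHADER_DEFINE case does. */
    #include "pipe/p_compiler.h"
    #include "svgadump/svga_shader_dump.h"

    static void
    disassemble_blob(const uint32_t *tokens, unsigned nr_bytes)
    {
       svga_shader_dump(tokens,
                        nr_bytes / sizeof(uint32_t),
                        FALSE);   /* do_binary: skip the leading hex listing */
    }
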
diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README
index 1000c31e49a..203c3851bc3 100644
--- a/src/gallium/drivers/trace/README
+++ b/src/gallium/drivers/trace/README
@@ -24,11 +24,10 @@ ensure the right libGL.so is being picked by doing
ldd progs/trivial/tri
-== Traceing ==
+== Tracing ==
-For traceing then do
+For tracing then do
- export XMESA_TRACE=y
GALLIUM_TRACE=tri.trace progs/trivial/tri
which should create a tri.trace file, which is an XML file. You can view copying
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 540855c067d..5a9f0fc6901 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -97,25 +97,6 @@ trace_surface_unwrap(struct trace_context *tr_ctx,
static INLINE void
-trace_context_set_edgeflags(struct pipe_context *_pipe,
- const unsigned *bitfield)
-{
- struct trace_context *tr_ctx = trace_context(_pipe);
- struct pipe_context *pipe = tr_ctx->pipe;
-
- trace_dump_call_begin("pipe_context", "set_edgeflags");
-
- trace_dump_arg(ptr, pipe);
- /* FIXME: we don't know how big this array is */
- trace_dump_arg(ptr, bitfield);
-
- pipe->set_edgeflags(pipe, bitfield);;
-
- trace_dump_call_end();
-}
-
-
-static INLINE void
trace_context_draw_block(struct trace_context *tr_ctx, int flag)
{
int k;
@@ -145,10 +126,16 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
for (k = 0; k < tr_ctx->curr.nr_cbufs; k++)
if (tr_ctx->draw_rule.surf == tr_ctx->curr.cbufs[k])
block = TRUE;
- if (tr_ctx->draw_rule.tex)
+ if (tr_ctx->draw_rule.tex) {
for (k = 0; k < tr_ctx->curr.num_texs; k++)
if (tr_ctx->draw_rule.tex == tr_ctx->curr.tex[k])
block = TRUE;
+ for (k = 0; k < tr_ctx->curr.num_vert_texs; k++) {
+ if (tr_ctx->draw_rule.tex == tr_ctx->curr.vert_tex[k]) {
+ block = TRUE;
+ }
+ }
+ }
if (block)
tr_ctx->draw_blocked |= (flag | 4);
@@ -174,16 +161,15 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
pipe_mutex_unlock(tr_ctx->draw_mutex);
}
-static INLINE boolean
+static INLINE void
trace_context_draw_arrays(struct pipe_context *_pipe,
unsigned mode, unsigned start, unsigned count)
{
struct trace_context *tr_ctx = trace_context(_pipe);
struct pipe_context *pipe = tr_ctx->pipe;
- boolean result;
if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
- return 0;
+ return;
trace_context_draw_block(tr_ctx, 1);
@@ -194,19 +180,15 @@ trace_context_draw_arrays(struct pipe_context *_pipe,
trace_dump_arg(uint, start);
trace_dump_arg(uint, count);
- result = pipe->draw_arrays(pipe, mode, start, count);;
-
- trace_dump_ret(bool, result);
+ pipe->draw_arrays(pipe, mode, start, count);
trace_dump_call_end();
trace_context_draw_block(tr_ctx, 2);
-
- return result;
}
-static INLINE boolean
+static INLINE void
trace_context_draw_elements(struct pipe_context *_pipe,
struct pipe_buffer *_indexBuffer,
unsigned indexSize,
@@ -216,10 +198,9 @@ trace_context_draw_elements(struct pipe_context *_pipe,
struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
struct pipe_context *pipe = tr_ctx->pipe;
struct pipe_buffer *indexBuffer = tr_buf->buffer;
- boolean result;
if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
- return 0;
+ return;
trace_context_draw_block(tr_ctx, 1);
@@ -234,19 +215,15 @@ trace_context_draw_elements(struct pipe_context *_pipe,
trace_dump_arg(uint, start);
trace_dump_arg(uint, count);
- result = pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);;
-
- trace_dump_ret(bool, result);
+ pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);
trace_dump_call_end();
trace_context_draw_block(tr_ctx, 2);
-
- return result;
}
-static INLINE boolean
+static INLINE void
trace_context_draw_range_elements(struct pipe_context *_pipe,
struct pipe_buffer *_indexBuffer,
unsigned indexSize,
@@ -260,10 +237,9 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
struct trace_buffer *tr_buf = trace_buffer(_indexBuffer);
struct pipe_context *pipe = tr_ctx->pipe;
struct pipe_buffer *indexBuffer = tr_buf->buffer;
- boolean result;
if (tr_ctx->curr.fs->disabled || tr_ctx->curr.vs->disabled)
- return 0;
+ return;
trace_context_draw_block(tr_ctx, 1);
@@ -280,18 +256,14 @@ trace_context_draw_range_elements(struct pipe_context *_pipe,
trace_dump_arg(uint, start);
trace_dump_arg(uint, count);
- result = pipe->draw_range_elements(pipe,
- indexBuffer,
- indexSize, minIndex, maxIndex,
- mode, start, count);
-
- trace_dump_ret(bool, result);
+ pipe->draw_range_elements(pipe,
+ indexBuffer,
+ indexSize, minIndex, maxIndex,
+ mode, start, count);
trace_dump_call_end();
trace_context_draw_block(tr_ctx, 2);
-
- return result;
}
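The draw hooks above now return void rather than boolean, so callers simply issue the draw and move on instead of checking a result. A minimal caller-side sketch under that interface, assuming an already set-up context and a hypothetical buffer of 16-bit indices:

    #include "pipe/p_context.h"
    #include "pipe/p_defines.h"
    #include "pipe/p_state.h"

    static void
    draw_example(struct pipe_context *pipe, struct pipe_buffer *index_buf)
    {
       /* Non-indexed draw: there is no longer a return value to check. */
       pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 3);

       /* Indexed draw with 2-byte indices; also returns void now. */
       pipe->draw_elements(pipe, index_buf, 2, PIPE_PRIM_TRIANGLES, 0, 3);
    }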
@@ -308,7 +280,7 @@ trace_context_create_query(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(uint, query_type);
- result = pipe->create_query(pipe, query_type);;
+ result = pipe->create_query(pipe, query_type);
trace_dump_ret(ptr, result);
@@ -330,7 +302,7 @@ trace_context_destroy_query(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, query);
- pipe->destroy_query(pipe, query);;
+ pipe->destroy_query(pipe, query);
trace_dump_call_end();
}
@@ -348,7 +320,7 @@ trace_context_begin_query(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, query);
- pipe->begin_query(pipe, query);;
+ pipe->begin_query(pipe, query);
trace_dump_call_end();
}
@@ -387,7 +359,7 @@ trace_context_get_query_result(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
- _result = pipe->get_query_result(pipe, query, wait, presult);;
+ _result = pipe->get_query_result(pipe, query, wait, presult);
result = *presult;
trace_dump_arg(uint, result);
@@ -412,7 +384,7 @@ trace_context_create_blend_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(blend_state, state);
- result = pipe->create_blend_state(pipe, state);;
+ result = pipe->create_blend_state(pipe, state);
trace_dump_ret(ptr, result);
@@ -434,7 +406,7 @@ trace_context_bind_blend_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->bind_blend_state(pipe, state);;
+ pipe->bind_blend_state(pipe, state);
trace_dump_call_end();
}
@@ -452,7 +424,7 @@ trace_context_delete_blend_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->delete_blend_state(pipe, state);;
+ pipe->delete_blend_state(pipe, state);
trace_dump_call_end();
}
@@ -471,7 +443,7 @@ trace_context_create_sampler_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(sampler_state, state);
- result = pipe->create_sampler_state(pipe, state);;
+ result = pipe->create_sampler_state(pipe, state);
trace_dump_ret(ptr, result);
@@ -482,19 +454,40 @@ trace_context_create_sampler_state(struct pipe_context *_pipe,
static INLINE void
-trace_context_bind_sampler_states(struct pipe_context *_pipe,
- unsigned num_states, void **states)
+trace_context_bind_fragment_sampler_states(struct pipe_context *_pipe,
+ unsigned num_states,
+ void **states)
{
struct trace_context *tr_ctx = trace_context(_pipe);
struct pipe_context *pipe = tr_ctx->pipe;
- trace_dump_call_begin("pipe_context", "bind_sampler_states");
+ trace_dump_call_begin("pipe_context", "bind_fragment_sampler_states");
trace_dump_arg(ptr, pipe);
trace_dump_arg(uint, num_states);
trace_dump_arg_array(ptr, states, num_states);
- pipe->bind_sampler_states(pipe, num_states, states);;
+ pipe->bind_fragment_sampler_states(pipe, num_states, states);
+
+ trace_dump_call_end();
+}
+
+
+static INLINE void
+trace_context_bind_vertex_sampler_states(struct pipe_context *_pipe,
+ unsigned num_states,
+ void **states)
+{
+ struct trace_context *tr_ctx = trace_context(_pipe);
+ struct pipe_context *pipe = tr_ctx->pipe;
+
+ trace_dump_call_begin("pipe_context", "bind_vertex_sampler_states");
+
+ trace_dump_arg(ptr, pipe);
+ trace_dump_arg(uint, num_states);
+ trace_dump_arg_array(ptr, states, num_states);
+
+ pipe->bind_vertex_sampler_states(pipe, num_states, states);
trace_dump_call_end();
}
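The single bind_sampler_states hook is split into per-stage entry points here. A sketch of binding samplers through the new hooks, assuming CSO handles previously returned by create_sampler_state (the handle names are hypothetical):

    #include "pipe/p_context.h"

    static void
    bind_samplers_example(struct pipe_context *pipe,
                          void *frag_samplers[2], void *vert_sampler)
    {
       /* Fragment and vertex samplers are now bound through separate hooks. */
       pipe->bind_fragment_sampler_states(pipe, 2, frag_samplers);
       pipe->bind_vertex_sampler_states(pipe, 1, &vert_sampler);
    }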
@@ -512,7 +505,7 @@ trace_context_delete_sampler_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->delete_sampler_state(pipe, state);;
+ pipe->delete_sampler_state(pipe, state);
trace_dump_call_end();
}
@@ -531,7 +524,7 @@ trace_context_create_rasterizer_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(rasterizer_state, state);
- result = pipe->create_rasterizer_state(pipe, state);;
+ result = pipe->create_rasterizer_state(pipe, state);
trace_dump_ret(ptr, result);
@@ -553,7 +546,7 @@ trace_context_bind_rasterizer_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->bind_rasterizer_state(pipe, state);;
+ pipe->bind_rasterizer_state(pipe, state);
trace_dump_call_end();
}
@@ -571,7 +564,7 @@ trace_context_delete_rasterizer_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->delete_rasterizer_state(pipe, state);;
+ pipe->delete_rasterizer_state(pipe, state);
trace_dump_call_end();
}
@@ -587,7 +580,7 @@ trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe,
trace_dump_call_begin("pipe_context", "create_depth_stencil_alpha_state");
- result = pipe->create_depth_stencil_alpha_state(pipe, state);;
+ result = pipe->create_depth_stencil_alpha_state(pipe, state);
trace_dump_arg(ptr, pipe);
trace_dump_arg(depth_stencil_alpha_state, state);
@@ -612,7 +605,7 @@ trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->bind_depth_stencil_alpha_state(pipe, state);;
+ pipe->bind_depth_stencil_alpha_state(pipe, state);
trace_dump_call_end();
}
@@ -630,7 +623,7 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->delete_depth_stencil_alpha_state(pipe, state);;
+ pipe->delete_depth_stencil_alpha_state(pipe, state);
trace_dump_call_end();
}
@@ -649,7 +642,7 @@ trace_context_create_fs_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(shader_state, state);
- result = pipe->create_fs_state(pipe, state);;
+ result = pipe->create_fs_state(pipe, state);
trace_dump_ret(ptr, result);
@@ -752,7 +745,7 @@ trace_context_bind_vs_state(struct pipe_context *_pipe,
if (tr_shdr && tr_shdr->replaced)
state = tr_shdr->replaced;
- pipe->bind_vs_state(pipe, state);;
+ pipe->bind_vs_state(pipe, state);
trace_dump_call_end();
}
@@ -772,7 +765,7 @@ trace_context_delete_vs_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(ptr, state);
- pipe->delete_vs_state(pipe, state);;
+ pipe->delete_vs_state(pipe, state);
trace_dump_call_end();
@@ -792,7 +785,7 @@ trace_context_set_blend_color(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(blend_color, state);
- pipe->set_blend_color(pipe, state);;
+ pipe->set_blend_color(pipe, state);
trace_dump_call_end();
}
@@ -810,7 +803,7 @@ trace_context_set_clip_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(clip_state, state);
- pipe->set_clip_state(pipe, state);;
+ pipe->set_clip_state(pipe, state);
trace_dump_call_end();
}
@@ -819,13 +812,13 @@ trace_context_set_clip_state(struct pipe_context *_pipe,
static INLINE void
trace_context_set_constant_buffer(struct pipe_context *_pipe,
uint shader, uint index,
- const struct pipe_constant_buffer *buffer)
+ struct pipe_buffer *buffer)
{
struct trace_context *tr_ctx = trace_context(_pipe);
struct pipe_context *pipe = tr_ctx->pipe;
if (buffer)
- trace_screen_user_buffer_update(_pipe->screen, buffer->buffer);
+ trace_screen_user_buffer_update(_pipe->screen, buffer);
trace_dump_call_begin("pipe_context", "set_constant_buffer");
@@ -834,10 +827,11 @@ trace_context_set_constant_buffer(struct pipe_context *_pipe,
trace_dump_arg(uint, index);
trace_dump_arg(constant_buffer, buffer);
+ /* XXX: unwrap the traced buffer before handing it to the real pipe */
if (buffer) {
- struct pipe_constant_buffer _buffer;
- _buffer.buffer = trace_buffer_unwrap(tr_ctx, buffer->buffer);
- pipe->set_constant_buffer(pipe, shader, index, &_buffer);
+ struct pipe_buffer *_buffer;
+ _buffer = trace_buffer_unwrap(tr_ctx, buffer);
+ pipe->set_constant_buffer(pipe, shader, index, _buffer);
} else {
pipe->set_constant_buffer(pipe, shader, index, buffer);
}
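set_constant_buffer now takes the pipe_buffer directly instead of a pipe_constant_buffer wrapper. A sketch of a caller under the new interface, assuming the constant buffer was created and filled elsewhere:

    #include "pipe/p_context.h"
    #include "pipe/p_defines.h"
    #include "pipe/p_state.h"

    static void
    set_fs_constants_example(struct pipe_context *pipe,
                             struct pipe_buffer *constbuf)
    {
       /* Slot 0 of the fragment shader takes the buffer directly. */
       pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, constbuf);
    }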
@@ -882,7 +876,7 @@ trace_context_set_framebuffer_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(framebuffer_state, state);
- pipe->set_framebuffer_state(pipe, state);;
+ pipe->set_framebuffer_state(pipe, state);
trace_dump_call_end();
}
@@ -900,7 +894,7 @@ trace_context_set_polygon_stipple(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(poly_stipple, state);
- pipe->set_polygon_stipple(pipe, state);;
+ pipe->set_polygon_stipple(pipe, state);
trace_dump_call_end();
}
@@ -918,7 +912,7 @@ trace_context_set_scissor_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(scissor_state, state);
- pipe->set_scissor_state(pipe, state);;
+ pipe->set_scissor_state(pipe, state);
trace_dump_call_end();
}
@@ -936,16 +930,16 @@ trace_context_set_viewport_state(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(viewport_state, state);
- pipe->set_viewport_state(pipe, state);;
+ pipe->set_viewport_state(pipe, state);
trace_dump_call_end();
}
static INLINE void
-trace_context_set_sampler_textures(struct pipe_context *_pipe,
- unsigned num_textures,
- struct pipe_texture **textures)
+trace_context_set_fragment_sampler_textures(struct pipe_context *_pipe,
+ unsigned num_textures,
+ struct pipe_texture **textures)
{
struct trace_context *tr_ctx = trace_context(_pipe);
struct trace_texture *tr_tex;
@@ -961,13 +955,44 @@ trace_context_set_sampler_textures(struct pipe_context *_pipe,
}
textures = unwrapped_textures;
- trace_dump_call_begin("pipe_context", "set_sampler_textures");
+ trace_dump_call_begin("pipe_context", "set_fragment_sampler_textures");
+
+ trace_dump_arg(ptr, pipe);
+ trace_dump_arg(uint, num_textures);
+ trace_dump_arg_array(ptr, textures, num_textures);
+
+ pipe->set_fragment_sampler_textures(pipe, num_textures, textures);
+
+ trace_dump_call_end();
+}
+
+
+static INLINE void
+trace_context_set_vertex_sampler_textures(struct pipe_context *_pipe,
+ unsigned num_textures,
+ struct pipe_texture **textures)
+{
+ struct trace_context *tr_ctx = trace_context(_pipe);
+ struct trace_texture *tr_tex;
+ struct pipe_context *pipe = tr_ctx->pipe;
+ struct pipe_texture *unwrapped_textures[PIPE_MAX_VERTEX_SAMPLERS];
+ unsigned i;
+
+ tr_ctx->curr.num_vert_texs = num_textures;
+ for(i = 0; i < num_textures; ++i) {
+ tr_tex = trace_texture(textures[i]);
+ tr_ctx->curr.vert_tex[i] = tr_tex;
+ unwrapped_textures[i] = tr_tex ? tr_tex->texture : NULL;
+ }
+ textures = unwrapped_textures;
+
+ trace_dump_call_begin("pipe_context", "set_vertex_sampler_textures");
trace_dump_arg(ptr, pipe);
trace_dump_arg(uint, num_textures);
trace_dump_arg_array(ptr, textures, num_textures);
- pipe->set_sampler_textures(pipe, num_textures, textures);;
+ pipe->set_vertex_sampler_textures(pipe, num_textures, textures);
trace_dump_call_end();
}
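Sampler textures follow the same per-stage split as sampler states. A sketch of setting one fragment and one vertex texture through the new hooks, assuming both textures already exist:

    #include "pipe/p_context.h"
    #include "pipe/p_state.h"

    static void
    set_textures_example(struct pipe_context *pipe,
                         struct pipe_texture *frag_tex,
                         struct pipe_texture *vert_tex)
    {
       /* Fragment and vertex sampler textures are set separately now. */
       pipe->set_fragment_sampler_textures(pipe, 1, &frag_tex);
       pipe->set_vertex_sampler_textures(pipe, 1, &vert_tex);
    }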
@@ -1026,7 +1051,7 @@ trace_context_set_vertex_elements(struct pipe_context *_pipe,
trace_dump_struct_array(vertex_element, elements, num_elements);
trace_dump_arg_end();
- pipe->set_vertex_elements(pipe, num_elements, elements);;
+ pipe->set_vertex_elements(pipe, num_elements, elements);
trace_dump_call_end();
}
@@ -1087,7 +1112,7 @@ trace_context_surface_fill(struct pipe_context *_pipe,
trace_dump_arg(uint, width);
trace_dump_arg(uint, height);
- pipe->surface_fill(pipe, dst, dstx, dsty, width, height, value);;
+ pipe->surface_fill(pipe, dst, dstx, dsty, width, height, value);
trace_dump_call_end();
}
@@ -1130,7 +1155,7 @@ trace_context_flush(struct pipe_context *_pipe,
trace_dump_arg(ptr, pipe);
trace_dump_arg(uint, flags);
- pipe->flush(pipe, flags, fence);;
+ pipe->flush(pipe, flags, fence);
if(fence)
trace_dump_ret(ptr, *fence);
@@ -1242,7 +1267,6 @@ trace_context_create(struct pipe_screen *_screen,
tr_ctx->base.winsys = _screen->winsys;
tr_ctx->base.screen = _screen;
tr_ctx->base.destroy = trace_context_destroy;
- tr_ctx->base.set_edgeflags = trace_context_set_edgeflags;
tr_ctx->base.draw_arrays = trace_context_draw_arrays;
tr_ctx->base.draw_elements = trace_context_draw_elements;
tr_ctx->base.draw_range_elements = trace_context_draw_range_elements;
@@ -1255,7 +1279,8 @@ trace_context_create(struct pipe_screen *_screen,
tr_ctx->base.bind_blend_state = trace_context_bind_blend_state;
tr_ctx->base.delete_blend_state = trace_context_delete_blend_state;
tr_ctx->base.create_sampler_state = trace_context_create_sampler_state;
- tr_ctx->base.bind_sampler_states = trace_context_bind_sampler_states;
+ tr_ctx->base.bind_fragment_sampler_states = trace_context_bind_fragment_sampler_states;
+ tr_ctx->base.bind_vertex_sampler_states = trace_context_bind_vertex_sampler_states;
tr_ctx->base.delete_sampler_state = trace_context_delete_sampler_state;
tr_ctx->base.create_rasterizer_state = trace_context_create_rasterizer_state;
tr_ctx->base.bind_rasterizer_state = trace_context_bind_rasterizer_state;
@@ -1276,7 +1301,8 @@ trace_context_create(struct pipe_screen *_screen,
tr_ctx->base.set_polygon_stipple = trace_context_set_polygon_stipple;
tr_ctx->base.set_scissor_state = trace_context_set_scissor_state;
tr_ctx->base.set_viewport_state = trace_context_set_viewport_state;
- tr_ctx->base.set_sampler_textures = trace_context_set_sampler_textures;
+ tr_ctx->base.set_fragment_sampler_textures = trace_context_set_fragment_sampler_textures;
+ tr_ctx->base.set_vertex_sampler_textures = trace_context_set_vertex_sampler_textures;
tr_ctx->base.set_vertex_buffers = trace_context_set_vertex_buffers;
tr_ctx->base.set_vertex_elements = trace_context_set_vertex_elements;
if (pipe->surface_copy)
diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h
index 6febe4b4114..852b480765a 100644
--- a/src/gallium/drivers/trace/tr_context.h
+++ b/src/gallium/drivers/trace/tr_context.h
@@ -54,6 +54,9 @@ struct trace_context
struct trace_texture *tex[PIPE_MAX_SAMPLERS];
unsigned num_texs;
+ struct trace_texture *vert_tex[PIPE_MAX_VERTEX_SAMPLERS];
+ unsigned num_vert_texs;
+
unsigned nr_cbufs;
struct trace_texture *cbufs[PIPE_MAX_COLOR_BUFS];
struct trace_texture *zsbuf;
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index bcf6751af4f..32f61f8c944 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -43,19 +43,6 @@ void trace_dump_format(enum pipe_format format)
}
-void trace_dump_block(const struct pipe_format_block *block)
-{
- if (!trace_dumping_enabled_locked())
- return;
-
- trace_dump_struct_begin("pipe_format_block");
- trace_dump_member(uint, block, size);
- trace_dump_member(uint, block, width);
- trace_dump_member(uint, block, height);
- trace_dump_struct_end();
-}
-
-
static void trace_dump_reference(const struct pipe_reference *reference)
{
if (!trace_dumping_enabled_locked())
@@ -83,19 +70,15 @@ void trace_dump_template(const struct pipe_texture *templat)
trace_dump_member(format, templat, format);
trace_dump_member_begin("width");
- trace_dump_array(uint, templat->width, 1);
+ trace_dump_uint(templat->width0);
trace_dump_member_end();
trace_dump_member_begin("height");
- trace_dump_array(uint, templat->height, 1);
+ trace_dump_uint(templat->height0);
trace_dump_member_end();
trace_dump_member_begin("depth");
- trace_dump_array(uint, templat->depth, 1);
- trace_dump_member_end();
-
- trace_dump_member_begin("block");
- trace_dump_block(&templat->block);
+ trace_dump_uint(templat->depth0);
trace_dump_member_end();
trace_dump_member(uint, templat, last_level);
@@ -246,7 +229,7 @@ void trace_dump_clip_state(const struct pipe_clip_state *state)
}
-void trace_dump_constant_buffer(const struct pipe_constant_buffer *state)
+void trace_dump_constant_buffer(const struct pipe_buffer *state)
{
if (!trace_dumping_enabled_locked())
return;
@@ -258,7 +241,7 @@ void trace_dump_constant_buffer(const struct pipe_constant_buffer *state)
trace_dump_struct_begin("pipe_constant_buffer");
- trace_dump_member(buffer_ptr, state, buffer);
+ trace_dump_reference(&state->reference);
trace_dump_struct_end();
}
@@ -426,7 +409,7 @@ void trace_dump_sampler_state(const struct pipe_sampler_state *state)
trace_dump_member(uint, state, min_img_filter);
trace_dump_member(uint, state, min_mip_filter);
trace_dump_member(uint, state, mag_img_filter);
- trace_dump_member(bool, state, compare_mode);
+ trace_dump_member(uint, state, compare_mode);
trace_dump_member(uint, state, compare_func);
trace_dump_member(bool, state, normalized_coords);
trace_dump_member(uint, state, prefilter);
@@ -483,16 +466,9 @@ void trace_dump_transfer(const struct pipe_transfer *state)
trace_dump_struct_begin("pipe_transfer");
- trace_dump_member(format, state, format);
trace_dump_member(uint, state, width);
trace_dump_member(uint, state, height);
- trace_dump_member_begin("block");
- trace_dump_block(&state->block);
- trace_dump_member_end();
-
- trace_dump_member(uint, state, nblocksx);
- trace_dump_member(uint, state, nblocksy);
trace_dump_member(uint, state, stride);
trace_dump_member(uint, state, usage);
diff --git a/src/gallium/drivers/trace/tr_dump_state.h b/src/gallium/drivers/trace/tr_dump_state.h
index 05b821adb64..c7860fd6e18 100644
--- a/src/gallium/drivers/trace/tr_dump_state.h
+++ b/src/gallium/drivers/trace/tr_dump_state.h
@@ -35,11 +35,8 @@
void trace_dump_format(enum pipe_format format);
-void trace_dump_block(const struct pipe_format_block *block);
-
void trace_dump_template(const struct pipe_texture *templat);
-
void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state);
void trace_dump_poly_stipple(const struct pipe_poly_stipple *state);
@@ -50,7 +47,7 @@ void trace_dump_scissor_state(const struct pipe_scissor_state *state);
void trace_dump_clip_state(const struct pipe_clip_state *state);
-void trace_dump_constant_buffer(const struct pipe_constant_buffer *state);
+void trace_dump_constant_buffer(const struct pipe_buffer *state);
void trace_dump_token(const struct tgsi_token *token);
diff --git a/src/gallium/drivers/trace/tr_rbug.c b/src/gallium/drivers/trace/tr_rbug.c
index 0372d92782b..0546aad9b50 100644
--- a/src/gallium/drivers/trace/tr_rbug.c
+++ b/src/gallium/drivers/trace/tr_rbug.c
@@ -26,6 +26,7 @@
**************************************************************************/
+#include "util/u_format.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
@@ -200,10 +201,12 @@ trace_rbug_texture_info(struct trace_rbug *tr_rbug, struct rbug_header *header,
t = tr_tex->texture;
rbug_send_texture_info_reply(tr_rbug->con, serial,
t->target, t->format,
- t->width, t->last_level + 1,
- t->height, t->last_level + 1,
- t->depth, t->last_level + 1,
- t->block.width, t->block.height, t->block.size,
+ &t->width0, 1,
+ &t->height0, 1,
+ &t->depth0, 1,
+ util_format_get_blockwidth(t->format),
+ util_format_get_blockheight(t->format),
+ util_format_get_blocksize(t->format),
t->last_level,
t->nr_samples,
t->tex_usage,
@@ -251,9 +254,12 @@ trace_rbug_texture_read(struct trace_rbug *tr_rbug, struct rbug_header *header,
map = screen->transfer_map(screen, t);
rbug_send_texture_read_reply(tr_rbug->con, serial,
- t->format,
- t->block.width, t->block.height, t->block.size,
- (uint8_t*)map, t->stride * t->nblocksy,
+ t->texture->format,
+ util_format_get_blockwidth(t->texture->format),
+ util_format_get_blockheight(t->texture->format),
+ util_format_get_blocksize(t->texture->format),
+ (uint8_t*)map,
+ t->stride * util_format_get_nblocksy(t->texture->format, t->height),
t->stride,
NULL);
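With pipe_format_block removed, per-format block metrics come from the util_format helpers, as in the reply code above. A small sketch of the size computation used there, assuming the row stride comes from the transfer; only the _nblocksy helper that appears in this patch is relied on:

    #include "util/u_format.h"

    /* Size in bytes of one 2D image, given its format, height in texels,
     * and row stride in bytes.  Rows are counted in blocks, not texels,
     * which is what makes this correct for compressed formats too. */
    static unsigned
    image_size_example(enum pipe_format format, unsigned height, unsigned stride)
    {
       return util_format_get_nblocksy(format, height) * stride;
    }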
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 7da9bd3866b..117503aaff6 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -25,6 +25,7 @@
*
**************************************************************************/
+#include "util/u_format.h"
#include "util/u_memory.h"
#include "util/u_simple_list.h"
@@ -35,6 +36,7 @@
#include "tr_screen.h"
#include "pipe/p_inlines.h"
+#include "pipe/p_format.h"
static boolean trace = FALSE;
@@ -424,7 +426,7 @@ trace_screen_transfer_unmap(struct pipe_screen *_screen,
struct pipe_transfer *transfer = tr_trans->transfer;
if(tr_trans->map) {
- size_t size = transfer->nblocksy * transfer->stride;
+ size_t size = util_format_get_nblocksy(transfer->texture->format, transfer->height) * transfer->stride;
trace_dump_call_begin("pipe_screen", "transfer_write");