author    | Francisco Jerez <currojerez@riseup.net> | 2014-10-28 15:59:34 +0200
committer | Francisco Jerez <currojerez@riseup.net> | 2014-10-30 16:39:53 +0200
commit    | a841b3c0cb61b11f993eaa52e75ae72daa4d5fa4 (patch)
tree      | 859e614042badaee0feeb510f6f3fbc089ccb421
parent    | d46cf50e4ce13b478544de223ec64302ab832d59 (diff)
i965: Unify most of the visiting code in the VEC4 and FS visitors. (branch: i965-unified-visitor)
The VEC4 and FS visitor classes are still huge, and there is still a lot
that could be unified, but most of what is left doesn't have much to do
with visiting.
36 files changed, 3760 insertions, 6993 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 9c006daa0e3..d61193f8970 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -70,6 +70,7 @@ i965_FILES = \
 	brw_gs_state.c \
 	brw_gs_surface_state.c \
 	brw_interpolation_map.c \
+	brw_ir_visitor.cpp \
 	brw_lower_texture_gradients.cpp \
 	brw_lower_unnormalized_offset.cpp \
 	brw_meta_updownsample.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index bb49a0ae955..7af127f5fee 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -143,7 +143,7 @@ bblock_t::combine_with(bblock_t *that)
 }
 
 void
-bblock_t::dump(backend_visitor *v) const
+bblock_t::dump(brw::base_visitor *v) const
 {
    int ip = this->start_ip;
    foreach_inst_in_block(backend_instruction, inst, this) {
@@ -422,7 +422,7 @@ cfg_t::make_block_array()
 }
 
 void
-cfg_t::dump(backend_visitor *v) const
+cfg_t::dump(brw::base_visitor *v) const
 {
    foreach_block (block, this) {
       fprintf(stderr, "START B%d", block->num);
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index c06ed61a79f..6e27027e41a 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -31,6 +31,10 @@
 
 #include "brw_shader.h"
 
+#ifdef __cplusplus
+#include "brw_ir_visitor.h"
+#endif
+
 struct bblock_t;
 
 struct bblock_link {
@@ -60,7 +64,7 @@ struct bblock_t {
    bool is_successor_of(const bblock_t *block) const;
    bool can_combine_with(const bblock_t *that) const;
    void combine_with(bblock_t *that);
-   void dump(backend_visitor *v) const;
+   void dump(brw::base_visitor *v) const;
 
    backend_instruction *start();
    const backend_instruction *start() const;
@@ -204,7 +208,7 @@ struct cfg_t {
    void set_next_block(bblock_t **cur, bblock_t *block, int ip);
    void make_block_array();
 
-   void dump(backend_visitor *v) const;
+   void dump(brw::base_visitor *v) const;
 #endif
 
    void *mem_ctx;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 884e28bf8b4..4a1ffdc5b8a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -757,20 +757,20 @@ struct brw_tracked_state {
    void (*emit)( struct brw_context *brw );
 };
 
+enum shader_time_shader_entry {
+   ST_BASE,
+   ST_WRITTEN,
+   ST_RESET,
+   ST_SUM,
+   ST_NUM_ENTRIES
+};
+
 enum shader_time_shader_type {
    ST_NONE,
-   ST_VS,
-   ST_VS_WRITTEN,
-   ST_VS_RESET,
-   ST_GS,
-   ST_GS_WRITTEN,
-   ST_GS_RESET,
-   ST_FS8,
-   ST_FS8_WRITTEN,
-   ST_FS8_RESET,
-   ST_FS16,
-   ST_FS16_WRITTEN,
-   ST_FS16_RESET,
+   ST_VS = ST_NONE + ST_NUM_ENTRIES,
+   ST_GS = ST_VS + ST_NUM_ENTRIES,
+   ST_FS8 = ST_GS + ST_NUM_ENTRIES,
+   ST_FS16 = ST_FS8 + ST_NUM_ENTRIES
 };
 
 /* Flags for brw->state.cache.
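[Editor's note] The brw_context.h hunk above replaces the flat ST_* enumerator list with a base-plus-entry encoding: each shader type reserves ST_NUM_ENTRIES consecutive slots, and a record within a type is selected by adding a shader_time_shader_entry offset, so ST_FS8 + ST_WRITTEN takes over the role of the old ST_FS8_WRITTEN. A minimal, standalone illustration of how the two enums compose (the enum values are copied from the hunk; the main() is only a demonstration):

// Illustrative only: how the restructured shader-time enums compose.
#include <cassert>

enum shader_time_shader_entry { ST_BASE, ST_WRITTEN, ST_RESET, ST_SUM, ST_NUM_ENTRIES };

enum shader_time_shader_type {
   ST_NONE,
   ST_VS = ST_NONE + ST_NUM_ENTRIES,
   ST_GS = ST_VS + ST_NUM_ENTRIES,
   ST_FS8 = ST_GS + ST_NUM_ENTRIES,
   ST_FS16 = ST_FS8 + ST_NUM_ENTRIES
};

int main()
{
   // The "written" record of the SIMD8 fragment-shader type.
   int fs8_written = ST_FS8 + ST_WRITTEN;
   assert(fs8_written == 13); // 12 (ST_FS8) + 1 (ST_WRITTEN)
   return 0;
}

The payoff is that stage-independent code can compute any record's index arithmetically instead of switching over a per-stage enumerator.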
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
index 4c9d7b95db8..be66c9efcb4 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
@@ -36,7 +36,7 @@
  *   - if/else/endif
  */
 bool
-dead_control_flow_eliminate(backend_visitor *v)
+dead_control_flow_eliminate(brw::base_visitor *v)
 {
    bool progress = false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
index 57a4dabc83c..1824fb98c33 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
@@ -23,4 +23,4 @@
 
 #include "brw_shader.h"
 
-bool dead_control_flow_eliminate(backend_visitor *v);
+bool dead_control_flow_eliminate(brw::base_visitor *v);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2943f042dd0..2cf2294960b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -124,7 +124,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case GRF:
    case HW_REG:
    case MRF:
-      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+      this->regs_written = (MAX2(dst.width * dst.stride, 1) *
+                            type_sz(dst.type) + 31) / 32;
       break;
    case BAD_FILE:
       this->regs_written = 0;
@@ -228,7 +229,7 @@ fs_inst::resize_sources(uint8_t num_sources)
    if (this->sources != num_sources) {
       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 
-      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
+      for (int i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];
 
      delete[] this->src;
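[Editor's note] The first brw_fs.cpp hunk above clamps the width * stride product to at least one channel before rounding the byte count up to whole 32-byte registers, so a stride-0 (scalar-replicated) destination still reports one register written instead of zero. A small sketch of just that arithmetic, with stand-in names (REG_SIZE and the function are illustrative, not driver code):

// Sketch of the rounding in fs_inst::init() above.
#include <cstdio>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define REG_SIZE 32 /* bytes per hardware register */

static unsigned regs_written(unsigned width, unsigned stride, unsigned type_sz)
{
   // Ceiling division by the register size; the MAX2 keeps a stride-0
   // destination from computing zero registers written.
   return (MAX2(width * stride, 1) * type_sz + REG_SIZE - 1) / REG_SIZE;
}

int main()
{
   std::printf("%u\n", regs_written(8, 1, 4));  // SIMD8 float: 32 bytes -> 1 reg
   std::printf("%u\n", regs_written(16, 1, 4)); // SIMD16 float: 64 bytes -> 2 regs
   std::printf("%u\n", regs_written(8, 0, 4));  // stride 0: still 1 reg, not 0
   return 0;
}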
@@ -237,236 +238,6 @@ fs_inst::resize_sources(uint8_t num_sources)
    }
 }
 
-#define ALU1(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
-   }
-
-#define ALU2(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
-   }
-
-#define ALU2_ACC(op)                                                    \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
-      inst->writes_accumulator = true;                                  \
-      return inst;                                                      \
-   }
-
-#define ALU3(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1, const fs_reg &src2)               \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
-   }
-
-ALU1(NOT)
-ALU1(MOV)
-ALU1(FRC)
-ALU1(RNDD)
-ALU1(RNDE)
-ALU1(RNDZ)
-ALU2(ADD)
-ALU2(MUL)
-ALU2_ACC(MACH)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHL)
-ALU2(SHR)
-ALU2(ASR)
-ALU3(LRP)
-ALU1(BFREV)
-ALU3(BFE)
-ALU2(BFI1)
-ALU3(BFI2)
-ALU1(FBH)
-ALU1(FBL)
-ALU1(CBIT)
-ALU3(MAD)
-ALU2_ACC(ADDC)
-ALU2_ACC(SUBB)
-ALU2(SEL)
-ALU2(MAC)
-
-/** Gen4 predicated IF. */
-fs_inst *
-fs_visitor::IF(enum brw_predicate predicate)
-{
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
-   inst->predicate = predicate;
-   return inst;
-}
-
-/** Gen6 IF with embedded comparison. */
-fs_inst *
-fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition)
-{
-   assert(brw->gen == 6);
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
-                                        reg_null_d, src0, src1);
-   inst->conditional_mod = condition;
-   return inst;
-}
-
-/**
- * CMP: Sets the low bit of the destination channels with the result
- * of the comparison, while the upper bits are undefined, and updates
- * the flag register with the packed 16 bits of the result.
- */
-fs_inst *
-fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition)
-{
-   fs_inst *inst;
-
-   /* Take the instruction:
-    *
-    * CMP null<d> src0<f> src1<f>
-    *
-    * Original gen4 does type conversion to the destination type before
-    * comparison, producing garbage results for floating point comparisons.
-    * gen5 does the comparison on the execution type (resolved source types),
-    * so dst type doesn't matter.  gen6 does comparison and then uses the
-    * result as if it was the dst type with no conversion, which happens to
-    * mostly work out for float-interpreted-as-int since our comparisons are
-    * for >0, =0, <0.
-    */
-   if (brw->gen == 4) {
-      dst.type = src0.type;
-      if (dst.file == HW_REG)
-         dst.fixed_hw_reg.type = dst.type;
-   }
-
-   resolve_ud_negate(&src0);
-   resolve_ud_negate(&src1);
-
-   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
-   inst->conditional_mod = condition;
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
-{
-   uint8_t exec_size = dst.width;
-   for (int i = 0; i < sources; ++i) {
-      assert(src[i].width % dst.width == 0);
-      if (src[i].width > exec_size)
-         exec_size = src[i].width;
-   }
-
-   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
-                                        dst, src, sources);
-   inst->regs_written = 0;
-   for (int i = 0; i < sources; ++i) {
-      /* The LOAD_PAYLOAD instruction only really makes sense if we are
-       * dealing with whole registers.  If this ever changes, we can deal
-       * with it later.
-       */
-      int size = src[i].effective_width * type_sz(src[i].type);
-      assert(size % 32 == 0);
-      inst->regs_written += (size + 31) / 32;
-   }
-
-   return inst;
-}
-
-exec_list
-fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
-                                       const fs_reg &surf_index,
-                                       const fs_reg &varying_offset,
-                                       uint32_t const_offset)
-{
-   exec_list instructions;
-   fs_inst *inst;
-
-   /* We have our constant surface use a pitch of 4 bytes, so our index can
-    * be any component of a vector, and then we load 4 contiguous
-    * components starting from that.
-    *
-    * We break down the const_offset to a portion added to the variable
-    * offset and a portion done using reg_offset, which means that if you
-    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
-    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
-    * CSE can later notice that those loads are all the same and eliminate
-    * the redundant ones.
-    */
-   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
-   instructions.push_tail(ADD(vec4_offset,
-                              varying_offset, fs_reg(const_offset & ~3)));
-
-   int scale = 1;
-   if (brw->gen == 4 && dst.width == 8) {
-      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
-       * u, v, r) as parameters, or we can just use the SIMD16 message
-       * consisting of (header, u).  We choose the second, at the cost of a
-       * longer return length.
-       */
-      scale = 2;
-   }
-
-   enum opcode op;
-   if (brw->gen >= 7)
-      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
-   else
-      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
-
-   assert(dst.width % 8 == 0);
-   int regs_written = 4 * (dst.width / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
-                               dst.type, dst.width);
-   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
-   inst->regs_written = regs_written;
-   instructions.push_tail(inst);
-
-   if (brw->gen < 7) {
-      inst->base_mrf = 13;
-      inst->header_present = true;
-      if (brw->gen == 4)
-         inst->mlen = 3;
-      else
-         inst->mlen = 1 + dispatch_width / 8;
-   }
-
-   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
-   instructions.push_tail(MOV(dst, result));
-
-   return instructions;
-}
-
-/**
- * A helper for MOV generation for fixing up broken hardware SEND dependency
- * handling.
- */
-fs_inst *
-fs_visitor::DEP_RESOLVE_MOV(int grf)
-{
-   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
-
-   inst->ir = NULL;
-   inst->annotation = "send dependency resolve";
-
-   /* The caller always wants uncompressed to emit the minimal extra
-    * dependencies, and to avoid having to deal with aligning its regs to 2.
-    */
-   inst->exec_size = 8;
-
-   return inst;
-}
-
 bool
 fs_inst::equals(fs_inst *inst) const
 {
@@ -632,186 +403,6 @@ fs_reg::is_contiguous() const
    return stride == 1;
 }
 
-bool
-fs_reg::is_valid_3src() const
-{
-   return file == GRF || file == UNIFORM;
-}
-
-int
-fs_visitor::type_size(const struct glsl_type *type)
-{
-   unsigned int size, i;
-
-   switch (type->base_type) {
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_BOOL:
-      return type->components();
-   case GLSL_TYPE_ARRAY:
-      return type_size(type->fields.array) * type->length;
-   case GLSL_TYPE_STRUCT:
-      size = 0;
-      for (i = 0; i < type->length; i++) {
-         size += type_size(type->fields.structure[i].type);
-      }
-      return size;
-   case GLSL_TYPE_SAMPLER:
-      /* Samplers take up no register space, since they're baked in at
-       * link time.
-       */
-      return 0;
-   case GLSL_TYPE_ATOMIC_UINT:
-      return 0;
-   case GLSL_TYPE_IMAGE:
-   case GLSL_TYPE_VOID:
-   case GLSL_TYPE_ERROR:
-   case GLSL_TYPE_INTERFACE:
-      unreachable("not reached");
-   }
-
-   return 0;
-}
-
-fs_reg
-fs_visitor::get_timestamp()
-{
-   assert(brw->gen >= 7);
-
-   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
-                                          BRW_ARF_TIMESTAMP,
-                                          0),
-                             BRW_REGISTER_TYPE_UD));
-
-   fs_reg dst = fs_reg(this, glsl_type::uint_type);
-
-   fs_inst *mov = emit(MOV(dst, ts));
-   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
-    * even if it's not enabled in the dispatch.
-    */
-   mov->force_writemask_all = true;
-   mov->exec_size = 8;
-
-   /* The caller wants the low 32 bits of the timestamp.  Since it's running
-    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
-    * which is plenty of time for our purposes.  It is identical across the
-    * EUs, but since it's tracking GPU core speed it will increment at a
-    * varying rate as render P-states change.
-    *
-    * The caller could also check if render P-states have changed (or anything
-    * else that might disrupt timing) by setting smear to 2 and checking if
-    * that field is != 0.
-    */
-   dst.set_smear(0);
-
-   return dst;
-}
-
-void
-fs_visitor::emit_shader_time_begin()
-{
-   current_annotation = "shader time start";
-   shader_start_time = get_timestamp();
-}
-
-void
-fs_visitor::emit_shader_time_end()
-{
-   current_annotation = "shader time end";
-
-   enum shader_time_shader_type type, written_type, reset_type;
-   if (dispatch_width == 8) {
-      type = ST_FS8;
-      written_type = ST_FS8_WRITTEN;
-      reset_type = ST_FS8_RESET;
-   } else {
-      assert(dispatch_width == 16);
-      type = ST_FS16;
-      written_type = ST_FS16_WRITTEN;
-      reset_type = ST_FS16_RESET;
-   }
-
-   fs_reg shader_end_time = get_timestamp();
-
-   /* Check that there weren't any timestamp reset events (assuming these
-    * were the only two timestamp reads that happened).
-    */
-   fs_reg reset = shader_end_time;
-   reset.set_smear(2);
-   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
-   test->conditional_mod = BRW_CONDITIONAL_Z;
-   emit(IF(BRW_PREDICATE_NORMAL));
-
-   push_force_uncompressed();
-   fs_reg start = shader_start_time;
-   start.negate = true;
-   fs_reg diff = fs_reg(this, glsl_type::uint_type);
-   emit(ADD(diff, start, shader_end_time));
-
-   /* If there were no instructions between the two timestamp gets, the diff
-    * is 2 cycles.  Remove that overhead, so I can forget about that when
-    * trying to determine the time taken for single instructions.
-    */
-   emit(ADD(diff, diff, fs_reg(-2u)));
-
-   emit_shader_time_write(type, diff);
-   emit_shader_time_write(written_type, fs_reg(1u));
-   emit(BRW_OPCODE_ELSE);
-   emit_shader_time_write(reset_type, fs_reg(1u));
-   emit(BRW_OPCODE_ENDIF);
-
-   pop_force_uncompressed();
-}
-
-void
-fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-                                   fs_reg value)
-{
-   int shader_time_index =
-      brw_get_shader_time_index(brw, shader_prog, prog, type);
-   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
-
-   fs_reg payload;
-   if (dispatch_width == 8)
-      payload = fs_reg(this, glsl_type::uvec2_type);
-   else
-      payload = fs_reg(this, glsl_type::uint_type);
-
-   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
-                             fs_reg(), payload, offset, value));
-}
-
-void
-fs_visitor::vfail(const char *format, va_list va)
-{
-   char *msg;
-
-   if (failed)
-      return;
-
-   failed = true;
-
-   msg = ralloc_vasprintf(mem_ctx, format, va);
-   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
-
-   this->fail_msg = msg;
-
-   if (INTEL_DEBUG & DEBUG_WM) {
-      fprintf(stderr, "%s", msg);
-   }
-}
-
-void
-fs_visitor::fail(const char *format, ...)
-{
-   va_list va;
-
-   va_start(va, format);
-   vfail(format, va);
-   va_end(va);
-}
-
 /**
  * Mark this program as impossible to compile in SIMD16 mode.
  *
@@ -844,58 +435,6 @@ fs_visitor::no16(const char *format, ...)
    va_end(va);
 }
 
-fs_inst *
-fs_visitor::emit(enum opcode opcode)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1, const fs_reg &src2)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
-}
-
-void
-fs_visitor::push_force_uncompressed()
-{
-   force_uncompressed_stack++;
-}
-
-void
-fs_visitor::pop_force_uncompressed()
-{
-   force_uncompressed_stack--;
-   assert(force_uncompressed_stack >= 0);
-}
-
 /**
  * Returns true if the instruction has a flag that means it won't
  * update an entire destination register.
@@ -958,67 +497,6 @@ fs_inst::writes_flag() const
           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 }
 
-/**
- * Returns how many MRFs an FS opcode will write over.
- *
- * Note that this is not the 0 or 1 implied writes in an actual gen
- * instruction -- the FS opcodes often generate MOVs in addition.
- */
-int
-fs_visitor::implied_mrf_writes(fs_inst *inst)
-{
-   if (inst->mlen == 0)
-      return 0;
-
-   if (inst->base_mrf == -1)
-      return 0;
-
-   switch (inst->opcode) {
-   case SHADER_OPCODE_RCP:
-   case SHADER_OPCODE_RSQ:
-   case SHADER_OPCODE_SQRT:
-   case SHADER_OPCODE_EXP2:
-   case SHADER_OPCODE_LOG2:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
-      return 1 * dispatch_width / 8;
-   case SHADER_OPCODE_POW:
-   case SHADER_OPCODE_INT_QUOTIENT:
-   case SHADER_OPCODE_INT_REMAINDER:
-      return 2 * dispatch_width / 8;
-   case SHADER_OPCODE_TEX:
-   case FS_OPCODE_TXB:
-   case SHADER_OPCODE_TXD:
-   case SHADER_OPCODE_TXF:
-   case SHADER_OPCODE_TXF_CMS:
-   case SHADER_OPCODE_TXF_MCS:
-   case SHADER_OPCODE_TG4:
-   case SHADER_OPCODE_TG4_OFFSET:
-   case SHADER_OPCODE_TXL:
-   case SHADER_OPCODE_TXS:
-   case SHADER_OPCODE_LOD:
-      return 1;
-   case FS_OPCODE_FB_WRITE:
-      return 2;
-   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
-   case SHADER_OPCODE_GEN4_SCRATCH_READ:
-      return 1;
-   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
-      return inst->mlen;
-   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-      return 2;
-   case SHADER_OPCODE_UNTYPED_ATOMIC:
-   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
-   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
-   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
-   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
-   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
-      return 0;
-   default:
-      unreachable("not reached");
-   }
-}
-
 /** Fixed HW reg constructor. */
 fs_reg::fs_reg(enum register_file file, int reg)
 {
@@ -1078,12 +556,6 @@ fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
    assert(this->width == 8 || this->width == 16);
 }
 
-fs_reg *
-fs_visitor::variable_storage(ir_variable *var)
-{
-   return (fs_reg *)hash_table_find(this->variable_ht, var);
-}
-
 void
 import_uniforms_callback(const void *key,
                          void *data,
@@ -1110,82 +582,102 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
+   this->uniform_size = v->uniform_size;
 }
 
-/* Our support for uniforms is piggy-backed on the struct
- * gl_fragment_program, because that's where the values actually
- * get stored, rather than in some global gl_shader_program uniform
- * store.
+/**
+ * A helper for MOV generation for fixing up broken hardware SEND dependency
+ * handling.
  */
-void
-fs_visitor::setup_uniform_values(ir_variable *ir)
+fs_inst *
+fs_visitor::DEP_RESOLVE_MOV(int grf)
 {
-   int namelen = strlen(ir->name);
+   fs_inst *inst = bld.MOV(brw_null_reg(),
+                           fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 
-   /* The data for our (non-builtin) uniforms is stored in a series of
-    * gl_uniform_driver_storage structs for each subcomponent that
-    * glGetUniformLocation() could name.  We know it's been set up in the same
-    * order we'd walk the type, so walk the list of storage and find anything
-    * with our name, or the prefix of a component that starts with our name.
+   inst->ir = NULL;
+   inst->annotation = "send dependency resolve";
+
+   /* The caller always wants uncompressed to emit the minimal extra
+    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
-   unsigned params_before = uniforms;
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
-      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
-
-      if (strncmp(ir->name, storage->name, namelen) != 0 ||
-          (storage->name[namelen] != 0 &&
-           storage->name[namelen] != '.' &&
-           storage->name[namelen] != '[')) {
-         continue;
-      }
+   inst->exec_size = 8;
 
-      unsigned slots = storage->type->component_slots();
-      if (storage->array_elements)
-         slots *= storage->array_elements;
+   return inst;
+}
 
-      for (unsigned i = 0; i < slots; i++) {
-         stage_prog_data->param[uniforms++] = &storage->storage[i];
+void
+fs_visitor::emit_pull_constant_load(brw::fs_builder &bld,
+                                    const fs_reg &dst,
+                                    const fs_reg &surf_index,
+                                    uint32_t off,
+                                    const fs_reg *reladdr,
+                                    unsigned num_components)
+{
+   if (reladdr) {
+      /* We have our constant surface use a pitch of 4 bytes, so our index can
+       * be any component of a vector, and then we load 4 contiguous
+       * components starting from that.
+       */
+      fs_reg addr = bld.scalar_reg(BRW_REGISTER_TYPE_D);
+      bld.ADD(fs_reg(addr), *reladdr, fs_reg((off / 4) & ~3));
+
+      int scale = 1;
+      if (brw->gen == 4 && dst.width == 8) {
+         /* Pre-gen5, we can either use a SIMD8 message that requires (header,
+          * u, v, r) as parameters, or we can just use the SIMD16 message
+          * consisting of (header, u).  We choose the second, at the cost of a
+          * longer return length.
+          */
+         scale = 2;
       }
-   }
 
-   /* Make sure we actually initialized the right amount of stuff here.
-    */
-   assert(params_before + ir->type->component_slots() == uniforms);
-   (void)params_before;
-}
+      enum opcode op;
+      if (brw->gen >= 7)
+         op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+      else
+         op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
+      assert(dst.width % 8 == 0);
+      int regs_written = 4 * (dst.width / 8) * scale;
+      fs_reg result = bld.scalar_reg(dst.type, regs_written);
+      instruction *inst = bld.emit(op, result, surf_index, addr);
 
-/* Our support for builtin uniforms is even scarier than non-builtin.
- * It sits on top of the PROG_STATE_VAR parameters that are
- * automatically updated from GL context state.
- */
-void
-fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
-{
-   const ir_state_slot *const slots = ir->get_state_slots();
-   assert(slots != NULL);
+      inst->regs_written = regs_written;
 
-   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
-      /* This state reference has already been setup by ir_to_mesa, but we'll
-       * get the same index back here.
-       */
-      int index = _mesa_add_state_reference(this->prog->Parameters,
-                                            (gl_state_index *)slots[i].tokens);
+      if (brw->gen < 7) {
+         inst->base_mrf = 13;
+         inst->header_present = true;
+         if (brw->gen == 4)
+            inst->mlen = 3;
+         else
+            inst->mlen = 1 + dispatch_width / 8;
+      }
 
-      /* Add each of the unique swizzles of the element as a parameter.
-       * This'll end up matching the expected layout of the
-       * array/matrix/structure we're trying to fill in.
-       */
-      int last_swiz = -1;
-      for (unsigned int j = 0; j < 4; j++) {
-         int swiz = GET_SWZ(slots[i].swizzle, j);
-         if (swiz == last_swiz)
-            break;
-         last_swiz = swiz;
+      for (unsigned i = 0; i < num_components; ++i)
+         bld.MOV(offset(dst, i), offset(fs_reg(result),
+                                        (((off / 4) & 3) + i) * scale));
 
-         stage_prog_data->param[uniforms++] =
-            &prog->Parameters->ParameterValues[index][swiz];
+   } else {
+      brw::fs_builder ubld = bld.force_uncompressed();
+      fs_reg result = bld.scalar_reg(dst.type);
+      fs_reg addr;
+
+      if (brw->gen >= 8) {
+         /* Store the offset in a GRF so we can send-from-GRF. */
+         addr = bld.scalar_reg(BRW_REGISTER_TYPE_D);
+         ubld.MOV(fs_reg(addr), fs_reg(off & ~15));
+      } else {
+         /* Immediates are fine on older generations since they'll be moved
+          * to a (potentially fake) MRF at the generator level.
+          */
+         addr = fs_reg(off & ~15);
       }
+
+      ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, result, surf_index, addr);
+
+      for (unsigned i = 0; i < num_components; ++i)
+         bld.MOV(offset(dst, i), component(result, ((off / 4) & 3) + i));
    }
 }
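[Editor's note] The new emit_pull_constant_load() above splits a byte offset into a vec4-aligned base that goes into the load message and a component index used to pick the value out of the returned payload. A standalone sketch of just that arithmetic (the buffer layout of 4 bytes per component and 4 components per message is taken from the comments in the function; everything else is illustrative):

// Sketch of the offset arithmetic in emit_pull_constant_load().
#include <cstdio>

int main()
{
   unsigned off = 28;               // byte offset of the constant
   unsigned comp = off / 4;         // component index: 7
   unsigned vec4_base = comp & ~3u; // first component of the enclosing vec4: 4
   unsigned subcomp = comp & 3u;    // component within that vec4: 3

   std::printf("load vec4 at component %u, use component %u\n",
               vec4_base, subcomp);
   return 0;
}

The same masking, done in units of 16 bytes (off & ~15), produces the aligned address used by the non-indirect branch of the function.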
@@ -1200,15 +692,15 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 
    /* gl_FragCoord.x */
    if (ir->data.pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_x));
+      bld.MOV(wpos, this->pixel_x);
    } else {
-      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
+      bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.y */
    if (!flip && ir->data.pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_y));
+      bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
@@ -1218,15 +710,15 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
          offset += key->drawable_height - 1.0;
       }
 
-      emit(ADD(wpos, pixel_y, fs_reg(offset)));
+      bld.ADD(wpos, pixel_y, fs_reg(offset));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.z */
    if (brw->gen >= 6) {
-      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
    } else {
-      emit(FS_OPCODE_LINTERP, wpos,
+      bld.emit(FS_OPCODE_LINTERP, wpos,
            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
@@ -1234,7 +726,7 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
-   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
+   bld.emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 
    return reg;
 }
@@ -1269,7 +761,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
        */
       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
    }
-   return emit(FS_OPCODE_LINTERP, attr,
+   return bld.emit(FS_OPCODE_LINTERP, attr,
                this->delta_x[barycoord_mode],
                this->delta_y[barycoord_mode], interp);
 }
@@ -1323,7 +815,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
                struct brw_reg interp = interp_reg(location, k);
                interp = suboffset(interp, 3);
                interp.type = reg->type;
-               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
+               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
                attr = offset(attr, 1);
             }
          } else {
@@ -1336,7 +828,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
                 * unlit, replace the centroid data with non-centroid
                 * data.
                 */
-               emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+               bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
 
                fs_inst *inst;
                inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
@@ -1360,7 +852,7 @@ fs_visitor::emit_general_interpolation(ir_variable *ir)
                              ir->data.sample || key->persample_shading);
             }
             if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
-               emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
+               bld.emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
             }
             attr = offset(attr, 1);
          }
@@ -1393,7 +885,7 @@ fs_visitor::emit_frontfacing_interpolation()
       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
       g0.negate = true;
 
-      emit(ASR(*reg, g0, fs_reg(15)));
+      bld.ASR(*reg, g0, fs_reg(15));
    } else {
       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
@@ -1410,8 +902,8 @@ fs_visitor::emit_frontfacing_interpolation()
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;
 
-      emit(ASR(asr, g1_6, fs_reg(31)));
-      emit(AND(*reg, asr, fs_reg(1)));
+      bld.ASR(asr, g1_6, fs_reg(31));
+      bld.AND(*reg, asr, fs_reg(1));
    }
 
    return reg;
@@ -1426,9 +918,9 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 
    if (key->compute_pos_offset) {
       /* Convert int_sample_pos to floating point */
-      emit(MOV(dst, int_sample_pos));
+      bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
-      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
+      bld.MUL(dst, dst, fs_reg(1 / 16.0f));
    }
    else {
      /* From ARB_sample_shading specification:
      * (When rendering to a non-multisample buffer, or if multisample
      *  rasterization is disabled, gl_SamplePosition will always be
      *  (0.5, 0.5).
      */
-      emit(MOV(dst, fs_reg(0.5f)));
+      bld.MOV(dst, fs_reg(0.5f));
   }
 }
 
@@ -1445,7 +937,7 @@ fs_visitor::emit_samplepos_setup()
 {
    assert(brw->gen >= 6);
 
-   this->current_annotation = "compute sample position";
+   bld.set_annotation("compute sample position");
    fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
    fs_reg pos = *reg;
    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
@@ -1467,21 +959,21 @@ fs_visitor::emit_samplepos_setup()
                     BRW_REGISTER_TYPE_B), 16, 8, 2);
 
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
+      bld.MOV(int_sample_x, fs_reg(sample_pos_reg));
    } else {
-      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
-      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
+      bld.MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+      bld.MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16)))
          ->force_sechalf = true;
    }
    /* Compute gl_SamplePosition.x */
    compute_sample_position(pos, int_sample_x);
    pos = offset(pos, 1);
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
+      bld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
    } else {
-      emit(MOV(half(int_sample_y, 0),
-               fs_reg(suboffset(sample_pos_reg, 1))));
-      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
+      bld.MOV(half(int_sample_y, 0),
+              fs_reg(suboffset(sample_pos_reg, 1)));
+      bld.MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17)))
         ->force_sechalf = true;
    }
    /* Compute gl_SamplePosition.y */
@@ -1496,7 +988,7 @@ fs_visitor::emit_sampleid_setup(ir_variable *ir)
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    assert(brw->gen >= 6);
 
-   this->current_annotation = "compute sample id";
+   bld.set_annotation("compute sample id");
    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 
    if (key->compute_sample_id) {
@@ -1524,130 +1016,30 @@ fs_visitor::emit_sampleid_setup(ir_variable *ir)
        * subspan 1, and finally sample 1 of subspan 1.
        */
       fs_inst *inst;
-      inst = emit(BRW_OPCODE_AND, t1,
+      inst = bld.emit(BRW_OPCODE_AND, t1,
                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   fs_reg(0xc0));
       inst->force_writemask_all = true;
-      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
+      inst = bld.emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
       inst->force_writemask_all = true;
       /* This works for both SIMD8 and SIMD16 */
-      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
+      inst = bld.MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
      inst->force_writemask_all = true;
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
-      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+      bld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
    } else {
      /* As per GL_ARB_sample_shading specification:
      * "When rendering to a non-multisample buffer, or if multisample
      *  rasterization is disabled, gl_SampleID will always be zero."
      */
-      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
+      bld.emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
    }
 
    return reg;
 }
 
-fs_reg
-fs_visitor::fix_math_operand(fs_reg src)
-{
-   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * The hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
-       !src.abs && !src.negate)
-      return src;
-
-   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
-    * operands to math
-    */
-   if (brw->gen >= 7 && src.file != IMM)
-      return src;
-
-   fs_reg expanded = fs_reg(this, glsl_type::float_type);
-   expanded.type = src.type;
-   emit(BRW_OPCODE_MOV, expanded, src);
-   return expanded;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
-{
-   switch (opcode) {
-   case SHADER_OPCODE_RCP:
-   case SHADER_OPCODE_RSQ:
-   case SHADER_OPCODE_SQRT:
-   case SHADER_OPCODE_EXP2:
-   case SHADER_OPCODE_LOG2:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
-      break;
-   default:
-      unreachable("not reached: bad math opcode");
-   }
-
-   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * Gen 6 hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (brw->gen == 6 || brw->gen == 7)
-      src = fix_math_operand(src);
-
-   fs_inst *inst = emit(opcode, dst, src);
-
-   if (brw->gen < 6) {
-      inst->base_mrf = 2;
-      inst->mlen = dispatch_width / 8;
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   int base_mrf = 2;
-   fs_inst *inst;
-
-   if (brw->gen >= 8) {
-      inst = emit(opcode, dst, src0, src1);
-   } else if (brw->gen >= 6) {
-      src0 = fix_math_operand(src0);
-      src1 = fix_math_operand(src1);
-
-      inst = emit(opcode, dst, src0, src1);
-   } else {
-      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
-       * "Message Payload":
-       *
-       * "Operand0[7].  For the INT DIV functions, this operand is the
-       *  denominator."
-       *  ...
-       * "Operand1[7].  For the INT DIV functions, this operand is the
-       *  numerator."
-       */
-      bool is_int_div = opcode != SHADER_OPCODE_POW;
-      fs_reg &op0 = is_int_div ? src1 : src0;
-      fs_reg &op1 = is_int_div ? src0 : src1;
-
-      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
-      inst = emit(opcode, dst, op0, reg_null_f);
-
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2 * dispatch_width / 8;
-   }
-   return inst;
-}
-
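[Editor's note] The deleted fix_math_operand() above encodes per-generation legality rules for math-instruction operands: gen6 math cannot take UNIFORM or IMM sources or source modifiers, gen7 still cannot take immediates, and the remedy in either case is a copy through a temporary. A distilled sketch of just the decision, with simplified stand-in types (the gen6/gen7 conditions are taken from the deleted code; the pre-gen6 case is simplified since emit_math() only legalized operands on gen6/7):

// Sketch of the operand-legalization decision in the deleted fix_math_operand().
struct operand { int file; bool abs, negate; };
enum { FILE_GRF, FILE_UNIFORM, FILE_IMM };

static bool math_operand_needs_copy(int gen, operand src)
{
   if (gen == 6)
      return src.file == FILE_UNIFORM || src.file == FILE_IMM ||
             src.abs || src.negate;
   if (gen >= 7)
      return src.file == FILE_IMM; // gen7 relaxes everything but immediates
   return false; // pre-gen6 math goes through MRFs and wasn't legalized here
}

In the unified-visitor world this logic moves behind the builder's emit_math(), which is why the visitors below can drop their private copies.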
 void
 fs_visitor::assign_curb_setup()
 {
@@ -2069,9 +1461,9 @@ fs_visitor::move_uniform_array_access_to_pull_constants()
          if (pull_constant_loc[uniform] == -1) {
             const gl_constant_value **values = &stage_prog_data->param[uniform];
 
-            assert(param_size[uniform]);
+            assert(uniform_size[uniform]);
 
-            for (int j = 0; j < param_size[uniform]; j++) {
+            for (int j = 0; j < uniform_size[uniform]; j++) {
                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
 
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
@@ -2187,34 +1579,23 @@ fs_visitor::demote_pull_constants()
            continue;
 
         /* Set up the annotation tracking for new generated instructions. */
-         base_ir = inst->ir;
-         current_annotation = inst->annotation;
+         bld.set_base_ir(inst->ir);
+         bld.set_annotation(inst->annotation);
+         brw::fs_builder ibld = bld.at(block, inst);
 
         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
-         fs_reg dst = fs_reg(this, glsl_type::float_type);
+         fs_reg dst = ibld.scalar_reg(BRW_REGISTER_TYPE_F);
 
         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
-                                                        surf_index,
-                                                        *inst->src[i].reladdr,
-                                                        pull_index);
-            inst->insert_before(block, &list);
-            inst->src[i].reladdr = NULL;
-         } else {
-            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            fs_inst *pull =
-               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                    dst, surf_index, offset);
-            inst->insert_before(block, pull);
-            inst->src[i].set_smear(pull_index & 3);
-         }
+         emit_pull_constant_load(ibld, dst, surf_index, pull_index * 4,
+                                 inst->src[i].reladdr, 1);
 
         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].width = dispatch_width;
+         inst->src[i].reladdr = NULL;
      }
   }
   invalidate_live_intervals();
@@ -2573,13 +1954,13 @@ fs_visitor::emit_repclear_shader()
    int base_mrf = 1;
    int color_mrf = base_mrf + 2;
 
-   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
-                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
+   fs_inst *mov = bld.MOV(vec4(brw_message_reg(color_mrf)),
+                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
    mov->force_writemask_all = true;
 
-   fs_inst *write;
+   fs_inst *write = NULL;
    if (key->nr_color_regions == 1) {
-      write = emit(FS_OPCODE_REP_FB_WRITE);
+      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
       write->saturate = key->clamp_fragment_color;
       write->base_mrf = color_mrf;
       write->target = 0;
@@ -2587,7 +1968,7 @@ fs_visitor::emit_repclear_shader()
       write->mlen = 1;
    } else {
       for (int i = 0; i < key->nr_color_regions; ++i) {
-         write = emit(FS_OPCODE_REP_FB_WRITE);
+         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
@@ -2597,6 +1978,7 @@ fs_visitor::emit_repclear_shader()
    }
    write->eot = true;
 
+   bld = bld.at(NULL, NULL);
    calculate_cfg();
    assign_constant_locations();
@@ -2983,6 +2365,7 @@ fs_visitor::lower_load_payload()
       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
          assert(inst->dst.file == MRF || inst->dst.file == GRF);
 
+         brw::fs_builder ibld = bld.at(block, inst);
         fs_reg dst = inst->dst;
 
         for (int i = 0; i < inst->sources; i++) {
@@ -3001,13 +2384,11 @@ fs_visitor::lower_load_payload()
                  compr4_dst.width = 16;
                  fs_reg compr4_src = inst->src[i];
                  compr4_src.width = 16;
-                  fs_inst *mov = MOV(compr4_dst, compr4_src);
-                  mov->force_writemask_all = true;
-                  inst->insert_before(block, mov);
+                  brw::exec_all(ibld.MOV(compr4_dst, compr4_src));
                  /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
                  inst->src[i + 4].file = BAD_FILE;
               } else {
-                  fs_inst *mov = MOV(dst, inst->src[i]);
+                  fs_inst *mov = ibld.MOV(dst, inst->src[i]);
                  if (inst->src[i].file == GRF) {
                     int src_reg = vgrf_to_reg[inst->src[i].reg] +
                                   inst->src[i].reg_offset;
@@ -3029,7 +2410,6 @@ fs_visitor::lower_load_payload()
                        metadata[dst_reg + 1].force_sechalf = true;
                     }
                  }
-                  inst->insert_before(block, mov);
               }
 
               dst = offset(dst, 1);
@@ -3267,34 +2647,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    fprintf(file, "\n");
 }
 
-/**
- * Possibly returns an instruction that set up @param reg.
- *
- * Sometimes we want to take the result of some expression/variable
- * dereference tree and rewrite the instruction generating the result
- * of the tree.  When processing the tree, we know that the
- * instructions generated are all writing temporaries that are dead
- * outside of this tree.  So, if we have some instructions that write
- * a temporary, we're free to point that temp write somewhere else.
- *
- * Note that this doesn't guarantee that the instruction generated
- * only reg -- it might be the size=4 destination of a texture instruction.
- */
-fs_inst *
-fs_visitor::get_instruction_generating_reg(fs_inst *start,
-                                           fs_inst *end,
-                                           const fs_reg &reg)
-{
-   if (end == start ||
-       end->is_partial_write() ||
-       reg.reladdr ||
-       !reg.equals(end->dst)) {
-      return NULL;
-   } else {
-      return end;
-   }
-}
-
 void
 fs_visitor::setup_payload_gen6()
 {
@@ -3480,7 +2832,7 @@ fs_visitor::run()
       (stage == MESA_SHADER_FRAGMENT) &&
       ((brw_wm_prog_key*) this->key)->alpha_test_func;
    if (uses_kill || alpha_test_func) {
-      fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+      fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
      discard_init->flag_subreg = 1;
   }
 
@@ -3489,24 +2841,25 @@ fs_visitor::run()
    */
   if (shader) {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
-         base_ir = ir;
+         bld.set_base_ir(ir);
         this->result = reg_undef;
         ir->accept(this);
      }
   } else {
      emit_fragment_program_code();
   }
-   base_ir = NULL;
+   bld.set_base_ir(NULL);
   if (failed)
      return false;
 
-   emit(FS_OPCODE_PLACEHOLDER_HALT);
+   bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
 
   if (alpha_test_func)
      emit_alpha_test();
 
   emit_fb_writes();
 
+   bld = bld.at(NULL, NULL);
   calculate_cfg();
 
   split_virtual_grfs();
@@ -3526,7 +2879,7 @@ fs_visitor::run()
             snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,              \
                      dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                               \
-            backend_visitor::dump_instructions(filename);                     \
+            brw::base_visitor::dump_instructions(filename);                   \
         }                                                                     \
                                                                               \
         progress = progress || this_progress;                                \
@@ -3537,7 +2890,7 @@ fs_visitor::run()
 
         snprintf(filename, 64, "fs%d-%04d-00-start",
                  dispatch_width, shader_prog ? shader_prog->Name : 0);
 
-         backend_visitor::dump_instructions(filename);
+         brw::base_visitor::dump_instructions(filename);
      }
 
     bool progress;
@@ -3622,7 +2975,6 @@ fs_visitor::run()
         }
      }
   }
-   assert(force_uncompressed_stack == 0);
 
   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
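[Editor's note] Throughout the brw_fs.cpp changes above, bare emit()/MOV()/insert_before() calls are replaced by calls on a builder object: bld.MOV(...), bld.at(block, inst) to retarget the insertion point, bld.set_annotation()/set_base_ir() for debug state, and brw::exec_all(...) to force a full writemask. A toy sketch of that idiom, with made-up types that only mirror the shape of the real brw::fs_builder (none of these names are the driver's real interfaces):

// Toy sketch of the builder idiom: the builder carries the insertion point
// and debug annotations, so passes can splice instructions anywhere.
#include <list>
#include <string>

struct inst { std::string op; const char *annotation = nullptr; };

class builder {
public:
   explicit builder(std::list<inst> *l) : insts_(l), pos_(l->end()) {}

   // A copy of this builder that inserts before 'where'.
   builder at(std::list<inst>::iterator where) const {
      builder b = *this;
      b.pos_ = where;
      return b;
   }

   void set_annotation(const char *a) { annotation_ = a; }

   inst *emit(const std::string &op) {
      inst i;
      i.op = op;
      i.annotation = annotation_;
      // std::list::insert places the element before pos_ (appends at end()).
      return &*insts_->insert(pos_, i);
   }

   inst *MOV(const std::string &dst, const std::string &src) {
      return emit("mov " + dst + ", " + src);
   }

private:
   std::list<inst> *insts_;
   std::list<inst>::iterator pos_;
   const char *annotation_ = nullptr;
};

Because the builder is a small value type, an optimization pass can make a local copy pointed at an arbitrary instruction (the ibld = bld.at(block, inst) pattern above) without disturbing the visitor's main emission cursor.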
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index f38db3b8abb..3982838dd51 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "brw_shader.h"
-#include "brw_ir_fs.h"
+#include "brw_ir_visitor.h"
 
 extern "C" {
 
@@ -42,7 +42,6 @@ extern "C" {
 #include "program/prog_optimize.h"
 #include "util/register_allocate.h"
 #include "program/sampler.h"
-#include "program/hash_table.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@@ -81,13 +80,9 @@ public:
  *
  * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
  */
-class fs_visitor : public backend_visitor
+class fs_visitor : public brw::backend_visitor<fs_visitor, brw::fs_builder>
 {
 public:
-   const fs_reg reg_null_f;
-   const fs_reg reg_null_d;
-   const fs_reg reg_null_ud;
-
    fs_visitor(struct brw_context *brw,
              void *mem_ctx,
              const struct brw_wm_prog_key *key,
@@ -95,98 +90,23 @@ public:
              struct gl_shader_program *shader_prog,
              struct gl_fragment_program *fp,
              unsigned dispatch_width);
-   ~fs_visitor();
 
    void init();
 
-   fs_reg *variable_storage(ir_variable *var);
-
    void import_uniforms(fs_visitor *v);
 
    void visit(ir_variable *ir);
-   void visit(ir_assignment *ir);
-   void visit(ir_dereference_variable *ir);
-   void visit(ir_dereference_record *ir);
-   void visit(ir_dereference_array *ir);
-   void visit(ir_expression *ir);
-   void visit(ir_texture *ir);
-   void visit(ir_if *ir);
-   void visit(ir_constant *ir);
-   void visit(ir_swizzle *ir);
-   void visit(ir_return *ir);
-   void visit(ir_loop *ir);
-   void visit(ir_loop_jump *ir);
    void visit(ir_discard *ir);
-   void visit(ir_call *ir);
-   void visit(ir_function *ir);
-   void visit(ir_function_signature *ir);
    void visit(ir_emit_vertex *);
    void visit(ir_end_primitive *);
 
-   uint32_t gather_channel(ir_texture *ir, uint32_t sampler);
-   void swizzle_result(ir_texture *ir, fs_reg orig_val, uint32_t sampler);
-
-   fs_inst *emit(fs_inst *inst);
-   void emit(exec_list list);
-
-   fs_inst *emit(enum opcode opcode);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources);
-
-   fs_inst *MOV(const fs_reg &dst, const fs_reg &src);
-   fs_inst *NOT(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDD(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDE(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDZ(const fs_reg &dst, const fs_reg &src);
-   fs_inst *FRC(const fs_reg &dst, const fs_reg &src);
-   fs_inst *ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MUL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MACH(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MAC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *ASR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *AND(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *OR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *XOR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *IF(enum brw_predicate predicate);
-   fs_inst *IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition);
-   fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition);
-   fs_inst *LRP(const fs_reg &dst, const fs_reg &a, const fs_reg &y,
-                const fs_reg &x);
-   fs_inst *DEP_RESOLVE_MOV(int grf);
-   fs_inst *BFREV(const fs_reg &dst, const fs_reg &value);
-   fs_inst *BFE(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset,
-                const fs_reg &value);
-   fs_inst *BFI1(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset);
-   fs_inst *BFI2(const fs_reg &dst, const fs_reg &bfi1_dst,
-                 const fs_reg &insert, const fs_reg &base);
-   fs_inst *FBH(const fs_reg &dst, const fs_reg &value);
-   fs_inst *FBL(const fs_reg &dst, const fs_reg &value);
-   fs_inst *CBIT(const fs_reg &dst, const fs_reg &value);
-   fs_inst *MAD(const fs_reg &dst, const fs_reg &c, const fs_reg &b,
-                const fs_reg &a);
-   fs_inst *ADDC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SUBB(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SEL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-
-   int type_size(const struct glsl_type *type);
-   fs_inst *get_instruction_generating_reg(fs_inst *start,
-                                           fs_inst *end,
-                                           const fs_reg &reg);
-
-   fs_inst *LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources);
-
-   exec_list VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
-                                        const fs_reg &surf_index,
-                                        const fs_reg &varying_offset,
-                                        uint32_t const_offset);
+   dst_reg
+   temporary_reg(const glsl_type *type)
+   {
+      return bld.scalar_reg(brw_type_for_base_type(type),
+                            type_size(type));
+   }
+
+   instruction *DEP_RESOLVE_MOV(int grf);
 
    bool run();
    void assign_binding_table_offsets();
@@ -233,15 +153,10 @@ public:
                                                fs_inst *inst);
    void insert_gen4_post_send_dependency_workarounds(bblock_t *block,
                                                      fs_inst *inst);
-   void vfail(const char *msg, va_list args);
-   void fail(const char *msg, ...);
    void no16(const char *msg, ...);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
 
-   void push_force_uncompressed();
-   void pop_force_uncompressed();
-
    void emit_dummy_fs();
    void emit_repclear_shader();
    fs_reg *emit_fragcoord_interpolation(ir_variable *ir);
@@ -255,34 +170,26 @@ public:
    void emit_interpolation_setup_gen4();
    void emit_interpolation_setup_gen6();
    void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
-   fs_reg rescale_texcoord(ir_texture *ir, fs_reg coordinate,
-                           bool is_rect, uint32_t sampler, int texunit);
    fs_inst *emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-                              fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-                              uint32_t sampler);
-   fs_inst *emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-                              fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-                              fs_reg sample_index, uint32_t sampler);
-   fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
-                              fs_reg shadow_comp, fs_reg lod, fs_reg lod2,
-                              fs_reg sample_index, fs_reg mcs, fs_reg sampler);
-   fs_reg emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler);
-   void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
-   fs_reg fix_math_operand(fs_reg src);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
-   void emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
-                 const fs_reg &a);
-   void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
-                    const fs_reg &src0, const fs_reg &src1);
-   bool try_emit_saturate(ir_expression *ir);
-   bool try_emit_mad(ir_expression *ir);
+                              const fs_reg &shadow_c, fs_reg lod, fs_reg lod2,
+                              const fs_reg &sampler);
+   fs_inst *emit_texture_gen5(ir_texture *ir, const fs_reg &dst, fs_reg coordinate,
+                              const fs_reg &shadow_c, fs_reg lod, fs_reg lod2,
+                              const fs_reg &sample_index, const fs_reg &sampler);
+   fs_inst *emit_texture_gen7(ir_texture *ir, const fs_reg &dst, fs_reg coordinate,
+                              const fs_reg &shadow_c, fs_reg lod, fs_reg lod2,
+                              fs_reg offset_val, const fs_reg &sample_index,
+                              const fs_reg &mcs, const fs_reg &sampler);
+   fs_inst *emit_texture(ir_texture *ir, const fs_reg &dst,
+                         const fs_reg &coordinate, const fs_reg &shadow_c,
+                         const fs_reg &lod, const fs_reg &lod2,
+                         const fs_reg &offset_val, const fs_reg &sample_index,
+                         const fs_reg &mcs, const fs_reg &sampler);
+   fs_reg emit_untyped_surface_header();
    void try_replace_with_sel();
    bool opt_peephole_sel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
-   void emit_bool_to_cond_code(ir_rvalue *condition);
-   void emit_if_gen6(ir_if *ir);
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
    void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
@@ -317,59 +224,45 @@ public:
                             fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
 
-   void emit_shader_time_begin();
-   void emit_shader_time_end();
-   void emit_shader_time_write(enum shader_time_shader_type type,
-                               fs_reg value);
-
-   void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                            fs_reg dst, fs_reg offset, fs_reg src0,
-                            fs_reg src1);
+   void emit_interpolate_expression(ir_expression *ir);
 
-   void emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
-                                  fs_reg offset);
+   void emit_pull_constant_load(brw::fs_builder &bld,
+                                const dst_reg &dst,
+                                const src_reg &surf_index,
+                                uint32_t offset,
+                                const src_reg *reladdr,
+                                unsigned num_components);
 
-   void emit_interpolate_expression(ir_expression *ir);
+   struct brw_reg interp_reg(int location, int channel);
 
-   bool try_rewrite_rhs_to_dst(ir_assignment *ir,
-                               fs_reg dst,
-                               fs_reg src,
-                               fs_inst *pre_rhs_inst,
-                               fs_inst *last_rhs_inst);
-   void emit_assignment_writes(fs_reg &l, fs_reg &r,
-                               const glsl_type *type, bool predicated);
-   void resolve_ud_negate(fs_reg *reg);
-   void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
+   void emit_pack_half_2x16(dst_reg dst, src_reg src0)
+   {
+      unreachable("not reached");
+   }
 
-   fs_reg get_timestamp();
+   void emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+   {
+      unreachable("not reached");
+   }
 
-   struct brw_reg interp_reg(int location, int channel);
-   void setup_uniform_values(ir_variable *ir);
-   void setup_builtin_uniform_values(ir_variable *ir);
-   int implied_mrf_writes(fs_inst *inst);
+   const struct brw_sampler_prog_key_data *
+   sampler_prog_key() const {
+      return &((const brw_wm_prog_key *)key)->tex;
+   }
 
    virtual void dump_instructions();
    virtual void dump_instructions(const char *name);
    void dump_instruction(backend_instruction *inst);
    void dump_instruction(backend_instruction *inst, FILE *file);
 
-   void visit_atomic_counter_intrinsic(ir_call *ir);
-
    const void *const key;
    struct brw_stage_prog_data *prog_data;
    unsigned int sanity_param_count;
 
-   int *param_size;
-
-   int *virtual_grf_start;
-   int *virtual_grf_end;
    brw::fs_live_variables *live_intervals;
 
    int *regs_live_at_ip;
 
-   /** Number of uniform variable components visited. */
-   unsigned uniforms;
-
    /** Byte-offset for the next available spot in the scratch space buffer. */
    unsigned last_scratch;
 
@@ -385,33 +278,19 @@ public:
    */
   int *push_constant_loc;
 
-   struct hash_table *variable_ht;
   fs_reg frag_depth;
   fs_reg sample_mask;
   fs_reg outputs[BRW_MAX_DRAW_BUFFERS];
   unsigned output_components[BRW_MAX_DRAW_BUFFERS];
   fs_reg dual_src_output;
   bool do_dual_src;
-   int first_non_payload_grf;
-   /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */
-   unsigned max_grf;
 
   fs_reg *fp_temp_regs;
   fs_reg *fp_input_regs;
 
-   /** @{ debug annotation info */
-   const char *current_annotation;
-   const void *base_ir;
-   /** @} */
-
-   bool failed;
-   char *fail_msg;
   bool simd16_unsupported;
   char *no16_msg;
 
-   /* Result of last visit() method. */
-   fs_reg result;
-
   /** Register numbers for thread payload fields. */
   struct {
      uint8_t source_depth_reg;
@@ -435,14 +314,11 @@ public:
   fs_reg pixel_w;
   fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
   fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
-   fs_reg shader_start_time;
 
   unsigned grf_used;
   bool spilled_any_registers;
 
   const unsigned dispatch_width; /**< 8 or 16 */
-
-   int force_uncompressed_stack;
 };
 
 /**
@@ -486,7 +362,8 @@ private:
                       struct brw_reg src1);
    void generate_math_gen4(fs_inst *inst,
                            struct brw_reg dst,
-                           struct brw_reg src);
+                           struct brw_reg src0,
+                           struct brw_reg src1);
    void generate_math_g45(fs_inst *inst,
                           struct brw_reg dst,
                           struct brw_reg src);
@@ -540,11 +417,6 @@ private:
                             struct brw_reg dst,
                             struct brw_reg src);
 
-   void generate_shader_time_add(fs_inst *inst,
-                                 struct brw_reg payload,
-                                 struct brw_reg offset,
-                                 struct brw_reg value);
-
    void generate_untyped_atomic(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg payload,
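[Editor's note] The header change above makes fs_visitor derive from a templated base, brw::backend_visitor<fs_visitor, brw::fs_builder>, parameterized on the concrete visitor and its builder. That is the curiously recurring template pattern: shared visiting code can call stage-specific hooks such as temporary_reg() without virtual dispatch. A toy, self-contained sketch of the pattern (all names here are stand-ins, not the driver's real classes):

// Toy CRTP sketch: shared code calls back into the concrete visitor.
#include <cstdio>

struct reg { int nr; };

struct fs_builder {
   int next = 0;
   reg alloc() { return reg{next++}; }
   void emit_mov(reg dst, int imm) { std::printf("mov r%d, %d\n", dst.nr, imm); }
};

template <typename V, typename Builder>
class backend_visitor {
public:
   void visit_constant(int value) {
      reg tmp = self()->temporary_reg(); // stage-specific hook, no vtable
      bld.emit_mov(tmp, value);
   }
protected:
   Builder bld;
private:
   V *self() { return static_cast<V *>(this); }
};

class my_fs_visitor : public backend_visitor<my_fs_visitor, fs_builder> {
public:
   reg temporary_reg() { return bld.alloc(); }
};

int main() {
   my_fs_visitor v;
   v.visit_constant(42); // prints "mov r0, 42"
}

The same shape lets the shared base carry the builder (bld) and debug state that the FS and VEC4 visitors previously duplicated.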
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index e1989cb5e4c..99a412a85b7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -302,7 +302,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
        (entry->dst.reg_offset + entry->regs_written) * 32)
       return false;
 
-   /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
+   /* See fix_condmod_negate() and comment in brw_fs_emit.cpp. */
    if (inst->conditional_mod &&
        inst->src[arg].type == BRW_REGISTER_TYPE_UD &&
        entry->src.negate)
@@ -381,7 +381,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
       break;
 
    case GRF:
      {
-         assert(entry->src.width % inst->src[arg].width == 0);
+         assert(entry->src.width % inst->src[arg].width == 0 ||
+                entry->src.width == 1);
        /* In this case, we'll just leave the width alone.  The source
         * register could have different widths depending on how it is
         * being used.  For instance, if only half of the register was
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 87f67564657..0aeb67c0900 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -211,18 +211,18 @@ fs_visitor::opt_cse_local(bblock_t *block)
          entry->tmp = tmp;
          entry->generator->dst = tmp;
 
-         fs_inst *copy;
+         brw::fs_builder ibld = bld.at(block,
+                                       (fs_inst *)entry->generator->next);
          if (written > dst_width) {
             fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
             for (int i = 0; i < written / dst_width; i++)
                sources[i] = offset(tmp, i);
-            copy = LOAD_PAYLOAD(orig_dst, sources, written / dst_width);
+            ibld.LOAD_PAYLOAD(orig_dst, sources, written / dst_width);
         } else {
-            copy = MOV(orig_dst, tmp);
-            copy->force_writemask_all =
+            ibld.MOV(orig_dst, tmp)
+               ->force_writemask_all =
                entry->generator->force_writemask_all;
         }
-         entry->generator->insert_after(block, copy);
      }
 
      /* dest <- temp */
@@ -234,17 +234,16 @@ fs_visitor::opt_cse_local(bblock_t *block)
            assert(inst->dst.type == entry->tmp.type);
            fs_reg dst = inst->dst;
            fs_reg tmp = entry->tmp;
-            fs_inst *copy;
+            brw::fs_builder ibld = bld.at(block, inst);
            if (written > dst_width) {
               fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
               for (int i = 0; i < written / dst_width; i++)
                  sources[i] = offset(tmp, i);
-               copy = LOAD_PAYLOAD(dst, sources, written / dst_width);
+               ibld.LOAD_PAYLOAD(dst, sources, written / dst_width);
            } else {
-               copy = MOV(dst, tmp);
-               copy->force_writemask_all = inst->force_writemask_all;
+               ibld.MOV(dst, tmp)
+                  ->force_writemask_all = inst->force_writemask_all;
            }
-            inst->insert_before(block, copy);
         }
 
         /* Set our iterator so that next time through the loop inst->next
fs_reg src) { fs_reg temp = fs_reg(this, glsl_type::float_type); - emit_math(opcode, temp, src); + bld.emit_math(opcode, temp, src); emit_fp_scalar_write(fpi, dst, temp); } @@ -126,11 +126,11 @@ fs_visitor::emit_fragment_program_code() * mov.f0 dst 1.0 */ fs_reg one = fs_reg(this, glsl_type::float_type); - emit(MOV(one, fs_reg(1.0f))); + bld.MOV(one, fs_reg(1.0f)); for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) { const struct prog_instruction *fpi = &prog->Instructions[insn]; - base_ir = fpi; + bld.set_base_ir(fpi); //_mesa_print_instruction(fpi); @@ -161,10 +161,10 @@ fs_visitor::emit_fragment_program_code() if (fpi->DstReg.WriteMask & (1 << i)) { fs_inst *inst; - emit(CMP(reg_null_f, offset(src[0], i), fs_reg(0.0f), - BRW_CONDITIONAL_L)); + bld.CMP(bld.reg_null_f(), offset(src[0], i), fs_reg(0.0f), + BRW_CONDITIONAL_L); - inst = emit(BRW_OPCODE_SEL, offset(dst, i), + inst = bld.emit(BRW_OPCODE_SEL, offset(dst, i), offset(src[1], i), offset(src[2], i)); inst->predicate = BRW_PREDICATE_NORMAL; } @@ -191,14 +191,14 @@ fs_visitor::emit_fragment_program_code() default: unreachable("not reached"); } - emit(MUL(acc, offset(src[0], 0), offset(src[1], 0))); + bld.MUL(acc, offset(src[0], 0), offset(src[1], 0)); for (int i = 1; i < count; i++) { - emit(MUL(mul, offset(src[0], i), offset(src[1], i))); - emit(ADD(acc, acc, mul)); + bld.MUL(mul, offset(src[0], i), offset(src[1], i)); + bld.ADD(acc, acc, mul); } if (fpi->Opcode == OPCODE_DPH) - emit(ADD(acc, acc, offset(src[1], 3))); + bld.ADD(acc, acc, offset(src[1], 3)); emit_fp_scalar_write(fpi, dst, acc); break; @@ -206,15 +206,15 @@ fs_visitor::emit_fragment_program_code() case OPCODE_DST: if (fpi->DstReg.WriteMask & WRITEMASK_X) - emit(MOV(dst, fs_reg(1.0f))); + bld.MOV(dst, fs_reg(1.0f)); if (fpi->DstReg.WriteMask & WRITEMASK_Y) { - emit(MUL(offset(dst, 1), - offset(src[0], 1), offset(src[1], 1))); + bld.MUL(offset(dst, 1), + offset(src[0], 1), offset(src[1], 1)); } if (fpi->DstReg.WriteMask & WRITEMASK_Z) - emit(MOV(offset(dst, 2), offset(src[0], 2))); + bld.MOV(offset(dst, 2), offset(src[0], 2)); if (fpi->DstReg.WriteMask & WRITEMASK_W) - emit(MOV(offset(dst, 3), offset(src[1], 3))); + bld.MOV(offset(dst, 3), offset(src[1], 3)); break; case OPCODE_EX2: @@ -248,8 +248,8 @@ fs_visitor::emit_fragment_program_code() * undiscarded pixels, and updates just those pixels to be * turned off. */ - fs_inst *cmp = emit(CMP(reg_null_f, offset(src[0], i), - fs_reg(0.0f), BRW_CONDITIONAL_GE)); + fs_inst *cmp = bld.CMP(bld.reg_null_f(), offset(src[0], i), + fs_reg(0.0f), BRW_CONDITIONAL_GE); cmp->predicate = BRW_PREDICATE_NORMAL; cmp->flag_subreg = 1; } @@ -277,30 +277,30 @@ fs_visitor::emit_fragment_program_code() * brw_wm_emit.c either. 
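The DP2/DP3/DP4/DPH cases above lower a dot product to one seeding MUL followed by MUL+ADD pairs, with DPH tacking the homogeneous w term onto the accumulator. A minimal standalone sketch of that expansion, with plain floats standing in for fs_regs:

// Standalone model of the DP2/DP3/DP4/DPH lowering above: one MUL seeds
// the accumulator, then a MUL+ADD pair handles each remaining component,
// plus the extra "+ src1.w" that DPH appends.
#include <cstdio>

static float emit_dot(const float *src0, const float *src1, int count,
                      bool dph)
{
   float acc = src0[0] * src1[0];          // MUL(acc, src0.x, src1.x)
   for (int i = 1; i < count; i++) {
      float mul = src0[i] * src1[i];       // MUL(mul, src0.i, src1.i)
      acc = acc + mul;                     // ADD(acc, acc, mul)
   }
   if (dph)
      acc = acc + src1[3];                 // DPH: homogeneous w term
   return acc;
}

int main()
{
   const float a[4] = { 1, 2, 3, 1 };
   const float b[4] = { 4, 5, 6, 7 };
   printf("DP3 = %f, DPH = %f\n", emit_dot(a, b, 3, false),
          emit_dot(a, b, 3, true));        // prints 32 and 39
}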
*/ if (fpi->DstReg.WriteMask & WRITEMASK_X) - emit(MOV(offset(dst, 0), fs_reg(1.0f))); + bld.MOV(offset(dst, 0), fs_reg(1.0f)); if (fpi->DstReg.WriteMask & WRITEMASK_YZ) { fs_inst *inst; - emit(CMP(reg_null_f, offset(src[0], 0), fs_reg(0.0f), - BRW_CONDITIONAL_LE)); + bld.CMP(bld.reg_null_f(), offset(src[0], 0), fs_reg(0.0f), + BRW_CONDITIONAL_LE); if (fpi->DstReg.WriteMask & WRITEMASK_Y) { - emit(MOV(offset(dst, 1), offset(src[0], 0))); - inst = emit(MOV(offset(dst, 1), fs_reg(0.0f))); + bld.MOV(offset(dst, 1), offset(src[0], 0)); + inst = bld.MOV(offset(dst, 1), fs_reg(0.0f)); inst->predicate = BRW_PREDICATE_NORMAL; } if (fpi->DstReg.WriteMask & WRITEMASK_Z) { - emit_math(SHADER_OPCODE_POW, offset(dst, 2), - offset(src[0], 1), offset(src[0], 3)); + bld.emit_math(SHADER_OPCODE_POW, offset(dst, 2), + offset(src[0], 1), offset(src[0], 3)); - inst = emit(MOV(offset(dst, 2), fs_reg(0.0f))); + inst = bld.MOV(offset(dst, 2), fs_reg(0.0f)); inst->predicate = BRW_PREDICATE_NORMAL; } } if (fpi->DstReg.WriteMask & WRITEMASK_W) - emit(MOV(offset(dst, 3), fs_reg(1.0f))); + bld.MOV(offset(dst, 3), fs_reg(1.0f)); break; @@ -310,7 +310,7 @@ fs_visitor::emit_fragment_program_code() fs_reg a = offset(src[0], i); fs_reg y = offset(src[1], i); fs_reg x = offset(src[2], i); - emit_lrp(offset(dst, i), x, y, a); + bld.LRP(offset(dst, i), x, y, a); } } break; @@ -319,8 +319,8 @@ fs_visitor::emit_fragment_program_code() for (int i = 0; i < 4; i++) { if (fpi->DstReg.WriteMask & (1 << i)) { fs_reg temp = fs_reg(this, glsl_type::float_type); - emit(MUL(temp, offset(src[0], i), offset(src[1], i))); - emit(ADD(offset(dst, i), temp, offset(src[2], i))); + bld.MUL(temp, offset(src[0], i), offset(src[1], i)); + bld.ADD(offset(dst, i), temp, offset(src[2], i)); } } break; @@ -343,7 +343,7 @@ fs_visitor::emit_fragment_program_code() case OPCODE_POW: { fs_reg temp = fs_reg(this, glsl_type::float_type); - emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]); + bld.emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]); emit_fp_scalar_write(fpi, dst, temp); break; } @@ -358,13 +358,13 @@ fs_visitor::emit_fragment_program_code() case OPCODE_SCS: if (fpi->DstReg.WriteMask & WRITEMASK_X) { - emit_math(SHADER_OPCODE_COS, offset(dst, 0), - offset(src[0], 0)); + bld.emit_math(SHADER_OPCODE_COS, offset(dst, 0), + offset(src[0], 0)); } if (fpi->DstReg.WriteMask & WRITEMASK_Y) { - emit_math(SHADER_OPCODE_SIN, offset(dst, 1), - offset(src[0], 1)); + bld.emit_math(SHADER_OPCODE_SIN, offset(dst, 1), + offset(src[0], 1)); } break; @@ -414,10 +414,10 @@ fs_visitor::emit_fragment_program_code() coordinate = fs_reg(this, glsl_type::vec3_type); fs_reg invproj = fs_reg(this, glsl_type::float_type); - emit_math(SHADER_OPCODE_RCP, invproj, offset(src[0], 3)); + bld.emit_math(SHADER_OPCODE_RCP, invproj, offset(src[0], 3)); for (int i = 0; i < 3; i++) { - emit(MUL(offset(coordinate, i), - offset(src[0], i), invproj)); + bld.MUL(offset(coordinate, i), + offset(src[0], i), invproj); } break; } @@ -457,14 +457,14 @@ fs_visitor::emit_fragment_program_code() fs_reg abscoord = coordinate; abscoord.negate = false; abscoord.abs = true; - emit_minmax(BRW_CONDITIONAL_GE, temp, - offset(abscoord, 0), offset(abscoord, 1)); - emit_minmax(BRW_CONDITIONAL_GE, temp, - temp, offset(abscoord, 2)); - emit_math(SHADER_OPCODE_RCP, temp, temp); + bld.emit_minmax(BRW_CONDITIONAL_GE, temp, + offset(abscoord, 0), offset(abscoord, 1)); + bld.emit_minmax(BRW_CONDITIONAL_GE, temp, + temp, offset(abscoord, 2)); + bld.emit_math(SHADER_OPCODE_RCP, temp, temp); for (int i = 0; i < 3; 
i++) { - emit(MUL(offset(cubecoord, i), - offset(coordinate, i), temp)); + bld.MUL(offset(cubecoord, i), + offset(coordinate, i), temp); } coordinate = cubecoord; @@ -485,15 +485,9 @@ fs_visitor::emit_fragment_program_code() fpi->TexSrcTarget == TEXTURE_RECT_INDEX, fpi->TexSrcUnit, fpi->TexSrcUnit); - fs_inst *inst; - if (brw->gen >= 7) { - inst = emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, dpdy, sample_index, fs_reg(0u), fs_reg(fpi->TexSrcUnit)); - } else if (brw->gen >= 5) { - inst = emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, dpdy, sample_index, fpi->TexSrcUnit); - } else { - inst = emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, dpdy, fpi->TexSrcUnit); - } - + fs_inst *inst = emit_texture(ir, dst, coordinate, shadow_c, lod, dpdy, + fs_reg(), sample_index, fs_reg(0u), + fs_reg(fpi->TexSrcUnit)); inst->shadow_compare = fpi->TexShadow; /* Reuse the GLSL swizzle_result() handler. */ @@ -519,10 +513,10 @@ fs_visitor::emit_fragment_program_code() fs_reg temp = fs_reg(this, glsl_type::float_type); fs_reg neg_src1_1 = offset(src[1], i1); neg_src1_1.negate = !neg_src1_1.negate; - emit(MUL(temp, offset(src[0], i2), neg_src1_1)); - emit(MUL(offset(dst, i), - offset(src[0], i1), offset(src[1], i2))); - emit(ADD(offset(dst, i), offset(dst, i), temp)); + bld.MUL(temp, offset(src[0], i2), neg_src1_1); + bld.MUL(offset(dst, i), + offset(src[0], i1), offset(src[1], i2)); + bld.ADD(offset(dst, i), offset(dst, i), temp); } } break; @@ -543,8 +537,8 @@ fs_visitor::emit_fragment_program_code() for (int i = 0; i < 4; i++) { if (fpi->DstReg.WriteMask & (1 << i)) { - fs_inst *inst = emit(MOV(offset(real_dst, i), - offset(dst, i))); + fs_inst *inst = bld.MOV(offset(real_dst, i), + offset(dst, i)); inst->saturate = fpi->SaturateMode; } } @@ -556,10 +550,10 @@ fs_visitor::emit_fragment_program_code() * Fragment depth has this strange convention of being the .z component of * a vec4. emit_fb_write() wants to see a float value, instead. 
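The texture hunk above collapses the per-generation emit_texture_gen7/gen5/gen4 selection at the call site into a single emit_texture() entry point. A hedged sketch of that dispatch shape, using hypothetical stand-in types rather than Mesa's:

// Minimal model of folding per-generation texture emission behind one
// entry point, as the hunk above does with emit_texture().  The names
// here are illustrative stand-ins, not Mesa's API.
#include <cstdio>

struct sample_params { float coord[3]; float lod; };

static void emit_gen4(const sample_params &) { puts("gen4 message"); }
static void emit_gen5(const sample_params &) { puts("gen5 message"); }
static void emit_gen7(const sample_params &) { puts("gen7 message"); }

// Callers no longer pick a generation; the dispatch lives in one place.
static void emit_texture(int gen, const sample_params &p)
{
   if (gen >= 7)
      emit_gen7(p);
   else if (gen >= 5)
      emit_gen5(p);
   else
      emit_gen4(p);
}

int main() { emit_texture(6, sample_params{}); }  // prints "gen5 message"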
*/ - this->current_annotation = "result.depth write"; + bld.set_annotation("result.depth write"); if (frag_depth.file != BAD_FILE) { fs_reg temp = fs_reg(this, glsl_type::float_type); - emit(MOV(temp, offset(frag_depth, 2))); + bld.MOV(temp, offset(frag_depth, 2)); frag_depth = temp; } } @@ -595,8 +589,8 @@ fs_visitor::setup_fp_regs() ir_var_shader_in); ir->data.location = i; - this->current_annotation = ralloc_asprintf(ctx, "interpolate input %d", - i); + bld.set_annotation(ralloc_asprintf(ctx, "interpolate input %d", + i)); switch (i) { case VARYING_SLOT_POS: @@ -615,15 +609,15 @@ fs_visitor::setup_fp_regs() fp_input_regs[i] = *emit_general_interpolation(ir); if (i == VARYING_SLOT_FOGC) { - emit(MOV(offset(fp_input_regs[i], 1), fs_reg(0.0f))); - emit(MOV(offset(fp_input_regs[i], 2), fs_reg(0.0f))); - emit(MOV(offset(fp_input_regs[i], 3), fs_reg(1.0f))); + bld.MOV(offset(fp_input_regs[i], 1), fs_reg(0.0f)); + bld.MOV(offset(fp_input_regs[i], 2), fs_reg(0.0f)); + bld.MOV(offset(fp_input_regs[i], 3), fs_reg(1.0f)); } break; } - this->current_annotation = NULL; + bld.set_annotation(NULL); } } } @@ -708,8 +702,8 @@ fs_visitor::get_fp_src_reg(const prog_src_register *src) result = fs_reg(this, glsl_type::vec4_type); for (int i = 0; i < 4; i++) { - emit(MOV(offset(result, i), - fs_reg(plist->ParameterValues[src->Index][i].f))); + bld.MOV(offset(result, i), + fs_reg(plist->ParameterValues[src->Index][i].f)); } break; } @@ -742,15 +736,15 @@ fs_visitor::get_fp_src_reg(const prog_src_register *src) */ int src_swiz = GET_SWZ(src->Swizzle, i); if (src_swiz == SWIZZLE_ZERO) { - emit(MOV(offset(result, i), fs_reg(0.0f))); + bld.MOV(offset(result, i), fs_reg(0.0f)); } else if (src_swiz == SWIZZLE_ONE) { - emit(MOV(offset(result, i), - negate ? fs_reg(-1.0f) : fs_reg(1.0f))); + bld.MOV(offset(result, i), + negate ? fs_reg(-1.0f) : fs_reg(1.0f)); } else { fs_reg src = offset(unswizzled, src_swiz); if (negate) src.negate = !src.negate; - emit(MOV(offset(result, i), src)); + bld.MOV(offset(result, i), src); } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index c2010c036c9..ee3eec4a665 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -329,28 +329,51 @@ fs_generator::generate_math_gen6(fs_inst *inst, void fs_generator::generate_math_gen4(fs_inst *inst, - struct brw_reg dst, - struct brw_reg src) + struct brw_reg dst, + struct brw_reg src0, + struct brw_reg src1) { + /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 + * "Message Payload": + * + * "Operand0[7]. For the INT DIV functions, this operand is the + * denominator." + * ... + * "Operand1[7]. For the INT DIV functions, this operand is the + * numerator." + */ + bool is_int_div = (inst->opcode == SHADER_OPCODE_INT_QUOTIENT || + inst->opcode == SHADER_OPCODE_INT_REMAINDER); + struct brw_reg &op0 = is_int_div ? src1 : src0; + struct brw_reg &op1 = is_int_div ? 
src0 : src1; int op = brw_math_function(inst->opcode); assert(inst->mlen >= 1); + if (src1.file != BRW_ARCHITECTURE_REGISTER_FILE) { + brw_push_insn_state(p); + brw_set_default_saturate(p, false); + brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1); + brw_pop_insn_state(p); + } + if (dispatch_width == 8) { gen4_math(p, dst, op, - inst->base_mrf, src, + inst->base_mrf, op0, BRW_MATH_PRECISION_FULL); + } else if (dispatch_width == 16) { brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); gen4_math(p, firsthalf(dst), op, - inst->base_mrf, firsthalf(src), + inst->base_mrf, firsthalf(op0), BRW_MATH_PRECISION_FULL); brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF); gen4_math(p, sechalf(dst), op, - inst->base_mrf + 1, sechalf(src), + inst->base_mrf + 1, sechalf(op0), BRW_MATH_PRECISION_FULL); brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED); @@ -362,13 +385,9 @@ fs_generator::generate_math_g45(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { - if (inst->opcode == SHADER_OPCODE_POW || - inst->opcode == SHADER_OPCODE_INT_QUOTIENT || - inst->opcode == SHADER_OPCODE_INT_REMAINDER) { - generate_math_gen4(inst, dst, src); - return; - } - + assert(inst->opcode != SHADER_OPCODE_POW && + inst->opcode != SHADER_OPCODE_INT_QUOTIENT && + inst->opcode != SHADER_OPCODE_INT_REMAINDER); int op = brw_math_function(inst->opcode); assert(inst->mlen >= 1); @@ -1442,45 +1461,6 @@ fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, } void -fs_generator::generate_shader_time_add(fs_inst *inst, - struct brw_reg payload, - struct brw_reg offset, - struct brw_reg value) -{ - assert(brw->gen >= 7); - brw_push_insn_state(p); - brw_set_default_mask_control(p, true); - - assert(payload.file == BRW_GENERAL_REGISTER_FILE); - struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0), - offset.type); - struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0), - value.type); - - assert(offset.file == BRW_IMMEDIATE_VALUE); - if (value.file == BRW_GENERAL_REGISTER_FILE) { - value.width = BRW_WIDTH_1; - value.hstride = BRW_HORIZONTAL_STRIDE_0; - value.vstride = BRW_VERTICAL_STRIDE_0; - } else { - assert(value.file == BRW_IMMEDIATE_VALUE); - } - - /* Trying to deal with setup of the params from the IR is crazy in the FS8 - * case, and we don't really care about squeezing every bit of performance - * out of this path, so we just emit the MOVs from here. 
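The generate_math_gen4() hunk above swaps its sources for the INT DIV functions because, per the PRM excerpt it quotes, operand 0 carries the denominator and operand 1 the numerator. A standalone sketch of just that selection logic:

// Standalone model of the operand swap above: the Gen4 INT DIV message
// wants the denominator in operand 0 and the numerator in operand 1,
// the reverse of the IR's usual src0/src1 order.
#include <cassert>
#include <cstdio>

enum math_op { MATH_POW, MATH_INT_QUOTIENT, MATH_INT_REMAINDER };

static void pick_operands(math_op op, int src0, int src1,
                          int *op0, int *op1)
{
   const bool is_int_div = (op == MATH_INT_QUOTIENT ||
                            op == MATH_INT_REMAINDER);
   *op0 = is_int_div ? src1 : src0;   // denominator first for INT DIV
   *op1 = is_int_div ? src0 : src1;   // numerator second
}

int main()
{
   int op0, op1;
   pick_operands(MATH_INT_QUOTIENT, /*numerator*/ 7, /*denominator*/ 2,
                 &op0, &op1);
   assert(op0 == 2 && op1 == 7);
   printf("op0=%d op1=%d\n", op0, op1);
}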
- */ - brw_MOV(p, payload_offset, offset); - brw_MOV(p, payload_value, value); - brw_shader_time_add(p, payload, - prog_data->binding_table.shader_time_start); - brw_pop_insn_state(p); - - brw_mark_surface_used(prog_data, - prog_data->binding_table.shader_time_start); -} - -void fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst, struct brw_reg payload, struct brw_reg atomic_op, @@ -1805,7 +1785,7 @@ fs_generator::generate_code(const cfg_t *cfg) } else if (brw->gen == 5 || brw->is_g4x) { generate_math_g45(inst, dst, src[0]); } else { - generate_math_gen4(inst, dst, src[0]); + generate_math_gen4(inst, dst, src[0], brw_null_reg()); } break; case SHADER_OPCODE_INT_QUOTIENT: @@ -1817,7 +1797,7 @@ fs_generator::generate_code(const cfg_t *cfg) } else if (brw->gen >= 6) { generate_math_gen6(inst, dst, src[0], src[1]); } else { - generate_math_gen4(inst, dst, src[0]); + generate_math_gen4(inst, dst, src[0], src[1]); } break; case FS_OPCODE_PIXEL_X: @@ -1905,7 +1885,10 @@ fs_generator::generate_code(const cfg_t *cfg) break; case SHADER_OPCODE_SHADER_TIME_ADD: - generate_shader_time_add(inst, src[0], src[1], src[2]); + brw_shader_time_add(p, src[0], + prog_data->binding_table.shader_time_start); + brw_mark_surface_used(prog_data, + prog_data->binding_table.shader_time_start); break; case SHADER_OPCODE_UNTYPED_ATOMIC: diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp index b7a1d7e7722..7e7371f3a8e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp @@ -85,8 +85,8 @@ fs_visitor::opt_peephole_predicated_break() * instruction to set the flag register. */ if (brw->gen == 6 && if_inst->conditional_mod) { - fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1], - if_inst->conditional_mod); + fs_inst *cmp_inst = bld.CMP(bld.reg_null_d(), if_inst->src[0], if_inst->src[1], + if_inst->conditional_mod); if_inst->insert_before(if_block, cmp_inst); jump_inst->predicate = BRW_PREDICATE_NORMAL; } else { diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 833ba15b1b6..b792b03e5e0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -666,7 +666,7 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src, for (int i = 0; i < count / reg_size; i++) { fs_inst *spill_inst = new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE, - reg_null_f, src); + bld.reg_null_f(), src); src.reg_offset += reg_size; spill_inst->offset = spill_offset + i * reg_size; spill_inst->ir = inst->ir; diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp index c3bfd00e70d..c3e96a6e31a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp @@ -66,7 +66,8 @@ count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS], { int then_movs = 0; foreach_inst_in_block(fs_inst, inst, then_block) { - if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV) + if (then_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + inst->is_partial_write()) break; then_mov[then_movs] = inst; @@ -75,7 +76,12 @@ count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS], int else_movs = 0; foreach_inst_in_block(fs_inst, inst, else_block) { - if 
(else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV) + if (else_movs == MAX_MOVS || inst->opcode != BRW_OPCODE_MOV || + /* Check that the MOVs are the right form. */ + !then_mov[else_movs] || !then_mov[else_movs]->dst.equals(inst->dst) || + /* Check that source types for mov operations match. */ + then_mov[else_movs]->src[0].type != inst->src[0].type || + inst->is_partial_write()) break; else_mov[else_movs] = inst; @@ -148,13 +154,13 @@ fs_visitor::opt_peephole_sel() if (movs == 0) continue; - fs_inst *sel_inst[MAX_MOVS] = { NULL }; - fs_inst *mov_imm_inst[MAX_MOVS] = { NULL }; - + brw::fs_builder ibld = bld.at(block, if_inst); enum brw_predicate predicate; bool predicate_inverse; if (brw->gen == 6 && if_inst->conditional_mod) { - /* For Sandybridge with IF with embedded comparison */ + /* For Sandybridge with IF with embedded comparison. */ + ibld.CMP(ibld.reg_null_d(), if_inst->src[0], if_inst->src[1], + if_inst->conditional_mod); predicate = BRW_PREDICATE_NORMAL; predicate_inverse = false; } else { @@ -165,25 +171,8 @@ fs_visitor::opt_peephole_sel() /* Generate SEL instructions for pairs of MOVs to a common destination. */ for (int i = 0; i < movs; i++) { - if (!then_mov[i] || !else_mov[i]) - break; - - /* Check that the MOVs are the right form. */ - if (!then_mov[i]->dst.equals(else_mov[i]->dst) || - then_mov[i]->is_partial_write() || - else_mov[i]->is_partial_write()) { - movs = i; - break; - } - - /* Check that source types for mov operations match. */ - if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) { - movs = i; - break; - } - if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) { - sel_inst[i] = MOV(then_mov[i]->dst, then_mov[i]->src[0]); + ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]); } else { /* Only the last source register can be a constant, so if the MOV * in the "then" clause uses a constant, we need to put it in a @@ -193,29 +182,13 @@ fs_visitor::opt_peephole_sel() if (src0.file == IMM) { src0 = fs_reg(this, glsl_type::float_type); src0.type = then_mov[i]->src[0].type; - mov_imm_inst[i] = MOV(src0, then_mov[i]->src[0]); + ibld.MOV(src0, then_mov[i]->src[0]); } - sel_inst[i] = SEL(then_mov[i]->dst, src0, else_mov[i]->src[0]); - sel_inst[i]->predicate = predicate; - sel_inst[i]->predicate_inverse = predicate_inverse; + brw::exec_predicate_inv( + predicate, predicate_inverse, + ibld.SEL(then_mov[i]->dst, src0, else_mov[i]->src[0])); } - } - - if (movs == 0) - continue; - - /* Emit a CMP if our IF used the embedded comparison */ - if (brw->gen == 6 && if_inst->conditional_mod) { - fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1], - if_inst->conditional_mod); - if_inst->insert_before(block, cmp_inst); - } - - for (int i = 0; i < movs; i++) { - if (mov_imm_inst[i]) - if_inst->insert_before(block, mov_imm_inst[i]); - if_inst->insert_before(block, sel_inst[i]); then_mov[i]->remove(then_block); else_mov[i]->remove(else_block); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 034a4830a9b..a898ebbe636 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -37,7 +37,6 @@ extern "C" { #include "program/prog_print.h" #include "program/prog_optimize.h" #include "util/register_allocate.h" -#include "program/sampler.h" #include "program/hash_table.h" #include "brw_context.h" #include "brw_eu.h" @@ -67,8 +66,7 @@ fs_visitor::visit(ir_variable *ir) reg = emit_general_interpolation(ir); } assert(reg); - 
hash_table_insert(this->variable_ht, reg, ir); - return; + } else if (ir->data.mode == ir_var_shader_out) { reg = new(this->mem_ctx) fs_reg(this, ir->type); @@ -105,35 +103,6 @@ fs_visitor::visit(ir_variable *ir) this->output_components[output] = vector_elements; } } - } else if (ir->data.mode == ir_var_uniform) { - int param_index = uniforms; - - /* Thanks to the lower_ubo_reference pass, we will see only - * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO - * variables, so no need for them to be in variable_ht. - * - * Some uniforms, such as samplers and atomic counters, have no actual - * storage, so we should ignore them. - */ - if (ir->is_in_uniform_block() || type_size(ir->type) == 0) - return; - - if (dispatch_width == 16) { - if (!variable_storage(ir)) { - fail("Failed to find uniform '%s' in SIMD16\n", ir->name); - } - return; - } - - param_size[param_index] = type_size(ir->type); - if (!strncmp(ir->name, "gl_", 3)) { - setup_builtin_uniform_values(ir); - } else { - setup_uniform_values(ir); - } - - reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index); - reg->type = brw_type_for_base_type(ir->type); } else if (ir->data.mode == ir_var_system_value) { if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) { @@ -146,199 +115,13 @@ fs_visitor::visit(ir_variable *ir) fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0), BRW_REGISTER_TYPE_D)); } - } - - if (!reg) - reg = new(this->mem_ctx) fs_reg(this, ir->type); - - hash_table_insert(this->variable_ht, reg, ir); -} - -void -fs_visitor::visit(ir_dereference_variable *ir) -{ - fs_reg *reg = variable_storage(ir->var); - - if (!reg) { - fail("Failed to find variable storage for %s\n", ir->var->name); - this->result = fs_reg(reg_null_d); - return; - } - this->result = *reg; -} - -void -fs_visitor::visit(ir_dereference_record *ir) -{ - const glsl_type *struct_type = ir->record->type; - - ir->record->accept(this); - - unsigned int off = 0; - for (unsigned int i = 0; i < struct_type->length; i++) { - if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) - break; - off += type_size(struct_type->fields.structure[i].type); - } - this->result = offset(this->result, off); - this->result.type = brw_type_for_base_type(ir->type); -} - -void -fs_visitor::visit(ir_dereference_array *ir) -{ - ir_constant *constant_index; - fs_reg src; - int element_size = type_size(ir->type); - - constant_index = ir->array_index->as_constant(); - - ir->array->accept(this); - src = this->result; - src.type = brw_type_for_base_type(ir->type); - - if (constant_index) { - assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG); - src = offset(src, constant_index->value.i[0] * element_size); - } else { - /* Variable index array dereference. We attach the variable index - * component to the reg as a pointer to a register containing the - * offset. Currently only uniform arrays are supported in this patch, - * and that reladdr pointer is resolved by - * move_uniform_array_access_to_pull_constants(). All other array types - * are lowered by lower_variable_index_to_cond_assign(). 
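The removed ir_dereference_array comment above describes variable indexing: the index expression is scaled by the element size and chained onto any enclosing indirect offset through the reladdr pointer, so nested accesses accumulate. A toy model of that accumulation, with plain ints standing in for the offset registers:

// Sketch of the variable-index addressing the removed hunk describes.
// Each array level multiplies its index by the element size and adds any
// outer indirect offset, as the MUL/ADD pair in the hunk does.
#include <cstdio>

struct indirect { bool valid; int value; };

static indirect add_array_index(indirect outer, int array_index,
                                int element_size)
{
   int index_reg = array_index * element_size;  // MUL(index, i, size)
   if (outer.valid)
      index_reg += outer.value;                 // ADD(index, outer, index)
   return indirect{ true, index_reg };
}

int main()
{
   indirect none{ false, 0 };
   indirect inner = add_array_index(none, 3, 4);    // a[3], vec4 elements
   indirect nested = add_array_index(inner, 2, 1);  // .b[2], floats
   printf("offset = %d\n", nested.value);           // prints 14
}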
- */ - ir->array_index->accept(this); - - fs_reg index_reg; - index_reg = fs_reg(this, glsl_type::int_type); - emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size)); - - if (src.reladdr) { - emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg); - } - - src.reladdr = ralloc(mem_ctx, fs_reg); - memcpy(src.reladdr, &index_reg, sizeof(index_reg)); - } - this->result = src; -} -void -fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y, - const fs_reg &a) -{ - if (brw->gen < 6 || - !x.is_valid_3src() || - !y.is_valid_3src() || - !a.is_valid_3src()) { - /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ - fs_reg y_times_a = fs_reg(this, glsl_type::float_type); - fs_reg one_minus_a = fs_reg(this, glsl_type::float_type); - fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type); - - emit(MUL(y_times_a, y, a)); - - fs_reg negative_a = a; - negative_a.negate = !a.negate; - emit(ADD(one_minus_a, negative_a, fs_reg(1.0f))); - emit(MUL(x_times_one_minus_a, x, one_minus_a)); - - emit(ADD(dst, x_times_one_minus_a, y_times_a)); } else { - /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so - * we need to reorder the operands. - */ - emit(LRP(dst, a, y, x)); - } -} - -void -fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst, - const fs_reg &src0, const fs_reg &src1) -{ - fs_inst *inst; - - if (brw->gen >= 6) { - inst = emit(BRW_OPCODE_SEL, dst, src0, src1); - inst->conditional_mod = conditionalmod; - } else { - emit(CMP(reg_null_d, src0, src1, conditionalmod)); - - inst = emit(BRW_OPCODE_SEL, dst, src0, src1); - inst->predicate = BRW_PREDICATE_NORMAL; - } -} - -bool -fs_visitor::try_emit_saturate(ir_expression *ir) -{ - if (ir->operation != ir_unop_saturate) - return false; - - ir_rvalue *sat_val = ir->operands[0]; - - fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail(); - - sat_val->accept(this); - fs_reg src = this->result; - - fs_inst *last_inst = (fs_inst *) this->instructions.get_tail(); - - /* If the last instruction from our accept() generated our - * src, just set the saturate flag instead of emmitting a separate mov. - */ - fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src); - if (modify && modify->regs_written == modify->dst.width / 8 && - modify->can_do_saturate()) { - modify->saturate = true; - this->result = src; - return true; - } - - return false; -} - -bool -fs_visitor::try_emit_mad(ir_expression *ir) -{ - /* 3-src instructions were introduced in gen6. */ - if (brw->gen < 6) - return false; - - /* MAD can only handle floating-point data. 
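The removed emit_lrp() fallback computes x*(1-a) + y*a when the LRP instruction is unavailable, and otherwise reorders operands because the hardware LRP evaluates op1*op0 + op2*(1-op0). A standalone check that the two forms agree:

// Standalone check of the removed emit_lrp() math.
#include <cassert>
#include <cmath>

static float lrp_fallback(float x, float y, float a)
{
   float y_times_a = y * a;            // MUL
   float one_minus_a = -a + 1.0f;      // ADD with negated source
   float x_term = x * one_minus_a;     // MUL
   return x_term + y_times_a;          // ADD
}

// Hardware-style LRP(op0, op1, op2) = op1*op0 + op2*(1 - op0), which is
// why the removed code emitted LRP(dst, a, y, x) for GLSL's mix(x, y, a).
static float lrp_hw(float op0, float op1, float op2)
{
   return op1 * op0 + op2 * (1.0f - op0);
}

int main()
{
   float x = 2.0f, y = 10.0f, a = 0.25f;
   assert(std::fabs(lrp_fallback(x, y, a) - lrp_hw(a, y, x)) < 1e-6f);
}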
*/ - if (ir->type != glsl_type::float_type) - return false; - - ir_rvalue *nonmul = ir->operands[1]; - ir_expression *mul = ir->operands[0]->as_expression(); - - if (!mul || mul->operation != ir_binop_mul) { - nonmul = ir->operands[0]; - mul = ir->operands[1]->as_expression(); - - if (!mul || mul->operation != ir_binop_mul) - return false; + backend_visitor::visit(ir); + return; } - if (nonmul->as_constant() || - mul->operands[0]->as_constant() || - mul->operands[1]->as_constant()) - return false; - - nonmul->accept(this); - fs_reg src0 = this->result; - - mul->operands[0]->accept(this); - fs_reg src1 = this->result; - - mul->operands[1]->accept(this); - fs_reg src2 = this->result; - - this->result = fs_reg(this, ir->type); - emit(BRW_OPCODE_MAD, this->result, src0, src1, src2); - - return true; + hash_table_insert(this->variable_ht, reg, ir); } static int @@ -391,7 +174,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) switch (ir->operation) { case ir_unop_interpolate_at_centroid: - inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); break; case ir_binop_interpolate_at_sample: { @@ -399,7 +182,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) assert(sample_num || !"nonconstant sample number should have been lowered."); unsigned msg_data = sample_num->value.i[0] << 4; - inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data)); + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data)); break; } @@ -408,7 +191,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) if (const_offset) { unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) | (pack_pixel_offset(const_offset->value.f[1]) << 4); - inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, fs_reg(msg_data)); } else { /* pack the operands: hw wants offsets as 4 bit signed ints */ @@ -417,8 +200,8 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) fs_reg src2 = src; for (int i = 0; i < 2; i++) { fs_reg temp = fs_reg(this, glsl_type::float_type); - emit(MUL(temp, this->result, fs_reg(16.0f))); - emit(MOV(src2, temp)); /* float to int */ + bld.MUL(temp, this->result, fs_reg(16.0f)); + bld.MOV(src2, temp); /* float to int */ /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires * that we support a maximum offset of +0.5, which isn't representable @@ -433,7 +216,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) * FRAGMENT_INTERPOLATION_OFFSET_BITS" */ - fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7)); + fs_inst *inst = bld.emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7)); inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */ src2 = offset(src2, 1); @@ -441,7 +224,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) } mlen = 2 * reg_width; - inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, + inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, fs_reg(0u)); } break; @@ -463,714 +246,17 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) for (int i = 0; i < ir->type->vector_elements; i++) { int ch = swiz ? 
((*(int *)&swiz->mask) >> 2*i) & 3 : i; - emit(FS_OPCODE_LINTERP, res, + bld.emit(FS_OPCODE_LINTERP, res, dst_x, dst_y, fs_reg(interp_reg(var->data.location, ch))); res = offset(res, 1); } } -void -fs_visitor::visit(ir_expression *ir) -{ - unsigned int operand; - fs_reg op[3], temp; - fs_inst *inst; - - assert(ir->get_num_operands() <= 3); - - if (try_emit_saturate(ir)) - return; - - /* Deal with the real oddball stuff first */ - switch (ir->operation) { - case ir_binop_add: - if (try_emit_mad(ir)) - return; - break; - - case ir_unop_interpolate_at_centroid: - case ir_binop_interpolate_at_offset: - case ir_binop_interpolate_at_sample: - emit_interpolate_expression(ir); - return; - - default: - break; - } - - for (operand = 0; operand < ir->get_num_operands(); operand++) { - ir->operands[operand]->accept(this); - if (this->result.file == BAD_FILE) { - fail("Failed to get tree for expression operand:\n"); - ir->operands[operand]->fprint(stderr); - fprintf(stderr, "\n"); - } - assert(this->result.is_valid_3src()); - op[operand] = this->result; - - /* Matrix expression operands should have been broken down to vector - * operations already. - */ - assert(!ir->operands[operand]->type->is_matrix()); - /* And then those vector operands should have been broken down to scalar. - */ - assert(!ir->operands[operand]->type->is_vector()); - } - - /* Storage for our result. If our result goes into an assignment, it will - * just get copy-propagated out, so no worries. - */ - this->result = fs_reg(this, ir->type); - - switch (ir->operation) { - case ir_unop_logic_not: - if (ctx->Const.UniformBooleanTrue != 1) { - emit(NOT(this->result, op[0])); - } else { - emit(XOR(this->result, op[0], fs_reg(1))); - } - break; - case ir_unop_neg: - op[0].negate = !op[0].negate; - emit(MOV(this->result, op[0])); - break; - case ir_unop_abs: - op[0].abs = true; - op[0].negate = false; - emit(MOV(this->result, op[0])); - break; - case ir_unop_sign: - if (ir->type->is_float()) { - /* AND(val, 0x80000000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not - * zero. - */ - emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ)); - - op[0].type = BRW_REGISTER_TYPE_UD; - this->result.type = BRW_REGISTER_TYPE_UD; - emit(AND(this->result, op[0], fs_reg(0x80000000u))); - - inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u))); - inst->predicate = BRW_PREDICATE_NORMAL; - - this->result.type = BRW_REGISTER_TYPE_F; - } else { - /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). - * -> non-negative val generates 0x00000000. - * Predicated OR sets 1 if val is positive. 
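The float ir_unop_sign path above is pure bit manipulation: AND with 0x80000000 isolates the sign bit, and a predicated OR with 0x3f800000 merges in the bits of 1.0f for nonzero inputs. A standalone restatement with explicit bitcasts:

// Standalone check of the removed ir_unop_sign float path.
#include <cassert>
#include <cstdint>
#include <cstring>

static float sign_via_bits(float val)
{
   uint32_t bits;
   std::memcpy(&bits, &val, sizeof bits);       // bitcast f -> ud
   uint32_t result = bits & 0x80000000u;        // AND: sign bit only
   if (val != 0.0f)                             // CMP ... NZ + predicate
      result |= 0x3f800000u;                    // OR in the bits of 1.0f
   float out;
   std::memcpy(&out, &result, sizeof out);      // bitcast ud -> f
   return out;
}

int main()
{
   assert(sign_via_bits(3.5f) == 1.0f);
   assert(sign_via_bits(-0.25f) == -1.0f);
   assert(sign_via_bits(0.0f) == 0.0f);
}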
- */ - emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G)); - - emit(ASR(this->result, op[0], fs_reg(31))); - - inst = emit(OR(this->result, this->result, fs_reg(1))); - inst->predicate = BRW_PREDICATE_NORMAL; - } - break; - case ir_unop_rcp: - emit_math(SHADER_OPCODE_RCP, this->result, op[0]); - break; - - case ir_unop_exp2: - emit_math(SHADER_OPCODE_EXP2, this->result, op[0]); - break; - case ir_unop_log2: - emit_math(SHADER_OPCODE_LOG2, this->result, op[0]); - break; - case ir_unop_exp: - case ir_unop_log: - unreachable("not reached: should be handled by ir_explog_to_explog2"); - case ir_unop_sin: - case ir_unop_sin_reduced: - emit_math(SHADER_OPCODE_SIN, this->result, op[0]); - break; - case ir_unop_cos: - case ir_unop_cos_reduced: - emit_math(SHADER_OPCODE_COS, this->result, op[0]); - break; - - case ir_unop_dFdx: - emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_BY_HINT)); - break; - case ir_unop_dFdx_coarse: - emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_COARSE)); - break; - case ir_unop_dFdx_fine: - emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_FINE)); - break; - case ir_unop_dFdy: - emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_BY_HINT)); - break; - case ir_unop_dFdy_coarse: - emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_COARSE)); - break; - case ir_unop_dFdy_fine: - emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_FINE)); - break; - - case ir_binop_add: - emit(ADD(this->result, op[0], op[1])); - break; - case ir_binop_sub: - unreachable("not reached: should be handled by ir_sub_to_add_neg"); - - case ir_binop_mul: - if (brw->gen < 8 && ir->type->is_integer()) { - /* For integer multiplication, the MUL uses the low 16 bits - * of one of the operands (src0 on gen6, src1 on gen7). The - * MACH accumulates in the contribution of the upper 16 bits - * of that operand. - */ - if (ir->operands[0]->is_uint16_constant()) { - if (brw->gen < 7) - emit(MUL(this->result, op[0], op[1])); - else - emit(MUL(this->result, op[1], op[0])); - } else if (ir->operands[1]->is_uint16_constant()) { - if (brw->gen < 7) - emit(MUL(this->result, op[1], op[0])); - else - emit(MUL(this->result, op[0], op[1])); - } else { - if (brw->gen >= 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - this->result.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(reg_null_d, op[0], op[1])); - emit(MOV(this->result, fs_reg(acc))); - } - } else { - emit(MUL(this->result, op[0], op[1])); - } - break; - case ir_binop_imul_high: { - if (brw->gen == 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - this->result.type); - - fs_inst *mul = emit(MUL(acc, op[0], op[1])); - emit(MACH(this->result, op[0], op[1])); - - /* Until Gen8, integer multiplies read 32-bits from one source, and - * 16-bits from the other, and relying on the MACH instruction to - * generate the high bits of the result. - * - * On Gen8, the multiply instruction does a full 32x32-bit multiply, - * but in order to do a 64x64-bit multiply we have to simulate the - * previous behavior and then use a MACH instruction. - * - * FINISHME: Don't use source modifiers on src1. 
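The integer-multiply comment above says the pre-Gen8 MUL reads only 16 bits from one source, with the MACH/accumulator sequence supplying the upper half's contribution. A standalone check of that decomposition, which also shows why a uint16 constant operand lets a lone MUL suffice:

// Standalone check of the low-32-bit product decomposition implied by the
// note above: a*b == a*lo16(b) + (a*hi16(b) << 16), all modulo 2^32.
// When b fits in 16 bits, the second term vanishes.
#include <cassert>
#include <cstdint>

static uint32_t mul_low32(uint32_t a, uint32_t b)
{
   uint32_t lo = a * (b & 0xffffu);             // what the 16-bit MUL sees
   uint32_t hi = a * (b >> 16);                 // upper-half contribution
   return lo + (hi << 16);                      // folded back in
}

int main()
{
   assert(mul_low32(0x12345678u, 0x9abcdef0u) ==
          0x12345678u * 0x9abcdef0u);
   assert(mul_low32(0xdeadbeefu, 0xffffu) ==    // 16-bit operand: the
          0xdeadbeefu * 0xffffu);               // single MUL suffices
}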
- */ - if (brw->gen >= 8) { - assert(mul->src[1].type == BRW_REGISTER_TYPE_D || - mul->src[1].type == BRW_REGISTER_TYPE_UD); - if (mul->src[1].type == BRW_REGISTER_TYPE_D) { - mul->src[1].type = BRW_REGISTER_TYPE_W; - } else { - mul->src[1].type = BRW_REGISTER_TYPE_UW; - } - } - - break; - } - case ir_binop_div: - /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */ - assert(ir->type->is_integer()); - emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]); - break; - case ir_binop_carry: { - if (brw->gen == 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - BRW_REGISTER_TYPE_UD); - - emit(ADDC(reg_null_ud, op[0], op[1])); - emit(MOV(this->result, fs_reg(acc))); - break; - } - case ir_binop_borrow: { - if (brw->gen == 7) - no16("SIMD16 explicit accumulator operands unsupported\n"); - - struct brw_reg acc = retype(brw_acc_reg(dispatch_width), - BRW_REGISTER_TYPE_UD); - - emit(SUBB(reg_null_ud, op[0], op[1])); - emit(MOV(this->result, fs_reg(acc))); - break; - } - case ir_binop_mod: - /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */ - assert(ir->type->is_integer()); - emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]); - break; - - case ir_binop_less: - case ir_binop_greater: - case ir_binop_lequal: - case ir_binop_gequal: - case ir_binop_equal: - case ir_binop_all_equal: - case ir_binop_nequal: - case ir_binop_any_nequal: - if (ctx->Const.UniformBooleanTrue == 1) { - resolve_bool_comparison(ir->operands[0], &op[0]); - resolve_bool_comparison(ir->operands[1], &op[1]); - } - - emit(CMP(this->result, op[0], op[1], - brw_conditional_for_comparison(ir->operation))); - break; - - case ir_binop_logic_xor: - emit(XOR(this->result, op[0], op[1])); - break; - - case ir_binop_logic_or: - emit(OR(this->result, op[0], op[1])); - break; - - case ir_binop_logic_and: - emit(AND(this->result, op[0], op[1])); - break; - - case ir_binop_dot: - case ir_unop_any: - unreachable("not reached: should be handled by brw_fs_channel_expressions"); - - case ir_unop_noise: - unreachable("not reached: should be handled by lower_noise"); - - case ir_quadop_vector: - unreachable("not reached: should be handled by lower_quadop_vector"); - - case ir_binop_vector_extract: - unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()"); - - case ir_triop_vector_insert: - unreachable("not reached: should be handled by lower_vector_insert()"); - - case ir_binop_ldexp: - unreachable("not reached: should be handled by ldexp_to_arith()"); - - case ir_unop_sqrt: - emit_math(SHADER_OPCODE_SQRT, this->result, op[0]); - break; - - case ir_unop_rsq: - emit_math(SHADER_OPCODE_RSQ, this->result, op[0]); - break; - - case ir_unop_bitcast_i2f: - case ir_unop_bitcast_u2f: - op[0].type = BRW_REGISTER_TYPE_F; - this->result = op[0]; - break; - case ir_unop_i2u: - case ir_unop_bitcast_f2u: - op[0].type = BRW_REGISTER_TYPE_UD; - this->result = op[0]; - break; - case ir_unop_u2i: - case ir_unop_bitcast_f2i: - op[0].type = BRW_REGISTER_TYPE_D; - this->result = op[0]; - break; - case ir_unop_i2f: - case ir_unop_u2f: - case ir_unop_f2i: - case ir_unop_f2u: - emit(MOV(this->result, op[0])); - break; - - case ir_unop_b2i: - emit(AND(this->result, op[0], fs_reg(1))); - break; - case ir_unop_b2f: - if (ctx->Const.UniformBooleanTrue != 1) { - op[0].type = BRW_REGISTER_TYPE_UD; - this->result.type = BRW_REGISTER_TYPE_UD; - emit(AND(this->result, op[0], fs_reg(0x3f800000u))); - 
this->result.type = BRW_REGISTER_TYPE_F; - } else { - temp = fs_reg(this, glsl_type::int_type); - emit(AND(temp, op[0], fs_reg(1))); - emit(MOV(this->result, temp)); - } - break; - - case ir_unop_f2b: - emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ)); - break; - case ir_unop_i2b: - emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ)); - break; - - case ir_unop_trunc: - emit(RNDZ(this->result, op[0])); - break; - case ir_unop_ceil: - op[0].negate = !op[0].negate; - emit(RNDD(this->result, op[0])); - this->result.negate = true; - break; - case ir_unop_floor: - emit(RNDD(this->result, op[0])); - break; - case ir_unop_fract: - emit(FRC(this->result, op[0])); - break; - case ir_unop_round_even: - emit(RNDE(this->result, op[0])); - break; - - case ir_binop_min: - case ir_binop_max: - resolve_ud_negate(&op[0]); - resolve_ud_negate(&op[1]); - emit_minmax(ir->operation == ir_binop_min ? - BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE, - this->result, op[0], op[1]); - break; - case ir_unop_pack_snorm_2x16: - case ir_unop_pack_snorm_4x8: - case ir_unop_pack_unorm_2x16: - case ir_unop_pack_unorm_4x8: - case ir_unop_unpack_snorm_2x16: - case ir_unop_unpack_snorm_4x8: - case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_unorm_4x8: - case ir_unop_unpack_half_2x16: - case ir_unop_pack_half_2x16: - unreachable("not reached: should be handled by lower_packing_builtins"); - case ir_unop_unpack_half_2x16_split_x: - emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]); - break; - case ir_unop_unpack_half_2x16_split_y: - emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]); - break; - case ir_binop_pow: - emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]); - break; - - case ir_unop_bitfield_reverse: - emit(BFREV(this->result, op[0])); - break; - case ir_unop_bit_count: - emit(CBIT(this->result, op[0])); - break; - case ir_unop_find_msb: - temp = fs_reg(this, glsl_type::uint_type); - emit(FBH(temp, op[0])); - - /* FBH counts from the MSB side, while GLSL's findMSB() wants the count - * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then - * subtract the result from 31 to convert the MSB count into an LSB count. - */ - - /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */ - emit(MOV(this->result, temp)); - emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ)); - - temp.negate = true; - inst = emit(ADD(this->result, temp, fs_reg(31))); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - case ir_unop_find_lsb: - emit(FBL(this->result, op[0])); - break; - case ir_unop_saturate: - inst = emit(MOV(this->result, op[0])); - inst->saturate = true; - break; - case ir_triop_bitfield_extract: - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. 
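The removed find_msb sequence converts FBH's MSB-side count into the LSB-relative index GLSL expects by subtracting from 31, passing the 0xFFFFFFFF error value through unchanged. A sketch using __builtin_clz (GCC/Clang) as a stand-in for FBH:

// Standalone model of the removed find_msb fixup.
#include <cassert>
#include <cstdint>

static int find_msb(uint32_t v)
{
   if (v == 0)
      return -1;                  // FBH returns 0xFFFFFFFF; passed through
   int fbh = __builtin_clz(v);    // leading-position count, like FBH
   return 31 - fbh;               // convert MSB-side count to LSB index
}

int main()
{
   assert(find_msb(1u) == 0);
   assert(find_msb(0x80000000u) == 31);
   assert(find_msb(0x00000500u) == 10);
   assert(find_msb(0u) == -1);
}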
- */ - emit(BFE(this->result, op[2], op[1], op[0])); - break; - case ir_binop_bfm: - emit(BFI1(this->result, op[0], op[1])); - break; - case ir_triop_bfi: - emit(BFI2(this->result, op[0], op[1], op[2])); - break; - case ir_quadop_bitfield_insert: - unreachable("not reached: should be handled by " - "lower_instructions::bitfield_insert_to_bfm_bfi"); - - case ir_unop_bit_not: - emit(NOT(this->result, op[0])); - break; - case ir_binop_bit_and: - emit(AND(this->result, op[0], op[1])); - break; - case ir_binop_bit_xor: - emit(XOR(this->result, op[0], op[1])); - break; - case ir_binop_bit_or: - emit(OR(this->result, op[0], op[1])); - break; - - case ir_binop_lshift: - emit(SHL(this->result, op[0], op[1])); - break; - - case ir_binop_rshift: - if (ir->type->base_type == GLSL_TYPE_INT) - emit(ASR(this->result, op[0], op[1])); - else - emit(SHR(this->result, op[0], op[1])); - break; - case ir_binop_pack_half_2x16_split: - emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]); - break; - case ir_binop_ubo_load: { - /* This IR node takes a constant uniform block and a constant or - * variable byte offset within the block and loads a vector from that. - */ - ir_constant *const_uniform_block = ir->operands[0]->as_constant(); - ir_constant *const_offset = ir->operands[1]->as_constant(); - fs_reg surf_index; - - if (const_uniform_block) { - /* The block index is a constant, so just emit the binding table entry - * as an immediate. - */ - surf_index = fs_reg(stage_prog_data->binding_table.ubo_start + - const_uniform_block->value.u[0]); - } else { - /* The block index is not a constant. Evaluate the index expression - * per-channel and add the base UBO index; the generator will select - * a value from any live channel. - */ - surf_index = fs_reg(this, glsl_type::uint_type); - emit(ADD(surf_index, op[0], - fs_reg(stage_prog_data->binding_table.ubo_start))) - ->force_writemask_all = true; - - /* Assume this may touch any UBO. It would be nice to provide - * a tighter bound, but the array information is already lowered away. - */ - brw_mark_surface_used(prog_data, - stage_prog_data->binding_table.ubo_start + - shader_prog->NumUniformBlocks - 1); - } - - if (const_offset) { - fs_reg packed_consts = fs_reg(this, glsl_type::float_type); - packed_consts.type = result.type; - - fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15); - emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8, - packed_consts, surf_index, const_offset_reg)); - - for (int i = 0; i < ir->type->vector_elements; i++) { - packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i); - - /* The std140 packing rules don't allow vectors to cross 16-byte - * boundaries, and a reg is 32 bytes. - */ - assert(packed_consts.subreg_offset < 32); - - /* UBO bools are any nonzero value. We consider bools to be - * values with the low bit set to 1. Convert them using CMP. - */ - if (ir->type->base_type == GLSL_TYPE_BOOL) { - emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ)); - } else { - emit(MOV(result, packed_consts)); - } - - result = offset(result, 1); - } - } else { - /* Turn the byte offset into a dword offset. 
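The constant-offset ir_binop_ubo_load path above fetches an aligned 16-byte block (offset & ~15) and then smears each component out at (offset % 16)/4 + i, relying on std140 never letting a vector straddle two blocks. A standalone model of that indexing:

// Standalone model of the constant-offset UBO component selection.
#include <cassert>
#include <cstdint>

static uint32_t ubo_component(const uint32_t *block_start, // aligned load
                              unsigned const_offset, unsigned i)
{
   return block_start[const_offset % 16 / 4 + i];   // the "smear"
}

int main()
{
   // One 16-byte block, as the pull-constant message would return it.
   uint32_t block[4] = { 10, 11, 12, 13 };
   unsigned const_offset = 0x48;                    // vec2 at byte 72
   assert((const_offset & ~15u) == 0x40);           // block fetched at 64
   assert(ubo_component(block, const_offset, 0) == 12);  // byte 72 -> .z
   assert(ubo_component(block, const_offset, 1) == 13);  // byte 76 -> .w
}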
*/ - fs_reg base_offset = fs_reg(this, glsl_type::int_type); - emit(SHR(base_offset, op[1], fs_reg(2))); - - for (int i = 0; i < ir->type->vector_elements; i++) { - emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index, - base_offset, i)); - - if (ir->type->base_type == GLSL_TYPE_BOOL) - emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ)); - - result = offset(result, 1); - } - } - - result.reg_offset = 0; - break; - } - - case ir_triop_fma: - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. - */ - emit(MAD(this->result, op[2], op[1], op[0])); - break; - - case ir_triop_lrp: - emit_lrp(this->result, op[0], op[1], op[2]); - break; - - case ir_triop_csel: - emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ)); - inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - - case ir_unop_interpolate_at_centroid: - case ir_binop_interpolate_at_offset: - case ir_binop_interpolate_at_sample: - unreachable("already handled above"); - break; - } -} - -void -fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r, - const glsl_type *type, bool predicated) -{ - switch (type->base_type) { - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_BOOL: - for (unsigned int i = 0; i < type->components(); i++) { - l.type = brw_type_for_base_type(type); - r.type = brw_type_for_base_type(type); - - if (predicated || !l.equals(r)) { - fs_inst *inst = emit(MOV(l, r)); - inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE; - } - - l = offset(l, 1); - r = offset(r, 1); - } - break; - case GLSL_TYPE_ARRAY: - for (unsigned int i = 0; i < type->length; i++) { - emit_assignment_writes(l, r, type->fields.array, predicated); - } - break; - - case GLSL_TYPE_STRUCT: - for (unsigned int i = 0; i < type->length; i++) { - emit_assignment_writes(l, r, type->fields.structure[i].type, - predicated); - } - break; - - case GLSL_TYPE_SAMPLER: - case GLSL_TYPE_IMAGE: - case GLSL_TYPE_ATOMIC_UINT: - break; - - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_INTERFACE: - unreachable("not reached"); - } -} - -/* If the RHS processing resulted in an instruction generating a - * temporary value, and it would be easy to rewrite the instruction to - * generate its result right into the LHS instead, do so. This ends - * up reliably removing instructions where it can be tricky to do so - * later without real UD chain information. - */ -bool -fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir, - fs_reg dst, - fs_reg src, - fs_inst *pre_rhs_inst, - fs_inst *last_rhs_inst) -{ - /* Only attempt if we're doing a direct assignment. */ - if (ir->condition || - !(ir->lhs->type->is_scalar() || - (ir->lhs->type->is_vector() && - ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1))) - return false; - - /* Make sure the last instruction generated our source reg. */ - fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst, - last_rhs_inst, - src); - if (!modify) - return false; - - /* If last_rhs_inst wrote a different number of components than our LHS, - * we can't safely rewrite it. - */ - if (alloc.sizes[dst.reg] != modify->regs_written) - return false; - - /* Success! Rewrite the instruction. 
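The removed try_rewrite_rhs_to_dst() only retargets the RHS-producing instruction at the LHS for an unconditional full write whose producer wrote exactly the destination's size. A toy restatement of those guards, with simplified stand-in fields for the visitor's state:

// Toy restatement of the removed try_rewrite_rhs_to_dst() tests.
#include <cassert>

struct toy_inst { int dst_reg; int regs_written; };

static bool can_rewrite(bool has_condition, unsigned write_mask,
                        unsigned num_components, int lhs_size,
                        const toy_inst *producer)
{
   if (has_condition)
      return false;                    // predicated writes keep the MOV
   if (write_mask != (1u << num_components) - 1)
      return false;                    // partial writemask: keep the copy
   if (!producer)
      return false;                    // RHS didn't come from an inst
   return lhs_size == producer->regs_written;  // sizes must match exactly
}

int main()
{
   toy_inst mul = { 7, 1 };
   assert(can_rewrite(false, 0xf, 4, 1, &mul));
   assert(!can_rewrite(false, 0x7, 4, 1, &mul));   // partial write
   assert(!can_rewrite(true, 0xf, 4, 1, &mul));    // conditional
}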
*/ - modify->dst = dst; - - return true; -} - -void -fs_visitor::visit(ir_assignment *ir) -{ - fs_reg l, r; - fs_inst *inst; - - /* FINISHME: arrays on the lhs */ - ir->lhs->accept(this); - l = this->result; - - fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail(); - - ir->rhs->accept(this); - r = this->result; - - fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail(); - - assert(l.file != BAD_FILE); - assert(r.file != BAD_FILE); - - if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst)) - return; - - if (ir->condition) { - emit_bool_to_cond_code(ir->condition); - } - - if (ir->lhs->type->is_scalar() || - ir->lhs->type->is_vector()) { - for (int i = 0; i < ir->lhs->type->vector_elements; i++) { - if (ir->write_mask & (1 << i)) { - inst = emit(MOV(l, r)); - if (ir->condition) - inst->predicate = BRW_PREDICATE_NORMAL; - r = offset(r, 1); - } - l = offset(l, 1); - } - } else { - emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL); - } -} - fs_inst * fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, - fs_reg shadow_c, fs_reg lod, fs_reg dPdy, - uint32_t sampler) + const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, + const fs_reg &sampler) { int mlen; int base_mrf = 1; @@ -1182,7 +268,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, if (shadow_c.file != BAD_FILE) { for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate)); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); coordinate = offset(coordinate, 1); } @@ -1190,7 +276,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, * the unused slots must be zeroed. */ for (int i = ir->coordinate->type->vector_elements; i < 3; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f))); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)); } mlen += 3; @@ -1198,25 +284,25 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, /* There's no plain shadow compare message, so we use shadow * compare with a bias of 0.0. */ - emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f))); + bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)); mlen++; } else if (ir->op == ir_txb || ir->op == ir_txl) { - emit(MOV(fs_reg(MRF, base_mrf + mlen), lod)); + bld.MOV(fs_reg(MRF, base_mrf + mlen), lod); mlen++; } else { unreachable("Should not get here."); } - emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c)); + bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c); mlen++; } else if (ir->op == ir_tex) { for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate)); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); coordinate = offset(coordinate, 1); } /* zero the others. */ for (int i = ir->coordinate->type->vector_elements; i<3; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f))); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)); } /* gen4's SIMD8 sampler always has the slots for u,v,r present. 
*/ mlen += 3; @@ -1224,7 +310,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, fs_reg &dPdx = lod; for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate)); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate); coordinate = offset(coordinate, 1); } /* the slots for u and v are always present, but r is optional */ @@ -1245,20 +331,20 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, * m5 m6 m7 m8 m9 m10 */ for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx)); + bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx); dPdx = offset(dPdx, 1); } mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2); for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy)); - dPdy = offset(dPdy, 1); + bld.MOV(fs_reg(MRF, base_mrf + mlen), lod2); + lod2 = offset(lod2, 1); } mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2); } else if (ir->op == ir_txs) { /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */ simd16 = true; - emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod)); + bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod); mlen += 2; } else { /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod @@ -1268,8 +354,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf); for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type), - coordinate)); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type), + coordinate); coordinate = offset(coordinate, 1); } @@ -1277,13 +363,13 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, * be necessary for TXF (ld), but seems wise to do for all messages. */ for (int i = ir->coordinate->type->vector_elements; i < 3; i++) { - emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f))); + bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)); } /* lod/bias appears after u/v/r. */ mlen += 6; - emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod)); + bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod); mlen++; /* The unused upper half. */ @@ -1315,7 +401,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, unreachable("not reached"); } - fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler)); + fs_inst *inst = bld.emit(opcode, dst, reg_undef, sampler); inst->base_mrf = base_mrf; inst->mlen = mlen; inst->header_present = true; @@ -1323,7 +409,7 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, if (simd16) { for (int i = 0; i < 4; i++) { - emit(MOV(orig_dst, dst)); + bld.MOV(orig_dst, dst); orig_dst = offset(orig_dst, 1); dst = offset(dst, 2); } @@ -1341,9 +427,9 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate, * surprising in the disassembly. 
*/ fs_inst * -fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, - fs_reg shadow_c, fs_reg lod, fs_reg lod2, - fs_reg sample_index, uint32_t sampler) +fs_visitor::emit_texture_gen5(ir_texture *ir, const fs_reg &dst, fs_reg coordinate, + const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, + const fs_reg &sample_index, const fs_reg &sampler) { int reg_width = dispatch_width / 8; bool header_present = false; @@ -1362,7 +448,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, } for (int i = 0; i < vector_elements; i++) { - emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate)); + bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate); coordinate = offset(coordinate, 1); } fs_reg msg_end = offset(msg_coords, vector_elements); @@ -1370,7 +456,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, if (shadow_c.file != BAD_FILE) { fs_reg msg_shadow = msg_lod; - emit(MOV(msg_shadow, shadow_c)); + bld.MOV(msg_shadow, shadow_c); msg_lod = offset(msg_shadow, 1); msg_end = msg_lod; } @@ -1381,13 +467,13 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, opcode = SHADER_OPCODE_TEX; break; case ir_txb: - emit(MOV(msg_lod, lod)); + bld.MOV(msg_lod, lod); msg_end = offset(msg_lod, 1); opcode = FS_OPCODE_TXB; break; case ir_txl: - emit(MOV(msg_lod, lod)); + bld.MOV(msg_lod, lod); msg_end = offset(msg_lod, 1); opcode = SHADER_OPCODE_TXL; @@ -1404,11 +490,11 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, */ msg_end = msg_lod; for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) { - emit(MOV(msg_end, lod)); + bld.MOV(msg_end, lod); lod = offset(lod, 1); msg_end = offset(msg_end, 1); - emit(MOV(msg_end, lod2)); + bld.MOV(msg_end, lod2); lod2 = offset(lod2, 1); msg_end = offset(msg_end, 1); } @@ -1418,21 +504,21 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, } case ir_txs: msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD); - emit(MOV(msg_lod, lod)); + bld.MOV(msg_lod, lod); msg_end = offset(msg_lod, 1); opcode = SHADER_OPCODE_TXS; break; case ir_query_levels: msg_lod = msg_end; - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u))); + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)); msg_end = offset(msg_lod, 1); opcode = SHADER_OPCODE_TXS; break; case ir_txf: msg_lod = offset(msg_coords, 3); - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod)); + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod); msg_end = offset(msg_lod, 1); opcode = SHADER_OPCODE_TXF; @@ -1440,9 +526,9 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, case ir_txf_ms: msg_lod = offset(msg_coords, 3); /* lod */ - emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u))); + bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)); /* sample index */ - emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index)); + bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index); msg_end = offset(msg_lod, 2); opcode = SHADER_OPCODE_TXF_CMS; @@ -1457,7 +543,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, unreachable("not reached"); } - fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler)); + fs_inst *inst = bld.emit(opcode, dst, reg_undef, sampler); inst->base_mrf = message.reg; inst->mlen = msg_end.reg - message.reg; inst->header_present = header_present; @@ -1481,14 +567,15 @@ is_high_sampler(struct brw_context 
*brw, fs_reg sampler) } fs_inst * -fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, - fs_reg shadow_c, fs_reg lod, fs_reg lod2, - fs_reg sample_index, fs_reg mcs, fs_reg sampler) +fs_visitor::emit_texture_gen7(ir_texture *ir, const fs_reg &dst, fs_reg coordinate, + const fs_reg &shadow_c, fs_reg lod, fs_reg lod2, + fs_reg offset_val, const fs_reg &sample_index, + const fs_reg &mcs, const fs_reg &sampler) { - int reg_width = dispatch_width / 8; + int reg_width = bld.dispatch_width() / 8; bool header_present = false; - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE); + for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) { sources[i] = fs_reg(this, glsl_type::float_type); } @@ -1512,7 +599,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, } if (shadow_c.file != BAD_FILE) { - emit(MOV(sources[length], shadow_c)); + bld.MOV(sources[length], shadow_c); length++; } @@ -1525,11 +612,11 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, case ir_lod: break; case ir_txb: - emit(MOV(sources[length], lod)); + bld.MOV(sources[length], lod); length++; break; case ir_txl: - emit(MOV(sources[length], lod)); + bld.MOV(sources[length], lod); length++; break; case ir_txd: { @@ -1539,19 +626,19 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z */ for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(sources[length], coordinate)); - coordinate = offset(coordinate, 1); - length++; + bld.MOV(sources[length], coordinate); + coordinate = offset(coordinate, 1); + length++; /* For cube map array, the coordinate is (u,v,r,ai) but there are * only derivatives for (u, v, r). */ if (i < ir->lod_info.grad.dPdx->type->vector_elements) { - emit(MOV(sources[length], lod)); + bld.MOV(sources[length], lod); lod = offset(lod, 1); length++; - emit(MOV(sources[length], lod2)); + bld.MOV(sources[length], lod2); lod2 = offset(lod2, 1); length++; } @@ -1561,43 +648,43 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, break; } case ir_txs: - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod); length++; break; case ir_query_levels: - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u))); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)); length++; break; case ir_txf: /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. 
*/ - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); coordinate = offset(coordinate, 1); length++; - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod); length++; for (int i = 1; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate)); - coordinate = offset(coordinate, 1); - length++; + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); + coordinate = offset(coordinate, 1); + length++; } coordinate_done = true; break; case ir_txf_ms: - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); length++; /* data from the multisample control surface */ - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); length++; /* there is no offsetting for this message; just copy in the integer * texture coordinates */ for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate)); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate); coordinate = offset(coordinate, 1); length++; } @@ -1610,23 +697,20 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, no16("Gen7 does not support gather4_po_c in SIMD16 mode."); /* More crazy intermixing */ - ir->offset->accept(this); - fs_reg offset_value = this->result; - for (int i = 0; i < 2; i++) { /* u, v */ - emit(MOV(sources[length], coordinate)); + bld.MOV(sources[length], coordinate); coordinate = offset(coordinate, 1); length++; } for (int i = 0; i < 2; i++) { /* offu, offv */ - emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value)); - offset_value = offset(offset_value, 1); + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_val); + offset_val = offset(offset_val, 1); length++; } if (ir->coordinate->type->vector_elements == 3) { /* r if present */ - emit(MOV(sources[length], coordinate)); + bld.MOV(sources[length], coordinate); coordinate = offset(coordinate, 1); length++; } @@ -1639,7 +723,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, /* Set up the coordinate (except for cases where it was done above) */ if (ir->coordinate && !coordinate_done) { for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { - emit(MOV(sources[length], coordinate)); + bld.MOV(sources[length], coordinate); coordinate = offset(coordinate, 1); length++; } @@ -1651,9 +735,8 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, else mlen = length * reg_width; - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_F); - emit(LOAD_PAYLOAD(src_payload, sources, length)); + fs_reg payload = bld.natural_reg(BRW_REGISTER_TYPE_F, mlen); + bld.LOAD_PAYLOAD(payload, sources, length); /* Generate the SEND */ enum opcode opcode; @@ -1676,7 +759,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, default: unreachable("not reached"); } - fs_inst *inst = emit(opcode, dst, src_payload, sampler); + instruction *inst = bld.emit(opcode, dst, payload, sampler); inst->base_mrf = -1; inst->mlen = mlen; inst->header_present = header_present; @@ -1690,489 +773,22 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, return inst; } 
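The rewritten emit_texture_gen7() above shows the pattern this patch applies throughout: the visitor's emit(MOV(...)) helpers become calls on a builder object, each message parameter is staged in its own virtual register, LOAD_PAYLOAD fuses the pieces into one contiguous payload, and the final send-like opcode carries an mlen scaled by the SIMD width. A minimal sketch of that pattern, assuming only the brw::fs_builder calls visible in these hunks (bld.MOV(), bld.natural_reg(), bld.LOAD_PAYLOAD(), bld.emit(), bld.dispatch_width()); the helper name emit_payload_message and its signature are hypothetical, for illustration only:

/* Hypothetical helper: stage each parameter in its own vgrf, fuse them
 * into a contiguous payload with LOAD_PAYLOAD, then emit the message
 * opcode with mlen scaled by the SIMD width, as emit_texture_gen7()
 * does above.
 */
fs_inst *
fs_visitor::emit_payload_message(enum opcode op, const fs_reg &dst,
                                 const fs_reg *params, int count)
{
   const int reg_width = bld.dispatch_width() / 8; /* 1 in SIMD8, 2 in SIMD16 */
   const int mlen = count * reg_width;             /* message length in GRFs */
   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, count);

   for (int i = 0; i < count; i++) {
      sources[i] = fs_reg(this, glsl_type::float_type);
      bld.MOV(sources[i], params[i]);
   }

   fs_reg payload = bld.natural_reg(BRW_REGISTER_TYPE_F, mlen);
   bld.LOAD_PAYLOAD(payload, sources, count);

   fs_inst *inst = bld.emit(op, dst, payload);
   inst->base_mrf = -1;   /* send from the GRF file, not the MRF */
   inst->mlen = mlen;
   return inst;
}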
-fs_reg -fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate, - bool is_rect, uint32_t sampler, int texunit) -{ - fs_inst *inst = NULL; - bool needs_gl_clamp = true; - fs_reg scale_x, scale_y; - const struct brw_sampler_prog_key_data *tex = - (stage == MESA_SHADER_FRAGMENT) ? - &((brw_wm_prog_key*) this->key)->tex : NULL; - assert(tex); - - /* The 965 requires the EU to do the normalization of GL rectangle - * texture coordinates. We use the program parameter state - * tracking to get the scaling factor. - */ - if (is_rect && - (brw->gen < 6 || - (brw->gen >= 6 && (tex->gl_clamp_mask[0] & (1 << sampler) || - tex->gl_clamp_mask[1] & (1 << sampler))))) { - struct gl_program_parameter_list *params = prog->Parameters; - int tokens[STATE_LENGTH] = { - STATE_INTERNAL, - STATE_TEXRECT_SCALE, - texunit, - 0, - 0 - }; - - no16("rectangle scale uniform setup not supported on SIMD16\n"); - if (dispatch_width == 16) { - return coordinate; - } - - GLuint index = _mesa_add_state_reference(params, - (gl_state_index *)tokens); - /* Try to find existing copies of the texrect scale uniforms. */ - for (unsigned i = 0; i < uniforms; i++) { - if (stage_prog_data->param[i] == - &prog->Parameters->ParameterValues[index][0]) { - scale_x = fs_reg(UNIFORM, i); - scale_y = fs_reg(UNIFORM, i + 1); - break; - } - } - - /* If we didn't already set them up, do so now. */ - if (scale_x.file == BAD_FILE) { - scale_x = fs_reg(UNIFORM, uniforms); - scale_y = fs_reg(UNIFORM, uniforms + 1); - - stage_prog_data->param[uniforms++] = - &prog->Parameters->ParameterValues[index][0]; - stage_prog_data->param[uniforms++] = - &prog->Parameters->ParameterValues[index][1]; - } - } - - /* The 965 requires the EU to do the normalization of GL rectangle - * texture coordinates. We use the program parameter state - * tracking to get the scaling factor. - */ - if (brw->gen < 6 && is_rect) { - fs_reg dst = fs_reg(this, ir->coordinate->type); - fs_reg src = coordinate; - coordinate = dst; - - emit(MUL(dst, src, scale_x)); - dst = offset(dst, 1); - src = offset(src, 1); - emit(MUL(dst, src, scale_y)); - } else if (is_rect) { - /* On gen6+, the sampler handles the rectangle coordinates - * natively, without needing rescaling. But that means we have - * to do GL_CLAMP clamping at the [0, width], [0, height] scale, - * not [0, 1] like the default case below. - */ - needs_gl_clamp = false; - - for (int i = 0; i < 2; i++) { - if (tex->gl_clamp_mask[i] & (1 << sampler)) { - fs_reg chan = coordinate; - chan = offset(chan, i); - - inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)); - inst->conditional_mod = BRW_CONDITIONAL_G; - - /* Our parameter comes in as 1.0/width or 1.0/height, - * because that's what people normally want for doing - * texture rectangle handling. We need width or height - * for clamping, but we don't care enough to make a new - * parameter type, so just invert back. - */ - fs_reg limit = fs_reg(this, glsl_type::float_type); - emit(MOV(limit, i == 0 ? 
scale_x : scale_y)); - emit(SHADER_OPCODE_RCP, limit, limit); - - inst = emit(BRW_OPCODE_SEL, chan, chan, limit); - inst->conditional_mod = BRW_CONDITIONAL_L; - } - } - } - - if (ir->coordinate && needs_gl_clamp) { - for (unsigned int i = 0; - i < MIN2(ir->coordinate->type->vector_elements, 3); i++) { - if (tex->gl_clamp_mask[i] & (1 << sampler)) { - fs_reg chan = coordinate; - chan = offset(chan, i); - - fs_inst *inst = emit(MOV(chan, chan)); - inst->saturate = true; - } - } - } - return coordinate; -} - -/* Sample from the MCS surface attached to this multisample texture. */ -fs_reg -fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler) -{ - int reg_width = dispatch_width / 8; - int length = ir->coordinate->type->vector_elements; - fs_reg payload = fs_reg(GRF, alloc.allocate(length * reg_width), - BRW_REGISTER_TYPE_F); - fs_reg dest = fs_reg(this, glsl_type::uvec4_type); - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length); - - /* parameters are: u, v, r; missing parameters are treated as zero */ - for (int i = 0; i < length; i++) { - sources[i] = fs_reg(this, glsl_type::float_type); - emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate)); - coordinate = offset(coordinate, 1); - } - - emit(LOAD_PAYLOAD(payload, sources, length)); - - fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler); - inst->base_mrf = -1; - inst->mlen = length * reg_width; - inst->header_present = false; - inst->regs_written = 4 * reg_width; /* we only care about one reg of - * response, but the sampler always - * writes 4/8 - */ - - return dest; -} - -void -fs_visitor::visit(ir_texture *ir) -{ - const struct brw_sampler_prog_key_data *tex = - (stage == MESA_SHADER_FRAGMENT) ? - &((brw_wm_prog_key*) this->key)->tex : NULL; - assert(tex); - fs_inst *inst = NULL; - - uint32_t sampler = - _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog); - - ir_rvalue *nonconst_sampler_index = - _mesa_get_sampler_array_nonconst_index(ir->sampler); - - /* Handle non-constant sampler array indexing */ - fs_reg sampler_reg; - if (nonconst_sampler_index) { - /* The highest sampler which may be used by this operation is - * the last element of the array. Mark it here, because the generator - * doesn't have enough information to determine the bound. - */ - uint32_t array_size = ir->sampler->as_dereference_array() - ->array->type->array_size(); - - uint32_t max_used = sampler + array_size - 1; - if (ir->op == ir_tg4 && brw->gen < 8) { - max_used += stage_prog_data->binding_table.gather_texture_start; - } else { - max_used += stage_prog_data->binding_table.texture_start; - } - - brw_mark_surface_used(prog_data, max_used); - - /* Emit code to evaluate the actual indexing expression */ - nonconst_sampler_index->accept(this); - fs_reg temp(this, glsl_type::uint_type); - emit(ADD(temp, this->result, fs_reg(sampler))) - ->force_writemask_all = true; - sampler_reg = temp; - } else { - /* Single sampler, or constant array index; the indexing expression - * is just an immediate. - */ - sampler_reg = fs_reg(sampler); - } - - /* FINISHME: We're failing to recompile our programs when the sampler is - * updated. This only matters for the texture rectangle scale parameters - * (pre-gen6, or gen6+ with GL_CLAMP). - */ - int texunit = prog->SamplerUnits[sampler]; - - if (ir->op == ir_tg4) { - /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother - * emitting anything other than setting up the constant result. 
- */ - ir_constant *chan = ir->lod_info.component->as_constant(); - int swiz = GET_SWZ(tex->swizzles[sampler], chan->value.i[0]); - if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) { - - fs_reg res = fs_reg(this, glsl_type::vec4_type); - this->result = res; - - for (int i=0; i<4; i++) { - emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f))); - res = offset(res, 1); - } - return; - } - } - - /* Should be lowered by do_lower_texture_projection */ - assert(!ir->projector); - - /* Should be lowered */ - assert(!ir->offset || !ir->offset->type->is_array()); - - /* Generate code to compute all the subexpression trees. This has to be - * done before loading any values into MRFs for the sampler message since - * generating these values may involve SEND messages that need the MRFs. - */ - fs_reg coordinate; - if (ir->coordinate) { - ir->coordinate->accept(this); - - coordinate = rescale_texcoord(ir, this->result, - ir->sampler->type->sampler_dimensionality == - GLSL_SAMPLER_DIM_RECT, - sampler, texunit); - } - - fs_reg shadow_comparitor; - if (ir->shadow_comparitor) { - ir->shadow_comparitor->accept(this); - shadow_comparitor = this->result; - } - - fs_reg lod, lod2, sample_index, mcs; - switch (ir->op) { - case ir_tex: - case ir_lod: - case ir_tg4: - case ir_query_levels: - break; - case ir_txb: - ir->lod_info.bias->accept(this); - lod = this->result; - break; - case ir_txd: - ir->lod_info.grad.dPdx->accept(this); - lod = this->result; - - ir->lod_info.grad.dPdy->accept(this); - lod2 = this->result; - break; - case ir_txf: - case ir_txl: - case ir_txs: - ir->lod_info.lod->accept(this); - lod = this->result; - break; - case ir_txf_ms: - ir->lod_info.sample_index->accept(this); - sample_index = this->result; - - if (brw->gen >= 7 && tex->compressed_multisample_layout_mask & (1<<sampler)) - mcs = emit_mcs_fetch(ir, coordinate, sampler_reg); - else - mcs = fs_reg(0u); - break; - default: - unreachable("Unrecognized texture opcode"); - }; - - /* Writemasking doesn't eliminate channels on SIMD8 texture - * samples, so don't worry about them. 
- */ - fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1)); - - if (brw->gen >= 7) { - inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor, - lod, lod2, sample_index, mcs, sampler_reg); - } else if (brw->gen >= 5) { - inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor, - lod, lod2, sample_index, sampler); - } else { - inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor, - lod, lod2, sampler); - } - - if (ir->offset != NULL && ir->op != ir_txf) - inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant()); - - if (ir->op == ir_tg4) - inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17 - - if (ir->shadow_comparitor) - inst->shadow_compare = true; - - /* fixup #layers for cube map arrays */ - if (ir->op == ir_txs) { - glsl_type const *type = ir->sampler->type; - if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && - type->sampler_array) { - fs_reg depth = offset(dst, 2); - fs_reg fixed_depth = fs_reg(this, glsl_type::int_type); - emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6)); - - fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written); - int components = inst->regs_written / (dst.width / 8); - for (int i = 0; i < components; i++) { - if (i == 2) { - fixed_payload[i] = fixed_depth; - } else { - fixed_payload[i] = offset(dst, i); - } - } - emit(LOAD_PAYLOAD(dst, fixed_payload, components)); - } - } - - if (brw->gen == 6 && ir->op == ir_tg4) { - emit_gen6_gather_wa(tex->gen6_gather_wa[sampler], dst); - } - - swizzle_result(ir, dst, sampler); -} - -/** - * Apply workarounds for Gen6 gather with UINT/SINT - */ -void -fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst) -{ - if (!wa) - return; - - int width = (wa & WA_8BIT) ? 8 : 16; - - for (int i = 0; i < 4; i++) { - fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F); - /* Convert from UNORM to UINT */ - emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)))); - emit(MOV(dst, dst_f)); - - if (wa & WA_SIGN) { - /* Reinterpret the UINT value as a signed INT value by - * shifting the sign bit into place, then shifting back - * preserving sign. - */ - emit(SHL(dst, dst, fs_reg(32 - width))); - emit(ASR(dst, dst, fs_reg(32 - width))); - } - - dst = offset(dst, 1); - } -} - -/** - * Set up the gather channel based on the swizzle, for gather4. - */ -uint32_t -fs_visitor::gather_channel(ir_texture *ir, uint32_t sampler) -{ - const struct brw_sampler_prog_key_data *tex = - (stage == MESA_SHADER_FRAGMENT) ? - &((brw_wm_prog_key*) this->key)->tex : NULL; - assert(tex); - ir_constant *chan = ir->lod_info.component->as_constant(); - int swiz = GET_SWZ(tex->swizzles[sampler], chan->value.i[0]); - switch (swiz) { - case SWIZZLE_X: return 0; - case SWIZZLE_Y: - /* gather4 sampler is broken for green channel on RG32F -- - * we must ask for blue instead. - */ - if (tex->gather_channel_quirk_mask & (1<<sampler)) - return 2; - return 1; - case SWIZZLE_Z: return 2; - case SWIZZLE_W: return 3; - default: - unreachable("Not reached"); /* zero, one swizzles handled already */ - } -} - -/** - * Swizzle the result of a texture result. This is necessary for - * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons. 
- */ -void -fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, uint32_t sampler) -{ - if (ir->op == ir_query_levels) { - /* # levels is in .w */ - this->result = offset(orig_val, 3); - return; - } - - this->result = orig_val; - - /* txs,lod don't actually sample the texture, so swizzling the result - * makes no sense. - */ - if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4) - return; - - const struct brw_sampler_prog_key_data *tex = - (stage == MESA_SHADER_FRAGMENT) ? - &((brw_wm_prog_key*) this->key)->tex : NULL; - assert(tex); - - if (ir->type == glsl_type::float_type) { - /* Ignore DEPTH_TEXTURE_MODE swizzling. */ - assert(ir->sampler->type->sampler_shadow); - } else if (tex->swizzles[sampler] != SWIZZLE_NOOP) { - fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type); - - for (int i = 0; i < 4; i++) { - int swiz = GET_SWZ(tex->swizzles[sampler], i); - fs_reg l = swizzled_result; - l = offset(l, i); - - if (swiz == SWIZZLE_ZERO) { - emit(MOV(l, fs_reg(0.0f))); - } else if (swiz == SWIZZLE_ONE) { - emit(MOV(l, fs_reg(1.0f))); - } else { - emit(MOV(l, offset(orig_val, - GET_SWZ(tex->swizzles[sampler], i)))); - } - } - this->result = swizzled_result; - } -} - -void -fs_visitor::visit(ir_swizzle *ir) +fs_inst * +fs_visitor::emit_texture(ir_texture *ir, const fs_reg &dst, + const fs_reg &coordinate, const fs_reg &shadow_c, + const fs_reg &lod, const fs_reg &lod2, + const fs_reg &offset_val, const fs_reg &sample_index, + const fs_reg &mcs, const fs_reg &sampler) { - ir->val->accept(this); - fs_reg val = this->result; - - if (ir->type->vector_elements == 1) { - this->result = offset(this->result, ir->mask.x); - return; - } - - fs_reg result = fs_reg(this, ir->type); - this->result = result; - - for (unsigned int i = 0; i < ir->type->vector_elements; i++) { - fs_reg channel = val; - int swiz = 0; - - switch (i) { - case 0: - swiz = ir->mask.x; - break; - case 1: - swiz = ir->mask.y; - break; - case 2: - swiz = ir->mask.z; - break; - case 3: - swiz = ir->mask.w; - break; - } - - emit(MOV(result, offset(channel, swiz))); - result = offset(result, 1); - } + if (brw->gen >= 7) + return emit_texture_gen7(ir, dst, coordinate, shadow_c, lod, lod2, + offset_val, sample_index, mcs, sampler); + else if (brw->gen >= 5) + return emit_texture_gen5(ir, dst, coordinate, shadow_c, lod, lod2, + sample_index, sampler); + else + return emit_texture_gen4(ir, dst, coordinate, shadow_c, lod, lod2, + sampler); } void @@ -2187,8 +803,8 @@ fs_visitor::visit(ir_discard *ir) */ fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); - fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg, - BRW_CONDITIONAL_NZ)); + fs_inst *cmp = bld.CMP(bld.reg_null_f(), some_reg, some_reg, + BRW_CONDITIONAL_NZ); cmp->predicate = BRW_PREDICATE_NORMAL; cmp->flag_subreg = 1; @@ -2196,7 +812,7 @@ fs_visitor::visit(ir_discard *ir) /* For performance, after a discard, jump to the end of the shader. * Only jump if all relevant channels have been discarded. */ - fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP); + fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); discard_jump->flag_subreg = 1; discard_jump->predicate = (dispatch_width == 8) @@ -2206,292 +822,6 @@ fs_visitor::visit(ir_discard *ir) } } -void -fs_visitor::visit(ir_constant *ir) -{ - /* Set this->result to reg at the bottom of the function because some code - * paths will cause this visitor to be applied to other fields. This will - * cause the value stored in this->result to be modified. 
- * - * Make reg constant so that it doesn't get accidentally modified along the - * way. Yes, I actually had this problem. :( - */ - const fs_reg reg(this, ir->type); - fs_reg dst_reg = reg; - - if (ir->type->is_array()) { - const unsigned size = type_size(ir->type->fields.array); - - for (unsigned i = 0; i < ir->type->length; i++) { - ir->array_elements[i]->accept(this); - fs_reg src_reg = this->result; - - dst_reg.type = src_reg.type; - for (unsigned j = 0; j < size; j++) { - emit(MOV(dst_reg, src_reg)); - src_reg = offset(src_reg, 1); - dst_reg = offset(dst_reg, 1); - } - } - } else if (ir->type->is_record()) { - foreach_in_list(ir_constant, field, &ir->components) { - const unsigned size = type_size(field->type); - - field->accept(this); - fs_reg src_reg = this->result; - - dst_reg.type = src_reg.type; - for (unsigned j = 0; j < size; j++) { - emit(MOV(dst_reg, src_reg)); - src_reg = offset(src_reg, 1); - dst_reg = offset(dst_reg, 1); - } - } - } else { - const unsigned size = type_size(ir->type); - - for (unsigned i = 0; i < size; i++) { - switch (ir->type->base_type) { - case GLSL_TYPE_FLOAT: - emit(MOV(dst_reg, fs_reg(ir->value.f[i]))); - break; - case GLSL_TYPE_UINT: - emit(MOV(dst_reg, fs_reg(ir->value.u[i]))); - break; - case GLSL_TYPE_INT: - emit(MOV(dst_reg, fs_reg(ir->value.i[i]))); - break; - case GLSL_TYPE_BOOL: - emit(MOV(dst_reg, - fs_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue - : 0))); - break; - default: - unreachable("Non-float/uint/int/bool constant"); - } - dst_reg = offset(dst_reg, 1); - } - } - - this->result = reg; -} - -void -fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) -{ - ir_expression *expr = ir->as_expression(); - - if (!expr || expr->operation == ir_binop_ubo_load) { - ir->accept(this); - - fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - return; - } - - fs_reg op[3]; - fs_inst *inst; - - assert(expr->get_num_operands() <= 3); - for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - assert(expr->operands[i]->type->is_scalar()); - - expr->operands[i]->accept(this); - op[i] = this->result; - - resolve_ud_negate(&op[i]); - } - - switch (expr->operation) { - case ir_unop_logic_not: - inst = emit(AND(reg_null_d, op[0], fs_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_Z; - break; - - case ir_binop_logic_xor: - if (ctx->Const.UniformBooleanTrue == 1) { - fs_reg dst = fs_reg(this, glsl_type::uint_type); - emit(XOR(dst, op[0], op[1])); - inst = emit(AND(reg_null_d, dst, fs_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } else { - inst = emit(XOR(reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_binop_logic_or: - if (ctx->Const.UniformBooleanTrue == 1) { - fs_reg dst = fs_reg(this, glsl_type::uint_type); - emit(OR(dst, op[0], op[1])); - inst = emit(AND(reg_null_d, dst, fs_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } else { - inst = emit(OR(reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_binop_logic_and: - if (ctx->Const.UniformBooleanTrue == 1) { - fs_reg dst = fs_reg(this, glsl_type::uint_type); - emit(AND(dst, op[0], op[1])); - inst = emit(AND(reg_null_d, dst, fs_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } else { - inst = emit(AND(reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_unop_f2b: - if (brw->gen >= 6) { - emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ)); - } 
else { - inst = emit(MOV(reg_null_f, op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_unop_i2b: - if (brw->gen >= 6) { - emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ)); - } else { - inst = emit(MOV(reg_null_d, op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_binop_greater: - case ir_binop_gequal: - case ir_binop_less: - case ir_binop_lequal: - case ir_binop_equal: - case ir_binop_all_equal: - case ir_binop_nequal: - case ir_binop_any_nequal: - if (ctx->Const.UniformBooleanTrue == 1) { - resolve_bool_comparison(expr->operands[0], &op[0]); - resolve_bool_comparison(expr->operands[1], &op[1]); - } - - emit(CMP(reg_null_d, op[0], op[1], - brw_conditional_for_comparison(expr->operation))); - break; - - case ir_triop_csel: { - /* Expand the boolean condition into the flag register. */ - inst = emit(MOV(reg_null_d, op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* Select which boolean to return. */ - fs_reg temp(this, expr->operands[1]->type); - inst = emit(SEL(temp, op[1], op[2])); - inst->predicate = BRW_PREDICATE_NORMAL; - - /* Expand the result to a condition code. */ - inst = emit(MOV(reg_null_d, temp)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - } - - default: - unreachable("not reached"); - } -} - -/** - * Emit a gen6 IF statement with the comparison folded into the IF - * instruction. - */ -void -fs_visitor::emit_if_gen6(ir_if *ir) -{ - ir_expression *expr = ir->condition->as_expression(); - - if (expr && expr->operation != ir_binop_ubo_load) { - fs_reg op[3]; - fs_inst *inst; - fs_reg temp; - - assert(expr->get_num_operands() <= 3); - for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - assert(expr->operands[i]->type->is_scalar()); - - expr->operands[i]->accept(this); - op[i] = this->result; - } - - switch (expr->operation) { - case ir_unop_logic_not: - emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z)); - return; - - case ir_binop_logic_xor: - emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_logic_or: - temp = fs_reg(this, glsl_type::bool_type); - emit(OR(temp, op[0], op[1])); - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_logic_and: - temp = fs_reg(this, glsl_type::bool_type); - emit(AND(temp, op[0], op[1])); - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_unop_f2b: - inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - return; - - case ir_unop_i2b: - emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_greater: - case ir_binop_gequal: - case ir_binop_less: - case ir_binop_lequal: - case ir_binop_equal: - case ir_binop_all_equal: - case ir_binop_nequal: - case ir_binop_any_nequal: - if (ctx->Const.UniformBooleanTrue == 1) { - resolve_bool_comparison(expr->operands[0], &op[0]); - resolve_bool_comparison(expr->operands[1], &op[1]); - } - - emit(IF(op[0], op[1], - brw_conditional_for_comparison(expr->operation))); - return; - - case ir_triop_csel: { - /* Expand the boolean condition into the flag register. */ - fs_inst *inst = emit(MOV(reg_null_d, op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* Select which boolean to use as the result. 
*/ - fs_reg temp(this, expr->operands[1]->type); - inst = emit(SEL(temp, op[1], op[2])); - inst->predicate = BRW_PREDICATE_NORMAL; - - emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ)); - return; - } - - default: - unreachable("not reached"); - } - } - - ir->condition->accept(this); - emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ)); -} - /** * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL. * @@ -2558,19 +888,19 @@ fs_visitor::try_replace_with_sel() if (src0.file == IMM) { src0 = fs_reg(this, glsl_type::float_type); src0.type = then_mov->src[0].type; - emit(MOV(src0, then_mov->src[0])); + bld.MOV(src0, then_mov->src[0]); } fs_inst *sel; if (if_inst->conditional_mod) { /* Sandybridge-specific IF with embedded comparison */ - emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1], - if_inst->conditional_mod)); - sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]); + bld.CMP(bld.reg_null_d(), if_inst->src[0], if_inst->src[1], + if_inst->conditional_mod); + sel = bld.emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]); sel->predicate = BRW_PREDICATE_NORMAL; } else { /* Separate CMP and IF instructions */ - sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]); + sel = bld.emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]); sel->predicate = if_inst->predicate; sel->predicate_inverse = if_inst->predicate_inverse; } @@ -2578,165 +908,6 @@ fs_visitor::try_replace_with_sel() } void -fs_visitor::visit(ir_if *ir) -{ - if (brw->gen < 6) { - no16("Can't support (non-uniform) control flow on SIMD16\n"); - } - - /* Don't point the annotation at the if statement, because then it plus - * the then and else blocks get printed. - */ - this->base_ir = ir->condition; - - if (brw->gen == 6) { - emit_if_gen6(ir); - } else { - emit_bool_to_cond_code(ir->condition); - - emit(IF(BRW_PREDICATE_NORMAL)); - } - - foreach_in_list(ir_instruction, ir_, &ir->then_instructions) { - this->base_ir = ir_; - ir_->accept(this); - } - - if (!ir->else_instructions.is_empty()) { - emit(BRW_OPCODE_ELSE); - - foreach_in_list(ir_instruction, ir_, &ir->else_instructions) { - this->base_ir = ir_; - ir_->accept(this); - } - } - - emit(BRW_OPCODE_ENDIF); - - try_replace_with_sel(); -} - -void -fs_visitor::visit(ir_loop *ir) -{ - if (brw->gen < 6) { - no16("Can't support (non-uniform) control flow on SIMD16\n"); - } - - this->base_ir = NULL; - emit(BRW_OPCODE_DO); - - foreach_in_list(ir_instruction, ir_, &ir->body_instructions) { - this->base_ir = ir_; - ir_->accept(this); - } - - this->base_ir = NULL; - emit(BRW_OPCODE_WHILE); -} - -void -fs_visitor::visit(ir_loop_jump *ir) -{ - switch (ir->mode) { - case ir_loop_jump::jump_break: - emit(BRW_OPCODE_BREAK); - break; - case ir_loop_jump::jump_continue: - emit(BRW_OPCODE_CONTINUE); - break; - } -} - -void -fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir) -{ - ir_dereference *deref = static_cast<ir_dereference *>( - ir->actual_parameters.get_head()); - ir_variable *location = deref->variable_referenced(); - unsigned surf_index = (stage_prog_data->binding_table.abo_start + - location->data.binding); - - /* Calculate the surface offset */ - fs_reg offset(this, glsl_type::uint_type); - ir_dereference_array *deref_array = deref->as_dereference_array(); - - if (deref_array) { - deref_array->array_index->accept(this); - - fs_reg tmp(this, glsl_type::uint_type); - emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE))); - emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset))); - } else { - offset = fs_reg(location->data.atomic.offset); - } - 
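Before the removed code above emits the actual atomic operation, it locates the counter in its buffer: for an array dereference the byte offset is the run-time array index times the counter stride plus the declared binding offset, and the MUL/ADD pair builds exactly that expression in registers. A minimal scalar sketch of the same arithmetic, assuming ATOMIC_COUNTER_SIZE is the per-counter stride in bytes; the helper is hypothetical and only states the formula the emitted instructions evaluate:

/* Byte offset of counter[array_index] within its atomic buffer object.
 * The visitor emits MUL and ADD instructions for this because the index
 * is not generally known at compile time.
 */
static unsigned
atomic_counter_byte_offset(unsigned array_index, unsigned base_offset)
{
   return array_index * ATOMIC_COUNTER_SIZE + base_offset;
}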
- /* Emit the appropriate machine instruction */ - const char *callee = ir->callee->function_name(); - ir->return_deref->accept(this); - fs_reg dst = this->result; - - if (!strcmp("__intrinsic_atomic_read", callee)) { - emit_untyped_surface_read(surf_index, dst, offset); - - } else if (!strcmp("__intrinsic_atomic_increment", callee)) { - emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset, - fs_reg(), fs_reg()); - - } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) { - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset, - fs_reg(), fs_reg()); - } -} - -void -fs_visitor::visit(ir_call *ir) -{ - const char *callee = ir->callee->function_name(); - - if (!strcmp("__intrinsic_atomic_read", callee) || - !strcmp("__intrinsic_atomic_increment", callee) || - !strcmp("__intrinsic_atomic_predecrement", callee)) { - visit_atomic_counter_intrinsic(ir); - } else { - unreachable("Unsupported intrinsic."); - } -} - -void -fs_visitor::visit(ir_return *) -{ - unreachable("FINISHME"); -} - -void -fs_visitor::visit(ir_function *ir) -{ - /* Ignore function bodies other than main() -- we shouldn't see calls to - * them since they should all be inlined before we get to ir_to_mesa. - */ - if (strcmp(ir->name, "main") == 0) { - const ir_function_signature *sig; - exec_list empty; - - sig = ir->matching_signature(NULL, &empty, false); - - assert(sig); - - foreach_in_list(ir_instruction, ir_, &sig->body) { - this->base_ir = ir_; - ir_->accept(this); - } - } -} - -void -fs_visitor::visit(ir_function_signature *) -{ - unreachable("not reached"); -} - -void fs_visitor::visit(ir_emit_vertex *) { unreachable("not reached"); @@ -2748,129 +919,6 @@ fs_visitor::visit(ir_end_primitive *) unreachable("not reached"); } -void -fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - fs_reg dst, fs_reg offset, fs_reg src0, - fs_reg src1) -{ - bool uses_kill = - (stage == MESA_SHADER_FRAGMENT) && - ((brw_wm_prog_data*) this->prog_data)->uses_kill; - int reg_width = dispatch_width / 8; - int length = 0; - - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4); - - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - /* Initialize the sample mask in the message header. */ - emit(MOV(sources[0], fs_reg(0u))) - ->force_writemask_all = true; - - if (uses_kill) { - emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1))) - ->force_writemask_all = true; - } else { - emit(MOV(component(sources[0], 7), - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD))) - ->force_writemask_all = true; - } - length++; - - /* Set the atomic operation offset. */ - sources[1] = fs_reg(this, glsl_type::uint_type); - emit(MOV(sources[1], offset)); - length++; - - /* Set the atomic operation arguments. */ - if (src0.file != BAD_FILE) { - sources[length] = fs_reg(this, glsl_type::uint_type); - emit(MOV(sources[length], src0)); - length++; - } - - if (src1.file != BAD_FILE) { - sources[length] = fs_reg(this, glsl_type::uint_type); - emit(MOV(sources[length], src1)); - length++; - } - - int mlen = 1 + (length - 1) * reg_width; - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD); - emit(LOAD_PAYLOAD(src_payload, sources, length)); - - /* Emit the instruction. 
*/ - fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload, - fs_reg(atomic_op), fs_reg(surf_index)); - inst->mlen = mlen; -} - -void -fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst, - fs_reg offset) -{ - bool uses_kill = - (stage == MESA_SHADER_FRAGMENT) && - ((brw_wm_prog_data*) this->prog_data)->uses_kill; - int reg_width = dispatch_width / 8; - - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); - - sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); - /* Initialize the sample mask in the message header. */ - emit(MOV(sources[0], fs_reg(0u))) - ->force_writemask_all = true; - - if (uses_kill) { - emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1))) - ->force_writemask_all = true; - } else { - emit(MOV(component(sources[0], 7), - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD))) - ->force_writemask_all = true; - } - - /* Set the surface read offset. */ - sources[1] = fs_reg(this, glsl_type::uint_type); - emit(MOV(sources[1], offset)); - - int mlen = 1 + reg_width; - fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen), - BRW_REGISTER_TYPE_UD); - fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2)); - - /* Emit the instruction. */ - inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload, - fs_reg(surf_index)); - inst->mlen = mlen; -} - -fs_inst * -fs_visitor::emit(fs_inst *inst) -{ - if (force_uncompressed_stack > 0) - inst->exec_size = 8; - - if (dispatch_width == 16 && inst->exec_size == 8) - inst->force_uncompressed = true; - - inst->annotation = this->current_annotation; - inst->ir = this->base_ir; - - this->instructions.push_tail(inst); - - return inst; -} - -void -fs_visitor::emit(exec_list list) -{ - foreach_in_list_safe(fs_inst, inst, &list) { - inst->exec_node::remove(); - emit(inst); - } -} - /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ void fs_visitor::emit_dummy_fs() @@ -2878,13 +926,13 @@ fs_visitor::emit_dummy_fs() int reg_width = dispatch_width / 8; /* Everyone's favorite color. 
*/ - emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f))); - emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f))); - emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f))); - emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f))); + bld.MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)); + bld.MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)); + bld.MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)); + bld.MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)); fs_inst *write; - write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); + write = bld.emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0)); write->base_mrf = 2; write->mlen = 4 * reg_width; write->eot = true; @@ -2911,16 +959,16 @@ fs_visitor::interp_reg(int location, int channel) void fs_visitor::emit_interpolation_setup_gen4() { - this->current_annotation = "compute pixel centers"; + bld.set_annotation("compute pixel centers"); this->pixel_x = fs_reg(this, glsl_type::uint_type); this->pixel_y = fs_reg(this, glsl_type::uint_type); this->pixel_x.type = BRW_REGISTER_TYPE_UW; this->pixel_y.type = BRW_REGISTER_TYPE_UW; - emit(FS_OPCODE_PIXEL_X, this->pixel_x); - emit(FS_OPCODE_PIXEL_Y, this->pixel_y); + bld.emit(FS_OPCODE_PIXEL_X, this->pixel_x); + bld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y); - this->current_annotation = "compute pixel deltas from v0"; + bld.set_annotation("compute pixel deltas from v0"); if (brw->has_pln) { this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = fs_reg(this, glsl_type::vec2_type); @@ -2932,24 +980,24 @@ fs_visitor::emit_interpolation_setup_gen4() this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = fs_reg(this, glsl_type::float_type); } - emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))))); - emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], - this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))))); + bld.ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], + this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))); + bld.ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], + this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))); - this->current_annotation = "compute pos.w and 1/pos.w"; + bld.set_annotation("compute pos.w and 1/pos.w"); /* Compute wpos.w. It's always in our setup, since it's needed to * interpolate the other attributes. */ this->wpos_w = fs_reg(this, glsl_type::float_type); - emit(FS_OPCODE_LINTERP, wpos_w, + bld.emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], interp_reg(VARYING_SLOT_POS, 3)); /* Compute the pixel 1/W value from wpos.w. */ this->pixel_w = fs_reg(this, glsl_type::float_type); - emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); - this->current_annotation = NULL; + bld.emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); + bld.set_annotation(NULL); } /** Emits the interpolation for the varying inputs. */ @@ -2959,17 +1007,17 @@ fs_visitor::emit_interpolation_setup_gen6() struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); /* If the pixel centers end up used, the setup is the same as for gen4. 
*/ - this->current_annotation = "compute pixel centers"; + bld.set_annotation("compute pixel centers"); fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type); fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type); int_pixel_x.type = BRW_REGISTER_TYPE_UW; int_pixel_y.type = BRW_REGISTER_TYPE_UW; - emit(ADD(int_pixel_x, + bld.ADD(int_pixel_x, fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), - fs_reg(brw_imm_v(0x10101010)))); - emit(ADD(int_pixel_y, + fs_reg(brw_imm_v(0x10101010))); + bld.ADD(int_pixel_y, fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), - fs_reg(brw_imm_v(0x11001100)))); + fs_reg(brw_imm_v(0x11001100))); /* As of gen6, we can no longer mix float and int sources. We have * to turn the integer pixel centers into floats for their actual @@ -2977,13 +1025,13 @@ fs_visitor::emit_interpolation_setup_gen6() */ this->pixel_x = fs_reg(this, glsl_type::float_type); this->pixel_y = fs_reg(this, glsl_type::float_type); - emit(MOV(this->pixel_x, int_pixel_x)); - emit(MOV(this->pixel_y, int_pixel_y)); + bld.MOV(this->pixel_x, int_pixel_x); + bld.MOV(this->pixel_y, int_pixel_y); - this->current_annotation = "compute pos.w"; + bld.set_annotation("compute pos.w"); this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0)); this->wpos_w = fs_reg(this, glsl_type::float_type); - emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); + bld.emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) { uint8_t reg = payload.barycentric_coord_reg[i]; @@ -2991,7 +1039,7 @@ fs_visitor::emit_interpolation_setup_gen6() this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0)); } - this->current_annotation = NULL; + bld.set_annotation(NULL); } int @@ -3035,7 +1083,7 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components) if (colors_enabled & (1 << i)) { dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8), color.type, color.width); - inst = emit(MOV(dst[len], offset(color, i))); + inst = bld.MOV(dst[len], offset(color, i)); inst->saturate = key->clamp_fragment_color; } else if (color.width == 16) { /* We need two BAD_FILE slots for a 16-wide color */ @@ -3058,11 +1106,11 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components) for (unsigned i = 0; i < 4; ++i) { if (colors_enabled & (1 << i)) { dst[i] = fs_reg(GRF, alloc.allocate(1), color.type); - inst = emit(MOV(dst[i], half(offset(color, i), 0))); + inst = bld.MOV(dst[i], half(offset(color, i), 0)); inst->saturate = key->clamp_fragment_color; dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type); - inst = emit(MOV(dst[i + 4], half(offset(color, i), 1))); + inst = bld.MOV(dst[i + 4], half(offset(color, i), 1)); inst->saturate = key->clamp_fragment_color; inst->force_sechalf = true; } @@ -3101,7 +1149,7 @@ fs_visitor::emit_alpha_test() { assert(stage == MESA_SHADER_FRAGMENT); brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - this->current_annotation = "Alpha test"; + bld.set_annotation("Alpha test"); fs_inst *cmp; if (key->alpha_test_func == GL_ALWAYS) @@ -3111,15 +1159,15 @@ fs_visitor::emit_alpha_test() /* f0.1 = 0 */ fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); - cmp = emit(CMP(reg_null_f, some_reg, some_reg, - BRW_CONDITIONAL_NEQ)); + cmp = bld.CMP(bld.reg_null_f(), some_reg, some_reg, + BRW_CONDITIONAL_NEQ); } else { /* RT0 alpha */ fs_reg color = offset(outputs[0], 3); /* f0.1 &= func(color, ref) */ - cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref), - 
cond_for_alpha_func(key->alpha_test_func))); + cmp = bld.CMP(bld.reg_null_f(), color, fs_reg(key->alpha_test_ref), + cond_for_alpha_func(key->alpha_test_func)); } cmp->predicate = BRW_PREDICATE_NORMAL; cmp->flag_subreg = 1; @@ -3133,7 +1181,7 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; - this->current_annotation = "FB write header"; + bld.set_annotation("FB write header"); bool header_present = true; int reg_size = dispatch_width / 8; @@ -3163,22 +1211,22 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, if (payload.aa_dest_stencil_reg) { sources[length] = fs_reg(GRF, alloc.allocate(1)); - emit(MOV(sources[length], - fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)))); + bld.MOV(sources[length], + fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))); length++; } prog_data->uses_omask = prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK); if (prog_data->uses_omask) { - this->current_annotation = "FB write oMask"; + bld.set_annotation("FB write oMask"); assert(this->sample_mask.file != BAD_FILE); /* Hand over gl_SampleMask. Only lower 16 bits are relevant. Since * it's unsigned single words, one vgrf is always 16-wide. */ sources[length] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW, 16); - emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask); + bld.emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask); length++; } @@ -3192,7 +1240,7 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, if (src0_alpha.file != BAD_FILE) { sources[length] = fs_reg(GRF, alloc.allocate(reg_size), src0_alpha.type, src0_alpha.width); - fs_inst *inst = emit(MOV(sources[length], src0_alpha)); + fs_inst *inst = bld.MOV(sources[length], src0_alpha); inst->saturate = key->clamp_fragment_color; length++; } @@ -3217,19 +1265,19 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { /* Hand over gl_FragDepth. */ assert(this->frag_depth.file != BAD_FILE); - emit(MOV(sources[length], this->frag_depth)); + bld.MOV(sources[length], this->frag_depth); } else { /* Pass through the payload depth.
*/ - emit(MOV(sources[length], - fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); + bld.MOV(sources[length], + fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))); } length++; } if (payload.dest_depth_reg) { sources[length] = fs_reg(this, glsl_type::float_type); - emit(MOV(sources[length], - fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)))); + bld.MOV(sources[length], + fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))); length++; } @@ -3238,16 +1286,16 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1, if (brw->gen >= 7) { /* Send from the GRF */ fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F); - load = emit(LOAD_PAYLOAD(payload, sources, length)); + load = bld.LOAD_PAYLOAD(payload, sources, length); payload.reg = alloc.allocate(load->regs_written); load->dst = payload; - write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload); + write = bld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload); write->base_mrf = -1; } else { /* Send from the MRF */ - load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), - sources, length)); - write = emit(FS_OPCODE_FB_WRITE); + load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), + sources, length); + write = bld.emit(FS_OPCODE_FB_WRITE); write->base_mrf = 1; } @@ -3278,17 +1326,17 @@ fs_visitor::emit_fb_writes() if (INTEL_DEBUG & DEBUG_SHADER_TIME) emit_shader_time_end(); - this->current_annotation = ralloc_asprintf(this->mem_ctx, - "FB dual-source write"); + bld.set_annotation(ralloc_asprintf(this->mem_ctx, + "FB dual-source write")); inst = emit_single_fb_write(this->outputs[0], this->dual_src_output, reg_undef, 4); inst->target = 0; prog_data->dual_src_blend = true; } else if (key->nr_color_regions > 0) { for (int target = 0; target < key->nr_color_regions; target++) { - this->current_annotation = ralloc_asprintf(this->mem_ctx, + bld.set_annotation(ralloc_asprintf(this->mem_ctx, "FB write target %d", - target); + target)); fs_reg src0_alpha; if (brw->gen >= 6 && key->replicate_alpha && target != 0) src0_alpha = offset(outputs[0], 3); @@ -3315,32 +1363,7 @@ fs_visitor::emit_fb_writes() } inst->eot = true; - this->current_annotation = NULL; -} - -void -fs_visitor::resolve_ud_negate(fs_reg *reg) -{ - if (reg->type != BRW_REGISTER_TYPE_UD || - !reg->negate) - return; - - fs_reg temp = fs_reg(this, glsl_type::uint_type); - emit(MOV(temp, *reg)); - *reg = temp; -} - -void -fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg) -{ - assert(ctx->Const.UniformBooleanTrue == 1); - - if (rvalue->type != glsl_type::bool_type) - return; - - fs_reg temp = fs_reg(this, glsl_type::bool_type); - emit(AND(temp, *reg, fs_reg(1))); - *reg = temp; + bld.set_annotation(NULL); } fs_visitor::fs_visitor(struct brw_context *brw, @@ -3350,59 +1373,53 @@ fs_visitor::fs_visitor(struct brw_context *brw, struct gl_shader_program *shader_prog, struct gl_fragment_program *fp, unsigned dispatch_width) - : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base, - MESA_SHADER_FRAGMENT), - reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)), - reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)), - reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)), + : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base, mem_ctx, + MESA_SHADER_FRAGMENT, INTEL_DEBUG & DEBUG_WM, + prog_data->uses_kill, + brw::fs_builder(brw, mem_ctx, alloc, instructions, + dispatch_width), + (dispatch_width == 16 ? 
ST_FS16 : ST_FS8), + prog_data->base.nr_params), key(key), prog_data(&prog_data->base), dispatch_width(dispatch_width) { - this->mem_ctx = mem_ctx; init(); } void fs_visitor::init() { - this->failed = false; this->simd16_unsupported = false; this->no16_msg = NULL; - this->variable_ht = hash_table_ctor(0, - hash_table_pointer_hash, - hash_table_pointer_compare); memset(&this->payload, 0, sizeof(this->payload)); memset(this->outputs, 0, sizeof(this->outputs)); memset(this->output_components, 0, sizeof(this->output_components)); this->source_depth_to_render_target = false; this->runtime_check_aads_emit = false; - this->first_non_payload_grf = 0; - this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; - - this->current_annotation = NULL; - this->base_ir = NULL; - this->virtual_grf_start = NULL; - this->virtual_grf_end = NULL; this->live_intervals = NULL; this->regs_live_at_ip = NULL; - this->uniforms = 0; this->last_scratch = 0; this->pull_constant_loc = NULL; this->push_constant_loc = NULL; - this->force_uncompressed_stack = 0; - this->spilled_any_registers = false; this->do_dual_src = false; - - if (dispatch_width == 8) - this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params); } -fs_visitor::~fs_visitor() +fs_reg +fs_visitor::emit_untyped_surface_header() { - hash_table_dtor(this->variable_ht); + using namespace brw; + const fs_reg payload = half(bld.natural_reg(BRW_REGISTER_TYPE_UD), 0); + const fs_reg sample_mask = + (uses_kill ? brw_flag_reg(0, 1) : + retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)); + + exec_all(exec_half(0, bld.MOV(payload, fs_reg(0u)))); + exec_all(bld.MOV(component(payload, 7), sample_mask)); + + return payload; } diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index daedb35a88c..31582635056 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -47,7 +47,6 @@ public: fs_reg(fs_visitor *v, const struct glsl_type *type); bool equals(const fs_reg &r) const; - bool is_valid_3src() const; bool is_contiguous() const; fs_reg &apply_stride(unsigned stride); @@ -82,6 +81,18 @@ public: uint8_t stride; }; +namespace brw { + template<> + struct reg_traits<fs_reg> { + typedef fs_reg src_reg; + typedef fs_reg dst_reg; + + static const unsigned alloc_size = 1; + static const bool allows_swizzle = false; + static const bool allows_writemask = false; + }; +} + static inline fs_reg byte_offset(fs_reg reg, unsigned delta) { diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 2d5610b712d..cd495e8cb5f 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -30,7 +30,6 @@ namespace brw { -class vec4_visitor; class dst_reg; class src_reg : public backend_reg diff --git a/src/mesa/drivers/dri/i965/brw_ir_visitor.cpp b/src/mesa/drivers/dri/i965/brw_ir_visitor.cpp new file mode 100644 index 00000000000..3e67aeda0af --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_ir_visitor.cpp @@ -0,0 +1,190 @@ +/* + * Copyright © 2010-2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "brw_ir_visitor.h" +#include "brw_cfg.h" + +using namespace brw; + +base_visitor::base_visitor(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_stage_prog_data *stage_prog_data, + void *mem_ctx, + gl_shader_stage stage, + bool debug_flag, + unsigned uniform_array_size) + : brw(brw), + ctx(&brw->ctx), + shader(shader_prog ? + (struct brw_shader *)shader_prog->_LinkedShaders[stage] : NULL), + shader_prog(shader_prog), + prog(prog), + stage_prog_data(stage_prog_data), + mem_ctx(mem_ctx), + cfg(NULL), + stage(stage), + fail_msg(NULL), + debug_flag(debug_flag), + failed(false), + first_non_payload_grf(0), + max_grf(brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF), + virtual_grf_start(NULL), + virtual_grf_end(NULL), + uniforms(0), + uniform_size(rzalloc_array(mem_ctx, int, uniform_array_size)), + uniform_vector_size(rzalloc_array(mem_ctx, int, uniform_array_size)), + uniform_array_size(uniform_array_size) +{ +} + +void +base_visitor::dump_instructions() +{ + dump_instructions(NULL); +} + +void +base_visitor::dump_instructions(const char *name) +{ + FILE *file = stderr; + if (name && geteuid() != 0) { + file = fopen(name, "w"); + if (!file) + file = stderr; + } + + int ip = 0; + foreach_block_and_inst(block, backend_instruction, inst, cfg) { + if (!name) + fprintf(stderr, "%d: ", ip++); + dump_instruction(inst, file); + } + + if (file != stderr) { + fclose(file); + } +} + +void +base_visitor::calculate_cfg() +{ + if (this->cfg) + return; + cfg = new(mem_ctx) cfg_t(&this->instructions); +} + +void +base_visitor::invalidate_cfg() +{ + ralloc_free(this->cfg); + this->cfg = NULL; +} + +/** + * Sets up the starting offsets for the groups of binding table entries + * common to all pipeline stages. + * + * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're + * unused but also make sure that addition of small offsets to them will + * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
+ */ +void +base_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset) +{ + int num_textures = _mesa_fls(prog->SamplersUsed); + + stage_prog_data->binding_table.texture_start = next_binding_table_offset; + next_binding_table_offset += num_textures; + + if (shader) { + stage_prog_data->binding_table.ubo_start = next_binding_table_offset; + next_binding_table_offset += shader->base.NumUniformBlocks; + } else { + stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; + } + + if (INTEL_DEBUG & DEBUG_SHADER_TIME) { + stage_prog_data->binding_table.shader_time_start = next_binding_table_offset; + next_binding_table_offset++; + } else { + stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0; + } + + if (prog->UsesGather) { + if (brw->gen >= 8) { + stage_prog_data->binding_table.gather_texture_start = + stage_prog_data->binding_table.texture_start; + } else { + stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset; + next_binding_table_offset += num_textures; + } + } else { + stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0; + } + + if (shader_prog && shader_prog->NumAtomicBuffers) { + stage_prog_data->binding_table.abo_start = next_binding_table_offset; + next_binding_table_offset += shader_prog->NumAtomicBuffers; + } else { + stage_prog_data->binding_table.abo_start = 0xd0d0d0d0; + } + + /* This may or may not be used depending on how the compile goes. */ + stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset; + next_binding_table_offset++; + + assert(next_binding_table_offset <= BRW_MAX_SURFACES); + + /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */ +} + +void +base_visitor::vfail(const char *format, va_list va) +{ + char *msg; + + if (failed) + return; + + failed = true; + + msg = ralloc_vasprintf(mem_ctx, format, va); + msg = ralloc_asprintf(mem_ctx, "compile failed: %s\n", msg); + + this->fail_msg = msg; + + if (debug_flag) { + fprintf(stderr, "%s", msg); + } +} + +void +base_visitor::fail(const char *format, ...) +{ + va_list va; + + va_start(va, format); + vfail(format, va); + va_end(va); +} diff --git a/src/mesa/drivers/dri/i965/brw_ir_visitor.h b/src/mesa/drivers/dri/i965/brw_ir_visitor.h new file mode 100644 index 00000000000..876f162b91e --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_ir_visitor.h @@ -0,0 +1,2353 @@ +/* -*- c++ -*- */ +/* + * Copyright © 2010-2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef BRW_IR_VISITOR_H +#define BRW_IR_VISITOR_H + +#include "brw_ir_builder.h" +#include "brw_program.h" +#include "program/hash_table.h" +#include "glsl/ir_uniform.h" + +extern "C" { +#include "program/sampler.h" +} + +namespace brw { + +class base_visitor : public ir_visitor { +protected: + base_visitor(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_stage_prog_data *stage_prog_data, + void *mem_ctx, + gl_shader_stage stage, + bool debug_flag, + unsigned uniform_array_size); + +public: + struct brw_context *const brw; + struct gl_context *const ctx; + struct brw_shader *const shader; + struct gl_shader_program *const shader_prog; + struct gl_program *const prog; + struct brw_stage_prog_data *const stage_prog_data; + + /** ralloc context for temporary data used during compile */ + void *mem_ctx; + + /** + * List of either fs_inst or vec4_instruction (inheriting from + * backend_instruction) + */ + exec_list instructions; + + cfg_t *cfg; + + gl_shader_stage stage; + + virtual void dump_instruction(backend_instruction *inst) = 0; + virtual void dump_instruction(backend_instruction *inst, FILE *file) = 0; + virtual void dump_instructions(); + virtual void dump_instructions(const char *name); + + void calculate_cfg(); + void invalidate_cfg(); + + void assign_common_binding_table_offsets(uint32_t next_binding_table_offset); + + virtual void invalidate_live_intervals() = 0; + + void vfail(const char *msg, va_list args); + void fail(const char *msg, ...); + + char *fail_msg; + bool debug_flag; + bool failed; + + int first_non_payload_grf; + /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */ + unsigned max_grf; + int *virtual_grf_start; + int *virtual_grf_end; + + /** Number of uniform variable components visited. 
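+ * Counted in allocation slots: the visitor bumps this by
+ * CEILING(n, alloc_size) per vector written, so (assuming alloc_size is
+ * 4 in the vec4 backend and 1 in the scalar backend) a vec2 costs one
+ * slot in the former and two in the latter.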
*/ + unsigned uniforms; + int *uniform_size; + int *uniform_vector_size; + unsigned uniform_array_size; /*< Size of uniform_[vector_]size arrays */ +}; + +template<typename V, typename B> +class backend_visitor : public base_visitor { +protected: + typedef typename B::src_reg src_reg; + typedef typename B::dst_reg dst_reg; + typedef typename B::vector_builder::src_reg src_vector; + typedef typename B::vector_builder::dst_reg dst_vector; + typedef typename B::instruction instruction; + + V & + self() { + return static_cast<V &>(*this); + } + + backend_visitor(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct gl_program *prog, + struct brw_stage_prog_data *stage_prog_data, + void *mem_ctx, + gl_shader_stage stage, + bool debug_flag, + bool uses_kill, + const B &bld, + shader_time_shader_type st_type, + unsigned uniform_array_size) : + base_visitor(brw, shader_prog, prog, stage_prog_data, mem_ctx, stage, + debug_flag, uniform_array_size), + variable_ht(hash_table_ctor(0, + hash_table_pointer_hash, + hash_table_pointer_compare)), + bld(bld), uses_kill(uses_kill), st_type(st_type) + { + } + + ~backend_visitor() + { + hash_table_dtor(this->variable_ht); + } + + src_reg + visit_result(ir_instruction *ir) + { + ir->accept(this); + assert(this->result.file != BAD_FILE); + return this->result; + } + + unsigned + emit_constant_values(const dst_reg &dst, ir_constant *ir) + { + unsigned size = 0; + + if (ir->type->is_record()) { + foreach_in_list(ir_constant, field_value, &ir->components) + size += emit_constant_values(offset(dst, size), field_value); + + } else if (ir->type->is_array()) { + for (unsigned i = 0; i < ir->type->length; i++) + size += emit_constant_values(offset(dst, size), + ir->array_elements[i]); + + } else { + const int n = ir->type->vector_elements; + typename B::vector_builder vbld = bld.vector(); + + for (int j = 0; j < ir->type->matrix_columns; j++) { + dst_vector tmp = retype(offset(dst_vector_n(dst, 4), size), + brw_type_for_base_type(ir->type)); + unsigned mask = (1 << n) - 1; + + while (mask) { + const int i = ffs(mask) - 1; + + tmp.writemask = 1 << i; + + /* Find other components that match the one we're about to + * write. Emits fewer instructions for things like vec4(0.5, + * 1.5, 1.5, 1.5). + */ + for (int k = i + 1; k < n; k++) { + if (ir->type->base_type == GLSL_TYPE_BOOL) { + if (ir->value.b[j * n + i] == ir->value.b[j * n + k]) + tmp.writemask |= 1 << k; + } else { + /* u, i, and f storage all line up, so no need for a + * switch case for comparing each type. + */ + if (ir->value.u[j * n + i] == ir->value.u[j * n + k]) + tmp.writemask |= 1 << k; + } + } + + switch (ir->type->base_type) { + case GLSL_TYPE_FLOAT: + vbld.MOV(tmp, src_reg(ir->value.f[j * n + i])); + break; + case GLSL_TYPE_INT: + vbld.MOV(tmp, src_reg(ir->value.i[j * n + i])); + break; + case GLSL_TYPE_UINT: + vbld.MOV(tmp, src_reg(ir->value.u[j * n + i])); + break; + case GLSL_TYPE_BOOL: + vbld.MOV(tmp, src_reg(ir->value.b[j * n + i] ? 
+ ctx->Const.UniformBooleanTrue : 0)); + break; + default: + unreachable("Non-float/uint/int/bool constant"); + } + + mask &= ~tmp.writemask; + } + + size += CEILING(n, alloc_size); + } + } + + return size; + } + + void + visit(ir_constant *ir) + { + dst_reg dst = self().temporary_reg(ir->type); + emit_constant_values(dst, ir); + this->result = src_reg(dst); + } + + dst_reg * + variable_storage(ir_variable *var) + { + return (dst_reg *)hash_table_find(this->variable_ht, var); + } + + /* Our support for builtin uniforms is even scarier than non-builtin. + * It sits on top of the PROG_STATE_VAR parameters that are + * automatically updated from GL context state. + */ + void + setup_builtin_uniform_values(ir_variable *ir) + { + const ir_state_slot *const slots = ir->get_state_slots(); + + for (unsigned i = 0; i < ir->get_num_state_slots(); i++) { + /* This state reference has already been setup by ir_to_mesa, but we'll + * get the same index back here. + */ + int index = _mesa_add_state_reference(this->prog->Parameters, + (gl_state_index *)slots[i].tokens); + gl_constant_value *values = prog->Parameters->ParameterValues[index]; + const unsigned n = size_for_swizzle( + from_glsl_swizzle(WRITEMASK_XYZW, slots[i].swizzle)); + + /* Add each of the unique swizzles of the element as a parameter. + * This'll end up matching the expected layout of the + * array/matrix/structure we're trying to fill in. + */ + for (unsigned j = 0; j < MAX2(n, alloc_size); j++) + stage_prog_data->param[uniforms * alloc_size + j] = + &values[GET_SWZ(slots[i].swizzle, j)]; + + uniform_vector_size[uniforms] = n; + uniforms += CEILING(n, alloc_size); + } + } + + /* Our support for uniforms is piggy-backed on the struct + * gl_fragment_program, because that's where the values actually + * get stored, rather than in some global gl_shader_program uniform + * store. + */ + void + setup_uniform_values(ir_variable *ir) + { + int namelen = strlen(ir->name); + + /* The data for our (non-builtin) uniforms is stored in a series of + * gl_uniform_driver_storage structs for each subcomponent that + * glGetUniformLocation() could name. We know it's been set up in the same + * order we'd walk the type, so walk the list of storage and find anything + * with our name, or the prefix of a component that starts with our name. + */ + for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) { + struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; + + if (strncmp(ir->name, storage->name, namelen) != 0 || + (storage->name[namelen] != 0 && + storage->name[namelen] != '.' 
&& + storage->name[namelen] != '[')) { + continue; + } + + gl_constant_value *components = storage->storage; + unsigned vector_count = (MAX2(storage->array_elements, 1) * + storage->type->matrix_columns); + + for (unsigned s = 0; s < vector_count; s++) { + unsigned i; + assert(uniforms < uniform_array_size); + + for (i = 0; i < storage->type->vector_elements; i++) { + stage_prog_data->param[uniforms * alloc_size + i] = + &components[s * storage->type->vector_elements + i]; + } + for (; i < alloc_size; i++) { + static const gl_constant_value zero = { 0.0 }; + stage_prog_data->param[uniforms * alloc_size + i] = &zero; + } + + uniform_vector_size[uniforms] = storage->type->vector_elements; + uniforms += CEILING(storage->type->vector_elements, alloc_size); + } + } + } + + unsigned + type_vector_size(const struct glsl_type *type) + { + if (type->is_scalar() || type->is_vector() || type->is_matrix()) + return type->vector_elements; + else + return 4; + } + + void + visit(ir_variable *ir) + { + dst_reg *reg = NULL; + + if (variable_storage(ir)) + return; + + if (ir->data.mode == ir_var_auto || + ir->data.mode == ir_var_temporary) { + reg = new(mem_ctx) dst_reg(self().temporary_reg(ir->type)); + + } else if (ir->data.mode == ir_var_uniform) { + /* Thanks to the lower_ubo_reference pass, we will see only + * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO + * variables, so no need for them to be in variable_ht. + * + * Some uniforms, such as samplers and atomic counters, have no actual + * storage, so we should ignore them. + */ + if (ir->is_in_uniform_block() || type_size(ir->type) == 0) + return; + + if (bld.dispatch_width() == 16) { + fail("Failed to find uniform '%s' in SIMD16\n", ir->name); + return; + } + + reg = new(mem_ctx) dst_reg( + resize(retype(dst_reg(UNIFORM, this->uniforms), + brw_type_for_base_type(ir->type)), + type_vector_size(ir->type))); + + /* Track how big the whole uniform variable is, in case we need to put a + * copy of its data into pull constants for array access. + */ + assert(this->uniforms < uniform_array_size); + this->uniform_size[this->uniforms] = type_size(ir->type); + + if (!strncmp(ir->name, "gl_", 3)) { + setup_builtin_uniform_values(ir); + } else { + setup_uniform_values(ir); + } + + } else { + unreachable("not reached"); + } + + hash_table_insert(this->variable_ht, reg, ir); + } + + /** Walks an exec_list of ir_instruction and sends it through this visitor. 
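+ * Each top-level IR node is also installed as the builder's base_ir
+ * annotation, so the backend instructions emitted on its behalf can be
+ * traced back to it in the debug dumps.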
*/ + void + visit_instructions(const exec_list *list) + { + foreach_in_list(ir_instruction, ir, list) { + bld.set_base_ir(ir); + ir->accept(this); + } + } + + void + resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg) + { + assert(ctx->Const.UniformBooleanTrue == 1); + + if (rvalue->type != glsl_type::bool_type) + return; + + dst_reg temp = bld.natural_reg(BRW_REGISTER_TYPE_D); + bld.AND(temp, *reg, src_reg(1)); + *reg = src_reg(temp); + } + + void + visit(ir_dereference_variable *ir) + { + dst_reg *reg = variable_storage(ir->var); + + if (!reg) { + fail("Failed to find variable storage for %s\n", ir->var->name); + this->result = src_reg(bld.reg_null_d()); + return; + } + + this->result = resize(src_reg(*reg), type_vector_size(ir->type)); + } + + void + visit(ir_dereference_record *ir) + { + const glsl_type *struct_type = ir->record->type; + unsigned off = 0; + + ir->record->accept(this); + + for (unsigned i = 0; i < struct_type->length; i++) { + if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) + break; + off += type_size(struct_type->fields.structure[i].type); + } + + this->result = retype(resize(offset(this->result, off), + type_vector_size(ir->type)), + brw_type_for_base_type(ir->type)); + } + + virtual unsigned + get_array_stride(ir_dereference_array *ir) + { + /* Under normal circumstances array elements are stored consecutively, so + * the stride is equal to the size of the array element. + */ + return type_size(ir->type); + } + + void + visit(ir_dereference_array *ir) + { + ir_constant *constant_index = ir->array_index->constant_expression_value(); + src_reg src = retype(visit_result(ir->array), + brw_type_for_base_type(ir->type)); + + if (constant_index) { + src = offset(src, constant_index->value.i[0] * get_array_stride(ir)); + } else { + /* Variable index array dereference. We attach the variable index + * component to the reg as a pointer to a register containing the + * offset. Currently only uniform arrays are supported in this + * patch, and that reladdr pointer is resolved by + * move_uniform_array_access_to_pull_constants(). All other array + * types are lowered by lower_variable_index_to_cond_assign(). + */ + src_reg index_reg = visit_result(ir->array_index); + + if (get_array_stride(ir) != 1) { + dst_reg tmp = bld.scalar_reg(BRW_REGISTER_TYPE_D); + bld.MUL(tmp, index_reg, src_reg(get_array_stride(ir))); + index_reg = src_reg(tmp); + } + + if (src.reladdr) { + dst_reg tmp = bld.scalar_reg(BRW_REGISTER_TYPE_D); + bld.ADD(tmp, index_reg, *src.reladdr); + index_reg = src_reg(tmp); + } + + src.reladdr = new(mem_ctx) src_reg(index_reg); + } + + /* If the type is smaller than a vec4, replicate the last channel out. */ + this->result = resize(src, type_vector_size(ir->type)); + } + + /** + * Emit a gen6 IF statement with the comparison folded into the IF + * instruction. 
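+ *
+ * As a sketch (register names hypothetical): for "if (a < b)" this lets
+ * us emit the single instruction
+ *
+ *    IF.l null, a, b
+ *
+ * instead of the CMP + predicated IF pair that the generic
+ * emit_bool_to_cond_code() path produces.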
+ */ + void + emit_if_gen6(ir_if *ir) + { + ir_expression *expr = ir->condition->as_expression(); + + if (expr && expr->operation != ir_binop_ubo_load) { + bool is_scalar = true; + src_reg op[3]; + + assert(expr->get_num_operands() <= 3); + for (unsigned int i = 0; i < expr->get_num_operands(); i++) { + is_scalar &= expr->operands[i]->type->is_scalar(); + op[i] = visit_result(expr->operands[i]); + } + + switch (expr->operation) { + case ir_unop_logic_not: + bld.IF(op[0], src_reg(0), BRW_CONDITIONAL_Z); + return; + + case ir_binop_logic_xor: + bld.IF(op[0], op[1], BRW_CONDITIONAL_NZ); + return; + + case ir_binop_logic_or: { + dst_reg temp = bld.scalar_reg(BRW_REGISTER_TYPE_D); + bld.OR(temp, op[0], op[1]); + bld.IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ); + return; + } + case ir_binop_logic_and: { + dst_reg temp = bld.scalar_reg(BRW_REGISTER_TYPE_D); + bld.AND(temp, op[0], op[1]); + bld.IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ); + return; + } + case ir_unop_f2b: + exec_condmod(BRW_CONDITIONAL_NZ, + bld.emit(BRW_OPCODE_IF, bld.reg_null_f(), + op[0], src_reg(0))); + return; + + case ir_unop_i2b: + bld.IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ); + return; + + case ir_binop_greater: + case ir_binop_gequal: + case ir_binop_less: + case ir_binop_lequal: + case ir_binop_equal: + case ir_binop_nequal: + case ir_binop_all_equal: + case ir_binop_any_nequal: + if (ctx->Const.UniformBooleanTrue == 1) { + resolve_bool_comparison(expr->operands[0], &op[0]); + resolve_bool_comparison(expr->operands[1], &op[1]); + } + + if (is_scalar) { + bld.IF(op[0], op[1], + brw_conditional_for_comparison(expr->operation)); + } else { + bld.CMP(bld.reg_null_d(), op[0], op[1], + brw_conditional_for_comparison(expr->operation)); + bld.IF(expr->operation == ir_binop_all_equal ? + BRW_PREDICATE_ALIGN16_ALL4H : + BRW_PREDICATE_ALIGN16_ANY4H); + } + return; + + case ir_unop_any: + assert(!is_scalar); + bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ); + bld.IF(BRW_PREDICATE_ALIGN16_ANY4H); + return; + + case ir_triop_csel: { + /* Expand the boolean condition into the flag register. */ + exec_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.reg_null_d(), op[0])); + + /* Select which boolean to return. 
*/ + dst_reg temp = bld.scalar_reg(op[1].type); + exec_predicate(BRW_PREDICATE_NORMAL, + bld.emit(BRW_OPCODE_SEL, temp, op[1], op[2])); + bld.IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ); + return; + } + default: + unreachable("not reached"); + } + } + + bld.IF(visit_result(ir->condition), src_reg(0), BRW_CONDITIONAL_NZ); + } + + enum brw_predicate + emit_bool_to_cond_code(ir_rvalue *ir) + { + ir_expression *expr = ir->as_expression(); + enum brw_predicate predicate = BRW_PREDICATE_NORMAL; + + if (expr && expr->operation != ir_binop_ubo_load) { + bool is_scalar = true; + src_reg op[3]; + + assert(expr->get_num_operands() <= 3); + for (unsigned int i = 0; i < expr->get_num_operands(); i++) { + is_scalar &= expr->operands[i]->type->is_scalar(); + op[i] = bld.fix_condmod_negate(visit_result(expr->operands[i])); + } + + switch (expr->operation) { + case ir_unop_logic_not: + exec_condmod(BRW_CONDITIONAL_Z, + bld.AND(bld.reg_null_d(), op[0], src_reg(1))); + break; + + case ir_binop_logic_xor: + if (ctx->Const.UniformBooleanTrue == 1) { + dst_reg dst = bld.natural_reg(BRW_REGISTER_TYPE_UD); + bld.XOR(dst, op[0], op[1]); + exec_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.reg_null_d(), src_reg(dst), src_reg(1))); + } else { + exec_condmod(BRW_CONDITIONAL_NZ, + bld.XOR(bld.reg_null_d(), op[0], op[1])); + } + break; + + case ir_binop_logic_or: + if (ctx->Const.UniformBooleanTrue == 1) { + dst_reg dst = bld.natural_reg(BRW_REGISTER_TYPE_UD); + bld.OR(dst, op[0], op[1]); + exec_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.reg_null_d(), src_reg(dst), src_reg(1))); + } else { + exec_condmod(BRW_CONDITIONAL_NZ, + bld.OR(bld.reg_null_d(), op[0], op[1])); + } + break; + + case ir_binop_logic_and: + if (ctx->Const.UniformBooleanTrue == 1) { + dst_reg dst = bld.natural_reg(BRW_REGISTER_TYPE_UD); + bld.AND(dst, op[0], op[1]); + exec_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.reg_null_d(), src_reg(dst), src_reg(1))); + } else { + exec_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.reg_null_d(), op[0], op[1])); + } + break; + + case ir_unop_f2b: + if (brw->gen >= 6) + bld.CMP(bld.reg_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ); + else + exec_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.reg_null_f(), op[0])); + break; + + case ir_unop_i2b: + if (brw->gen >= 6) + bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ); + else + exec_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.reg_null_d(), op[0])); + break; + + case ir_binop_greater: + case ir_binop_gequal: + case ir_binop_less: + case ir_binop_lequal: + case ir_binop_equal: + case ir_binop_nequal: + case ir_binop_all_equal: + case ir_binop_any_nequal: + if (ctx->Const.UniformBooleanTrue == 1) { + resolve_bool_comparison(expr->operands[0], &op[0]); + resolve_bool_comparison(expr->operands[1], &op[1]); + } + + bld.CMP(bld.reg_null_d(), op[0], op[1], + brw_conditional_for_comparison(expr->operation)); + + if (!is_scalar) + predicate = (expr->operation == ir_binop_all_equal ? + BRW_PREDICATE_ALIGN16_ALL4H : + BRW_PREDICATE_ALIGN16_ANY4H); + break; + + case ir_unop_any: + assert(!is_scalar); + bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ); + predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + + case ir_triop_csel: { + /* Expand the boolean condition into the flag register. */ + exec_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.reg_null_d(), op[0])); + + /* Select which boolean to return. 
*/ + dst_reg temp = bld.natural_reg(op[1].type); + exec_predicate(BRW_PREDICATE_NORMAL, + bld.SEL(temp, op[1], op[2])); + + /* Expand the result to a condition code. */ + exec_condmod(BRW_CONDITIONAL_NZ, + bld.MOV(bld.reg_null_d(), src_reg(temp))); + break; + } + + default: + unreachable("not reached"); + } + } else { + exec_condmod(BRW_CONDITIONAL_NZ, + bld.AND(bld.reg_null_d(), visit_result(ir), src_reg(1))); + } + + return predicate; + } + + void + visit(ir_if *ir) + { + /* Don't point the annotation at the if statement, because then it plus + * the then and else blocks get printed. + */ + bld.set_base_ir(ir->condition); + + if (brw->gen == 6) { + emit_if_gen6(ir); + } else { + bld.IF(emit_bool_to_cond_code(ir->condition)); + } + + visit_instructions(&ir->then_instructions); + + if (!ir->else_instructions.is_empty()) { + bld.set_base_ir(ir->condition); + bld.emit(BRW_OPCODE_ELSE); + + visit_instructions(&ir->else_instructions); + } + + bld.set_base_ir(ir->condition); + bld.emit(BRW_OPCODE_ENDIF); + + self().try_replace_with_sel(); + } + + void + visit(ir_loop *ir) + { + if (brw->gen < 6) + self().no16("Can't support (non-uniform) control flow on SIMD16\n"); + + /* We don't want debugging output to print the whole body of the + * loop as the annotation. + */ + bld.set_base_ir(NULL); + bld.emit(BRW_OPCODE_DO); + + visit_instructions(&ir->body_instructions); + + bld.set_base_ir(NULL); + bld.emit(BRW_OPCODE_WHILE); + } + + void + visit(ir_loop_jump *ir) + { + switch (ir->mode) { + case ir_loop_jump::jump_break: + bld.emit(BRW_OPCODE_BREAK); + break; + case ir_loop_jump::jump_continue: + bld.emit(BRW_OPCODE_CONTINUE); + break; + } + } + + src_reg + get_timestamp() + { + assert(brw->gen >= 7); + dst_reg dst = bld.natural_reg(BRW_REGISTER_TYPE_UD); + + /* The caller wants the low 32 bits of the timestamp. Since it's running + * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, + * which is plenty of time for our purposes. It is identical across the + * EUs, but since it's tracking GPU core speed it will increment at a + * varying rate as render P-states change. + * + * The caller could also check if render P-states have changed (or anything + * else that might disrupt timing) by reading back subregister 2 and + * checking if that field is != 0. + */ + exec_all(bld.MOV(dst, brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_TIMESTAMP, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_4, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW))); + + return src_reg(dst); + } + + void + emit_shader_time_begin() + { + bld.set_annotation("shader time start"); + shader_start_time = get_timestamp(); + } + + void + emit_shader_time_end() + { + B ubld = bld.force_uncompressed(); + + bld.set_annotation("shader time end"); + src_reg shader_end_time = get_timestamp(); + + /* Check that there weren't any timestamp reset events (assuming these + * were the only two timestamp reads that happened). + */ + src_reg reset_end = component(shader_end_time, 3); + + exec_condmod(BRW_CONDITIONAL_Z, + bld.AND(bld.reg_null_d(), reset_end, src_reg(1u))); + bld.IF(BRW_PREDICATE_NORMAL); + + /* Take the current timestamp and get the delta. */ + dst_reg diff = bld.scalar_reg(BRW_REGISTER_TYPE_UD); + ubld.ADD(diff, component(negate(shader_start_time), 0), + component(shader_end_time, 0)); + + /* If there were no instructions between the two timestamp gets, the diff + * is 2 cycles. 
Remove that overhead, so I can forget about that when + * trying to determine the time taken for single instructions. + */ + ubld.ADD(diff, src_reg(diff), src_reg(-2u)); + + emit_shader_time_write(st_type, src_reg(diff)); + emit_shader_time_write(st_type + ST_WRITTEN, src_reg(1u)); + bld.emit(BRW_OPCODE_ELSE); + emit_shader_time_write(st_type + ST_RESET, src_reg(1u)); + bld.emit(BRW_OPCODE_ENDIF); + } + + void + emit_shader_time_write(int type, const src_reg &value) + { + B ubld = bld.force_uncompressed(); + const int shader_time_index = + brw_get_shader_time_index(brw, shader_prog, prog, + (enum shader_time_shader_type)type); + const dst_reg payload = bld.natural_reg(BRW_REGISTER_TYPE_UD, 2); + + ubld.MOV(payload, src_reg(shader_time_index * SHADER_TIME_STRIDE)); + ubld.MOV(offset(payload, 1), value); + ubld.emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(payload)); + } + + void + emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, + const dst_reg &dst, const src_reg &addr, + const src_reg &src0, const src_reg &src1) + { + const dst_reg payload = half(bld.natural_reg(BRW_REGISTER_TYPE_UD, 4), 0); + src_reg srcs[4]; + unsigned h, n = 0; + + /* Initialize the message header if necessary. */ + srcs[n] = self().emit_untyped_surface_header(); + n += h = (srcs[n].file == BAD_FILE ? 0 : 1); + + /* Set the atomic operation offset. */ + srcs[n] = src_reg(bld.natural_reg(BRW_REGISTER_TYPE_UD)); + bld.MOV(dst_reg(srcs[n++]), addr); + + /* Set the atomic operation arguments. */ + if (src0.file != BAD_FILE) { + srcs[n] = src_reg(bld.natural_reg(BRW_REGISTER_TYPE_UD)); + bld.MOV(dst_reg(srcs[n++]), src0); + } + + if (src1.file != BAD_FILE) { + srcs[n] = src_reg(bld.natural_reg(BRW_REGISTER_TYPE_UD)); + bld.MOV(dst_reg(srcs[n++]), src1); + } + + /* Emit the instruction. Note that this maps to the normal + * SIMD8 untyped atomic message on Ivy Bridge when we are doing + * SIMD4x2, but that's OK because unused channels will be masked + * out. + */ + bld.LOAD_PAYLOAD(payload, srcs, n); + bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_reg(payload), + src_reg(atomic_op), src_reg(surf_index)) + ->mlen = (n - h) * bld.dispatch_width() / 8 + h; + } + + void + emit_untyped_surface_read(unsigned surf_index, const dst_reg &dst, + const src_reg &addr) + { + const dst_reg payload = half(bld.natural_reg(BRW_REGISTER_TYPE_UD, 2), 0); + src_reg srcs[2]; + unsigned h, n = 0; + + /* Initialize the message header if necessary. */ + srcs[n] = self().emit_untyped_surface_header(); + n += h = (srcs[n].file == BAD_FILE ? 0 : 1); + + /* Set the surface read offset. */ + srcs[n] = src_reg(bld.natural_reg(BRW_REGISTER_TYPE_UD)); + bld.MOV(dst_reg(srcs[n++]), addr); + + /* Emit the instruction. Note that this maps to the normal + * SIMD8 untyped atomic message on Ivy Bridge when we are doing + * SIMD4x2, but that's OK because unused channels will be masked + * out. 
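+ *
+ * E.g. (a sketch; sizes depend on the dispatch mode) a headerless SIMD8
+ * read sends one payload GRF of per-channel offsets, so the mlen
+ * computed below works out to 0 + 8 / 8 = 1.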
+ */ + bld.LOAD_PAYLOAD(payload, srcs, n); + bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, + src_reg(payload), src_reg(surf_index)) + ->mlen = h + bld.dispatch_width() / 8; + } + + void + visit_atomic_counter_intrinsic(ir_call *ir) + { + ir_dereference *deref = static_cast<ir_dereference *>( + ir->actual_parameters.get_head()); + ir_variable *location = deref->variable_referenced(); + unsigned surf_index = (stage_prog_data->binding_table.abo_start + + location->data.binding); + + /* Calculate the surface offset */ + src_reg offset(bld.scalar_reg(BRW_REGISTER_TYPE_UD)); + ir_dereference_array *deref_array = deref->as_dereference_array(); + + if (deref_array) { + src_reg tmp(bld.scalar_reg(BRW_REGISTER_TYPE_UD)); + bld.MUL(dst_reg(tmp), visit_result(deref_array->array_index), + src_reg(ATOMIC_COUNTER_SIZE)); + bld.ADD(dst_reg(offset), tmp, src_reg(location->data.atomic.offset)); + } else { + offset = src_reg(location->data.atomic.offset); + } + + /* Emit the appropriate machine instruction */ + const char *callee = ir->callee->function_name(); + dst_reg dst(visit_result(ir->return_deref)); + + if (!strcmp("__intrinsic_atomic_read", callee)) { + emit_untyped_surface_read(surf_index, dst, offset); + + } else if (!strcmp("__intrinsic_atomic_increment", callee)) { + emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset, + src_reg(), src_reg()); + + } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) { + emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset, + src_reg(), src_reg()); + } + } + + void + visit(ir_call *ir) + { + const char *callee = ir->callee->function_name(); + + if (!strcmp("__intrinsic_atomic_read", callee) || + !strcmp("__intrinsic_atomic_increment", callee) || + !strcmp("__intrinsic_atomic_predecrement", callee)) { + visit_atomic_counter_intrinsic(ir); + } else { + unreachable("Unsupported intrinsic."); + } + } + + void + visit(ir_return *) + { + unreachable("FINISHME"); + } + + void + visit(ir_function_signature *) + { + unreachable("not reached"); + } + + void + visit(ir_function *ir) + { + /* Ignore function bodies other than main() -- we shouldn't see calls to + * them since they should all be inlined. + */ + if (strcmp(ir->name, "main") == 0) { + const ir_function_signature *sig; + exec_list empty; + + sig = ir->matching_signature(NULL, &empty, false); + assert(sig); + + visit_instructions(&sig->body); + } + } + + bool + try_emit_mad(ir_expression *ir) + { + /* 3-src instructions were introduced in gen6. */ + if (brw->gen < 6) + return false; + + /* MAD can only handle floating-point data. */ + if (ir->type->base_type != GLSL_TYPE_FLOAT) + return false; + + ir_rvalue *nonmul = ir->operands[1]; + ir_expression *mul = ir->operands[0]->as_expression(); + + if (!mul || mul->operation != ir_binop_mul) { + nonmul = ir->operands[0]; + mul = ir->operands[1]->as_expression(); + + if (!mul || mul->operation != ir_binop_mul) + return false; + } + + if (nonmul->as_constant() || + mul->operands[0]->as_constant() || + mul->operands[1]->as_constant()) + return false; + + dst_reg result = self().temporary_reg(ir->type); + bld.MAD(result, bld.fix_3src_operand(visit_result(nonmul)), + bld.fix_3src_operand(visit_result(mul->operands[0])), + bld.fix_3src_operand(visit_result(mul->operands[1]))); + + this->result = src_reg(result); + return true; + } + + /** + * Possibly returns an instruction that set up @param reg. 
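+ * (This is what lets try_emit_saturate() and try_rewrite_rhs_to_dst()
+ * fold a saturate or an assignment destination into the instruction
+ * that produced the value.)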
+ *
+ * Sometimes we want to take the result of some expression/variable
+ * dereference tree and rewrite the instruction generating the result
+ * of the tree.  When processing the tree, we know that the
+ * instructions generated are all writing temporaries that are dead
+ * outside of this tree.  So, if we have some instructions that write
+ * a temporary, we're free to point that temp write somewhere else.
+ *
+ * Note that this doesn't guarantee that the instruction generated
+ * only reg -- it might be the size=4 destination of a texture instruction.
+ */
+   instruction *
+   get_instruction_generating_reg(instruction *start,
+                                  instruction *end,
+                                  const src_vector &reg)
+   {
+      if (end == start ||
+          (end->predicate && end->opcode != BRW_OPCODE_SEL) ||
+          reg.reladdr || reg.abs || reg.negate ||
+          !is_identity_swizzle(get_writemask(end), reg.swizzle) ||
+          !storage(reg).equals(src_reg(end->dst))) {
+         return NULL;
+      } else {
+         return end;
+      }
+   }
+
+   bool
+   try_emit_saturate(ir_expression *ir)
+   {
+      instruction *pre_inst = (instruction *)this->instructions.get_tail();
+      src_reg src = visit_result(ir->operands[0]);
+      instruction *last_inst = (instruction *)this->instructions.get_tail();
+
+      /* If the last instruction from our accept() generated our
+       * src, just set the saturate flag instead of emitting a separate mov.
+       */
+      instruction *modify = get_instruction_generating_reg(
+         pre_inst, last_inst, src);
+
+      if (modify && modify->can_do_saturate() &&
+          get_writemask(modify) == (1u << ir->type->vector_elements) - 1) {
+         modify->saturate = true;
+         this->result = src;
+         return true;
+      }
+
+      return false;
+   }
+
+   bool
+   try_emit_b2f_of_compare(ir_expression *ir)
+   {
+      /* This optimization relies on CMP setting the destination to 0 when
+       * false.  Early hardware only sets the least significant bit, and
+       * leaves the other bits undefined.  So we can't use it.
+       */
+      if (brw->gen < 6)
+         return false;
+
+      ir_expression *const cmp = ir->operands[0]->as_expression();
+      if (cmp == NULL ||
+          !(cmp->operation == ir_binop_less ||
+            cmp->operation == ir_binop_greater ||
+            cmp->operation == ir_binop_lequal ||
+            cmp->operation == ir_binop_gequal ||
+            cmp->operation == ir_binop_equal ||
+            cmp->operation == ir_binop_nequal))
+         return false;
+
+      const src_reg src0 = visit_result(cmp->operands[0]);
+      const src_reg src1 = visit_result(cmp->operands[1]);
+
+      this->result = src_reg(self().temporary_reg(ir->type));
+      bld.CMP(dst_reg(this->result), src0, src1,
+              brw_conditional_for_comparison(cmp->operation));
+
+      /* If the comparison is false, this->result will just happen to be zero.
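+ *
+ * That is, for b2f(a < b) the emitted sequence is roughly (register
+ * names hypothetical)
+ *
+ *    CMP.l.f0.0  result, a, b
+ *    (-f0.0) SEL result, result, 1.0f
+ *
+ * so true lanes end up holding 1.0f and false lanes keep the 0 that
+ * CMP wrote.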
+ */ + exec_predicate_inv(BRW_PREDICATE_NORMAL, true, + bld.emit(BRW_OPCODE_SEL, dst_reg(this->result), + this->result, src_reg(1.0f))); + return true; + } + + /** + * Emit the correct dot-product instruction for the type of arguments + */ + void + emit_dp(const dst_reg &dst, const src_reg &src0, const src_reg &src1, + unsigned elements) + { + static enum opcode dot_opcodes[] = { + BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4 + }; + + bld.emit(dot_opcodes[elements - 2], dst, src0, src1); + } + + void + visit(ir_expression *ir) + { + unsigned int operand; + src_reg op[Elements(ir->operands)]; + dst_reg temp; + bool is_scalar = true; + + /* Deal with the real oddball stuff first */ + switch (ir->operation) { + case ir_binop_add: + if (try_emit_mad(ir)) + return; + break; + case ir_unop_b2f: + if (try_emit_b2f_of_compare(ir)) + return; + break; + case ir_unop_saturate: + if (try_emit_saturate(ir)) + return; + break; + case ir_unop_interpolate_at_centroid: + case ir_binop_interpolate_at_offset: + case ir_binop_interpolate_at_sample: + self().emit_interpolate_expression(ir); + return; + default: + break; + } + + for (operand = 0; operand < ir->get_num_operands(); operand++) { + is_scalar &= ir->operands[operand]->type->is_scalar(); + op[operand] = visit_result(ir->operands[operand]); + + /* Matrix expression operands should have been broken down to vector + * operations already. + */ + assert(!ir->operands[operand]->type->is_matrix()); + } + + /* Storage for our result. If our result goes into an assignment, it + * will just get copy-propagated out, so no worries. + */ + dst_reg result_dst = self().temporary_reg(ir->type); + this->result = src_reg(result_dst); + + switch (ir->operation) { + case ir_unop_logic_not: + if (ctx->Const.UniformBooleanTrue != 1) { + bld.NOT(result_dst, op[0]); + } else { + bld.XOR(result_dst, op[0], src_reg(1)); + } + break; + case ir_unop_neg: + op[0].negate = !op[0].negate; + bld.MOV(result_dst, op[0]); + break; + case ir_unop_abs: + op[0].abs = true; + op[0].negate = false; + bld.MOV(result_dst, op[0]); + break; + case ir_unop_sign: + if (ir->type->is_float()) { + /* AND(val, 0x80000000) gives the sign bit. + * + * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not + * zero. + */ + bld.CMP(bld.reg_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ); + + op[0].type = BRW_REGISTER_TYPE_UD; + result_dst.type = BRW_REGISTER_TYPE_UD; + bld.AND(result_dst, op[0], src_reg(0x80000000u)); + + exec_predicate(BRW_PREDICATE_NORMAL, + bld.OR(result_dst, src_reg(result_dst), + src_reg(0x3f800000u))); + this->result.type = BRW_REGISTER_TYPE_F; + } else { + /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). + * -> non-negative val generates 0x00000000. + * Predicated OR sets 1 if val is positive. 
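+ *
+ * E.g. sign(-3): the flag is clear and ASR yields -1, so the OR is
+ * skipped; sign(5): ASR yields 0 and the predicated OR turns it into 1;
+ * sign(0) keeps the 0 from ASR.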
+ */
+         bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G);
+
+         bld.ASR(result_dst, op[0], src_reg(31));
+
+         exec_predicate(BRW_PREDICATE_NORMAL,
+                        bld.OR(result_dst, this->result, src_reg(1)));
+      }
+      break;
+   case ir_unop_rcp:
+      bld.emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
+      break;
+   case ir_unop_exp2:
+      bld.emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
+      break;
+   case ir_unop_log2:
+      bld.emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
+      break;
+   case ir_unop_exp:
+   case ir_unop_log:
+      unreachable("not reached: should be handled by ir_explog_to_explog2");
+
+   case ir_unop_sin:
+   case ir_unop_sin_reduced:
+      bld.emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
+      break;
+   case ir_unop_cos:
+   case ir_unop_cos_reduced:
+      bld.emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
+      break;
+   case ir_unop_dFdx:
+      bld.emit(FS_OPCODE_DDX, result_dst, op[0], src_reg(BRW_DERIVATIVE_BY_HINT));
+      break;
+   case ir_unop_dFdx_coarse:
+      bld.emit(FS_OPCODE_DDX, result_dst, op[0], src_reg(BRW_DERIVATIVE_COARSE));
+      break;
+   case ir_unop_dFdx_fine:
+      bld.emit(FS_OPCODE_DDX, result_dst, op[0], src_reg(BRW_DERIVATIVE_FINE));
+      break;
+   case ir_unop_dFdy:
+      bld.emit(FS_OPCODE_DDY, result_dst, op[0], src_reg(BRW_DERIVATIVE_BY_HINT));
+      break;
+   case ir_unop_dFdy_coarse:
+      bld.emit(FS_OPCODE_DDY, result_dst, op[0], src_reg(BRW_DERIVATIVE_COARSE));
+      break;
+   case ir_unop_dFdy_fine:
+      bld.emit(FS_OPCODE_DDY, result_dst, op[0], src_reg(BRW_DERIVATIVE_FINE));
+      break;
+   case ir_binop_add:
+      bld.ADD(result_dst, op[0], op[1]);
+      break;
+   case ir_binop_sub:
+      unreachable("not reached: should be handled by ir_sub_to_add_neg");
+   case ir_binop_mul:
+      if (brw->gen < 8 && ir->type->is_integer()) {
+         /* For integer multiplication, the MUL uses the low 16 bits
+          * of one of the operands (src0 on gen6, src1 on gen7).  The
+          * MACH accumulates in the contribution of the upper 16 bits
+          * of that operand.
+          */
+         if (ir->operands[0]->is_uint16_constant()) {
+            if (brw->gen < 7)
+               bld.MUL(result_dst, op[0], op[1]);
+            else
+               bld.MUL(result_dst, op[1], op[0]);
+         } else if (ir->operands[1]->is_uint16_constant()) {
+            if (brw->gen < 7)
+               bld.MUL(result_dst, op[1], op[0]);
+            else
+               bld.MUL(result_dst, op[0], op[1]);
+         } else {
+            if (brw->gen >= 7)
+               self().no16("SIMD16 explicit accumulator operands unsupported\n");
+
+            dst_reg acc(retype(brw_acc_reg(bld.dispatch_width()),
+                               this->result.type));
+
+            bld.MUL(acc, op[0], op[1]);
+            bld.MACH(bld.reg_null_d(), op[0], op[1]);
+            bld.MOV(result_dst, src_reg(acc));
+         }
+      } else {
+         bld.MUL(result_dst, op[0], op[1]);
+      }
+      break;
+   case ir_binop_imul_high: {
+      if (brw->gen == 7)
+         self().no16("SIMD16 explicit accumulator operands unsupported\n");
+
+      dst_reg acc(retype(brw_acc_reg(bld.dispatch_width()),
+                         this->result.type));
+
+      instruction *mul = bld.MUL(acc, op[0], op[1]);
+      bld.MACH(result_dst, op[0], op[1]);
+
+      /* Until Gen8, integer multiplies read 32 bits from one source and
+       * 16 bits from the other, relying on the MACH instruction to
+       * generate the high bits of the result.
+       *
+       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
+       * but in order to do a 32x32-bit multiply that returns the high 32
+       * bits we have to simulate the previous behavior and then use a
+       * MACH instruction.
+       *
+       * FINISHME: Don't use source modifiers on src1.
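+ *
+ * (Retyping src1 from D/UD to W/UW below makes the Gen8 MUL read only
+ * the low 16 bits of that operand, mimicking the pre-Gen8 behavior
+ * that the MACH sequence expects.)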
+ */ + if (brw->gen >= 8) { + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + if (mul->src[1].type == BRW_REGISTER_TYPE_D) { + mul->src[1].type = BRW_REGISTER_TYPE_W; + } else { + mul->src[1].type = BRW_REGISTER_TYPE_UW; + } + } + break; + } + case ir_binop_div: + /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */ + assert(ir->type->is_integer()); + bld.emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]); + break; + case ir_binop_carry: { + if (brw->gen == 7) + self().no16("SIMD16 explicit accumulator operands unsupported\n"); + + src_reg acc(retype(brw_acc_reg(bld.dispatch_width()), + BRW_REGISTER_TYPE_UD)); + + bld.ADDC(bld.reg_null_ud(), op[0], op[1]); + bld.MOV(result_dst, acc); + break; + } + case ir_binop_borrow: { + if (brw->gen == 7) + self().no16("SIMD16 explicit accumulator operands unsupported\n"); + + src_reg acc(retype(brw_acc_reg(bld.dispatch_width()), + BRW_REGISTER_TYPE_UD)); + + bld.SUBB(bld.reg_null_ud(), op[0], op[1]); + bld.MOV(result_dst, acc); + break; + } + case ir_binop_mod: + /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */ + assert(ir->type->is_integer()); + bld.emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]); + break; + case ir_binop_less: + case ir_binop_greater: + case ir_binop_lequal: + case ir_binop_gequal: + case ir_binop_equal: + case ir_binop_nequal: + case ir_binop_all_equal: + case ir_binop_any_nequal: + if (ctx->Const.UniformBooleanTrue == 1) { + resolve_bool_comparison(ir->operands[0], &op[0]); + resolve_bool_comparison(ir->operands[1], &op[1]); + } + + if (!is_scalar && ir->type->is_scalar()) { + bld.CMP(bld.reg_null_d(), op[0], op[1], + brw_conditional_for_comparison(ir->operation)); + bld.MOV(result_dst, src_reg(0)); + exec_predicate((ir->operation == ir_binop_all_equal ? 
+ BRW_PREDICATE_ALIGN16_ALL4H : + BRW_PREDICATE_ALIGN16_ANY4H), + bld.MOV(result_dst, + src_reg(ctx->Const.UniformBooleanTrue))); + } else { + bld.CMP(result_dst, op[0], op[1], + brw_conditional_for_comparison(ir->operation)); + } + break; + case ir_unop_any: + bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ); + bld.MOV(result_dst, src_reg(0)); + exec_predicate(BRW_PREDICATE_ALIGN16_ANY4H, + bld.MOV(result_dst, + src_reg(ctx->Const.UniformBooleanTrue))); + break; + case ir_binop_logic_xor: + bld.XOR(result_dst, op[0], op[1]); + break; + case ir_binop_logic_or: + bld.OR(result_dst, op[0], op[1]); + break; + case ir_binop_logic_and: + bld.AND(result_dst, op[0], op[1]); + break; + case ir_binop_dot: + assert(ir->operands[0]->type->is_vector()); + assert(ir->operands[0]->type == ir->operands[1]->type); + emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements); + break; + case ir_unop_noise: + unreachable("not reached: should be handled by lower_noise"); + + case ir_quadop_vector: + unreachable("not reached: should be handled by lower_quadop_vector"); + + case ir_binop_vector_extract: + unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()"); + + case ir_triop_vector_insert: + unreachable("not reached: should be handled by lower_vector_insert()"); + + case ir_binop_ldexp: + unreachable("not reached: should be handled by ldexp_to_arith()"); + + case ir_unop_sqrt: + bld.emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]); + break; + case ir_unop_rsq: + bld.emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]); + break; + case ir_unop_bitcast_i2f: + case ir_unop_bitcast_u2f: + op[0].type = BRW_REGISTER_TYPE_F; + this->result = op[0]; + break; + case ir_unop_i2u: + case ir_unop_bitcast_f2u: + op[0].type = BRW_REGISTER_TYPE_UD; + this->result = op[0]; + break; + case ir_unop_u2i: + case ir_unop_bitcast_f2i: + op[0].type = BRW_REGISTER_TYPE_D; + this->result = op[0]; + break; + case ir_unop_i2f: + case ir_unop_u2f: + case ir_unop_f2i: + case ir_unop_f2u: + bld.MOV(result_dst, op[0]); + break; + case ir_unop_b2i: + bld.AND(result_dst, op[0], src_reg(1)); + break; + case ir_unop_b2f: + if (ctx->Const.UniformBooleanTrue != 1) { + op[0].type = BRW_REGISTER_TYPE_UD; + result_dst.type = BRW_REGISTER_TYPE_UD; + bld.AND(result_dst, op[0], src_reg(0x3f800000u)); + this->result.type = BRW_REGISTER_TYPE_F; + } else { + temp = self().temporary_reg(ir->operands[0]->type); + bld.AND(temp, op[0], src_reg(1)); + bld.MOV(result_dst, src_reg(temp)); + } + break; + case ir_unop_f2b: + bld.CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ); + break; + case ir_unop_i2b: + bld.CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ); + break; + case ir_unop_trunc: + bld.RNDZ(result_dst, op[0]); + break; + case ir_unop_ceil: + op[0].negate = !op[0].negate; + bld.RNDD(result_dst, op[0]); + this->result.negate = true; + break; + case ir_unop_floor: + bld.RNDD(result_dst, op[0]); + break; + case ir_unop_fract: + bld.FRC(result_dst, op[0]); + break; + case ir_unop_round_even: + bld.RNDE(result_dst, op[0]); + break; + case ir_binop_min: + case ir_binop_max: + bld.emit_minmax(ir->operation == ir_binop_min ? 
+ BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE, + result_dst, op[0], op[1]); + break; + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_snorm_4x8: + case ir_unop_pack_unorm_2x16: + case ir_unop_pack_unorm_4x8: + case ir_unop_unpack_snorm_2x16: + case ir_unop_unpack_snorm_4x8: + case ir_unop_unpack_unorm_2x16: + case ir_unop_unpack_unorm_4x8: + unreachable("not reached: should be handled by lower_packing_builtins"); + case ir_unop_pack_half_2x16: + self().emit_pack_half_2x16(result_dst, op[0]); + break; + case ir_unop_unpack_half_2x16: + self().emit_unpack_half_2x16(result_dst, op[0]); + break; + case ir_unop_unpack_half_2x16_split_x: + bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result_dst, op[0]); + break; + case ir_unop_unpack_half_2x16_split_y: + bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result_dst, op[0]); + break; + case ir_binop_pow: + bld.emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]); + break; + case ir_unop_bitfield_reverse: + bld.BFREV(result_dst, op[0]); + break; + case ir_unop_bit_count: + bld.CBIT(result_dst, op[0]); + break; + case ir_unop_find_msb: + temp = retype(self().temporary_reg(ir->type), + BRW_REGISTER_TYPE_UD); + bld.FBH(temp, op[0]); + + /* FBH counts from the MSB side, while GLSL's findMSB() wants the count + * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then + * subtract the result from 31 to convert the MSB count into an LSB count. + */ + + /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */ + bld.MOV(result_dst, src_reg(temp)); + bld.CMP(bld.reg_null_d(), this->result, src_reg(-1), BRW_CONDITIONAL_NZ); + exec_predicate(BRW_PREDICATE_NORMAL, + bld.ADD(result_dst, negate(src_reg(temp)), + src_reg(31))); + break; + case ir_unop_find_lsb: + bld.FBL(result_dst, op[0]); + break; + case ir_unop_saturate: + bld.MOV(result_dst, op[0]) + ->saturate = true; + break; + case ir_triop_bitfield_extract: + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + bld.BFE(result_dst, bld.fix_3src_operand(op[2]), + bld.fix_3src_operand(op[1]), + bld.fix_3src_operand(op[0])); + break; + case ir_binop_bfm: + bld.BFI1(result_dst, op[0], op[1]); + break; + case ir_triop_bfi: + bld.BFI2(result_dst, bld.fix_3src_operand(op[0]), + bld.fix_3src_operand(op[1]), + bld.fix_3src_operand(op[2])); + break; + case ir_quadop_bitfield_insert: + unreachable("not reached: should be handled by " + "lower_instructions::bitfield_insert_to_bfm_bfi"); + + case ir_unop_bit_not: + bld.NOT(result_dst, op[0]); + break; + case ir_binop_bit_and: + bld.AND(result_dst, op[0], op[1]); + break; + case ir_binop_bit_xor: + bld.XOR(result_dst, op[0], op[1]); + break; + case ir_binop_bit_or: + bld.OR(result_dst, op[0], op[1]); + break; + case ir_binop_lshift: + bld.SHL(result_dst, op[0], op[1]); + break; + case ir_binop_rshift: + if (ir->type->base_type == GLSL_TYPE_INT) + bld.ASR(result_dst, op[0], op[1]); + else + bld.SHR(result_dst, op[0], op[1]); + break; + case ir_binop_pack_half_2x16_split: + bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result_dst, op[0], op[1]); + break; + case ir_binop_ubo_load: { + /* This IR node takes a constant uniform block and a constant or + * variable byte offset within the block and loads a vector from that. + */ + ir_constant *const_uniform_block = ir->operands[0]->as_constant(); + ir_constant *const_offset = ir->operands[1]->as_constant(); + src_reg surf_index; + + if (const_uniform_block) { + /* The block index is a constant, so just emit the binding table entry + * as an immediate. 
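+ * (E.g., hypothetically, a constant block index of 2 with
+ * ubo_start == 8 simply becomes the immediate surface index 10.)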
+ */ + surf_index = src_reg(stage_prog_data->binding_table.ubo_start + + const_uniform_block->value.u[0]); + } else { + /* The block index is not a constant. Evaluate the index expression + * per-channel and add the base UBO index; the generator will select + * a value from any live channel. + */ + surf_index = src_reg(bld.scalar_reg(BRW_REGISTER_TYPE_UD)); + exec_all(bld.ADD(dst_reg(surf_index), op[0], + src_reg(stage_prog_data->binding_table.ubo_start))); + + /* Assume this may touch any UBO. It would be nice to provide + * a tighter bound, but the array information is already lowered away. + */ + brw_mark_surface_used(stage_prog_data, + stage_prog_data->binding_table.ubo_start + + shader_prog->NumUniformBlocks - 1); + } + + if (const_offset) { + self().emit_pull_constant_load(bld, result_dst, surf_index, + const_offset->value.u[0], NULL, + ir->type->vector_elements); + } else { + src_reg reladdr(bld.scalar_reg(BRW_REGISTER_TYPE_D)); + + /* Turn the byte offset into alloc_size units. */ + bld.SHR(dst_reg(reladdr), op[1], src_reg(alloc_size == 4 ? 4 : 2)); + + self().emit_pull_constant_load(bld, result_dst, surf_index, 0, + &reladdr, ir->type->vector_elements); + } + + if (ir->type->base_type == GLSL_TYPE_BOOL) { + for (unsigned i = 0; i < CEILING(ir->type->vector_elements, + alloc_size); i++) { + /* UBO bools are any nonzero value. We consider bools to be + * values with the low bit set to 1. Convert them using CMP. + */ + bld.CMP(offset(result_dst, i), offset(result, i), + src_reg(0u), BRW_CONDITIONAL_NZ); + } + } + break; + } + case ir_triop_fma: + /* Note that the instruction's argument order is reversed from GLSL + * and the IR. + */ + bld.MAD(result_dst, bld.fix_3src_operand(op[2]), + bld.fix_3src_operand(op[1]), + bld.fix_3src_operand(op[0])); + break; + case ir_triop_lrp: + bld.LRP(result_dst, op[0], op[1], op[2]); + break; + case ir_triop_csel: + bld.CMP(bld.reg_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ); + exec_predicate(BRW_PREDICATE_NORMAL, + bld.emit(BRW_OPCODE_SEL, result_dst, op[1], op[2])); + break; + case ir_unop_interpolate_at_centroid: + case ir_binop_interpolate_at_offset: + case ir_binop_interpolate_at_sample: + unreachable("already handled above"); + break; + } + } + + void + visit(ir_swizzle *ir) + { + const unsigned swz = compose_swizzle( + BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w), + swizzle_for_size(ir->type->vector_elements)); + dst_vector dst = dst_vector_n(self().temporary_reg(ir->type), + ir->type->vector_elements); + src_vector src = swizzle(src_vector_n(visit_result(ir->val), 4), swz); + + if (reg_traits<src_reg>::allows_swizzle) { + this->result = storage(src); + } else { + bld.vector().MOV(dst, src); + this->result = src_reg(storage(dst)); + } + } + + unsigned + emit_assignment_writes(const dst_vector &l, const src_vector &r, + const glsl_type *type, enum brw_predicate predicate) + { + unsigned size = 0; + + switch (type->base_type) { + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_BOOL: { + typename B::vector_builder vbld = bld.vector(); + + for (int j = 0; j < type->matrix_columns; j++) { + dst_vector dst = retype(offset(l, size), + brw_type_for_base_type(type)); + src_vector src = retype(offset(r, size), + brw_type_for_base_type(type)); + + exec_predicate(predicate, + vbld.MOV(resize(dst, type->vector_elements), src)); + + size += CEILING(type->vector_elements, alloc_size); + } + break; + } + case GLSL_TYPE_ARRAY: + for (unsigned i = 0; i < type->length; i++) + size += 
emit_assignment_writes(offset(l, size), + offset(r, size), + type->fields.array, + predicate); + break; + + case GLSL_TYPE_STRUCT: + for (unsigned i = 0; i < type->length; i++) + size += emit_assignment_writes(offset(l, size), + offset(r, size), + type->fields.structure[i].type, + predicate); + break; + + case GLSL_TYPE_SAMPLER: + case GLSL_TYPE_IMAGE: + case GLSL_TYPE_ATOMIC_UINT: + break; + + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_INTERFACE: + unreachable("not reached"); + } + + return size; + } + + /* If the RHS processing resulted in an instruction generating a + * temporary value, and it would be easy to rewrite the instruction to + * generate its result right into the LHS instead, do so. This ends + * up reliably removing instructions where it can be tricky to do so + * later without real UD chain information. + */ + bool + try_rewrite_rhs_to_dst(ir_assignment *ir, + const dst_vector &dst, const src_vector &src, + instruction *pre_rhs_inst, + instruction *last_rhs_inst) + { + /* Only attempt if we're doing a direct assignment. */ + if (ir->condition || + !(ir->lhs->type->is_scalar() || + (ir->lhs->type->is_vector()))) + return false; + + /* Make sure the last instruction generated our source reg. */ + instruction *modify = get_instruction_generating_reg( + pre_rhs_inst, last_rhs_inst, src); + if (!modify) + return false; + + /* If last_rhs_inst wrote a different number of components than our LHS, + * we can't safely rewrite it. + */ + if ((dst.writemask & ~get_writemask(modify)) || + ((~dst.writemask & get_writemask(modify)) && + !reg_traits<dst_reg>::allows_writemask)) + return false; + + /* Success! Rewrite the instruction. */ + modify->dst = storage(dst); + return true; + } + + void + visit(ir_assignment *ir) + { + const unsigned mask = (ir->lhs->type->is_vector() ? ir->write_mask : + ir->lhs->type->is_scalar() ? 0x1 : 0xf); + dst_vector l = writemask(dst_vector_n(visit_result(ir->lhs), 4), mask); + instruction *pre_rhs_inst = (instruction *)this->instructions.get_tail(); + src_vector r = swizzle(src_vector_n(visit_result(ir->rhs), 4), + from_glsl_swizzle(mask, SWIZZLE_XYZW)); + instruction *last_rhs_inst = (instruction *)this->instructions.get_tail(); + enum brw_predicate predicate = BRW_PREDICATE_NONE; + + if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst)) + return; + + if (ir->condition) + predicate = emit_bool_to_cond_code(ir->condition); + + emit_assignment_writes(l, r, ir->lhs->type, predicate); + } + + /* Sample from the MCS surface attached to this multisample texture. */ + src_reg + emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler) + { + typename B::vector_builder vbld = bld.vector(); + const unsigned reg_width = bld.dispatch_width() / 8; + const unsigned length = ir->coordinate->type->vector_elements; + const unsigned coord_mask = (1 << length) - 1; + const unsigned zero_mask = ((1 << alloc_size) - 1) & ~coord_mask; + dst_vector payload = vbld.natural_reg(brw_type_for_base_type( + ir->coordinate->type)); + dst_vector dst = vbld.natural_reg(BRW_REGISTER_TYPE_UD); + + vbld.MOV(writemask(payload, coord_mask), src_vector_n(coordinate, length)); + vbld.MOV(writemask(payload, zero_mask), src_reg(0)); + + instruction *inst = bld.emit(SHADER_OPCODE_TXF_MCS, storage(dst), + src_reg(storage(payload)), sampler); + inst->base_mrf = -1; + inst->mlen = CEILING(length, alloc_size) * reg_width; + /* We only care about one component of response, but the sampler always + * writes 4. 
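+ * E.g., assuming a SIMD16 FS with reg_width == 2 and alloc_size == 1,
+ * regs_written comes out as 8 below even though only one component is
+ * consumed.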
+ */ + inst->regs_written = CEILING(4, alloc_size) * reg_width; + return src_reg(storage(dst)); + } + + /** + * Apply workarounds for Gen6 gather with UINT/SINT + */ + void + emit_gen6_gather_wa(uint8_t wa, dst_reg dst) + { + if (!wa) + return; + + typename B::vector_builder vbld = bld.vector(); + const unsigned width = (wa & WA_8BIT) ? 8 : 16; + dst_vector vdst = dst_vector_n(dst, 4); + dst_vector vdst_f = retype(vdst, BRW_REGISTER_TYPE_F); + + /* Convert from UNORM to UINT */ + vbld.MUL(vdst_f, src_vector(vdst_f), src_reg((float)((1 << width) - 1))); + vbld.MOV(vdst, src_vector(vdst_f)); + + if (wa & WA_SIGN) { + /* Reinterpret the UINT value as a signed INT value by shifting the + * sign bit into place, then shifting back preserving sign. + */ + vbld.SHL(vdst, src_vector(vdst), src_reg(32 - width)); + vbld.ASR(vdst, src_vector(vdst), src_reg(32 - width)); + } + } + + /** + * Swizzle the result of a texture instruction. This is necessary for + * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow + * comparisons. + */ + void + swizzle_result(ir_texture *ir, const src_reg &orig_val, uint32_t sampler) + { + const unsigned swz = self().sampler_prog_key()->swizzles[sampler]; + + if (ir->op == ir_query_levels) { + /* # levels is in .w */ + this->result = component(src_vector_n(orig_val, 4), 3); + return; + } + + this->result = resize(orig_val, ir->type->vector_elements); + + /* txs,lod don't actually sample the texture, so swizzling the result + * makes no sense. + */ + if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4) + return; + + if (ir->type == glsl_type::float_type) { + /* Ignore DEPTH_TEXTURE_MODE swizzling. */ + assert(ir->sampler->type->sampler_shadow); + + } else if (swz != SWIZZLE_NOOP) { + typename B::vector_builder vbld = bld.vector(); + dst_vector dst = vbld.natural_reg(orig_val.type); + unsigned zero_mask = 0, one_mask = 0, copy_mask = 0; + + for (int i = 0; i < 4; i++) { + switch (GET_SWZ(swz, i)) { + case SWIZZLE_ZERO: + zero_mask |= (1 << i); + break; + case SWIZZLE_ONE: + one_mask |= (1 << i); + break; + default: + copy_mask |= (1 << i); + break; + } + } + + if (copy_mask) + vbld.MOV(writemask(dst, copy_mask), + swizzle(src_vector_n(orig_val, 4), + from_glsl_swizzle(0xf, swz))); + + if (zero_mask) + vbld.MOV(writemask(dst, zero_mask), src_reg(0.0f)); + + if (one_mask) + vbld.MOV(writemask(dst, one_mask), src_reg(1.0f)); + + this->result = src_reg(storage(dst)); + } + } + + /** + * Set up the gather channel based on the swizzle, for gather4. + */ + uint32_t + gather_channel(ir_texture *ir, uint32_t sampler) + { + const struct brw_sampler_prog_key_data *tex = self().sampler_prog_key(); + ir_constant *chan = ir->lod_info.component->as_constant(); + const unsigned swiz = GET_SWZ(tex->swizzles[sampler], chan->value.i[0]); + + switch (swiz) { + case SWIZZLE_X: return 0; + case SWIZZLE_Y: + /* gather4 sampler is broken for green channel on RG32F -- + * we must ask for blue instead. + */ + return (tex->gather_channel_quirk_mask & (1 << sampler) ? 2 : 1); + case SWIZZLE_Z: return 2; + case SWIZZLE_W: return 3; + default: + unreachable("Not reached"); /* zero, one swizzles handled already */ + } + } + + src_reg + rescale_texcoord(ir_texture *ir, src_reg coordinate, + bool is_rect, uint32_t sampler, int texunit) + { + typename B::vector_builder vbld = bld.vector(); + const struct brw_sampler_prog_key_data *tex = self().sampler_prog_key(); + const unsigned clamp_mask = + ((tex->gl_clamp_mask[0] & (1 << sampler) ? 
1 : 0) << 0) |
+         ((tex->gl_clamp_mask[1] & (1 << sampler) ? 1 : 0) << 1) |
+         ((tex->gl_clamp_mask[2] & (1 << sampler) ? 1 : 0) << 2);
+      src_vector scale;
+
+      /* The 965 requires the EU to do the normalization of GL rectangle
+       * texture coordinates.  We use the program parameter state
+       * tracking to get the scaling factor.
+       */
+      if (is_rect && (brw->gen < 6 || (brw->gen >= 6 && clamp_mask))) {
+         struct gl_program_parameter_list *params = prog->Parameters;
+         int tokens[STATE_LENGTH] = {
+            STATE_INTERNAL,
+            STATE_TEXRECT_SCALE,
+            texunit,
+            0,
+            0
+         };
+
+         self().no16("rectangle scale uniform setup not supported on SIMD16\n");
+         if (bld.dispatch_width() == 16) {
+            return coordinate;
+         }
+
+         GLuint index = _mesa_add_state_reference(params,
+                                                  (gl_state_index *)tokens);
+         /* Try to find existing copies of the texrect scale uniforms. */
+         for (unsigned i = 0; i < uniforms; i++) {
+            if (stage_prog_data->param[alloc_size * i] ==
+                &prog->Parameters->ParameterValues[index][0]) {
+               scale = src_vector_n(src_reg(UNIFORM, i), 2);
+               break;
+            }
+         }
+
+         /* If we didn't already set them up, do so now. */
+         if (storage(scale).file == BAD_FILE) {
+            scale = src_vector_n(src_reg(UNIFORM, uniforms), 2);
+            stage_prog_data->param[alloc_size * uniforms] =
+               &prog->Parameters->ParameterValues[index][0];
+            stage_prog_data->param[alloc_size * uniforms + 1] =
+               &prog->Parameters->ParameterValues[index][1];
+            uniform_vector_size[uniforms] = 2;
+            uniforms += CEILING(2, alloc_size);
+         }
+      }
+
+      if (brw->gen >= 6 && is_rect) {
+         /* On gen6+, the sampler handles the rectangle coordinates
+          * natively, without needing rescaling.  But that means we have
+          * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
+          * not [0, 1] like the default case below.
+          */
+         dst_vector dst = dst_vector_n(coordinate, 4);
+         exec_condmod(BRW_CONDITIONAL_G,
+                      vbld.emit(BRW_OPCODE_SEL, writemask(dst, clamp_mask),
+                                src_vector(dst), src_reg(0.0f)));
+
+         /* Our parameter comes in as 1.0/width or 1.0/height,
+          * because that's what people normally want for doing
+          * texture rectangle handling.  We need width or height
+          * for clamping, but we don't care enough to make a new
+          * parameter type, so just invert back.
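+ *
+ * (E.g., for a hypothetical 640x480 rectangle texture the parameters
+ * are 1/640 and 1/480, and the RCP below recovers 640 and 480 as the
+ * clamp limits.)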
+ */ + dst_vector limit = vbld.natural_reg(BRW_REGISTER_TYPE_F); + vbld.MOV(limit, scale); + vbld.emit(SHADER_OPCODE_RCP, writemask(limit, clamp_mask), + src_vector(limit)); + + exec_condmod(BRW_CONDITIONAL_L, + vbld.emit(BRW_OPCODE_SEL, writemask(dst, clamp_mask), + src_vector(dst), src_vector(limit))); + } else { + if (is_rect) { + dst_vector dst = vbld.natural_reg(brw_type_for_base_type(ir->type)); + src_vector src = src_vector_n(coordinate, 4); + coordinate = src_reg(storage(dst)); + vbld.MUL(writemask(dst, WRITEMASK_XY), src, scale); + } + + if (ir->coordinate) { + dst_vector dst = dst_vector_n(coordinate, 4); + exec_saturate(true, + vbld.MOV(writemask(dst, clamp_mask), src_vector(dst))); + } + } + + return coordinate; + } + + void + visit(ir_texture *ir) + { + typename B::vector_builder vbld = bld.vector(); + const struct brw_sampler_prog_key_data *tex = self().sampler_prog_key(); + uint32_t sampler = + _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog); + ir_rvalue *nonconst_sampler_index = + _mesa_get_sampler_array_nonconst_index(ir->sampler); + + /* Handle non-constant sampler array indexing */ + src_reg sampler_reg; + if (nonconst_sampler_index) { + /* The highest sampler which may be used by this operation is + * the last element of the array. Mark it here, because the generator + * doesn't have enough information to determine the bound. + */ + uint32_t array_size = ir->sampler->as_dereference_array() + ->array->type->array_size(); + + uint32_t max_used = sampler + array_size - 1; + if (ir->op == ir_tg4 && brw->gen < 8) { + max_used += stage_prog_data->binding_table.gather_texture_start; + } else { + max_used += stage_prog_data->binding_table.texture_start; + } + + brw_mark_surface_used(stage_prog_data, max_used); + + /* Emit code to evaluate the actual indexing expression */ + dst_reg tmp = bld.scalar_reg(BRW_REGISTER_TYPE_UD); + exec_all(bld.ADD(tmp, visit_result(nonconst_sampler_index), + src_reg(sampler))); + sampler_reg = src_reg(tmp); + } else { + /* Single sampler, or constant array index; the indexing expression + * is just an immediate. + */ + sampler_reg = src_reg(sampler); + } + + /* FINISHME: We're failing to recompile our programs when the sampler is + * updated. This only matters for the texture rectangle scale parameters + * (pre-gen6, or gen6+ with GL_CLAMP). + */ + int texunit = prog->SamplerUnits[sampler]; + + if (ir->op == ir_tg4) { + /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother + * emitting anything other than setting up the constant result. + */ + ir_constant *chan = ir->lod_info.component->as_constant(); + int swiz = GET_SWZ(tex->swizzles[sampler], chan->value.i[0]); + if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) { + dst_vector res = vbld.natural_reg(BRW_REGISTER_TYPE_F); + vbld.MOV(res, src_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)); + this->result = src_reg(storage(res)); + return; + } + } + + /* Should be lowered by do_lower_texture_projection */ + assert(!ir->projector); + + /* Should be lowered */ + assert(!ir->offset || !ir->offset->type->is_array()); + + /* Generate code to compute all the subexpression trees. This has to be + * done before loading any values into MRFs for the sampler message since + * generating these values may involve SEND messages that need the MRFs. 
+ */ + src_reg coordinate; + if (ir->coordinate) + coordinate = rescale_texcoord(ir, visit_result(ir->coordinate), + ir->sampler->type->sampler_dimensionality == + GLSL_SAMPLER_DIM_RECT, + sampler, texunit); + + src_reg shadow_comparitor; + if (ir->shadow_comparitor) + shadow_comparitor = visit_result(ir->shadow_comparitor); + + src_reg offset_val; + if (ir->offset && !ir->offset->as_constant()) + offset_val = visit_result(ir->offset); + + src_reg lod(0.0f), lod2, sample_index, mcs; + switch (ir->op) { + case ir_tex: + case ir_lod: + case ir_tg4: + case ir_query_levels: + break; + case ir_txb: + lod = visit_result(ir->lod_info.bias); + break; + case ir_txd: + lod = visit_result(ir->lod_info.grad.dPdx); + lod2 = visit_result(ir->lod_info.grad.dPdy); + break; + case ir_txf: + case ir_txl: + case ir_txs: + lod = visit_result(ir->lod_info.lod); + break; + case ir_txf_ms: + sample_index = visit_result(ir->lod_info.sample_index); + + if (brw->gen >= 7 && tex->compressed_multisample_layout_mask & (1<<sampler)) + mcs = emit_mcs_fetch(ir, coordinate, sampler_reg); + else + mcs = src_reg(0u); + break; + default: + unreachable("Unrecognized texture opcode"); + }; + + /* Writemasking doesn't eliminate channels on SIMD8 texture + * samples, so don't worry about them. + */ + dst_reg dst = storage(vbld.natural_reg(brw_type_for_base_type(ir->type))); + instruction *inst = self().emit_texture( + ir, dst, coordinate, shadow_comparitor, + lod, lod2, offset_val, sample_index, mcs, sampler_reg); + + if (ir->offset != NULL && ir->op != ir_txf) + inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant()); + + if (ir->op == ir_tg4) + inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17 + + if (ir->shadow_comparitor) + inst->shadow_compare = true; + + /* fixup #layers for cube map arrays */ + if (ir->op == ir_txs) { + glsl_type const *type = ir->sampler->type; + if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && + type->sampler_array) { + const unsigned components = ir->type->vector_elements; + dst_vector vdst = dst_vector_n(dst, components); + dst_reg depth = bld.scalar_reg(BRW_REGISTER_TYPE_D); + src_reg payload[4]; + + bld.emit_math(SHADER_OPCODE_INT_QUOTIENT, depth, + component(vdst, 2), src_reg(6)); + + for (unsigned i = 0; i < components; ++i) + payload[i] = (i == 2 ? src_reg(depth) : component(vdst, i)); + + vbld.LOAD_VECTOR(vdst, payload); + } + } + + if (brw->gen == 6 && ir->op == ir_tg4) + emit_gen6_gather_wa(tex->gen6_gather_wa[sampler], dst); + + swizzle_result(ir, src_reg(dst), sampler); + } + + struct hash_table *variable_ht; + + typename B::src_reg shader_start_time; + B bld; + + const bool uses_kill; + +public: + int + type_size(const struct glsl_type *type) + { + unsigned int size, i; + + switch (type->base_type) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_BOOL: + if (type->is_matrix()) { + return type->matrix_columns * type_size(type->column_type()); + } else { + return CEILING(type->components(), alloc_size); + } + case GLSL_TYPE_ARRAY: + return type_size(type->fields.array) * type->length; + case GLSL_TYPE_STRUCT: + size = 0; + for (i = 0; i < type->length; i++) { + size += type_size(type->fields.structure[i].type); + } + return size; + case GLSL_TYPE_SAMPLER: + /* Samplers take up no register space, since they're baked in at + * link time. 
+ */ + return 0; + case GLSL_TYPE_ATOMIC_UINT: + return 0; + case GLSL_TYPE_IMAGE: + case GLSL_TYPE_VOID: + case GLSL_TYPE_ERROR: + case GLSL_TYPE_INTERFACE: + unreachable("not reached"); + } + + return 0; + } + + /** + * Returns how many MRFs an opcode will write over. + * + * Note that this is not the 0 or 1 implied writes in an actual gen + * instruction -- the generate_* functions generate additional MOVs + * for setup. + */ + int + implied_mrf_writes(instruction *inst) + { + if (inst->mlen == 0 || inst->base_mrf == -1) + return 0; + + switch (inst->opcode) { + case SHADER_OPCODE_RCP: + case SHADER_OPCODE_RSQ: + case SHADER_OPCODE_SQRT: + case SHADER_OPCODE_EXP2: + case SHADER_OPCODE_LOG2: + case SHADER_OPCODE_SIN: + case SHADER_OPCODE_COS: + return 1 * bld.dispatch_width() / 8; + case SHADER_OPCODE_INT_QUOTIENT: + case SHADER_OPCODE_INT_REMAINDER: + case SHADER_OPCODE_POW: + return 2 * bld.dispatch_width() / 8; + case VS_OPCODE_URB_WRITE: + return 1; + case VS_OPCODE_PULL_CONSTANT_LOAD: + return 2; + case SHADER_OPCODE_GEN4_SCRATCH_READ: + return inst->mlen; + case SHADER_OPCODE_GEN4_SCRATCH_WRITE: + return inst->mlen; + case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: + return 1; + case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: + return inst->mlen; + case GS_OPCODE_URB_WRITE: + case GS_OPCODE_URB_WRITE_ALLOCATE: + case GS_OPCODE_THREAD_END: + return 0; + case GS_OPCODE_FF_SYNC: + return 1; + case SHADER_OPCODE_SHADER_TIME_ADD: + return 0; + case SHADER_OPCODE_TEX: + case SHADER_OPCODE_TXL: + case SHADER_OPCODE_TXD: + case SHADER_OPCODE_TXF: + case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_MCS: + case SHADER_OPCODE_TXS: + case SHADER_OPCODE_TG4: + case SHADER_OPCODE_TG4_OFFSET: + case SHADER_OPCODE_LOD: + case FS_OPCODE_TXB: + return inst->header_present ? 1 : 0; + case SHADER_OPCODE_UNTYPED_ATOMIC: + case SHADER_OPCODE_UNTYPED_SURFACE_READ: + case FS_OPCODE_INTERPOLATE_AT_CENTROID: + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: + return 0; + case FS_OPCODE_FB_WRITE: + return 2; + default: + unreachable("not reached"); + } + } + + static const unsigned alloc_size = reg_traits<src_reg>::alloc_size; + + brw::simple_allocator alloc; + + /* Result of last visit() method. */ + typename B::src_reg result; + + const shader_time_shader_type st_type; +}; + +} /* namespace brw */ + +#endif diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index b37da4ead62..1eaef45bf68 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -284,10 +284,10 @@ get_written_and_reset(struct brw_context *brw, int i, /* Find where we recorded written and reset. 
*/ int wi, ri; - for (wi = i; brw->shader_time.types[wi] != type + 1; wi++) + for (wi = i; brw->shader_time.types[wi] != type + ST_WRITTEN; wi++) ; - for (ri = i; brw->shader_time.types[ri] != type + 2; ri++) + for (ri = i; brw->shader_time.types[ri] != type + ST_RESET; ri++) ; *written = brw->shader_time.cumulative[wi]; @@ -328,27 +328,18 @@ brw_report_shader_time(struct brw_context *brw) sorted[i] = &scaled[i]; - switch (type) { - case ST_VS_WRITTEN: - case ST_VS_RESET: - case ST_GS_WRITTEN: - case ST_GS_RESET: - case ST_FS8_WRITTEN: - case ST_FS8_RESET: - case ST_FS16_WRITTEN: - case ST_FS16_RESET: + switch (type % ST_NUM_ENTRIES) { + case ST_BASE: + get_written_and_reset(brw, i, &written, &reset); + break; + + case ST_WRITTEN: + case ST_RESET: /* We'll handle these when along with the time. */ scaled[i] = 0; continue; - case ST_VS: - case ST_GS: - case ST_FS8: - case ST_FS16: - get_written_and_reset(brw, i, &written, &reset); - break; - - default: + case ST_SUM: /* I sometimes want to print things that aren't the 3 shader times. * Just print the sum in that case. */ diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h index a8650c3454b..9cd391471da 100644 --- a/src/mesa/drivers/dri/i965/brw_program.h +++ b/src/mesa/drivers/dri/i965/brw_program.h @@ -24,6 +24,8 @@ #ifndef BRW_PROGRAM_H #define BRW_PROGRAM_H +#include "main/mtypes.h" + enum gen6_gather_sampler_wa { WA_SIGN = 1, /* whether we need to sign extend */ WA_8BIT = 2, /* if we have an 8bit format needing wa */ diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 19af0ae09fc..a27a3bad396 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -80,6 +80,7 @@ struct brw_context; #define BRW_SWIZZLE_YYYY BRW_SWIZZLE4(1,1,1,1) #define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) #define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) +#define BRW_SWIZZLE_XXYY BRW_SWIZZLE4(0,0,1,1) #define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index db94e527ca7..4017b14ddc0 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -395,7 +395,7 @@ schedule_node::set_latency_gen7(bool is_haswell) class instruction_scheduler { public: - instruction_scheduler(backend_visitor *v, int grf_count, + instruction_scheduler(base_visitor *v, int grf_count, instruction_scheduler_mode mode) { this->bv = v; @@ -451,7 +451,7 @@ public: int grf_count; int time; exec_list instructions; - backend_visitor *bv; + base_visitor *bv; instruction_scheduler_mode mode; @@ -1081,12 +1081,14 @@ vec4_instruction_scheduler::calculate_deps() } } - for (int i = 0; i < inst->mlen; i++) { - /* It looks like the MRF regs are released in the send - * instruction once it's sent, not when the result comes - * back. - */ - add_dep(last_mrf_write[inst->base_mrf + i], n); + if (inst->base_mrf != -1) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. 
+ */ + add_dep(last_mrf_write[inst->base_mrf + i], n); + } } if (inst->reads_flag()) { @@ -1116,7 +1118,7 @@ vec4_instruction_scheduler::calculate_deps() add_barrier_deps(n); } - if (inst->mlen > 0) { + if (inst->mlen > 0 && inst->base_mrf != -1) { for (int i = 0; i < v->implied_mrf_writes(inst); i++) { add_dep(last_mrf_write[inst->base_mrf + i], n); last_mrf_write[inst->base_mrf + i] = n; @@ -1171,12 +1173,14 @@ vec4_instruction_scheduler::calculate_deps() } } - for (int i = 0; i < inst->mlen; i++) { - /* It looks like the MRF regs are released in the send - * instruction once it's sent, not when the result comes - * back. - */ - add_dep(n, last_mrf_write[inst->base_mrf + i], 2); + if (inst->base_mrf != -1) { + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. + */ + add_dep(n, last_mrf_write[inst->base_mrf + i], 2); + } } if (inst->reads_flag()) { @@ -1203,7 +1207,7 @@ vec4_instruction_scheduler::calculate_deps() add_barrier_deps(n); } - if (inst->mlen > 0) { + if (inst->mlen > 0 && inst->base_mrf != -1) { for (int i = 0; i < v->implied_mrf_writes(inst); i++) { last_mrf_write[inst->base_mrf + i] = n; } diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 05f6fe78523..18cee8722af 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -544,23 +544,6 @@ brw_instruction_name(enum opcode op) } } -backend_visitor::backend_visitor(struct brw_context *brw, - struct gl_shader_program *shader_prog, - struct gl_program *prog, - struct brw_stage_prog_data *stage_prog_data, - gl_shader_stage stage) - : brw(brw), - ctx(&brw->ctx), - shader(shader_prog ? - (struct brw_shader *)shader_prog->_LinkedShaders[stage] : NULL), - shader_prog(shader_prog), - prog(prog), - stage_prog_data(stage_prog_data), - cfg(NULL), - stage(stage) -{ -} - bool backend_reg::is_zero() const { @@ -829,104 +812,3 @@ backend_instruction::remove(bblock_t *block) exec_node::remove(); } - -void -backend_visitor::dump_instructions() -{ - dump_instructions(NULL); -} - -void -backend_visitor::dump_instructions(const char *name) -{ - FILE *file = stderr; - if (name && geteuid() != 0) { - file = fopen(name, "w"); - if (!file) - file = stderr; - } - - int ip = 0; - foreach_block_and_inst(block, backend_instruction, inst, cfg) { - if (!name) - fprintf(stderr, "%d: ", ip++); - dump_instruction(inst, file); - } - - if (file != stderr) { - fclose(file); - } -} - -void -backend_visitor::calculate_cfg() -{ - if (this->cfg) - return; - cfg = new(mem_ctx) cfg_t(&this->instructions); -} - -void -backend_visitor::invalidate_cfg() -{ - ralloc_free(this->cfg); - this->cfg = NULL; -} - -/** - * Sets up the starting offsets for the groups of binding table entries - * commong to all pipeline stages. - * - * Unused groups are initialized to 0xd0d0d0d0 to make it obvious that they're - * unused but also make sure that addition of small offsets to them will - * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES. 
- */ -void -backend_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset) -{ - int num_textures = _mesa_fls(prog->SamplersUsed); - - stage_prog_data->binding_table.texture_start = next_binding_table_offset; - next_binding_table_offset += num_textures; - - if (shader) { - stage_prog_data->binding_table.ubo_start = next_binding_table_offset; - next_binding_table_offset += shader->base.NumUniformBlocks; - } else { - stage_prog_data->binding_table.ubo_start = 0xd0d0d0d0; - } - - if (INTEL_DEBUG & DEBUG_SHADER_TIME) { - stage_prog_data->binding_table.shader_time_start = next_binding_table_offset; - next_binding_table_offset++; - } else { - stage_prog_data->binding_table.shader_time_start = 0xd0d0d0d0; - } - - if (prog->UsesGather) { - if (brw->gen >= 8) { - stage_prog_data->binding_table.gather_texture_start = - stage_prog_data->binding_table.texture_start; - } else { - stage_prog_data->binding_table.gather_texture_start = next_binding_table_offset; - next_binding_table_offset += num_textures; - } - } else { - stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0; - } - - if (shader_prog && shader_prog->NumAtomicBuffers) { - stage_prog_data->binding_table.abo_start = next_binding_table_offset; - next_binding_table_offset += shader_prog->NumAtomicBuffers; - } else { - stage_prog_data->binding_table.abo_start = 0xd0d0d0d0; - } - - /* This may or may not be used depending on how the compile goes. */ - stage_prog_data->binding_table.pull_constants_start = next_binding_table_offset; - next_binding_table_offset++; - - assert(next_binding_table_offset <= BRW_MAX_SURFACES); - - /* prog_data->base.binding_table.size will be set by brw_mark_surface_used. */ -} diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 0f927acfc4a..0c75b4c3ee2 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -21,18 +21,14 @@ * IN THE SOFTWARE. 
*/ +#pragma once + #include <stdint.h> #include "brw_reg.h" #include "brw_defines.h" #include "main/compiler.h" #include "glsl/ir.h" -#ifdef __cplusplus -#include "brw_ir_allocator.h" -#endif - -#pragma once - enum PACKED register_file { BAD_FILE, GRF, @@ -149,52 +145,6 @@ enum instruction_scheduler_mode { SCHEDULE_POST, }; -class backend_visitor : public ir_visitor { -protected: - - backend_visitor(struct brw_context *brw, - struct gl_shader_program *shader_prog, - struct gl_program *prog, - struct brw_stage_prog_data *stage_prog_data, - gl_shader_stage stage); - -public: - - struct brw_context * const brw; - struct gl_context * const ctx; - struct brw_shader * const shader; - struct gl_shader_program * const shader_prog; - struct gl_program * const prog; - struct brw_stage_prog_data * const stage_prog_data; - - /** ralloc context for temporary data used during compile */ - void *mem_ctx; - - /** - * List of either fs_inst or vec4_instruction (inheriting from - * backend_instruction) - */ - exec_list instructions; - - cfg_t *cfg; - - gl_shader_stage stage; - - brw::simple_allocator alloc; - - virtual void dump_instruction(backend_instruction *inst) = 0; - virtual void dump_instruction(backend_instruction *inst, FILE *file) = 0; - virtual void dump_instructions(); - virtual void dump_instructions(const char *name); - - void calculate_cfg(); - void invalidate_cfg(); - - void assign_common_binding_table_offsets(uint32_t next_binding_table_offset); - - virtual void invalidate_live_intervals() = 0; -}; - uint32_t brw_texture_offset(struct gl_context *ctx, ir_constant *offset); #endif /* __cplusplus */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 51a2390e764..49c762aec45 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -169,6 +169,21 @@ dst_reg::dst_reg(const src_reg ®) } bool +dst_reg::equals(const dst_reg &r) const +{ + return (file == r.file && + reg == r.reg && + reg_offset == r.reg_offset && + type == r.type && + negate == r.negate && + abs == r.abs && + writemask == r.writemask && + !reladdr && !r.reladdr && + memcmp(&fixed_hw_reg, &r.fixed_hw_reg, + sizeof(fixed_hw_reg)) == 0); +} + +bool vec4_instruction::is_send_from_grf() { switch (opcode) { @@ -195,66 +210,6 @@ vec4_instruction::can_do_source_mods(struct brw_context *brw) return true; } -/** - * Returns how many MRFs an opcode will write over. - * - * Note that this is not the 0 or 1 implied writes in an actual gen - * instruction -- the generate_* functions generate additional MOVs - * for setup. 
- */ -int -vec4_visitor::implied_mrf_writes(vec4_instruction *inst) -{ - if (inst->mlen == 0) - return 0; - - switch (inst->opcode) { - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - return 1; - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - case SHADER_OPCODE_POW: - return 2; - case VS_OPCODE_URB_WRITE: - return 1; - case VS_OPCODE_PULL_CONSTANT_LOAD: - return 2; - case SHADER_OPCODE_GEN4_SCRATCH_READ: - return 2; - case SHADER_OPCODE_GEN4_SCRATCH_WRITE: - return 3; - case GS_OPCODE_URB_WRITE: - case GS_OPCODE_URB_WRITE_ALLOCATE: - case GS_OPCODE_THREAD_END: - return 0; - case GS_OPCODE_FF_SYNC: - return 1; - case SHADER_OPCODE_SHADER_TIME_ADD: - return 0; - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_CMS: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - return inst->header_present ? 1 : 0; - case SHADER_OPCODE_UNTYPED_ATOMIC: - case SHADER_OPCODE_UNTYPED_SURFACE_READ: - return 0; - default: - unreachable("not reached"); - } -} - bool src_reg::equals(const src_reg &r) const { @@ -545,7 +500,7 @@ vec4_visitor::split_uniform_registers() } /* Update that everything is now vector-sized. */ - for (int i = 0; i < this->uniforms; i++) { + for (unsigned i = 0; i < this->uniforms; i++) { this->uniform_size[i] = 1; } } @@ -574,12 +529,12 @@ vec4_visitor::pack_uniform_registers() } } - int new_uniform_count = 0; + unsigned new_uniform_count = 0; /* Now, figure out a packing of the live uniform vectors into our * push constants. */ - for (int src = 0; src < uniforms; src++) { + for (unsigned src = 0; src < uniforms; src++) { assert(src < uniform_array_size); int size = this->uniform_vector_size[src]; @@ -588,7 +543,7 @@ vec4_visitor::pack_uniform_registers() continue; } - int dst; + unsigned dst; /* Find the lowest place we can slot this uniform in. */ for (dst = 0; dst < src; dst++) { if (this->uniform_vector_size[dst] + size <= 4) @@ -725,7 +680,7 @@ vec4_visitor::move_push_constants_to_pull_constants() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - int max_uniform_components = 32 * 8; + unsigned max_uniform_components = 32 * 8; if (this->uniforms * 4 <= max_uniform_components) return; @@ -734,7 +689,7 @@ vec4_visitor::move_push_constants_to_pull_constants() * look for the most infrequently used uniform vec4s, but leave * that for later. 
*/ - for (int i = 0; i < this->uniforms * 4; i += 4) { + for (unsigned i = 0; i < this->uniforms * 4; i += 4) { pull_constant_loc[i / 4] = -1; if (i >= max_uniform_components) { @@ -778,12 +733,13 @@ vec4_visitor::move_push_constants_to_pull_constants() pull_constant_loc[inst->src[i].reg] == -1) continue; - int uniform = inst->src[i].reg; - - dst_reg temp = dst_reg(this, glsl_type::vec4_type); + vec4_builder ibld = bld.at(block, inst); + int loc = pull_constant_loc[inst->src[i].reg] + inst->src[i].reg_offset; + src_reg surf_index(prog_data->base.binding_table.pull_constants_start); + dst_reg temp = ibld.vector().natural_reg(BRW_REGISTER_TYPE_F); - emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); + emit_pull_constant_load(ibld, temp, surf_index, 16 * loc, + inst->src[i].reladdr, 4); inst->src[i].file = temp.file; inst->src[i].reg = temp.reg; @@ -1578,97 +1534,6 @@ vec4_visitor::assign_binding_table_offsets() assign_common_binding_table_offsets(0); } -src_reg -vec4_visitor::get_timestamp() -{ - assert(brw->gen >= 7); - - src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, - BRW_ARF_TIMESTAMP, - 0, - BRW_REGISTER_TYPE_UD, - BRW_VERTICAL_STRIDE_0, - BRW_WIDTH_4, - BRW_HORIZONTAL_STRIDE_4, - BRW_SWIZZLE_XYZW, - WRITEMASK_XYZW)); - - dst_reg dst = dst_reg(this, glsl_type::uvec4_type); - - vec4_instruction *mov = emit(MOV(dst, ts)); - /* We want to read the 3 fields we care about (mostly field 0, but also 2) - * even if it's not enabled in the dispatch. - */ - mov->force_writemask_all = true; - - return src_reg(dst); -} - -void -vec4_visitor::emit_shader_time_begin() -{ - current_annotation = "shader time start"; - shader_start_time = get_timestamp(); -} - -void -vec4_visitor::emit_shader_time_end() -{ - current_annotation = "shader time end"; - src_reg shader_end_time = get_timestamp(); - - - /* Check that there weren't any timestamp reset events (assuming these - * were the only two timestamp reads that happened). - */ - src_reg reset_end = shader_end_time; - reset_end.swizzle = BRW_SWIZZLE_ZZZZ; - vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u))); - test->conditional_mod = BRW_CONDITIONAL_Z; - - emit(IF(BRW_PREDICATE_NORMAL)); - - /* Take the current timestamp and get the delta. */ - shader_start_time.negate = true; - dst_reg diff = dst_reg(this, glsl_type::uint_type); - emit(ADD(diff, shader_start_time, shader_end_time)); - - /* If there were no instructions between the two timestamp gets, the diff - * is 2 cycles. Remove that overhead, so I can forget about that when - * trying to determine the time taken for single instructions. 
- */ - emit(ADD(diff, src_reg(diff), src_reg(-2u))); - - emit_shader_time_write(st_base, src_reg(diff)); - emit_shader_time_write(st_written, src_reg(1u)); - emit(BRW_OPCODE_ELSE); - emit_shader_time_write(st_reset, src_reg(1u)); - emit(BRW_OPCODE_ENDIF); -} - -void -vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type, - src_reg value) -{ - int shader_time_index = - brw_get_shader_time_index(brw, shader_prog, prog, type); - - dst_reg dst = - dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2)); - - dst_reg offset = dst; - dst_reg time = dst; - time.reg_offset++; - - offset.type = BRW_REGISTER_TYPE_UD; - emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE))); - - time.type = BRW_REGISTER_TYPE_UD; - emit(MOV(time, src_reg(value))); - - emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst)); -} - bool vec4_visitor::run() { @@ -1689,13 +1554,13 @@ vec4_visitor::run() } else { emit_program_code(); } - base_ir = NULL; if (key->userclip_active && !prog->UsesClipDistanceOut) setup_uniform_clipplane_values(); emit_thread_end(); + bld = bld.at(NULL, NULL); calculate_cfg(); /* Before any optimization, push array accesses out to scratch @@ -1731,7 +1596,7 @@ vec4_visitor::run() snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass, \ stage_name, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \ \ - backend_visitor::dump_instructions(filename); \ + base_visitor::dump_instructions(filename); \ } \ \ progress = progress || this_progress; \ @@ -1743,7 +1608,7 @@ vec4_visitor::run() snprintf(filename, 64, "%s-%04d-00-start", stage_name, shader_prog ? shader_prog->Name : 0); - backend_visitor::dump_instructions(filename); + base_visitor::dump_instructions(filename); } bool progress; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 39c65b7b8ed..ab71a5a13ad 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -25,13 +25,10 @@ #define BRW_VEC4_H #include <stdint.h> -#include "brw_shader.h" #include "main/compiler.h" -#include "program/hash_table.h" #include "brw_program.h" - #ifdef __cplusplus -#include "brw_ir_vec4.h" +#include "brw_ir_visitor.h" extern "C" { #endif @@ -93,7 +90,7 @@ namespace brw { * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and * fixed-function) into VS IR. */ -class vec4_visitor : public backend_visitor +class vec4_visitor : public backend_visitor<vec4_visitor, vec4_builder> { public: vec4_visitor(struct brw_context *brw, @@ -106,53 +103,17 @@ public: void *mem_ctx, bool debug_flag, bool no_spills, - shader_time_shader_type st_base, - shader_time_shader_type st_written, - shader_time_shader_type st_reset); - ~vec4_visitor(); - - dst_reg dst_null_f() - { - return dst_reg(brw_null_reg()); - } - - dst_reg dst_null_d() - { - return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); - } - - dst_reg dst_null_ud() - { - return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); - } + shader_time_shader_type st_type); struct brw_vec4_compile * const c; const struct brw_vec4_prog_key * const key; struct brw_vec4_prog_data * const prog_data; unsigned int sanity_param_count; - char *fail_msg; - bool failed; - - /** - * GLSL IR currently being processed, which is associated with our - * driver IR instructions for debugging purposes. 
- */ - const void *base_ir; - const char *current_annotation; - - int first_non_payload_grf; - unsigned int max_grf; - int *virtual_grf_start; - int *virtual_grf_end; dst_reg userplane[MAX_CLIP_PLANES]; bool live_intervals_valid; - dst_reg *variable_storage(ir_variable *var); - - void reladdr_to_temp(ir_instruction *ir, src_reg *reg, int *num_reladdr); - bool need_all_constants_in_pull_buffer; /** @@ -164,48 +125,29 @@ public: */ /*@{*/ virtual void visit(ir_variable *); - virtual void visit(ir_loop *); - virtual void visit(ir_loop_jump *); - virtual void visit(ir_function_signature *); - virtual void visit(ir_function *); - virtual void visit(ir_expression *); - virtual void visit(ir_swizzle *); - virtual void visit(ir_dereference_variable *); - virtual void visit(ir_dereference_array *); - virtual void visit(ir_dereference_record *); - virtual void visit(ir_assignment *); - virtual void visit(ir_constant *); - virtual void visit(ir_call *); - virtual void visit(ir_return *); virtual void visit(ir_discard *); - virtual void visit(ir_texture *); - virtual void visit(ir_if *); virtual void visit(ir_emit_vertex *); virtual void visit(ir_end_primitive *); /*@}*/ - src_reg result; + dst_reg + temporary_reg(const glsl_type *type) + { + const unsigned n = (type->is_array() || type->is_record() ? + 4 : type->vector_elements); + return resize(bld.natural_reg(brw_type_for_base_type(type), + type_size(type)), n); + } /* Regs for vertex results. Generated at ir_variable visiting time * for the ir->location's used. */ dst_reg output_reg[BRW_VARYING_SLOT_COUNT]; const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT]; - int *uniform_size; - int *uniform_vector_size; - int uniform_array_size; /*< Size of uniform_[vector_]size arrays */ - int uniforms; - - src_reg shader_start_time; - - struct hash_table *variable_ht; bool run(void); - void fail(const char *msg, ...); void setup_uniform_clipplane_values(); - void setup_uniform_values(ir_variable *ir); - void setup_builtin_uniform_values(ir_variable *ir); int setup_uniforms(int payload_reg); bool reg_allocate_trivial(); bool reg_allocate(); @@ -231,120 +173,21 @@ public: void opt_set_dependency_control(); void opt_schedule_instructions(); - vec4_instruction *emit(vec4_instruction *inst); - - vec4_instruction *emit(enum opcode opcode); - - vec4_instruction *emit(enum opcode opcode, dst_reg dst); - - vec4_instruction *emit(enum opcode opcode, dst_reg dst, src_reg src0); - - vec4_instruction *emit(enum opcode opcode, dst_reg dst, - src_reg src0, src_reg src1); - - vec4_instruction *emit(enum opcode opcode, dst_reg dst, - src_reg src0, src_reg src1, src_reg src2); - - vec4_instruction *emit_before(bblock_t *block, - vec4_instruction *inst, - vec4_instruction *new_inst); - - vec4_instruction *MOV(const dst_reg &dst, const src_reg &src0); - vec4_instruction *NOT(const dst_reg &dst, const src_reg &src0); - vec4_instruction *RNDD(const dst_reg &dst, const src_reg &src0); - vec4_instruction *RNDE(const dst_reg &dst, const src_reg &src0); - vec4_instruction *RNDZ(const dst_reg &dst, const src_reg &src0); - vec4_instruction *FRC(const dst_reg &dst, const src_reg &src0); - vec4_instruction *F32TO16(const dst_reg &dst, const src_reg &src0); - vec4_instruction *F16TO32(const dst_reg &dst, const src_reg &src0); - vec4_instruction *ADD(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *MUL(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *MACH(const dst_reg &dst, const src_reg &src0, - const src_reg 
&src1); - vec4_instruction *MAC(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *AND(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *OR(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *XOR(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *DP3(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *DP4(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *DPH(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *SHL(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *SHR(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *ASR(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1, - enum brw_conditional_mod condition); - vec4_instruction *IF(src_reg src0, src_reg src1, - enum brw_conditional_mod condition); - vec4_instruction *IF(enum brw_predicate predicate); - vec4_instruction *PULL_CONSTANT_LOAD(const dst_reg &dst, - const src_reg &index); - vec4_instruction *SCRATCH_READ(const dst_reg &dst, const src_reg &index); - vec4_instruction *SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, - const src_reg &index); - vec4_instruction *LRP(const dst_reg &dst, const src_reg &a, - const src_reg &y, const src_reg &x); - vec4_instruction *BFREV(const dst_reg &dst, const src_reg &value); - vec4_instruction *BFE(const dst_reg &dst, const src_reg &bits, - const src_reg &offset, const src_reg &value); - vec4_instruction *BFI1(const dst_reg &dst, const src_reg &bits, - const src_reg &offset); - vec4_instruction *BFI2(const dst_reg &dst, const src_reg &bfi1_dst, - const src_reg &insert, const src_reg &base); - vec4_instruction *FBH(const dst_reg &dst, const src_reg &value); - vec4_instruction *FBL(const dst_reg &dst, const src_reg &value); - vec4_instruction *CBIT(const dst_reg &dst, const src_reg &value); - vec4_instruction *MAD(const dst_reg &dst, const src_reg &c, - const src_reg &b, const src_reg &a); - vec4_instruction *ADDC(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - vec4_instruction *SUBB(const dst_reg &dst, const src_reg &src0, - const src_reg &src1); - - int implied_mrf_writes(vec4_instruction *inst); - - bool try_rewrite_rhs_to_dst(ir_assignment *ir, - dst_reg dst, - src_reg src, - vec4_instruction *pre_rhs_inst, - vec4_instruction *last_rhs_inst); - - /** Walks an exec_list of ir_instruction and sends it through this visitor. 
*/ - void visit_instructions(const exec_list *list); + instruction *SCRATCH_READ(vec4_builder &bld, const dst_reg &dst, + const src_reg &index); + instruction *SCRATCH_WRITE(vec4_builder &bld, const dst_reg &dst, + const src_reg &src, const src_reg &index); + + void emit_pull_constant_load(vec4_builder &bld, + const dst_reg &dst, + const src_reg &surf_index, + uint32_t off, + const src_reg *reladdr, + unsigned num_components); void emit_vp_sop(enum brw_conditional_mod condmod, dst_reg dst, src_reg src0, src_reg src1, src_reg one); - void emit_bool_to_cond_code(ir_rvalue *ir, enum brw_predicate *predicate); - void emit_if_gen6(ir_if *ir); - - void emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, - src_reg src0, src_reg src1); - - void emit_lrp(const dst_reg &dst, - const src_reg &x, const src_reg &y, const src_reg &a); - - void emit_block_move(dst_reg *dst, src_reg *src, - const struct glsl_type *type, brw_predicate predicate); - - void emit_constant_values(dst_reg *dst, ir_constant *value); - - /** - * Emit the correct dot-product instruction for the type of arguments - */ - void emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements); - void emit_scalar(ir_instruction *ir, enum prog_opcode op, dst_reg dst, src_reg src0); @@ -354,69 +197,53 @@ public: void emit_scs(ir_instruction *ir, enum prog_opcode op, dst_reg dst, const src_reg &src); - src_reg fix_3src_operand(src_reg src); - - void emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src); - void emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src); - void emit_math(enum opcode opcode, dst_reg dst, src_reg src); - void emit_math2_gen6(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1); - void emit_math2_gen4(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1); - void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1); - src_reg fix_math_operand(src_reg src); - void emit_pack_half_2x16(dst_reg dst, src_reg src0); void emit_unpack_half_2x16(dst_reg dst, src_reg src0); - uint32_t gather_channel(ir_texture *ir, uint32_t sampler); - src_reg emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler); - void emit_gen6_gather_wa(uint8_t wa, dst_reg dst); - void swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler); - void emit_ndc_computation(); void emit_psiz_and_flags(dst_reg reg); void emit_clip_distances(dst_reg reg, int offset); void emit_generic_urb_slot(dst_reg reg, int varying); void emit_urb_slot(dst_reg reg, int varying); - void emit_shader_time_begin(); - void emit_shader_time_end(); - void emit_shader_time_write(enum shader_time_shader_type type, - src_reg value); - - void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg offset, src_reg src0, - src_reg src1); - - void emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg offset); - src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, src_reg *reladdr, int reg_offset); - src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset); void emit_scratch_read(bblock_t *block, vec4_instruction *inst, dst_reg dst, src_reg orig_src, int base_offset); void emit_scratch_write(bblock_t *block, vec4_instruction *inst, int base_offset); - void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, - dst_reg dst, - src_reg orig_src, - int base_offset); - bool try_emit_mad(ir_expression *ir); - bool try_emit_b2f_of_compare(ir_expression *ir); - void 
resolve_ud_negate(src_reg *reg); + vec4_instruction *emit_texture(ir_texture *ir, const dst_reg &dst, + const src_reg &coordinate, + const src_reg &shadow_c, + const src_reg &lod, const src_reg &lod2, + const src_reg &offset_val, + const src_reg &sample_index, + const src_reg &mcs, const src_reg &sampler); - src_reg get_timestamp(); + src_reg emit_untyped_surface_header(); bool process_move_condition(ir_rvalue *ir); void dump_instruction(backend_instruction *inst); void dump_instruction(backend_instruction *inst, FILE *file); - void visit_atomic_counter_intrinsic(ir_call *ir); + void try_replace_with_sel() {} + + bool + emit_interpolate_expression(ir_expression *ir) + { + unreachable("not reached"); + } + + const struct brw_sampler_prog_key_data * + sampler_prog_key() const { + return &key->tex; + } + + void no16(const char *msg, ...) {} protected: void emit_vertex(); @@ -432,7 +259,6 @@ protected: virtual void emit_thread_end() = 0; virtual void emit_urb_write_header(int mrf) = 0; virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0; - virtual int compute_array_stride(ir_dereference_array *ir); const bool debug_flag; @@ -441,10 +267,6 @@ private: * If true, then register allocation should fail instead of spilling. */ const bool no_spills; - - const shader_time_shader_type st_base; - const shader_time_shader_type st_written; - const shader_time_shader_type st_reset; }; @@ -537,11 +359,13 @@ private: void generate_untyped_atomic(vec4_instruction *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg atomic_op, struct brw_reg surf_index); void generate_untyped_surface_read(vec4_instruction *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg surf_index); struct brw_context *brw; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index b0a5c0a65e9..3ccac54e436 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -163,18 +163,17 @@ vec4_visitor::opt_cse_local(bblock_t *block) entry->tmp = src_reg(this, glsl_type::float_type); entry->tmp.type = inst->dst.type; entry->tmp.swizzle = BRW_SWIZZLE_XYZW; - - vec4_instruction *copy = MOV(entry->generator->dst, entry->tmp); - entry->generator->insert_after(block, copy); + bld.at(block, (vec4_instruction *)entry->generator->next) + .MOV(entry->generator->dst, entry->tmp); entry->generator->dst = dst_reg(entry->tmp); } /* dest <- temp */ if (!inst->dst.is_null()) { assert(inst->dst.type == entry->tmp.type); - vec4_instruction *copy = MOV(inst->dst, entry->tmp); + vec4_instruction *copy = + bld.at(block, inst).MOV(inst->dst, entry->tmp); copy->force_writemask_all = inst->force_writemask_all; - inst->insert_before(block, copy); } /* Set our iterator so that next time through the loop inst->next diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index e5225673812..308a2114212 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1098,6 +1098,7 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, void vec4_generator::generate_untyped_atomic(vec4_instruction *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg atomic_op, struct brw_reg surf_index) { @@ -1106,8 +1107,7 @@ vec4_generator::generate_untyped_atomic(vec4_instruction *inst, surf_index.file == BRW_IMMEDIATE_VALUE && surf_index.type == BRW_REGISTER_TYPE_UD); - brw_untyped_atomic(p, dst, 
brw_message_reg(inst->base_mrf), - atomic_op.dw1.ud, surf_index.dw1.ud, + brw_untyped_atomic(p, dst, payload, atomic_op.dw1.ud, surf_index.dw1.ud, inst->mlen, 1); brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); @@ -1116,13 +1116,13 @@ vec4_generator::generate_untyped_atomic(vec4_instruction *inst, void vec4_generator::generate_untyped_surface_read(vec4_instruction *inst, struct brw_reg dst, + struct brw_reg payload, struct brw_reg surf_index) { assert(surf_index.file == BRW_IMMEDIATE_VALUE && surf_index.type == BRW_REGISTER_TYPE_UD); - brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf), - surf_index.dw1.ud, + brw_untyped_surface_read(p, dst, payload, surf_index.dw1.ud, inst->mlen, 1); brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); @@ -1479,11 +1479,11 @@ vec4_generator::generate_code(const cfg_t *cfg) break; case SHADER_OPCODE_UNTYPED_ATOMIC: - generate_untyped_atomic(inst, dst, src[0], src[1]); + generate_untyped_atomic(inst, dst, src[0], src[1], src[2]); break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: - generate_untyped_surface_read(inst, dst, src[0]); + generate_untyped_surface_read(inst, dst, src[0], src[1]); break; case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index c569e0aa4ca..ce3ed7f65b3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -41,8 +41,7 @@ vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw, bool no_spills) : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base, &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx, - INTEL_DEBUG & DEBUG_GS, no_spills, - ST_GS, ST_GS_WRITTEN, ST_GS_RESET), + INTEL_DEBUG & DEBUG_GS, no_spills, ST_GS), c(c) { } @@ -55,8 +54,8 @@ vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir) switch (ir->data.location) { case SYSTEM_VALUE_INVOCATION_ID: - this->current_annotation = "initialize gl_InvocationID"; - emit(GS_OPCODE_GET_INSTANCE_ID, *reg); + bld.set_annotation("initialize gl_InvocationID"); + bld.emit(GS_OPCODE_GET_INSTANCE_ID, *reg); break; default: unreachable("not reached"); @@ -148,17 +147,17 @@ vec4_gs_visitor::emit_prolog() * reads/writes to garbage memory). So just set it to zero at the top of * the shader. */ - this->current_annotation = "clear r0.2"; + bld.set_annotation("clear r0.2"); dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, 0u); + vec4_instruction *inst = bld.emit(GS_OPCODE_SET_DWORD_2, r0, 0u); inst->force_writemask_all = true; /* Create a virtual register to hold the vertex count */ this->vertex_count = src_reg(this, glsl_type::uint_type); /* Initialize the vertex_count register to 0 */ - this->current_annotation = "initialize vertex_count"; - inst = emit(MOV(dst_reg(this->vertex_count), 0u)); + bld.set_annotation("initialize vertex_count"); + inst = bld.MOV(dst_reg(this->vertex_count), 0u); inst->force_writemask_all = true; if (c->control_data_header_size_bits > 0) { @@ -172,8 +171,8 @@ vec4_gs_visitor::emit_prolog() * Otherwise, we need to initialize it to 0 here. 
*/ if (c->control_data_header_size_bits <= 32) { - this->current_annotation = "initialize control data bits"; - inst = emit(MOV(dst_reg(this->control_data_bits), 0u)); + bld.set_annotation("initialize control data bits"); + inst = bld.MOV(dst_reg(this->control_data_bits), 0u); inst->force_writemask_all = true; } } @@ -183,7 +182,7 @@ vec4_gs_visitor::emit_prolog() * component of VARYING_SLOT_PSIZ. */ if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) { - this->current_annotation = "swizzle gl_PointSize input"; + bld.set_annotation("swizzle gl_PointSize input"); for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) { dst_reg dst(ATTR, BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ); @@ -191,7 +190,7 @@ vec4_gs_visitor::emit_prolog() src_reg src(dst); dst.writemask = WRITEMASK_X; src.swizzle = BRW_SWIZZLE_WWWW; - inst = emit(MOV(dst, src)); + inst = bld.MOV(dst, src); /* In dual instanced dispatch mode, dst has a width of 4, so we need * to make sure the MOV happens regardless of which channels are @@ -201,7 +200,7 @@ vec4_gs_visitor::emit_prolog() } } - this->current_annotation = NULL; + bld.set_annotation(NULL); } @@ -222,7 +221,7 @@ vec4_gs_visitor::emit_thread_end() * corresponding to the most recently output vertex still need to be * emitted. */ - current_annotation = "thread end: emit control data bits"; + bld.set_annotation("thread end: emit control data bits"); emit_control_data_bits(); } @@ -231,15 +230,15 @@ vec4_gs_visitor::emit_thread_end() */ int base_mrf = 1; - current_annotation = "thread end"; + bld.set_annotation("thread end"); dst_reg mrf_reg(MRF, base_mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + vec4_instruction *inst = bld.MOV(mrf_reg, r0); inst->force_writemask_all = true; - emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count); + bld.emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count); if (INTEL_DEBUG & DEBUG_SHADER_TIME) emit_shader_time_end(); - inst = emit(GS_OPCODE_THREAD_END); + inst = bld.emit(GS_OPCODE_THREAD_END); inst->base_mrf = base_mrf; inst->mlen = 1; } @@ -258,10 +257,10 @@ vec4_gs_visitor::emit_urb_write_header(int mrf) */ dst_reg mrf_reg(MRF, mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - this->current_annotation = "URB write header"; - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + bld.set_annotation("URB write header"); + vec4_instruction *inst = bld.MOV(mrf_reg, r0); inst->force_writemask_all = true; - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count, + bld.emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count, (uint32_t) c->prog_data.output_vertex_size_hwords); } @@ -275,7 +274,7 @@ vec4_gs_visitor::emit_urb_write_opcode(bool complete) */ (void) complete; - vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE); + vec4_instruction *inst = bld.emit(GS_OPCODE_URB_WRITE); inst->offset = c->prog_data.control_data_header_size_hwords; /* We need to increment Global Offset by 1 to make room for Broadwell's @@ -288,9 +287,8 @@ vec4_gs_visitor::emit_urb_write_opcode(bool complete) return inst; } - -int -vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir) +unsigned +vec4_gs_visitor::get_array_stride(ir_dereference_array *ir) { /* Geometry shader inputs are arrays, but they use an unusual array layout: * instead of all array elements for a given geometry shader input being @@ -303,7 +301,7 @@ vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir) if (deref_var && 
deref_var->var->data.mode == ir_var_shader_in) return BRW_VARYING_SLOT_COUNT; else - return vec4_visitor::compute_array_stride(ir); + return backend_visitor::get_array_stride(ir); } @@ -349,8 +347,8 @@ vec4_gs_visitor::emit_control_data_bits() /* If vertex_count is 0, then no control data bits have been accumulated * yet, so we should do nothing. */ - emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ)); - emit(IF(BRW_PREDICATE_NORMAL)); + bld.CMP(bld.reg_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ); + bld.IF(BRW_PREDICATE_NORMAL); { /* If we are using either channel masks or a per-slot offset, then we * need to figure out which DWORD we are trying to write to, using the @@ -366,11 +364,11 @@ vec4_gs_visitor::emit_control_data_bits() src_reg dword_index(this, glsl_type::uint_type); if (urb_write_flags) { src_reg prev_count(this, glsl_type::uint_type); - emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); + bld.ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu); unsigned log2_bits_per_vertex = _mesa_fls(c->control_data_bits_per_vertex); - emit(SHR(dst_reg(dword_index), prev_count, - (uint32_t) (6 - log2_bits_per_vertex))); + bld.SHR(dst_reg(dword_index), prev_count, + (uint32_t) (6 - log2_bits_per_vertex)); } /* Start building the URB write message. The first MRF gets a copy of @@ -379,7 +377,7 @@ vec4_gs_visitor::emit_control_data_bits() int base_mrf = 1; dst_reg mrf_reg(MRF, base_mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); + vec4_instruction *inst = bld.MOV(mrf_reg, r0); inst->force_writemask_all = true; if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { @@ -387,8 +385,8 @@ vec4_gs_visitor::emit_control_data_bits() * the appropriate OWORD within the control data header. */ src_reg per_slot_offset(this, glsl_type::uint_type); - emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); + bld.SHR(dst_reg(per_slot_offset), dword_index, 2u); + bld.emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); } if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { @@ -400,24 +398,24 @@ vec4_gs_visitor::emit_control_data_bits() * together. */ src_reg channel(this, glsl_type::uint_type); - inst = emit(AND(dst_reg(channel), dword_index, 3u)); + inst = bld.AND(dst_reg(channel), dword_index, 3u); inst->force_writemask_all = true; src_reg one(this, glsl_type::uint_type); - inst = emit(MOV(dst_reg(one), 1u)); + inst = bld.MOV(dst_reg(one), 1u); inst->force_writemask_all = true; src_reg channel_mask(this, glsl_type::uint_type); - inst = emit(SHL(dst_reg(channel_mask), one, channel)); + inst = bld.SHL(dst_reg(channel_mask), one, channel); inst->force_writemask_all = true; - emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), + bld.emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), channel_mask); - emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); + bld.emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); } /* Store the control data bits in the message payload and send it. 
*/ dst_reg mrf_reg2(MRF, base_mrf + 1); - inst = emit(MOV(mrf_reg2, this->control_data_bits)); + inst = bld.MOV(mrf_reg2, this->control_data_bits); inst->force_writemask_all = true; - inst = emit(GS_OPCODE_URB_WRITE); + inst = bld.emit(GS_OPCODE_URB_WRITE); inst->urb_write_flags = urb_write_flags; /* We need to increment Global Offset by 256-bits to make room for * Broadwell's extra "Vertex Count" payload at the beginning of the @@ -429,7 +427,7 @@ vec4_gs_visitor::emit_control_data_bits() inst->base_mrf = base_mrf; inst->mlen = 2; } - emit(BRW_OPCODE_ENDIF); + bld.emit(BRW_OPCODE_ENDIF); } void @@ -455,11 +453,11 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id) /* reg::sid = stream_id */ src_reg sid(this, glsl_type::uint_type); - emit(MOV(dst_reg(sid), stream_id)); + bld.MOV(dst_reg(sid), stream_id); /* reg:shift_count = 2 * (vertex_count - 1) */ src_reg shift_count(this, glsl_type::uint_type); - emit(SHL(dst_reg(shift_count), this->vertex_count, 1u)); + bld.SHL(dst_reg(shift_count), this->vertex_count, 1u); /* Note: we're relying on the fact that the GEN SHL instruction only pays * attention to the lower 5 bits of its second source argument, so on this @@ -467,23 +465,23 @@ vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id) * stream_id << ((2 * (vertex_count - 1)) % 32). */ src_reg mask(this, glsl_type::uint_type); - emit(SHL(dst_reg(mask), sid, shift_count)); - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); + bld.SHL(dst_reg(mask), sid, shift_count); + bld.OR(dst_reg(this->control_data_bits), this->control_data_bits, mask); } void vec4_gs_visitor::visit(ir_emit_vertex *ir) { - this->current_annotation = "emit vertex: safety check"; + bld.set_annotation("emit vertex: safety check"); /* To ensure that we don't output more vertices than the shader specified * using max_vertices, do the logic inside a conditional of the form "if * (vertex_count < MAX)" */ unsigned num_output_vertices = c->gp->program.VerticesOut; - emit(CMP(dst_null_d(), this->vertex_count, - src_reg(num_output_vertices), BRW_CONDITIONAL_L)); - emit(IF(BRW_PREDICATE_NORMAL)); + bld.CMP(bld.reg_null_d(), this->vertex_count, + src_reg(num_output_vertices), BRW_CONDITIONAL_L); + bld.IF(BRW_PREDICATE_NORMAL); { /* If we're outputting 32 control data bits or less, then we can wait * until the shader is over to output them all. Otherwise we need to @@ -493,7 +491,7 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) * correct. */ if (c->control_data_header_size_bits > 32) { - this->current_annotation = "emit vertex: emit control data bits"; + bld.set_annotation("emit vertex: emit control data bits"); /* Only emit control data bits if we've finished accumulating a batch * of 32 bits. This is the case when: * @@ -513,10 +511,10 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) * vertex_count & (32 / bits_per_vertex - 1) == 0 */ vec4_instruction *inst = - emit(AND(dst_null_d(), this->vertex_count, - (uint32_t) (32 / c->control_data_bits_per_vertex - 1))); + bld.AND(bld.reg_null_d(), this->vertex_count, + (uint32_t) (32 / c->control_data_bits_per_vertex - 1)); inst->conditional_mod = BRW_CONDITIONAL_Z; - emit(IF(BRW_PREDICATE_NORMAL)); + bld.IF(BRW_PREDICATE_NORMAL); { emit_control_data_bits(); @@ -527,13 +525,13 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) * effect of any call to EndPrimitive() that the shader may have * made before outputting its first vertex. 
*/ - inst = emit(MOV(dst_reg(this->control_data_bits), 0u)); + inst = bld.MOV(dst_reg(this->control_data_bits), 0u); inst->force_writemask_all = true; } - emit(BRW_OPCODE_ENDIF); + bld.emit(BRW_OPCODE_ENDIF); } - this->current_annotation = "emit vertex: vertex data"; + bld.set_annotation("emit vertex: vertex data"); emit_vertex(); /* In stream mode we have to set control data bits for all vertices @@ -543,17 +541,17 @@ vec4_gs_visitor::visit(ir_emit_vertex *ir) if (c->control_data_header_size_bits > 0 && c->prog_data.control_data_format == GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { - this->current_annotation = "emit vertex: Stream control data bits"; + bld.set_annotation("emit vertex: Stream control data bits"); set_stream_control_data_bits(ir->stream_id()); } - this->current_annotation = "emit vertex: increment vertex count"; - emit(ADD(dst_reg(this->vertex_count), this->vertex_count, - src_reg(1u))); + bld.set_annotation("emit vertex: increment vertex count"); + bld.ADD(dst_reg(this->vertex_count), this->vertex_count, + src_reg(1u)); } - emit(BRW_OPCODE_ENDIF); + bld.emit(BRW_OPCODE_ENDIF); - this->current_annotation = NULL; + bld.set_annotation(NULL); } void @@ -594,17 +592,17 @@ vec4_gs_visitor::visit(ir_end_primitive *) /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ src_reg one(this, glsl_type::uint_type); - emit(MOV(dst_reg(one), 1u)); + bld.MOV(dst_reg(one), 1u); src_reg prev_count(this, glsl_type::uint_type); - emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); + bld.ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu); src_reg mask(this, glsl_type::uint_type); /* Note: we're relying on the fact that the GEN SHL instruction only pays * attention to the lower 5 bits of its second source argument, so on this * architecture, 1 << (vertex_count - 1) is equivalent to 1 << * ((vertex_count - 1) % 32). 
*/ - emit(SHL(dst_reg(mask), one, prev_count)); - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); + bld.SHL(dst_reg(mask), one, prev_count); + bld.OR(dst_reg(this->control_data_bits), this->control_data_bits, mask); } static const unsigned * diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h index 8bf11facb0b..1c8e7ad876c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h @@ -91,7 +91,7 @@ protected: virtual void emit_thread_end(); virtual void emit_urb_write_header(int mrf); virtual vec4_instruction *emit_urb_write_opcode(bool complete); - virtual int compute_array_stride(ir_dereference_array *ir); + virtual unsigned get_array_stride(ir_dereference_array *ir); virtual void visit(ir_emit_vertex *); virtual void visit(ir_end_primitive *); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index de04d881d8b..25a48fd5f4e 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -23,7 +23,6 @@ #include "brw_vec4.h" #include "brw_cfg.h" -#include "glsl/ir_uniform.h" extern "C" { #include "program/sampler.h" } @@ -62,198 +61,11 @@ vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst, } vec4_instruction * -vec4_visitor::emit(vec4_instruction *inst) +vec4_visitor::SCRATCH_READ(vec4_builder &bld, const dst_reg &dst, + const src_reg &index) { - inst->ir = this->base_ir; - inst->annotation = this->current_annotation; - - this->instructions.push_tail(inst); - - return inst; -} - -vec4_instruction * -vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst, - vec4_instruction *new_inst) -{ - new_inst->ir = inst->ir; - new_inst->annotation = inst->annotation; - - inst->insert_before(block, new_inst); - - return inst; -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, dst_reg dst, - src_reg src0, src_reg src1, src_reg src2) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); -} - - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, dst_reg dst) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); -} - -#define ALU1(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ - { \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \ - } - -#define ALU2(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ - const src_reg &src1) \ - { \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ - src0, src1); \ - } - -#define ALU2_ACC(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ - const src_reg &src1) \ - { \ - vec4_instruction *inst = new(mem_ctx) vec4_instruction( \ - BRW_OPCODE_##op, dst, src0, src1); \ - inst->writes_accumulator = true; \ - return inst; \ - } - -#define ALU3(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg 
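The ir_end_primitive sequence above reduces to setting a single bit in the 32-bit control-data accumulator; as a scalar model (illustration only):

   #include <cstdint>

   static uint32_t end_primitive_model(uint32_t control_data_bits,
                                       uint32_t vertex_count)
   {
      // ADD with 0xffffffffu is the two's-complement "vertex_count - 1";
      // the SHL again honours only the low 5 bits of the shift count,
      // giving the "% 32" noted in the comment above.
      const uint32_t prev_count = vertex_count + 0xffffffffu;
      return control_data_bits | (1u << (prev_count % 32));
   }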
&src0, \ - const src_reg &src1, const src_reg &src2) \ - { \ - assert(brw->gen >= 6); \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ - src0, src1, src2); \ - } - -ALU1(NOT) -ALU1(MOV) -ALU1(FRC) -ALU1(RNDD) -ALU1(RNDE) -ALU1(RNDZ) -ALU1(F32TO16) -ALU1(F16TO32) -ALU2(ADD) -ALU2(MUL) -ALU2_ACC(MACH) -ALU2(AND) -ALU2(OR) -ALU2(XOR) -ALU2(DP3) -ALU2(DP4) -ALU2(DPH) -ALU2(SHL) -ALU2(SHR) -ALU2(ASR) -ALU3(LRP) -ALU1(BFREV) -ALU3(BFE) -ALU2(BFI1) -ALU3(BFI2) -ALU1(FBH) -ALU1(FBL) -ALU1(CBIT) -ALU3(MAD) -ALU2_ACC(ADDC) -ALU2_ACC(SUBB) -ALU2(MAC) - -/** Gen4 predicated IF. */ -vec4_instruction * -vec4_visitor::IF(enum brw_predicate predicate) -{ - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF); - inst->predicate = predicate; - - return inst; -} - -/** Gen6 IF with embedded comparison. */ -vec4_instruction * -vec4_visitor::IF(src_reg src0, src_reg src1, - enum brw_conditional_mod condition) -{ - assert(brw->gen == 6); - - vec4_instruction *inst; - - resolve_ud_negate(&src0); - resolve_ud_negate(&src1); - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), - src0, src1); - inst->conditional_mod = condition; - - return inst; -} - -/** - * CMP: Sets the low bit of the destination channels with the result - * of the comparison, while the upper bits are undefined, and updates - * the flag register with the packed 16 bits of the result. - */ -vec4_instruction * -vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, - enum brw_conditional_mod condition) -{ - vec4_instruction *inst; - - /* original gen4 does type conversion to the destination type - * before comparison, producing garbage results for floating - * point comparisons. - */ - if (brw->gen == 4) { - dst.type = src0.type; - if (dst.file == HW_REG) - dst.fixed_hw_reg.type = dst.type; - } - - resolve_ud_negate(&src0); - resolve_ud_negate(&src1); - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1); - inst->conditional_mod = condition; - - return inst; -} - -vec4_instruction * -vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) -{ - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ, - dst, index); + instruction *inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_READ, + dst, index); inst->base_mrf = 14; inst->mlen = 2; @@ -261,13 +73,11 @@ vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) } vec4_instruction * -vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, - const src_reg &index) +vec4_visitor::SCRATCH_WRITE(vec4_builder &bld, const dst_reg &dst, + const src_reg &src, const src_reg &index) { - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE, - dst, src, index); + instruction *inst = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, + dst, src, index); inst->base_mrf = 13; inst->mlen = 3; @@ -275,167 +85,48 @@ vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, void -vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements) +vec4_visitor::emit_pull_constant_load(vec4_builder &bld, + const dst_reg &dst, + const src_reg &surf_index, + uint32_t off, + const src_reg *reladdr, + unsigned num_components) { - static enum opcode dot_opcodes[] = { - BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4 - }; - - emit(dot_opcodes[elements - 2], dst, src0, src1); -} - -src_reg -vec4_visitor::fix_3src_operand(src_reg src) -{ - /* Using vec4 uniforms in SIMD4x2 programs is difficult.
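The emit() helpers deleted above move behind the builder object that SCRATCH_READ and SCRATCH_WRITE now take. The real brw::vec4_builder interface is only used, never defined, in this diff, so the class below is merely a sketch of the pattern under that assumption: the builder owns the instruction stream and the current annotation, so every instruction it creates is tagged uniformly, and the per-opcode helpers can be shared between the FS and VEC4 back-ends.

   #include <list>

   struct sketch_inst {
      int opcode;
      const char *annotation;
   };

   class sketch_builder {
   public:
      explicit sketch_builder(std::list<sketch_inst *> *insts)
         : insts_(insts), annotation_(nullptr) {}

      void set_annotation(const char *a) { annotation_ = a; }

      // Every ALU helper (MOV, ADD, CMP, ...) bottoms out here, so the
      // annotation bookkeeping lives in exactly one place.
      sketch_inst *emit(int opcode) {
         sketch_inst *inst = new sketch_inst{opcode, annotation_};
         insts_->push_back(inst);
         return inst;
      }

   private:
      std::list<sketch_inst *> *insts_;
      const char *annotation_;
   };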
You'd like to be - * able to use vertical stride of zero to replicate the vec4 uniform, like - * - * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] - * - * But you can't, since vertical stride is always four in three-source - * instructions. Instead, insert a MOV instruction to do the replication so - * that the three-source instruction can consume it. - */ - - /* The MOV is only needed if the source is a uniform or immediate. */ - if (src.file != UNIFORM && src.file != IMM) - return src; - - if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) - return src; - - dst_reg expanded = dst_reg(this, glsl_type::vec4_type); - expanded.type = src.type; - emit(MOV(expanded, src)); - return src_reg(expanded); -} - -src_reg -vec4_visitor::fix_math_operand(src_reg src) -{ - /* The gen6 math instruction ignores the source modifiers -- - * swizzle, abs, negate, and at least some parts of the register - * region description. - * - * Rather than trying to enumerate all these cases, *always* expand the - * operand to a temp GRF for gen6. - * - * For gen7, keep the operand as-is, except if immediate, which gen7 still - * can't use. + /* Pre-gen6, the message header uses byte offsets instead of vec4 + * (16-byte) offset units. */ + const unsigned scale = (brw->gen >= 6 ? 16 : 1); + src_reg result(bld.vector(num_components).natural_reg(dst.type)); + src_reg addr; - if (brw->gen == 7 && src.file != IMM) - return src; - - dst_reg expanded = dst_reg(this, glsl_type::vec4_type); - expanded.type = src.type; - emit(MOV(expanded, src)); - return src_reg(expanded); -} - -void -vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src) -{ - src = fix_math_operand(src); - - if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) { - /* The gen6 math instruction must be align1, so we can't do - * writemasks. - */ - dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type); - - emit(opcode, temp_dst, src); - - emit(MOV(dst, src_reg(temp_dst))); + if (reladdr) { + addr = src_reg(bld.scalar_reg(reladdr->type)); + bld.ADD(dst_reg(addr), *reladdr, src_reg(off / 16)); + if (scale == 1) + bld.SHL(dst_reg(addr), addr, src_reg(4)); } else { - emit(opcode, dst, src); + addr = src_reg((off & ~0xf) / scale); } -} -void -vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src) -{ - vec4_instruction *inst = emit(opcode, dst, src); - inst->base_mrf = 1; - inst->mlen = 1; -} - -void -vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src) -{ - switch (opcode) { - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - break; - default: - unreachable("not reached: bad math opcode"); - } - - if (brw->gen >= 8) { - emit(opcode, dst, src); - } else if (brw->gen >= 6) { - emit_math1_gen6(opcode, dst, src); - } else { - emit_math1_gen4(opcode, dst, src); - } -} - -void -vec4_visitor::emit_math2_gen6(enum opcode opcode, - dst_reg dst, src_reg src0, src_reg src1) -{ - src0 = fix_math_operand(src0); - src1 = fix_math_operand(src1); - - if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) { - /* The gen6 math instruction must be align1, so we can't do - * writemasks. 
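The offset juggling in the new emit_pull_constant_load() above is a unit conversion; a scalar model of the constant-offset path (illustration only):

   #include <cstdint>

   static uint32_t pull_constant_msg_offset(int gen, uint32_t off_bytes)
   {
      // gen6+ messages take the offset in vec4 (16-byte) units, earlier
      // parts take it in bytes, hence the scale of 16 vs. 1 above.
      const uint32_t scale = (gen >= 6 ? 16 : 1);
      return (off_bytes & ~0xfu) / scale;   // align to the vec4, then scale
   }

   // The reladdr path computes the same value at run time: the relative
   // index is added to off / 16 in vec4 units, then shifted left by 4
   // when the message wants bytes (the scale == 1 case).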
- */ - dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type); - temp_dst.type = dst.type; - - emit(opcode, temp_dst, src0, src1); + if (brw->gen >= 7) { + if (addr.file == IMM) { + dst_reg tmp = bld.scalar_reg(addr.type); + bld.MOV(tmp, addr); + addr = src_reg(tmp); + } - emit(MOV(dst, src_reg(temp_dst))); + bld.emit(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, dst_reg(result), + surf_index, addr); } else { - emit(opcode, dst, src0, src1); - } -} - -void -vec4_visitor::emit_math2_gen4(enum opcode opcode, - dst_reg dst, src_reg src0, src_reg src1) -{ - vec4_instruction *inst = emit(opcode, dst, src0, src1); - inst->base_mrf = 1; - inst->mlen = 2; -} - -void -vec4_visitor::emit_math(enum opcode opcode, - dst_reg dst, src_reg src0, src_reg src1) -{ - switch (opcode) { - case SHADER_OPCODE_POW: - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - break; - default: - unreachable("not reached: unsupported binary math opcode"); + vec4_instruction *pull = bld.emit(VS_OPCODE_PULL_CONSTANT_LOAD, + dst_reg(result), surf_index, addr); + pull->base_mrf = 14; + pull->mlen = 1; } - if (brw->gen >= 8) { - emit(opcode, dst, src0, src1); - } else if (brw->gen >= 6) { - emit_math2_gen6(opcode, dst, src0, src1); - } else { - emit_math2_gen4(opcode, dst, src0, src1); - } + result.swizzle += BRW_SWIZZLE4(off % 16 / 4, off % 16 / 4, + off % 16 / 4, off % 16 / 4); + bld.MOV(dst, result); } void @@ -486,7 +177,7 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) * You should inspect the disasm output in order to verify that the MOV is * not optimized away. */ - emit(MOV(tmp_dst, src_reg(0x12345678u))); + bld.MOV(tmp_dst, src_reg(0x12345678u)); #endif /* Give tmp the form below, where "." means untouched. @@ -499,20 +190,20 @@ vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) * relies on the undocumented hardware behavior mentioned above. */ tmp_dst.writemask = WRITEMASK_XY; - emit(F32TO16(tmp_dst, src0)); + bld.F32TO16(tmp_dst, src0); /* Give the write-channels of dst the form: * 0xhhhh0000 */ tmp_src.swizzle = BRW_SWIZZLE_YYYY; - emit(SHL(dst, tmp_src, src_reg(16u))); + bld.SHL(dst, tmp_src, src_reg(16u)); /* Finally, give the write-channels of dst the form of packHalf2x16's * output: * 0xhhhhllll */ tmp_src.swizzle = BRW_SWIZZLE_XXXX; - emit(OR(dst, src_reg(dst), tmp_src)); + bld.OR(dst, src_reg(dst), tmp_src); } void @@ -544,70 +235,13 @@ vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) src_reg tmp_src(tmp_dst); tmp_dst.writemask = WRITEMASK_X; - emit(AND(tmp_dst, src0, src_reg(0xffffu))); + bld.AND(tmp_dst, src0, src_reg(0xffffu)); tmp_dst.writemask = WRITEMASK_Y; - emit(SHR(tmp_dst, src0, src_reg(16u))); + bld.SHR(tmp_dst, src0, src_reg(16u)); dst.writemask = WRITEMASK_XY; - emit(F16TO32(dst, tmp_src)); -} - -void -vec4_visitor::visit_instructions(const exec_list *list) -{ - foreach_in_list(ir_instruction, ir, list) { - base_ir = ir; - ir->accept(this); - } -} - - -static int -type_size(const struct glsl_type *type) -{ - unsigned int i; - int size; - - switch (type->base_type) { - case GLSL_TYPE_UINT: - case GLSL_TYPE_INT: - case GLSL_TYPE_FLOAT: - case GLSL_TYPE_BOOL: - if (type->is_matrix()) { - return type->matrix_columns; - } else { - /* Regardless of size of vector, it gets a vec4. This is bad - * packing for things like floats, but otherwise arrays become a - * mess. Hopefully a later pass over the code can pack scalars - * down if appropriate. 
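A reference model of the packing produced by the three instructions above, F32TO16 into .xy, SHL of .y by 16, OR with .x (f32_to_f16() stands in for the hardware conversion and is assumed, not defined by this diff):

   #include <cstdint>

   extern uint16_t f32_to_f16(float f);   // hardware F32TO16 stand-in

   static uint32_t pack_half_2x16_model(float x, float y)
   {
      const uint32_t lo = f32_to_f16(x);   // written to .x, bits 15:0
      const uint32_t hi = f32_to_f16(y);   // taken from .y, SHL'd by 16
      return (hi << 16) | lo;              // ORed together: 0xhhhhllll
   }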
- */ - return 1; - } - case GLSL_TYPE_ARRAY: - assert(type->length > 0); - return type_size(type->fields.array) * type->length; - case GLSL_TYPE_STRUCT: - size = 0; - for (i = 0; i < type->length; i++) { - size += type_size(type->fields.structure[i].type); - } - return size; - case GLSL_TYPE_SAMPLER: - /* Samplers take up no register space, since they're baked in at - * link time. - */ - return 0; - case GLSL_TYPE_ATOMIC_UINT: - return 0; - case GLSL_TYPE_IMAGE: - case GLSL_TYPE_VOID: - case GLSL_TYPE_ERROR: - case GLSL_TYPE_INTERFACE: - unreachable("not reached"); - } - - return 0; + bld.F16TO32(dst, tmp_src); } src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) @@ -615,7 +249,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type)); + this->reg = v->alloc.allocate(v->type_size(type)); if (type->is_array() || type->is_record()) { this->swizzle = BRW_SWIZZLE_NOOP; @@ -633,7 +267,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type) * size); + this->reg = v->alloc.allocate(v->type_size(type) * size); this->swizzle = BRW_SWIZZLE_NOOP; @@ -645,7 +279,7 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) init(); this->file = GRF; - this->reg = v->alloc.allocate(type_size(type)); + this->reg = v->alloc.allocate(v->type_size(type)); if (type->is_array() || type->is_record()) { this->writemask = WRITEMASK_XYZW; @@ -656,55 +290,6 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) this->type = brw_type_for_base_type(type); } -/* Our support for uniforms is piggy-backed on the struct - * gl_fragment_program, because that's where the values actually - * get stored, rather than in some global gl_shader_program uniform - * store. - */ -void -vec4_visitor::setup_uniform_values(ir_variable *ir) -{ - int namelen = strlen(ir->name); - - /* The data for our (non-builtin) uniforms is stored in a series of - * gl_uniform_driver_storage structs for each subcomponent that - * glGetUniformLocation() could name. We know it's been set up in the same - * order we'd walk the type, so walk the list of storage and find anything - * with our name, or the prefix of a component that starts with our name. - */ - for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) { - struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; - - if (strncmp(ir->name, storage->name, namelen) != 0 || - (storage->name[namelen] != 0 && - storage->name[namelen] != '.' && - storage->name[namelen] != '[')) { - continue; - } - - gl_constant_value *components = storage->storage; - unsigned vector_count = (MAX2(storage->array_elements, 1) * - storage->type->matrix_columns); - - for (unsigned s = 0; s < vector_count; s++) { - assert(uniforms < uniform_array_size); - uniform_vector_size[uniforms] = storage->type->vector_elements; - - int i; - for (i = 0; i < uniform_vector_size[uniforms]; i++) { - stage_prog_data->param[uniforms * 4 + i] = components; - components++; - } - for (; i < 4; i++) { - static gl_constant_value zero = { 0.0 }; - stage_prog_data->param[uniforms * 4 + i] = &zero; - } - - uniforms++; - } - } -} - void vec4_visitor::setup_uniform_clipplane_values() { @@ -723,270 +308,6 @@ vec4_visitor::setup_uniform_clipplane_values() } } -/* Our support for builtin uniforms is even scarier than non-builtin. 
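The vec4 packing rule implemented by the removed type_size() (now obtained from the shared v->type_size()), modeled on a toy type tree; glsl_type itself is not reproduced here, so the struct is an assumption for illustration:

   #include <vector>

   struct toy_type {
      int columns;                     // matrix_columns; 1 for scalars/vectors
      int array_length;                // 0 when not an array
      std::vector<toy_type> fields;    // struct members, empty otherwise
   };

   static int toy_type_size(const toy_type &t)
   {
      if (!t.fields.empty()) {         // struct: sum of the members
         int sum = 0;
         for (const toy_type &f : t.fields)
            sum += toy_type_size(f);
         return sum;
      }
      int n = t.columns;               // every column gets a full vec4
      if (t.array_length)
         n *= t.array_length;          // arrays multiply the element size
      return n;                        // e.g. mat3 -> 3, float[10] -> 10
   }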
- * It sits on top of the PROG_STATE_VAR parameters that are - * automatically updated from GL context state. - */ -void -vec4_visitor::setup_builtin_uniform_values(ir_variable *ir) -{ - const ir_state_slot *const slots = ir->get_state_slots(); - assert(slots != NULL); - - for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) { - /* This state reference has already been setup by ir_to_mesa, - * but we'll get the same index back here. We can reference - * ParameterValues directly, since unlike brw_fs.cpp, we never - * add new state references during compile. - */ - int index = _mesa_add_state_reference(this->prog->Parameters, - (gl_state_index *)slots[i].tokens); - gl_constant_value *values = - &this->prog->Parameters->ParameterValues[index][0]; - - assert(this->uniforms < uniform_array_size); - this->uniform_vector_size[this->uniforms] = 0; - /* Add each of the unique swizzled channels of the element. - * This will end up matching the size of the glsl_type of this field. - */ - int last_swiz = -1; - for (unsigned int j = 0; j < 4; j++) { - int swiz = GET_SWZ(slots[i].swizzle, j); - last_swiz = swiz; - - stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz]; - assert(this->uniforms < uniform_array_size); - if (swiz <= last_swiz) - this->uniform_vector_size[this->uniforms]++; - } - this->uniforms++; - } -} - -dst_reg * -vec4_visitor::variable_storage(ir_variable *var) -{ - return (dst_reg *)hash_table_find(this->variable_ht, var); -} - -void -vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, - enum brw_predicate *predicate) -{ - ir_expression *expr = ir->as_expression(); - - *predicate = BRW_PREDICATE_NORMAL; - - if (expr && expr->operation != ir_binop_ubo_load) { - src_reg op[3]; - vec4_instruction *inst; - - assert(expr->get_num_operands() <= 3); - for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - expr->operands[i]->accept(this); - op[i] = this->result; - - resolve_ud_negate(&op[i]); - } - - switch (expr->operation) { - case ir_unop_logic_not: - inst = emit(AND(dst_null_d(), op[0], src_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_Z; - break; - - case ir_binop_logic_xor: - inst = emit(XOR(dst_null_d(), op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - - case ir_binop_logic_or: - inst = emit(OR(dst_null_d(), op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - - case ir_binop_logic_and: - inst = emit(AND(dst_null_d(), op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - - case ir_unop_f2b: - if (brw->gen >= 6) { - emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); - } else { - inst = emit(MOV(dst_null_f(), op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_unop_i2b: - if (brw->gen >= 6) { - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - } else { - inst = emit(MOV(dst_null_d(), op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } - break; - - case ir_binop_all_equal: - inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); - *predicate = BRW_PREDICATE_ALIGN16_ALL4H; - break; - - case ir_binop_any_nequal: - inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); - *predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - - case ir_unop_any: - inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - *predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - - case ir_binop_greater: - case ir_binop_gequal: - case ir_binop_less: - case ir_binop_lequal: - case ir_binop_equal: - case ir_binop_nequal: - 
emit(CMP(dst_null_d(), op[0], op[1], - brw_conditional_for_comparison(expr->operation))); - break; - - case ir_triop_csel: { - /* Expand the boolean condition into the flag register. */ - inst = emit(MOV(dst_null_d(), op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* Select which boolean to return. */ - dst_reg temp(this, expr->operands[1]->type); - inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]); - inst->predicate = BRW_PREDICATE_NORMAL; - - /* Expand the result to a condition code. */ - inst = emit(MOV(dst_null_d(), src_reg(temp))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - } - - default: - unreachable("not reached"); - } - return; - } - - ir->accept(this); - - resolve_ud_negate(&this->result); - - if (brw->gen >= 6) { - vec4_instruction *inst = emit(AND(dst_null_d(), - this->result, src_reg(1))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } else { - vec4_instruction *inst = emit(MOV(dst_null_d(), this->result)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - } -} - -/** - * Emit a gen6 IF statement with the comparison folded into the IF - * instruction. - */ -void -vec4_visitor::emit_if_gen6(ir_if *ir) -{ - ir_expression *expr = ir->condition->as_expression(); - - if (expr && expr->operation != ir_binop_ubo_load) { - src_reg op[3]; - dst_reg temp; - - assert(expr->get_num_operands() <= 3); - for (unsigned int i = 0; i < expr->get_num_operands(); i++) { - expr->operands[i]->accept(this); - op[i] = this->result; - } - - switch (expr->operation) { - case ir_unop_logic_not: - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z)); - return; - - case ir_binop_logic_xor: - emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_logic_or: - temp = dst_reg(this, glsl_type::bool_type); - emit(OR(temp, op[0], op[1])); - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_logic_and: - temp = dst_reg(this, glsl_type::bool_type); - emit(AND(temp, op[0], op[1])); - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_unop_f2b: - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_unop_i2b: - emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - return; - - case ir_binop_greater: - case ir_binop_gequal: - case ir_binop_less: - case ir_binop_lequal: - case ir_binop_equal: - case ir_binop_nequal: - emit(IF(op[0], op[1], - brw_conditional_for_comparison(expr->operation))); - return; - - case ir_binop_all_equal: - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); - emit(IF(BRW_PREDICATE_ALIGN16_ALL4H)); - return; - - case ir_binop_any_nequal: - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); - emit(IF(BRW_PREDICATE_ALIGN16_ANY4H)); - return; - - case ir_unop_any: - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - emit(IF(BRW_PREDICATE_ALIGN16_ANY4H)); - return; - - case ir_triop_csel: { - /* Expand the boolean condition into the flag register. */ - vec4_instruction *inst = emit(MOV(dst_null_d(), op[0])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* Select which boolean to return. 
*/ - dst_reg temp(this, expr->operands[1]->type); - inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]); - inst->predicate = BRW_PREDICATE_NORMAL; - - emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ)); - return; - } - - default: - unreachable("not reached"); - } - return; - } - - ir->condition->accept(this); - - emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ)); -} - void vec4_visitor::visit(ir_variable *ir) { @@ -995,1803 +316,31 @@ vec4_visitor::visit(ir_variable *ir) if (variable_storage(ir)) return; - switch (ir->data.mode) { - case ir_var_shader_in: - reg = new(mem_ctx) dst_reg(ATTR, ir->data.location); - break; + if (ir->data.mode == ir_var_shader_in) { + reg = new(mem_ctx) dst_reg(resize(dst_reg(ATTR, ir->data.location), + type_vector_size(ir->type))); - case ir_var_shader_out: - reg = new(mem_ctx) dst_reg(this, ir->type); + } else if (ir->data.mode == ir_var_shader_out) { + reg = new(mem_ctx) dst_reg(temporary_reg(ir->type)); for (int i = 0; i < type_size(ir->type); i++) { - output_reg[ir->data.location + i] = *reg; - output_reg[ir->data.location + i].reg_offset = i; - output_reg[ir->data.location + i].type = + output_reg[ir->data.location + i] = *reg; + output_reg[ir->data.location + i].reg_offset = i; + output_reg[ir->data.location + i].type = brw_type_for_base_type(ir->type->get_scalar_type()); - output_reg_annotation[ir->data.location + i] = ir->name; - } - break; - - case ir_var_auto: - case ir_var_temporary: - reg = new(mem_ctx) dst_reg(this, ir->type); - break; - - case ir_var_uniform: - reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms); - - /* Thanks to the lower_ubo_reference pass, we will see only - * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO - * variables, so no need for them to be in variable_ht. - * - * Some uniforms, such as samplers and atomic counters, have no actual - * storage, so we should ignore them. - */ - if (ir->is_in_uniform_block() || type_size(ir->type) == 0) - return; - - /* Track how big the whole uniform variable is, in case we need to put a - * copy of its data into pull constants for array access. - */ - assert(this->uniforms < uniform_array_size); - this->uniform_size[this->uniforms] = type_size(ir->type); - - if (!strncmp(ir->name, "gl_", 3)) { - setup_builtin_uniform_values(ir); - } else { - setup_uniform_values(ir); + output_reg_annotation[ir->data.location + i] = ir->name; } - break; - case ir_var_system_value: + } else if (ir->data.mode == ir_var_system_value) { reg = make_reg_for_system_value(ir); - break; - - default: - unreachable("not reached"); - } - - reg->type = brw_type_for_base_type(ir->type); - hash_table_insert(this->variable_ht, reg, ir); -} - -void -vec4_visitor::visit(ir_loop *ir) -{ - /* We don't want debugging output to print the whole body of the - * loop as the annotation. - */ - this->base_ir = NULL; - - emit(BRW_OPCODE_DO); - - visit_instructions(&ir->body_instructions); - - emit(BRW_OPCODE_WHILE); -} - -void -vec4_visitor::visit(ir_loop_jump *ir) -{ - switch (ir->mode) { - case ir_loop_jump::jump_break: - emit(BRW_OPCODE_BREAK); - break; - case ir_loop_jump::jump_continue: - emit(BRW_OPCODE_CONTINUE); - break; - } -} - - -void -vec4_visitor::visit(ir_function_signature *) -{ - unreachable("not reached"); -} - -void -vec4_visitor::visit(ir_function *ir) -{ - /* Ignore function bodies other than main() -- we shouldn't see calls to - * them since they should all be inlined. 
- */ - if (strcmp(ir->name, "main") == 0) { - const ir_function_signature *sig; - exec_list empty; - - sig = ir->matching_signature(NULL, &empty, false); - - assert(sig); - - visit_instructions(&sig->body); - } -} - -bool -vec4_visitor::try_emit_mad(ir_expression *ir) -{ - /* 3-src instructions were introduced in gen6. */ - if (brw->gen < 6) - return false; - - /* MAD can only handle floating-point data. */ - if (ir->type->base_type != GLSL_TYPE_FLOAT) - return false; - - ir_rvalue *nonmul = ir->operands[1]; - ir_expression *mul = ir->operands[0]->as_expression(); - - if (!mul || mul->operation != ir_binop_mul) { - nonmul = ir->operands[0]; - mul = ir->operands[1]->as_expression(); - - if (!mul || mul->operation != ir_binop_mul) - return false; - } - - nonmul->accept(this); - src_reg src0 = fix_3src_operand(this->result); - - mul->operands[0]->accept(this); - src_reg src1 = fix_3src_operand(this->result); - - mul->operands[1]->accept(this); - src_reg src2 = fix_3src_operand(this->result); - - this->result = src_reg(this, ir->type); - emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2); - - return true; -} - -bool -vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir) -{ - /* This optimization relies on CMP setting the destination to 0 when - * false. Early hardware only sets the least significant bit, and - * leaves the other bits undefined. So we can't use it. - */ - if (brw->gen < 6) - return false; - - ir_expression *const cmp = ir->operands[0]->as_expression(); - - if (cmp == NULL) - return false; - - switch (cmp->operation) { - case ir_binop_less: - case ir_binop_greater: - case ir_binop_lequal: - case ir_binop_gequal: - case ir_binop_equal: - case ir_binop_nequal: - break; - - default: - return false; - } - - cmp->operands[0]->accept(this); - const src_reg cmp_src0 = this->result; - - cmp->operands[1]->accept(this); - const src_reg cmp_src1 = this->result; - - this->result = src_reg(this, ir->type); - - emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1, - brw_conditional_for_comparison(cmp->operation))); - - /* If the comparison is false, this->result will just happen to be zero. - */ - vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result), - this->result, src_reg(1.0f)); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->predicate_inverse = true; - - return true; -} - -void -vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, - src_reg src0, src_reg src1) -{ - vec4_instruction *inst; - - if (brw->gen >= 6) { - inst = emit(BRW_OPCODE_SEL, dst, src0, src1); - inst->conditional_mod = conditionalmod; - } else { - emit(CMP(dst, src0, src1, conditionalmod)); - - inst = emit(BRW_OPCODE_SEL, dst, src0, src1); - inst->predicate = BRW_PREDICATE_NORMAL; - } -} - -void -vec4_visitor::emit_lrp(const dst_reg &dst, - const src_reg &x, const src_reg &y, const src_reg &a) -{ - if (brw->gen >= 6) { - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. - */ - emit(LRP(dst, - fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x))); - } else { - /* Earlier generations don't support three source operations, so we - * need to emit x*(1-a) + y*a. 
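try_emit_b2f_of_compare() above leans on gen6+ CMP writing all-zero bits to the destination when the comparison is false; a scalar model of the value that results (illustration only):

   #include <cstdint>
   #include <cstring>

   static float b2f_compare_model(float a, float b)
   {
      const bool cmp = (a < b);          // the folded comparison
      uint32_t bits = cmp ? 1u : 0u;     // CMP: guaranteed all-zero on false
      float out;
      std::memcpy(&out, &bits, sizeof(out));   // exactly 0.0f when false
      if (cmp)
         out = 1.0f;                     // SEL with predicate_inverse = true
      return out;
   }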
- */ - dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type); - dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type); - dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type); - y_times_a.writemask = dst.writemask; - one_minus_a.writemask = dst.writemask; - x_times_one_minus_a.writemask = dst.writemask; - - emit(MUL(y_times_a, y, a)); - emit(ADD(one_minus_a, negate(a), src_reg(1.0f))); - emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a))); - emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); - } -} - -void -vec4_visitor::visit(ir_expression *ir) -{ - unsigned int operand; - src_reg op[Elements(ir->operands)]; - src_reg result_src; - dst_reg result_dst; - vec4_instruction *inst; - - if (ir->operation == ir_binop_add) { - if (try_emit_mad(ir)) - return; - } - - if (ir->operation == ir_unop_b2f) { - if (try_emit_b2f_of_compare(ir)) - return; - } - - for (operand = 0; operand < ir->get_num_operands(); operand++) { - this->result.file = BAD_FILE; - ir->operands[operand]->accept(this); - if (this->result.file == BAD_FILE) { - fprintf(stderr, "Failed to get tree for expression operand:\n"); - ir->operands[operand]->fprint(stderr); - exit(1); - } - op[operand] = this->result; - - /* Matrix expression operands should have been broken down to vector - * operations already. - */ - assert(!ir->operands[operand]->type->is_matrix()); - } - - int vector_elements = ir->operands[0]->type->vector_elements; - if (ir->operands[1]) { - vector_elements = MAX2(vector_elements, - ir->operands[1]->type->vector_elements); - } - - this->result.file = BAD_FILE; - - /* Storage for our result. Ideally for an assignment we'd be using - * the actual storage for the result here, instead. - */ - result_src = src_reg(this, ir->type); - /* convenience for the emit functions below. */ - result_dst = dst_reg(result_src); - /* If nothing special happens, this is the result. */ - this->result = result_src; - /* Limit writes to the channels that will be used by result_src later. - * This does limit this temp's use as a temporary for multi-instruction - * sequences. - */ - result_dst.writemask = (1 << ir->type->vector_elements) - 1; - - switch (ir->operation) { - case ir_unop_logic_not: - if (ctx->Const.UniformBooleanTrue != 1) { - emit(NOT(result_dst, op[0])); - } else { - emit(XOR(result_dst, op[0], src_reg(1))); - } - break; - case ir_unop_neg: - op[0].negate = !op[0].negate; - emit(MOV(result_dst, op[0])); - break; - case ir_unop_abs: - op[0].abs = true; - op[0].negate = false; - emit(MOV(result_dst, op[0])); - break; - - case ir_unop_sign: - if (ir->type->is_float()) { - /* AND(val, 0x80000000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not - * zero. - */ - emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); - - op[0].type = BRW_REGISTER_TYPE_UD; - result_dst.type = BRW_REGISTER_TYPE_UD; - emit(AND(result_dst, op[0], src_reg(0x80000000u))); - - inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u))); - inst->predicate = BRW_PREDICATE_NORMAL; - - this->result.type = BRW_REGISTER_TYPE_F; - } else { - /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). - * -> non-negative val generates 0x00000000. - * Predicated OR sets 1 if val is positive. 
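The floating-point sign() lowering above is pure bit manipulation; as a scalar model (illustration only):

   #include <cstdint>
   #include <cstring>

   static float sign_model(float x)
   {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));
      uint32_t r = bits & 0x80000000u;   // AND: keep only the sign bit
      if (x != 0.0f)                     // predicate from CMP ... NZ
         r |= 0x3f800000u;               // OR in the bits of 1.0f
      float out;
      std::memcpy(&out, &r, sizeof(out));
      return out;                        // +1.0f, -1.0f, or (+/-)0.0f
   }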
- */ - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G)); - - emit(ASR(result_dst, op[0], src_reg(31))); - - inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1))); - inst->predicate = BRW_PREDICATE_NORMAL; - } - break; - - case ir_unop_rcp: - emit_math(SHADER_OPCODE_RCP, result_dst, op[0]); - break; - - case ir_unop_exp2: - emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]); - break; - case ir_unop_log2: - emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]); - break; - case ir_unop_exp: - case ir_unop_log: - unreachable("not reached: should be handled by ir_explog_to_explog2"); - case ir_unop_sin: - case ir_unop_sin_reduced: - emit_math(SHADER_OPCODE_SIN, result_dst, op[0]); - break; - case ir_unop_cos: - case ir_unop_cos_reduced: - emit_math(SHADER_OPCODE_COS, result_dst, op[0]); - break; - - case ir_unop_dFdx: - case ir_unop_dFdx_coarse: - case ir_unop_dFdx_fine: - case ir_unop_dFdy: - case ir_unop_dFdy_coarse: - case ir_unop_dFdy_fine: - unreachable("derivatives not valid in vertex shader"); - - case ir_unop_bitfield_reverse: - emit(BFREV(result_dst, op[0])); - break; - case ir_unop_bit_count: - emit(CBIT(result_dst, op[0])); - break; - case ir_unop_find_msb: { - src_reg temp = src_reg(this, glsl_type::uint_type); - - inst = emit(FBH(dst_reg(temp), op[0])); - inst->dst.writemask = WRITEMASK_XYZW; - - /* FBH counts from the MSB side, while GLSL's findMSB() wants the count - * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then - * subtract the result from 31 to convert the MSB count into an LSB count. - */ - - /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */ - temp.swizzle = BRW_SWIZZLE_NOOP; - emit(MOV(result_dst, temp)); - - src_reg src_tmp = src_reg(result_dst); - emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ)); - - src_tmp.negate = true; - inst = emit(ADD(result_dst, src_tmp, src_reg(31))); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - } - case ir_unop_find_lsb: - emit(FBL(result_dst, op[0])); - break; - case ir_unop_saturate: - inst = emit(MOV(result_dst, op[0])); - inst->saturate = true; - break; - - case ir_unop_noise: - unreachable("not reached: should be handled by lower_noise"); - - case ir_binop_add: - emit(ADD(result_dst, op[0], op[1])); - break; - case ir_binop_sub: - unreachable("not reached: should be handled by ir_sub_to_add_neg"); - - case ir_binop_mul: - if (brw->gen < 8 && ir->type->is_integer()) { - /* For integer multiplication, the MUL uses the low 16 bits of one of - * the operands (src0 through SNB, src1 on IVB and later). The MACH - * accumulates in the contribution of the upper 16 bits of that - * operand. If we can determine that one of the args is in the low - * 16 bits, though, we can just emit a single MUL. 
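The FBH fix-up in the ir_unop_find_msb case above converts an MSB-side count into the LSB-side count GLSL wants, while skipping the error value; as a scalar model:

   #include <cstdint>

   static int find_msb_model(uint32_t v)
   {
      int fbh = -1;                      // FBH's 0xFFFFFFFF error, read as D
      for (int i = 31; i >= 0; i--) {
         if (v & (1u << i)) {
            fbh = 31 - i;                // FBH counts from the MSB side
            break;
         }
      }
      // The predicated ADD fires only when FBH did not return the error
      // value, leaving -1 intact for an all-zero input.
      return (fbh == -1) ? -1 : 31 - fbh;
   }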
- */ - if (ir->operands[0]->is_uint16_constant()) { - if (brw->gen < 7) - emit(MUL(result_dst, op[0], op[1])); - else - emit(MUL(result_dst, op[1], op[0])); - } else if (ir->operands[1]->is_uint16_constant()) { - if (brw->gen < 7) - emit(MUL(result_dst, op[1], op[0])); - else - emit(MUL(result_dst, op[0], op[1])); - } else { - struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(dst_null_d(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); - } - } else { - emit(MUL(result_dst, op[0], op[1])); - } - break; - case ir_binop_imul_high: { - struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(result_dst, op[0], op[1])); - break; - } - case ir_binop_div: - /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */ - assert(ir->type->is_integer()); - emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]); - break; - case ir_binop_carry: { - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); - - emit(ADDC(dst_null_ud(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); - break; - } - case ir_binop_borrow: { - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); - - emit(SUBB(dst_null_ud(), op[0], op[1])); - emit(MOV(result_dst, src_reg(acc))); - break; - } - case ir_binop_mod: - /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */ - assert(ir->type->is_integer()); - emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]); - break; - - case ir_binop_less: - case ir_binop_greater: - case ir_binop_lequal: - case ir_binop_gequal: - case ir_binop_equal: - case ir_binop_nequal: { - emit(CMP(result_dst, op[0], op[1], - brw_conditional_for_comparison(ir->operation))); - if (ctx->Const.UniformBooleanTrue == 1) { - emit(AND(result_dst, result_src, src_reg(1))); - } - break; - } - - case ir_binop_all_equal: - /* "==" operator producing a scalar boolean. */ - if (ir->operands[0]->type->is_vector() || - ir->operands[1]->type->is_vector()) { - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z)); - emit(MOV(result_dst, src_reg(0))); - inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue))); - inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; - } else { - emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z)); - if (ctx->Const.UniformBooleanTrue == 1) { - emit(AND(result_dst, result_src, src_reg(1))); - } - } - break; - case ir_binop_any_nequal: - /* "!=" operator producing a scalar boolean. 
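The integer-multiply lowering above relies on MUL seeing only the low 16 bits of one operand, with MACH accumulating the contribution of the high 16 bits; the arithmetic identity behind the MUL/MACH/MOV-from-accumulator sequence, as scalar C++:

   #include <cstdint>

   static uint32_t imul_model(uint32_t a, uint32_t b)
   {
      uint32_t low_part  = a * (b & 0xffffu);       // what MUL computes
      uint32_t high_part = (a * (b >> 16)) << 16;   // what MACH accumulates
      // The accumulator ends up holding the full 32-bit product; when b
      // is a uint16 constant the high part is zero, which is why a
      // single MUL suffices in that case.
      return low_part + high_part;                  // == a * b (mod 2^32)
   }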
*/ - if (ir->operands[0]->type->is_vector() || - ir->operands[1]->type->is_vector()) { - emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ)); - - emit(MOV(result_dst, src_reg(0))); - inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue))); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; - } else { - emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ)); - if (ctx->Const.UniformBooleanTrue == 1) { - emit(AND(result_dst, result_src, src_reg(1))); - } - } - break; - - case ir_unop_any: - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - emit(MOV(result_dst, src_reg(0))); - - inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue))); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - - case ir_binop_logic_xor: - emit(XOR(result_dst, op[0], op[1])); - break; - - case ir_binop_logic_or: - emit(OR(result_dst, op[0], op[1])); - break; - - case ir_binop_logic_and: - emit(AND(result_dst, op[0], op[1])); - break; - - case ir_binop_dot: - assert(ir->operands[0]->type->is_vector()); - assert(ir->operands[0]->type == ir->operands[1]->type); - emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements); - break; - - case ir_unop_sqrt: - emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]); - break; - case ir_unop_rsq: - emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]); - break; - - case ir_unop_bitcast_i2f: - case ir_unop_bitcast_u2f: - this->result = op[0]; - this->result.type = BRW_REGISTER_TYPE_F; - break; - - case ir_unop_bitcast_f2i: - this->result = op[0]; - this->result.type = BRW_REGISTER_TYPE_D; - break; - - case ir_unop_bitcast_f2u: - this->result = op[0]; - this->result.type = BRW_REGISTER_TYPE_UD; - break; - - case ir_unop_i2f: - case ir_unop_i2u: - case ir_unop_u2i: - case ir_unop_u2f: - case ir_unop_f2i: - case ir_unop_f2u: - emit(MOV(result_dst, op[0])); - break; - case ir_unop_b2i: - if (ctx->Const.UniformBooleanTrue != 1) { - emit(AND(result_dst, op[0], src_reg(1))); - } else { - emit(MOV(result_dst, op[0])); - } - break; - case ir_unop_b2f: - if (ctx->Const.UniformBooleanTrue != 1) { - op[0].type = BRW_REGISTER_TYPE_UD; - result_dst.type = BRW_REGISTER_TYPE_UD; - emit(AND(result_dst, op[0], src_reg(0x3f800000u))); - result_dst.type = BRW_REGISTER_TYPE_F; - } else { - emit(MOV(result_dst, op[0])); - } - break; - case ir_unop_f2b: - case ir_unop_i2b: - emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ)); - if (ctx->Const.UniformBooleanTrue == 1) { - emit(AND(result_dst, result_src, src_reg(1))); - } - break; - - case ir_unop_trunc: - emit(RNDZ(result_dst, op[0])); - break; - case ir_unop_ceil: - op[0].negate = !op[0].negate; - inst = emit(RNDD(result_dst, op[0])); - this->result.negate = true; - break; - case ir_unop_floor: - inst = emit(RNDD(result_dst, op[0])); - break; - case ir_unop_fract: - inst = emit(FRC(result_dst, op[0])); - break; - case ir_unop_round_even: - emit(RNDE(result_dst, op[0])); - break; - - case ir_binop_min: - emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]); - break; - case ir_binop_max: - emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]); - break; - - case ir_binop_pow: - emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]); - break; - - case ir_unop_bit_not: - inst = emit(NOT(result_dst, op[0])); - break; - case ir_binop_bit_and: - inst = emit(AND(result_dst, op[0], op[1])); - break; - case ir_binop_bit_xor: - inst = emit(XOR(result_dst, op[0], op[1])); - break; - case ir_binop_bit_or: - inst = emit(OR(result_dst, op[0], op[1])); - break; - - case ir_binop_lshift: - 
inst = emit(SHL(result_dst, op[0], op[1])); - break; - - case ir_binop_rshift: - if (ir->type->base_type == GLSL_TYPE_INT) - inst = emit(ASR(result_dst, op[0], op[1])); - else - inst = emit(SHR(result_dst, op[0], op[1])); - break; - - case ir_binop_bfm: - emit(BFI1(result_dst, op[0], op[1])); - break; - - case ir_binop_ubo_load: { - ir_constant *const_uniform_block = ir->operands[0]->as_constant(); - ir_constant *const_offset_ir = ir->operands[1]->as_constant(); - unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0; - src_reg offset; - - /* Now, load the vector from that offset. */ - assert(ir->type->is_vector() || ir->type->is_scalar()); - - src_reg packed_consts = src_reg(this, glsl_type::vec4_type); - packed_consts.type = result.type; - src_reg surf_index; - - if (const_uniform_block) { - /* The block index is a constant, so just emit the binding table entry - * as an immediate. - */ - surf_index = src_reg(prog_data->base.binding_table.ubo_start + - const_uniform_block->value.u[0]); - } else { - /* The block index is not a constant. Evaluate the index expression - * per-channel and add the base UBO index; the generator will select - * a value from any live channel. - */ - surf_index = src_reg(this, glsl_type::uint_type); - emit(ADD(dst_reg(surf_index), op[0], - src_reg(prog_data->base.binding_table.ubo_start))); - - /* Assume this may touch any UBO. It would be nice to provide - * a tighter bound, but the array information is already lowered away. - */ - brw_mark_surface_used(&prog_data->base, - prog_data->base.binding_table.ubo_start + - shader_prog->NumUniformBlocks - 1); - } - - if (const_offset_ir) { - if (brw->gen >= 8) { - /* Store the offset in a GRF so we can send-from-GRF. */ - offset = src_reg(this, glsl_type::int_type); - emit(MOV(dst_reg(offset), src_reg(const_offset / 16))); - } else { - /* Immediates are fine on older generations since they'll be moved - * to a (potentially fake) MRF at the generator level. - */ - offset = src_reg(const_offset / 16); - } - } else { - offset = src_reg(this, glsl_type::uint_type); - emit(SHR(dst_reg(offset), op[1], src_reg(4))); - } - - if (brw->gen >= 7) { - dst_reg grf_offset = dst_reg(this, glsl_type::int_type); - grf_offset.type = offset.type; - - emit(MOV(grf_offset, offset)); - - emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, - dst_reg(packed_consts), - surf_index, - src_reg(grf_offset))); - } else { - vec4_instruction *pull = - emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, - dst_reg(packed_consts), - surf_index, - offset)); - pull->base_mrf = 14; - pull->mlen = 1; - } - - packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements); - packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4, - const_offset % 16 / 4, - const_offset % 16 / 4, - const_offset % 16 / 4); - - /* UBO bools are any nonzero int. We need to convert them to use the - * value of true stored in ctx->Const.UniformBooleanTrue. 
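The swizzle arithmetic at the end of the ir_binop_ubo_load case selects a component within the aligned vec4 the pull load fetched; as a helper-style model (hypothetical function, for illustration):

   static unsigned ubo_start_component(unsigned off_bytes)
   {
      // The load fetches a 16-byte-aligned vec4, so a value at byte
      // offset off_bytes starts at this component of the fetched
      // register, which the replicated swizzle then steers to every
      // channel:
      return (off_bytes % 16) / 4;   // 0 -> .x, 4 -> .y, 8 -> .z, 12 -> .w
   }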
- */ - if (ir->type->base_type == GLSL_TYPE_BOOL) { - emit(CMP(result_dst, packed_consts, src_reg(0u), - BRW_CONDITIONAL_NZ)); - if (ctx->Const.UniformBooleanTrue == 1) { - emit(AND(result_dst, result, src_reg(1))); - } - } else { - emit(MOV(result_dst, packed_consts)); - } - break; - } - - case ir_binop_vector_extract: - unreachable("should have been lowered by vec_index_to_cond_assign"); - - case ir_triop_fma: - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. - */ - emit(MAD(result_dst, op[2], op[1], op[0])); - break; - - case ir_triop_lrp: - emit_lrp(result_dst, op[0], op[1], op[2]); - break; - - case ir_triop_csel: - emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); - inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - - case ir_triop_bfi: - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - emit(BFI2(result_dst, op[0], op[1], op[2])); - break; - - case ir_triop_bitfield_extract: - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - /* Note that the instruction's argument order is reversed from GLSL - * and the IR. - */ - emit(BFE(result_dst, op[2], op[1], op[0])); - break; - - case ir_triop_vector_insert: - unreachable("should have been lowered by lower_vector_insert"); - - case ir_quadop_bitfield_insert: - unreachable("not reached: should be handled by " - "bitfield_insert_to_bfm_bfi\n"); - - case ir_quadop_vector: - unreachable("not reached: should be handled by lower_quadop_vector"); - - case ir_unop_pack_half_2x16: - emit_pack_half_2x16(result_dst, op[0]); - break; - case ir_unop_unpack_half_2x16: - emit_unpack_half_2x16(result_dst, op[0]); - break; - case ir_unop_pack_snorm_2x16: - case ir_unop_pack_snorm_4x8: - case ir_unop_pack_unorm_2x16: - case ir_unop_pack_unorm_4x8: - case ir_unop_unpack_snorm_2x16: - case ir_unop_unpack_snorm_4x8: - case ir_unop_unpack_unorm_2x16: - case ir_unop_unpack_unorm_4x8: - unreachable("not reached: should be handled by lower_packing_builtins"); - case ir_unop_unpack_half_2x16_split_x: - case ir_unop_unpack_half_2x16_split_y: - case ir_binop_pack_half_2x16_split: - case ir_unop_interpolate_at_centroid: - case ir_binop_interpolate_at_sample: - case ir_binop_interpolate_at_offset: - unreachable("not reached: should not occur in vertex shader"); - case ir_binop_ldexp: - unreachable("not reached: should be handled by ldexp_to_arith()"); - } -} - - -void -vec4_visitor::visit(ir_swizzle *ir) -{ - src_reg src; - int i = 0; - int swizzle[4]; - - /* Note that this is only swizzles in expressions, not those on the left - * hand side of an assignment, which do write masking. See ir_assignment - * for that. - */ - - ir->val->accept(this); - src = this->result; - assert(src.file != BAD_FILE); - - for (i = 0; i < ir->type->vector_elements; i++) { - switch (i) { - case 0: - swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x); - break; - case 1: - swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y); - break; - case 2: - swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z); - break; - case 3: - swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w); - break; - } - } - for (; i < 4; i++) { - /* Replicate the last channel out. 
*/ - swizzle[i] = swizzle[ir->type->vector_elements - 1]; - } - - src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - - this->result = src; -} - -void -vec4_visitor::visit(ir_dereference_variable *ir) -{ - const struct glsl_type *type = ir->type; - dst_reg *reg = variable_storage(ir->var); - - if (!reg) { - fail("Failed to find variable storage for %s\n", ir->var->name); - this->result = src_reg(brw_null_reg()); - return; - } - - this->result = src_reg(*reg); - - /* System values get their swizzle from the dst_reg writemask */ - if (ir->var->data.mode == ir_var_system_value) - return; - - if (type->is_scalar() || type->is_vector() || type->is_matrix()) - this->result.swizzle = swizzle_for_size(type->vector_elements); -} - - -int -vec4_visitor::compute_array_stride(ir_dereference_array *ir) -{ - /* Under normal circumstances array elements are stored consecutively, so - * the stride is equal to the size of the array element. - */ - return type_size(ir->type); -} - - -void -vec4_visitor::visit(ir_dereference_array *ir) -{ - ir_constant *constant_index; - src_reg src; - int array_stride = compute_array_stride(ir); - - constant_index = ir->array_index->constant_expression_value(); - - ir->array->accept(this); - src = this->result; - - if (constant_index) { - src.reg_offset += constant_index->value.i[0] * array_stride; - } else { - /* Variable index array dereference. It eats the "vec4" of the - * base of the array and an index that offsets the Mesa register - * index. - */ - ir->array_index->accept(this); - - src_reg index_reg; - - if (array_stride == 1) { - index_reg = this->result; - } else { - index_reg = src_reg(this, glsl_type::int_type); - - emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride))); - } - - if (src.reladdr) { - src_reg temp = src_reg(this, glsl_type::int_type); - - emit(ADD(dst_reg(temp), *src.reladdr, index_reg)); - - index_reg = temp; - } - - src.reladdr = ralloc(mem_ctx, src_reg); - memcpy(src.reladdr, &index_reg, sizeof(index_reg)); - } - - /* If the type is smaller than a vec4, replicate the last channel out. */ - if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix()) - src.swizzle = swizzle_for_size(ir->type->vector_elements); - else - src.swizzle = BRW_SWIZZLE_NOOP; - src.type = brw_type_for_base_type(ir->type); - - this->result = src; -} - -void -vec4_visitor::visit(ir_dereference_record *ir) -{ - unsigned int i; - const glsl_type *struct_type = ir->record->type; - int offset = 0; - - ir->record->accept(this); - - for (i = 0; i < struct_type->length; i++) { - if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0) - break; - offset += type_size(struct_type->fields.structure[i].type); - } - - /* If the type is smaller than a vec4, replicate the last channel out. */ - if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix()) - this->result.swizzle = swizzle_for_size(ir->type->vector_elements); - else - this->result.swizzle = BRW_SWIZZLE_NOOP; - this->result.type = brw_type_for_base_type(ir->type); - - this->result.reg_offset += offset; -} - -/** - * We want to be careful in assignment setup to hit the actual storage - * instead of potentially using a temporary like we might with the - * ir_dereference handler. - */ -static dst_reg -get_assignment_lhs(ir_dereference *ir, vec4_visitor *v) -{ - /* The LHS must be a dereference. 
If the LHS is a variable indexed array - * access of a vector, it must be separated into a series of conditional - * moves before reaching this point (see ir_vec_index_to_cond_assign). - */ - assert(ir->as_dereference()); - ir_dereference_array *deref_array = ir->as_dereference_array(); - if (deref_array) { - assert(!deref_array->array->type->is_vector()); - } - - /* Use the rvalue deref handler for the most part. We'll ignore - * swizzles in it and write swizzles using writemask, though. - */ - ir->accept(v); - return dst_reg(v->result); -} - -void -vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src, - const struct glsl_type *type, - enum brw_predicate predicate) -{ - if (type->base_type == GLSL_TYPE_STRUCT) { - for (unsigned int i = 0; i < type->length; i++) { - emit_block_move(dst, src, type->fields.structure[i].type, predicate); - } - return; - } - - if (type->is_array()) { - for (unsigned int i = 0; i < type->length; i++) { - emit_block_move(dst, src, type->fields.array, predicate); - } - return; - } - - if (type->is_matrix()) { - const struct glsl_type *vec_type; - - vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, - type->vector_elements, 1); - - for (int i = 0; i < type->matrix_columns; i++) { - emit_block_move(dst, src, vec_type, predicate); - } - return; - } - - assert(type->is_scalar() || type->is_vector()); - - dst->type = brw_type_for_base_type(type); - src->type = dst->type; - - dst->writemask = (1 << type->vector_elements) - 1; - - src->swizzle = swizzle_for_size(type->vector_elements); - - vec4_instruction *inst = emit(MOV(*dst, *src)); - inst->predicate = predicate; - - dst->reg_offset++; - src->reg_offset++; -} - - -/* If the RHS processing resulted in an instruction generating a - * temporary value, and it would be easy to rewrite the instruction to - * generate its result right into the LHS instead, do so. This ends - * up reliably removing instructions where it can be tricky to do so - * later without real UD chain information. - */ -bool -vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir, - dst_reg dst, - src_reg src, - vec4_instruction *pre_rhs_inst, - vec4_instruction *last_rhs_inst) -{ - /* This could be supported, but it would take more smarts. */ - if (ir->condition) - return false; - - if (pre_rhs_inst == last_rhs_inst) - return false; /* No instructions generated to work with. */ - - /* Make sure the last instruction generated our source reg. */ - if (src.file != GRF || - src.file != last_rhs_inst->dst.file || - src.reg != last_rhs_inst->dst.reg || - src.reg_offset != last_rhs_inst->dst.reg_offset || - src.reladdr || - src.abs || - src.negate || - last_rhs_inst->predicate != BRW_PREDICATE_NONE) - return false; - - /* Check that that last instruction fully initialized the channels - * we want to use, in the order we want to use them. We could - * potentially reswizzle the operands of many instructions so that - * we could handle out of order channels, but don't yet. - */ - - for (unsigned i = 0; i < 4; i++) { - if (dst.writemask & (1 << i)) { - if (!(last_rhs_inst->dst.writemask & (1 << i))) - return false; - - if (BRW_GET_SWZ(src.swizzle, i) != i) - return false; - } - } - - /* Success! Rewrite the instruction.
*/ - last_rhs_inst->dst.file = dst.file; - last_rhs_inst->dst.reg = dst.reg; - last_rhs_inst->dst.reg_offset = dst.reg_offset; - last_rhs_inst->dst.reladdr = dst.reladdr; - last_rhs_inst->dst.writemask &= dst.writemask; - - return true; -} - -void -vec4_visitor::visit(ir_assignment *ir) -{ - dst_reg dst = get_assignment_lhs(ir->lhs, this); - enum brw_predicate predicate = BRW_PREDICATE_NONE; - - if (!ir->lhs->type->is_scalar() && - !ir->lhs->type->is_vector()) { - ir->rhs->accept(this); - src_reg src = this->result; - - if (ir->condition) { - emit_bool_to_cond_code(ir->condition, &predicate); - } - - /* emit_block_move doesn't account for swizzles in the source register. - * This should be ok, since the source register is a structure or an - * array, and those can't be swizzled. But double-check to be sure. - */ - assert(src.swizzle == - (ir->rhs->type->is_matrix() - ? swizzle_for_size(ir->rhs->type->vector_elements) - : BRW_SWIZZLE_NOOP)); - - emit_block_move(&dst, &src, ir->rhs->type, predicate); - return; - } - - /* Now we're down to just a scalar/vector with writemasks. */ - int i; - - vec4_instruction *pre_rhs_inst, *last_rhs_inst; - pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail(); - - ir->rhs->accept(this); - - last_rhs_inst = (vec4_instruction *)this->instructions.get_tail(); - - src_reg src = this->result; - - int swizzles[4]; - int first_enabled_chan = 0; - int src_chan = 0; - - assert(ir->lhs->type->is_vector() || - ir->lhs->type->is_scalar()); - dst.writemask = ir->write_mask; - - for (int i = 0; i < 4; i++) { - if (dst.writemask & (1 << i)) { - first_enabled_chan = BRW_GET_SWZ(src.swizzle, i); - break; - } - } - - /* Swizzle a small RHS vector into the channels being written. - * - * glsl ir treats write_mask as dictating how many channels are - * present on the RHS while in our instructions we need to make - * those channels appear in the slots of the vec4 they're written to. 
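The rewrite above turns "add tmp, a, b; mov dst.xy, tmp.xy" into "add dst.xy, a, b". Its channel check, restated as a stand-alone predicate (illustrative types only): every enabled destination channel must have been written by the RHS instruction, in natural order.

   static bool channels_line_up(unsigned dst_writemask,
                                unsigned rhs_dst_writemask,
                                const int swizzle[4])
   {
      for (unsigned i = 0; i < 4; i++) {
         if (dst_writemask & (1u << i)) {
            if (!(rhs_dst_writemask & (1u << i)))
               return false;            // channel was never initialized
            if (swizzle[i] != (int)i)
               return false;            // out-of-order channel use
         }
      }
      return true;
   }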
- */ - for (int i = 0; i < 4; i++) { - if (dst.writemask & (1 << i)) - swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++); - else - swizzles[i] = first_enabled_chan; - } - src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1], - swizzles[2], swizzles[3]); - - if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) { - return; - } - - if (ir->condition) { - emit_bool_to_cond_code(ir->condition, &predicate); - } - - for (i = 0; i < type_size(ir->lhs->type); i++) { - vec4_instruction *inst = emit(MOV(dst, src)); - inst->predicate = predicate; - - dst.reg_offset++; - src.reg_offset++; - } -} - -void -vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir) -{ - if (ir->type->base_type == GLSL_TYPE_STRUCT) { - foreach_in_list(ir_constant, field_value, &ir->components) { - emit_constant_values(dst, field_value); - } - return; - } - - if (ir->type->is_array()) { - for (unsigned int i = 0; i < ir->type->length; i++) { - emit_constant_values(dst, ir->array_elements[i]); - } - return; - } - - if (ir->type->is_matrix()) { - for (int i = 0; i < ir->type->matrix_columns; i++) { - float *vec = &ir->value.f[i * ir->type->vector_elements]; - - for (int j = 0; j < ir->type->vector_elements; j++) { - dst->writemask = 1 << j; - dst->type = BRW_REGISTER_TYPE_F; - - emit(MOV(*dst, src_reg(vec[j]))); - } - dst->reg_offset++; - } - return; - } - - int remaining_writemask = (1 << ir->type->vector_elements) - 1; - - for (int i = 0; i < ir->type->vector_elements; i++) { - if (!(remaining_writemask & (1 << i))) - continue; - - dst->writemask = 1 << i; - dst->type = brw_type_for_base_type(ir->type); - - /* Find other components that match the one we're about to - * write. Emits fewer instructions for things like vec4(0.5, - * 1.5, 1.5, 1.5). - */ - for (int j = i + 1; j < ir->type->vector_elements; j++) { - if (ir->type->base_type == GLSL_TYPE_BOOL) { - if (ir->value.b[i] == ir->value.b[j]) - dst->writemask |= (1 << j); - } else { - /* u, i, and f storage all line up, so no need for a - * switch case for comparing each type. - */ - if (ir->value.u[i] == ir->value.u[j]) - dst->writemask |= (1 << j); - } - } - - switch (ir->type->base_type) { - case GLSL_TYPE_FLOAT: - emit(MOV(*dst, src_reg(ir->value.f[i]))); - break; - case GLSL_TYPE_INT: - emit(MOV(*dst, src_reg(ir->value.i[i]))); - break; - case GLSL_TYPE_UINT: - emit(MOV(*dst, src_reg(ir->value.u[i]))); - break; - case GLSL_TYPE_BOOL: - emit(MOV(*dst, - src_reg(ir->value.b[i] != 0 ? 
ctx->Const.UniformBooleanTrue - : 0))); - break; - default: - unreachable("Non-float/uint/int/bool constant"); - } - - remaining_writemask &= ~dst->writemask; - } - dst->reg_offset++; -} - -void -vec4_visitor::visit(ir_constant *ir) -{ - dst_reg dst = dst_reg(this, ir->type); - this->result = src_reg(dst); - - emit_constant_values(&dst, ir); -} - -void -vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir) -{ - ir_dereference *deref = static_cast<ir_dereference *>( - ir->actual_parameters.get_head()); - ir_variable *location = deref->variable_referenced(); - unsigned surf_index = (prog_data->base.binding_table.abo_start + - location->data.binding); - - /* Calculate the surface offset */ - src_reg offset(this, glsl_type::uint_type); - ir_dereference_array *deref_array = deref->as_dereference_array(); - if (deref_array) { - deref_array->array_index->accept(this); - - src_reg tmp(this, glsl_type::uint_type); - emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE)); - emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset)); - } else { - offset = location->data.atomic.offset; - } - - /* Emit the appropriate machine instruction */ - const char *callee = ir->callee->function_name(); - dst_reg dst = get_assignment_lhs(ir->return_deref, this); - - if (!strcmp("__intrinsic_atomic_read", callee)) { - emit_untyped_surface_read(surf_index, dst, offset); - - } else if (!strcmp("__intrinsic_atomic_increment", callee)) { - emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset, - src_reg(), src_reg()); - - } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) { - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset, - src_reg(), src_reg()); - } -} - -void -vec4_visitor::visit(ir_call *ir) -{ - const char *callee = ir->callee->function_name(); - - if (!strcmp("__intrinsic_atomic_read", callee) || - !strcmp("__intrinsic_atomic_increment", callee) || - !strcmp("__intrinsic_atomic_predecrement", callee)) { - visit_atomic_counter_intrinsic(ir); - } else { - unreachable("Unsupported intrinsic."); - } -} - -src_reg -vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler) -{ - vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS); - inst->base_mrf = 2; - inst->mlen = 1; - inst->dst = dst_reg(this, glsl_type::uvec4_type); - inst->dst.writemask = WRITEMASK_XYZW; - - inst->src[1] = sampler; - - /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */ - int param_base = inst->base_mrf; - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; - int zero_mask = 0xf & ~coord_mask; - - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), - coordinate)); - - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), - src_reg(0))); - - emit(inst); - return src_reg(inst->dst); -} - -static bool -is_high_sampler(struct brw_context *brw, src_reg sampler) -{ - if (brw->gen < 8 && !brw->is_haswell) - return false; - - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; -} - -void -vec4_visitor::visit(ir_texture *ir) -{ - uint32_t sampler = - _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog); - - ir_rvalue *nonconst_sampler_index = - _mesa_get_sampler_array_nonconst_index(ir->sampler); - - /* Handle non-constant sampler array indexing */ - src_reg sampler_reg; - if (nonconst_sampler_index) { - /* The highest sampler which may be used by this operation is - * the last element of the array. 
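The bound computed just below reduces to one line of arithmetic; a sketch with hypothetical parameter names, where table_start stands for whichever of texture_start or gather_texture_start applies:

   /* Highest surface a non-constant sampler[i] access may touch: the last
    * element of the array, offset by where the relevant bindings start. */
   static unsigned
   max_sampler_binding(unsigned base_sampler, unsigned array_size,
                       unsigned table_start)
   {
      return table_start + base_sampler + array_size - 1;
   }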
Mark it here, because the generator - * doesn't have enough information to determine the bound. - */ - uint32_t array_size = ir->sampler->as_dereference_array() - ->array->type->array_size(); - - uint32_t max_used = sampler + array_size - 1; - if (ir->op == ir_tg4 && brw->gen < 8) { - max_used += prog_data->base.binding_table.gather_texture_start; - } else { - max_used += prog_data->base.binding_table.texture_start; - } - - brw_mark_surface_used(&prog_data->base, max_used); - - /* Emit code to evaluate the actual indexing expression */ - nonconst_sampler_index->accept(this); - dst_reg temp(this, glsl_type::uint_type); - emit(ADD(temp, this->result, src_reg(sampler))) - ->force_writemask_all = true; - sampler_reg = src_reg(temp); - } else { - /* Single sampler, or constant array index; the indexing expression - * is just an immediate. - */ - sampler_reg = src_reg(sampler); - } - - /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother - * emitting anything other than setting up the constant result. - */ - if (ir->op == ir_tg4) { - ir_constant *chan = ir->lod_info.component->as_constant(); - int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]); - if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) { - dst_reg result(this, ir->type); - this->result = src_reg(result); - emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f))); - return; - } - } - - /* Should be lowered by do_lower_texture_projection */ - assert(!ir->projector); - - /* Should be lowered */ - assert(!ir->offset || !ir->offset->type->is_array()); - - /* Generate code to compute all the subexpression trees. This has to be - * done before loading any values into MRFs for the sampler message since - * generating these values may involve SEND messages that need the MRFs. 
- */ - src_reg coordinate; - if (ir->coordinate) { - ir->coordinate->accept(this); - coordinate = this->result; - } - - src_reg shadow_comparitor; - if (ir->shadow_comparitor) { - ir->shadow_comparitor->accept(this); - shadow_comparitor = this->result; - } - - bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant(); - src_reg offset_value; - if (has_nonconstant_offset) { - ir->offset->accept(this); - offset_value = src_reg(this->result); - } - - const glsl_type *lod_type = NULL, *sample_index_type = NULL; - src_reg lod, dPdx, dPdy, sample_index, mcs; - switch (ir->op) { - case ir_tex: - lod = src_reg(0.0f); - lod_type = glsl_type::float_type; - break; - case ir_txf: - case ir_txl: - case ir_txs: - ir->lod_info.lod->accept(this); - lod = this->result; - lod_type = ir->lod_info.lod->type; - break; - case ir_query_levels: - lod = src_reg(0); - lod_type = glsl_type::int_type; - break; - case ir_txf_ms: - ir->lod_info.sample_index->accept(this); - sample_index = this->result; - sample_index_type = ir->lod_info.sample_index->type; - - if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler)) - mcs = emit_mcs_fetch(ir, coordinate, sampler_reg); - else - mcs = src_reg(0u); - break; - case ir_txd: - ir->lod_info.grad.dPdx->accept(this); - dPdx = this->result; - - ir->lod_info.grad.dPdy->accept(this); - dPdy = this->result; - - lod_type = ir->lod_info.grad.dPdx->type; - break; - case ir_txb: - case ir_lod: - case ir_tg4: - break; - } - - enum opcode opcode; - switch (ir->op) { - case ir_tex: opcode = SHADER_OPCODE_TXL; break; - case ir_txl: opcode = SHADER_OPCODE_TXL; break; - case ir_txd: opcode = SHADER_OPCODE_TXD; break; - case ir_txf: opcode = SHADER_OPCODE_TXF; break; - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; - case ir_txs: opcode = SHADER_OPCODE_TXS; break; - case ir_tg4: opcode = has_nonconstant_offset - ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; - case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; - case ir_txb: - unreachable("TXB is not valid for vertex shaders."); - case ir_lod: - unreachable("LOD is not valid for vertex shaders."); - default: - unreachable("Unrecognized tex op"); - } - - vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode); - - if (ir->offset != NULL && ir->op != ir_txf) - inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant()); - - /* Stuff the channel select bits in the top of the texture offset */ - if (ir->op == ir_tg4) - inst->texture_offset |= gather_channel(ir, sampler) << 16; - - /* The message header is necessary for: - * - Gen4 (always) - * - Texel offsets - * - Gather channel selection - * - Sampler indices too large to fit in a 4-bit value. - */ - inst->header_present = - brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 || - is_high_sampler(brw, sampler_reg); - inst->base_mrf = 2; - inst->mlen = inst->header_present + 1; /* always at least one */ - inst->dst = dst_reg(this, ir->type); - inst->dst.writemask = WRITEMASK_XYZW; - inst->shadow_compare = ir->shadow_comparitor != NULL; - - inst->src[1] = sampler_reg; - - /* MRF for the first parameter */ - int param_base = inst->base_mrf + inst->header_present; - if (ir->op == ir_txs || ir->op == ir_query_levels) { - int writemask = brw->gen == 4 ? 
WRITEMASK_W : WRITEMASK_X; - emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod)); } else { - /* Load the coordinate */ - /* FINISHME: gl_clamp_mask and saturate */ - int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; - int zero_mask = 0xf & ~coord_mask; - - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask), - coordinate)); - - if (zero_mask != 0) { - emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask), - src_reg(0))); - } - /* Load the shadow comparitor */ - if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) { - emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type, - WRITEMASK_X), - shadow_comparitor)); - inst->mlen++; - } - - /* Load the LOD info */ - if (ir->op == ir_tex || ir->op == ir_txl) { - int mrf, writemask; - if (brw->gen >= 5) { - mrf = param_base + 1; - if (ir->shadow_comparitor) { - writemask = WRITEMASK_Y; - /* mlen already incremented */ - } else { - writemask = WRITEMASK_X; - inst->mlen++; - } - } else /* brw->gen == 4 */ { - mrf = param_base; - writemask = WRITEMASK_W; - } - emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod)); - } else if (ir->op == ir_txf) { - emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod)); - } else if (ir->op == ir_txf_ms) { - emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X), - sample_index)); - if (brw->gen >= 7) { - /* MCS data is in the first channel of `mcs`, but we need to get it into - * the .y channel of the second vec4 of params, so replicate .x across - * the whole vec4 and then mask off everything except .y - */ - mcs.swizzle = BRW_SWIZZLE_XXXX; - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y), - mcs)); - } - inst->mlen++; - } else if (ir->op == ir_txd) { - const glsl_type *type = lod_type; - - if (brw->gen >= 5) { - dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); - dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy)); - inst->mlen++; - - if (ir->type->vector_elements == 3 || ir->shadow_comparitor) { - dPdx.swizzle = BRW_SWIZZLE_ZZZZ; - dPdy.swizzle = BRW_SWIZZLE_ZZZZ; - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy)); - inst->mlen++; - - if (ir->shadow_comparitor) { - emit(MOV(dst_reg(MRF, param_base + 2, - ir->shadow_comparitor->type, WRITEMASK_Z), - shadow_comparitor)); - } - } - } else /* brw->gen == 4 */ { - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy)); - inst->mlen += 2; - } - } else if (ir->op == ir_tg4 && has_nonconstant_offset) { - if (ir->shadow_comparitor) { - emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W), - shadow_comparitor)); - } - - emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY), - offset_value)); - inst->mlen++; - } - } - - emit(inst); - - /* fixup num layers (z) for cube arrays: hardware returns faces * layers; - * spec requires layers. 
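The INT_QUOTIENT fixup emitted just below is plain integer division; checked on the CPU under the assumption that resinfo returned faces * layers:

   #include <cassert>

   int main()
   {
      const int faces_times_layers = 42; /* resinfo .z: 6 faces * 7 layers */
      const int layers = faces_times_layers / 6;
      assert(layers == 7); /* the value GL expects from textureSize() */
   }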
- */ - if (ir->op == ir_txs) { - glsl_type const *type = ir->sampler->type; - if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE && - type->sampler_array) { - emit_math(SHADER_OPCODE_INT_QUOTIENT, - writemask(inst->dst, WRITEMASK_Z), - src_reg(inst->dst), src_reg(6)); - } - } - - if (brw->gen == 6 && ir->op == ir_tg4) { - emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst); - } - - swizzle_result(ir, src_reg(inst->dst), sampler); -} - -/** - * Apply workarounds for Gen6 gather with UINT/SINT - */ -void -vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst) -{ - if (!wa) + backend_visitor::visit(ir); return; - - int width = (wa & WA_8BIT) ? 8 : 16; - dst_reg dst_f = dst; - dst_f.type = BRW_REGISTER_TYPE_F; - - /* Convert from UNORM to UINT */ - emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1)))); - emit(MOV(dst, src_reg(dst_f))); - - if (wa & WA_SIGN) { - /* Reinterpret the UINT value as a signed INT value by - * shifting the sign bit into place, then shifting back - * preserving sign. - */ - emit(SHL(dst, src_reg(dst), src_reg(32 - width))); - emit(ASR(dst, src_reg(dst), src_reg(32 - width))); } -} -/** - * Set up the gather channel based on the swizzle, for gather4. - */ -uint32_t -vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler) -{ - ir_constant *chan = ir->lod_info.component->as_constant(); - int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]); - switch (swiz) { - case SWIZZLE_X: return 0; - case SWIZZLE_Y: - /* gather4 sampler is broken for green channel on RG32F -- - * we must ask for blue instead. - */ - if (key->tex.gather_channel_quirk_mask & (1<<sampler)) - return 2; - return 1; - case SWIZZLE_Z: return 2; - case SWIZZLE_W: return 3; - default: - unreachable("Not reached"); /* zero, one swizzles handled already */ - } -} - -void -vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler) -{ - int s = key->tex.swizzles[sampler]; - - this->result = src_reg(this, ir->type); - dst_reg swizzled_result(this->result); - - if (ir->op == ir_query_levels) { - /* # levels is in .w */ - orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); - emit(MOV(swizzled_result, orig_val)); - return; - } - - if (ir->op == ir_txs || ir->type == glsl_type::float_type - || s == SWIZZLE_NOOP || ir->op == ir_tg4) { - emit(MOV(swizzled_result, orig_val)); - return; - } - - - int zero_mask = 0, one_mask = 0, copy_mask = 0; - int swizzle[4] = {0}; - - for (int i = 0; i < 4; i++) { - switch (GET_SWZ(s, i)) { - case SWIZZLE_ZERO: - zero_mask |= (1 << i); - break; - case SWIZZLE_ONE: - one_mask |= (1 << i); - break; - default: - copy_mask |= (1 << i); - swizzle[i] = GET_SWZ(s, i); - break; - } - } - - if (copy_mask) { - orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); - swizzled_result.writemask = copy_mask; - emit(MOV(swizzled_result, orig_val)); - } - - if (zero_mask) { - swizzled_result.writemask = zero_mask; - emit(MOV(swizzled_result, src_reg(0.0f))); - } - - if (one_mask) { - swizzled_result.writemask = one_mask; - emit(MOV(swizzled_result, src_reg(1.0f))); - } -} - -void -vec4_visitor::visit(ir_return *) -{ - unreachable("not reached"); + reg->type = brw_type_for_base_type(ir->type); + hash_table_insert(this->variable_ht, reg, ir); } void @@ -2801,35 +350,6 @@ vec4_visitor::visit(ir_discard *) } void -vec4_visitor::visit(ir_if *ir) -{ - /* Don't point the annotation at the if statement, because then it plus - * the then and else blocks get printed. 
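The arithmetic in the removed emit_gen6_gather_wa() above can be verified on the CPU; a sketch for the 8-bit signed case (the +0.5 only guards CPU float rounding, the GPU path relies on the MOV's float-to-int conversion):

   #include <cassert>
   #include <cstdint>

   int main()
   {
      const int width = 8;                 /* the WA_8BIT case */
      const float unorm = 200.0f / 255.0f; /* what the sampler handed back */

      /* UNORM -> UINT: scale by (1 << width) - 1, as the MUL does. */
      const uint32_t u = (uint32_t)(unorm * ((1u << width) - 1) + 0.5f);

      /* WA_SIGN: shift the sign bit into place, then back (the SHL/ASR pair). */
      const int32_t s = (int32_t)(u << (32 - width)) >> (32 - width);

      assert(u == 200 && s == -56); /* 200 reinterpreted as int8 is -56 */
   }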
- */ - this->base_ir = ir->condition; - - if (brw->gen == 6) { - emit_if_gen6(ir); - } else { - enum brw_predicate predicate; - emit_bool_to_cond_code(ir->condition, &predicate); - emit(IF(predicate)); - } - - visit_instructions(&ir->then_instructions); - - if (!ir->else_instructions.is_empty()) { - this->base_ir = ir->condition; - emit(BRW_OPCODE_ELSE); - - visit_instructions(&ir->else_instructions); - } - - this->base_ir = ir->condition; - emit(BRW_OPCODE_ENDIF); -} - -void vec4_visitor::visit(ir_emit_vertex *) { unreachable("not reached"); @@ -2842,55 +362,6 @@ vec4_visitor::visit(ir_end_primitive *) } void -vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg offset, - src_reg src0, src_reg src1) -{ - unsigned mlen = 0; - - /* Set the atomic operation offset. */ - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset)); - mlen++; - - /* Set the atomic operation arguments. */ - if (src0.file != BAD_FILE) { - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0)); - mlen++; - } - - if (src1.file != BAD_FILE) { - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1)); - mlen++; - } - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped atomic message on Ivy Bridge, but that's OK because - * unused channels will be masked out. - */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, - src_reg(atomic_op), src_reg(surf_index)); - inst->base_mrf = 0; - inst->mlen = mlen; -} - -void -vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg offset) -{ - /* Set the surface read offset. */ - emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset)); - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped surface read message, but that's OK because unused - * channels will be masked out. 
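Payload sizing in the removed emit_untyped_atomic() above follows one rule; as a standalone sketch (untyped_atomic_mlen is a hypothetical helper):

   /* One MRF for the offset, plus one per operand actually present;
    * BRW_AOP_INC takes none, a compare-and-swap style op takes two. */
   static unsigned
   untyped_atomic_mlen(bool has_src0, bool has_src1)
   {
      return 1 + (has_src0 ? 1 : 0) + (has_src1 ? 1 : 0);
   }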
- */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, - dst, src_reg(surf_index)); - inst->base_mrf = 0; - inst->mlen = 1; -} - -void vec4_visitor::emit_ndc_computation() { /* Get the position */ @@ -2900,17 +371,17 @@ vec4_visitor::emit_ndc_computation() dst_reg ndc = dst_reg(this, glsl_type::vec4_type); output_reg[BRW_VARYING_SLOT_NDC] = ndc; - current_annotation = "NDC"; + bld.set_annotation("NDC"); dst_reg ndc_w = ndc; ndc_w.writemask = WRITEMASK_W; src_reg pos_w = pos; pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); - emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); + bld.emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); dst_reg ndc_xyz = ndc; ndc_xyz.writemask = WRITEMASK_XYZ; - emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); + bld.MUL(ndc_xyz, pos, src_reg(ndc_w)); } void @@ -2923,29 +394,29 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg) dst_reg header1_w = header1; header1_w.writemask = WRITEMASK_W; - emit(MOV(header1, 0u)); + bld.MOV(header1, 0u); if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]); - current_annotation = "Point size"; - emit(MUL(header1_w, psiz, src_reg((float)(1 << 11)))); - emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8)); + bld.set_annotation("Point size"); + bld.MUL(header1_w, psiz, src_reg((float)(1 << 11))); + bld.AND(header1_w, src_reg(header1_w), 0x7ff << 8); } if (key->userclip_active) { - current_annotation = "Clipping flags"; + bld.set_annotation("Clipping flags"); dst_reg flags0 = dst_reg(this, glsl_type::uint_type); dst_reg flags1 = dst_reg(this, glsl_type::uint_type); - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L)); - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0)); - emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); + bld.CMP(bld.reg_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L); + bld.emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0)); + bld.OR(header1_w, src_reg(header1_w), src_reg(flags0)); - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L)); - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0)); - emit(SHL(flags1, src_reg(flags1), src_reg(4))); - emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); + bld.CMP(bld.reg_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L); + bld.emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0)); + bld.SHL(flags1, src_reg(flags1), src_reg(4)); + bld.OR(header1_w, src_reg(header1_w), src_reg(flags1)); } /* i965 clipping workaround: @@ -2960,35 +431,35 @@ vec4_visitor::emit_psiz_and_flags(dst_reg reg) if (brw->has_negative_rhw_bug) { src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]); ndc_w.swizzle = BRW_SWIZZLE_WWWW; - emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L)); + bld.CMP(bld.reg_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L); vec4_instruction *inst; - inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6))); + inst = bld.OR(header1_w, src_reg(header1_w), src_reg(1u << 6)); inst->predicate = BRW_PREDICATE_NORMAL; - inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f))); + inst = bld.MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)); inst->predicate = BRW_PREDICATE_NORMAL; } - emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); + bld.MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)); } else if (brw->gen < 6) { - 
emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u)); + bld.MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u); } else { - emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0))); + bld.MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)); if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { dst_reg reg_w = reg; reg_w.writemask = WRITEMASK_W; - emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ]))); + bld.MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])); } if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) { dst_reg reg_y = reg; reg_y.writemask = WRITEMASK_Y; reg_y.type = BRW_REGISTER_TYPE_D; - emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER]))); + bld.MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])); } if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) { dst_reg reg_z = reg; reg_z.writemask = WRITEMASK_Z; reg_z.type = BRW_REGISTER_TYPE_D; - emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT]))); + bld.MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])); } } } @@ -3016,9 +487,9 @@ vec4_visitor::emit_clip_distances(dst_reg reg, int offset) for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4; ++i) { reg.writemask = 1 << i; - emit(DP4(reg, + bld.DP4(reg, src_reg(output_reg[clip_vertex]), - src_reg(this->userplane[i + offset]))); + src_reg(this->userplane[i + offset])); } } @@ -3027,10 +498,10 @@ vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying) { assert (varying < VARYING_SLOT_MAX); reg.type = output_reg[varying].type; - current_annotation = output_reg_annotation[varying]; + bld.set_annotation(output_reg_annotation[varying]); /* Copy the register, saturating if necessary */ - vec4_instruction *inst = emit(MOV(reg, - src_reg(output_reg[varying]))); + vec4_instruction *inst = bld.MOV(reg, + src_reg(output_reg[varying])); if ((varying == VARYING_SLOT_COL0 || varying == VARYING_SLOT_COL1 || varying == VARYING_SLOT_BFC0 || @@ -3049,17 +520,17 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying) case VARYING_SLOT_PSIZ: { /* PSIZ is always in slot 0, and is coupled with other flags. */ - current_annotation = "indices, point width, clip flags"; + bld.set_annotation("indices, point width, clip flags"); emit_psiz_and_flags(reg); break; } case BRW_VARYING_SLOT_NDC: - current_annotation = "NDC"; - emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC]))); + bld.set_annotation("NDC"); + bld.MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])); break; case VARYING_SLOT_POS: - current_annotation = "gl_Position"; - emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS]))); + bld.set_annotation("gl_Position"); + bld.MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])); break; case VARYING_SLOT_EDGE: /* This is present when doing unfilled polygons. We're supposed to copy @@ -3068,9 +539,9 @@ vec4_visitor::emit_urb_slot(dst_reg reg, int varying) * of that attribute (starts as 1.0f). This is then used in clipping to * determine which edges should be drawn as wireframe. 
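Each DP4 emitted by emit_clip_distances() above computes a signed plane distance; in scalar form:

   /* Signed distance of a homogeneous position from one user clip plane;
    * each DP4 writes this value into a single channel of reg. */
   static float
   clip_distance(const float pos[4], const float plane[4])
   {
      return pos[0] * plane[0] + pos[1] * plane[1] +
             pos[2] * plane[2] + pos[3] * plane[3];
   }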
*/ - current_annotation = "edge flag"; - emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG, - glsl_type::float_type, WRITEMASK_XYZW)))); + bld.set_annotation("edge flag"); + bld.MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG, + glsl_type::float_type, WRITEMASK_XYZW))); break; case BRW_VARYING_SLOT_PAD: /* No need to write to this slot */ @@ -3138,7 +609,7 @@ vec4_visitor::emit_vertex() /* Lower legacy ff and ClipVertex clipping to clip distances */ if (key->userclip_active && !prog->UsesClipDistanceOut) { - current_annotation = "user clip distances"; + bld.set_annotation("user clip distances"); output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type); output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type); @@ -3173,7 +644,7 @@ vec4_visitor::emit_vertex() } complete = slot >= prog_data->vue_map.num_slots; - current_annotation = "URB write"; + bld.set_annotation("URB write"); vec4_instruction *inst = emit_urb_write_opcode(complete); inst->base_mrf = base_mrf; inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf); @@ -3198,44 +669,14 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, message_header_scale *= 16; if (reladdr) { + vec4_builder ibld = bld.at(block, inst); src_reg index = src_reg(this, glsl_type::int_type); - emit_before(block, inst, ADD(dst_reg(index), *reladdr, - src_reg(reg_offset))); - emit_before(block, inst, MUL(dst_reg(index), index, - src_reg(message_header_scale))); - - return index; - } else { - return src_reg(reg_offset * message_header_scale); - } -} - -src_reg -vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset) -{ - if (reladdr) { - src_reg index = src_reg(this, glsl_type::int_type); - - emit_before(block, inst, ADD(dst_reg(index), *reladdr, - src_reg(reg_offset))); - - /* Pre-gen6, the message header uses byte offsets instead of vec4 - * (16-byte) offset units. - */ - if (brw->gen < 6) { - emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16))); - } + ibld.ADD(dst_reg(index), *reladdr, src_reg(reg_offset)); + ibld.MUL(dst_reg(index), index, src_reg(message_header_scale)); return index; - } else if (brw->gen >= 8) { - /* Store the offset in a GRF so we can send-from-GRF. */ - src_reg offset = src_reg(this, glsl_type::int_type); - emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset))); - return offset; } else { - int message_header_scale = brw->gen < 6 ? 
16 : 1; return src_reg(reg_offset * message_header_scale); } } @@ -3251,11 +692,12 @@ vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst, dst_reg temp, src_reg orig_src, int base_offset) { + vec4_builder ibld = bld.at(block, inst); int reg_offset = base_offset + orig_src.reg_offset; src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset); - emit_before(block, inst, SCRATCH_READ(temp, index)); + SCRATCH_READ(ibld, temp, index); } /** @@ -3291,13 +733,13 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1], swizzles[2], swizzles[3]); + vec4_builder ibld = bld.at(block, (vec4_instruction *)inst->next); dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), inst->dst.writemask)); - vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); - write->predicate = inst->predicate; - write->ir = inst->ir; - write->annotation = inst->annotation; - inst->insert_after(block, write); + ibld.set_base_ir(inst->ir); + ibld.set_annotation(inst->annotation); + exec_predicate(inst->predicate, + SCRATCH_WRITE(ibld, dst, temp, index)); inst->dst.file = temp.file; inst->dst.reg = temp.reg; @@ -3346,8 +788,8 @@ vec4_visitor::move_grf_array_access_to_scratch() */ foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { /* Set up the annotation tracking for new generated instructions. */ - base_ir = inst->ir; - current_annotation = inst->annotation; + bld.set_base_ir(inst->ir); + bld.set_annotation(inst->annotation); if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) { emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]); @@ -3371,37 +813,6 @@ vec4_visitor::move_grf_array_access_to_scratch() } /** - * Emits an instruction before @inst to load the value named by @orig_src - * from the pull constant buffer (surface) at @base_offset to @temp. - */ -void -vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, - dst_reg temp, src_reg orig_src, - int base_offset) -{ - int reg_offset = base_offset + orig_src.reg_offset; - src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start); - src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr, - reg_offset); - vec4_instruction *load; - - if (brw->gen >= 7) { - dst_reg grf_offset = dst_reg(this, glsl_type::int_type); - grf_offset.type = offset.type; - emit_before(block, inst, MOV(grf_offset, offset)); - - load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, - temp, index, src_reg(grf_offset)); - } else { - load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, - temp, index, offset); - load->base_mrf = 14; - load->mlen = 1; - } - emit_before(block, inst, load); -} - -/** * Implements array access of uniforms by inserting a * PULL_CONSTANT_LOAD instruction. * @@ -3430,7 +841,7 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) continue; - int uniform = inst->src[i].reg; + unsigned uniform = inst->src[i].reg; /* If this array isn't already present in the pull constant buffer, * add it. @@ -3449,13 +860,16 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() } /* Set up the annotation tracking for new generated instructions. 
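The "16 * loc" byte offset handed to the new emit_pull_constant_load() call just below is vec4 addressing; a sketch with a hypothetical helper name:

   /* Pull constants are packed as vec4s, so the remapped location plus the
    * access' reg_offset lands at a 16-byte-aligned offset in the surface. */
   static unsigned
   pull_constant_byte_offset(unsigned pull_constant_loc, unsigned reg_offset)
   {
      return 16 * (pull_constant_loc + reg_offset);
   }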
*/ - base_ir = inst->ir; - current_annotation = inst->annotation; + bld.set_base_ir(inst->ir); + bld.set_annotation(inst->annotation); - dst_reg temp = dst_reg(this, glsl_type::vec4_type); + vec4_builder ibld = bld.at(block, inst); + int loc = pull_constant_loc[uniform] + inst->src[i].reg_offset; + src_reg surf_index(prog_data->base.binding_table.pull_constants_start); + dst_reg temp = bld.natural_reg(BRW_REGISTER_TYPE_F); - emit_pull_constant_load(block, inst, temp, inst->src[i], - pull_constant_loc[uniform]); + emit_pull_constant_load(ibld, temp, surf_index, 16 * loc, + inst->src[i].reladdr, 4); inst->src[i].file = temp.file; inst->src[i].reg = temp.reg; @@ -3472,16 +886,162 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() split_uniform_registers(); } -void -vec4_visitor::resolve_ud_negate(src_reg *reg) +static bool +is_high_sampler(struct brw_context *brw, src_reg sampler) { - if (reg->type != BRW_REGISTER_TYPE_UD || - !reg->negate) - return; + if (brw->gen < 8 && !brw->is_haswell) + return false; - src_reg temp = src_reg(this, glsl_type::uvec4_type); - emit(BRW_OPCODE_MOV, dst_reg(temp), *reg); - *reg = temp; + return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; +} + +vec4_instruction * +vec4_visitor::emit_texture(ir_texture *ir, const dst_reg &dst, + const src_reg &coordinate, const src_reg &shadow_c, + const src_reg &lod, const src_reg &lod2, + const src_reg &offset_val, const src_reg &sample_index, + const src_reg &mcs, const src_reg &sampler) +{ + const bool has_nonconstant_offset = (offset_val.file != BAD_FILE); + enum opcode opcode; + + switch (ir->op) { + case ir_tex: opcode = SHADER_OPCODE_TXL; break; + case ir_txl: opcode = SHADER_OPCODE_TXL; break; + case ir_txd: opcode = SHADER_OPCODE_TXD; break; + case ir_txf: opcode = SHADER_OPCODE_TXF; break; + case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; + case ir_txs: opcode = SHADER_OPCODE_TXS; break; + case ir_tg4: opcode = has_nonconstant_offset + ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; + case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; + case ir_txb: + unreachable("TXB is not valid for vertex shaders."); + case ir_lod: + unreachable("LOD is not valid for vertex shaders."); + default: + unreachable("Unrecognized tex op"); + } + + vec4_instruction inst(opcode, dst, src_reg(), sampler); + + /* The message header is necessary for: + * - Gen4 (always) + * - Texel offsets + * - Gather channel selection + * - Sampler indices too large to fit in a 4-bit value. + */ + inst.header_present = + brw->gen < 5 || inst.texture_offset != 0 || ir->op == ir_tg4 || + is_high_sampler(brw, sampler); + inst.base_mrf = 2; + inst.mlen = inst.header_present + 1; /* always at least one */ + + /* MRF for the first parameter */ + dst_reg payload = dst_reg(MRF, inst.base_mrf + inst.header_present); + + if (ir->op == ir_txs || ir->op == ir_query_levels) { + const unsigned mask = brw->gen == 4 ? 
WRITEMASK_W : WRITEMASK_X; + bld.MOV(writemask(retype(payload, lod.type), mask), lod); + } else { + /* Load the coordinate */ + /* FINISHME: gl_clamp_mask and saturate */ + int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1; + int zero_mask = 0xf & ~coord_mask; + + bld.MOV(writemask(retype(payload, coordinate.type), coord_mask), + coordinate); + + if (zero_mask != 0) + bld.MOV(writemask(retype(payload, coordinate.type), zero_mask), + src_reg(0)); + + /* Load the shadow comparitor */ + if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) { + bld.MOV(writemask(offset(retype(payload, shadow_c.type), 1), + WRITEMASK_X), + shadow_c); + inst.mlen++; + } + + /* Load the LOD info */ + if (ir->op == ir_tex || ir->op == ir_txl) { + dst_reg mrf; + unsigned mask; + if (brw->gen >= 5) { + mrf = offset(payload, 1); + if (ir->shadow_comparitor) { + mask = WRITEMASK_Y; + /* mlen already incremented */ + } else { + mask = WRITEMASK_X; + inst.mlen++; + } + } else /* brw->gen == 4 */ { + mrf = payload; + mask = WRITEMASK_W; + } + bld.MOV(writemask(retype(mrf, lod.type), mask), lod); + } else if (ir->op == ir_txf) { + bld.MOV(writemask(retype(payload, lod.type), WRITEMASK_W), + lod); + } else if (ir->op == ir_txf_ms) { + bld.MOV(writemask(retype(offset(payload, 1), sample_index.type), + WRITEMASK_X), sample_index); + if (brw->gen >= 7) { + /* MCS data is in the first channel of `mcs`, but we need to get it into + * the .y channel of the second vec4 of params, so replicate .x across + * the whole vec4 and then mask off everything except .y + */ + bld.MOV(writemask(retype(offset(payload, 1), BRW_REGISTER_TYPE_UD), + WRITEMASK_Y), + swizzle(mcs, BRW_SWIZZLE_XXXX)); + } + inst.mlen++; + } else if (ir->op == ir_txd) { + dst_reg mrf = retype(payload, lod.type); + + if (brw->gen >= 5) { + bld.MOV(writemask(offset(mrf, 1), WRITEMASK_XZ), + swizzle(lod, BRW_SWIZZLE_XXYY)); + bld.MOV(writemask(offset(mrf, 1), WRITEMASK_YW), + swizzle(lod2, BRW_SWIZZLE_XXYY)); + inst.mlen++; + + if (ir->type->vector_elements == 3 || ir->shadow_comparitor) { + bld.MOV(writemask(offset(mrf, 2), WRITEMASK_X), + swizzle(lod, BRW_SWIZZLE_ZZZZ)); + bld.MOV(writemask(offset(mrf, 2), WRITEMASK_Y), + swizzle(lod2, BRW_SWIZZLE_ZZZZ)); + inst.mlen++; + + if (ir->shadow_comparitor) + bld.MOV(writemask(offset(retype(payload, shadow_c.type), 2), + WRITEMASK_Z), shadow_c); + } + } else /* brw->gen == 4 */ { + bld.MOV(writemask(offset(mrf, 1), WRITEMASK_XYZ), lod); + bld.MOV(writemask(offset(mrf, 2), WRITEMASK_XYZ), lod2); + inst.mlen += 2; + } + } else if (ir->op == ir_tg4 && has_nonconstant_offset) { + if (ir->shadow_comparitor) + bld.MOV(writemask(retype(payload, shadow_c.type), + WRITEMASK_W), shadow_c); + + bld.MOV(writemask(retype(offset(payload, 1), BRW_REGISTER_TYPE_D), + WRITEMASK_XY), offset_val); + inst.mlen++; + } + } + + return bld.emit(inst); +} + +src_reg +vec4_visitor::emit_untyped_surface_header() +{ + return src_reg(); } vec4_visitor::vec4_visitor(struct brw_context *brw, @@ -3494,81 +1054,26 @@ vec4_visitor::vec4_visitor(struct brw_context *brw, void *mem_ctx, bool debug_flag, bool no_spills, - shader_time_shader_type st_base, - shader_time_shader_type st_written, - shader_time_shader_type st_reset) - : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage), + shader_time_shader_type st_type) + : backend_visitor(brw, shader_prog, prog, &prog_data->base, mem_ctx, stage, + debug_flag, false, + vec4_builder(brw, mem_ctx, alloc, instructions), + st_type, + /* Initialize 
uniform_array_size to at least 1 because + * pre-gen6 VS requires at least one. See + * setup_uniforms() in brw_vec4.cpp. + */ + MAX2(prog_data->base.nr_params, 1)), c(c), key(key), prog_data(prog_data), sanity_param_count(0), - fail_msg(NULL), - first_non_payload_grf(0), need_all_constants_in_pull_buffer(false), debug_flag(debug_flag), - no_spills(no_spills), - st_base(st_base), - st_written(st_written), - st_reset(st_reset) + no_spills(no_spills) { - this->mem_ctx = mem_ctx; - this->failed = false; - - this->base_ir = NULL; - this->current_annotation = NULL; memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation)); - - this->variable_ht = hash_table_ctor(0, - hash_table_pointer_hash, - hash_table_pointer_compare); - - this->virtual_grf_start = NULL; - this->virtual_grf_end = NULL; this->live_intervals_valid = false; - - this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; - - this->uniforms = 0; - - /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires - * at least one. See setup_uniforms() in brw_vec4.cpp. - */ - this->uniform_array_size = 1; - if (prog_data) { - this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1); - } - - this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size); - this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size); -} - -vec4_visitor::~vec4_visitor() -{ - hash_table_dtor(this->variable_ht); -} - - -void -vec4_visitor::fail(const char *format, ...) -{ - va_list va; - char *msg; - - if (failed) - return; - - failed = true; - - va_start(va, format); - msg = ralloc_vasprintf(mem_ctx, format, va); - va_end(va); - msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg); - - this->fail_msg = msg; - - if (debug_flag) { - fprintf(stderr, "%s", msg); - } } } /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp index 5d9027b2ea6..304d3f0015c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp @@ -43,10 +43,10 @@ vec4_visitor::emit_vp_sop(enum brw_conditional_mod conditional_mod, { vec4_instruction *inst; - inst = emit(BRW_OPCODE_CMP, dst_null_d(), src0, src1); + inst = bld.emit(BRW_OPCODE_CMP, bld.reg_null_d(), src0, src1); inst->conditional_mod = conditional_mod; - inst = emit(BRW_OPCODE_SEL, dst, one, src_reg(0.0f)); + inst = bld.emit(BRW_OPCODE_SEL, dst, one, src_reg(0.0f)); inst->predicate = BRW_PREDICATE_NORMAL; } @@ -68,11 +68,11 @@ vec4_vs_visitor::emit_program_code() * mov.f0 dst 1.0 */ src_reg one = src_reg(this, glsl_type::float_type); - emit(MOV(dst_reg(one), src_reg(1.0f))); + bld.MOV(dst_reg(one), src_reg(1.0f)); for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) { const struct prog_instruction *vpi = &prog->Instructions[insn]; - base_ir = vpi; + bld.set_base_ir(vpi); dst_reg dst; src_reg src[3]; @@ -89,11 +89,11 @@ vec4_vs_visitor::emit_program_code() case OPCODE_ABS: src[0].abs = true; src[0].negate = false; - emit(MOV(dst, src[0])); + bld.MOV(dst, src[0]); break; case OPCODE_ADD: - emit(ADD(dst, src[0], src[1])); + bld.ADD(dst, src[0], src[1]); break; case OPCODE_ARL: @@ -102,40 +102,40 @@ vec4_vs_visitor::emit_program_code() dst_reg dst_f = dst; dst_f.type = BRW_REGISTER_TYPE_F; - emit(RNDD(dst_f, src[0])); - emit(MOV(dst, src_reg(dst_f))); + bld.RNDD(dst_f, src[0]); + bld.MOV(dst, src_reg(dst_f)); } else { - emit(RNDD(dst, src[0])); + bld.RNDD(dst, src[0]); } break; case OPCODE_DP3: - emit(DP3(dst, src[0], 
src[1])); + bld.DP3(dst, src[0], src[1]); break; case OPCODE_DP4: - emit(DP4(dst, src[0], src[1])); + bld.DP4(dst, src[0], src[1]); break; case OPCODE_DPH: - emit(DPH(dst, src[0], src[1])); + bld.DPH(dst, src[0], src[1]); break; case OPCODE_DST: { dst_reg t = dst; if (vpi->DstReg.WriteMask & WRITEMASK_X) { t.writemask = WRITEMASK_X; - emit(MOV(t, src_reg(1.0f))); + bld.MOV(t, src_reg(1.0f)); } if (vpi->DstReg.WriteMask & WRITEMASK_Y) { t.writemask = WRITEMASK_Y; - emit(MUL(t, src[0], src[1])); + bld.MUL(t, src[0], src[1]); } if (vpi->DstReg.WriteMask & WRITEMASK_Z) { t.writemask = WRITEMASK_Z; - emit(MOV(t, src[0])); + bld.MOV(t, src[0]); } if (vpi->DstReg.WriteMask & WRITEMASK_W) { t.writemask = WRITEMASK_W; - emit(MOV(t, src[1])); + bld.MOV(t, src[1]); } break; } @@ -146,46 +146,46 @@ vec4_vs_visitor::emit_program_code() /* tmp_d = floor(src[0].x) */ src_reg tmp_d = src_reg(this, glsl_type::ivec4_type); assert(tmp_d.type == BRW_REGISTER_TYPE_D); - emit(RNDD(dst_reg(tmp_d), swizzle(src[0], BRW_SWIZZLE_XXXX))); + bld.RNDD(dst_reg(tmp_d), swizzle(src[0], BRW_SWIZZLE_XXXX)); /* result[0] = 2.0 ^ tmp */ /* Adjust exponent for floating point: exp += 127 */ dst_reg tmp_d_x(GRF, tmp_d.reg, glsl_type::int_type, WRITEMASK_X); - emit(ADD(tmp_d_x, tmp_d, src_reg(127))); + bld.ADD(tmp_d_x, tmp_d, src_reg(127)); /* Install exponent and sign. Excess drops off the edge: */ dst_reg res_d_x(GRF, result.reg, glsl_type::int_type, WRITEMASK_X); - emit(BRW_OPCODE_SHL, res_d_x, tmp_d, src_reg(23)); + bld.emit(BRW_OPCODE_SHL, res_d_x, tmp_d, src_reg(23)); } if (vpi->DstReg.WriteMask & WRITEMASK_Y) { result.writemask = WRITEMASK_Y; - emit(FRC(result, src[0])); + bld.FRC(result, src[0]); } if (vpi->DstReg.WriteMask & WRITEMASK_Z) { result.writemask = WRITEMASK_Z; - emit_math(SHADER_OPCODE_EXP2, result, src[0]); + bld.emit_math(SHADER_OPCODE_EXP2, result, src[0]); } if (vpi->DstReg.WriteMask & WRITEMASK_W) { result.writemask = WRITEMASK_W; - emit(MOV(result, src_reg(1.0f))); + bld.MOV(result, src_reg(1.0f)); } break; } case OPCODE_EX2: - emit_math(SHADER_OPCODE_EXP2, dst, src[0]); + bld.emit_math(SHADER_OPCODE_EXP2, dst, src[0]); break; case OPCODE_FLR: - emit(RNDD(dst, src[0])); + bld.RNDD(dst, src[0]); break; case OPCODE_FRC: - emit(FRC(dst, src[0])); + bld.FRC(dst, src[0]); break; case OPCODE_LG2: - emit_math(SHADER_OPCODE_LOG2, dst, src[0]); + bld.emit_math(SHADER_OPCODE_LOG2, dst, src[0]); break; case OPCODE_LIT: { @@ -207,36 +207,36 @@ vec4_vs_visitor::emit_program_code() */ if (vpi->DstReg.WriteMask & WRITEMASK_XW) { result.writemask = WRITEMASK_XW; - emit(MOV(result, src_reg(1.0f))); + bld.MOV(result, src_reg(1.0f)); } if (vpi->DstReg.WriteMask & WRITEMASK_YZ) { result.writemask = WRITEMASK_YZ; - emit(MOV(result, src_reg(0.0f))); + bld.MOV(result, src_reg(0.0f)); src_reg tmp_x = swizzle(src[0], BRW_SWIZZLE_XXXX); - emit(CMP(dst_null_d(), tmp_x, src_reg(0.0f), BRW_CONDITIONAL_G)); - emit(IF(BRW_PREDICATE_NORMAL)); + bld.CMP(bld.reg_null_d(), tmp_x, src_reg(0.0f), BRW_CONDITIONAL_G); + bld.IF(BRW_PREDICATE_NORMAL); if (vpi->DstReg.WriteMask & WRITEMASK_Y) { result.writemask = WRITEMASK_Y; - emit(MOV(result, tmp_x)); + bld.MOV(result, tmp_x); } if (vpi->DstReg.WriteMask & WRITEMASK_Z) { /* if (tmp.y < 0) tmp.y = 0; */ src_reg tmp_y = swizzle(src[0], BRW_SWIZZLE_YYYY); result.writemask = WRITEMASK_Z; - emit_minmax(BRW_CONDITIONAL_G, result, tmp_y, src_reg(0.0f)); + bld.emit_minmax(BRW_CONDITIONAL_G, result, tmp_y, src_reg(0.0f)); src_reg clamped_y(result); clamped_y.swizzle = BRW_SWIZZLE_ZZZZ; src_reg tmp_w = 
swizzle(src[0], BRW_SWIZZLE_WWWW); - emit_math(SHADER_OPCODE_POW, result, clamped_y, tmp_w); + bld.emit_math(SHADER_OPCODE_POW, result, clamped_y, tmp_w); } - emit(BRW_OPCODE_ENDIF); + bld.emit(BRW_OPCODE_ENDIF); } break; } @@ -260,19 +260,19 @@ vec4_vs_visitor::emit_program_code() */ if (vpi->DstReg.WriteMask & WRITEMASK_XZ) { result.writemask = WRITEMASK_X; - emit(AND(result, arg0_ud, src_reg((1u << 31) - 1))); - emit(BRW_OPCODE_SHR, result, result_src, src_reg(23u)); + bld.AND(result, arg0_ud, src_reg((1u << 31) - 1)); + bld.emit(BRW_OPCODE_SHR, result, result_src, src_reg(23u)); src_reg result_d(result_src); result_d.type = BRW_REGISTER_TYPE_D; /* does it matter? */ result.type = BRW_REGISTER_TYPE_F; - emit(ADD(result, result_d, src_reg(-127))); + bld.ADD(result, result_d, src_reg(-127)); } if (vpi->DstReg.WriteMask & WRITEMASK_YZ) { result.writemask = WRITEMASK_Y; result.type = BRW_REGISTER_TYPE_UD; - emit(AND(result, arg0_ud, src_reg((1u << 23) - 1))); - emit(OR(result, result_src, src_reg(127u << 23))); + bld.AND(result, arg0_ud, src_reg((1u << 23) - 1)); + bld.OR(result, result_src, src_reg(127u << 23)); } if (vpi->DstReg.WriteMask & WRITEMASK_Z) { @@ -294,51 +294,51 @@ vec4_vs_visitor::emit_program_code() result_x.swizzle = BRW_SWIZZLE_XXXX; result_y.swizzle = BRW_SWIZZLE_YYYY; result_z.swizzle = BRW_SWIZZLE_ZZZZ; - emit_math(SHADER_OPCODE_LOG2, result, result_y); - emit(ADD(result, result_z, result_x)); + bld.emit_math(SHADER_OPCODE_LOG2, result, result_y); + bld.ADD(result, result_z, result_x); } if (vpi->DstReg.WriteMask & WRITEMASK_W) { result.type = BRW_REGISTER_TYPE_F; result.writemask = WRITEMASK_W; - emit(MOV(result, src_reg(1.0f))); + bld.MOV(result, src_reg(1.0f)); } break; } case OPCODE_MAD: { src_reg temp = src_reg(this, glsl_type::vec4_type); - emit(MUL(dst_reg(temp), src[0], src[1])); - emit(ADD(dst, temp, src[2])); + bld.MUL(dst_reg(temp), src[0], src[1]); + bld.ADD(dst, temp, src[2]); break; } case OPCODE_MAX: - emit_minmax(BRW_CONDITIONAL_G, dst, src[0], src[1]); + bld.emit_minmax(BRW_CONDITIONAL_G, dst, src[0], src[1]); break; case OPCODE_MIN: - emit_minmax(BRW_CONDITIONAL_L, dst, src[0], src[1]); + bld.emit_minmax(BRW_CONDITIONAL_L, dst, src[0], src[1]); break; case OPCODE_MOV: - emit(MOV(dst, src[0])); + bld.MOV(dst, src[0]); break; case OPCODE_MUL: - emit(MUL(dst, src[0], src[1])); + bld.MUL(dst, src[0], src[1]); break; case OPCODE_POW: - emit_math(SHADER_OPCODE_POW, dst, src[0], src[1]); + bld.emit_math(SHADER_OPCODE_POW, dst, src[0], src[1]); break; case OPCODE_RCP: - emit_math(SHADER_OPCODE_RCP, dst, src[0]); + bld.emit_math(SHADER_OPCODE_RCP, dst, src[0]); break; case OPCODE_RSQ: - emit_math(SHADER_OPCODE_RSQ, dst, src[0]); + bld.emit_math(SHADER_OPCODE_RSQ, dst, src[0]); break; case OPCODE_SGE: @@ -352,7 +352,7 @@ vec4_vs_visitor::emit_program_code() case OPCODE_SUB: { src_reg neg_src1 = src[1]; neg_src1.negate = !src[1].negate; - emit(ADD(dst, src[0], neg_src1)); + bld.ADD(dst, src[0], neg_src1); break; } @@ -360,21 +360,21 @@ vec4_vs_visitor::emit_program_code() /* Note that SWZ's extended swizzles are handled in the general * get_src_reg() code. 
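The bit manipulation in the OPCODE_LOG case above (mask off the sign, shift out the mantissa, remove the bias, then force the exponent field to 127) can be checked on the CPU:

   #include <cassert>
   #include <cstdint>
   #include <cstring>

   int main()
   {
      const float x = 12.0f; /* 1.5 * 2^3 */
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof(bits));

      /* Exponent: clear the sign, shift out the mantissa, remove the bias. */
      const int exponent = (int)((bits & 0x7fffffffu) >> 23) - 127;

      /* Mantissa: keep the fraction, force the exponent field to 127,
       * which rescales the value into [1, 2). */
      uint32_t m_bits = (bits & ((1u << 23) - 1)) | (127u << 23);
      float mantissa;
      std::memcpy(&mantissa, &m_bits, sizeof(mantissa));

      assert(exponent == 3 && mantissa == 1.5f);
   }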
*/ - emit(MOV(dst, src[0])); + bld.MOV(dst, src[0]); break; case OPCODE_XPD: { src_reg t1 = src_reg(this, glsl_type::vec4_type); src_reg t2 = src_reg(this, glsl_type::vec4_type); - emit(MUL(dst_reg(t1), + bld.MUL(dst_reg(t1), swizzle(src[0], BRW_SWIZZLE_YZXW), - swizzle(src[1], BRW_SWIZZLE_ZXYW))); - emit(MUL(dst_reg(t2), + swizzle(src[1], BRW_SWIZZLE_ZXYW)); + bld.MUL(dst_reg(t2), swizzle(src[0], BRW_SWIZZLE_ZXYW), - swizzle(src[1], BRW_SWIZZLE_YZXW))); + swizzle(src[1], BRW_SWIZZLE_YZXW)); t2.negate = true; - emit(ADD(dst, t1, t2)); + bld.ADD(dst, t1, t2); break; } @@ -388,7 +388,7 @@ vec4_vs_visitor::emit_program_code() /* Copy the temporary back into the actual destination register. */ if (vpi->Opcode != OPCODE_END) { - emit(MOV(get_vp_dst_reg(vpi->DstReg), src_reg(dst))); + bld.MOV(get_vp_dst_reg(vpi->DstReg), src_reg(dst)); } } @@ -475,7 +475,7 @@ vec4_vs_visitor::get_vp_dst_reg(const prog_dst_register &dst) } case PROGRAM_UNDEFINED: - return dst_null_f(); + return bld.reg_null_f(); default: unreachable("vec4_vp: bad destination register file"); @@ -530,10 +530,10 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src) src_reg reladdr = src_reg(this, glsl_type::int_type); dst_reg dst_reladdr = dst_reg(reladdr); dst_reladdr.writemask = WRITEMASK_X; - emit(ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index))); + bld.ADD(dst_reladdr, this->vp_addr_reg, src_reg(src.Index)); if (brw->gen < 6) - emit(MUL(dst_reladdr, reladdr, src_reg(16))); + bld.MUL(dst_reladdr, reladdr, src_reg(16)); #if 0 assert(src.Index < this->uniforms); @@ -547,17 +547,14 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src) src_reg surf_index = src_reg(unsigned(prog_data->base.binding_table.pull_constants_start)); vec4_instruction *load; if (brw->gen >= 7) { - load = new(mem_ctx) - vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, - dst_reg(result), surf_index, reladdr); + load = bld.emit(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, + dst_reg(result), surf_index, reladdr); } else { - load = new(mem_ctx) - vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, - dst_reg(result), surf_index, reladdr); + load = bld.emit(VS_OPCODE_PULL_CONSTANT_LOAD, + dst_reg(result), surf_index, reladdr); load->base_mrf = 14; load->mlen = 1; } - emit(load); break; } @@ -571,7 +568,7 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src) for (int i = 0; i < 4; i++) { dst_reg t = dst_reg(result); t.writemask = 1 << i; - emit(MOV(t, src_reg(plist->ParameterValues[src.Index][i].f))); + bld.MOV(t, src_reg(plist->ParameterValues[src.Index][i].f)); } break; @@ -636,24 +633,24 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src) if (src_mask) { temp.writemask = src_mask; - emit(MOV(temp, result)); + bld.MOV(temp, result); } if (zeros_mask) { temp.writemask = zeros_mask; - emit(MOV(temp, src_reg(0.0f))); + bld.MOV(temp, src_reg(0.0f)); } if (ones_mask) { temp.writemask = ones_mask; - emit(MOV(temp, src_reg(1.0f))); + bld.MOV(temp, src_reg(1.0f)); } if (src.Negate) { temp.writemask = src.Negate; src_reg neg(temp_src); neg.negate = true; - emit(MOV(temp, neg)); + bld.MOV(temp, neg); } result = temp_src; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp index 72b6ef03b42..ac544354d6a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp @@ -51,7 +51,7 @@ vec4_vs_visitor::emit_prolog() dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); dst.writemask = (1 << 
(wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1; - emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f))); + bld.MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)); } /* Do sign recovery for 2101010 formats if required. */ @@ -59,19 +59,19 @@ vec4_vs_visitor::emit_prolog() if (sign_recovery_shift.file == BAD_FILE) { /* shift constant: <22,22,22,30> */ sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type); - emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u))); - emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u))); + bld.MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)); + bld.MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)); } - emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift))); - emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift))); + bld.SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)); + bld.ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)); } /* Apply BGRA swizzle if required. */ if (wa_flags & BRW_ATTRIB_WA_BGRA) { src_reg temp = src_reg(reg); temp.swizzle = BRW_SWIZZLE4(2,1,0,3); - emit(MOV(reg, temp)); + bld.MOV(reg, temp); } if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) { @@ -87,17 +87,17 @@ vec4_vs_visitor::emit_prolog() if (es3_normalize_factor.file == BAD_FILE) { /* mul constant: 1 / (2^(b-1) - 1) */ es3_normalize_factor = dst_reg(this, glsl_type::vec4_type); - emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ), - src_reg(1.0f / ((1<<9) - 1)))); - emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W), - src_reg(1.0f / ((1<<1) - 1)))); + bld.MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ), + src_reg(1.0f / ((1<<9) - 1))); + bld.MOV(writemask(es3_normalize_factor, WRITEMASK_W), + src_reg(1.0f / ((1<<1) - 1))); } dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg(reg_d))); - emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor))); - emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f)); + bld.MOV(dst, src_reg(reg_d)); + bld.MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)); + bld.emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f)); } else { /* The following equations are from the OpenGL 3.2 specification: * @@ -113,30 +113,30 @@ vec4_vs_visitor::emit_prolog() if (normalize_factor.file == BAD_FILE) { /* 1 / (2^b - 1) for b=<10,10,10,2> */ normalize_factor = dst_reg(this, glsl_type::vec4_type); - emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ), - src_reg(1.0f / ((1<<10) - 1)))); - emit(MOV(writemask(normalize_factor, WRITEMASK_W), - src_reg(1.0f / ((1<<2) - 1)))); + bld.MOV(writemask(normalize_factor, WRITEMASK_XYZ), + src_reg(1.0f / ((1<<10) - 1))); + bld.MOV(writemask(normalize_factor, WRITEMASK_W), + src_reg(1.0f / ((1<<2) - 1))); } dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); + bld.MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)); /* For signed normalization, we want the numerator to be 2c+1. 
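The signed-normalization formula quoted above, evaluated for one hypothetical 10-bit component; the final clamp at -1.0 is what the nearby emit_minmax() provides:

   #include <cassert>

   int main()
   {
      const int b = 10, c = 300; /* one signed 10-bit component */
      const float f = (2.0f * c + 1.0f) / ((1 << b) - 1);
      assert(f > 0.587f && f < 0.588f); /* 601 / 1023 ~= 0.5875 */
   }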
*/ if (wa_flags & BRW_ATTRIB_WA_SIGN) { - emit(MUL(dst, src_reg(dst), src_reg(2.0f))); - emit(ADD(dst, src_reg(dst), src_reg(1.0f))); + bld.MUL(dst, src_reg(dst), src_reg(2.0f)); + bld.ADD(dst, src_reg(dst), src_reg(1.0f)); } - emit(MUL(dst, src_reg(dst), src_reg(normalize_factor))); + bld.MUL(dst, src_reg(dst), src_reg(normalize_factor)); } } if (wa_flags & BRW_ATTRIB_WA_SCALE) { dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); - emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); + bld.MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)); } } } @@ -193,7 +193,7 @@ vec4_vs_visitor::emit_urb_write_opcode(bool complete) emit_shader_time_end(); } - vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + vec4_instruction *inst = bld.emit(VS_OPCODE_URB_WRITE); inst->urb_write_flags = complete ? BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS; @@ -221,7 +221,7 @@ vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw, &vs_compile->key.base, &vs_prog_data->base, prog, MESA_SHADER_VERTEX, mem_ctx, INTEL_DEBUG & DEBUG_VS, false /* no_spills */, - ST_VS, ST_VS_WRITTEN, ST_VS_RESET), + ST_VS), vs_compile(vs_compile), vs_prog_data(vs_prog_data) { diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index d16cc6ed8b7..f06da953bcf 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -70,20 +70,20 @@ gen6_gs_visitor::emit_prolog() * flags for the next vertex come right after the data items and flags for * the previous vertex. */ - this->current_annotation = "gen6 prolog"; + bld.set_annotation("gen6 prolog"); this->vertex_output = src_reg(this, glsl_type::uint_type, (prog_data->vue_map.num_slots + 1) * c->gp->program.VerticesOut); this->vertex_output_offset = src_reg(this, glsl_type::uint_type); - emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u))); + bld.MOV(dst_reg(this->vertex_output_offset), src_reg(0u)); /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), * so initialize it once to R0. */ - vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), + vec4_instruction *inst = bld.MOV(dst_reg(MRF, 1), retype(brw_vec8_grf(0, 0), - BRW_REGISTER_TYPE_UD))); + BRW_REGISTER_TYPE_UD)); inst->force_writemask_all = true; /* This will be used as a temporary to store writeback data of FF_SYNC @@ -98,13 +98,13 @@ gen6_gs_visitor::emit_prolog() * headers. */ this->first_vertex = src_reg(this, glsl_type::uint_type); - emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START)); + bld.MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START); /* The FF_SYNC message requires to know the number of primitives generated, * so keep a counter for this. 
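The vertex_output layout set up in emit_prolog() above implies a simple indexing rule, which also explains the ADD of -1 in the EndPrimitive handling further down (vertex_output_offset already points one past the previous vertex's flags); flags_offset is a hypothetical helper:

   /* Each vertex occupies num_slots data entries plus one flags dword,
    * so the buffer holds (num_slots + 1) * VerticesOut entries in all. */
   static unsigned
   flags_offset(unsigned num_slots, unsigned vertex)
   {
      return vertex * (num_slots + 1) + num_slots;
   }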
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
index d16cc6ed8b7..f06da953bcf 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp
@@ -70,20 +70,20 @@ gen6_gs_visitor::emit_prolog()
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
-   this->current_annotation = "gen6 prolog";
+   bld.set_annotation("gen6 prolog");
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 c->gp->program.VerticesOut);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
-   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
+   bld.MOV(dst_reg(this->vertex_output_offset), src_reg(0u));
 
   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
-   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
+   vec4_instruction *inst = bld.MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
-                                            BRW_REGISTER_TYPE_UD)));
+                                            BRW_REGISTER_TYPE_UD));
   inst->force_writemask_all = true;
 
   /* This will be used as a temporary to store writeback data of FF_SYNC
@@ -98,13 +98,13 @@ gen6_gs_visitor::emit_prolog()
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
-   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
+   bld.MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START);
 
   /* The FF_SYNC message requires to know the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
-   emit(MOV(dst_reg(this->prim_count), 0u));
+   bld.MOV(dst_reg(this->prim_count), 0u);
 
   if (c->prog_data.gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
@@ -115,8 +115,8 @@ gen6_gs_visitor::emit_prolog()
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
-      emit(MOV(dst_reg(this->max_svbi),
-               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
+      bld.MOV(dst_reg(this->max_svbi),
+              src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)));
 
      xfb_setup();
   }
@@ -142,21 +142,21 @@ gen6_gs_visitor::emit_prolog()
   if (c->prog_data.include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
-      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
+      bld.emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
 
 void
 gen6_gs_visitor::visit(ir_emit_vertex *)
 {
-   this->current_annotation = "gen6 emit vertex";
+   bld.set_annotation("gen6 emit vertex");
   /* Honor max_vertex layout indication in geometry shader by ignoring any
    * vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
-   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
-            BRW_CONDITIONAL_L));
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.CMP(bld.reg_null_d(), this->vertex_count, src_reg(num_output_vertices),
+           BRW_CONDITIONAL_L);
+   bld.IF(BRW_PREDICATE_NORMAL);
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
@@ -183,12 +183,12 @@ gen6_gs_visitor::visit(ir_emit_vertex *)
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
-         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
+         vec4_instruction *inst = bld.MOV(dst, src_reg(tmp));
         inst->force_writemask_all = true;
      }
 
-      emit(ADD(dst_reg(this->vertex_output_offset),
-               this->vertex_output_offset, 1u));
+      bld.ADD(dst_reg(this->vertex_output_offset),
+              this->vertex_output_offset, 1u);
      }
 
      /* Now buffer flags for this vertex */
@@ -199,32 +199,32 @@ gen6_gs_visitor::visit(ir_emit_vertex *)
        /* If we are outputting points, then every vertex has PrimStart and
         * PrimEnd set.
         */
-        emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
-                 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
-        emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
+        bld.MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
+                URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
+        bld.ADD(dst_reg(this->prim_count), this->prim_count, 1u);
      } else {
        /* Otherwise, we can only set the PrimStart flag, which we have stored
         * in the first_vertex register. We will have to wait until we execute
         * EndPrimitive() or we end the thread to set the PrimEnd flag on a
         * vertex.
         */
-        emit(OR(dst, this->first_vertex,
-                (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
-        emit(MOV(dst_reg(this->first_vertex), 0u));
+        bld.OR(dst, this->first_vertex,
+               (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT));
+        bld.MOV(dst_reg(this->first_vertex), 0u);
      }
-     emit(ADD(dst_reg(this->vertex_output_offset),
-              this->vertex_output_offset, 1u));
+     bld.ADD(dst_reg(this->vertex_output_offset),
+             this->vertex_output_offset, 1u);
 
      /* Update vertex count */
-     emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
+     bld.ADD(dst_reg(this->vertex_count), this->vertex_count, 1u);
   }
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 }
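The buffering code in visit(ir_emit_vertex *) above, and the EndPrimitive() handler that follows, repeat one idiom: a run-time offset register is copied into reladdr so the instruction addresses vertex_output like an array. A condensed sketch of that idiom; the helper name is hypothetical, not part of the patch:

   /* Hypothetical helper isolating the reladdr idiom used above: store
    * "value" at vertex_output[offset], where "offset" is a register whose
    * value is only known when the shader runs.
    */
   void
   gen6_gs_visitor::store_vertex_output(const src_reg &value,
                                        const src_reg &offset)
   {
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);   /* attach the index register */
      memcpy(dst.reladdr, &offset, sizeof(src_reg));
      vec4_instruction *inst = bld.MOV(dst, value);
      inst->force_writemask_all = true;         /* write all channels */
   }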
 
 void
 gen6_gs_visitor::visit(ir_end_primitive *)
 {
-   this->current_annotation = "gen6 end primitive";
+   bld.set_annotation("gen6 end primitive");
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
@@ -241,40 +241,40 @@ gen6_gs_visitor::visit(ir_end_primitive *)
    * below).
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
-   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
-            BRW_CONDITIONAL_L));
-   vec4_instruction *inst = emit(CMP(dst_null_d(),
+   bld.CMP(bld.reg_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
+           BRW_CONDITIONAL_L);
+   vec4_instruction *inst = bld.CMP(bld.reg_null_d(),
                                     this->vertex_count, 0u,
-                                     BRW_CONDITIONAL_NEQ));
+                                     BRW_CONDITIONAL_NEQ);
   inst->predicate = BRW_PREDICATE_NORMAL;
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.IF(BRW_PREDICATE_NORMAL);
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
-      emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
+      bld.ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1));
 
      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));
 
-      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
-      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
+      bld.OR(dst_reg(dst), dst, URB_WRITE_PRIM_END);
+      bld.ADD(dst_reg(this->prim_count), this->prim_count, 1u);
 
      /* Set the first vertex flag to indicate that the next vertex will start
       * a primitive.
       */
-      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
+      bld.MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START);
   }
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 }
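Every guarded region in this file is built from the same pair of instructions: a CMP against the null register (only the flag result is kept) followed by an IF predicated on it, closed later by ENDIF. Condensed into a sketch; emit_guard is a hypothetical name, not a function in the patch:

   /* Hypothetical condensation of the recurring CMP/IF pattern.  The CMP
    * writes the null register because only the conditional flags matter;
    * the IF then opens a block executed under BRW_PREDICATE_NORMAL.
    * Every call must be paired with a later bld.emit(BRW_OPCODE_ENDIF).
    */
   void
   gen6_gs_visitor::emit_guard(const src_reg &a, const src_reg &b,
                               enum brw_conditional_mod cond)
   {
      bld.CMP(bld.reg_null_d(), a, b, cond);
      bld.IF(BRW_PREDICATE_NORMAL);
   }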
 
 void
 gen6_gs_visitor::emit_urb_write_header(int mrf)
 {
-   this->current_annotation = "gen6 urb header";
+   bld.set_annotation("gen6 urb header");
   /* Compute offset of the flags for the current vertex in vertex_output and
    * write them in dw2 of the message header.
    *
@@ -284,14 +284,14 @@ gen6_gs_visitor::emit_urb_write_header(int mrf)
    * slots per vertex to that offset to obtain the flags data offset.
    */
   src_reg flags_offset(this, glsl_type::uint_type);
-   emit(ADD(dst_reg(flags_offset),
-            this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
+   bld.ADD(dst_reg(flags_offset),
+           this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots));
 
   src_reg flags_data(this->vertex_output);
   flags_data.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
 
-   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
+   bld.emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
 }
 
 void
@@ -302,7 +302,7 @@ gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
 
   if (!complete) {
      /* If the vertex is not complete we don't have to do anything special */
-      inst = emit(GS_OPCODE_URB_WRITE);
+      inst = bld.emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   } else {
      /* Otherwise we always request to allocate a new VUE handle. If this is
@@ -313,7 +313,7 @@ gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
       * which would require to end the program with an IF/ELSE/ENDIF block,
       * something we do not want.
       */
-      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
+      inst = bld.emit(GS_OPCODE_URB_WRITE_ALLOCATE);
      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
      inst->dst = dst_reg(MRF, base_mrf);
      inst->src[0] = this->temp;
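Across all of these hunks the mechanical change is the same: free-standing emit(OP(...)) calls become methods on a builder object, bld, and the helpers keep returning the constructed vec4_instruction so call sites can still patch fields such as predicate, base_mrf or force_writemask_all. A rough sketch of the interface these call sites imply; it is inferred from usage here, not copied from brw_ir_visitor.h:

   /* Inferred-from-usage sketch of the builder interface; the real
    * declarations live in brw_ir_visitor.h, which is not shown here.
    */
   class builder_sketch {
   public:
      /* Tag subsequently emitted instructions for debug dumps. */
      void set_annotation(const char *annotation);

      /* Null destination: the result is dropped, only flags survive. */
      dst_reg reg_null_d();

      /* ALU helpers build, append and return the instruction. */
      vec4_instruction *MOV(const dst_reg &dst, const src_reg &src);
      vec4_instruction *ADD(const dst_reg &dst, const src_reg &src0,
                            const src_reg &src1);
      vec4_instruction *CMP(const dst_reg &dst, const src_reg &src0,
                            const src_reg &src1,
                            enum brw_conditional_mod cond);
      vec4_instruction *IF(enum brw_predicate predicate);

      /* Generic emit for opcodes without a dedicated helper; overloads
       * taking destination/source operands appear at call sites above.
       */
      vec4_instruction *emit(enum opcode op);
   };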
@@ -339,12 +339,12 @@ gen6_gs_visitor::emit_thread_end()
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
-      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
-      emit(IF(BRW_PREDICATE_NORMAL));
+      bld.CMP(bld.reg_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z);
+      bld.IF(BRW_PREDICATE_NORMAL);
      {
         visit((ir_end_primitive *) NULL);
      }
-      emit(BRW_OPCODE_ENDIF);
+      bld.emit(BRW_OPCODE_ENDIF);
   }
 
   /* Here we have to:
@@ -367,38 +367,38 @@ gen6_gs_visitor::emit_thread_end()
   int max_usable_mrf = 13;
 
   /* Issue the FF_SYNC message and obtain the initial VUE handle.
    */
-   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.CMP(bld.reg_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G);
+   bld.IF(BRW_PREDICATE_NORMAL);
   {
-      this->current_annotation = "gen6 thread end: ff_sync";
+      bld.set_annotation("gen6 thread end: ff_sync");
 
      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
        src_reg sol_temp(this, glsl_type::uvec4_type);
-        emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+        bld.emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
                dst_reg(this->svbi),
                this->vertex_count,
                this->prim_count,
                sol_temp);
-        inst = emit(GS_OPCODE_FF_SYNC,
+        inst = bld.emit(GS_OPCODE_FF_SYNC,
                    dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
-        inst = emit(GS_OPCODE_FF_SYNC,
+        inst = bld.emit(GS_OPCODE_FF_SYNC,
                    dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
      }
      inst->base_mrf = base_mrf;
 
      /* Loop over all buffered vertices and emit URB write messages */
-      this->current_annotation = "gen6 thread end: urb writes init";
+      bld.set_annotation("gen6 thread end: urb writes init");
      src_reg vertex(this, glsl_type::uint_type);
-      emit(MOV(dst_reg(vertex), 0u));
-      emit(MOV(dst_reg(this->vertex_output_offset), 0u));
+      bld.MOV(dst_reg(vertex), 0u);
+      bld.MOV(dst_reg(this->vertex_output_offset), 0u);
 
-      this->current_annotation = "gen6 thread end: urb writes";
-      emit(BRW_OPCODE_DO);
+      bld.set_annotation("gen6 thread end: urb writes");
+      bld.emit(BRW_OPCODE_DO);
      {
-         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
-         inst = emit(BRW_OPCODE_BREAK);
+         bld.CMP(bld.reg_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE);
+         inst = bld.emit(BRW_OPCODE_BREAK);
        inst->predicate = BRW_PREDICATE_NORMAL;
 
        /* First we prepare the message header */
@@ -417,7 +417,7 @@ gen6_gs_visitor::emit_thread_end()
 
        for (; slot < prog_data->vue_map.num_slots; ++slot) {
           int varying = prog_data->vue_map.slot_to_varying[slot];
-           current_annotation = output_reg_annotation[varying];
+           bld.set_annotation(output_reg_annotation[varying]);
 
          /* Compute offset of this slot for the current vertex
          * in vertex_output
@@ -431,12 +431,12 @@ gen6_gs_visitor::emit_thread_end()
         dst_reg reg = dst_reg(MRF, mrf);
         reg.type = output_reg[varying].type;
         data.type = reg.type;
-         vec4_instruction *inst = emit(MOV(reg, data));
+         vec4_instruction *inst = bld.MOV(reg, data);
         inst->force_writemask_all = true;
         mrf++;
 
-         emit(ADD(dst_reg(this->vertex_output_offset),
-                  this->vertex_output_offset, 1u));
+         bld.ADD(dst_reg(this->vertex_output_offset),
+                 this->vertex_output_offset, 1u);
 
         /* If this was max_usable_mrf, we can't fit anything more into
          * this URB WRITE.
@@ -455,17 +455,17 @@ gen6_gs_visitor::emit_thread_end()
       * to the first data item of the next vertex, so that we can start
       * writing the next vertex.
       */
-      emit(ADD(dst_reg(this->vertex_output_offset),
-               this->vertex_output_offset, 1u));
+      bld.ADD(dst_reg(this->vertex_output_offset),
+              this->vertex_output_offset, 1u);
 
-      emit(ADD(dst_reg(vertex), vertex, 1u));
+      bld.ADD(dst_reg(vertex), vertex, 1u);
      }
-      emit(BRW_OPCODE_WHILE);
+      bld.emit(BRW_OPCODE_WHILE);
 
      if (c->prog_data.gen6_xfb_enabled)
        xfb_write();
   }
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
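The buffered-vertex loop above also shows how counted loops are spelled in this IR: DO/WHILE carries no condition of its own, so the body opens with a CMP that sets the flags and a BREAK predicated on them. The skeleton, condensed from the hunk above:

   /* Counted-loop skeleton as used in emit_thread_end() above. */
   src_reg vertex(this, glsl_type::uint_type);
   bld.MOV(dst_reg(vertex), 0u);

   bld.emit(BRW_OPCODE_DO);
   {
      bld.CMP(bld.reg_null_d(), vertex, this->vertex_count,
              BRW_CONDITIONAL_GE);
      vec4_instruction *brk = bld.emit(BRW_OPCODE_BREAK);
      brk->predicate = BRW_PREDICATE_NORMAL;  /* exit once vertex >= count */

      /* ... emit URB writes for the current buffered vertex ... */

      bld.ADD(dst_reg(vertex), vertex, 1u);
   }
   bld.emit(BRW_OPCODE_WHILE);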
 
   /* Finally, emit EOT message.
    *
@@ -482,17 +482,17 @@ gen6_gs_visitor::emit_thread_end()
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
-   this->current_annotation = "gen6 thread end: EOT";
+   bld.set_annotation("gen6 thread end: EOT");
 
   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
-      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
-      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
-      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
+      bld.AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu));
+      bld.SHL(dst_reg(data), data, brw_imm_ud(16u));
+      bld.emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }
 
-   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
+   vec4_instruction *inst = bld.emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
@@ -610,10 +610,10 @@ gen6_gs_visitor::xfb_write()
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }
 
-   this->current_annotation = "gen6 thread end: svb writes init";
+   bld.set_annotation("gen6 thread end: svb writes init");
 
-   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
-   emit(MOV(dst_reg(this->sol_prim_written), 0u));
+   bld.MOV(dst_reg(this->vertex_output_offset), 0u);
+   bld.MOV(dst_reg(this->sol_prim_written), 0u);
 
   /* Check that at least one primitive can be written
    *
@@ -624,37 +624,37 @@ gen6_gs_visitor::xfb_write()
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
-   emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
+   bld.ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts));
 
   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
-   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.CMP(bld.reg_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE);
+   bld.IF(BRW_PREDICATE_NORMAL);
   {
-      struct src_reg destination_indices_uw =
+      src_reg destination_indices_uw =
        retype(destination_indices, BRW_REGISTER_TYPE_UW);
-      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
-                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
+      vec4_instruction *inst = bld.MOV(dst_reg(destination_indices_uw),
+                                       brw_imm_v(0x00020100)); /* (0, 1, 2) */
      inst->force_writemask_all = true;
 
-      emit(ADD(dst_reg(this->destination_indices),
+      bld.ADD(dst_reg(this->destination_indices),
              this->destination_indices,
-               this->svbi));
+               this->svbi);
   }
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 
   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
-      emit(MOV(dst_reg(sol_temp), i));
-      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
-               BRW_CONDITIONAL_L));
-      emit(IF(BRW_PREDICATE_NORMAL));
+      bld.MOV(dst_reg(sol_temp), i);
+      bld.CMP(bld.reg_null_d(), sol_temp, this->vertex_count,
+              BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
      {
        xfb_program(i, num_verts);
      }
-      emit(BRW_OPCODE_ENDIF);
+      bld.emit(BRW_OPCODE_ENDIF);
   }
 }
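The overflow guard at the top of the next hunk reads most naturally as an inequality: a primitive is written out only when (sol_prim_written + 1) * num_verts + svbi <= max_svbi, i.e. only when the destination indices consumed by every vertex of the candidate primitive still fit under the SVBI limit saved in max_svbi. The three ALU instructions compute the left-hand side into sol_temp and the CMP/IF pair applies the test.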
@@ -670,16 +670,16 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
 
   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it */
-   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
-   emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
-   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
-   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.ADD(dst_reg(sol_temp), this->sol_prim_written, 1u);
+   bld.MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts));
+   bld.ADD(dst_reg(sol_temp), sol_temp, this->svbi);
+   bld.CMP(bld.reg_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE);
+   bld.IF(BRW_PREDICATE_NORMAL);
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);
 
-      this->current_annotation = "gen6: emit SOL vertex data";
+      bld.set_annotation("gen6: emit SOL vertex data");
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
@@ -688,7 +688,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
           prog_data->transform_feedback_bindings[binding];
 
        /* Set up the correct destination index for this vertex */
-         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
+         vec4_instruction *inst = bld.emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
        inst->sol_vertex = vertex % num_verts;
@@ -705,11 +705,11 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
        /* Compute offset of this varying for the current vertex
         * in vertex_output
         */
-         this->current_annotation = output_reg_annotation[varying];
+         bld.set_annotation(output_reg_annotation[varying]);
        src_reg data(this->vertex_output);
        data.reladdr = ralloc(mem_ctx, src_reg);
        int offset = get_vertex_output_offset_for_varying(vertex, varying);
-         emit(MOV(dst_reg(this->vertex_output_offset), offset));
+         bld.MOV(dst_reg(this->vertex_output_offset), offset);
        memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
        data.type = output_reg[varying].type;
 
@@ -726,7 +726,7 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
          data.swizzle = prog_data->transform_feedback_swizzles[binding];
 
        /* Write data */
-         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
+         inst = bld.emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
        inst->sol_binding = binding;
        inst->sol_final_write = final_write;
 
@@ -734,17 +734,17 @@ gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
          /* This is the last vertex of the primitive, then increment
           * SO num primitive counter and destination indices.
           */
-            emit(ADD(dst_reg(this->destination_indices),
+            bld.ADD(dst_reg(this->destination_indices),
                  this->destination_indices,
-                     brw_imm_ud(num_verts)));
-            emit(ADD(dst_reg(this->sol_prim_written),
-                     this->sol_prim_written, 1u));
+                     brw_imm_ud(num_verts));
+            bld.ADD(dst_reg(this->sol_prim_written),
+                    this->sol_prim_written, 1u);
        }
      }
-      this->current_annotation = NULL;
+      bld.set_annotation(NULL);
   }
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 }
 
 int