summaryrefslogtreecommitdiff
path: root/src/gallium/drivers/nouveau
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/nouveau')
-rw-r--r--src/gallium/drivers/nouveau/Android.mk8
-rw-r--r--src/gallium/drivers/nouveau/Makefile.am14
-rw-r--r--src/gallium/drivers/nouveau/Makefile.sources91
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir.cpp1231
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir.h1197
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp550
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp614
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h324
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h220
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp1682
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp1962
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp2988
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp2852
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp436
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h228
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h420
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp1101
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp1597
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp2464
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp698
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp2050
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp552
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp469
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target.h235
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp552
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h72
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp604
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h74
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp390
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_util.h788
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm96
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h112
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm698
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h592
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h13
-rw-r--r--src/gallium/drivers/nouveau/nouveau_mm.c1
-rw-r--r--src/gallium/drivers/nouveau/nouveau_screen.c4
-rw-r--r--src/gallium/drivers/nouveau/nouveau_video.c2
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv01_2d.xml.h1416
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h2045
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_clear.c226
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_context.c263
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_context.h237
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_draw.c506
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_format.c259
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_format.h57
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_fragprog.c170
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_fragtex.c202
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_miptree.c490
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_push.c290
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_query.c274
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_resource.c77
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_resource.h75
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_screen.c588
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_screen.h51
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_state.c458
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_state.h144
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_state_validate.c538
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_texture.c306
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_transfer.c754
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_transfer.h40
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_vbo.c627
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_vertprog.c258
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_vertprog.h176
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv30_winsys.h158
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv40_vertprog.h178
-rw-r--r--src/gallium/drivers/nouveau/nv30/nv40_verttex.c100
-rw-r--r--src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c1251
-rw-r--r--src/gallium/drivers/nouveau/nv30/nvfx_shader.h525
-rw-r--r--src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c1133
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h416
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_3d.xml.h2110
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_3ddefs.xml.h98
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_blit.h223
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.c317
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_context.h322
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_debug.h25
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_defs.xml.h200
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_draw.c88
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_formats.c504
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_miptree.c498
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_program.c445
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_program.h106
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_push.c309
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_query.c399
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_resource.c104
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_resource.h153
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.c845
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.h153
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_shader_state.c623
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state.c1110
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_state_validate.c414
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_stateobj.h78
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h34
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_surface.c1353
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_tex.c352
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_texture.xml.h306
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_transfer.c412
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_transfer.h27
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_vbo.c820
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_winsys.h125
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv84_video.c797
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv84_video.h138
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv84_video_bsp.c250
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv84_video_vp.c552
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv98_video.c297
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv98_video.h48
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c159
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c143
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv98_video_vp.c202
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_2d.xml.h380
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h1350
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_3ddefs.xml.h98
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_compute.c271
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_compute.h10
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_compute.xml.h410
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.c402
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.h357
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_draw.c88
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_formats.c25
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_graph_macros.h236
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_m2mf.xml.h138
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c358
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_program.c811
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_program.h68
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_push.c409
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_query.c1448
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_resource.c62
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_resource.h58
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.c967
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.h325
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c278
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state.c1247
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c577
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h77
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_surface.c1265
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_tex.c814
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c558
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c891
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c649
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_video.c331
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_video.h48
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c155
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c143
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c202
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h144
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.c652
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.h131
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h429
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h107
150 files changed, 74368 insertions, 9 deletions
diff --git a/src/gallium/drivers/nouveau/Android.mk b/src/gallium/drivers/nouveau/Android.mk
index 782b7cec188..5275aa60157 100644
--- a/src/gallium/drivers/nouveau/Android.mk
+++ b/src/gallium/drivers/nouveau/Android.mk
@@ -28,7 +28,13 @@ include $(LOCAL_PATH)/Makefile.sources
include $(CLEAR_VARS)
-LOCAL_SRC_FILES := $(C_SOURCES)
+LOCAL_SRC_FILES := $(C_SOURCES) \
+ $(NV30_C_SOURCES) \
+ $(NV50_CODEGEN_SOURCES) \
+	$(NV50_C_SOURCES) \
+ $(NVC0_CODEGEN_SOURCES) \
+ $(NVC0_C_SOURCES)
+
LOCAL_C_INCLUDES := $(DRM_TOP) \
$(DRM_TOP)/include/drm \
$(DRM_TOP)/nouveau
diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am
index 12e3da0334a..c4b51d9dff2 100644
--- a/src/gallium/drivers/nouveau/Makefile.am
+++ b/src/gallium/drivers/nouveau/Makefile.am
@@ -27,9 +27,15 @@ noinst_LTLIBRARIES = libnouveau.la
AM_CPPFLAGS = \
-I$(top_srcdir)/src/gallium/drivers/nouveau/include \
- -I$(top_srcdir)/src/gallium/drivers \
- -I$(top_srcdir)/include \
$(GALLIUM_CFLAGS) \
- $(LIBDRM_CFLAGS)
+ $(LIBDRM_CFLAGS) \
+ $(NOUVEAU_CFLAGS) \
+ $(VISIBILITY_CFLAGS)
-libnouveau_la_SOURCES = $(C_SOURCES)
+libnouveau_la_SOURCES = \
+ $(C_SOURCES) \
+ $(NV30_C_SOURCES) \
+ $(NV50_CODEGEN_SOURCES) \
+ $(NV50_C_SOURCES) \
+ $(NVC0_CODEGEN_SOURCES) \
+ $(NVC0_C_SOURCES)
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 7912f6703e3..cc84ec3edb6 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -8,3 +8,94 @@ C_SOURCES := \
nouveau_vp3_video.c \
nouveau_vp3_video_bsp.c \
nouveau_vp3_video_vp.c
+
+NV30_C_SOURCES := \
+ nv30/nv30_screen.c \
+ nv30/nv30_context.c \
+ nv30/nv30_format.c \
+ nv30/nv30_resource.c \
+ nv30/nv30_transfer.c \
+ nv30/nv30_miptree.c \
+ nv30/nv30_state.c \
+ nv30/nv30_state_validate.c \
+ nv30/nv30_texture.c \
+ nv30/nv30_fragtex.c \
+ nv30/nv40_verttex.c \
+ nv30/nv30_fragprog.c \
+ nv30/nv30_vertprog.c \
+ nv30/nv30_clear.c \
+ nv30/nv30_vbo.c \
+ nv30/nv30_push.c \
+ nv30/nv30_draw.c \
+ nv30/nv30_query.c \
+ nv30/nvfx_vertprog.c \
+ nv30/nvfx_fragprog.c
+
+NV50_C_SOURCES := \
+ nv50/nv50_context.c \
+ nv50/nv50_formats.c \
+ nv50/nv50_miptree.c \
+ nv50/nv50_resource.c \
+ nv50/nv50_screen.c \
+ nv50/nv50_state.c \
+ nv50/nv50_state_validate.c \
+ nv50/nv50_surface.c \
+ nv50/nv50_tex.c \
+ nv50/nv50_transfer.c \
+ nv50/nv50_vbo.c \
+ nv50/nv50_program.c \
+ nv50/nv50_shader_state.c \
+ nv50/nv50_push.c \
+ nv50/nv50_query.c \
+ nv50/nv84_video.c \
+ nv50/nv84_video_bsp.c \
+ nv50/nv84_video_vp.c \
+ nv50/nv98_video.c \
+ nv50/nv98_video_bsp.c \
+ nv50/nv98_video_vp.c \
+ nv50/nv98_video_ppp.c
+
+NV50_CODEGEN_SOURCES := \
+ codegen/nv50_ir.cpp \
+ codegen/nv50_ir_bb.cpp \
+ codegen/nv50_ir_build_util.cpp \
+ codegen/nv50_ir_emit_nv50.cpp \
+ codegen/nv50_ir_from_tgsi.cpp \
+ codegen/nv50_ir_graph.cpp \
+ codegen/nv50_ir_lowering_nv50.cpp \
+ codegen/nv50_ir_peephole.cpp \
+ codegen/nv50_ir_print.cpp \
+ codegen/nv50_ir_ra.cpp \
+ codegen/nv50_ir_ssa.cpp \
+ codegen/nv50_ir_target.cpp \
+ codegen/nv50_ir_target_nv50.cpp \
+ codegen/nv50_ir_util.cpp
+
+NVC0_CODEGEN_SOURCES := \
+ codegen/nv50_ir_emit_gk110.cpp \
+ codegen/nv50_ir_emit_nvc0.cpp \
+ codegen/nv50_ir_lowering_nvc0.cpp \
+ codegen/nv50_ir_target_nvc0.cpp
+
+NVC0_C_SOURCES := \
+ nvc0/nvc0_compute.c \
+ nvc0/nvc0_context.c \
+ nvc0/nvc0_formats.c \
+ nvc0/nvc0_miptree.c \
+ nvc0/nvc0_resource.c \
+ nvc0/nvc0_screen.c \
+ nvc0/nvc0_state.c \
+ nvc0/nvc0_state_validate.c \
+ nvc0/nvc0_surface.c \
+ nvc0/nvc0_tex.c \
+ nvc0/nvc0_transfer.c \
+ nvc0/nvc0_vbo.c \
+ nvc0/nvc0_vbo_translate.c \
+ nvc0/nvc0_program.c \
+ nvc0/nvc0_shader_state.c \
+ nvc0/nvc0_query.c \
+ nvc0/nve4_compute.c \
+ nvc0/nvc0_video.c \
+ nvc0/nvc0_video_bsp.c \
+ nvc0/nvc0_video_vp.c \
+ nvc0/nvc0_video_ppp.c
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
new file mode 100644
index 00000000000..90fb51c26df
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -0,0 +1,1231 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+#include "codegen/nv50_ir_driver.h"
+
+extern "C" {
+#include "nv50/nv50_program.h"
+#include "nv50/nv50_debug.h"
+}
+
+namespace nv50_ir {
+
+Modifier::Modifier(operation op)
+{
+ switch (op) {
+ case OP_NEG: bits = NV50_IR_MOD_NEG; break;
+ case OP_ABS: bits = NV50_IR_MOD_ABS; break;
+ case OP_SAT: bits = NV50_IR_MOD_SAT; break;
+ case OP_NOT: bits = NV50_IR_MOD_NOT; break;
+ default:
+ bits = 0;
+ break;
+ }
+}
+
+Modifier Modifier::operator*(const Modifier m) const
+{
+ unsigned int a, b, c;
+
+ b = m.bits;
+ if (this->bits & NV50_IR_MOD_ABS)
+ b &= ~NV50_IR_MOD_NEG;
+
+ a = (this->bits ^ b) & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG);
+ c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT);
+
+ return Modifier(a | c);
+}
+
+ValueRef::ValueRef(Value *v) : value(NULL), insn(NULL)
+{
+ indirect[0] = -1;
+ indirect[1] = -1;
+ usedAsPtr = false;
+ set(v);
+}
+
+ValueRef::ValueRef(const ValueRef& ref) : value(NULL), insn(ref.insn)
+{
+ set(ref);
+ usedAsPtr = ref.usedAsPtr;
+}
+
+ValueRef::~ValueRef()
+{
+ this->set(NULL);
+}
+
+bool ValueRef::getImmediate(ImmediateValue &imm) const
+{
+ const ValueRef *src = this;
+ Modifier m;
+ DataType type = src->insn->sType;
+
+ while (src) {
+ if (src->mod) {
+ if (src->insn->sType != type)
+ break;
+ m *= src->mod;
+ }
+ if (src->getFile() == FILE_IMMEDIATE) {
+ imm = *(src->value->asImm());
+ // The immediate's type isn't required to match its use, it's
+ // more of a hint; applying a modifier makes use of that hint.
+ imm.reg.type = type;
+ m.applyTo(imm);
+ return true;
+ }
+
+ Instruction *insn = src->value->getUniqueInsn();
+
+ if (insn && insn->op == OP_MOV) {
+ src = &insn->src(0);
+ if (src->mod)
+ WARN("OP_MOV with modifier encountered !\n");
+ } else {
+ src = NULL;
+ }
+ }
+ return false;
+}
+
+ValueDef::ValueDef(Value *v) : value(NULL), insn(NULL)
+{
+ set(v);
+}
+
+ValueDef::ValueDef(const ValueDef& def) : value(NULL), insn(NULL)
+{
+ set(def.get());
+}
+
+ValueDef::~ValueDef()
+{
+ this->set(NULL);
+}
+
+void
+ValueRef::set(const ValueRef &ref)
+{
+ this->set(ref.get());
+ mod = ref.mod;
+ indirect[0] = ref.indirect[0];
+ indirect[1] = ref.indirect[1];
+}
+
+void
+ValueRef::set(Value *refVal)
+{
+ if (value == refVal)
+ return;
+ if (value)
+ value->uses.remove(this);
+ if (refVal)
+ refVal->uses.push_back(this);
+
+ value = refVal;
+}
+
+void
+ValueDef::set(Value *defVal)
+{
+ if (value == defVal)
+ return;
+ if (value)
+ value->defs.remove(this);
+ if (defVal)
+ defVal->defs.push_back(this);
+
+ value = defVal;
+}
+
+// Check if we can replace this definition's value by the value in @rep,
+// including the source modifiers, i.e. make sure that all uses support
+// @rep.mod.
+bool
+ValueDef::mayReplace(const ValueRef &rep)
+{
+ if (!rep.mod)
+ return true;
+
+ if (!insn || !insn->bb) // Unbound instruction ?
+ return false;
+
+ const Target *target = insn->bb->getProgram()->getTarget();
+
+ for (Value::UseIterator it = value->uses.begin(); it != value->uses.end();
+ ++it) {
+ Instruction *insn = (*it)->getInsn();
+ int s = -1;
+
+ for (int i = 0; insn->srcExists(i); ++i) {
+ if (insn->src(i).get() == value) {
+ // If there are multiple references to us we'd have to check if the
+ // combination of mods is still supported, but just bail for now.
+ if (&insn->src(i) != (*it))
+ return false;
+ s = i;
+ }
+ }
+ assert(s >= 0); // integrity of uses list
+
+ if (!target->isModSupported(insn, s, rep.mod))
+ return false;
+ }
+ return true;
+}
+
+void
+ValueDef::replace(const ValueRef &repVal, bool doSet)
+{
+ assert(mayReplace(repVal));
+
+ if (value == repVal.get())
+ return;
+
+ while (!value->uses.empty()) {
+ ValueRef *ref = value->uses.front();
+ ref->set(repVal.get());
+ ref->mod *= repVal.mod;
+ }
+
+ if (doSet)
+ set(repVal.get());
+}
+
+Value::Value()
+{
+ join = this;
+ memset(&reg, 0, sizeof(reg));
+ reg.size = 4;
+}
+
+LValue::LValue(Function *fn, DataFile file)
+{
+ reg.file = file;
+ reg.size = (file != FILE_PREDICATE) ? 4 : 1;
+ reg.data.id = -1;
+
+ compMask = 0;
+ compound = 0;
+ ssa = 0;
+ fixedReg = 0;
+ noSpill = 0;
+
+ fn->add(this, this->id);
+}
+
+LValue::LValue(Function *fn, LValue *lval)
+{
+ assert(lval);
+
+ reg.file = lval->reg.file;
+ reg.size = lval->reg.size;
+ reg.data.id = -1;
+
+ compMask = 0;
+ compound = 0;
+ ssa = 0;
+ fixedReg = 0;
+ noSpill = 0;
+
+ fn->add(this, this->id);
+}
+
+LValue *
+LValue::clone(ClonePolicy<Function>& pol) const
+{
+ LValue *that = new_LValue(pol.context(), reg.file);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ return that;
+}
+
+bool
+LValue::isUniform() const
+{
+ if (defs.size() > 1)
+ return false;
+ Instruction *insn = getInsn();
+ // let's not try too hard here for now ...
+ return !insn->srcExists(1) && insn->getSrc(0)->isUniform();
+}
+
+Symbol::Symbol(Program *prog, DataFile f, ubyte fidx)
+{
+ baseSym = NULL;
+
+ reg.file = f;
+ reg.fileIndex = fidx;
+ reg.data.offset = 0;
+
+ prog->add(this, this->id);
+}
+
+Symbol *
+Symbol::clone(ClonePolicy<Function>& pol) const
+{
+ Program *prog = pol.context()->getProgram();
+
+ Symbol *that = new_Symbol(prog, reg.file, reg.fileIndex);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ that->baseSym = this->baseSym;
+
+ return that;
+}
+
+bool
+Symbol::isUniform() const
+{
+ return
+ reg.file != FILE_SYSTEM_VALUE &&
+ reg.file != FILE_MEMORY_LOCAL &&
+ reg.file != FILE_SHADER_INPUT;
+}
+
+ImmediateValue::ImmediateValue(Program *prog, uint32_t uval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 4;
+ reg.type = TYPE_U32;
+
+ reg.data.u32 = uval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(Program *prog, float fval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 4;
+ reg.type = TYPE_F32;
+
+ reg.data.f32 = fval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(Program *prog, double dval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 8;
+ reg.type = TYPE_F64;
+
+ reg.data.f64 = dval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(const ImmediateValue *proto, DataType ty)
+{
+ reg = proto->reg;
+
+ reg.type = ty;
+ reg.size = typeSizeof(ty);
+}
+
+ImmediateValue *
+ImmediateValue::clone(ClonePolicy<Function>& pol) const
+{
+ Program *prog = pol.context()->getProgram();
+ ImmediateValue *that = new_ImmediateValue(prog, 0u);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ return that;
+}
+
+bool
+ImmediateValue::isInteger(const int i) const
+{
+ switch (reg.type) {
+ case TYPE_S8:
+ return reg.data.s8 == i;
+ case TYPE_U8:
+ return reg.data.u8 == i;
+ case TYPE_S16:
+ return reg.data.s16 == i;
+ case TYPE_U16:
+ return reg.data.u16 == i;
+ case TYPE_S32:
+ case TYPE_U32:
+ return reg.data.s32 == i; // as if ...
+ case TYPE_F32:
+ return reg.data.f32 == static_cast<float>(i);
+ case TYPE_F64:
+ return reg.data.f64 == static_cast<double>(i);
+ default:
+ return false;
+ }
+}
+
+bool
+ImmediateValue::isNegative() const
+{
+ switch (reg.type) {
+ case TYPE_S8: return reg.data.s8 < 0;
+ case TYPE_S16: return reg.data.s16 < 0;
+ case TYPE_S32:
+ case TYPE_U32: return reg.data.s32 < 0;
+ case TYPE_F32: return reg.data.u32 & (1 << 31);
+ case TYPE_F64: return reg.data.u64 & (1ULL << 63);
+ default:
+ return false;
+ }
+}
+
+bool
+ImmediateValue::isPow2() const
+{
+ switch (reg.type) {
+ case TYPE_U8:
+ case TYPE_U16:
+ case TYPE_U32: return util_is_power_of_two(reg.data.u32);
+ default:
+ return false;
+ }
+}
+
+void
+ImmediateValue::applyLog2()
+{
+ switch (reg.type) {
+ case TYPE_S8:
+ case TYPE_S16:
+ case TYPE_S32:
+ assert(!this->isNegative());
+ // fall through
+ case TYPE_U8:
+ case TYPE_U16:
+ case TYPE_U32:
+ reg.data.u32 = util_logbase2(reg.data.u32);
+ break;
+ case TYPE_F32:
+ reg.data.f32 = log2f(reg.data.f32);
+ break;
+ case TYPE_F64:
+ reg.data.f64 = log2(reg.data.f64);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+bool
+ImmediateValue::compare(CondCode cc, float fval) const
+{
+ if (reg.type != TYPE_F32)
+ ERROR("immediate value is not of type f32");
+
+ switch (static_cast<CondCode>(cc & 7)) {
+ case CC_TR: return true;
+ case CC_FL: return false;
+ case CC_LT: return reg.data.f32 < fval;
+ case CC_LE: return reg.data.f32 <= fval;
+ case CC_GT: return reg.data.f32 > fval;
+ case CC_GE: return reg.data.f32 >= fval;
+ case CC_EQ: return reg.data.f32 == fval;
+ case CC_NE: return reg.data.f32 != fval;
+ default:
+ assert(0);
+ return false;
+ }
+}
+
+ImmediateValue&
+ImmediateValue::operator=(const ImmediateValue &that)
+{
+ this->reg = that.reg;
+ return (*this);
+}
+
+bool
+Value::interfers(const Value *that) const
+{
+ uint32_t idA, idB;
+
+ if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex)
+ return false;
+ if (this->asImm())
+ return false;
+
+ if (this->asSym()) {
+ idA = this->join->reg.data.offset;
+ idB = that->join->reg.data.offset;
+ } else {
+ idA = this->join->reg.data.id * MIN2(this->reg.size, 4);
+ idB = that->join->reg.data.id * MIN2(that->reg.size, 4);
+ }
+
+ if (idA < idB)
+ return (idA + this->reg.size > idB);
+ else
+ if (idA > idB)
+ return (idB + that->reg.size > idA);
+ else
+ return (idA == idB);
+}
+
+bool
+Value::equals(const Value *that, bool strict) const
+{
+ if (strict)
+ return this == that;
+
+ if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex)
+ return false;
+ if (that->reg.size != this->reg.size)
+ return false;
+
+ if (that->reg.data.id != this->reg.data.id)
+ return false;
+
+ return true;
+}
+
+bool
+ImmediateValue::equals(const Value *that, bool strict) const
+{
+ const ImmediateValue *imm = that->asImm();
+ if (!imm)
+ return false;
+ return reg.data.u64 == imm->reg.data.u64;
+}
+
+bool
+Symbol::equals(const Value *that, bool strict) const
+{
+ if (reg.file != that->reg.file || reg.fileIndex != that->reg.fileIndex)
+ return false;
+ assert(that->asSym());
+
+ if (this->baseSym != that->asSym()->baseSym)
+ return false;
+
+ if (reg.file == FILE_SYSTEM_VALUE)
+ return (this->reg.data.sv.sv == that->reg.data.sv.sv &&
+ this->reg.data.sv.index == that->reg.data.sv.index);
+ return this->reg.data.offset == that->reg.data.offset;
+}
+
+void Instruction::init()
+{
+ next = prev = 0;
+
+ cc = CC_ALWAYS;
+ rnd = ROUND_N;
+ cache = CACHE_CA;
+ subOp = 0;
+
+ saturate = 0;
+ join = 0;
+ exit = 0;
+ terminator = 0;
+ ftz = 0;
+ dnz = 0;
+ perPatch = 0;
+ fixed = 0;
+ encSize = 0;
+ ipa = 0;
+ mask = 0;
+
+ lanes = 0xf;
+
+ postFactor = 0;
+
+ predSrc = -1;
+ flagsDef = -1;
+ flagsSrc = -1;
+}
+
+Instruction::Instruction()
+{
+ init();
+
+ op = OP_NOP;
+ dType = sType = TYPE_F32;
+
+ id = -1;
+ bb = 0;
+}
+
+Instruction::Instruction(Function *fn, operation opr, DataType ty)
+{
+ init();
+
+ op = opr;
+ dType = sType = ty;
+
+ fn->add(this, id);
+}
+
+Instruction::~Instruction()
+{
+ if (bb) {
+ Function *fn = bb->getFunction();
+ bb->remove(this);
+ fn->allInsns.remove(id);
+ }
+
+ for (int s = 0; srcExists(s); ++s)
+ setSrc(s, NULL);
+ // must unlink defs too since the list pointers will get deallocated
+ for (int d = 0; defExists(d); ++d)
+ setDef(d, NULL);
+}
+
+void
+Instruction::setDef(int i, Value *val)
+{
+ int size = defs.size();
+ if (i >= size) {
+ defs.resize(i + 1);
+ while (size <= i)
+ defs[size++].setInsn(this);
+ }
+ defs[i].set(val);
+}
+
+void
+Instruction::setSrc(int s, Value *val)
+{
+ int size = srcs.size();
+ if (s >= size) {
+ srcs.resize(s + 1);
+ while (size <= s)
+ srcs[size++].setInsn(this);
+ }
+ srcs[s].set(val);
+}
+
+void
+Instruction::setSrc(int s, const ValueRef& ref)
+{
+ setSrc(s, ref.get());
+ srcs[s].mod = ref.mod;
+}
+
+void
+Instruction::swapSources(int a, int b)
+{
+ Value *value = srcs[a].get();
+ Modifier m = srcs[a].mod;
+
+ setSrc(a, srcs[b]);
+
+ srcs[b].set(value);
+ srcs[b].mod = m;
+}
+
+static inline void moveSourcesAdjustIndex(int8_t &index, int s, int delta)
+{
+ if (index >= s)
+ index += delta;
+ else
+ if ((delta < 0) && (index >= (s + delta)))
+ index = -1;
+}
+
+// Moves sources [@s,last_source] by @delta.
+// If @delta < 0, sources [@s - abs(@delta), @s) are erased.
+void
+Instruction::moveSources(const int s, const int delta)
+{
+ if (delta == 0)
+ return;
+ assert(s + delta >= 0);
+
+ int k;
+
+ for (k = 0; srcExists(k); ++k) {
+ for (int i = 0; i < 2; ++i)
+ moveSourcesAdjustIndex(src(k).indirect[i], s, delta);
+ }
+ moveSourcesAdjustIndex(predSrc, s, delta);
+ moveSourcesAdjustIndex(flagsSrc, s, delta);
+ if (asTex()) {
+ TexInstruction *tex = asTex();
+ moveSourcesAdjustIndex(tex->tex.rIndirectSrc, s, delta);
+ moveSourcesAdjustIndex(tex->tex.sIndirectSrc, s, delta);
+ }
+
+ if (delta > 0) {
+ --k;
+ for (int p = k + delta; k >= s; --k, --p)
+ setSrc(p, src(k));
+ } else {
+ int p;
+ for (p = s; p < k; ++p)
+ setSrc(p + delta, src(p));
+ for (; (p + delta) < k; ++p)
+ setSrc(p + delta, NULL);
+ }
+}
+
+void
+Instruction::takeExtraSources(int s, Value *values[3])
+{
+ values[0] = getIndirect(s, 0);
+ if (values[0])
+ setIndirect(s, 0, NULL);
+
+ values[1] = getIndirect(s, 1);
+ if (values[1])
+ setIndirect(s, 1, NULL);
+
+ values[2] = getPredicate();
+ if (values[2])
+ setPredicate(cc, NULL);
+}
+
+void
+Instruction::putExtraSources(int s, Value *values[3])
+{
+ if (values[0])
+ setIndirect(s, 0, values[0]);
+ if (values[1])
+ setIndirect(s, 1, values[1]);
+ if (values[2])
+ setPredicate(cc, values[2]);
+}
+
+Instruction *
+Instruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
+{
+ if (!i)
+ i = new_Instruction(pol.context(), op, dType);
+#ifndef NDEBUG // non-conformant assert, so this is required
+ assert(typeid(*i) == typeid(*this));
+#endif
+
+ pol.set<Instruction>(this, i);
+
+ i->sType = sType;
+
+ i->rnd = rnd;
+ i->cache = cache;
+ i->subOp = subOp;
+
+ i->saturate = saturate;
+ i->join = join;
+ i->exit = exit;
+ i->mask = mask;
+ i->ftz = ftz;
+ i->dnz = dnz;
+ i->ipa = ipa;
+ i->lanes = lanes;
+ i->perPatch = perPatch;
+
+ i->postFactor = postFactor;
+
+ for (int d = 0; defExists(d); ++d)
+ i->setDef(d, pol.get(getDef(d)));
+
+ for (int s = 0; srcExists(s); ++s) {
+ i->setSrc(s, pol.get(getSrc(s)));
+ i->src(s).mod = src(s).mod;
+ }
+
+ i->cc = cc;
+ i->predSrc = predSrc;
+ i->flagsDef = flagsDef;
+ i->flagsSrc = flagsSrc;
+
+ return i;
+}
+
+unsigned int
+Instruction::defCount(unsigned int mask, bool singleFile) const
+{
+ unsigned int i, n;
+
+ if (singleFile) {
+ unsigned int d = ffs(mask);
+ if (!d)
+ return 0;
+ for (i = d--; defExists(i); ++i)
+ if (getDef(i)->reg.file != getDef(d)->reg.file)
+ mask &= ~(1 << i);
+ }
+
+ for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1)
+ n += mask & 1;
+ return n;
+}
+
+unsigned int
+Instruction::srcCount(unsigned int mask, bool singleFile) const
+{
+ unsigned int i, n;
+
+ if (singleFile) {
+ unsigned int s = ffs(mask);
+ if (!s)
+ return 0;
+ for (i = s--; srcExists(i); ++i)
+ if (getSrc(i)->reg.file != getSrc(s)->reg.file)
+ mask &= ~(1 << i);
+ }
+
+ for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1)
+ n += mask & 1;
+ return n;
+}
+
+bool
+Instruction::setIndirect(int s, int dim, Value *value)
+{
+ assert(this->srcExists(s));
+
+ int p = srcs[s].indirect[dim];
+ if (p < 0) {
+ if (!value)
+ return true;
+ p = srcs.size();
+ while (p > 0 && !srcExists(p - 1))
+ --p;
+ }
+ setSrc(p, value);
+ srcs[p].usedAsPtr = (value != 0);
+ srcs[s].indirect[dim] = value ? p : -1;
+ return true;
+}
+
+bool
+Instruction::setPredicate(CondCode ccode, Value *value)
+{
+ cc = ccode;
+
+ if (!value) {
+ if (predSrc >= 0) {
+ srcs[predSrc].set(NULL);
+ predSrc = -1;
+ }
+ return true;
+ }
+
+ if (predSrc < 0) {
+ predSrc = srcs.size();
+ while (predSrc > 0 && !srcExists(predSrc - 1))
+ --predSrc;
+ }
+
+ setSrc(predSrc, value);
+ return true;
+}
+
+bool
+Instruction::writesPredicate() const
+{
+ for (int d = 0; defExists(d); ++d)
+ if (getDef(d)->inFile(FILE_PREDICATE) || getDef(d)->inFile(FILE_FLAGS))
+ return true;
+ return false;
+}
+
+static bool
+insnCheckCommutationDefSrc(const Instruction *a, const Instruction *b)
+{
+ for (int d = 0; a->defExists(d); ++d)
+ for (int s = 0; b->srcExists(s); ++s)
+ if (a->getDef(d)->interfers(b->getSrc(s)))
+ return false;
+ return true;
+}
+
+static bool
+insnCheckCommutationDefDef(const Instruction *a, const Instruction *b)
+{
+ for (int d = 0; a->defExists(d); ++d)
+ for (int c = 0; b->defExists(c); ++c)
+ if (a->getDef(d)->interfers(b->getDef(c)))
+ return false;
+ return true;
+}
+
+bool
+Instruction::isCommutationLegal(const Instruction *i) const
+{
+ bool ret = insnCheckCommutationDefDef(this, i);
+ ret = ret && insnCheckCommutationDefSrc(this, i);
+ ret = ret && insnCheckCommutationDefSrc(i, this);
+ return ret;
+}
+
+TexInstruction::TexInstruction(Function *fn, operation op)
+ : Instruction(fn, op, TYPE_F32)
+{
+ memset(&tex, 0, sizeof(tex));
+
+ tex.rIndirectSrc = -1;
+ tex.sIndirectSrc = -1;
+}
+
+TexInstruction::~TexInstruction()
+{
+ for (int c = 0; c < 3; ++c) {
+ dPdx[c].set(NULL);
+ dPdy[c].set(NULL);
+ }
+}
+
// Duplicate this texture instruction; 'i' may supply preallocated storage of
// the correct dynamic type, otherwise a new TexInstruction is pool-allocated.
TexInstruction *
TexInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   TexInstruction *tex = (i ? static_cast<TexInstruction *>(i) :
                          new_TexInstruction(pol.context(), op));

   Instruction::clone(pol, tex);

   tex->tex = this->tex;

   if (op == OP_TXD) {
      // explicit derivatives are stored outside the generic source list and
      // must be copied separately, one pair per coordinate dimension
      for (unsigned int c = 0; c < tex->tex.target.getDim(); ++c) {
         tex->dPdx[c].set(dPdx[c]);
         tex->dPdy[c].set(dPdy[c]);
      }
   }

   return tex;
}
+
// Per-target descriptor table, indexed by enum TexTarget (order must match).
// Columns: { name, dim (coordinate dimensions), argc (total coordinate
// arguments incl. layer/sample index), array, cube, shadow }
const struct TexInstruction::Target::Desc TexInstruction::Target::descTable[] =
{
   { "1D",                1, 1, false, false, false },
   { "2D",                2, 2, false, false, false },
   { "2D_MS",             2, 3, false, false, false },
   { "3D",                3, 3, false, false, false },
   { "CUBE",              2, 3, false, true,  false },
   { "1D_SHADOW",         1, 1, false, false, true  },
   { "2D_SHADOW",         2, 2, false, false, true  },
   { "CUBE_SHADOW",       2, 3, false, true,  true  },
   { "1D_ARRAY",          1, 2, true,  false, false },
   { "2D_ARRAY",          2, 3, true,  false, false },
   { "2D_MS_ARRAY",       2, 4, true,  false, false },
   { "CUBE_ARRAY",        2, 4, true,  true,  false },
   { "1D_ARRAY_SHADOW",   1, 2, true,  false, true  },
   { "2D_ARRAY_SHADOW",   2, 3, true,  false, true  },
   { "RECT",              2, 2, false, false, false },
   { "RECT_SHADOW",       2, 2, false, false, true  },
   { "CUBE_ARRAY_SHADOW", 2, 4, true,  true,  true  },
   { "BUFFER",            1, 1, false, false, false },
};
+
// Set (or replace) the indirect resource-index source: the first call with a
// non-NULL value appends a new source slot, later calls reuse that slot.
void
TexInstruction::setIndirectR(Value *v)
{
   int p = ((tex.rIndirectSrc < 0) && v) ? srcs.size() : tex.rIndirectSrc;
   if (p >= 0) {
      tex.rIndirectSrc = p;
      setSrc(p, v);
      srcs[p].usedAsPtr = !!v; // mark for printing as an address operand
   }
}
+
// Set (or replace) the indirect sampler-index source; mirrors setIndirectR.
void
TexInstruction::setIndirectS(Value *v)
{
   int p = ((tex.sIndirectSrc < 0) && v) ? srcs.size() : tex.sIndirectSrc;
   if (p >= 0) {
      tex.sIndirectSrc = p;
      setSrc(p, v);
      srcs[p].usedAsPtr = !!v; // mark for printing as an address operand
   }
}
+
+CmpInstruction::CmpInstruction(Function *fn, operation op)
+ : Instruction(fn, op, TYPE_F32)
+{
+ setCond = CC_ALWAYS;
+}
+
// Duplicate this comparison; 'i' may supply preallocated storage.
CmpInstruction *
CmpInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   CmpInstruction *cmp = (i ? static_cast<CmpInstruction *>(i) :
                          new_CmpInstruction(pol.context(), op));
   // NOTE(review): dType is assigned before Instruction::clone — presumably
   // the base clone depends on the destination type being set; confirm
   // before reordering.
   cmp->dType = dType;
   Instruction::clone(pol, cmp);
   cmp->setCond = setCond;
   return cmp;
}
+
// Flow-control instruction. 'targ' is a Function * for OP_CALL and a
// BasicBlock * (possibly NULL) for everything else.
FlowInstruction::FlowInstruction(Function *fn, operation op, void *targ)
   : Instruction(fn, op, TYPE_NONE)
{
   if (op == OP_CALL)
      target.fn = reinterpret_cast<Function *>(targ);
   else
      target.bb = reinterpret_cast<BasicBlock *>(targ);

   // branches/returns/exits always terminate a basic block; OP_JOIN only
   // does so when it has an explicit target block
   if (op == OP_BRA ||
       op == OP_CONT || op == OP_BREAK ||
       op == OP_RET || op == OP_EXIT)
      terminator = 1;
   else
   if (op == OP_JOIN)
      terminator = targ ? 1 : 0;

   // clear all FlowInstruction-specific flag bits
   allWarp = absolute = limit = builtin = indirect = 0;
}
+
// Duplicate this flow instruction; the branch target is remapped through the
// clone policy so intra-function clones point at the cloned blocks.
FlowInstruction *
FlowInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   FlowInstruction *flow = (i ? static_cast<FlowInstruction *>(i) :
                            new_FlowInstruction(pol.context(), op, NULL));

   Instruction::clone(pol, flow);
   flow->allWarp = allWarp;
   flow->absolute = absolute;
   flow->limit = limit;
   flow->builtin = builtin;
   // NOTE(review): the 'indirect' flag is not copied here — verify whether
   // indirect branches can ever be cloned, or document why they cannot.

   if (builtin)
      flow->target.builtin = target.builtin;
   else
   if (op == OP_CALL)
      flow->target.fn = target.fn;
   else
   if (target.bb)
      flow->target.bb = pol.get<BasicBlock>(target.bb);

   return flow;
}
+
// A Program owns all IR objects of one shader translation. Instructions and
// values are allocated from per-class memory pools; the second pool argument
// presumably sizes the allocation chunks (log2) — TODO confirm in
// nv50_ir_util.h.
Program::Program(Type type, Target *arch)
   : progType(type),
     target(arch),
     mem_Instruction(sizeof(Instruction), 6),
     mem_CmpInstruction(sizeof(CmpInstruction), 4),
     mem_TexInstruction(sizeof(TexInstruction), 4),
     mem_FlowInstruction(sizeof(FlowInstruction), 4),
     mem_LValue(sizeof(LValue), 8),
     mem_Symbol(sizeof(Symbol), 7),
     mem_ImmediateValue(sizeof(ImmediateValue), 7)
{
   // no binary emitted yet
   code = NULL;
   binSize = 0;

   maxGPR = -1; // no GPR used yet

   // every program starts with a MAIN function; ~0 marks its label
   main = new Function(this, "MAIN", ~0);
   calls.insert(&main->call);

   dbgFlags = 0;
   optLevel = 0;

   targetPriv = NULL;
}
+
Program::~Program()
{
   // functions are individually heap-allocated (see Program::Program)
   for (ArrayList::Iterator it = allFuncs.iterator(); !it.end(); it.next())
      delete reinterpret_cast<Function *>(it.get());

   // values come from the memory pools and must be returned through
   // releaseValue() rather than deleted
   for (ArrayList::Iterator it = allRValues.iterator(); !it.end(); it.next())
      releaseValue(reinterpret_cast<Value *>(it.get()));
}
+
+void Program::releaseInstruction(Instruction *insn)
+{
+ // TODO: make this not suck so much
+
+ insn->~Instruction();
+
+ if (insn->asCmp())
+ mem_CmpInstruction.release(insn);
+ else
+ if (insn->asTex())
+ mem_TexInstruction.release(insn);
+ else
+ if (insn->asFlow())
+ mem_FlowInstruction.release(insn);
+ else
+ mem_Instruction.release(insn);
+}
+
+void Program::releaseValue(Value *value)
+{
+ value->~Value();
+
+ if (value->asLValue())
+ mem_LValue.release(value);
+ else
+ if (value->asImm())
+ mem_ImmediateValue.release(value);
+ else
+ if (value->asSym())
+ mem_Symbol.release(value);
+}
+
+
+} // namespace nv50_ir
+
+extern "C" {
+
// Fill in defaults for nv50_ir_prog_info fields the caller may leave unset.
static void
nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
{
#if defined(PIPE_SHADER_HULL) && defined(PIPE_SHADER_DOMAIN)
   if (info->type == PIPE_SHADER_HULL || info->type == PIPE_SHADER_DOMAIN) {
      info->prop.tp.domain = PIPE_PRIM_MAX;
      info->prop.tp.outputPrim = PIPE_PRIM_MAX;
   }
#endif
   if (info->type == PIPE_SHADER_GEOMETRY) {
      info->prop.gp.instanceCount = 1;
      info->prop.gp.maxVertices = 1;
   }
   // 0xff presumably marks an i/o slot as "not present" — TODO confirm
   // against the consumers of these fields
   info->io.clipDistance = 0xff;
   info->io.pointSize = 0xff;
   info->io.instanceId = 0xff;
   info->io.vertexId = 0xff;
   info->io.edgeFlagIn = 0xff;
   info->io.edgeFlagOut = 0xff;
   info->io.fragDepth = 0xff;
   info->io.sampleMask = 0xff;
   info->io.backFaceColor[0] = info->io.backFaceColor[1] = 0xff;
}
+
+int
+nv50_ir_generate_code(struct nv50_ir_prog_info *info)
+{
+ int ret = 0;
+
+ nv50_ir::Program::Type type;
+
+ nv50_ir_init_prog_info(info);
+
+#define PROG_TYPE_CASE(a, b) \
+ case PIPE_SHADER_##a: type = nv50_ir::Program::TYPE_##b; break
+
+ switch (info->type) {
+ PROG_TYPE_CASE(VERTEX, VERTEX);
+// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL);
+// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL);
+ PROG_TYPE_CASE(GEOMETRY, GEOMETRY);
+ PROG_TYPE_CASE(FRAGMENT, FRAGMENT);
+ PROG_TYPE_CASE(COMPUTE, COMPUTE);
+ default:
+ type = nv50_ir::Program::TYPE_COMPUTE;
+ break;
+ }
+ INFO_DBG(info->dbgFlags, VERBOSE, "translating program of type %u\n", type);
+
+ nv50_ir::Target *targ = nv50_ir::Target::create(info->target);
+ if (!targ)
+ return -1;
+
+ nv50_ir::Program *prog = new nv50_ir::Program(type, targ);
+ if (!prog)
+ return -1;
+ prog->driver = info;
+ prog->dbgFlags = info->dbgFlags;
+ prog->optLevel = info->optLevel;
+
+ switch (info->bin.sourceRep) {
+#if 0
+ case PIPE_IR_LLVM:
+ case PIPE_IR_GLSL:
+ return -1;
+ case PIPE_IR_SM4:
+ ret = prog->makeFromSM4(info) ? 0 : -2;
+ break;
+ case PIPE_IR_TGSI:
+#endif
+ default:
+ ret = prog->makeFromTGSI(info) ? 0 : -2;
+ break;
+ }
+ if (ret < 0)
+ goto out;
+ if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
+ prog->print();
+
+ targ->parseDriverInfo(info);
+ prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);
+
+ prog->convertToSSA();
+
+ if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
+ prog->print();
+
+ prog->optimizeSSA(info->optLevel);
+ prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);
+
+ if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
+ prog->print();
+
+ if (!prog->registerAllocation()) {
+ ret = -4;
+ goto out;
+ }
+ prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);
+
+ prog->optimizePostRA(info->optLevel);
+
+ if (!prog->emitBinary(info)) {
+ ret = -5;
+ goto out;
+ }
+
+out:
+ INFO_DBG(prog->dbgFlags, VERBOSE, "nv50_ir_generate_code: ret = %i\n", ret);
+
+ info->bin.maxGPR = prog->maxGPR;
+ info->bin.code = prog->code;
+ info->bin.codeSize = prog->binSize;
+ info->bin.tlsSpace = prog->tlsSize;
+
+ delete prog;
+ nv50_ir::Target::destroy(targ);
+
+ return ret;
+}
+
+} // extern "C"
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
new file mode 100644
index 00000000000..68c76e5a9cb
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -0,0 +1,1197 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_H__
+#define __NV50_IR_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <deque>
+#include <list>
+#include <vector>
+
+#include "codegen/nv50_ir_util.h"
+#include "codegen/nv50_ir_graph.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+namespace nv50_ir {
+
+enum operation
+{
+ OP_NOP = 0,
+ OP_PHI,
+ OP_UNION, // unify a new definition and several source values
+ OP_SPLIT, // $r0d -> { $r0, $r1 } ($r0d and $r0/$r1 will be coalesced)
+ OP_MERGE, // opposite of split, e.g. combine 2 32 bit into a 64 bit value
+ OP_CONSTRAINT, // copy values into consecutive registers
+ OP_MOV, // simple copy, no modifiers allowed
+ OP_LOAD,
+ OP_STORE,
+ OP_ADD, // NOTE: add u64 + u32 is legal for targets w/o 64-bit integer adds
+ OP_SUB,
+ OP_MUL,
+ OP_DIV,
+ OP_MOD,
+ OP_MAD,
+ OP_FMA,
+ OP_SAD, // abs(src0 - src1) + src2
+ OP_ABS,
+ OP_NEG,
+ OP_NOT,
+ OP_AND,
+ OP_OR,
+ OP_XOR,
+ OP_SHL,
+ OP_SHR,
+ OP_MAX,
+ OP_MIN,
+ OP_SAT, // CLAMP(f32, 0.0, 1.0)
+ OP_CEIL,
+ OP_FLOOR,
+ OP_TRUNC,
+ OP_CVT,
+ OP_SET_AND, // dst = (src0 CMP src1) & src2
+ OP_SET_OR,
+ OP_SET_XOR,
+ OP_SET,
+ OP_SELP, // dst = src2 ? src0 : src1
+ OP_SLCT, // dst = (src2 CMP 0) ? src0 : src1
+ OP_RCP,
+ OP_RSQ,
+ OP_LG2,
+ OP_SIN,
+ OP_COS,
+ OP_EX2,
+ OP_EXP, // exponential (base M_E)
+ OP_LOG, // natural logarithm
+ OP_PRESIN,
+ OP_PREEX2,
+ OP_SQRT,
+ OP_POW,
+ OP_BRA,
+ OP_CALL,
+ OP_RET,
+ OP_CONT,
+ OP_BREAK,
+ OP_PRERET,
+ OP_PRECONT,
+ OP_PREBREAK,
+ OP_BRKPT, // breakpoint (not related to loops)
+ OP_JOINAT, // push control flow convergence point
+ OP_JOIN, // converge
+ OP_DISCARD,
+ OP_EXIT,
+ OP_MEMBAR, // memory barrier (mfence, lfence, sfence)
+ OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base
+ OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1]
+ OP_EXPORT,
+ OP_LINTERP,
+ OP_PINTERP,
+ OP_EMIT, // emit vertex
+ OP_RESTART, // restart primitive
+ OP_TEX,
+ OP_TXB, // texture bias
+ OP_TXL, // texure lod
+ OP_TXF, // texel fetch
+ OP_TXQ, // texture size query
+ OP_TXD, // texture derivatives
+ OP_TXG, // texture gather
+ OP_TEXCSAA, // texture op for coverage sampling
+ OP_TEXPREP, // turn cube map array into 2d array coordinates
+ OP_SULDB, // surface load (raw)
+ OP_SULDP, // surface load (formatted)
+ OP_SUSTB, // surface store (raw)
+ OP_SUSTP, // surface store (formatted)
+ OP_SUREDB,
+ OP_SUREDP, // surface reduction (atomic op)
+ OP_SULEA, // surface load effective address
+ OP_SUBFM, // surface bitfield manipulation
+ OP_SUCLAMP, // clamp surface coordinates
+ OP_SUEAU, // surface effective address
+ OP_MADSP, // special integer multiply-add
+ OP_TEXBAR, // texture dependency barrier
+ OP_DFDX,
+ OP_DFDY,
+ OP_RDSV, // read system value
+ OP_WRSV, // write system value
+ OP_QUADOP,
+ OP_QUADON,
+ OP_QUADPOP,
+ OP_POPCNT, // bitcount(src0 & src1)
+ OP_INSBF, // insert first src1[8:15] bits of src0 into src2 at src1[0:7]
+ OP_EXTBF, // place bits [K,K+N) of src0 into dst, src1 = 0xNNKK
+ OP_PERMT, // dst = bytes from src2,src0 selected by src1 (nvc0's src order)
+ OP_ATOM,
+ OP_BAR, // execution barrier, sources = { id, thread count, predicate }
+ OP_VADD, // byte/word vector operations
+ OP_VAVG,
+ OP_VMIN,
+ OP_VMAX,
+ OP_VSAD,
+ OP_VSET,
+ OP_VSHR,
+ OP_VSHL,
+ OP_VSEL,
+ OP_CCTL, // cache control
+ OP_LAST
+};
+
+// various instruction-specific modifier definitions Instruction::subOp
+// MOV_FINAL marks a MOV originating from an EXPORT (used for placing TEXBARs)
+#define NV50_IR_SUBOP_MUL_HIGH 1
+#define NV50_IR_SUBOP_EMIT_RESTART 1
+#define NV50_IR_SUBOP_LDC_IL 1
+#define NV50_IR_SUBOP_LDC_IS 2
+#define NV50_IR_SUBOP_LDC_ISL 3
+#define NV50_IR_SUBOP_SHIFT_WRAP 1
+#define NV50_IR_SUBOP_EMU_PRERET 1
+#define NV50_IR_SUBOP_TEXBAR(n) n
+#define NV50_IR_SUBOP_MOV_FINAL 1
+#define NV50_IR_SUBOP_EXTBF_REV 1
+#define NV50_IR_SUBOP_PERMT_F4E 1
+#define NV50_IR_SUBOP_PERMT_B4E 2
+#define NV50_IR_SUBOP_PERMT_RC8 3
+#define NV50_IR_SUBOP_PERMT_ECL 4
+#define NV50_IR_SUBOP_PERMT_ECR 5
+#define NV50_IR_SUBOP_PERMT_RC16 6
+#define NV50_IR_SUBOP_BAR_SYNC 0
+#define NV50_IR_SUBOP_BAR_ARRIVE 1
+#define NV50_IR_SUBOP_BAR_RED_AND 2
+#define NV50_IR_SUBOP_BAR_RED_OR 3
+#define NV50_IR_SUBOP_BAR_RED_POPC 4
+#define NV50_IR_SUBOP_MEMBAR_L 1
+#define NV50_IR_SUBOP_MEMBAR_S 2
+#define NV50_IR_SUBOP_MEMBAR_M 3
+#define NV50_IR_SUBOP_MEMBAR_CTA (0 << 2)
+#define NV50_IR_SUBOP_MEMBAR_GL (1 << 2)
+#define NV50_IR_SUBOP_MEMBAR_SYS (2 << 2)
+#define NV50_IR_SUBOP_MEMBAR_DIR(m) ((m) & 0x3)
+#define NV50_IR_SUBOP_MEMBAR_SCOPE(m) ((m) & ~0x3)
+#define NV50_IR_SUBOP_MEMBAR(d,s) \
+ (NV50_IR_SUBOP_MEMBAR_##d | NV50_IR_SUBOP_MEMBAR_##s)
+#define NV50_IR_SUBOP_ATOM_ADD 0
+#define NV50_IR_SUBOP_ATOM_MIN 1
+#define NV50_IR_SUBOP_ATOM_MAX 2
+#define NV50_IR_SUBOP_ATOM_INC 3
+#define NV50_IR_SUBOP_ATOM_DEC 4
+#define NV50_IR_SUBOP_ATOM_AND 5
+#define NV50_IR_SUBOP_ATOM_OR 6
+#define NV50_IR_SUBOP_ATOM_XOR 7
+#define NV50_IR_SUBOP_ATOM_CAS 8
+#define NV50_IR_SUBOP_ATOM_EXCH 9
+#define NV50_IR_SUBOP_CCTL_IV 5
+#define NV50_IR_SUBOP_CCTL_IVALL 6
+#define NV50_IR_SUBOP_SUST_IGN 0
+#define NV50_IR_SUBOP_SUST_TRAP 1
+#define NV50_IR_SUBOP_SUST_SDCL 3
+#define NV50_IR_SUBOP_SULD_ZERO 0
+#define NV50_IR_SUBOP_SULD_TRAP 1
+#define NV50_IR_SUBOP_SULD_SDCL 3
+#define NV50_IR_SUBOP_SUBFM_3D 1
+#define NV50_IR_SUBOP_SUCLAMP_2D 0x10
+#define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_SUCLAMP_PL(r, d) (( 5 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_SUCLAMP_BL(r, d) ((10 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_MADSP_SD 0xffff
+// Yes, we could represent those with DataType.
+// Or put the type into operation and have a couple 1000 values in that enum.
+// This will have to do for now.
+// The bitfields are supposed to correspond to nve4 ISA.
+#define NV50_IR_SUBOP_MADSP(a,b,c) (((c) << 8) | ((b) << 4) | (a))
+#define NV50_IR_SUBOP_V1(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x0000)
+#define NV50_IR_SUBOP_V2(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x4000)
+#define NV50_IR_SUBOP_V4(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x8000)
+#define NV50_IR_SUBOP_Vn(n) ((n) >> 14)
+
+enum DataType
+{
+ TYPE_NONE,
+ TYPE_U8,
+ TYPE_S8,
+ TYPE_U16,
+ TYPE_S16,
+ TYPE_U32,
+ TYPE_S32,
+ TYPE_U64, // 64 bit operations are only lowered after register allocation
+ TYPE_S64,
+ TYPE_F16,
+ TYPE_F32,
+ TYPE_F64,
+ TYPE_B96,
+ TYPE_B128
+};
+
+enum CondCode
+{
+ CC_FL = 0,
+ CC_NEVER = CC_FL, // when used with FILE_FLAGS
+ CC_LT = 1,
+ CC_EQ = 2,
+ CC_NOT_P = CC_EQ, // when used with FILE_PREDICATE
+ CC_LE = 3,
+ CC_GT = 4,
+ CC_NE = 5,
+ CC_P = CC_NE,
+ CC_GE = 6,
+ CC_TR = 7,
+ CC_ALWAYS = CC_TR,
+ CC_U = 8,
+ CC_LTU = 9,
+ CC_EQU = 10,
+ CC_LEU = 11,
+ CC_GTU = 12,
+ CC_NEU = 13,
+ CC_GEU = 14,
+ CC_NO = 0x10,
+ CC_NC = 0x11,
+ CC_NS = 0x12,
+ CC_NA = 0x13,
+ CC_A = 0x14,
+ CC_S = 0x15,
+ CC_C = 0x16,
+ CC_O = 0x17
+};
+
+enum RoundMode
+{
+ ROUND_N, // nearest
+ ROUND_M, // towards -inf
+ ROUND_Z, // towards 0
+ ROUND_P, // towards +inf
+ ROUND_NI, // nearest integer
+ ROUND_MI, // to integer towards -inf
+ ROUND_ZI, // to integer towards 0
+ ROUND_PI, // to integer towards +inf
+};
+
+enum CacheMode
+{
+ CACHE_CA, // cache at all levels
+ CACHE_WB = CACHE_CA, // cache write back
+ CACHE_CG, // cache at global level
+ CACHE_CS, // cache streaming
+ CACHE_CV, // cache as volatile
+ CACHE_WT = CACHE_CV // cache write-through
+};
+
+enum DataFile
+{
+ FILE_NULL = 0,
+ FILE_GPR,
+ FILE_PREDICATE, // boolean predicate
+ FILE_FLAGS, // zero/sign/carry/overflow bits
+ FILE_ADDRESS,
+ LAST_REGISTER_FILE = FILE_ADDRESS,
+ FILE_IMMEDIATE,
+ FILE_MEMORY_CONST,
+ FILE_SHADER_INPUT,
+ FILE_SHADER_OUTPUT,
+ FILE_MEMORY_GLOBAL,
+ FILE_MEMORY_SHARED,
+ FILE_MEMORY_LOCAL,
+ FILE_SYSTEM_VALUE,
+ DATA_FILE_COUNT
+};
+
+enum TexTarget
+{
+ TEX_TARGET_1D,
+ TEX_TARGET_2D,
+ TEX_TARGET_2D_MS,
+ TEX_TARGET_3D,
+ TEX_TARGET_CUBE,
+ TEX_TARGET_1D_SHADOW,
+ TEX_TARGET_2D_SHADOW,
+ TEX_TARGET_CUBE_SHADOW,
+ TEX_TARGET_1D_ARRAY,
+ TEX_TARGET_2D_ARRAY,
+ TEX_TARGET_2D_MS_ARRAY,
+ TEX_TARGET_CUBE_ARRAY,
+ TEX_TARGET_1D_ARRAY_SHADOW,
+ TEX_TARGET_2D_ARRAY_SHADOW,
+ TEX_TARGET_RECT,
+ TEX_TARGET_RECT_SHADOW,
+ TEX_TARGET_CUBE_ARRAY_SHADOW,
+ TEX_TARGET_BUFFER,
+ TEX_TARGET_COUNT
+};
+
+enum SVSemantic
+{
+ SV_POSITION, // WPOS
+ SV_VERTEX_ID,
+ SV_INSTANCE_ID,
+ SV_INVOCATION_ID,
+ SV_PRIMITIVE_ID,
+ SV_VERTEX_COUNT, // gl_PatchVerticesIn
+ SV_LAYER,
+ SV_VIEWPORT_INDEX,
+ SV_YDIR,
+ SV_FACE,
+ SV_POINT_SIZE,
+ SV_POINT_COORD,
+ SV_CLIP_DISTANCE,
+ SV_SAMPLE_INDEX,
+ SV_TESS_FACTOR,
+ SV_TESS_COORD,
+ SV_TID,
+ SV_CTAID,
+ SV_NTID,
+ SV_GRIDID,
+ SV_NCTAID,
+ SV_LANEID,
+ SV_PHYSID,
+ SV_NPHYSID,
+ SV_CLOCK,
+ SV_LBASE,
+ SV_SBASE,
+ SV_UNDEFINED,
+ SV_LAST
+};
+
+class Program;
+class Function;
+class BasicBlock;
+
+class Target;
+
+class Instruction;
+class CmpInstruction;
+class TexInstruction;
+class FlowInstruction;
+
+class Value;
+class LValue;
+class Symbol;
+class ImmediateValue;
+
+struct Storage
+{
+ DataFile file;
+ int8_t fileIndex; // signed, may be indirect for CONST[]
+ uint8_t size; // this should match the Instruction type's size
+ DataType type; // mainly for pretty printing
+ union {
+ uint64_t u64; // immediate values
+ uint32_t u32;
+ uint16_t u16;
+ uint8_t u8;
+ int64_t s64;
+ int32_t s32;
+ int16_t s16;
+ int8_t s8;
+ float f32;
+ double f64;
+ int32_t offset; // offset from 0 (base of address space)
+ int32_t id; // register id (< 0 if virtual/unassigned, in units <= 4)
+ struct {
+ SVSemantic sv;
+ int index;
+ } sv;
+ } data;
+};
+
+// precedence: NOT after SAT after NEG after ABS
+#define NV50_IR_MOD_ABS (1 << 0)
+#define NV50_IR_MOD_NEG (1 << 1)
+#define NV50_IR_MOD_SAT (1 << 2)
+#define NV50_IR_MOD_NOT (1 << 3)
+#define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS)
+
+#define NV50_IR_INTERP_MODE_MASK 0x3
+#define NV50_IR_INTERP_LINEAR (0 << 0)
+#define NV50_IR_INTERP_PERSPECTIVE (1 << 0)
+#define NV50_IR_INTERP_FLAT (2 << 0)
+#define NV50_IR_INTERP_SC (3 << 0) // what exactly is that ?
+#define NV50_IR_INTERP_SAMPLE_MASK 0xc
+#define NV50_IR_INTERP_DEFAULT (0 << 2)
+#define NV50_IR_INTERP_CENTROID (1 << 2)
+#define NV50_IR_INTERP_OFFSET (2 << 2)
+#define NV50_IR_INTERP_SAMPLEID (3 << 2)
+
+// do we really want this to be a class ?
// Wrapper around the NV50_IR_MOD_* bits of a source modifier (abs/neg/sat/not).
class Modifier
{
public:
   Modifier() : bits(0) { }
   Modifier(unsigned int m) : bits(m) { }
   Modifier(operation op);

   // @return new Modifier applying a after b (asserts if unrepresentable)
   Modifier operator*(const Modifier) const;
   Modifier operator*=(const Modifier m) { *this = *this * m; return *this; }
   // NOTE(review): ==/!= return Modifier (truthy via operator bool) rather
   // than bool — confirm no caller depends on the wrapper type before
   // changing.
   Modifier operator==(const Modifier m) const { return m.bits == bits; }
   Modifier operator!=(const Modifier m) const { return m.bits != bits; }

   inline Modifier operator&(const Modifier m) const { return bits & m.bits; }
   inline Modifier operator|(const Modifier m) const { return bits | m.bits; }
   inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; }

   operation getOp() const;

   inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; }
   inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; }

   inline operator bool() const { return bits ? true : false; }

   void applyTo(ImmediateValue &imm) const;

   int print(char *buf, size_t size) const;

private:
   uint8_t bits; // combination of NV50_IR_MOD_* flags
};
+
+class ValueRef
+{
+public:
+ ValueRef(Value * = NULL);
+ ValueRef(const ValueRef&);
+ ~ValueRef();
+
+ inline bool exists() const { return value != NULL; }
+
+ void set(Value *);
+ void set(const ValueRef&);
+ inline Value *get() const { return value; }
+ inline Value *rep() const;
+
+ inline Instruction *getInsn() const { return insn; }
+ inline void setInsn(Instruction *inst) { insn = inst; }
+
+ inline bool isIndirect(int dim) const { return indirect[dim] >= 0; }
+ inline const ValueRef *getIndirect(int dim) const;
+
+ inline DataFile getFile() const;
+ inline unsigned getSize() const;
+
+ // SSA: return eventual (traverse MOVs) literal value, if it exists
+ bool getImmediate(ImmediateValue&) const;
+
+public:
+ Modifier mod;
+ int8_t indirect[2]; // >= 0 if relative to lvalue in insn->src(indirect[i])
+ uint8_t swizzle;
+
+ bool usedAsPtr; // for printing
+
+private:
+ Value *value;
+ Instruction *insn;
+};
+
+class ValueDef
+{
+public:
+ ValueDef(Value * = NULL);
+ ValueDef(const ValueDef&);
+ ~ValueDef();
+
+ inline bool exists() const { return value != NULL; }
+
+ inline Value *get() const { return value; }
+ inline Value *rep() const;
+ void set(Value *);
+ bool mayReplace(const ValueRef &);
+ void replace(const ValueRef &, bool doSet); // replace all uses of the old value
+
+ inline Instruction *getInsn() const { return insn; }
+ inline void setInsn(Instruction *inst) { insn = inst; }
+
+ inline DataFile getFile() const;
+ inline unsigned getSize() const;
+
+ inline void setSSA(LValue *);
+ inline const LValue *preSSA() const;
+
+private:
+ Value *value; // should make this LValue * ...
+ LValue *origin; // pre SSA value
+ Instruction *insn;
+};
+
+class Value
+{
+public:
+ Value();
+ virtual ~Value() { }
+
+ virtual Value *clone(ClonePolicy<Function>&) const = 0;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const = 0;
+
+ virtual bool equals(const Value *, bool strict = false) const;
+ virtual bool interfers(const Value *) const;
+ virtual bool isUniform() const { return true; }
+
+ inline Value *rep() const { return join; }
+
+ inline Instruction *getUniqueInsn() const;
+ inline Instruction *getInsn() const; // use when uniqueness is certain
+
+ inline int refCount() { return uses.size(); }
+
+ inline LValue *asLValue();
+ inline Symbol *asSym();
+ inline ImmediateValue *asImm();
+ inline const Symbol *asSym() const;
+ inline const ImmediateValue *asImm() const;
+
+ inline bool inFile(DataFile f) { return reg.file == f; }
+
+ static inline Value *get(Iterator&);
+
+ std::list<ValueRef *> uses;
+ std::list<ValueDef *> defs;
+ typedef std::list<ValueRef *>::iterator UseIterator;
+ typedef std::list<ValueRef *>::const_iterator UseCIterator;
+ typedef std::list<ValueDef *>::iterator DefIterator;
+ typedef std::list<ValueDef *>::const_iterator DefCIterator;
+
+ int id;
+ Storage reg;
+
+ // TODO: these should be in LValue:
+ Interval livei;
+ Value *join;
+};
+
+class LValue : public Value
+{
+public:
+ LValue(Function *, DataFile file);
+ LValue(Function *, LValue *);
+ ~LValue() { }
+
+ virtual bool isUniform() const;
+
+ virtual LValue *clone(ClonePolicy<Function>&) const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+
+public:
+ unsigned compMask : 8; // compound/component mask
+ unsigned compound : 1; // used by RA, value involved in split/merge
+ unsigned ssa : 1;
+ unsigned fixedReg : 1; // set & used by RA, earlier just use (id < 0)
+ unsigned noSpill : 1; // do not spill (e.g. if spill temporary already)
+};
+
+class Symbol : public Value
+{
+public:
+ Symbol(Program *, DataFile file = FILE_MEMORY_CONST, ubyte fileIdx = 0);
+ ~Symbol() { }
+
+ virtual Symbol *clone(ClonePolicy<Function>&) const;
+
+ virtual bool equals(const Value *that, bool strict) const;
+
+ virtual bool isUniform() const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+
+ // print with indirect values
+ int print(char *, size_t, Value *, Value *, DataType ty = TYPE_NONE) const;
+
+ inline void setFile(DataFile file, ubyte fileIndex = 0)
+ {
+ reg.file = file;
+ reg.fileIndex = fileIndex;
+ }
+
+ inline void setOffset(int32_t offset);
+ inline void setAddress(Symbol *base, int32_t offset);
+ inline void setSV(SVSemantic sv, uint32_t idx = 0);
+
+ inline const Symbol *getBase() const { return baseSym; }
+
+private:
+ Symbol *baseSym; // array base for Symbols representing array elements
+};
+
+class ImmediateValue : public Value
+{
+public:
+ ImmediateValue() { }
+ ImmediateValue(Program *, uint32_t);
+ ImmediateValue(Program *, float);
+ ImmediateValue(Program *, double);
+ // NOTE: not added to program with
+ ImmediateValue(const ImmediateValue *, DataType ty);
+ ~ImmediateValue() { };
+
+ virtual ImmediateValue *clone(ClonePolicy<Function>&) const;
+
+ virtual bool equals(const Value *that, bool strict) const;
+
+ // these only work if 'type' is valid (we mostly use untyped literals):
+ bool isInteger(const int ival) const; // ival is cast to this' type
+ bool isNegative() const;
+ bool isPow2() const;
+
+ void applyLog2();
+
+ // for constant folding:
+ ImmediateValue operator+(const ImmediateValue&) const;
+ ImmediateValue operator-(const ImmediateValue&) const;
+ ImmediateValue operator*(const ImmediateValue&) const;
+ ImmediateValue operator/(const ImmediateValue&) const;
+
+ ImmediateValue& operator=(const ImmediateValue&); // only sets value !
+
+ bool compare(CondCode cc, float fval) const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+};
+
+class Instruction
+{
+public:
+ Instruction();
+ Instruction(Function *, operation, DataType);
+ virtual ~Instruction();
+
+ virtual Instruction *clone(ClonePolicy<Function>&,
+ Instruction * = NULL) const;
+
+ void setDef(int i, Value *);
+ void setSrc(int s, Value *);
+ void setSrc(int s, const ValueRef&);
+ void swapSources(int a, int b);
+ void moveSources(int s, int delta);
+ bool setIndirect(int s, int dim, Value *);
+
+ inline ValueRef& src(int s) { return srcs[s]; }
+ inline ValueDef& def(int s) { return defs[s]; }
+ inline const ValueRef& src(int s) const { return srcs[s]; }
+ inline const ValueDef& def(int s) const { return defs[s]; }
+
+ inline Value *getDef(int d) const { return defs[d].get(); }
+ inline Value *getSrc(int s) const { return srcs[s].get(); }
+ inline Value *getIndirect(int s, int dim) const;
+
+ inline bool defExists(unsigned d) const
+ {
+ return d < defs.size() && defs[d].exists();
+ }
+ inline bool srcExists(unsigned s) const
+ {
+ return s < srcs.size() && srcs[s].exists();
+ }
+
+ inline bool constrainedDefs() const;
+
+ bool setPredicate(CondCode ccode, Value *);
+ inline Value *getPredicate() const;
+ bool writesPredicate() const;
+ inline bool isPredicated() const { return predSrc >= 0; }
+
+ inline void setFlagsSrc(int s, Value *);
+ inline void setFlagsDef(int d, Value *);
+ inline bool usesFlags() const { return flagsSrc >= 0; }
+
+ unsigned int defCount() const { return defs.size(); };
+ unsigned int defCount(unsigned int mask, bool singleFile = false) const;
+ unsigned int srcCount() const { return srcs.size(); };
+ unsigned int srcCount(unsigned int mask, bool singleFile = false) const;
+
+ // save & remove / set indirect[0,1] and predicate source
+ void takeExtraSources(int s, Value *[3]);
+ void putExtraSources(int s, Value *[3]);
+
+ inline void setType(DataType type) { dType = sType = type; }
+
+ inline void setType(DataType dtype, DataType stype)
+ {
+ dType = dtype;
+ sType = stype;
+ }
+
+ inline bool isPseudo() const { return op < OP_MOV; }
+ bool isDead() const;
+ bool isNop() const;
+ bool isCommutationLegal(const Instruction *) const; // must be adjacent !
+ bool isActionEqual(const Instruction *) const;
+ bool isResultEqual(const Instruction *) const;
+
+ void print() const;
+
+ inline CmpInstruction *asCmp();
+ inline TexInstruction *asTex();
+ inline FlowInstruction *asFlow();
+ inline const TexInstruction *asTex() const;
+ inline const CmpInstruction *asCmp() const;
+ inline const FlowInstruction *asFlow() const;
+
+public:
+ Instruction *next;
+ Instruction *prev;
+ int id;
+ int serial; // CFG order
+
+ operation op;
+ DataType dType; // destination or defining type
+ DataType sType; // source or secondary type
+ CondCode cc;
+ RoundMode rnd;
+ CacheMode cache;
+
+ uint16_t subOp; // quadop, 1 for mul-high, etc.
+
+ unsigned encSize : 4; // encoding size in bytes
+ unsigned saturate : 1; // to [0.0f, 1.0f]
+ unsigned join : 1; // converge control flow (use OP_JOIN until end)
+ unsigned fixed : 1; // prevent dead code elimination
+ unsigned terminator : 1; // end of basic block
+ unsigned ftz : 1; // flush denormal to zero
+ unsigned dnz : 1; // denormals, NaN are zero
+ unsigned ipa : 4; // interpolation mode
+ unsigned lanes : 4;
+ unsigned perPatch : 1;
+ unsigned exit : 1; // terminate program after insn
+ unsigned mask : 4; // for vector ops
+
+ int8_t postFactor; // MUL/DIV(if < 0) by 1 << postFactor
+
+ int8_t predSrc;
+ int8_t flagsDef;
+ int8_t flagsSrc;
+
+ uint8_t sched; // scheduling data (NOTE: maybe move to separate storage)
+
+ BasicBlock *bb;
+
+protected:
+ std::deque<ValueDef> defs; // no gaps !
+ std::deque<ValueRef> srcs; // no gaps !
+
+ // instruction specific methods:
+ // (don't want to subclass, would need more constructors and memory pools)
+public:
+ inline void setInterpolate(unsigned int mode) { ipa = mode; }
+
+ unsigned int getInterpMode() const { return ipa & 0x3; }
+ unsigned int getSampleMode() const { return ipa & 0xc; }
+
+private:
+ void init();
+};
+
+enum TexQuery
+{
+ TXQ_DIMS,
+ TXQ_TYPE,
+ TXQ_SAMPLE_POSITION,
+ TXQ_FILTER,
+ TXQ_LOD,
+ TXQ_WRAP,
+ TXQ_BORDER_COLOUR
+};
+
+// Instruction carrying texture-specific state (target, texture/sampler
+// bindings, offsets, query kind).  The nested Target class wraps the
+// TexTarget enum and exposes per-target properties through a static
+// descriptor table.
+class TexInstruction : public Instruction
+{
+public:
+   class Target
+   {
+   public:
+      Target(TexTarget targ = TEX_TARGET_2D) : target(targ) { }
+
+      const char *getName() const { return descTable[target].name; }
+      unsigned int getArgCount() const { return descTable[target].argc; }
+      unsigned int getDim() const { return descTable[target].dim; }
+      int isArray() const { return descTable[target].array ? 1 : 0; }
+      int isCube() const { return descTable[target].cube ? 1 : 0; }
+      int isShadow() const { return descTable[target].shadow ? 1 : 0; }
+      // multisample targets are identified directly, not via the table
+      int isMS() const {
+         return target == TEX_TARGET_2D_MS || target == TEX_TARGET_2D_MS_ARRAY; }
+
+      Target& operator=(TexTarget targ)
+      {
+         assert(targ < TEX_TARGET_COUNT);
+         target = targ;
+         return *this;
+      }
+
+      inline bool operator==(TexTarget targ) const { return target == targ; }
+      inline bool operator!=(TexTarget targ) const { return target != targ; }
+
+      enum TexTarget getEnum() const { return target; }
+
+   private:
+      // per-target properties; the table itself is defined out of line
+      struct Desc
+      {
+         char name[19];
+         uint8_t dim;
+         uint8_t argc;
+         bool array;
+         bool cube;
+         bool shadow;
+      };
+
+      static const struct Desc descTable[TEX_TARGET_COUNT];
+
+   private:
+      enum TexTarget target;
+   };
+
+public:
+   TexInstruction(Function *, operation);
+   virtual ~TexInstruction();
+
+   virtual TexInstruction *clone(ClonePolicy<Function>&,
+                                 Instruction * = NULL) const;
+
+   // set texture (@r) and sampler (@s) binding indices and the target
+   inline void setTexture(Target targ, uint8_t r, uint8_t s)
+   {
+      tex.r = r;
+      tex.s = s;
+      tex.target = targ;
+   }
+
+   void setIndirectR(Value *);
+   void setIndirectS(Value *);
+   inline Value *getIndirectR() const;
+   inline Value *getIndirectS() const;
+
+public:
+   struct {
+      Target target;
+
+      uint16_t r; // texture binding index
+      uint16_t s; // sampler binding index
+      int8_t rIndirectSrc; // source slot of an indirect texture index, if any
+      int8_t sIndirectSrc; // source slot of an indirect sampler index, if any
+
+      uint8_t mask;
+      uint8_t gatherComp;
+
+      bool liveOnly; // only execute on live pixels of a quad (optimization)
+      bool levelZero;
+      bool derivAll;
+
+      int8_t useOffsets; // 0, 1, or 4 for textureGatherOffsets
+      int8_t offset[4][3];
+
+      enum TexQuery query;
+   } tex;
+
+   // NOTE(review): presumably explicit derivatives (one per coordinate
+   // component) for textureGrad-style ops -- confirm against the emitters
+   ValueRef dPdx[3];
+   ValueRef dPdy[3];
+};
+
+// Instruction evaluating a comparison; the condition code to apply is kept
+// in setCond (also reachable via set/getCondition).
+class CmpInstruction : public Instruction
+{
+public:
+   CmpInstruction(Function *, operation);
+
+   virtual CmpInstruction *clone(ClonePolicy<Function>&,
+                                 Instruction * = NULL) const;
+
+   void setCondition(CondCode cond) { setCond = cond; }
+   CondCode getCondition() const { return setCond; }
+
+public:
+   CondCode setCond;
+};
+
+// Control-flow instruction; depending on op and the flags below, the target
+// union holds a basic block, a builtin id or a function.
+class FlowInstruction : public Instruction
+{
+public:
+   FlowInstruction(Function *, operation, void *target);
+
+   virtual FlowInstruction *clone(ClonePolicy<Function>&,
+                                  Instruction * = NULL) const;
+
+public:
+   unsigned allWarp : 1;
+   unsigned absolute : 1;
+   unsigned limit : 1;
+   unsigned builtin : 1; // true for calls to emulation code
+   unsigned indirect : 1; // target in src(0)
+
+   union {
+      BasicBlock *bb; // branch target
+      int builtin;    // builtin id (valid when the builtin flag is set)
+      Function *fn;   // call target
+   } target;
+};
+
+// A straight-line sequence of instructions; node in the CFG and in the
+// dominator tree.  Phi instructions are kept grouped in front of the first
+// non-phi instruction, between getPhi() and getEntry().
+class BasicBlock
+{
+public:
+   BasicBlock(Function *);
+   ~BasicBlock();
+
+   BasicBlock *clone(ClonePolicy<Function>&) const;
+
+   inline int getId() const { return id; }
+   inline unsigned int getInsnCount() const { return numInsns; }
+   inline bool isTerminated() const { return exit && exit->terminator; }
+
+   bool dominatedBy(BasicBlock *bb);
+   inline bool reachableBy(const BasicBlock *by, const BasicBlock *term);
+
+   // returns mask of conditional out blocks
+   // e.g. 3 for IF { .. } ELSE { .. } ENDIF, 1 for IF { .. } ENDIF
+   unsigned int initiatesSimpleConditional() const;
+
+public:
+   Function *getFunction() const { return func; }
+   Program *getProgram() const { return program; }
+
+   Instruction *getEntry() const { return entry; } // first non-phi instruction
+   Instruction *getPhi() const { return phi; }
+   Instruction *getFirst() const { return phi ? phi : entry; }
+   Instruction *getExit() const { return exit; }
+
+   void insertHead(Instruction *);
+   void insertTail(Instruction *);
+   void insertBefore(Instruction *, Instruction *);
+   void insertAfter(Instruction *, Instruction *);
+   void remove(Instruction *);
+   void permuteAdjacent(Instruction *, Instruction *);
+
+   // immediate dominator, i.e. parent in the dominator tree
+   BasicBlock *idom() const;
+
+   // NOTE: currently does not rebuild the dominator tree
+   BasicBlock *splitBefore(Instruction *, bool attach = true);
+   BasicBlock *splitAfter(Instruction *, bool attach = true);
+
+   // NOTE(review): df is presumably the dominance frontier (used by the SSA
+   // pass) -- confirm against nv50_ir_ssa.cpp
+   DLList& getDF() { return df; }
+   DLList::Iterator iterDF() { return df.iterator(); }
+
+   static inline BasicBlock *get(Iterator&);
+   static inline BasicBlock *get(Graph::Node *);
+
+public:
+   Graph::Node cfg; // first edge is branch *taken* (the ELSE branch)
+   Graph::Node dom;
+
+   BitSet liveSet;
+   BitSet defSet;
+
+   uint32_t binPos;  // position and size in the emitted binary
+   uint32_t binSize;
+
+   Instruction *joinAt; // for quick reference
+
+   bool explicitCont; // loop headers: true if loop contains continue stmts
+
+private:
+   int id;
+   DLList df;
+
+   Instruction *phi;   // first phi instruction
+   Instruction *entry; // first non-phi instruction
+   Instruction *exit;  // last instruction
+
+   unsigned int numInsns;
+
+private:
+   Function *func;
+   Program *program;
+
+   void splitCommon(Instruction *, BasicBlock *, bool attach);
+};
+
+// A single function: owns its basic blocks, instructions and local values,
+// and is a node in the program's call graph.
+class Function
+{
+public:
+   Function(Program *, const char *name, uint32_t label);
+   ~Function();
+
+   static inline Function *get(Graph::Node *node);
+
+   inline Program *getProgram() const { return prog; }
+   inline const char *getName() const { return name; }
+   inline int getId() const { return id; }
+   inline uint32_t getLabel() const { return label; }
+
+   void print();
+   void printLiveIntervals() const;
+   void printCFGraph(const char *filePath);
+
+   // entry/exit may each be set only once (return false otherwise)
+   bool setEntry(BasicBlock *);
+   bool setExit(BasicBlock *);
+
+   // numbers all instructions in CFG order and collects them in the list
+   unsigned int orderInstructions(ArrayList&);
+
+   inline void add(BasicBlock *bb, int& id) { allBBlocks.insert(bb, id); }
+   inline void add(Instruction *insn, int& id) { allInsns.insert(insn, id); }
+   inline void add(LValue *lval, int& id) { allLValues.insert(lval, id); }
+
+   inline LValue *getLValue(int id);
+
+   void buildLiveSets();
+   void buildDefSets();
+   bool convertToSSA();
+
+public:
+   std::deque<ValueDef> ins;   // function inputs
+   std::deque<ValueRef> outs;  // function outputs
+   std::deque<Value *> clobbers;
+
+   Graph cfg;
+   Graph::Node *cfgExit;
+   Graph *domTree;
+   Graph::Node call; // node in the call graph
+
+   BasicBlock **bbArray; // BBs in emission order
+   int bbCount;
+
+   unsigned int loopNestingBound; // upper bound on loop nesting depth
+   int regClobberMax;
+
+   uint32_t binPos;
+   uint32_t binSize;
+
+   Value *stackPtr;
+
+   uint32_t tlsBase; // base address for l[] space (if no stack pointer is used)
+   uint32_t tlsSize;
+
+   ArrayList allBBlocks;
+   ArrayList allInsns;
+   ArrayList allLValues;
+
+private:
+   void buildLiveSetsPreSSA(BasicBlock *, const int sequence);
+   void buildDefSetsPreSSA(BasicBlock *bb, const int seq);
+
+private:
+   uint32_t label;
+   int id;
+   const char *const name;
+   Program *prog;
+};
+
+// Code-generation stage at which an optimization pass runs.
+enum CGStage
+{
+   CG_STAGE_PRE_SSA,
+   CG_STAGE_SSA, // expected directly before register allocation
+   CG_STAGE_POST_RA
+};
+
+// A complete shader program: its functions, values, the emitted binary,
+// and the memory pools from which the IR objects are allocated.
+class Program
+{
+public:
+   enum Type
+   {
+      TYPE_VERTEX,
+      TYPE_TESSELLATION_CONTROL,
+      TYPE_TESSELLATION_EVAL,
+      TYPE_GEOMETRY,
+      TYPE_FRAGMENT,
+      TYPE_COMPUTE
+   };
+
+   Program(Type type, Target *targ);
+   ~Program();
+
+   void print();
+
+   Type getType() const { return progType; }
+
+   inline void add(Function *fn, int& id) { allFuncs.insert(fn, id); }
+   inline void del(Function *fn, int& id) { allFuncs.remove(id); }
+   inline void add(Value *rval, int& id) { allRValues.insert(rval, id); }
+
+   // front ends and compilation stages, roughly in pipeline order
+   bool makeFromTGSI(struct nv50_ir_prog_info *);
+   bool makeFromSM4(struct nv50_ir_prog_info *);
+   bool convertToSSA();
+   bool optimizeSSA(int level);
+   bool optimizePostRA(int level);
+   bool registerAllocation();
+   bool emitBinary(struct nv50_ir_prog_info *);
+
+   const Target *getTarget() const { return target; }
+
+private:
+   void emitSymbolTable(struct nv50_ir_prog_info *);
+
+   Type progType;
+   Target *target;
+
+public:
+   Function *main;
+   Graph calls;
+
+   ArrayList allFuncs;
+   ArrayList allRValues;
+
+   uint32_t *code;   // emitted machine code
+   uint32_t binSize;
+   uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL
+
+   int maxGPR;
+
+   // per-class object pools; see releaseInstruction/releaseValue below
+   MemoryPool mem_Instruction;
+   MemoryPool mem_CmpInstruction;
+   MemoryPool mem_TexInstruction;
+   MemoryPool mem_FlowInstruction;
+   MemoryPool mem_LValue;
+   MemoryPool mem_Symbol;
+   MemoryPool mem_ImmediateValue;
+
+   uint32_t dbgFlags;
+   uint8_t optLevel;
+
+   void *targetPriv; // e.g. to carry information between passes
+
+   const struct nv50_ir_prog_info *driver; // for driver configuration
+
+   void releaseInstruction(Instruction *);
+   void releaseValue(Value *);
+};
+
+// TODO: add const version
+// Base class for IR passes: run() traverses functions (call-graph order),
+// then basic blocks, then instructions, invoking the visit() hooks.
+class Pass
+{
+public:
+   bool run(Program *, bool ordered = false, bool skipPhi = false);
+   bool run(Function *, bool ordered = false, bool skipPhi = false);
+
+private:
+   // return false to continue with next entity on next higher level
+   virtual bool visit(Function *) { return true; }
+   virtual bool visit(BasicBlock *) { return true; }
+   virtual bool visit(Instruction *) { return false; }
+
+   bool doRun(Program *, bool ordered, bool skipPhi);
+   bool doRun(Function *, bool ordered, bool skipPhi);
+
+protected:
+   bool err;       // set during traversal to report failure from run()
+   Function *func; // function currently being visited
+   Program *prog;
+};
+
+// =============================================================================
+
+#include "codegen/nv50_ir_inlines.h"
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
new file mode 100644
index 00000000000..51b9225156b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -0,0 +1,550 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+// Construct a function named @fnName with code label @label and register it
+// with program @p (which assigns this->id).
+Function::Function(Program *p, const char *fnName, uint32_t label)
+   : call(this),
+     label(label),
+     name(fnName),
+     prog(p)
+{
+   cfgExit = NULL;
+   domTree = NULL;
+
+   bbArray = NULL;
+   bbCount = 0;
+   loopNestingBound = 0;
+   regClobberMax = 0;
+
+   binPos = 0;
+   binSize = 0;
+
+   stackPtr = NULL;
+   tlsBase = 0;
+   tlsSize = 0;
+
+   // registration assigns this->id
+   prog->add(this, id);
+}
+
+Function::~Function()
+{
+   // unregister from the program before tearing down our contents
+   prog->del(this, id);
+
+   if (domTree)
+      delete domTree;
+   if (bbArray)
+      delete[] bbArray;
+
+   // clear value refs and defs
+   ins.clear();
+   outs.clear();
+
+   // instructions and values are pool-allocated, release them through prog
+   for (ArrayList::Iterator it = allInsns.iterator(); !it.end(); it.next())
+      delete_Instruction(prog, reinterpret_cast<Instruction *>(it.get()));
+
+   for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next())
+      delete_Value(prog, reinterpret_cast<LValue *>(it.get()));
+
+   for (ArrayList::Iterator BBs = allBBlocks.iterator(); !BBs.end(); BBs.next())
+      delete reinterpret_cast<BasicBlock *>(BBs.get());
+}
+
+// Construct an empty block belonging to @fn and register it (assigns id).
+BasicBlock::BasicBlock(Function *fn) : cfg(this), dom(this), func(fn)
+{
+   program = func->getProgram();
+
+   joinAt = phi = entry = exit = NULL;
+
+   numInsns = 0;
+   binPos = 0;
+   binSize = 0;
+
+   explicitCont = false;
+
+   func->add(this, this->id);
+}
+
+BasicBlock::~BasicBlock()
+{
+   // nothing yet -- instructions and values are owned and released by the
+   // Function (see Function::~Function)
+}
+
+// Deep-copy this block (and its instructions) into the function given by the
+// clone policy's context, duplicating outgoing CFG edges to cloned peers.
+BasicBlock *
+BasicBlock::clone(ClonePolicy<Function>& pol) const
+{
+   BasicBlock *bb = new BasicBlock(pol.context());
+
+   // record the mapping first so edges and values can refer back to the clone
+   pol.set(this, bb);
+
+   for (Instruction *i = getFirst(); i; i = i->next)
+      bb->insertTail(i->clone(pol));
+
+   pol.context()->cfg.insert(&bb->cfg);
+
+   // attach the clone to the clones of our successors, preserving edge types
+   for (Graph::EdgeIterator it = cfg.outgoing(); !it.end(); it.next()) {
+      BasicBlock *obb = BasicBlock::get(it.getNode());
+      bb->cfg.attach(&pol.get(obb)->cfg, it.getType());
+   }
+
+   return bb;
+}
+
+// Return the immediate dominator (parent in the dominator tree), or NULL
+// for the root block.
+BasicBlock *
+BasicBlock::idom() const
+{
+   Graph::Node *parentNode = dom.parent();
+   if (!parentNode)
+      return NULL;
+   return BasicBlock::get(parentNode);
+}
+
+// Insert @inst at the head of the block: phis go before the first phi,
+// anything else before the first non-phi instruction.
+void
+BasicBlock::insertHead(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (phi) {
+         insertBefore(phi, inst);
+      } else {
+         if (entry) {
+            insertBefore(entry, inst);
+         } else {
+            // block is empty
+            assert(!exit);
+            phi = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   } else {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else {
+         if (phi) {
+            insertAfter(exit, inst); // after last phi
+         } else {
+            // block is empty
+            assert(!exit);
+            entry = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   }
+}
+
+// Append @inst to the block; phis stay grouped before the first non-phi
+// instruction.
+void
+BasicBlock::insertTail(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else
+      if (exit) {
+         // block contains only phis, append after them
+         assert(phi);
+         insertAfter(exit, inst);
+      } else {
+         // block is empty
+         assert(!phi);
+         phi = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   } else {
+      if (exit) {
+         insertAfter(exit, inst);
+      } else {
+         // block is empty
+         assert(!phi);
+         entry = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   }
+}
+
+// Insert @p immediately before @q, keeping the phi/entry markers consistent.
+void
+BasicBlock::insertBefore(Instruction *q, Instruction *p)
+{
+   assert(p && q);
+
+   assert(p->next == 0 && p->prev == 0);
+
+   if (q == entry) {
+      if (p->op == OP_PHI) {
+         if (!phi)
+            phi = p;
+      } else {
+         entry = p; // p becomes the new first non-phi instruction
+      }
+   } else
+   if (q == phi) {
+      assert(p->op == OP_PHI);
+      phi = p;
+   }
+
+   // link p into the list in front of q
+   p->next = q;
+   p->prev = q->prev;
+   if (p->prev)
+      p->prev->next = p;
+   q->prev = p;
+
+   p->bb = this;
+   ++numInsns;
+}
+
+// Insert @q immediately after @p; a phi may only be inserted after a phi.
+void
+BasicBlock::insertAfter(Instruction *p, Instruction *q)
+{
+   assert(p && q);
+   assert(q->op != OP_PHI || p->op == OP_PHI);
+
+   assert(q->next == 0 && q->prev == 0);
+
+   if (p == exit)
+      exit = q;
+   if (p->op == OP_PHI && q->op != OP_PHI)
+      entry = q; // q becomes the first non-phi instruction
+
+   // link q into the list behind p
+   q->prev = p;
+   q->next = p->next;
+   if (q->next)
+      q->next->prev = q;
+   p->next = q;
+
+   q->bb = this;
+   ++numInsns;
+}
+
+// Unlink @insn from this block, keeping the phi/entry/exit markers valid.
+void
+BasicBlock::remove(Instruction *insn)
+{
+   assert(insn->bb == this);
+
+   if (insn->prev)
+      insn->prev->next = insn->next;
+
+   if (insn->next)
+      insn->next->prev = insn->prev;
+   else
+      exit = insn->prev; // removed the last instruction
+
+   if (insn == entry) {
+      // prefer the successor; fall back to the predecessor unless it is a phi
+      if (insn->next)
+         entry = insn->next;
+      else
+      if (insn->prev && insn->prev->op != OP_PHI)
+         entry = insn->prev;
+      else
+         entry = NULL;
+   }
+
+   if (insn == phi)
+      phi = (insn->next && insn->next->op == OP_PHI) ? insn->next : 0;
+
+   --numInsns;
+   insn->bb = NULL;
+   insn->next =
+   insn->prev = NULL;
+}
+
+// Swap two adjacent non-phi instructions within this block; the arguments
+// may be given in either order.
+void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)
+{
+   assert(a->bb == b->bb);
+
+   // normalize so that a directly precedes b
+   if (a->next != b) {
+      Instruction *i = a;
+      a = b;
+      b = i;
+   }
+   assert(a->next == b);
+   assert(a->op != OP_PHI && b->op != OP_PHI);
+
+   if (b == exit)
+      exit = a;
+   if (a == entry)
+      entry = b;
+
+   b->prev = a->prev;
+   a->next = b->next;
+   b->next = a;
+   a->prev = b;
+
+   if (b->prev)
+      b->prev->next = b;
+   // fix the successor's back link; guard on a->next: the previous test of
+   // a->prev was always true (a->prev == b at this point) and dereferenced
+   // a NULL a->next when the pair ended the block
+   if (a->next)
+      a->next->prev = a;
+}
+
+// Move the instruction list starting at @insn (may be NULL) into @bb and
+// transfer all outgoing CFG edges; optionally attach this -> bb with a
+// TREE edge.
+void
+BasicBlock::splitCommon(Instruction *insn, BasicBlock *bb, bool attach)
+{
+   bb->entry = insn;
+
+   if (insn) {
+      exit = insn->prev;
+      insn->prev = NULL;
+   }
+
+   if (exit)
+      exit->next = NULL;
+   else
+      entry = NULL; // all instructions were moved away
+
+   // hand every outgoing edge over to the new block
+   while (!cfg.outgoing(true).end()) {
+      Graph::Edge *e = cfg.outgoing(true).getEdge();
+      bb->cfg.attach(e->getTarget(), e->getType());
+      this->cfg.detach(e->getTarget());
+   }
+
+   // re-home the moved instructions and update counters/exit marker
+   for (; insn; insn = insn->next) {
+      this->numInsns--;
+      bb->numInsns++;
+      insn->bb = bb;
+      bb->exit = insn;
+   }
+   if (attach)
+      this->cfg.attach(&bb->cfg, Graph::Edge::TREE);
+}
+
+// Split this block so that @insn and everything after it moves into a newly
+// created block, which is returned.
+BasicBlock *
+BasicBlock::splitBefore(Instruction *insn, bool attach)
+{
+   assert(!insn || insn->op != OP_PHI);
+
+   BasicBlock *newBB = new BasicBlock(func);
+   splitCommon(insn, newBB, attach);
+   return newBB;
+}
+
+// Split this block so that everything after @insn moves into a newly
+// created block, which is returned; the join point moves with the tail.
+BasicBlock *
+BasicBlock::splitAfter(Instruction *insn, bool attach)
+{
+   assert(!insn || insn->op != OP_PHI);
+
+   BasicBlock *newBB = new BasicBlock(func);
+
+   newBB->joinAt = joinAt;
+   joinAt = NULL;
+
+   splitCommon(insn ? insn->next : NULL, newBB, attach);
+   return newBB;
+}
+
+// Return true if @that dominates this block (walk up the dominator tree
+// from this block until we either hit @that's node or the root).
+bool
+BasicBlock::dominatedBy(BasicBlock *that)
+{
+   Graph::Node *target = &that->dom;
+
+   for (Graph::Node *node = &this->dom; node; node = node->parent())
+      if (node == target)
+         return true;
+
+   return false;
+}
+
+// Check whether this block starts a simple conditional and return a mask of
+// the conditional out blocks: 0x1 for IF { } ENDIF, 0x3 for IF/ELSE/ENDIF,
+// 0x2 when only the right (ELSE-side) successor is conditional, 0x0 when the
+// control flow is more complex.
+unsigned int
+BasicBlock::initiatesSimpleConditional() const
+{
+   Graph::Node *out[2];
+   int n;
+   Graph::Edge::Type eR;
+
+   if (cfg.outgoingCount() != 2) // -> if and -> else/endif
+      return 0x0; // was "return false": this function returns a mask, not bool
+
+   n = 0;
+   for (Graph::EdgeIterator ei = cfg.outgoing(); !ei.end(); ei.next())
+      out[n++] = ei.getNode();
+   // NOTE(review): assumes out[1] has at least one outgoing edge; a block
+   // ending in RET would make this iterator empty -- confirm callers
+   eR = out[1]->outgoing().getType();
+
+   // IF block is out edge to the right
+   if (eR == Graph::Edge::CROSS || eR == Graph::Edge::BACK)
+      return 0x2;
+
+   if (out[1]->outgoingCount() != 1) // 0 is IF { RET; }, >1 is more divergence
+      return 0x0;
+   // do they reconverge immediately ?
+   if (out[1]->outgoing().getNode() == out[0])
+      return 0x1;
+   if (out[0]->outgoingCount() == 1)
+      if (out[0]->outgoing().getNode() == out[1]->outgoing().getNode())
+         return 0x3;
+
+   return 0x0;
+}
+
+// Set the function's entry block (the CFG root); may only succeed once.
+bool
+Function::setEntry(BasicBlock *bb)
+{
+   if (cfg.getRoot() != NULL)
+      return false;
+
+   cfg.insert(&bb->cfg);
+   return true;
+}
+
+// Set the function's exit block; may only succeed once.
+bool
+Function::setExit(BasicBlock *bb)
+{
+   if (cfgExit != NULL)
+      return false;
+
+   cfgExit = &bb->cfg;
+   return true;
+}
+
+// Collect all instructions in CFG order into @result, assigning each its
+// serial number; returns the instruction count.
+unsigned int
+Function::orderInstructions(ArrayList &result)
+{
+   result.clear();
+
+   for (IteratorRef it = cfg.iteratorCFG(); !it->end(); it->next()) {
+      BasicBlock *bb =
+         BasicBlock::get(reinterpret_cast<Graph::Node *>(it->get()));
+
+      for (Instruction *insn = bb->getFirst(); insn; insn = insn->next)
+         result.insert(insn, insn->serial);
+   }
+
+   return result.getSize();
+}
+
+// Compute per-block live sets, starting at the CFG root; iterated once per
+// loop nesting level (presumably so liveness propagates across back edges
+// -- see buildLiveSetsPreSSA).
+void
+Function::buildLiveSets()
+{
+   for (unsigned i = 0; i <= loopNestingBound; ++i)
+      buildLiveSetsPreSSA(BasicBlock::get(cfg.getRoot()), cfg.nextSequence());
+
+   // reset the traversal markers used by the per-block pass
+   for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next())
+      BasicBlock::get(bi)->liveSet.marker = false;
+}
+
+// Compute per-block def sets, starting from the exit block; iterated once
+// per loop nesting level like buildLiveSets.
+void
+Function::buildDefSets()
+{
+   for (unsigned i = 0; i <= loopNestingBound; ++i)
+      buildDefSetsPreSSA(BasicBlock::get(cfgExit), cfg.nextSequence());
+
+   // reset the traversal markers used by the per-block pass
+   for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next())
+      BasicBlock::get(bi)->liveSet.marker = false;
+}
+
+bool
+Pass::run(Program *prog, bool ordered, bool skipPhi)
+{
+ this->prog = prog;
+ err = false;
+ return doRun(prog, ordered, skipPhi);
+}
+
+// Run the pass over every function, following the call graph depth-first;
+// stops early if a function-level run fails.
+bool
+Pass::doRun(Program *prog, bool ordered, bool skipPhi)
+{
+   for (IteratorRef it = prog->calls.iteratorDFS(false);
+        !it->end(); it->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(it->get());
+      if (!doRun(Function::get(n), ordered, skipPhi))
+         return false;
+   }
+   return !err;
+}
+
+bool
+Pass::run(Function *func, bool ordered, bool skipPhi)
+{
+ prog = func->getProgram();
+ err = false;
+ return doRun(func, ordered, skipPhi);
+}
+
+// Visit the function, then each block (CFG order if @ordered, otherwise
+// DFS), then each instruction; @skipPhi starts at the first non-phi.
+bool
+Pass::doRun(Function *func, bool ordered, bool skipPhi)
+{
+   IteratorRef bbIter;
+   BasicBlock *bb;
+   Instruction *insn, *next;
+
+   this->func = func;
+   if (!visit(func))
+      return false;
+
+   bbIter = ordered ? func->cfg.iteratorCFG() : func->cfg.iteratorDFS();
+
+   for (; !bbIter->end(); bbIter->next()) {
+      bb = BasicBlock::get(reinterpret_cast<Graph::Node *>(bbIter->get()));
+      if (!visit(bb))
+         break;
+      // fetch @next up front so visit() may remove the current instruction
+      for (insn = skipPhi ? bb->getEntry() : bb->getFirst(); insn != NULL;
+           insn = next) {
+         next = insn->next;
+         if (!visit(insn))
+            break;
+      }
+   }
+
+   return !err;
+}
+
+// Dump this function's CFG in Graphviz dot format, appending to @filePath.
+void
+Function::printCFGraph(const char *filePath)
+{
+   FILE *out = fopen(filePath, "a");
+   if (!out) {
+      ERROR("failed to open file: %s\n", filePath);
+      return;
+   }
+   INFO("printing control flow graph to: %s\n", filePath);
+
+   fprintf(out, "digraph G {\n");
+
+   for (IteratorRef it = cfg.iteratorDFS(); !it->end(); it->next()) {
+      BasicBlock *bb = BasicBlock::get(
+         reinterpret_cast<Graph::Node *>(it->get()));
+      int idA = bb->getId();
+      for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+         int idB = BasicBlock::get(ei.getNode())->getId();
+         switch (ei.getType()) {
+         case Graph::Edge::TREE:
+            fprintf(out, "\t%i -> %i;\n", idA, idB);
+            break;
+         case Graph::Edge::FORWARD:
+            fprintf(out, "\t%i -> %i [color=green];\n", idA, idB);
+            break;
+         case Graph::Edge::CROSS:
+            fprintf(out, "\t%i -> %i [color=red];\n", idA, idB);
+            break;
+         case Graph::Edge::BACK:
+            // NOTE(review): BACK edges are drawn like TREE edges (no
+            // attribute) -- confirm whether they should be highlighted
+            fprintf(out, "\t%i -> %i;\n", idA, idB);
+            break;
+         case Graph::Edge::DUMMY:
+            fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB);
+            break;
+         default:
+            assert(0);
+            break;
+         }
+      }
+   }
+
+   fprintf(out, "}\n");
+   fclose(out);
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
new file mode 100644
index 00000000000..70e5e226bed
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <cstring>
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+namespace nv50_ir {
+
+// Construct without a program; setProgram() must be called before use.
+BuildUtil::BuildUtil()
+{
+   init(NULL);
+}
+
+// Construct bound to @prog.
+BuildUtil::BuildUtil(Program *prog)
+{
+   init(prog);
+}
+
+// Reset all insertion state and the immediate-value cache, binding @prog.
+void
+BuildUtil::init(Program *prog)
+{
+   this->prog = prog;
+   this->func = NULL;
+   this->bb = NULL;
+   this->pos = NULL;
+
+   immCount = 0;
+   memset(imms, 0, sizeof(imms));
+}
+
+// Cache @imm in the open-addressing hash table so mkImm() can reuse it.
+void
+BuildUtil::addImmediate(ImmediateValue *imm)
+{
+   // stop caching above 3/4 load factor to keep probe chains short
+   if (immCount > (NV50_IR_BUILD_IMM_HT_SIZE * 3) / 4)
+      return;
+
+   unsigned int pos = u32Hash(imm->reg.data.u32);
+
+   // linear probing for a free slot
+   while (imms[pos])
+      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;
+   imms[pos] = imm;
+   immCount++;
+}
+
+// Build a unary instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp1(operation op, DataType ty, Value *dst, Value *src)
+{
+   Instruction *i = new_Instruction(func, op, ty);
+
+   i->setDef(0, dst);
+   i->setSrc(0, src);
+   insert(i);
+
+   return i;
+}
+
+// Build a binary instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp2(operation op, DataType ty, Value *dst,
+                 Value *src0, Value *src1)
+{
+   Instruction *i = new_Instruction(func, op, ty);
+
+   i->setDef(0, dst);
+   i->setSrc(0, src0);
+   i->setSrc(1, src1);
+   insert(i);
+
+   return i;
+}
+
+// Build a ternary instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp3(operation op, DataType ty, Value *dst,
+                 Value *src0, Value *src1, Value *src2)
+{
+   Instruction *i = new_Instruction(func, op, ty);
+   Value *const srcs[3] = { src0, src1, src2 };
+
+   i->setDef(0, dst);
+   for (int s = 0; s < 3; ++s)
+      i->setSrc(s, srcs[s]);
+   insert(i);
+
+   return i;
+}
+
+// Emit an OP_LOAD of @mem into @dst, optionally indirect through @ptr.
+Instruction *
+BuildUtil::mkLoad(DataType ty, Value *dst, Symbol *mem, Value *ptr)
+{
+   Instruction *insn = new_Instruction(func, OP_LOAD, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, mem);
+   if (ptr)
+      insn->setIndirect(0, 0, ptr);
+
+   insert(insn);
+   return insn;
+}
+
+// Emit a store of @stVal to @mem with operation @op (e.g. OP_STORE),
+// optionally indirect through @ptr.
+Instruction *
+BuildUtil::mkStore(operation op, DataType ty, Symbol *mem, Value *ptr,
+                   Value *stVal)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+
+   insn->setSrc(0, mem);
+   insn->setSrc(1, stVal);
+   if (ptr)
+      insn->setIndirect(0, 0, ptr);
+
+   insert(insn);
+   return insn;
+}
+
+// Emit an OP_VFETCH of @ty at @offset in @file, with optional relative
+// attribute (@attrRel) and primitive (@primRel) addressing.
+Instruction *
+BuildUtil::mkFetch(Value *dst, DataType ty, DataFile file, int32_t offset,
+                   Value *attrRel, Value *primRel)
+{
+   Symbol *sym = mkSymbol(file, 0, ty, offset);
+
+   Instruction *insn = mkOp1(OP_VFETCH, ty, dst, sym);
+
+   insn->setIndirect(0, 0, attrRel);
+   insn->setIndirect(0, 1, primRel);
+
+   // already inserted
+   return insn;
+}
+
+// Emit a shader-input interpolation: flat inputs use an integer-typed
+// LINTERP, perspective-corrected inputs use PINTERP.
+Instruction *
+BuildUtil::mkInterp(unsigned mode, Value *dst, int32_t offset, Value *rel)
+{
+   operation op = OP_LINTERP;
+   DataType ty = TYPE_F32;
+
+   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_FLAT)
+      ty = TYPE_U32;
+   else
+   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_PERSPECTIVE)
+      op = OP_PINTERP;
+
+   Symbol *sym = mkSymbol(FILE_SHADER_INPUT, 0, ty, offset);
+
+   Instruction *insn = mkOp1(op, ty, dst, sym);
+   insn->setIndirect(0, 0, rel);
+   return insn;
+}
+
+// Emit a typed MOV from @src to @dst at the current position.
+Instruction *
+BuildUtil::mkMov(Value *dst, Value *src, DataType ty)
+{
+   Instruction *mov = new_Instruction(func, OP_MOV, ty);
+
+   mov->setDef(0, dst);
+   mov->setSrc(0, src);
+   insert(mov);
+
+   return mov;
+}
+
+// Move @src into a GPR with the fixed register id @id.
+Instruction *
+BuildUtil::mkMovToReg(int id, Value *src)
+{
+   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(src->reg.size));
+
+   insn->setDef(0, new_LValue(func, FILE_GPR));
+   insn->getDef(0)->reg.data.id = id;
+   insn->setSrc(0, src);
+
+   insert(insn);
+   return insn;
+}
+
+// Move the GPR with fixed register id @id into @dst.
+Instruction *
+BuildUtil::mkMovFromReg(Value *dst, int id)
+{
+   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(dst->reg.size));
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, new_LValue(func, FILE_GPR));
+   insn->getSrc(0)->reg.data.id = id;
+
+   insert(insn);
+   return insn;
+}
+
+// Emit a conversion carrying both destination and source types.
+Instruction *
+BuildUtil::mkCvt(operation op,
+                 DataType dstTy, Value *dst, DataType srcTy, Value *src)
+{
+   Instruction *cvt = new_Instruction(func, op, dstTy);
+
+   cvt->setType(dstTy, srcTy);
+   cvt->setDef(0, dst);
+   cvt->setSrc(0, src);
+   insert(cvt);
+
+   return cvt;
+}
+
+// Emit comparison @op with condition @cc; destinations in the predicate or
+// flags files are typed U8, all others take @ty.
+CmpInstruction *
+BuildUtil::mkCmp(operation op, CondCode cc, DataType ty, Value *dst,
+                 Value *src0, Value *src1, Value *src2)
+{
+   CmpInstruction *insn = new_CmpInstruction(func, op);
+
+   insn->setType((dst->reg.file == FILE_PREDICATE ||
+                  dst->reg.file == FILE_FLAGS) ? TYPE_U8 : ty, ty);
+   insn->setCondition(cc);
+   insn->setDef(0, dst);
+   insn->setSrc(0, src0);
+   insn->setSrc(1, src1);
+   if (src2)
+      insn->setSrc(2, src2);
+
+   // flags destinations are tracked via flagsDef
+   if (dst->reg.file == FILE_FLAGS)
+      insn->flagsDef = 0;
+
+   insert(insn);
+   return insn;
+}
+
+// Build a texture instruction for target @targ with texture index @tic and
+// sampler index @tsc; NULL entries terminate the def/src lists early.
+TexInstruction *
+BuildUtil::mkTex(operation op, TexTarget targ,
+                 uint16_t tic, uint16_t tsc,
+                 const std::vector<Value *> &def,
+                 const std::vector<Value *> &src)
+{
+   TexInstruction *tex = new_TexInstruction(func, op);
+
+   for (size_t d = 0; d < def.size() && def[d]; ++d)
+      tex->setDef(d, def[d]);
+   for (size_t s = 0; s < src.size() && src[s]; ++s)
+      tex->setSrc(s, src[s]);
+
+   // NOTE(review): setTexture takes uint8_t parameters, so tic/tsc values
+   // above 255 would be truncated here -- confirm the intended range
+   tex->setTexture(targ, tic, tsc);
+
+   insert(tex);
+   return tex;
+}
+
+// Emit an OP_QUADOP with quad operation @q and lane mask @l.
+Instruction *
+BuildUtil::mkQuadop(uint8_t q, Value *def, uint8_t l, Value *src0, Value *src1)
+{
+   Instruction *insn = mkOp2(OP_QUADOP, TYPE_F32, def, src0, src1);
+
+   insn->subOp = q;
+   insn->lanes = l;
+
+   return insn;
+}
+
+// Emit dst = pred ? trSrc : flSrc as two predicated moves joined by a UNION.
+Instruction *
+BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc)
+{
+   LValue *def0 = getSSA();
+   LValue *def1 = getSSA();
+
+   mkMov(def0, trSrc)->setPredicate(CC_P, pred);
+   mkMov(def1, flSrc)->setPredicate(CC_NOT_P, pred);
+
+   return mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1);
+}
+
+// Split @val into two halves of @halfSize bytes, returned in h[0] (low part)
+// and h[1] (high part).  Memory operands are split by offset adjustment;
+// register values via an OP_SPLIT, whose instruction is returned (else NULL).
+Instruction *
+BuildUtil::mkSplit(Value *h[2], uint8_t halfSize, Value *val)
+{
+   Instruction *insn = NULL;
+
+   const DataType fTy = typeOfSize(halfSize * 2);
+
+   // immediates must be materialized in a register first
+   if (val->reg.file == FILE_IMMEDIATE)
+      val = mkMov(getSSA(halfSize * 2), val, fTy)->getDef(0);
+
+   if (isMemoryFile(val->reg.file)) {
+      h[0] = cloneShallow(getFunction(), val);
+      h[1] = cloneShallow(getFunction(), val);
+      h[0]->reg.size = halfSize;
+      h[1]->reg.size = halfSize;
+      h[1]->reg.data.offset += halfSize;
+   } else {
+      h[0] = getSSA(halfSize, val->reg.file);
+      h[1] = getSSA(halfSize, val->reg.file);
+      insn = mkOp1(OP_SPLIT, fTy, h[0], val);
+      insn->setDef(1, h[1]);
+   }
+   return insn;
+}
+
+// Emit a control-flow instruction to @targ, optionally predicated on
+// (@cc, @pred).
+FlowInstruction *
+BuildUtil::mkFlow(operation op, void *targ, CondCode cc, Value *pred)
+{
+   FlowInstruction *flow = new_FlowInstruction(func, op, targ);
+
+   if (pred != NULL)
+      flow->setPredicate(cc, pred);
+
+   insert(flow);
+   return flow;
+}
+
+// Emit NOPs that define (clobber) the registers of file @f selected by bit
+// mask @rMask; @unit is the log2 of the register size unit.
+void
+BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
+{
+   // For each 4-bit mask value this table packs up to two contiguous
+   // (base, size) register ranges as nibbles: bits 0-3 base1, 4-7 size1,
+   // 8-11 base2, 12-15 size2.  E.g. mask 0x5 -> 0x1210: reg 0 (size 1)
+   // and reg 2 (size 1).
+   static const uint16_t baseSize2[16] =
+   {
+      0x0000, 0x0010, 0x0011, 0x0020, 0x0012, 0x1210, 0x1211, 0x1220,
+      0x0013, 0x1310, 0x1311, 0x1320, 0x0022, 0x2210, 0x2211, 0x0040,
+   };
+
+   int base = 0;
+
+   // process the mask one nibble (4 registers) at a time
+   for (; rMask; rMask >>= 4, base += 4) {
+      const uint32_t mask = rMask & 0xf;
+      if (!mask)
+         continue;
+      int base1 = (baseSize2[mask] >> 0) & 0xf;
+      int size1 = (baseSize2[mask] >> 4) & 0xf;
+      int base2 = (baseSize2[mask] >> 8) & 0xf;
+      int size2 = (baseSize2[mask] >> 12) & 0xf;
+      Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL);
+      if (1) { // size1 can't be 0
+         LValue *reg = new_LValue(func, f);
+         reg->reg.size = size1 << unit;
+         reg->reg.data.id = base + base1;
+         insn->setDef(0, reg);
+      }
+      if (size2) {
+         LValue *reg = new_LValue(func, f);
+         reg->reg.size = size2 << unit;
+         reg->reg.data.id = base + base2;
+         insn->setDef(1, reg);
+      }
+   }
+}
+
+// Return a (possibly cached) 32-bit immediate value holding @u.
+ImmediateValue *
+BuildUtil::mkImm(uint32_t u)
+{
+   unsigned int pos = u32Hash(u);
+
+   // linear probing; mirrors the insertion scheme in addImmediate()
+   while (imms[pos] && imms[pos]->reg.data.u32 != u)
+      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;
+
+   ImmediateValue *imm = imms[pos];
+   if (!imm) {
+      imm = new_ImmediateValue(prog, u);
+      addImmediate(imm);
+   }
+   return imm;
+}
+
+// Return a fresh 64-bit immediate holding @u; unlike the 32-bit variant,
+// 64-bit immediates are not cached.
+ImmediateValue *
+BuildUtil::mkImm(uint64_t u)
+{
+   ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0);
+
+   imm->reg.size = 8;
+   imm->reg.type = TYPE_U64;
+   imm->reg.data.u64 = u;
+
+   return imm;
+}
+
+// Return a (possibly cached) immediate holding the bit pattern of @f.
+ImmediateValue *
+BuildUtil::mkImm(float f)
+{
+   // copy the bits instead of a union type pun: same generated code, but
+   // not formally undefined behavior in C++
+   uint32_t u32;
+   memcpy(&u32, &f, sizeof(u32));
+   return mkImm(u32);
+}
+
+// Materialize float immediate @f with a MOV; allocates a scratch if no @dst.
+Value *
+BuildUtil::loadImm(Value *dst, float f)
+{
+   Value *target = dst ? dst : getScratch();
+   return mkOp1v(OP_MOV, TYPE_F32, target, mkImm(f));
+}
+
+// Materialize 32-bit immediate @u with a MOV; allocates a scratch if no @dst.
+Value *
+BuildUtil::loadImm(Value *dst, uint32_t u)
+{
+   Value *target = dst ? dst : getScratch();
+   return mkOp1v(OP_MOV, TYPE_U32, target, mkImm(u));
+}
+
+// Materialize 64-bit immediate @u; the scratch, if needed, is 8 bytes wide.
+Value *
+BuildUtil::loadImm(Value *dst, uint64_t u)
+{
+   Value *target = dst ? dst : getScratch(8);
+   return mkOp1v(OP_MOV, TYPE_U64, target, mkImm(u));
+}
+
+// Create a symbol of type @ty at offset @baseAddr in @file/@fileIndex.
+Symbol *
+BuildUtil::mkSymbol(DataFile file, int8_t fileIndex, DataType ty,
+                    uint32_t baseAddr)
+{
+   Symbol *sym = new_Symbol(prog, file, fileIndex);
+
+   sym->reg.type = ty;
+   sym->reg.size = typeSizeof(ty);
+   sym->setOffset(baseAddr);
+
+   return sym;
+}
+
+// Build a symbol for system value @svName[@svIndex]; only clip distances
+// and tessellation factors may be indexed beyond 4 elements.
+Symbol *
+BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
+{
+   Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);
+
+   assert(svIndex < 4 ||
+          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));
+
+   // pick the register type matching the system value's semantics
+   switch (svName) {
+   case SV_POSITION:
+   case SV_FACE:
+   case SV_YDIR:
+   case SV_POINT_SIZE:
+   case SV_POINT_COORD:
+   case SV_CLIP_DISTANCE:
+   case SV_TESS_FACTOR:
+      sym->reg.type = TYPE_F32;
+      break;
+   default:
+      sym->reg.type = TYPE_U32;
+      break;
+   }
+   sym->reg.size = typeSizeof(sym->reg.type);
+
+   sym->reg.data.sv.sv = svName;
+   sym->reg.data.sv.index = svIndex;
+
+   return sym;
+}
+
+// Describe the array: @len elements of @vecDim components, @eltSize bytes
+// each, located at @base in @file.  Memory-backed arrays get a base symbol;
+// register-only arrays are tracked purely through the ValueMap.
+void
+BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx,
+                            uint32_t base, int len, int vecDim, int eltSize,
+                            DataFile file, int8_t fileIdx)
+{
+   this->array = array;
+   this->arrayIdx = arrayIdx;
+   this->baseAddr = base;
+   this->arrayLen = len;
+   this->vecDim = vecDim;
+   this->eltSize = eltSize;
+   this->file = file;
+   this->regOnly = !isMemoryFile(file);
+
+   if (!regOnly) {
+      baseSym = new_Symbol(up->getProgram(), file, fileIdx);
+      baseSym->setOffset(baseAddr);
+      baseSym->reg.size = eltSize;
+   } else {
+      baseSym = NULL;
+   }
+}
+
+// Get a value to write element (@i, @c) into: register-backed arrays return
+// the tracked LValue, memory-backed arrays just provide a scratch register.
+Value *
+BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
+{
+   if (!regOnly)
+      return up->getScratch();
+
+   Value *v = lookup(m, i, c);
+   if (v)
+      return v;
+   return insert(m, i, c, new_LValue(up->getFunction(), file));
+}
+
+// Read element (@i, @c): register-backed arrays return the tracked value,
+// memory-backed ones emit a load through a cached per-element symbol.
+Value *
+BuildUtil::DataArray::load(ValueMap &m, int i, int c, Value *ptr)
+{
+   if (regOnly) {
+      Value *v = lookup(m, i, c);
+      if (!v)
+         v = insert(m, i, c, new_LValue(up->getFunction(), file));
+
+      return v;
+   } else {
+      Value *sym = lookup(m, i, c);
+      if (!sym)
+         sym = insert(m, i, c, mkSymbol(i, c));
+
+      return up->mkLoadv(typeOfSize(eltSize), static_cast<Symbol *>(sym), ptr);
+   }
+}
+
+// Write @value to element (@i, @c); register-backed arrays only record the
+// value (indirect stores unsupported), memory-backed ones emit an OP_STORE.
+void
+BuildUtil::DataArray::store(ValueMap &m, int i, int c, Value *ptr, Value *value)
+{
+   if (regOnly) {
+      assert(!ptr);
+      if (!lookup(m, i, c))
+         insert(m, i, c, value);
+
+      assert(lookup(m, i, c) == value);
+   } else {
+      Value *sym = lookup(m, i, c);
+      if (!sym)
+         sym = insert(m, i, c, mkSymbol(i, c));
+
+      const DataType stTy = typeOfSize(value->reg.size);
+
+      up->mkStore(OP_STORE, stTy, static_cast<Symbol *>(sym), ptr, value);
+   }
+}
+
+// Build the symbol addressing element (@i, @c), relative to baseSym when
+// one exists.
+Symbol *
+BuildUtil::DataArray::mkSymbol(int i, int c)
+{
+   const unsigned int idx = i * vecDim + c;
+   Symbol *sym = new_Symbol(up->getProgram(), file, 0);
+
+   assert(baseSym || (idx < arrayLen && c < vecDim));
+
+   sym->reg.size = eltSize;
+   sym->reg.type = typeOfSize(eltSize);
+   sym->setAddress(baseSym, baseAddr + idx * eltSize);
+   return sym;
+}
+
+
+// Split a 64-bit MOV/ADD/SUB into two 32-bit instructions after register
+// allocation.  @zero supplies the missing high word of 32-bit sources;
+// @carry is the flags register linking the low/high ADD/SUB halves.
+// Returns the high-half instruction, or NULL if @i cannot be handled.
+Instruction *
+BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
+                              Value *zero,
+                              Value *carry)
+{
+   DataType hTy;
+   int srcNr;
+
+   switch (i->dType) {
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return NULL;
+   }
+
+   switch (i->op) {
+   case OP_MOV: srcNr = 1; break;
+   case OP_ADD:
+   case OP_SUB:
+      if (!carry)
+         return NULL;
+      srcNr = 2;
+      break;
+   default:
+      // TODO when needed
+      return NULL;
+   }
+
+   // turn @i into the low half and append a clone as the high half
+   i->setType(hTy);
+   i->setDef(0, cloneShallow(fn, i->getDef(0)));
+   i->getDef(0)->reg.size = 4;
+   Instruction *lo = i;
+   Instruction *hi = cloneForward(fn, i);
+   lo->bb->insertAfter(lo, hi);
+
+   hi->getDef(0)->reg.data.id++; // high half goes into the next register
+
+   for (int s = 0; s < srcNr; ++s) {
+      if (lo->getSrc(s)->reg.size < 8) {
+         // 32-bit source: high half contributes zero
+         hi->setSrc(s, zero);
+      } else {
+         // don't modify a value other instructions still reference
+         if (lo->getSrc(s)->refCount() > 1)
+            lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
+         lo->getSrc(s)->reg.size /= 2;
+         hi->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
+
+         // advance the high-half source to the upper 32 bits
+         switch (hi->src(s).getFile()) {
+         case FILE_IMMEDIATE:
+            hi->getSrc(s)->reg.data.u64 >>= 32;
+            break;
+         case FILE_MEMORY_CONST:
+         case FILE_MEMORY_SHARED:
+         case FILE_SHADER_INPUT:
+            hi->getSrc(s)->reg.data.offset += 4;
+            break;
+         default:
+            assert(hi->src(s).getFile() == FILE_GPR);
+            hi->getSrc(s)->reg.data.id++;
+            break;
+         }
+      }
+   }
+   if (srcNr == 2) {
+      // chain the carry from the low half into the high half
+      lo->setDef(1, carry);
+      hi->setFlagsSrc(hi->srcCount(), carry);
+   }
+   return hi;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
new file mode 100644
index 00000000000..2305a275d0d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_BUILD_UTIL__
+#define __NV50_IR_BUILD_UTIL__
+
+namespace nv50_ir {
+
+// Helper for constructing IR: tracks an insertion position inside a
+// Program/Function/BasicBlock and provides mk* convenience constructors
+// for instructions, immediates and symbols.
+class BuildUtil
+{
+public:
+   BuildUtil();
+   BuildUtil(Program *);
+
+   inline void setProgram(Program *);
+   inline Program *getProgram() const { return prog; }
+   inline Function *getFunction() const { return func; }
+
+   // keeps inserting at head/tail of block
+   inline void setPosition(BasicBlock *, bool tail);
+   // position advances only if @after is true
+   inline void setPosition(Instruction *, bool after);
+
+   inline BasicBlock *getBB() { return bb; }
+
+   inline void insert(Instruction *);
+   inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); }
+
+   inline LValue *getScratch(int size = 4, DataFile = FILE_GPR);
+   // scratch value for a single assignment:
+   inline LValue *getSSA(int size = 4, DataFile = FILE_GPR);
+
+   inline Instruction *mkOp(operation, DataType, Value *);
+   Instruction *mkOp1(operation, DataType, Value *, Value *);
+   Instruction *mkOp2(operation, DataType, Value *, Value *, Value *);
+   Instruction *mkOp3(operation, DataType, Value *, Value *, Value *, Value *);
+
+   // *v variants return the destination as an LValue
+   LValue *mkOp1v(operation, DataType, Value *, Value *);
+   LValue *mkOp2v(operation, DataType, Value *, Value *, Value *);
+   LValue *mkOp3v(operation, DataType, Value *, Value *, Value *, Value *);
+
+   Instruction *mkLoad(DataType, Value *dst, Symbol *, Value *ptr);
+   Instruction *mkStore(operation, DataType, Symbol *, Value *ptr, Value *val);
+
+   LValue *mkLoadv(DataType, Symbol *, Value *ptr);
+
+   Instruction *mkMov(Value *, Value *, DataType = TYPE_U32);
+   Instruction *mkMovToReg(int id, Value *);
+   Instruction *mkMovFromReg(Value *, int id);
+
+   Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel);
+   Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset,
+                        Value *attrRel, Value *primRel);
+
+   Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
+   CmpInstruction *mkCmp(operation, CondCode, DataType,
+                         Value *,
+                         Value *, Value *, Value * = NULL);
+   TexInstruction *mkTex(operation, TexTarget,
+                         uint16_t tic, uint16_t tsc,
+                         const std::vector<Value *> &def,
+                         const std::vector<Value *> &src);
+   Instruction *mkQuadop(uint8_t qop, Value *, uint8_t l, Value *, Value *);
+
+   FlowInstruction *mkFlow(operation, void *target, CondCode, Value *pred);
+
+   Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc);
+
+   Instruction *mkSplit(Value *half[2], uint8_t halfSize, Value *);
+
+   void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
+
+   ImmediateValue *mkImm(float);
+   ImmediateValue *mkImm(uint32_t);
+   ImmediateValue *mkImm(uint64_t);
+
+   ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
+
+   Value *loadImm(Value *dst, float);
+   Value *loadImm(Value *dst, uint32_t);
+   Value *loadImm(Value *dst, uint64_t);
+
+   Value *loadImm(Value *dst, int i) { return loadImm(dst, (uint32_t)i); }
+
+   // returns high part of the operation
+   static Instruction *split64BitOpPostRA(Function *, Instruction *,
+                                          Value *zero, Value *carry);
+
+   // key identifying one scalar element (i, c) of a register array
+   struct Location
+   {
+      Location(unsigned array, unsigned arrayIdx, unsigned i, unsigned c)
+        : array(array), arrayIdx(arrayIdx), i(i), c(c) { }
+      Location(const Location &l)
+        : array(l.array), arrayIdx(l.arrayIdx), i(l.i), c(l.c) { }
+
+      bool operator==(const Location &l) const
+      {
+         return
+            array == l.array && arrayIdx == l.arrayIdx && i == l.i && c == l.c;
+      }
+
+      // lexicographic order over (array, arrayIdx, i, c) for map keys
+      bool operator<(const Location &l) const
+      {
+         return array != l.array ? array < l.array :
+            arrayIdx != l.arrayIdx ? arrayIdx < l.arrayIdx :
+            i != l.i ? i < l.i :
+            c != l.c ? c < l.c :
+            false;
+      }
+
+      unsigned array, arrayIdx, i, c;
+   };
+
+   typedef bimap<Location, Value *> ValueMap;
+
+   // access helper for an array held either purely in values (regOnly)
+   // or backed by memory via a base symbol
+   class DataArray
+   {
+   public:
+      DataArray(BuildUtil *bld) : up(bld) { }
+
+      void setup(unsigned array, unsigned arrayIdx,
+                 uint32_t base, int len, int vecDim, int eltSize,
+                 DataFile file, int8_t fileIdx);
+
+      inline bool exists(ValueMap&, unsigned int i, unsigned int c);
+
+      Value *load(ValueMap&, int i, int c, Value *ptr);
+      void store(ValueMap&, int i, int c, Value *ptr, Value *value);
+      Value *acquire(ValueMap&, int i, int c);
+
+   private:
+      inline Value *lookup(ValueMap&, unsigned i, unsigned c);
+      inline Value *insert(ValueMap&, unsigned i, unsigned c, Value *v);
+
+      Symbol *mkSymbol(int i, int c);
+
+   private:
+      BuildUtil *up;
+      unsigned array, arrayIdx;
+
+      uint32_t baseAddr;
+      uint32_t arrayLen;
+      Symbol *baseSym;
+
+      uint8_t vecDim;
+      uint8_t eltSize; // in bytes
+
+      DataFile file;
+      bool regOnly;
+   };
+
+   Symbol *mkSymbol(DataFile file, int8_t fileIndex,
+                    DataType ty, uint32_t baseAddress);
+
+   Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);
+
+private:
+   void init(Program *);
+   void addImmediate(ImmediateValue *);
+   inline unsigned int u32Hash(uint32_t);
+
+protected:
+   Program *prog;
+   Function *func;
+   Instruction *pos; // current insertion point (NULL = use block head/tail)
+   BasicBlock *bb;
+   bool tail; // insert after @pos / at block tail
+
+#define NV50_IR_BUILD_IMM_HT_SIZE 256
+
+   // small hash table used to reuse immediate values
+   ImmediateValue *imms[NV50_IR_BUILD_IMM_HT_SIZE];
+   unsigned int immCount;
+};
+
+// Hash a 32-bit word into a bucket index of the immediate table.
+unsigned int BuildUtil::u32Hash(uint32_t u)
+{
+   const unsigned int folded = u % 273;
+   return folded % NV50_IR_BUILD_IMM_HT_SIZE;
+}
+
+// Attach the builder to @program; position state is left untouched.
+void BuildUtil::setProgram(Program *program)
+{
+   this->prog = program;
+}
+
+// Subsequent inserts go to the head or tail of @block.
+void
+BuildUtil::setPosition(BasicBlock *block, bool atTail)
+{
+   bb = block;
+   func = bb->getFunction();
+   prog = bb->getProgram();
+   tail = atTail;
+   pos = NULL;
+}
+
+// Subsequent inserts go before/after instruction @i (position advances
+// only when inserting after).
+void
+BuildUtil::setPosition(Instruction *i, bool after)
+{
+   // validate before dereferencing; the original asserted bb only after
+   // it had already been used, making the check ineffective
+   assert(i && i->bb);
+   bb = i->bb;
+   prog = bb->getProgram();
+   func = bb->getFunction();
+   pos = i;
+   tail = after;
+}
+
+// Allocate a fresh temporary value of @size bytes in register file @f.
+LValue *
+BuildUtil::getScratch(int size, DataFile f)
+{
+   LValue *val = new_LValue(func, f);
+   val->reg.size = size;
+   return val;
+}
+
+// Like getScratch, but the value is flagged as single-assignment.
+LValue *
+BuildUtil::getSSA(int size, DataFile f)
+{
+   LValue *val = new_LValue(func, f);
+   val->reg.size = size;
+   val->ssa = 1;
+   return val;
+}
+
+// Insert @i at the current position; with no position set, append or
+// prepend to the whole block.  Advances @pos only when inserting after.
+void BuildUtil::insert(Instruction *i)
+{
+   if (!pos) {
+      if (tail)
+         bb->insertTail(i);
+      else
+         bb->insertHead(i);
+      return;
+   }
+   if (!tail) {
+      bb->insertBefore(pos, i);
+   } else {
+      bb->insertAfter(pos, i);
+      pos = i;
+   }
+}
+
+// Build and insert an instruction with a single destination and no
+// sources.  Control-flow style operations are marked fixed so they are
+// never removed as dead code.
+Instruction *
+BuildUtil::mkOp(operation op, DataType ty, Value *dst)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+   insn->setDef(0, dst);
+   insert(insn);
+   switch (op) {
+   case OP_DISCARD:
+   case OP_EXIT:
+   case OP_JOIN:
+   case OP_QUADON:
+   case OP_QUADPOP:
+   case OP_EMIT:
+   case OP_RESTART:
+      insn->fixed = 1;
+      break;
+   default:
+      break;
+   }
+   return insn;
+}
+
+// Convenience wrapper around mkOp1 returning the destination as LValue.
+inline LValue *
+BuildUtil::mkOp1v(operation op, DataType ty, Value *dst, Value *src)
+{
+   (void)mkOp1(op, ty, dst, src);
+   return dst->asLValue();
+}
+
+// Convenience wrapper around mkOp2 returning the destination as LValue.
+inline LValue *
+BuildUtil::mkOp2v(operation op, DataType ty, Value *dst,
+                  Value *src0, Value *src1)
+{
+   (void)mkOp2(op, ty, dst, src0, src1);
+   return dst->asLValue();
+}
+
+// Convenience wrapper around mkOp3 returning the destination as LValue.
+inline LValue *
+BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
+                  Value *src0, Value *src1, Value *src2)
+{
+   (void)mkOp3(op, ty, dst, src0, src1, src2);
+   return dst->asLValue();
+}
+
+// Load from @mem (+ optional @ptr) into a fresh scratch value.
+inline LValue *
+BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
+{
+   LValue *res = getScratch();
+   mkLoad(ty, res, mem, ptr);
+   return res;
+}
+
+// A memory-backed array element always "exists"; a register-only array
+// element exists only once a value has been recorded for (i, c).
+bool
+BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c)
+{
+   assert(i < arrayLen && c < vecDim);
+   return !regOnly || m.r.count(Location(array, arrayIdx, i, c));
+}
+
+// Return the value recorded for element (i, c), or NULL if absent.
+Value *
+BuildUtil::DataArray::lookup(ValueMap &m, unsigned i, unsigned c)
+{
+   const Location loc(array, arrayIdx, i, c);
+   ValueMap::r_iterator it = m.r.find(loc);
+   if (it == m.r.end())
+      return NULL;
+   return it->second;
+}
+
+// Record @v for element (i, c) and return it for chaining.
+Value *
+BuildUtil::DataArray::insert(ValueMap &m, unsigned i, unsigned c, Value *v)
+{
+   const Location loc(array, arrayIdx, i, c);
+   m.insert(loc, v);
+   return v;
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_BUILD_UTIL__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
new file mode 100644
index 00000000000..752bad37941
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_DRIVER_H__
+#define __NV50_IR_DRIVER_H__
+
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+/*
+ * This struct constitutes linkage information in TGSI terminology.
+ *
+ * It is created by the code generator and handed to the pipe driver
+ * for input/output slot assignment.
+ */
+/* Describes the slot assignment and interpolation qualifiers of one
+ * shader input/output varying. */
+struct nv50_ir_varying
+{
+   uint8_t slot[4]; /* native slots for xyzw (addresses in 32-bit words) */
+
+   unsigned mask : 4; /* vec4 mask */
+   unsigned linear : 1; /* linearly interpolated if true (and not flat) */
+   unsigned flat : 1; /* flat (no) interpolation */
+   unsigned sc : 1; /* special colour interpolation mode (SHADE_MODEL) */
+   unsigned centroid : 1;
+   unsigned patch : 1; /* patch constant value */
+   unsigned regular : 1; /* driver-specific meaning (e.g. input in sreg) */
+   unsigned input : 1; /* indicates direction of system values */
+   unsigned oread : 1; /* true if output is read from parallel TCP */
+
+   ubyte id; /* TGSI register index */
+   ubyte sn; /* TGSI semantic name */
+   ubyte si; /* TGSI semantic index */
+};
+
+#define NV50_PROGRAM_IR_TGSI 0
+#define NV50_PROGRAM_IR_SM4 1
+#define NV50_PROGRAM_IR_GLSL 2
+#define NV50_PROGRAM_IR_LLVM 3
+
+#ifdef DEBUG
+# define NV50_IR_DEBUG_BASIC (1 << 0)
+# define NV50_IR_DEBUG_VERBOSE (2 << 0)
+# define NV50_IR_DEBUG_REG_ALLOC (1 << 2)
+#else
+# define NV50_IR_DEBUG_BASIC 0
+# define NV50_IR_DEBUG_VERBOSE 0
+# define NV50_IR_DEBUG_REG_ALLOC 0
+#endif
+
+#define NV50_SEMANTIC_CLIPDISTANCE (TGSI_SEMANTIC_COUNT + 0)
+#define NV50_SEMANTIC_VIEWPORTINDEX (TGSI_SEMANTIC_COUNT + 4)
+#define NV50_SEMANTIC_LAYER (TGSI_SEMANTIC_COUNT + 5)
+#define NV50_SEMANTIC_INVOCATIONID (TGSI_SEMANTIC_COUNT + 6)
+#define NV50_SEMANTIC_TESSFACTOR (TGSI_SEMANTIC_COUNT + 7)
+#define NV50_SEMANTIC_TESSCOORD (TGSI_SEMANTIC_COUNT + 8)
+#define NV50_SEMANTIC_SAMPLEMASK (TGSI_SEMANTIC_COUNT + 9)
+#define NV50_SEMANTIC_COUNT (TGSI_SEMANTIC_COUNT + 10)
+
+#define NV50_TESS_PART_FRACT_ODD 0
+#define NV50_TESS_PART_FRACT_EVEN 1
+#define NV50_TESS_PART_POW2 2
+#define NV50_TESS_PART_INTEGER 3
+
+#define NV50_PRIM_PATCHES PIPE_PRIM_MAX
+
+/* Maps a program symbol (label id) to its byte offset in the code. */
+struct nv50_ir_prog_symbol
+{
+   uint32_t label;
+   uint32_t offset;
+};
+
+#define NVISA_GF100_CHIPSET_C0 0xc0
+#define NVISA_GF100_CHIPSET_D0 0xd0
+#define NVISA_GK104_CHIPSET 0xe0
+#define NVISA_GK110_CHIPSET 0xf0
+
+/* Compile job descriptor exchanged between the pipe driver and the code
+ * generator: source IR and target on input, generated binary, slot and
+ * resource-usage information on output. */
+struct nv50_ir_prog_info
+{
+   uint16_t target; /* chipset (0x50, 0x84, 0xc0, ...) */
+
+   uint8_t type; /* PIPE_SHADER */
+
+   uint8_t optLevel; /* optimization level (0 to 3) */
+   uint8_t dbgFlags;
+
+   struct {
+      int16_t maxGPR; /* may be -1 if none used */
+      int16_t maxOutput;
+      uint32_t tlsSpace; /* required local memory per thread */
+      uint32_t *code;
+      uint32_t codeSize;
+      uint8_t sourceRep; /* NV50_PROGRAM_IR */
+      const void *source;
+      void *relocData;
+      struct nv50_ir_prog_symbol *syms;
+      uint16_t numSyms;
+   } bin;
+
+   struct nv50_ir_varying sv[PIPE_MAX_SHADER_INPUTS];
+   struct nv50_ir_varying in[PIPE_MAX_SHADER_INPUTS];
+   struct nv50_ir_varying out[PIPE_MAX_SHADER_OUTPUTS];
+   uint8_t numInputs;
+   uint8_t numOutputs;
+   uint8_t numPatchConstants; /* also included in numInputs/numOutputs */
+   uint8_t numSysVals;
+
+   struct {
+      uint32_t *buf; /* for IMMEDIATE_ARRAY */
+      uint16_t bufSize; /* size of immediate array */
+      uint16_t count; /* count of inline immediates */
+      uint32_t *data; /* inline immediate data */
+      uint8_t *type; /* for each vec4 (128 bit) */
+   } immd;
+
+   /* per-shader-stage properties */
+   union {
+      struct {
+         uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+      } vp;
+      struct {
+         uint8_t inputPatchSize;
+         uint8_t outputPatchSize;
+         uint8_t partitioning; /* PIPE_TESS_PART */
+         int8_t winding; /* +1 (clockwise) / -1 (counter-clockwise) */
+         uint8_t domain; /* PIPE_PRIM_{QUADS,TRIANGLES,LINES} */
+         uint8_t outputPrim; /* PIPE_PRIM_{TRIANGLES,LINES,POINTS} */
+      } tp;
+      struct {
+         uint8_t inputPrim;
+         uint8_t outputPrim;
+         unsigned instanceCount;
+         unsigned maxVertices;
+      } gp;
+      struct {
+         unsigned numColourResults;
+         boolean writesDepth;
+         boolean earlyFragTests;
+         boolean separateFragData;
+         boolean usesDiscard;
+      } fp;
+      struct {
+         uint32_t inputOffset; /* base address for user args */
+         uint32_t sharedOffset; /* reserved space in s[] */
+         uint32_t gridInfoBase; /* base address for NTID,NCTAID */
+      } cp;
+   } prop;
+
+   uint8_t numBarriers;
+
+   struct {
+      uint8_t clipDistance; /* index of first clip distance output */
+      uint8_t clipDistanceMask; /* mask of clip distances defined */
+      uint8_t cullDistanceMask; /* clip distance mode (1 bit per output) */
+      int8_t genUserClip; /* request user clip planes for ClipVertex */
+      uint16_t ucpBase; /* base address for UCPs */
+      uint8_t ucpCBSlot; /* constant buffer index of UCP data */
+      uint8_t pointSize; /* output index for PointSize */
+      uint8_t instanceId; /* system value index of InstanceID */
+      uint8_t vertexId; /* system value index of VertexID */
+      uint8_t edgeFlagIn;
+      uint8_t edgeFlagOut;
+      uint8_t fragDepth; /* output index of FragDepth */
+      uint8_t sampleMask; /* output index of SampleMask */
+      uint8_t backFaceColor[2]; /* input/output indices of back face colour */
+      uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */
+      boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */
+      uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */
+      uint16_t texBindBase; /* base address for tex handles (nve4) */
+      uint16_t suInfoBase; /* base address for surface info (nve4) */
+      uint8_t msInfoCBSlot; /* cX[] used for multisample info */
+      uint16_t msInfoBase; /* base address for multisample info */
+   } io;
+
+   /* driver callback to assign input/output locations */
+   int (*assignSlots)(struct nv50_ir_prog_info *);
+
+   void *driverPriv;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int nv50_ir_generate_code(struct nv50_ir_prog_info *);
+
+extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
+ uint32_t codePos,
+ uint32_t libPos,
+ uint32_t dataPos);
+
+/* obtain code that will be shared among programs */
+extern void nv50_ir_get_target_library(uint32_t chipset,
+ const uint32_t **code, uint32_t *size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __NV50_IR_DRIVER_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
new file mode 100644
index 00000000000..ac59187130c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -0,0 +1,1682 @@
+/*
+ * Copyright 2012 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+// CodeEmitter for GK110 encoding of the Fermi/Kepler ISA.
+
+namespace nv50_ir {
+
+// Binary encoder for the GK110 variant of the Kepler ISA.  The emit*
+// methods pack one instruction at a time into code[].
+class CodeEmitterGK110 : public CodeEmitter
+{
+public:
+   CodeEmitterGK110(const TargetNVC0 *);
+
+   virtual bool emitInstruction(Instruction *);
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+   virtual void prepareEmission(Function *);
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+private:
+   const TargetNVC0 *targNVC0;
+
+   Program::Type progType;
+
+   const bool writeIssueDelays;
+
+private:
+   // instruction-form helpers: pack opcode and operand fields
+   void emitForm_21(const Instruction *, uint32_t opc2, uint32_t opc1);
+   void emitForm_C(const Instruction *, uint32_t opc, uint8_t ctg);
+   void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier);
+
+   void emitPredicate(const Instruction *);
+
+   // operand field encoders
+   void setCAddress14(const ValueRef&);
+   void setShortImmediate(const Instruction *, const int s);
+   void setImmediate32(const Instruction *, const int s, Modifier);
+
+   void modNegAbsF32_3b(const Instruction *, const int s);
+
+   void emitCondCode(CondCode cc, int pos, uint8_t mask);
+   void emitInterpMode(const Instruction *);
+   void emitLoadStoreType(DataType ty, const int pos);
+   void emitCachingMode(CacheMode c, const int pos);
+
+   inline uint8_t getSRegEncoding(const ValueRef&);
+
+   void emitRoundMode(RoundMode, const int pos, const int rintPos);
+   void emitRoundModeF(RoundMode, const int pos);
+   void emitRoundModeI(RoundMode, const int pos);
+
+   void emitNegAbs12(const Instruction *);
+
+   void emitNOP(const Instruction *);
+
+   // per-operation emitters
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitVFETCH(const Instruction *);
+   void emitEXPORT(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   void emitUADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitIMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitISAD(const Instruction *);
+   void emitFMAD(const Instruction *);
+
+   void emitNOT(const Instruction *);
+   void emitLogicOp(const Instruction *, uint8_t subOp);
+   void emitPOPC(const Instruction *);
+   void emitINSBF(const Instruction *);
+   void emitShift(const Instruction *);
+
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitCVT(const Instruction *);
+   void emitMINMAX(const Instruction *);
+   void emitPreOp(const Instruction *);
+
+   void emitSET(const CmpInstruction *);
+   void emitSLCT(const CmpInstruction *);
+   void emitSELP(const Instruction *);
+
+   void emitTEXBAR(const Instruction *);
+   void emitTEX(const TexInstruction *);
+   void emitTEXCSAA(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
+
+   void emitFlow(const Instruction *);
+
+   // helpers writing register ids / addresses at a given bit position
+   inline void defId(const ValueDef&, const int pos);
+   inline void srcId(const ValueRef&, const int pos);
+   inline void srcId(const ValueRef *, const int pos);
+   inline void srcId(const Instruction *, int s, const int pos);
+
+   inline void srcAddr32(const ValueRef&, const int pos); // address / 4
+
+   inline bool isLIMM(const ValueRef&, DataType ty, bool mod = false);
+};
+
+#define GK110_GPR_ZERO 255
+
+#define NEG_(b, s) \
+ if (i->src(s).mod.neg()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+#define ABS_(b, s) \
+ if (i->src(s).mod.abs()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define NOT_(b, s) if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT)) \
+ code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define FTZ_(b) if (i->ftz) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define SAT_(b) if (i->saturate) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define RND_(b, t) emitRoundMode##t(i->rnd, 0x##b)
+
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+// Encode the source's register id at bit @pos; an unset source encodes
+// as GK110_GPR_ZERO (255).
+void CodeEmitterGK110::srcId(const ValueRef& src, const int pos)
+{
+   code[pos / 32] |= (src.get() ? SDATA(src).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+// Pointer variant of srcId; a NULL reference encodes as GK110_GPR_ZERO.
+void CodeEmitterGK110::srcId(const ValueRef *src, const int pos)
+{
+   code[pos / 32] |= (src ? SDATA(*src).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+// Encode source @s of @insn at bit @pos, or GK110_GPR_ZERO if it is absent.
+void CodeEmitterGK110::srcId(const Instruction *insn, int s, int pos)
+{
+   int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : GK110_GPR_ZERO;
+   code[pos / 32] |= r << (pos % 32);
+}
+
+// Encode the source's byte offset divided by 4 (32-bit word address).
+void CodeEmitterGK110::srcAddr32(const ValueRef& src, const int pos)
+{
+   code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32);
+}
+
+// Encode the destination's register id at bit @pos (GK110_GPR_ZERO if unset).
+void CodeEmitterGK110::defId(const ValueDef& def, const int pos)
+{
+   code[pos / 32] |= (def.get() ? DDATA(def).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+// True if @ref is an immediate that needs the long-immediate (LIMM) form:
+// for F32 the short encoding drops the low 12 bits, for integers it holds
+// only 20 bits, so any payload outside those ranges forces LIMM.
+// NOTE(review): the @mod parameter is currently unused — confirm whether
+// modifiers were meant to be applied before the range check.
+bool CodeEmitterGK110::isLIMM(const ValueRef& ref, DataType ty, bool mod)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+
+   return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
+}
+
+// Encode a rounding mode in 2 bits at @pos; the *I (round-to-integer)
+// variants additionally set a flag bit at @rintPos (skipped if < 0).
+void
+CodeEmitterGK110::emitRoundMode(RoundMode rnd, const int pos, const int rintPos)
+{
+   bool rint = false;
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_MI: rint = true; /* fall through */ case ROUND_M: n = 1; break;
+   case ROUND_PI: rint = true; /* fall through */ case ROUND_P: n = 2; break;
+   case ROUND_ZI: rint = true; /* fall through */ case ROUND_Z: n = 3; break;
+   default:
+      rint = rnd == ROUND_NI;
+      n = 0;
+      assert(rnd == ROUND_N || rnd == ROUND_NI);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+   if (rint && rintPos >= 0)
+      code[rintPos / 32] |= 1 << (rintPos % 32);
+}
+
+// Encode a floating-point rounding mode (non-integer variants only) in
+// 2 bits at @pos.
+void
+CodeEmitterGK110::emitRoundModeF(RoundMode rnd, const int pos)
+{
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_M: n = 1; break;
+   case ROUND_P: n = 2; break;
+   case ROUND_Z: n = 3; break;
+   default:
+      n = 0;
+      assert(rnd == ROUND_N);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+}
+
+// Encode a round-to-integer mode in 2 bits at @pos.
+void
+CodeEmitterGK110::emitRoundModeI(RoundMode rnd, const int pos)
+{
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_MI: n = 1; break;
+   case ROUND_PI: n = 2; break;
+   case ROUND_ZI: n = 3; break;
+   default:
+      n = 0;
+      assert(rnd == ROUND_NI);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+}
+
+// Encode condition code @cc as a 5-bit field (restricted by @mask) at @pos.
+void CodeEmitterGK110::emitCondCode(CondCode cc, int pos, uint8_t mask)
+{
+   uint8_t n;
+
+   switch (cc) {
+   case CC_FL: n = 0x00; break;
+   case CC_LT: n = 0x01; break;
+   case CC_EQ: n = 0x02; break;
+   case CC_LE: n = 0x03; break;
+   case CC_GT: n = 0x04; break;
+   case CC_NE: n = 0x05; break;
+   case CC_GE: n = 0x06; break;
+   case CC_LTU: n = 0x09; break;
+   case CC_EQU: n = 0x0a; break;
+   case CC_LEU: n = 0x0b; break;
+   case CC_GTU: n = 0x0c; break;
+   case CC_NEU: n = 0x0d; break;
+   case CC_GEU: n = 0x0e; break;
+   case CC_TR: n = 0x0f; break;
+   case CC_NO: n = 0x10; break;
+   case CC_NC: n = 0x11; break;
+   case CC_NS: n = 0x12; break;
+   case CC_NA: n = 0x13; break;
+   case CC_A: n = 0x14; break;
+   case CC_S: n = 0x15; break;
+   case CC_C: n = 0x16; break;
+   case CC_O: n = 0x17; break;
+   default:
+      n = 0;
+      assert(!"invalid condition code");
+      break;
+   }
+   code[pos / 32] |= (n & mask) << (pos % 32);
+}
+
+// Encode the predicate field at bits 18.. of code[0]: predicate register
+// id, bit 3 of the field set for negation, value 7 when unpredicated.
+void
+CodeEmitterGK110::emitPredicate(const Instruction *i)
+{
+   if (i->predSrc >= 0) {
+      srcId(i->src(i->predSrc), 18);
+      if (i->cc == CC_NOT_P)
+         code[0] |= 8 << 18; // negate
+      assert(i->getPredicate()->reg.file == FILE_PREDICATE);
+   } else {
+      code[0] |= 7 << 18; // always-true predicate
+   }
+}
+
+// Encode a 14-bit constant-buffer address (in 32-bit words), split across
+// code[0] (low 9 bits at bit 23) and code[1] (high 5 bits).
+void
+CodeEmitterGK110::setCAddress14(const ValueRef& src)
+{
+   const int32_t addr = src.get()->asSym()->reg.data.offset / 4;
+
+   code[0] |= (addr & 0x01ff) << 23;
+   code[1] |= (addr & 0x3e00) >> 9;
+}
+
+// Encode source @s as a short (20-bit) immediate.  F32/F64 store the most
+// significant bits (the dropped low bits must be zero), integers must fit
+// a sign-extended 20-bit value; the asserts check representability.
+void
+CodeEmitterGK110::setShortImmediate(const Instruction *i, const int s)
+{
+   const uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32;
+   const uint64_t u64 = i->getSrc(s)->asImm()->reg.data.u64;
+
+   if (i->sType == TYPE_F32) {
+      assert(!(u32 & 0x00000fff));
+      code[0] |= ((u32 & 0x001ff000) >> 12) << 23;
+      code[1] |= ((u32 & 0x7fe00000) >> 21);
+      code[1] |= ((u32 & 0x80000000) >> 4); // sign bit
+   } else
+   if (i->sType == TYPE_F64) {
+      assert(!(u64 & 0x00000fffffffffffULL));
+      code[0] |= ((u64 & 0x001ff00000000000ULL) >> 44) << 23;
+      code[1] |= ((u64 & 0x7fe0000000000000ULL) >> 53);
+      code[1] |= ((u64 & 0x8000000000000000ULL) >> 36); // sign bit
+   } else {
+      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      code[0] |= (u32 & 0x001ff) << 23;
+      code[1] |= (u32 & 0x7fe00) >> 9;
+      code[1] |= (u32 & 0x80000) << 8; // sign bit
+   }
+}
+
+// Encode source @s as a full 32-bit (long) immediate starting at bit 23,
+// after applying modifier @mod to the value.
+void
+CodeEmitterGK110::setImmediate32(const Instruction *i, const int s,
+                                 Modifier mod)
+{
+   uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32;
+
+   if (mod) {
+      ImmediateValue imm(i->getSrc(s)->asImm(), i->sType);
+      mod.applyTo(imm);
+      u32 = imm.reg.data.u32;
+   }
+
+   code[0] |= u32 << 23;
+   code[1] |= u32 >> 9;
+}
+
+// Emit the long-immediate instruction form: category @ctg in code[0],
+// opcode @opc in code[1], dst at bit 2, GPR sources at bits 10/42, and
+// a 32-bit immediate (with @mod applied) at bit 23.
+void
+CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg,
+                             Modifier mod)
+{
+   code[0] = ctg;
+   code[1] = opc << 20;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_GPR:
+         srcId(i->src(s), s ? 42 : 10);
+         break;
+      case FILE_IMMEDIATE:
+         setImmediate32(i, s, mod);
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+
+// Emit a form whose single source comes from either constant memory
+// (c[] address) or a GPR; the file is selected in the top bits of code[1].
+void
+CodeEmitterGK110::emitForm_C(const Instruction *i, uint32_t opc, uint8_t ctg)
+{
+   code[0] = ctg;
+   code[1] = opc << 20;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_CONST:
+      code[1] |= 0x4 << 28;
+      setCAddress14(i->src(0));
+      break;
+   case FILE_GPR:
+      code[1] |= 0xc << 28;
+      srcId(i->src(0), 23);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// 0x2 for GPR, c[] and 0x1 for short immediate
+// Emit the common 2/3-source form: opcode @opc2 for GPR/c[] operands,
+// @opc1 when src1 is a short immediate.  Source-file selector in the top
+// nibble of code[1]: 0xc = rrr, 0x8 = rrc, 0x4 = rcr (0x0 is invalid).
+void
+CodeEmitterGK110::emitForm_21(const Instruction *i, uint32_t opc2,
+                              uint32_t opc1)
+{
+   const bool imm = i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE;
+
+   // src1 moves to bit 42 if src2 occupies the c[] slot
+   int s1 = 23;
+   if (i->srcExists(2) && i->src(2).getFile() == FILE_MEMORY_CONST)
+      s1 = 42;
+
+   if (imm) {
+      code[0] = 0x1;
+      code[1] = opc1 << 20;
+   } else {
+      code[0] = 0x2;
+      code[1] = (0xc << 28) | (opc2 << 20);
+   }
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_MEMORY_CONST:
+         // clear the corresponding "register" bit to mark a c[] operand
+         code[1] &= (s == 2) ? ~(0x4 << 28) : ~(0x8 << 28);
+         setCAddress14(i->src(s));
+         code[1] |= i->getSrc(s)->reg.fileIndex << 5;
+         break;
+      case FILE_IMMEDIATE:
+         setShortImmediate(i, s);
+         break;
+      case FILE_GPR:
+         srcId(i->src(s), s ? ((s == 2) ? 42 : s1) : 10);
+         break;
+      default:
+         // ignore here, can be predicate or flags, but must not be address
+         break;
+      }
+   }
+   // 0x0 = invalid
+   // 0xc = rrr
+   // 0x8 = rrc
+   // 0x4 = rcr
+   assert(imm || (code[1] & (0xc << 28)));
+}
+
+// Apply neg/abs modifiers of source @s in the 3b (register/c[]) encoding:
+// abs clears bit 27 of code[1], neg toggles it.
+inline void
+CodeEmitterGK110::modNegAbsF32_3b(const Instruction *i, const int s)
+{
+   if (i->src(s).mod.abs()) code[1] &= ~(1 << 27);
+   if (i->src(s).mod.neg()) code[1] ^= (1 << 27);
+}
+
+// Emit a NOP; called with NULL to emit a padding NOP with an
+// unconditional, fixed predicate field.
+void
+CodeEmitterGK110::emitNOP(const Instruction *i)
+{
+   code[0] = 0x00003c02;
+   code[1] = 0x85800000;
+
+   if (i)
+      emitPredicate(i);
+   else
+      code[0] = 0x001c3c02;
+}
+
+// Emit FMA (dst = src0 * src1 + src2).  Negation of the product is
+// encoded in a different bit for the short-immediate form (code[0] & 1).
+void
+CodeEmitterGK110::emitFMAD(const Instruction *i)
+{
+   assert(!isLIMM(i->src(1), TYPE_F32));
+
+   emitForm_21(i, 0x0c0, 0x940);
+
+   NEG_(34, 2);
+   SAT_(35);
+   RND_(36, F);
+   FTZ_(38);
+
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   if (code[0] & 0x1) {
+      if (neg1)
+         code[1] ^= 1 << 27;
+   } else
+   if (neg1) {
+      code[1] |= 1 << 19;
+   }
+}
+
+// Emit floating-point multiply, using the long-immediate form when the
+// second source does not fit a short immediate.
+void
+CodeEmitterGK110::emitFMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(i->postFactor >= -3 && i->postFactor <= 3);
+
+   if (isLIMM(i->src(1), TYPE_F32)) {
+      emitForm_L(i, 0x200, 0x2, Modifier(0));
+
+      FTZ_(38);
+      SAT_(3a);
+      if (neg)
+         code[1] ^= 1 << 22;
+
+      assert(i->postFactor == 0);
+   } else {
+      emitForm_21(i, 0x234, 0xc34);
+
+      RND_(2a, F);
+      FTZ_(2f);
+      SAT_(35);
+
+      // product negation is encoded differently for the short-imm form
+      if (code[0] & 0x1) {
+         if (neg)
+            code[1] ^= 1 << 27;
+      } else
+      if (neg) {
+         code[1] |= 1 << 19;
+      }
+   }
+}
+
+// Emit integer multiply; subOp MUL_HIGH selects the upper 32 bits of the
+// product, sType S32 selects signed multiplication.
+void
+CodeEmitterGK110::emitIMUL(const Instruction *i)
+{
+   assert(!i->src(0).mod.neg() && !i->src(1).mod.neg());
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+
+   if (isLIMM(i->src(1), TYPE_S32)) {
+      emitForm_L(i, 0x280, 2, Modifier(0));
+
+      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
+
+      if (i->sType == TYPE_S32)
+         code[1] |= 3 << 25;
+   } else {
+      emitForm_21(i, 0x21c, 0xc1c);
+
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 10;
+      if (i->sType == TYPE_S32)
+         code[1] |= 3 << 11;
+   }
+}
+
+// Emit floating-point add with neg/abs modifiers; the long-immediate form
+// supports neither rounding modes nor saturation.
+void
+CodeEmitterGK110::emitFADD(const Instruction *i)
+{
+   if (isLIMM(i->src(1), TYPE_F32)) {
+      assert(i->rnd == ROUND_N);
+      assert(!i->saturate);
+
+      emitForm_L(i, 0x400, 0, i->src(1).mod);
+
+      FTZ_(3a);
+      NEG_(3b, 0);
+      ABS_(39, 0);
+   } else {
+      emitForm_21(i, 0x22c, 0xc2c);
+
+      FTZ_(2f);
+      RND_(2a, F);
+      ABS_(31, 0);
+      NEG_(33, 0);
+
+      // src1 modifiers are encoded differently for the short-imm form
+      if (code[0] & 0x1) {
+         modNegAbsF32_3b(i, 1);
+      } else {
+         ABS_(34, 1);
+         NEG_(30, 1);
+      }
+   }
+}
+
+// Emit integer add/subtract with optional carry-in (flagsSrc), carry-out
+// (second def) and saturation; source negation selects the sub variants.
+void
+CodeEmitterGK110::emitUADD(const Instruction *i)
+{
+   uint8_t addOp = (i->src(0).mod.neg() << 1) | i->src(1).mod.neg();
+
+   if (i->op == OP_SUB)
+      addOp ^= 1;
+
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+
+   if (isLIMM(i->src(1), TYPE_S32)) {
+      emitForm_L(i, 0x400, 1, Modifier((addOp & 1) ? NV50_IR_MOD_NEG : 0));
+
+      if (addOp & 2)
+         code[1] |= 1 << 27;
+
+      // the LIMM form cannot encode carry in/out
+      assert(!i->defExists(1));
+      assert(i->flagsSrc < 0);
+
+      SAT_(39);
+   } else {
+      emitForm_21(i, 0x208, 0xc08);
+
+      assert(addOp != 3); // would be add-plus-one
+
+      code[1] |= addOp << 19;
+
+      if (i->defExists(1))
+         code[1] |= 1 << 18; // write carry
+      if (i->flagsSrc >= 0)
+         code[1] |= 1 << 14; // add carry
+
+      SAT_(35);
+   }
+}
+
+// TODO: shl-add
+// Emit integer multiply-add (dst = src0 * src1 + src2); negation of the
+// addend/product is encoded in the addOp field.
+void
+CodeEmitterGK110::emitIMAD(const Instruction *i)
+{
+   uint8_t addOp =
+      (i->src(2).mod.neg() << 1) | (i->src(0).mod.neg() ^ i->src(1).mod.neg());
+
+   emitForm_21(i, 0x100, 0xa00);
+
+   assert(addOp != 3);
+   code[1] |= addOp << 26;
+
+   if (i->sType == TYPE_S32)
+      code[1] |= (1 << 19) | (1 << 24);
+
+   if (code[0] & 0x1) {
+      // short-immediate form: no sub-operations supported
+      assert(!i->subOp);
+      SAT_(39);
+   } else {
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 25;
+      SAT_(35);
+   }
+}
+
+// Emit sum-of-absolute-differences; bit 19 selects the signed variant.
+void
+CodeEmitterGK110::emitISAD(const Instruction *i)
+{
+   assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
+
+   emitForm_21(i, 0x1fc, 0xb74);
+
+   if (i->dType == TYPE_S32)
+      code[1] |= 1 << 19;
+}
+
+// Emit bitwise NOT, encoded as a logic op: dst = logop(0, ~src).
+void
+CodeEmitterGK110::emitNOT(const Instruction *i)
+{
+ code[0] = 0x0003fc02; // logop(mov2) dst, 0, not src
+ code[1] = 0x22003800;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 2);
+
+ switch (i->src(0).getFile()) {
+ case FILE_GPR:
+ code[1] |= 0xc << 28;
+ srcId(i->src(0), 23);
+ break;
+ case FILE_MEMORY_CONST:
+ code[1] |= 0x4 << 28;
+ // NOTE(review): the switch is on src(0) but the address encoded
+ // here is src(1)'s — looks like it should be src(0); confirm
+ // against the nvc0 emitter / hardware docs.
+ setCAddress14(i->src(1));
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+// Emit AND/OR/XOR (subOp 0/1/2).  A NOT modifier on src0 is not
+// supported yet, and src1's NOT is only encodable in the non-LIMM
+// form (checked by the trailing assert).
+void
+CodeEmitterGK110::emitLogicOp(const Instruction *i, uint8_t subOp)
+{
+ assert(!(i->src(0).mod & Modifier(NV50_IR_MOD_NOT))); // XXX: find me
+
+ if (isLIMM(i->src(1), TYPE_S32)) {
+ emitForm_L(i, 0x200, 0, i->src(1).mod);
+ code[1] |= subOp << 24;
+ } else {
+ emitForm_21(i, 0x220, 0xc20);
+ code[1] |= subOp << 12;
+ NOT_(2b, 1);
+ }
+ assert(!(code[0] & 0x1) || !(i->src(1).mod & Modifier(NV50_IR_MOD_NOT)));
+}
+
+void
+CodeEmitterGK110::emitPOPC(const Instruction *i)
+{
+ assert(!isLIMM(i->src(1), TYPE_S32, true));
+
+ emitForm_21(i, 0x204, 0xc04);
+
+ NOT_(2a, 0);
+ if (!(code[0] & 0x1))
+ NOT_(2b, 1);
+}
+
+void
+CodeEmitterGK110::emitINSBF(const Instruction *i)
+{
+ emitForm_21(i, 0x1f8, 0xb78);
+}
+
+// Emit SHL/SHR.  Signed right shifts use the plain SHR opcode; the
+// unsigned and left variants are encoded as funnel shifts (RSHF/LSHF)
+// with GK110_GPR_ZERO as the third operand.
+void
+CodeEmitterGK110::emitShift(const Instruction *i)
+{
+ const bool sar = i->op == OP_SHR && isSignedType(i->sType);
+
+ if (sar) {
+ emitForm_21(i, 0x214, 0x014);
+ code[1] |= 1 << 19;
+ } else
+ if (i->op == OP_SHR) {
+ // this is actually RSHF
+ emitForm_21(i, 0x27c, 0x87c);
+ code[1] |= GK110_GPR_ZERO << 10;
+ } else {
+ // this is actually LSHF
+ emitForm_21(i, 0x1fc, 0xb7c);
+ code[1] |= GK110_GPR_ZERO << 10;
+ }
+
+ if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP) {
+ if (!sar)
+ code[1] |= 1 << 21;
+ // XXX: find wrap modifier for SHR S32
+ }
+}
+
+void
+CodeEmitterGK110::emitPreOp(const Instruction *i)
+{
+ emitForm_21(i, 0x248, -1);
+
+ if (i->op == OP_PREEX2)
+ code[1] |= 1 << 10;
+
+ NEG_(30, 0);
+ ABS_(34, 0);
+}
+
+void
+CodeEmitterGK110::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+ code[0] = 0x00000002 | (subOp << 23);
+ code[1] = 0x84000000;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 2);
+ srcId(i->src(0), 10);
+
+ NEG_(33, 0);
+ ABS_(31, 0);
+
+ // XXX: find saturate
+}
+
+void
+CodeEmitterGK110::emitMINMAX(const Instruction *i)
+{
+ uint32_t op2, op1;
+
+ switch (i->dType) {
+ case TYPE_U32:
+ case TYPE_S32:
+ op2 = 0x210;
+ op1 = 0xc10;
+ break;
+ case TYPE_F32:
+ op2 = 0x230;
+ op1 = 0xc30;
+ break;
+ case TYPE_F64:
+ op2 = 0x228;
+ op1 = 0xc28;
+ break;
+ default:
+ assert(0);
+ op2 = 0;
+ op1 = 0;
+ break;
+ }
+ emitForm_21(i, op2, op1);
+
+ if (i->dType == TYPE_S32)
+ code[1] |= 1 << 19;
+ code[1] |= (i->op == OP_MIN) ? 0x1c00 : 0x3c00; // [!]pt
+
+ FTZ_(2f);
+ ABS_(31, 0);
+ NEG_(33, 0);
+ if (code[0] & 0x1) {
+ modNegAbsF32_3b(i, 1);
+ } else {
+ ABS_(34, 1);
+ NEG_(30, 1);
+ }
+}
+
+// Emit a conversion (F2F/F2I/I2F/I2I).  The unary ops that lower to a
+// conversion (CEIL/FLOOR/TRUNC/SAT/NEG/ABS) override the rounding,
+// saturate, neg and abs fields accordingly.
+void
+CodeEmitterGK110::emitCVT(const Instruction *i)
+{
+ const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+ const bool f2i = !isFloatType(i->dType) && isFloatType(i->sType);
+ const bool i2f = isFloatType(i->dType) && !isFloatType(i->sType);
+
+ bool sat = i->saturate;
+ bool abs = i->src(0).mod.abs();
+ bool neg = i->src(0).mod.neg();
+
+ RoundMode rnd = i->rnd;
+
+ switch (i->op) {
+ case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break;
+ case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break;
+ case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+ case OP_SAT: sat = true; break;
+ case OP_NEG: neg = !neg; break;
+ case OP_ABS: abs = true; neg = false; break;
+ default:
+ break;
+ }
+
+ uint32_t op;
+
+ if (f2f) op = 0x254;
+ else if (f2i) op = 0x258;
+ else if (i2f) op = 0x25c;
+ else op = 0x260; // i2i
+
+ emitForm_C(i, op, 0x2);
+
+ FTZ_(2f);
+ if (neg) code[1] |= 1 << 16;
+ if (abs) code[1] |= 1 << 20;
+ if (sat) code[1] |= 1 << 21;
+
+ emitRoundMode(rnd, 32 + 10, f2f ? (32 + 13) : -1);
+
+ // operand sizes (log2 bytes) and signedness
+ code[0] |= typeSizeofLog2(i->dType) << 10;
+ code[0] |= typeSizeofLog2(i->sType) << 12;
+
+ if (isSignedIntType(i->dType))
+ code[0] |= 0x4000;
+ if (isSignedIntType(i->sType))
+ code[0] |= 0x8000;
+}
+
+void
+CodeEmitterGK110::emitSET(const CmpInstruction *i)
+{
+ uint16_t op1, op2;
+
+ if (i->def(0).getFile() == FILE_PREDICATE) {
+ switch (i->sType) {
+ case TYPE_F32: op2 = 0x1d8; op1 = 0xb58; break;
+ case TYPE_F64: op2 = 0x1c0; op1 = 0xb40; break;
+ default:
+ op2 = 0x1b0;
+ op1 = 0xb30;
+ break;
+ }
+ emitForm_21(i, op2, op1);
+
+ NEG_(2e, 0);
+ ABS_(9, 0);
+ if (!(code[0] & 0x1)) {
+ NEG_(8, 1);
+ ABS_(2f, 1);
+ } else {
+ modNegAbsF32_3b(i, 1);
+ }
+ FTZ_(32);
+
+ // normal DST field is negated predicate result
+ code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0);
+ if (i->defExists(1))
+ defId(i->def(1), 2);
+ else
+ code[0] |= 0x1c;
+ } else {
+ switch (i->sType) {
+ case TYPE_F32: op2 = 0x000; op1 = 0x820; break;
+ case TYPE_F64: op2 = 0x080; op1 = 0x900; break;
+ default:
+ op2 = 0x1a8;
+ op1 = 0xb28;
+ break;
+ }
+ emitForm_21(i, op2, op1);
+
+ NEG_(2e, 0);
+ ABS_(39, 0);
+ if (!(code[0] & 0x1)) {
+ NEG_(38, 1);
+ ABS_(2f, 1);
+ } else {
+ modNegAbsF32_3b(i, 1);
+ }
+ FTZ_(3a);
+ }
+ if (i->sType == TYPE_S32)
+ code[1] |= 1 << 19;
+
+ if (i->op != OP_SET) {
+ switch (i->op) {
+ case OP_SET_AND: code[1] |= 0x0 << 16; break;
+ case OP_SET_OR: code[1] |= 0x1 << 16; break;
+ case OP_SET_XOR: code[1] |= 0x2 << 16; break;
+ default:
+ assert(0);
+ break;
+ }
+ srcId(i->src(2), 0x2a);
+ } else {
+ code[1] |= 0x7 << 10;
+ }
+ emitCondCode(i->setCond,
+ isFloatType(i->sType) ? 0x33 : 0x34,
+ isFloatType(i->sType) ? 0xf : 0x7);
+}
+
+void
+CodeEmitterGK110::emitSLCT(const CmpInstruction *i)
+{
+ CondCode cc = i->setCond;
+ if (i->src(2).mod.neg())
+ cc = reverseCondCode(cc);
+
+ if (i->dType == TYPE_F32) {
+ emitForm_21(i, 0x1d0, 0xb50);
+ FTZ_(32);
+ emitCondCode(cc, 0x33, 0xf);
+ } else {
+ emitForm_21(i, 0x1a4, 0xb20);
+ emitCondCode(cc, 0x34, 0x7);
+ }
+}
+
+void CodeEmitterGK110::emitSELP(const Instruction *i)
+{
+ emitForm_21(i, 0x250, 0x050);
+
+ if ((i->cc == CC_NOT_P) ^ (bool)(i->src(2).mod & Modifier(NV50_IR_MOD_NOT)))
+ code[1] |= 1 << 13;
+}
+
+void CodeEmitterGK110::emitTEXBAR(const Instruction *i)
+{
+ code[0] = 0x00000002 | (i->subOp << 23);
+ code[1] = 0x77000000;
+
+ emitPredicate(i);
+}
+
+void CodeEmitterGK110::emitTEXCSAA(const TexInstruction *i)
+{
+ emitNOP(i); // TODO
+}
+
+// Check whether the following instruction is a texture op that does
+// not read this one's destination, so this TEX may be emitted in "t"
+// (no dependency barrier) mode instead of "p" mode.
+static inline bool
+isNextIndependentTex(const TexInstruction *i)
+{
+ if (!i->next || !isTextureOp(i->next->op))
+ return false;
+ if (i->getDef(0)->interfers(i->next->getSrc(0)))
+ return false;
+ return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1));
+}
+
+void
+CodeEmitterGK110::emitTEX(const TexInstruction *i)
+{
+ const bool ind = i->tex.rIndirectSrc >= 0;
+
+ if (ind) {
+ code[0] = 0x00000002;
+ switch (i->op) {
+ case OP_TXD:
+ code[1] = 0x7e000000;
+ break;
+ default:
+ code[1] = 0x7d800000;
+ break;
+ }
+ } else {
+ switch (i->op) {
+ case OP_TXD:
+ code[0] = 0x00000002;
+ code[1] = 0x76000000;
+ break;
+ default:
+ code[0] = 0x00000001;
+ code[1] = 0x60000000;
+ break;
+ }
+ code[1] |= i->tex.r << 15;
+ }
+
+ code[1] |= isNextIndependentTex(i) ? 0x1 : 0x2; // t : p mode
+
+ // if (i->tex.liveOnly)
+ // ?
+
+ switch (i->op) {
+ case OP_TEX: break;
+ case OP_TXB: code[1] |= 0x2000; break;
+ case OP_TXL: code[1] |= 0x3000; break;
+ case OP_TXF: break; // XXX
+ case OP_TXG: break; // XXX
+ case OP_TXD: break;
+ default:
+ assert(!"invalid texture op");
+ break;
+ }
+ /*
+ if (i->op == OP_TXF) {
+ if (!i->tex.levelZero)
+ code[1] |= 0x02000000;
+ } else */
+ if (i->tex.levelZero) {
+ code[1] |= 0x1000;
+ }
+
+ // if (i->op != OP_TXD && i->tex.derivAll)
+ // code[1] |= 1 << 13;
+
+ emitPredicate(i);
+
+ code[1] |= i->tex.mask << 2;
+
+ const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+ defId(i->def(0), 2);
+ srcId(i->src(0), 10);
+ srcId(i, src1, 23);
+
+ // if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
+
+ // texture target:
+ code[1] |= (i->tex.target.isCube() ? 3 : (i->tex.target.getDim() - 1)) << 7;
+ if (i->tex.target.isArray())
+ code[1] |= 0x40;
+ // if (i->tex.target.isShadow())
+ // ?
+ // if (i->tex.target == TEX_TARGET_2D_MS ||
+ // i->tex.target == TEX_TARGET_2D_MS_ARRAY)
+ // ?
+
+ if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) {
+ // ?
+ }
+
+ // if (i->tex.useOffsets)
+ // ?
+}
+
+void
+CodeEmitterGK110::emitTXQ(const TexInstruction *i)
+{
+ emitNOP(i); // TODO
+}
+
+void
+CodeEmitterGK110::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
+{
+ emitNOP(i); // TODO
+}
+
+// Emit a control-flow instruction (branches, calls, exits, the
+// pre-push ops, quad-mode toggles).  /mask/ records which optional
+// fields the op uses: bit 0 = predicate, bit 1 = branch target.
+// Many of the opcode values are still guesses (marked XXX).
+void
+CodeEmitterGK110::emitFlow(const Instruction *i)
+{
+ const FlowInstruction *f = i->asFlow();
+
+ unsigned mask; // bit 0: predicate, bit 1: target
+
+ code[0] = 0x00000000;
+
+ switch (i->op) {
+ case OP_BRA:
+ code[1] = f->absolute ? 0x00000 : 0x12000000; // XXX
+ // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+ // code[0] |= 0x4000;
+ mask = 3;
+ break;
+ case OP_CALL:
+ code[1] = f->absolute ? 0x00000 : 0x13000000; // XXX
+ // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+ // code[0] |= 0x4000;
+ mask = 2;
+ break;
+
+ case OP_EXIT: code[1] = 0x18000000; mask = 1; break;
+ case OP_RET: code[1] = 0x19000000; mask = 1; break;
+ case OP_DISCARD: code[1] = 0x19800000; mask = 1; break; // XXX: guess
+ case OP_BREAK: code[1] = 0x1a800000; mask = 1; break; // XXX: guess
+ case OP_CONT: code[1] = 0x1b000000; mask = 1; break; // XXX: guess
+
+ case OP_JOINAT: code[1] = 0x14800000; mask = 2; break;
+ case OP_PREBREAK: code[1] = 0x15000000; mask = 2; break; // XXX: guess
+ case OP_PRECONT: code[1] = 0x15800000; mask = 2; break; // XXX: guess
+ case OP_PRERET: code[1] = 0x16000000; mask = 2; break; // XXX: guess
+
+ case OP_QUADON: code[1] = 0x1c000000; mask = 0; break; // XXX: guess
+ case OP_QUADPOP: code[1] = 0x1c800000; mask = 0; break; // XXX: guess
+ case OP_BRKPT: code[1] = 0x1d000000; mask = 0; break; // XXX: guess
+ default:
+ assert(!"invalid flow operation");
+ return;
+ }
+
+ if (mask & 1) {
+ emitPredicate(i);
+ // no predicate source: use the "always" encoding
+ if (i->flagsSrc < 0)
+ code[0] |= 0x3c;
+ }
+
+ if (!f)
+ return;
+
+ // TODO
+ /*
+ if (f->allWarp)
+ code[0] |= 1 << 15;
+ if (f->limit)
+ code[0] |= 1 << 16;
+ */
+
+ if (f->op == OP_CALL) {
+ if (f->builtin) {
+ // built-in targets are resolved at upload time via relocations
+ assert(f->absolute);
+ uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin);
+ addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xff800000, 23);
+ addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x007fffff, -9);
+ } else {
+ assert(!f->absolute);
+ // PC-relative, measured from the end of this 8-byte instruction
+ int32_t pcRel = f->target.fn->binPos - (codeSize + 8);
+ code[0] |= (pcRel & 0x1ff) << 23;
+ code[1] |= (pcRel >> 9) & 0x7fff;
+ }
+ } else
+ if (mask & 2) {
+ int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+ // currently we don't want absolute branches
+ assert(!f->absolute);
+ code[0] |= (pcRel & 0x1ff) << 23;
+ code[1] |= (pcRel >> 9) & 0x7fff;
+ }
+}
+
+void
+CodeEmitterGK110::emitPFETCH(const Instruction *i)
+{
+ emitNOP(i); // TODO
+}
+
+void
+CodeEmitterGK110::emitVFETCH(const Instruction *i)
+{
+ uint32_t offset = i->src(0).get()->reg.data.offset;
+
+ code[0] = 0x00000002 | (offset << 23);
+ code[1] = 0x7ec00000 | (offset >> 9);
+
+#if 0
+ if (i->perPatch)
+ code[0] |= 0x100;
+ if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+ code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
+#endif
+
+ emitPredicate(i);
+
+ defId(i->def(0), 2);
+ srcId(i->src(0).getIndirect(0), 10);
+ srcId(i->src(0).getIndirect(1), 32 + 10); // vertex address
+}
+
+void
+CodeEmitterGK110::emitEXPORT(const Instruction *i)
+{
+ uint32_t offset = i->src(0).get()->reg.data.offset;
+
+ code[0] = 0x00000002 | (offset << 23);
+ code[1] = 0x7f000000 | (offset >> 9);
+
+#if 0
+ if (i->perPatch)
+ code[0] |= 0x100;
+#endif
+
+ emitPredicate(i);
+
+ assert(i->src(1).getFile() == FILE_GPR);
+
+ srcId(i->src(0).getIndirect(0), 10);
+ srcId(i->src(0).getIndirect(1), 32 + 10); // vertex base address
+ srcId(i->src(1), 2);
+}
+
+void
+CodeEmitterGK110::emitOUT(const Instruction *i)
+{
+ emitNOP(i); // TODO
+}
+
+void
+CodeEmitterGK110::emitInterpMode(const Instruction *i)
+{
+ code[1] |= i->ipa << 21; // TODO: INTERP_SAMPLEID
+}
+
+void
+CodeEmitterGK110::emitINTERP(const Instruction *i)
+{
+ const uint32_t base = i->getSrc(0)->reg.data.offset;
+
+ code[0] = 0x00000002 | (base << 31);
+ code[1] = 0x74800000 | (base >> 1);
+
+ if (i->saturate)
+ code[1] |= 1 << 18;
+
+ if (i->op == OP_PINTERP)
+ srcId(i->src(1), 23);
+ else
+ code[0] |= 0xff << 23;
+
+ srcId(i->src(0).getIndirect(0), 10);
+ emitInterpMode(i);
+
+ emitPredicate(i);
+ defId(i->def(0), 2);
+
+ if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
+ srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 32 + 10);
+ else
+ code[1] |= 0xff << 10;
+}
+
+// Encode the 3-bit ld/st data type/size field at bit position /pos/:
+// 0 = u8, 1 = s8, 2 = u16, 3 = s16, 4 = 32-bit, 5 = 64-bit, 6 = 128-bit.
+void
+CodeEmitterGK110::emitLoadStoreType(DataType ty, const int pos)
+{
+ uint8_t n;
+
+ switch (ty) {
+ case TYPE_U8:
+ n = 0;
+ break;
+ case TYPE_S8:
+ n = 1;
+ break;
+ case TYPE_U16:
+ n = 2;
+ break;
+ case TYPE_S16:
+ n = 3;
+ break;
+ case TYPE_F32:
+ case TYPE_U32:
+ case TYPE_S32:
+ n = 4;
+ break;
+ case TYPE_F64:
+ case TYPE_U64:
+ case TYPE_S64:
+ n = 5;
+ break;
+ case TYPE_B128:
+ n = 6;
+ break;
+ default:
+ n = 0;
+ assert(!"invalid ld/st type");
+ break;
+ }
+ code[pos / 32] |= n << (pos % 32);
+}
+
+// Encode the 2-bit caching mode field (ca/cg/cs/cv) at bit /pos/.
+void
+CodeEmitterGK110::emitCachingMode(CacheMode c, const int pos)
+{
+ uint8_t n;
+
+ switch (c) {
+ case CACHE_CA:
+// case CACHE_WB:
+ n = 0;
+ break;
+ case CACHE_CG:
+ n = 1;
+ break;
+ case CACHE_CS:
+ n = 2;
+ break;
+ case CACHE_CV:
+// case CACHE_WT:
+ n = 3;
+ break;
+ default:
+ n = 0;
+ assert(!"invalid caching mode");
+ break;
+ }
+ code[pos / 32] |= n << (pos % 32);
+}
+
+// Emit a store to global, local or shared memory.  Global stores use
+// a different base encoding (code[0] bit 1 clear), so the type and
+// caching-mode fields sit at different positions.
+void
+CodeEmitterGK110::emitSTORE(const Instruction *i)
+{
+ int32_t offset = SDATA(i->src(0)).offset;
+
+ switch (i->src(0).getFile()) {
+ case FILE_MEMORY_GLOBAL: code[1] = 0xe0000000; code[0] = 0x00000000; break;
+ case FILE_MEMORY_LOCAL: code[1] = 0x7a800000; code[0] = 0x00000002; break;
+ case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
+ default:
+ assert(!"invalid memory file");
+ break;
+ }
+
+ // local/shared offsets are limited to 24 bits
+ if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
+ offset &= 0xffffff;
+
+ if (code[0] & 0x2) {
+ emitLoadStoreType(i->dType, 0x33);
+ if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
+ emitCachingMode(i->cache, 0x2f);
+ } else {
+ emitLoadStoreType(i->dType, 0x38);
+ emitCachingMode(i->cache, 0x3b);
+ }
+ code[0] |= offset << 23;
+ code[1] |= offset >> 9;
+
+ emitPredicate(i);
+
+ srcId(i->src(1), 2);
+ srcId(i->src(0).getIndirect(0), 10);
+}
+
+// Emit a load from global, local, shared or constant memory.
+// Non-indirect 32-bit const-buffer loads take the MOV-from-c[] fast
+// path instead of an LDC-style encoding.
+void
+CodeEmitterGK110::emitLOAD(const Instruction *i)
+{
+ int32_t offset = SDATA(i->src(0)).offset;
+
+ switch (i->src(0).getFile()) {
+ case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break;
+ case FILE_MEMORY_LOCAL: code[1] = 0x7a000000; code[0] = 0x00000002; break;
+ case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
+ case FILE_MEMORY_CONST:
+ if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
+ emitMOV(i);
+ return;
+ }
+ offset &= 0xffff;
+ code[0] = 0x00000002;
+ code[1] = 0x7c800000 | (i->src(0).get()->reg.fileIndex << 7);
+ break;
+ default:
+ assert(!"invalid memory file");
+ break;
+ }
+
+ if (code[0] & 0x2) {
+ // long form: 24-bit offset, fields at different positions
+ offset &= 0xffffff;
+ emitLoadStoreType(i->dType, 0x33);
+ if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
+ emitCachingMode(i->cache, 0x2f);
+ } else {
+ emitLoadStoreType(i->dType, 0x38);
+ emitCachingMode(i->cache, 0x3b);
+ }
+ code[0] |= offset << 23;
+ code[1] |= offset >> 9;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 2);
+ srcId(i->src(0).getIndirect(0), 10);
+}
+
+// Map a system-value source (SV_*) to the special-register index used
+// by the S2R encoding; indexed values (TID/CTAID/...) add sv.index.
+uint8_t
+CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
+{
+ switch (SDATA(ref).sv.sv) {
+ case SV_LANEID: return 0x00;
+ case SV_PHYSID: return 0x03;
+ case SV_VERTEX_COUNT: return 0x10;
+ case SV_INVOCATION_ID: return 0x11;
+ case SV_YDIR: return 0x12;
+ case SV_TID: return 0x21 + SDATA(ref).sv.index;
+ case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
+ case SV_NTID: return 0x29 + SDATA(ref).sv.index;
+ case SV_GRIDID: return 0x2c;
+ case SV_NCTAID: return 0x2d + SDATA(ref).sv.index;
+ case SV_LBASE: return 0x34;
+ case SV_SBASE: return 0x30;
+ case SV_CLOCK: return 0x50 + SDATA(ref).sv.index;
+ default:
+ assert(!"no sreg for system value");
+ return 0;
+ }
+}
+
+// Emit a move: from a special register (S2R), a 32-bit immediate,
+// a predicate (not implemented yet), or the generic register /
+// const-buffer form.
+void
+CodeEmitterGK110::emitMOV(const Instruction *i)
+{
+ if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
+ code[0] = 0x00000002 | (getSRegEncoding(i->src(0)) << 23);
+ code[1] = 0x86400000;
+ emitPredicate(i);
+ defId(i->def(0), 2);
+ } else
+ if (i->src(0).getFile() == FILE_IMMEDIATE) {
+ code[0] = 0x00000002 | (i->lanes << 14);
+ code[1] = 0x74000000;
+ emitPredicate(i);
+ defId(i->def(0), 2);
+ setImmediate32(i, 0, Modifier(0));
+ } else
+ if (i->src(0).getFile() == FILE_PREDICATE) {
+ // TODO
+ } else {
+ emitForm_C(i, 0x24c, 2);
+ code[1] |= i->lanes << 10;
+ }
+}
+
+// Emit one 8-byte instruction into the output buffer.
+//
+// With software scheduling (writeIssueDelays), the first 8-byte slot
+// of every 64-byte block is a scheduling "instruction" whose bytes
+// hold the issue-delay codes of the following seven instructions;
+// that slot is emitted here on demand and back-patched per slot id.
+//
+// Returns false if the instruction cannot be encoded or the output
+// buffer is exhausted.
+bool
+CodeEmitterGK110::emitInstruction(Instruction *insn)
+{
+ const unsigned int size = (writeIssueDelays && !(codeSize & 0x3f)) ? 16 : 8;
+
+ if (insn->encSize != 8) {
+ ERROR("skipping unencodable instruction: ");
+ insn->print();
+ return false;
+ } else
+ if (codeSize + size > codeSizeLimit) {
+ ERROR("code emitter output buffer too small\n");
+ return false;
+ }
+
+ if (writeIssueDelays) {
+ // slot index within the current 64-byte block; -1 means we are at
+ // the start and must first emit the scheduling slot itself
+ int id = (codeSize & 0x3f) / 8 - 1;
+ if (id < 0) {
+ id += 1;
+ code[0] = 0x00000000; // cf issue delay "instruction"
+ code[1] = 0x08000000;
+ code += 2;
+ codeSize += 8;
+ }
+ uint32_t *data = code - (id * 2 + 2);
+
+ switch (id) {
+ case 0: data[0] |= insn->sched << 2; break;
+ case 1: data[0] |= insn->sched << 10; break;
+ case 2: data[0] |= insn->sched << 18; break;
+ case 3: data[0] |= insn->sched << 26; data[1] |= insn->sched >> 6; break;
+ // bug fix: this case was missing its break and fell through,
+ // also OR'ing slot 4's sched code into slot 5's field
+ case 4: data[1] |= insn->sched << 2; break;
+ case 5: data[1] |= insn->sched << 10; break;
+ case 6: data[1] |= insn->sched << 18; break;
+ default:
+ assert(0);
+ break;
+ }
+ }
+
+ // assert that instructions with multiple defs don't corrupt registers
+ for (int d = 0; insn->defExists(d); ++d)
+ assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
+
+ switch (insn->op) {
+ case OP_MOV:
+ case OP_RDSV:
+ emitMOV(insn);
+ break;
+ case OP_NOP:
+ break;
+ case OP_LOAD:
+ emitLOAD(insn);
+ break;
+ case OP_STORE:
+ emitSTORE(insn);
+ break;
+ case OP_LINTERP:
+ case OP_PINTERP:
+ emitINTERP(insn);
+ break;
+ case OP_VFETCH:
+ emitVFETCH(insn);
+ break;
+ case OP_EXPORT:
+ emitEXPORT(insn);
+ break;
+ case OP_PFETCH:
+ emitPFETCH(insn);
+ break;
+ case OP_EMIT:
+ case OP_RESTART:
+ emitOUT(insn);
+ break;
+ case OP_ADD:
+ case OP_SUB:
+ if (isFloatType(insn->dType))
+ emitFADD(insn);
+ else
+ emitUADD(insn);
+ break;
+ case OP_MUL:
+ if (isFloatType(insn->dType))
+ emitFMUL(insn);
+ else
+ emitIMUL(insn);
+ break;
+ case OP_MAD:
+ case OP_FMA:
+ if (isFloatType(insn->dType))
+ emitFMAD(insn);
+ else
+ emitIMAD(insn);
+ break;
+ case OP_SAD:
+ emitISAD(insn);
+ break;
+ case OP_NOT:
+ emitNOT(insn);
+ break;
+ case OP_AND:
+ emitLogicOp(insn, 0);
+ break;
+ case OP_OR:
+ emitLogicOp(insn, 1);
+ break;
+ case OP_XOR:
+ emitLogicOp(insn, 2);
+ break;
+ case OP_SHL:
+ case OP_SHR:
+ emitShift(insn);
+ break;
+ case OP_SET:
+ case OP_SET_AND:
+ case OP_SET_OR:
+ case OP_SET_XOR:
+ emitSET(insn->asCmp());
+ break;
+ case OP_SELP:
+ emitSELP(insn);
+ break;
+ case OP_SLCT:
+ emitSLCT(insn->asCmp());
+ break;
+ case OP_MIN:
+ case OP_MAX:
+ emitMINMAX(insn);
+ break;
+ case OP_ABS:
+ case OP_NEG:
+ case OP_CEIL:
+ case OP_FLOOR:
+ case OP_TRUNC:
+ case OP_CVT:
+ case OP_SAT:
+ emitCVT(insn);
+ break;
+ case OP_RSQ:
+ emitSFnOp(insn, 5);
+ break;
+ case OP_RCP:
+ emitSFnOp(insn, 4);
+ break;
+ case OP_LG2:
+ emitSFnOp(insn, 3);
+ break;
+ case OP_EX2:
+ emitSFnOp(insn, 2);
+ break;
+ case OP_SIN:
+ emitSFnOp(insn, 1);
+ break;
+ case OP_COS:
+ emitSFnOp(insn, 0);
+ break;
+ case OP_PRESIN:
+ case OP_PREEX2:
+ emitPreOp(insn);
+ break;
+ case OP_TEX:
+ case OP_TXB:
+ case OP_TXL:
+ case OP_TXD:
+ case OP_TXF:
+ emitTEX(insn->asTex());
+ break;
+ case OP_TXQ:
+ emitTXQ(insn->asTex());
+ break;
+ case OP_TEXBAR:
+ emitTEXBAR(insn);
+ break;
+ case OP_BRA:
+ case OP_CALL:
+ case OP_PRERET:
+ case OP_RET:
+ case OP_DISCARD:
+ case OP_EXIT:
+ case OP_PRECONT:
+ case OP_CONT:
+ case OP_PREBREAK:
+ case OP_BREAK:
+ case OP_JOINAT:
+ case OP_BRKPT:
+ case OP_QUADON:
+ case OP_QUADPOP:
+ emitFlow(insn);
+ break;
+ case OP_QUADOP:
+ emitQUADOP(insn, insn->subOp, insn->lanes);
+ break;
+ case OP_DFDX:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4);
+ break;
+ case OP_DFDY:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x5a : 0xa5, 0x5);
+ break;
+ case OP_POPCNT:
+ emitPOPC(insn);
+ break;
+ case OP_JOIN:
+ emitNOP(insn);
+ insn->join = 1;
+ break;
+ case OP_PHI:
+ case OP_UNION:
+ case OP_CONSTRAINT:
+ ERROR("operation should have been eliminated\n");
+ return false;
+ case OP_EXP:
+ case OP_LOG:
+ case OP_SQRT:
+ case OP_POW:
+ ERROR("operation should have been lowered\n");
+ return false;
+ default:
+ ERROR("unknown op\n");
+ return false;
+ }
+
+ if (insn->join)
+ code[0] |= 1 << 22;
+
+ code += 2;
+ codeSize += 8;
+ return true;
+}
+
+uint32_t
+CodeEmitterGK110::getMinEncodingSize(const Instruction *i) const
+{
+ // No more short instruction encodings.
+ return 8;
+}
+
+void
+CodeEmitterGK110::prepareEmission(Function *func)
+{
+ const Target *targ = func->getProgram()->getTarget();
+
+ CodeEmitter::prepareEmission(func);
+
+ if (targ->hasSWSched)
+ calculateSchedDataNVC0(targ, func);
+}
+
+CodeEmitterGK110::CodeEmitterGK110(const TargetNVC0 *target)
+ : CodeEmitter(target),
+ targNVC0(target),
+ writeIssueDelays(target->hasSWSched)
+{
+ code = NULL;
+ codeSize = codeSizeLimit = 0;
+ relocInfo = NULL;
+}
+
+CodeEmitter *
+TargetNVC0::createCodeEmitterGK110(Program::Type type)
+{
+ CodeEmitterGK110 *emit = new CodeEmitterGK110(this);
+ emit->setProgramType(type);
+ return emit;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
new file mode 100644
index 00000000000..3eca27d0bbc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -0,0 +1,1962 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+#define NV50_OP_ENC_LONG 0
+#define NV50_OP_ENC_SHORT 1
+#define NV50_OP_ENC_IMM 2
+#define NV50_OP_ENC_LONG_ALT 3
+
+class CodeEmitterNV50 : public CodeEmitter
+{
+public:
+ CodeEmitterNV50(const TargetNV50 *);
+
+ virtual bool emitInstruction(Instruction *);
+
+ virtual uint32_t getMinEncodingSize(const Instruction *) const;
+
+ inline void setProgramType(Program::Type pType) { progType = pType; }
+
+ virtual void prepareEmission(Function *);
+
+private:
+ Program::Type progType;
+
+ const TargetNV50 *targNV50;
+
+private:
+ inline void defId(const ValueDef&, const int pos);
+ inline void srcId(const ValueRef&, const int pos);
+ inline void srcId(const ValueRef *, const int pos);
+
+ inline void srcAddr16(const ValueRef&, bool adj, const int pos);
+ inline void srcAddr8(const ValueRef&, const int pos);
+
+ void emitFlagsRd(const Instruction *);
+ void emitFlagsWr(const Instruction *);
+
+ void emitCondCode(CondCode cc, DataType ty, int pos);
+
+ inline void setARegBits(unsigned int);
+
+ void setAReg16(const Instruction *, int s);
+ void setImmediate(const Instruction *, int s);
+
+ void setDst(const Value *);
+ void setDst(const Instruction *, int d);
+ void setSrcFileBits(const Instruction *, int enc);
+ void setSrc(const Instruction *, unsigned int s, int slot);
+
+ void emitForm_MAD(const Instruction *);
+ void emitForm_ADD(const Instruction *);
+ void emitForm_MUL(const Instruction *);
+ void emitForm_IMM(const Instruction *);
+
+ void emitLoadStoreSizeLG(DataType ty, int pos);
+ void emitLoadStoreSizeCS(DataType ty);
+
+ void roundMode_MAD(const Instruction *);
+ void roundMode_CVT(RoundMode);
+
+ void emitMNeg12(const Instruction *);
+
+ void emitLOAD(const Instruction *);
+ void emitSTORE(const Instruction *);
+ void emitMOV(const Instruction *);
+ void emitNOP();
+ void emitINTERP(const Instruction *);
+ void emitPFETCH(const Instruction *);
+ void emitOUT(const Instruction *);
+
+ void emitUADD(const Instruction *);
+ void emitAADD(const Instruction *);
+ void emitFADD(const Instruction *);
+ void emitIMUL(const Instruction *);
+ void emitFMUL(const Instruction *);
+ void emitFMAD(const Instruction *);
+ void emitIMAD(const Instruction *);
+ void emitISAD(const Instruction *);
+
+ void emitMINMAX(const Instruction *);
+
+ void emitPreOp(const Instruction *);
+ void emitSFnOp(const Instruction *, uint8_t subOp);
+
+ void emitShift(const Instruction *);
+ void emitARL(const Instruction *, unsigned int shl);
+ void emitLogicOp(const Instruction *);
+ void emitNOT(const Instruction *);
+
+ void emitCVT(const Instruction *);
+ void emitSET(const Instruction *);
+
+ void emitTEX(const TexInstruction *);
+ void emitTXQ(const TexInstruction *);
+ void emitTEXPREP(const TexInstruction *);
+
+ void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp);
+
+ void emitFlow(const Instruction *, uint8_t flowOp);
+ void emitPRERETEmu(const FlowInstruction *);
+ void emitBAR(const Instruction *);
+
+ void emitATOM(const Instruction *);
+};
+
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+void CodeEmitterNV50::srcId(const ValueRef& src, const int pos)
+{
+ assert(src.get());
+ code[pos / 32] |= SDATA(src).id << (pos % 32);
+}
+
+void CodeEmitterNV50::srcId(const ValueRef *src, const int pos)
+{
+ assert(src->get());
+ code[pos / 32] |= SDATA(*src).id << (pos % 32);
+}
+
+// Encode a signed 16-bit address/offset for /src/ at bit /pos/.
+// If /adj/ is set, the byte offset is divided by the register size
+// (only valid for sizes <= 4); negative offsets are masked down to
+// the (possibly narrowed) field width.
+void CodeEmitterNV50::srcAddr16(const ValueRef& src, bool adj, const int pos)
+{
+ assert(src.get());
+
+ int32_t offset = SDATA(src).offset;
+
+ assert(!adj || src.get()->reg.size <= 4);
+ if (adj)
+ offset /= src.get()->reg.size;
+
+ assert(offset <= 0x7fff && offset >= (int32_t)-0x8000 && (pos % 32) <= 16);
+
+ if (offset < 0)
+ offset &= adj ? (0xffff >> (src.get()->reg.size >> 1)) : 0xffff;
+
+ code[pos / 32] |= offset << (pos % 32);
+}
+
+void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos)
+{
+ assert(src.get());
+
+ uint32_t offset = SDATA(src).offset;
+
+ assert((offset <= 0x1fc || offset == 0x3fc) && !(offset & 0x3));
+
+ code[pos / 32] |= (offset >> 2) << (pos % 32);
+}
+
+void CodeEmitterNV50::defId(const ValueDef& def, const int pos)
+{
+ assert(def.get() && def.getFile() != FILE_SHADER_OUTPUT);
+
+ code[pos / 32] |= DDATA(def).id << (pos % 32);
+}
+
+void
+CodeEmitterNV50::roundMode_MAD(const Instruction *insn)
+{
+ switch (insn->rnd) {
+ case ROUND_M: code[1] |= 1 << 22; break;
+ case ROUND_P: code[1] |= 2 << 22; break;
+ case ROUND_Z: code[1] |= 3 << 22; break;
+ default:
+ assert(insn->rnd == ROUND_N);
+ break;
+ }
+}
+
+void
+CodeEmitterNV50::emitMNeg12(const Instruction *i)
+{
+ code[1] |= i->src(0).mod.neg() << 26;
+ code[1] |= i->src(1).mod.neg() << 27;
+}
+
+// Encode a condition code at bit position /pos/.  Bit 3 of the
+// encoding selects the "unordered" comparison variants, which only
+// exist for float types and is therefore cleared for everything else.
+void CodeEmitterNV50::emitCondCode(CondCode cc, DataType ty, int pos)
+{
+ uint8_t enc;
+
+ assert(pos >= 32 || pos <= 27);
+
+ switch (cc) {
+ case CC_LT: enc = 0x1; break;
+ case CC_LTU: enc = 0x9; break;
+ case CC_EQ: enc = 0x2; break;
+ case CC_EQU: enc = 0xa; break;
+ case CC_LE: enc = 0x3; break;
+ case CC_LEU: enc = 0xb; break;
+ case CC_GT: enc = 0x4; break;
+ case CC_GTU: enc = 0xc; break;
+ case CC_NE: enc = 0x5; break;
+ case CC_NEU: enc = 0xd; break;
+ case CC_GE: enc = 0x6; break;
+ case CC_GEU: enc = 0xe; break;
+ case CC_TR: enc = 0xf; break;
+ case CC_FL: enc = 0x0; break;
+
+ // flag-register conditions (overflow/carry/above/sign and inverses)
+ case CC_O: enc = 0x10; break;
+ case CC_C: enc = 0x11; break;
+ case CC_A: enc = 0x12; break;
+ case CC_S: enc = 0x13; break;
+ case CC_NS: enc = 0x1c; break;
+ case CC_NA: enc = 0x1d; break;
+ case CC_NC: enc = 0x1e; break;
+ case CC_NO: enc = 0x1f; break;
+
+ default:
+ enc = 0;
+ assert(!"invalid condition code");
+ break;
+ }
+ if (ty != TYPE_NONE && !isFloatType(ty))
+ enc &= ~0x8; // unordered only exists for float types
+
+ code[pos / 32] |= enc << (pos % 32);
+}
+
+void
+CodeEmitterNV50::emitFlagsRd(const Instruction *i)
+{
+ int s = (i->flagsSrc >= 0) ? i->flagsSrc : i->predSrc;
+
+ assert(!(code[1] & 0x00003f80));
+
+ if (s >= 0) {
+ assert(i->getSrc(s)->reg.file == FILE_FLAGS);
+ emitCondCode(i->cc, TYPE_NONE, 32 + 7);
+ srcId(i->src(s), 32 + 12);
+ } else {
+ code[1] |= 0x0780;
+ }
+}
+
+// Encode the flags-register write.  If Instruction::flagsDef was not
+// set, scan the defs for a FILE_FLAGS value and use its index.
+void
+CodeEmitterNV50::emitFlagsWr(const Instruction *i)
+{
+ assert(!(code[1] & 0x70));
+
+ int flagsDef = i->flagsDef;
+
+ // find flags definition and check that it is the last def
+ if (flagsDef < 0) {
+ for (int d = 0; i->defExists(d); ++d)
+ if (i->def(d).getFile() == FILE_FLAGS)
+ flagsDef = d;
+ if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
+ WARN("Instruction::flagsDef was not set properly\n");
+ }
+ if (flagsDef == 0 && i->defExists(1))
+ WARN("flags def should not be the primary definition\n");
+
+ if (flagsDef >= 0)
+ code[1] |= (DDATA(i->def(flagsDef)).id << 4) | 0x40;
+
+}
+
+void
+CodeEmitterNV50::setARegBits(unsigned int u)
+{
+ code[0] |= (u & 3) << 26;
+ code[1] |= (u & 4);
+}
+
+void
+CodeEmitterNV50::setAReg16(const Instruction *i, int s)
+{
+ if (i->srcExists(s)) {
+ s = i->src(s).indirect[0];
+ if (s >= 0)
+ setARegBits(SDATA(i->src(s)).id + 1);
+ }
+}
+
+void
+CodeEmitterNV50::setImmediate(const Instruction *i, int s)
+{
+ const ImmediateValue *imm = i->src(s).get()->asImm();
+ assert(imm);
+
+ uint32_t u = imm->reg.data.u32;
+
+ if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT))
+ u = ~u;
+
+ code[1] |= 3;
+ code[0] |= (u & 0x3f) << 16;
+ code[1] |= (u >> 6) << 2;
+}
+
+void
+CodeEmitterNV50::setDst(const Value *dst)
+{
+ const Storage *reg = &dst->join->reg;
+
+ assert(reg->file != FILE_ADDRESS);
+
+ if (reg->data.id < 0 || reg->file == FILE_FLAGS) {
+ code[0] |= (127 << 2) | 1;
+ code[1] |= 8;
+ } else {
+ int id;
+ if (reg->file == FILE_SHADER_OUTPUT) {
+ code[1] |= 8;
+ id = reg->data.offset / 4;
+ } else {
+ id = reg->data.id;
+ }
+ code[0] |= id << 2;
+ }
+}
+
+void
+CodeEmitterNV50::setDst(const Instruction *i, int d)
+{
+ if (i->defExists(d)) {
+ setDst(i->getDef(d));
+ } else
+ if (!d) {
+ code[0] |= 0x01fc; // bit bucket
+ code[1] |= 0x0008;
+ }
+}
+
+// Encode the register-file combination of the instruction's sources.
+// Each source contributes 2 bits to `mode`:
+// 0: r (GPR)
+// 1: a/s (attribute / shared memory / shader input)
+// 2: c (constant memory)
+// 3: i (immediate)
+// Only specific combinations are encodable; `enc` selects between the
+// short, long and alternate-long layouts which place the bits differently.
+void
+CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
+{
+   uint8_t mode = 0;
+
+   for (unsigned int s = 0; s < Target::operationSrcNr[i->op]; ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (s * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (s * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (s * 2);
+         break;
+      default:
+         ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+         assert(0);
+         break;
+      }
+   }
+   switch (mode) {
+   case 0x00: // rrr
+      break;
+   case 0x01: // arr/grr
+      // geometry programs read primitive data ("g") instead of attributes
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+         if (enc == NV50_OP_ENC_LONG || enc == NV50_OP_ENC_LONG_ALT)
+            code[1] |= 0x00200000;
+      } else {
+         if (enc == NV50_OP_ENC_SHORT)
+            code[0] |= 0x01000000;
+         else
+            code[1] |= 0x00200000;
+      }
+      break;
+   case 0x03: // irr
+      assert(i->op == OP_MOV);
+      return;
+   case 0x0c: // rir
+      break;
+   case 0x0d: // gir
+      code[0] |= 0x01000000;
+      assert(progType == Program::TYPE_GEOMETRY ||
+             progType == Program::TYPE_COMPUTE);
+      break;
+   case 0x08: // rcr
+      code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22); // const buffer index
+      break;
+   case 0x09: // acr/gcr
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+      } else {
+         code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+         code[1] |= 0x00200000;
+      }
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
+      break;
+   case 0x20: // rrc
+      code[0] |= 0x01000000;
+      code[1] |= (i->getSrc(2)->reg.fileIndex << 22);
+      break;
+   case 0x21: // arc
+      code[0] |= 0x01000000;
+      code[1] |= 0x00200000 | (i->getSrc(2)->reg.fileIndex << 22);
+      assert(progType != Program::TYPE_GEOMETRY);
+      break;
+   default:
+      ERROR("not encodable: %x\n", mode);
+      assert(0);
+      break;
+   }
+   // compute programs additionally encode the access size of the s[]/a
+   // operand; the bit position shifts by one when src 1 is an immediate
+   if (progType != Program::TYPE_COMPUTE)
+      return;
+
+   if ((mode & 3) == 1) {
+      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
+
+      switch (i->getSrc(0)->reg.type) {
+      case TYPE_U8:
+         break;
+      case TYPE_U16:
+         code[0] |= 1 << pos;
+         break;
+      case TYPE_S16:
+         code[0] |= 2 << pos;
+         break;
+      default:
+         code[0] |= 3 << pos;
+         assert(i->getSrc(0)->reg.size == 4);
+         break;
+      }
+   }
+}
+
+// Place source s of i into one of the three operand slots of the encoding.
+// For non-GPR operands the id is derived from the byte offset, scaled by
+// the operand size (no sources larger than 4 bytes are expected here).
+void
+CodeEmitterNV50::setSrc(const Instruction *i, unsigned int s, int slot)
+{
+   if (Target::operationSrcNr[i->op] <= s)
+      return;
+   const Storage *reg = &i->src(s).rep()->reg;
+
+   unsigned int id = (reg->file == FILE_GPR) ?
+      reg->data.id :
+      reg->data.offset >> (reg->size >> 1); // no > 4 byte sources here
+
+   switch (slot) {
+   case 0: code[0] |= id << 9; break;
+   case 1: code[0] |= id << 16; break;
+   case 2: code[1] |= id << 14; break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// the default form:
+// - long instruction
+// - 1 to 3 sources in slots 0, 1, 2 (rrr, arr, rcr, acr, rrc, arc, gcr, grr)
+// - address & flags
+void
+CodeEmitterNV50::emitForm_MAD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-encoding marker bit
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_LONG);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
+   setSrc(i, 2, 2);
+
+   setAReg16(i, 1);
+}
+
+// like default form, but 2nd source in slot 2, and no 3rd source
+void
+CodeEmitterNV50::emitForm_ADD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-encoding marker bit
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 2); // src 1 goes into slot 2 in this form
+
+   setAReg16(i, 1);
+}
+
+// default short form (rr, ar, rc, gr)
+// Short (4-byte) encodings cannot carry a predicate and must have a def.
+void
+CodeEmitterNV50::emitForm_MUL(const Instruction *i)
+{
+   assert(i->encSize == 4 && !(code[0] & 1));
+   assert(i->defExists(0));
+   assert(!i->getPredicate());
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_SHORT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
+}
+
+// usual immediate form
+// - 1 to 3 sources where last is immediate (rir, gir)
+// - no address or predicate possible
+void
+CodeEmitterNV50::emitForm_IMM(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-encoding marker bit
+
+   assert(i->defExists(0) && i->srcExists(0));
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   if (Target::operationSrcNr[i->op] > 1) {
+      setSrc(i, 0, 0);
+      setImmediate(i, 1); // immediate occupies the middle operand bits
+      setSrc(i, 2, 1);
+   } else {
+      setImmediate(i, 0);
+   }
+}
+
+// Encode the data size/type field of a local/global memory access as a
+// 3-bit code placed at bit `pos` of the 64-bit instruction word
+// (pos >= 32 addresses code[1]).
+void
+CodeEmitterNV50::emitLoadStoreSizeLG(DataType ty, int pos)
+{
+   uint8_t enc;
+
+   switch (ty) {
+   case TYPE_F32: // fall through
+   case TYPE_S32: // fall through
+   case TYPE_U32: enc = 0x6; break;
+   case TYPE_B128: enc = 0x5; break;
+   case TYPE_F64: // fall through
+   case TYPE_S64: // fall through
+   case TYPE_U64: enc = 0x4; break;
+   case TYPE_S16: enc = 0x3; break;
+   case TYPE_U16: enc = 0x2; break;
+   case TYPE_S8: enc = 0x1; break;
+   case TYPE_U8: enc = 0x0; break;
+   default:
+      enc = 0;
+      assert(!"invalid load/store type");
+      break;
+   }
+   code[pos / 32] |= enc << (pos % 32);
+}
+
+// Encode the data size/type of a constant/shared memory access into
+// bits 14..15 of word 1 (U8 uses the all-zero encoding).
+void
+CodeEmitterNV50::emitLoadStoreSizeCS(DataType ty)
+{
+   uint32_t bits;
+
+   switch (ty) {
+   case TYPE_U8:  bits = 0x0000; break;
+   case TYPE_U16: bits = 0x4000; break;
+   case TYPE_S16: bits = 0x8000; break;
+   case TYPE_F32:
+   case TYPE_S32:
+   case TYPE_U32: bits = 0xc000; break;
+   default:
+      bits = 0;
+      assert(0);
+      break;
+   }
+   code[1] |= bits;
+}
+
+// Emit a load instruction; the opcode depends on the source memory file.
+// Shader inputs and (on >= NV84) shared memory use 'mov'-style encodings,
+// local/global memory use the 0xd... load opcode with an LG size field.
+void
+CodeEmitterNV50::emitLOAD(const Instruction *i)
+{
+   DataFile sf = i->src(0).getFile();
+   int32_t offset = i->getSrc(0)->reg.data.offset;
+
+   switch (sf) {
+   case FILE_SHADER_INPUT:
+      // use 'mov' where we can
+      code[0] = i->src(0).isIndirect(0) ? 0x00000001 : 0x10000001;
+      code[1] = 0x00200000 | (i->lanes << 14);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      break;
+   case FILE_MEMORY_SHARED:
+      if (targ->getChipset() >= 0x84) {
+         // NV84+ has a wider shared-memory offset field (14 bits)
+         assert(offset <= (int32_t)(0x3fff * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x40000000;
+
+         if (typeSizeof(i->dType) == 4)
+            code[1] |= 0x04000000;
+
+         emitLoadStoreSizeCS(i->sType);
+      } else {
+         // original NV50: only 5 bits of offset
+         assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x00200000 | (i->lanes << 14);
+         emitLoadStoreSizeCS(i->sType);
+      }
+      break;
+   case FILE_MEMORY_CONST:
+      code[0] = 0x10000001;
+      code[1] = 0x20000000 | (i->getSrc(0)->reg.fileIndex << 22);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      emitLoadStoreSizeCS(i->sType);
+      break;
+   case FILE_MEMORY_LOCAL:
+      code[0] = 0xd0000001;
+      code[1] = 0x40000000;
+      break;
+   case FILE_MEMORY_GLOBAL:
+      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
+      code[1] = 0x80000000;
+      break;
+   default:
+      assert(!"invalid load source file");
+      break;
+   }
+   if (sf == FILE_MEMORY_LOCAL ||
+       sf == FILE_MEMORY_GLOBAL)
+      emitLoadStoreSizeLG(i->sType, 21 + 32);
+
+   setDst(i, 0);
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      // global accesses take the address from a GPR
+      srcId(*i->src(0).getIndirect(0), 9);
+   } else {
+      setAReg16(i, 0);
+      srcAddr16(i->src(0), i->src(0).getFile() != FILE_MEMORY_LOCAL, 9);
+   }
+}
+
+// Emit a store instruction; the opcode and value-operand position depend
+// on the destination memory file. Shared-memory offsets are scaled by the
+// element size and a per-size flag is set in word 1.
+void
+CodeEmitterNV50::emitSTORE(const Instruction *i)
+{
+   DataFile f = i->getSrc(0)->reg.file;
+   int32_t offset = i->getSrc(0)->reg.data.offset;
+
+   switch (f) {
+   case FILE_SHADER_OUTPUT:
+      code[0] = 0x00000001 | ((offset >> 2) << 9); // 32-bit units
+      code[1] = 0x80c00000;
+      srcId(i->src(1), 32 + 14);
+      break;
+   case FILE_MEMORY_GLOBAL:
+      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
+      code[1] = 0xa0000000;
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
+      break;
+   case FILE_MEMORY_LOCAL:
+      code[0] = 0xd0000001;
+      code[1] = 0x60000000;
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
+      break;
+   case FILE_MEMORY_SHARED:
+      code[0] = 0x00000001;
+      code[1] = 0xe0000000;
+      switch (typeSizeof(i->dType)) {
+      case 1:
+         code[0] |= offset << 9;
+         code[1] |= 0x00400000;
+         break;
+      case 2:
+         code[0] |= (offset >> 1) << 9;
+         break;
+      case 4:
+         code[0] |= (offset >> 2) << 9;
+         code[1] |= 0x04200000;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+      srcId(i->src(1), 32 + 14);
+      break;
+   default:
+      assert(!"invalid store destination file");
+      break;
+   }
+
+   if (f == FILE_MEMORY_GLOBAL)
+      srcId(*i->src(0).getIndirect(0), 9); // address comes from a GPR
+   else
+      setAReg16(i, 0);
+
+   if (f == FILE_MEMORY_LOCAL)
+      srcAddr16(i->src(0), false, 9);
+
+   emitFlagsRd(i);
+}
+
+// Emit a move; distinct encodings exist for moves involving the flags
+// register, the address registers, immediates, and plain GPR<->GPR or
+// GPR->output moves (short or long form).
+void
+CodeEmitterNV50::emitMOV(const Instruction *i)
+{
+   DataFile sf = i->getSrc(0)->reg.file;
+   DataFile df = i->getDef(0)->reg.file;
+
+   assert(sf == FILE_GPR || df == FILE_GPR);
+
+   if (sf == FILE_FLAGS) {
+      // $cX -> GPR
+      code[0] = 0x00000001;
+      code[1] = 0x20000000;
+      defId(i->def(0), 2);
+      srcId(i->src(0), 12);
+      emitFlagsRd(i);
+   } else
+   if (sf == FILE_ADDRESS) {
+      // $aX -> GPR
+      code[0] = 0x00000001;
+      code[1] = 0x40000000;
+      defId(i->def(0), 2);
+      setARegBits(SDATA(i->src(0)).id + 1);
+      emitFlagsRd(i);
+   } else
+   if (df == FILE_FLAGS) {
+      // GPR -> $cX
+      code[0] = 0x00000001;
+      code[1] = 0xa0000000;
+      defId(i->def(0), 4);
+      srcId(i->src(0), 9);
+      emitFlagsRd(i);
+   } else
+   if (sf == FILE_IMMEDIATE) {
+      code[0] = 0x10008001;
+      code[1] = 0x00000003;
+      emitForm_IMM(i);
+   } else {
+      if (i->encSize == 4) {
+         code[0] = 0x10008000;
+      } else {
+         code[0] = 0x10000001;
+         code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
+         code[1] |= (i->lanes << 14);
+         emitFlagsRd(i);
+      }
+      defId(i->def(0), 2);
+      srcId(i->src(0), 9);
+   }
+   if (df == FILE_SHADER_OUTPUT) {
+      assert(i->encSize == 8); // output bit only exists in the long form
+      code[1] |= 0x8;
+   }
+}
+
+// Emit a no-op (also used for OP_EXIT/OP_JOIN; the exit/join bits are
+// OR'ed in afterwards by emitInstruction).
+void
+CodeEmitterNV50::emitNOP()
+{
+   code[0] = 0xf0000001;
+   code[1] = 0xe0000000;
+}
+
+// Emit a quad operation: `lane` selects the participating lane pattern,
+// `quOp` is the per-quadrant operation code split across both words.
+// With a single source, the same register is replicated into slot 2.
+void
+CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
+{
+   code[0] = 0xc0000000 | (lane << 16);
+   code[1] = 0x80000000;
+
+   code[0] |= (quOp & 0x03) << 20;
+   code[1] |= (quOp & 0xfc) << 20;
+
+   emitForm_ADD(i);
+
+   if (!i->srcExists(1))
+      srcId(i->src(0), 32 + 14);
+}
+
+// Emit a primitive-input fetch (vertex data in geometry programs).
+void
+CodeEmitterNV50::emitPFETCH(const Instruction *i)
+{
+   code[0] = 0x11800001;
+   code[1] = 0x04200000 | (0xf << 14);
+
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 9);
+   setAReg16(i, 0);
+}
+
+// Emit an attribute interpolation (OP_LINTERP/OP_PINTERP).
+// PINTERP multiplies by src 1 (perspective divide factor).
+// For the long (8-byte) form, the mode bits set in word 0 by the short
+// layout are relocated into word 1.
+void
+CodeEmitterNV50::emitINTERP(const Instruction *i)
+{
+   code[0] = 0x80000000;
+
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 16);
+
+   if (i->getInterpMode() == NV50_IR_INTERP_FLAT) {
+      code[0] |= 1 << 8;
+   } else {
+      if (i->op == OP_PINTERP) {
+         code[0] |= 1 << 25;
+         srcId(i->src(1), 9);
+      }
+      if (i->getSampleMode() == NV50_IR_INTERP_CENTROID)
+         code[0] |= 1 << 24;
+   }
+
+   if (i->encSize == 8) {
+      // move the mode bits to their long-form positions
+      code[1] =
+         (code[0] & (3 << 24)) >> (24 - 16) |
+         (code[0] & (1 << 8)) << (18 - 8);
+      code[0] &= ~0x03000100;
+      code[0] |= 1;
+      emitFlagsRd(i);
+   }
+}
+
+// Emit OP_MIN/OP_MAX; F64 uses its own opcode pair, other types share one
+// opcode with a type field and per-source abs-modifier bits.
+void
+CodeEmitterNV50::emitMINMAX(const Instruction *i)
+{
+   if (i->dType == TYPE_F64) {
+      code[0] = 0xe0000000;
+      code[1] = (i->op == OP_MIN) ? 0xa0000000 : 0xc0000000;
+   } else {
+      code[0] = 0x30000000;
+      code[1] = 0x80000000;
+      if (i->op == OP_MIN)
+         code[1] |= 0x20000000;
+
+      switch (i->dType) {
+      case TYPE_F32: code[0] |= 0x80000000; break;
+      case TYPE_S32: code[1] |= 0x8c000000; break;
+      case TYPE_U32: code[1] |= 0x84000000; break;
+      case TYPE_S16: code[1] |= 0x80000000; break;
+      case TYPE_U16: break;
+      default:
+         assert(0);
+         break;
+      }
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(1).mod.abs() << 19;
+   }
+   emitForm_MAD(i);
+}
+
+// Emit a floating-point multiply-add. Negation of the product is the XOR
+// of the two factor negations; the short form cannot encode modifiers.
+void
+CodeEmitterNV50::emitFMAD(const Instruction *i)
+{
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   const int neg_add = i->src(2).mod.neg();
+
+   code[0] = 0xe0000000;
+
+   if (i->encSize == 4) {
+      emitForm_MUL(i);
+      assert(!neg_mul && !neg_add);
+   } else {
+      code[1] = neg_mul << 26;
+      code[1] |= neg_add << 27;
+      if (i->saturate)
+         code[1] |= 1 << 29;
+      emitForm_MAD(i);
+   }
+}
+
+// Emit a floating-point add/subtract. OP_SUB is encoded as an add with
+// src 1 negated. Modifier bit positions differ between the immediate,
+// long and short forms; abs modifiers are not encodable here.
+void
+CodeEmitterNV50::emitFADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   code[0] = 0xb0000000;
+
+   assert(!(i->src(0).mod | i->src(1).mod).abs());
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
+   } else
+   if (i->encSize == 8) {
+      code[1] = 0;
+      emitForm_ADD(i);
+      code[1] |= neg0 << 26;
+      code[1] |= neg1 << 27;
+      if (i->saturate)
+         code[1] |= 1 << 29;
+   } else {
+      emitForm_MUL(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
+   }
+}
+
+// Emit an integer add/subtract. A negated source selects the sub/subr
+// variants (hence both sources negated is not encodable). A carry-in
+// source (flagsSrc) turns this into add-with-carry.
+void
+CodeEmitterNV50::emitUADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   code[0] = 0x20008000;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+   } else
+   if (i->encSize == 8) {
+      code[0] = 0x20000000;
+      code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
+      emitForm_ADD(i);
+   } else {
+      emitForm_MUL(i);
+   }
+   assert(!(neg0 && neg1));
+   code[0] |= neg0 << 28;
+   code[0] |= neg1 << 22;
+
+   if (i->flagsSrc >= 0) {
+      // addc == sub | subr
+      assert(!(code[0] & 0x10400000) && !i->getPredicate());
+      code[0] |= 0x10400000;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
+}
+
+// Emit an add targeting an address register ($aX): dst = $aX + u16
+// immediate. Also used for OP_MOV into an address register (s == 0,
+// no source address register).
+void
+CodeEmitterNV50::emitAADD(const Instruction *i)
+{
+   const int s = (i->op == OP_MOV) ? 0 : 1;
+
+   code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9);
+   code[1] = 0x20000000;
+
+   // address-register ids are encoded off by one ($a0 is "no areg")
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
+
+   emitFlagsRd(i);
+
+   if (s && i->srcExists(0))
+      setARegBits(SDATA(i->src(0)).id + 1);
+}
+
+// Emit a 16x16 integer multiply; only signedness (S16 vs U16 operands)
+// is encoded, in different bit positions for the long and short forms.
+void
+CodeEmitterNV50::emitIMUL(const Instruction *i)
+{
+   code[0] = 0x40000000;
+
+   if (i->encSize == 8) {
+      code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
+      emitForm_MAD(i);
+   } else {
+      if (i->sType == TYPE_S16)
+         code[0] |= 0x8100;
+      emitForm_MUL(i);
+   }
+}
+
+// Emit a floating-point multiply. The result sign flip is the XOR of both
+// source negations; the long form can also encode round-to-zero.
+void
+CodeEmitterNV50::emitFMUL(const Instruction *i)
+{
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   code[0] = 0xc0000000;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+      if (neg)
+         code[0] |= 0x8000;
+   } else
+   if (i->encSize == 8) {
+      code[1] = i->rnd == ROUND_Z ? 0x0000c000 : 0;
+      if (neg)
+         code[1] |= 0x08000000;
+      emitForm_MAD(i);
+   } else {
+      emitForm_MUL(i);
+      if (neg)
+         code[0] |= 0x8000;
+   }
+}
+
+// Emit an integer multiply-add; signed types select the (optionally
+// saturating) signed encoding. Negating both the product and the addend
+// at once is not encodable. A flagsSrc adds carry-in from $cX.
+void
+CodeEmitterNV50::emitIMAD(const Instruction *i)
+{
+   code[0] = 0x60000000;
+   if (isSignedType(i->sType))
+      code[1] = i->saturate ? 0x40000000 : 0x20000000;
+   else
+      code[1] = 0x00000000;
+
+   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   int neg2 = i->src(2).mod.neg();
+
+   assert(!(neg1 & neg2));
+   code[1] |= neg1 << 27;
+   code[1] |= neg2 << 26;
+
+   emitForm_MAD(i);
+
+   if (i->flagsSrc >= 0) {
+      // add with carry from $cX
+      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+      code[1] |= 0xc << 24;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
+}
+
+// Emit a sum-of-absolute-differences; the source type (width and
+// signedness) selects the encoding variant for both forms.
+void
+CodeEmitterNV50::emitISAD(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      code[0] = 0x50000000;
+      switch (i->sType) {
+      case TYPE_U32: code[1] = 0x04000000; break;
+      case TYPE_S32: code[1] = 0x0c000000; break;
+      case TYPE_U16: code[1] = 0x00000000; break;
+      case TYPE_S16: code[1] = 0x08000000; break;
+      default:
+         assert(0);
+         break;
+      }
+      emitForm_MAD(i);
+   } else {
+      switch (i->sType) {
+      case TYPE_U32: code[0] = 0x50008000; break;
+      case TYPE_S32: code[0] = 0x50008100; break;
+      case TYPE_U16: code[0] = 0x50000000; break;
+      case TYPE_S16: code[0] = 0x50000100; break;
+      default:
+         assert(0);
+         break;
+      }
+      emitForm_MUL(i);
+   }
+}
+
+// Emit a comparison producing a boolean result; the condition code goes
+// into word 1 bits 14.., the source type and the per-source neg/abs
+// modifiers into the remaining word-1 bits.
+void
+CodeEmitterNV50::emitSET(const Instruction *i)
+{
+   code[0] = 0x30000000;
+   code[1] = 0x60000000;
+
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
+   switch (i->sType) {
+   case TYPE_F32: code[0] |= 0x80000000; break;
+   case TYPE_S32: code[1] |= 0x0c000000; break;
+   case TYPE_U32: code[1] |= 0x04000000; break;
+   case TYPE_S16: code[1] |= 0x08000000; break;
+   case TYPE_U16: break;
+   default:
+      assert(0);
+      break;
+   }
+   if (i->src(0).mod.neg()) code[1] |= 0x04000000;
+   if (i->src(1).mod.neg()) code[1] |= 0x08000000;
+   if (i->src(0).mod.abs()) code[1] |= 0x00100000;
+   if (i->src(1).mod.abs()) code[1] |= 0x00080000;
+
+   emitForm_MAD(i);
+}
+
+// OR the rounding-mode bits for a CVT instruction into word 1;
+// ROUND_N is the all-zero default and adds nothing.
+void
+CodeEmitterNV50::roundMode_CVT(RoundMode rnd)
+{
+   uint32_t bits;
+
+   switch (rnd) {
+   case ROUND_NI: bits = 0x08000000; break;
+   case ROUND_M:  bits = 0x00020000; break;
+   case ROUND_MI: bits = 0x08020000; break;
+   case ROUND_P:  bits = 0x00040000; break;
+   case ROUND_PI: bits = 0x08040000; break;
+   case ROUND_Z:  bits = 0x00060000; break;
+   case ROUND_ZI: bits = 0x08060000; break;
+   default:
+      assert(rnd == ROUND_N);
+      bits = 0;
+      break;
+   }
+   code[1] |= bits;
+}
+
+// Emit a conversion (and the CVT-based unary ops ABS/NEG/SAT/CEIL/FLOOR/
+// TRUNC). The dType x sType table below selects the word-1 opcode; the
+// rounding mode is derived from the op for the round-to-integer variants.
+void
+CodeEmitterNV50::emitCVT(const Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+   RoundMode rnd;
+
+   switch (i->op) {
+   case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   default:
+      rnd = i->rnd;
+      break;
+   }
+
+   code[0] = 0xa0000000;
+
+   switch (i->dType) {
+   case TYPE_F64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0xc4404000; break;
+      case TYPE_S64: code[1] = 0x44414000; break;
+      case TYPE_U64: code[1] = 0x44404000; break;
+      case TYPE_F32: code[1] = 0xc4400000; break;
+      case TYPE_S32: code[1] = 0x44410000; break;
+      case TYPE_U32: code[1] = 0x44400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x8c404000; break;
+      case TYPE_F32: code[1] = 0x8c400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_U64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x84404000; break;
+      case TYPE_F32: code[1] = 0x84400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_F32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0xc0404000; break;
+      case TYPE_S64: code[1] = 0x40414000; break;
+      case TYPE_U64: code[1] = 0x40404000; break;
+      case TYPE_F32: code[1] = 0xc4004000; break;
+      case TYPE_S32: code[1] = 0x44014000; break;
+      case TYPE_U32: code[1] = 0x44004000; break;
+      case TYPE_F16: code[1] = 0xc4000000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x88404000; break;
+      case TYPE_F32: code[1] = 0x8c004000; break;
+      case TYPE_S32: code[1] = 0x0c014000; break;
+      case TYPE_U32: code[1] = 0x0c004000; break;
+      case TYPE_F16: code[1] = 0x8c000000; break;
+      case TYPE_S16: code[1] = 0x0c010000; break;
+      case TYPE_U16: code[1] = 0x0c000000; break;
+      case TYPE_S8: code[1] = 0x0c018000; break;
+      case TYPE_U8: code[1] = 0x0c008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_U32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x80404000; break;
+      case TYPE_F32: code[1] = 0x84004000; break;
+      case TYPE_S32: code[1] = 0x04014000; break;
+      case TYPE_U32: code[1] = 0x04004000; break;
+      case TYPE_F16: code[1] = 0x84000000; break;
+      case TYPE_S16: code[1] = 0x04010000; break;
+      case TYPE_U16: code[1] = 0x04000000; break;
+      case TYPE_S8: code[1] = 0x04018000; break;
+      case TYPE_U8: code[1] = 0x04008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S16:
+   case TYPE_U16:
+   case TYPE_S8:
+   case TYPE_U8:
+   default:
+      assert(0);
+      break;
+   }
+   // byte source stored in a 32-bit register needs the reg-size bit
+   if (typeSizeof(i->sType) == 1 && i->getSrc(0)->reg.size == 4)
+      code[1] |= 0x00004000;
+
+   roundMode_CVT(rnd);
+
+   switch (i->op) {
+   case OP_ABS: code[1] |= 1 << 20; break;
+   case OP_SAT: code[1] |= 1 << 19; break;
+   case OP_NEG: code[1] |= 1 << 29; break;
+   default:
+      break;
+   }
+   // source modifiers: neg toggles (XOR) the negate bit, abs sets bit 20
+   code[1] ^= i->src(0).mod.neg() << 29;
+   code[1] |= i->src(0).mod.abs() << 20;
+   if (i->saturate)
+      code[1] |= 1 << 19;
+
+   assert(i->op != OP_ABS || !i->src(0).mod.neg());
+
+   emitForm_MAD(i);
+}
+
+// Emit the range-reduction ops that precede EX2/SIN/COS (OP_PREEX2 /
+// OP_PRESIN); they differ only in one opcode bit.
+void
+CodeEmitterNV50::emitPreOp(const Instruction *i)
+{
+   code[0] = 0xb0000000;
+   code[1] = (i->op == OP_PREEX2) ? 0xc0004000 : 0xc0000000;
+
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
+
+   emitForm_MAD(i);
+}
+
+// Emit a special-function op (RCP/RSQ/LG2/SIN/COS/EX2); `subOp` selects
+// the function in the long form. Only RCP has a short encoding.
+void
+CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   code[0] = 0x90000000;
+
+   if (i->encSize == 4) {
+      assert(i->op == OP_RCP);
+      code[0] |= i->src(0).mod.abs() << 15;
+      code[0] |= i->src(0).mod.neg() << 22;
+      emitForm_MUL(i);
+   } else {
+      code[1] = subOp << 29;
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(0).mod.neg() << 26;
+      emitForm_MAD(i);
+   }
+}
+
+// Emit a bitwise NOT, encoded as a logic op with the operand replicated:
+// the single source is placed into slot 1 as well after emitForm_MAD.
+void
+CodeEmitterNV50::emitNOT(const Instruction *i)
+{
+   code[0] = 0xd0000000;
+   code[1] = 0x0002c000;
+
+   switch (i->sType) {
+   case TYPE_U32:
+   case TYPE_S32:
+      code[1] |= 0x04000000; // 32-bit variant
+      break;
+   default:
+      break;
+   }
+   emitForm_MAD(i);
+   setSrc(i, 0, 1);
+}
+
+// Emit AND/OR/XOR; the immediate form and the register form place the
+// op-select and NOT-modifier bits differently (the immediate form cannot
+// encode NOT on src 1 — it is folded into the value by setImmediate).
+void
+CodeEmitterNV50::emitLogicOp(const Instruction *i)
+{
+   code[0] = 0xd0000000;
+   code[1] = 0;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      switch (i->op) {
+      case OP_OR: code[0] |= 0x0100; break;
+      case OP_XOR: code[0] |= 0x8000; break;
+      default:
+         assert(i->op == OP_AND);
+         break;
+      }
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+         code[0] |= 1 << 22;
+
+      emitForm_IMM(i);
+   } else {
+      switch (i->op) {
+      case OP_AND: code[1] = 0x04000000; break;
+      case OP_OR: code[1] = 0x04004000; break;
+      case OP_XOR: code[1] = 0x04008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 16;
+      if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 17;
+
+      emitForm_MAD(i);
+   }
+}
+
+// Emit an address-register load: $aX = src0 << shl.
+// The destination id is encoded off by one ($a0 means "none").
+void
+CodeEmitterNV50::emitARL(const Instruction *i, unsigned int shl)
+{
+   code[0] = 0x00000001 | (shl << 16);
+   code[1] = 0xc0000000;
+
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
+
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   setSrc(i, 0, 0);
+   emitFlagsRd(i);
+}
+
+// Emit SHL/SHR. A shift whose destination is an address register becomes
+// an ARL; otherwise the shift amount comes either from an immediate
+// (encoded inline, bits 16..22) or from a register operand.
+void
+CodeEmitterNV50::emitShift(const Instruction *i)
+{
+   if (i->def(0).getFile() == FILE_ADDRESS) {
+      assert(i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE);
+      emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f);
+   } else {
+      code[0] = 0x30000001;
+      code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000;
+      if (i->op == OP_SHR && isSignedType(i->sType))
+         code[1] |= 1 << 27; // arithmetic shift right
+
+      if (i->src(1).getFile() == FILE_IMMEDIATE) {
+         code[1] |= 1 << 20;
+         code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16;
+         defId(i->def(0), 2);
+         srcId(i->src(0), 9);
+         emitFlagsRd(i);
+      } else {
+         emitForm_MAD(i);
+      }
+   }
+}
+
+// Emit geometry-program vertex-stream control: OP_EMIT emits a vertex,
+// OP_RESTART cuts the primitive.
+void
+CodeEmitterNV50::emitOUT(const Instruction *i)
+{
+   code[0] = (i->op == OP_EMIT) ? 0xf0000200 : 0xf0000400;
+   code[1] = 0xc0000001;
+
+   emitFlagsRd(i);
+}
+
+// Emit a texture fetch (TEX/TXB/TXL/TXF/TXG). Encodes texture (r) and
+// sampler (s) indices, the coordinate argument count, cube-map flag or
+// texel offsets, and the 4-bit write mask split across both words.
+void
+CodeEmitterNV50::emitTEX(const TexInstruction *i)
+{
+   code[0] = 0xf0000001;
+   code[1] = 0x00000000;
+
+   switch (i->op) {
+   case OP_TXB:
+      code[1] = 0x20000000;
+      break;
+   case OP_TXL:
+      code[1] = 0x40000000;
+      break;
+   case OP_TXF:
+      code[0] |= 0x01000000;
+      break;
+   case OP_TXG:
+      code[0] = 0x01000000;
+      code[1] = 0x80000000;
+      break;
+   default:
+      assert(i->op == OP_TEX);
+      break;
+   }
+
+   code[0] |= i->tex.r << 9;
+   code[0] |= i->tex.s << 17;
+
+   int argc = i->tex.target.getArgCount();
+
+   // bias/lod/index and shadow reference each take an extra argument
+   if (i->op == OP_TXB || i->op == OP_TXL || i->op == OP_TXF)
+      argc += 1;
+   if (i->tex.target.isShadow())
+      argc += 1;
+   assert(argc <= 4);
+
+   code[0] |= (argc - 1) << 22;
+
+   if (i->tex.target.isCube()) {
+      code[0] |= 0x08000000;
+   } else
+   if (i->tex.useOffsets) {
+      code[1] |= (i->tex.offset[0][0] & 0xf) << 24;
+      code[1] |= (i->tex.offset[0][1] & 0xf) << 20;
+      code[1] |= (i->tex.offset[0][2] & 0xf) << 16;
+   }
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+
+   if (i->tex.liveOnly)
+      code[1] |= 4;
+
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emit a texture size query; only TXQ_DIMS is supported on this target.
+void
+CodeEmitterNV50::emitTXQ(const TexInstruction *i)
+{
+   assert(i->tex.query == TXQ_DIMS);
+
+   code[0] = 0xf0000001;
+   code[1] = 0x60000000;
+
+   code[0] |= i->tex.r << 9;
+   code[0] |= i->tex.s << 17;
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emit the texture-coordinate preparation op used before certain fetches.
+void
+CodeEmitterNV50::emitTEXPREP(const TexInstruction *i)
+{
+   code[0] = 0xf8000001 | (3 << 22) | (i->tex.s << 17) | (i->tex.r << 9);
+   code[1] = 0x60010000;
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emulate OP_PRERET with a bra/call sequence; the subOp selects which of
+// the three emitted instructions this one is (two branches and a call).
+void
+CodeEmitterNV50::emitPRERETEmu(const FlowInstruction *i)
+{
+   uint32_t pos = i->target.bb->binPos + 8; // +8 to skip an op
+
+   code[0] = 0x10000003; // bra
+   code[1] = 0x00000780; // always
+
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_EMU_PRERET + 0: // bra to the call
+      break;
+   case NV50_IR_SUBOP_EMU_PRERET + 1: // bra to skip the call
+      pos += 8;
+      break;
+   default:
+      assert(i->subOp == (NV50_IR_SUBOP_EMU_PRERET + 2));
+      code[0] = 0x20000003; // call
+      code[1] = 0x00000000; // no predicate
+      break;
+   }
+   // branch targets are patched at upload time via relocations
+   addReloc(RelocEntry::TYPE_CODE, 0, pos, 0x07fff800, 9);
+   addReloc(RelocEntry::TYPE_CODE, 1, pos, 0x000fc000, -4);
+}
+
+// Emit a control-flow instruction. `flowOp` is the 4-bit opcode placed at
+// bit 28. Predicated ops read the flags; ops with a target encode the
+// code address split across both words and register relocations for it.
+void
+CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp)
+{
+   const FlowInstruction *f = i->asFlow();
+   bool hasPred = false;
+   bool hasTarg = false;
+
+   code[0] = 0x00000003 | (flowOp << 28);
+   code[1] = 0x00000000;
+
+   switch (i->op) {
+   case OP_BRA:
+      hasPred = true;
+      hasTarg = true;
+      break;
+   case OP_BREAK:
+   case OP_BRKPT:
+   case OP_DISCARD:
+   case OP_RET:
+      hasPred = true;
+      break;
+   case OP_CALL:
+   case OP_PREBREAK:
+   case OP_JOINAT:
+      hasTarg = true;
+      break;
+   case OP_PRERET:
+      hasTarg = true;
+      if (i->subOp >= NV50_IR_SUBOP_EMU_PRERET) {
+         emitPRERETEmu(f);
+         return;
+      }
+      break;
+   default:
+      break;
+   }
+
+   if (hasPred)
+      emitFlagsRd(i);
+
+   if (hasTarg && f) {
+      uint32_t pos;
+
+      if (f->op == OP_CALL) {
+         if (f->builtin) {
+            pos = targNV50->getBuiltinOffset(f->target.builtin);
+         } else {
+            pos = f->target.fn->binPos;
+         }
+      } else {
+         pos = f->target.bb->binPos;
+      }
+
+      code[0] |= ((pos >> 2) & 0xffff) << 11;
+      code[1] |= ((pos >> 18) & 0x003f) << 14;
+
+      RelocEntry::Type relocTy;
+
+      relocTy = f->builtin ? RelocEntry::TYPE_BUILTIN : RelocEntry::TYPE_CODE;
+
+      addReloc(relocTy, 0, pos, 0x07fff800, 9);
+      addReloc(relocTy, 1, pos, 0x000fc000, -4);
+   }
+}
+
+// Emit a barrier; src 0 must be an immediate barrier index. BAR_SYNC
+// additionally sets the sync bit.
+void
+CodeEmitterNV50::emitBAR(const Instruction *i)
+{
+   ImmediateValue *barId = i->getSrc(0)->asImm();
+   assert(barId);
+
+   code[0] = 0x82000003 | (barId->reg.data.u32 << 21);
+   code[1] = 0x00004000;
+
+   if (i->subOp == NV50_IR_SUBOP_BAR_SYNC)
+      code[0] |= 1 << 26;
+}
+
+// Emit a global-memory atomic; the subOp maps to a 4-bit hardware
+// operation code. Src 0 is the g[] pointer (indirect GPR + file index),
+// src 1 the operand, src 2 the compare value for CAS.
+void
+CodeEmitterNV50::emitATOM(const Instruction *i)
+{
+   uint8_t subOp;
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_ATOM_ADD: subOp = 0x0; break;
+   case NV50_IR_SUBOP_ATOM_MIN: subOp = 0x7; break;
+   case NV50_IR_SUBOP_ATOM_MAX: subOp = 0x6; break;
+   case NV50_IR_SUBOP_ATOM_INC: subOp = 0x4; break;
+   case NV50_IR_SUBOP_ATOM_DEC: subOp = 0x5; break;
+   case NV50_IR_SUBOP_ATOM_AND: subOp = 0xa; break;
+   case NV50_IR_SUBOP_ATOM_OR: subOp = 0xb; break;
+   case NV50_IR_SUBOP_ATOM_XOR: subOp = 0xc; break;
+   case NV50_IR_SUBOP_ATOM_CAS: subOp = 0x2; break;
+   case NV50_IR_SUBOP_ATOM_EXCH: subOp = 0x1; break;
+   default:
+      assert(!"invalid subop");
+      return;
+   }
+   code[0] = 0xd0000001;
+   code[1] = 0xe0c00000 | (subOp << 2);
+   if (isSignedType(i->dType))
+      code[1] |= 1 << 21;
+
+   // args
+   emitFlagsRd(i);
+   setDst(i, 0);
+   setSrc(i, 1, 1);
+   if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
+      setSrc(i, 2, 2);
+
+   // g[] pointer
+   code[0] |= i->getSrc(0)->reg.fileIndex << 23;
+   srcId(i->getIndirect(0, 0), 9);
+}
+
+// Encode one instruction into the output buffer, dispatching on opcode.
+// Returns false for instructions that cannot or should not be encoded
+// (unsized, buffer overflow, ops that earlier passes must have removed).
+// On success, the join/exit bits are OR'ed in and the code pointer and
+// size counters advance by the instruction's encoding size.
+bool
+CodeEmitterNV50::emitInstruction(Instruction *insn)
+{
+   if (!insn->encSize) {
+      ERROR("skipping unencodable instruction: "); insn->print();
+      return false;
+   } else
+   if (codeSize + insn->encSize > codeSizeLimit) {
+      ERROR("code emitter output buffer too small\n");
+      return false;
+   }
+
+   if (insn->bb->getProgram()->dbgFlags & NV50_IR_DEBUG_BASIC) {
+      INFO("EMIT: "); insn->print();
+   }
+
+   switch (insn->op) {
+   case OP_MOV:
+      emitMOV(insn);
+      break;
+   case OP_EXIT:
+   case OP_NOP:
+   case OP_JOIN:
+      emitNOP();
+      break;
+   case OP_VFETCH:
+   case OP_LOAD:
+      emitLOAD(insn);
+      break;
+   case OP_EXPORT:
+   case OP_STORE:
+      emitSTORE(insn);
+      break;
+   case OP_PFETCH:
+      emitPFETCH(insn);
+      break;
+   case OP_LINTERP:
+   case OP_PINTERP:
+      emitINTERP(insn);
+      break;
+   case OP_ADD:
+   case OP_SUB:
+      // pick the add variant by result type: float, address reg, or int
+      if (isFloatType(insn->dType))
+         emitFADD(insn);
+      else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
+         emitAADD(insn);
+      else
+         emitUADD(insn);
+      break;
+   case OP_MUL:
+      if (isFloatType(insn->dType))
+         emitFMUL(insn);
+      else
+         emitIMUL(insn);
+      break;
+   case OP_MAD:
+   case OP_FMA:
+      if (isFloatType(insn->dType))
+         emitFMAD(insn);
+      else
+         emitIMAD(insn);
+      break;
+   case OP_SAD:
+      emitISAD(insn);
+      break;
+   case OP_NOT:
+      emitNOT(insn);
+      break;
+   case OP_AND:
+   case OP_OR:
+   case OP_XOR:
+      emitLogicOp(insn);
+      break;
+   case OP_SHL:
+   case OP_SHR:
+      emitShift(insn);
+      break;
+   case OP_SET:
+      emitSET(insn);
+      break;
+   case OP_MIN:
+   case OP_MAX:
+      emitMINMAX(insn);
+      break;
+   case OP_CEIL:
+   case OP_FLOOR:
+   case OP_TRUNC:
+   case OP_ABS:
+   case OP_NEG:
+   case OP_SAT:
+      emitCVT(insn);
+      break;
+   case OP_CVT:
+      // conversions touching $aX or $cX are really moves/ARL
+      if (insn->def(0).getFile() == FILE_ADDRESS)
+         emitARL(insn, 0);
+      else
+      if (insn->def(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_ADDRESS)
+         emitMOV(insn);
+      else
+         emitCVT(insn);
+      break;
+   case OP_RCP:
+      emitSFnOp(insn, 0);
+      break;
+   case OP_RSQ:
+      emitSFnOp(insn, 2);
+      break;
+   case OP_LG2:
+      emitSFnOp(insn, 3);
+      break;
+   case OP_SIN:
+      emitSFnOp(insn, 4);
+      break;
+   case OP_COS:
+      emitSFnOp(insn, 5);
+      break;
+   case OP_EX2:
+      emitSFnOp(insn, 6);
+      break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      emitPreOp(insn);
+      break;
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+      emitTEX(insn->asTex());
+      break;
+   case OP_TXQ:
+      emitTXQ(insn->asTex());
+      break;
+   case OP_TEXPREP:
+      emitTEXPREP(insn->asTex());
+      break;
+   case OP_EMIT:
+   case OP_RESTART:
+      emitOUT(insn);
+      break;
+   case OP_DISCARD:
+      emitFlow(insn, 0x0);
+      break;
+   case OP_BRA:
+      emitFlow(insn, 0x1);
+      break;
+   case OP_CALL:
+      emitFlow(insn, 0x2);
+      break;
+   case OP_RET:
+      emitFlow(insn, 0x3);
+      break;
+   case OP_PREBREAK:
+      emitFlow(insn, 0x4);
+      break;
+   case OP_BREAK:
+      emitFlow(insn, 0x5);
+      break;
+   case OP_QUADON:
+      emitFlow(insn, 0x6);
+      break;
+   case OP_QUADPOP:
+      emitFlow(insn, 0x7);
+      break;
+   case OP_JOINAT:
+      emitFlow(insn, 0xa);
+      break;
+   case OP_PRERET:
+      emitFlow(insn, 0xd);
+      break;
+   case OP_QUADOP:
+      emitQUADOP(insn, insn->lanes, insn->subOp);
+      break;
+   case OP_DFDX:
+      emitQUADOP(insn, 4, insn->src(0).mod.neg() ? 0x66 : 0x99);
+      break;
+   case OP_DFDY:
+      emitQUADOP(insn, 5, insn->src(0).mod.neg() ? 0x5a : 0xa5);
+      break;
+   case OP_ATOM:
+      emitATOM(insn);
+      break;
+   case OP_BAR:
+      emitBAR(insn);
+      break;
+   case OP_PHI:
+   case OP_UNION:
+   case OP_CONSTRAINT:
+      ERROR("operation should have been eliminated\n");
+      return false;
+   case OP_EXP:
+   case OP_LOG:
+   case OP_SQRT:
+   case OP_POW:
+   case OP_SELP:
+   case OP_SLCT:
+   case OP_TXD:
+   case OP_PRECONT:
+   case OP_CONT:
+   case OP_POPCNT:
+   case OP_INSBF:
+   case OP_EXTBF:
+      ERROR("operation should have been lowered\n");
+      return false;
+   default:
+      ERROR("unknown op: %u\n", insn->op);
+      return false;
+   }
+   if (insn->join || insn->op == OP_JOIN)
+      code[1] |= 0x2;
+   else
+   if (insn->exit || insn->op == OP_EXIT)
+      code[1] |= 0x1;
+
+   // long encodings must have bit 0 of word 0 set, short ones clear
+   assert((insn->encSize == 8) == (code[0] & 1));
+
+   code += insn->encSize / 4;
+   codeSize += insn->encSize;
+   return true;
+}
+
+// Return the minimal encoding size (4 or 8 bytes) for an instruction.
+// The short form is only usable when all operands are low GPRs (or
+// fragment-shader inputs), no flags/modifiers/rounding are involved, and
+// — for MAD — src 2 equals the destination register.
+uint32_t
+CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
+{
+   const Target::OpInfo &info = targ->getOpInfo(i);
+
+   if (info.minEncSize > 4)
+      return 8;
+
+   // check constraints on dst and src operands
+   for (int d = 0; i->defExists(d); ++d) {
+      if (i->def(d).rep()->reg.data.id > 63 ||
+          i->def(d).rep()->reg.file != FILE_GPR)
+         return 8;
+   }
+
+   for (int s = 0; i->srcExists(s); ++s) {
+      DataFile sf = i->src(s).getFile();
+      if (sf != FILE_GPR)
+         if (sf != FILE_SHADER_INPUT || progType != Program::TYPE_FRAGMENT)
+            return 8;
+      if (i->src(s).rep()->reg.data.id > 63)
+         return 8;
+   }
+
+   // check modifiers & rounding
+   if (i->join || i->lanes != 0xf || i->exit)
+      return 8;
+   if (i->op == OP_MUL && i->rnd != ROUND_N)
+      return 8;
+
+   if (i->asTex())
+      return 8; // TODO: short tex encoding
+
+   // check constraints on short MAD
+   if (info.srcNr >= 2 && i->srcExists(2)) {
+      if (i->saturate || i->src(2).mod)
+         return 8;
+      if ((i->src(0).mod ^ i->src(1).mod) ||
+          (i->src(0).mod | i->src(1).mod).abs())
+         return 8;
+      if (!i->defExists(0) ||
+          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
+         return 8;
+   }
+
+   return info.minEncSize;
+}
+
+// Change the encoding size of an instruction after BBs have been scheduled.
+// Long instructions must stay 8-byte aligned, so if an odd number of short
+// (4-byte) instructions follows, an adjacent short instruction is widened
+// too, and the binary positions/sizes of all later BBs are adjusted.
+static void
+makeInstructionLong(Instruction *insn)
+{
+   if (insn->encSize == 8)
+      return;
+   Function *fn = insn->bb->getFunction();
+   int n = 0;
+   int adj = 4;
+
+   // count trailing short instructions after insn
+   for (Instruction *i = insn->next; i && i->encSize == 4; ++n, i = i->next);
+
+   if (n & 1) {
+      adj = 8;
+      insn->next->encSize = 8;
+   } else
+   if (insn->prev && insn->prev->encSize == 4) {
+      adj = 8;
+      insn->prev->encSize = 8;
+   }
+   insn->encSize = 8;
+
+   // shift the start of every BB after this one
+   for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
+      fn->bbArray[i]->binPos += 4;
+   }
+   fn->binSize += adj;
+   insn->bb->binSize += adj;
+}
+
+// Try to fold program exit into `insn` by setting its exit modifier
+// (which requires the long encoding). Fails for ops where that would
+// change behavior: discard/quad ops, immediate sources, calls, and
+// predicated flow instructions.
+static bool
+trySetExitModifier(Instruction *insn)
+{
+   if (insn->op == OP_DISCARD ||
+       insn->op == OP_QUADON ||
+       insn->op == OP_QUADPOP)
+      return false;
+   for (int s = 0; insn->srcExists(s); ++s)
+      if (insn->src(s).getFile() == FILE_IMMEDIATE)
+         return false;
+   if (insn->asFlow()) {
+      if (insn->op == OP_CALL) // side effects !
+         return false;
+      if (insn->getPredicate()) // cannot do conditional exit (or can we ?)
+         return false;
+      insn->op = OP_EXIT;
+   }
+   insn->exit = 1;
+   makeInstructionLong(insn);
+   return true;
+}
+
+// Remove the trailing OP_EXIT of the epilogue block by marking the
+// preceding instruction (or, if the epilogue is just the exit, the last
+// instruction of every predecessor block) with the exit modifier,
+// saving 8 bytes of code.
+static void
+replaceExitWithModifier(Function *func)
+{
+   BasicBlock *epilogue = BasicBlock::get(func->cfgExit);
+
+   if (!epilogue->getExit() ||
+       epilogue->getExit()->op != OP_EXIT) // only main will use OP_EXIT
+      return;
+
+   if (epilogue->getEntry()->op != OP_EXIT) {
+      Instruction *insn = epilogue->getExit()->prev;
+      if (!insn || !trySetExitModifier(insn))
+         return;
+      insn->exit = 1;
+   } else {
+      // epilogue contains only the exit: fold into all predecessors
+      for (Graph::EdgeIterator ei = func->cfgExit->incident();
+           !ei.end(); ei.next()) {
+         BasicBlock *bb = BasicBlock::get(ei.getNode());
+         Instruction *i = bb->getExit();
+
+         if (!i || !trySetExitModifier(i))
+            return;
+      }
+   }
+   epilogue->binSize -= 8;
+   func->binSize -= 8;
+   delete_Instruction(func->getProgram(), epilogue->getExit());
+}
+
+// Target hook run before emission: after the generic preparation,
+// fold the final OP_EXIT into a preceding instruction where possible.
+void
+CodeEmitterNV50::prepareEmission(Function *func)
+{
+   CodeEmitter::prepareEmission(func);
+
+   replaceExitWithModifier(func);
+}
+
+// Construct an NV50 code emitter; output buffer and limits are supplied
+// later by the caller before emission starts.
+CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) :
+   CodeEmitter(target), targNV50(target)
+{
+   targ = target; // specialized
+   code = NULL;
+   codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
+}
+
+// Create an NV50 code emitter configured for the given program type.
+// Ownership of the returned emitter passes to the caller.
+CodeEmitter *
+TargetNV50::getCodeEmitter(Program::Type type)
+{
+   CodeEmitterNV50 *emitter = new CodeEmitterNV50(this);
+   emitter->setProgramType(type);
+   return emitter;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
new file mode 100644
index 00000000000..90c409d35e6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -0,0 +1,2988 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Argh, all these assertions ...
+
+// Binary code emitter for NVC0 (Fermi) and later chips handled by this
+// backend: translates nv50_ir Instructions into 4/8-byte opcodes written
+// into the CodeEmitter's buffer (code[0] = low word, code[1] = high word).
+class CodeEmitterNVC0 : public CodeEmitter
+{
+public:
+   CodeEmitterNVC0(const TargetNVC0 *);
+
+   virtual bool emitInstruction(Instruction *);
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+   virtual void prepareEmission(Function *);
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+private:
+   const TargetNVC0 *targNVC0;
+
+   Program::Type progType;
+
+   // Whether dual-issue/delay information is encoded (chip dependent).
+   const bool writeIssueDelays;
+
+private:
+   // Generic 8-byte encoding helpers for the common instruction forms.
+   void emitForm_A(const Instruction *, uint64_t);
+   void emitForm_B(const Instruction *, uint64_t);
+   void emitForm_S(const Instruction *, uint32_t, bool pred);
+
+   void emitPredicate(const Instruction *);
+
+   // Address/immediate operand encoding helpers.
+   void setAddress16(const ValueRef&);
+   void setAddress24(const ValueRef&);
+   void setAddressByFile(const ValueRef&);
+   void setImmediate(const Instruction *, const int s); // needs op already set
+   void setImmediateS8(const ValueRef&);
+   void setSUConst16(const Instruction *, const int s);
+   void setSUPred(const Instruction *, const int s);
+
+   void emitCondCode(CondCode cc, int pos);
+   void emitInterpMode(const Instruction *);
+   void emitLoadStoreType(DataType ty);
+   void emitSUGType(DataType);
+   void emitCachingMode(CacheMode c);
+
+   void emitShortSrc2(const ValueRef&);
+
+   inline uint8_t getSRegEncoding(const ValueRef&);
+
+   // Rounding-mode encodings for the different opcode families.
+   void roundMode_A(const Instruction *);
+   void roundMode_C(const Instruction *);
+   void roundMode_CS(const Instruction *);
+
+   void emitNegAbs12(const Instruction *);
+
+   void emitNOP(const Instruction *);
+
+   // Per-opcode emitters.
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+   void emitATOM(const Instruction *);
+   void emitMEMBAR(const Instruction *);
+   void emitCCTL(const Instruction *);
+
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitVFETCH(const Instruction *);
+   void emitEXPORT(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   void emitUADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitUMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitISAD(const Instruction *);
+   void emitFMAD(const Instruction *);
+   void emitMADSP(const Instruction *);
+
+   void emitNOT(Instruction *);
+   void emitLogicOp(const Instruction *, uint8_t subOp);
+   void emitPOPC(const Instruction *);
+   void emitINSBF(const Instruction *);
+   void emitEXTBF(const Instruction *);
+   void emitPERMT(const Instruction *);
+   void emitShift(const Instruction *);
+
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitCVT(Instruction *);
+   void emitMINMAX(const Instruction *);
+   void emitPreOp(const Instruction *);
+
+   void emitSET(const CmpInstruction *);
+   void emitSLCT(const CmpInstruction *);
+   void emitSELP(const Instruction *);
+
+   void emitTEXBAR(const Instruction *);
+   void emitTEX(const TexInstruction *);
+   void emitTEXCSAA(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
+
+   void emitFlow(const Instruction *);
+   void emitBAR(const Instruction *);
+
+   // Surface (image) load/store support.
+   void emitSUCLAMPMode(uint16_t);
+   void emitSUCalc(Instruction *);
+   void emitSULDGB(const TexInstruction *);
+   void emitSUSTGx(const TexInstruction *);
+
+   void emitVSHL(const Instruction *);
+   void emitVectorSubOp(const Instruction *);
+
+   // Encode register ids of defs/srcs at a given bit position.
+   inline void defId(const ValueDef&, const int pos);
+   inline void defId(const Instruction *, int d, const int pos);
+   inline void srcId(const ValueRef&, const int pos);
+   inline void srcId(const ValueRef *, const int pos);
+   inline void srcId(const Instruction *, int s, const int pos);
+   inline void srcAddr32(const ValueRef&, int pos, int shr);
+
+   inline bool isLIMM(const ValueRef&, DataType ty);
+};
+
+// for better visibility
+#define HEX64(h, l) 0x##h##l##ULL
+
+// Shorthand for the register data of a source/destination's representative.
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+// Encode a source register id at bit position 'pos'; 63 encodes no/bit-bucket
+// register when the source is absent.
+void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos)
+{
+   code[pos / 32] |= (src.get() ? SDATA(src).id : 63) << (pos % 32);
+}
+
+// Pointer variant of the above: NULL source encodes as register 63.
+void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos)
+{
+   code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32);
+}
+
+// Encode the id of source slot 's' of 'insn', or 63 if that slot is unused.
+void CodeEmitterNVC0::srcId(const Instruction *insn, int s, int pos)
+{
+   int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : 63;
+   code[pos / 32] |= r << (pos % 32);
+}
+
+// Encode a 32-bit memory offset (pre-shifted right by 'shr') starting at bit
+// 'pos'; bits that do not fit into code[0] spill into code[1].
+void
+CodeEmitterNVC0::srcAddr32(const ValueRef& src, int pos, int shr)
+{
+   const uint32_t offset = SDATA(src).offset >> shr;
+
+   code[pos / 32] |= offset << (pos % 32);
+   if (pos && (pos < 32)) // offset straddles the word boundary
+      code[1] |= offset >> (32 - pos);
+}
+
+// Encode a destination register id at bit 'pos'; 63 if there is none.
+void CodeEmitterNVC0::defId(const ValueDef& def, const int pos)
+{
+   code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32);
+}
+
+// Encode the id of destination slot 'd' of 'insn', or 63 if unused.
+void CodeEmitterNVC0::defId(const Instruction *insn, int d, int pos)
+{
+   int r = insn->defExists(d) ? DDATA(insn->def(d)).id : 63;
+   code[pos / 32] |= r << (pos % 32);
+}
+
+// True if 'ref' is an immediate that does NOT fit the short immediate field
+// and therefore needs the long-immediate (LIMM) form: floats keep their high
+// 20 bits in the short field, integers their low 20 bits.
+bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+
+   return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
+}
+
+// Encode the rounding mode at bits 55-56 (code[1] bits 23-24) for form-A
+// float ops; ROUND_N (nearest) is the 0 default.
+void
+CodeEmitterNVC0::roundMode_A(const Instruction *insn)
+{
+   switch (insn->rnd) {
+   case ROUND_M: code[1] |= 1 << 23; break;
+   case ROUND_P: code[1] |= 2 << 23; break;
+   case ROUND_Z: code[1] |= 3 << 23; break;
+   default:
+      assert(insn->rnd == ROUND_N);
+      break;
+   }
+}
+
+// Encode abs (bits 6/7) and neg (bits 8/9) modifiers of sources 1 and 0.
+void
+CodeEmitterNVC0::emitNegAbs12(const Instruction *i)
+{
+   if (i->src(1).mod.abs()) code[0] |= 1 << 6;
+   if (i->src(0).mod.abs()) code[0] |= 1 << 7;
+   if (i->src(1).mod.neg()) code[0] |= 1 << 8;
+   if (i->src(0).mod.neg()) code[0] |= 1 << 9;
+}
+
+// Encode a condition code at bit 'pos'. Float-style codes use the low 4
+// bits; the flag-based codes (CC_A..CC_NO) set bit 4 as well, so callers
+// must leave 5 bits of room at 'pos'.
+void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos)
+{
+   uint8_t val;
+
+   switch (cc) {
+   case CC_LT:  val = 0x1; break;
+   case CC_LTU: val = 0x9; break;
+   case CC_EQ:  val = 0x2; break;
+   case CC_EQU: val = 0xa; break;
+   case CC_LE:  val = 0x3; break;
+   case CC_LEU: val = 0xb; break;
+   case CC_GT:  val = 0x4; break;
+   case CC_GTU: val = 0xc; break;
+   case CC_NE:  val = 0x5; break;
+   case CC_NEU: val = 0xd; break;
+   case CC_GE:  val = 0x6; break;
+   case CC_GEU: val = 0xe; break;
+   case CC_TR:  val = 0xf; break;
+   case CC_FL:  val = 0x0; break;
+
+   case CC_A:  val = 0x14; break;
+   case CC_NA: val = 0x13; break;
+   case CC_S:  val = 0x15; break;
+   case CC_NS: val = 0x12; break;
+   case CC_C:  val = 0x16; break;
+   case CC_NC: val = 0x11; break;
+   case CC_O:  val = 0x17; break;
+   case CC_NO: val = 0x10; break;
+
+   default:
+      val = 0;
+      assert(!"invalid condition code");
+      break;
+   }
+   code[pos / 32] |= val << (pos % 32);
+}
+
+// Encode the instruction predicate at bits 10-13: the predicate register id,
+// with bit 13 for negation. Without a predicate, 0x1c00 (= 7 << 10,
+// presumably the always-true predicate PT) is used.
+void
+CodeEmitterNVC0::emitPredicate(const Instruction *i)
+{
+   if (i->predSrc >= 0) {
+      assert(i->getPredicate()->reg.file == FILE_PREDICATE);
+      srcId(i->src(i->predSrc), 10);
+      if (i->cc == CC_NOT_P)
+         code[0] |= 0x2000; // negate
+   } else {
+      code[0] |= 0x1c00;
+   }
+}
+
+// Dispatch address encoding based on the memory space of the source:
+// global uses a full 32-bit offset, local/shared 24 bits, const 16 bits.
+void
+CodeEmitterNVC0::setAddressByFile(const ValueRef& src)
+{
+   switch (src.getFile()) {
+   case FILE_MEMORY_GLOBAL:
+      srcAddr32(src, 26, 0);
+      break;
+   case FILE_MEMORY_LOCAL:
+   case FILE_MEMORY_SHARED:
+      setAddress24(src);
+      break;
+   default:
+      assert(src.getFile() == FILE_MEMORY_CONST);
+      setAddress16(src);
+      break;
+   }
+}
+
+// Encode a 16-bit symbol offset split across the word boundary:
+// low 6 bits at code[0] bits 26-31, remaining 10 bits at code[1] bits 0-9.
+void
+CodeEmitterNVC0::setAddress16(const ValueRef& src)
+{
+   Symbol *sym = src.get()->asSym();
+
+   assert(sym);
+
+   code[0] |= (sym->reg.data.offset & 0x003f) << 26;
+   code[1] |= (sym->reg.data.offset & 0xffc0) >> 6;
+}
+
+// Like setAddress16 but with a 24-bit offset (18 bits go into code[1]).
+void
+CodeEmitterNVC0::setAddress24(const ValueRef& src)
+{
+   Symbol *sym = src.get()->asSym();
+
+   assert(sym);
+
+   code[0] |= (sym->reg.data.offset & 0x00003f) << 26;
+   code[1] |= (sym->reg.data.offset & 0xffffc0) >> 6;
+}
+
+// Encode the immediate of source 's'. The opcode low nibble (already set by
+// the caller) selects the variant: 0x2 = 32-bit long immediate, 0x3/0x4 =
+// 20-bit sign-extended integer, otherwise a float using the top 20 bits.
+void
+CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
+{
+   const ImmediateValue *imm = i->src(s).get()->asImm();
+   uint32_t u32;
+
+   assert(imm);
+   u32 = imm->reg.data.u32;
+
+   if ((code[0] & 0xf) == 0x2) {
+      // LIMM
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= u32 >> 6;
+   } else
+   if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) {
+      // integer immediate
+      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      assert(!(code[1] & 0xc000));
+      u32 &= 0xfffff;
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 6);
+   } else {
+      // float immediate
+      assert(!(u32 & 0x00000fff));
+      assert(!(code[1] & 0xc000));
+      code[0] |= ((u32 >> 12) & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 18);
+   }
+}
+
+// Encode a signed 8-bit immediate for the short (4-byte) form; asserts that
+// the value actually fits in 8 bits.
+void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+
+   int8_t s8 = static_cast<int8_t>(imm->reg.data.s32);
+
+   assert(s8 == imm->reg.data.s32);
+
+   // low 6 bits at bits 26-31, sign bits at bits 8-9
+   code[0] |= (s8 & 0x3f) << 26;
+   code[0] |= (s8 >> 6) << 8;
+}
+
+// Emit the common 8-byte 3-source ALU form: base opcode, predicate, def at
+// bit 14, srcs at 20/26/49 with const-buffer and immediate variants handled
+// per source file.
+void
+CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc;
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 14);
+
+   // if src2 comes from a const buffer, src1 moves to src2's position
+   int s1 = 26;
+   if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST)
+      s1 = 49;
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->getSrc(s)->reg.file) {
+      case FILE_MEMORY_CONST:
+         assert(!(code[1] & 0xc000)); // only one c[]/immediate source allowed
+         code[1] |= (s == 2) ? 0x8000 : 0x4000;
+         code[1] |= i->getSrc(s)->reg.fileIndex << 10;
+         setAddress16(i->src(s));
+         break;
+      case FILE_IMMEDIATE:
+         assert(s == 1 ||
+                i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2);
+         assert(!(code[1] & 0xc000));
+         setImmediate(i, s);
+         break;
+      case FILE_GPR:
+         if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst
+            break;
+         srcId(i->src(s), s ? ((s == 2) ? 49 : s1) : 20);
+         break;
+      default:
+         // ignore here, can be predicate or flags, but must not be address
+         break;
+      }
+   }
+}
+
+// Emit the 8-byte single-source form (used e.g. by conversions): predicate,
+// def at bit 14, src0 at bit 26 with c[]/immediate variants.
+void
+CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc;
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 14);
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_CONST:
+      assert(!(code[1] & 0xc000));
+      code[1] |= 0x4000 | (i->src(0).get()->reg.fileIndex << 10);
+      setAddress16(i->src(0));
+      break;
+   case FILE_IMMEDIATE:
+      assert(!(code[1] & 0xc000));
+      setImmediate(i, 0);
+      break;
+   case FILE_GPR:
+      srcId(i->src(0), 26);
+      break;
+   default:
+      // ignore here, can be predicate or flags, but must not be address
+      break;
+   }
+}
+
+// Emit the short (4-byte) form: def at 14, src0 at 20, src1/src2 at 26/8.
+// Const-buffer sources are restricted to spaces c0, c1 and c16; 'pred'
+// states whether this short opcode has room for a predicate.
+void
+CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred)
+{
+   code[0] = opc;
+
+   // these opcodes shift the c[] space select field down by 2 bits
+   int ss2a = 0;
+   if (opc == 0x0d || opc == 0x0e)
+      ss2a = 2;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+
+   assert(pred || (i->predSrc < 0));
+   if (pred)
+      emitPredicate(i);
+
+   for (int s = 1; s < 3 && i->srcExists(s); ++s) {
+      if (i->src(s).get()->reg.file == FILE_MEMORY_CONST) {
+         assert(!(code[0] & (0x300 >> ss2a)));
+         switch (i->src(s).get()->reg.fileIndex) {
+         case 0:  code[0] |= 0x100 >> ss2a; break;
+         case 1:  code[0] |= 0x200 >> ss2a; break;
+         case 16: code[0] |= 0x300 >> ss2a; break;
+         default:
+            ERROR("invalid c[] space for short form\n");
+            break;
+         }
+         if (s == 1)
+            code[0] |= i->getSrc(s)->reg.data.offset << 24;
+         else
+            code[0] |= i->getSrc(s)->reg.data.offset << 6;
+      } else
+      if (i->src(s).getFile() == FILE_IMMEDIATE) {
+         assert(s == 1); // only src1 may be an immediate in the short form
+         setImmediateS8(i->src(s));
+      } else
+      if (i->src(s).getFile() == FILE_GPR) {
+         srcId(i->src(s), (s == 1) ? 26 : 8);
+      }
+   }
+}
+
+// Encode the 2nd source of a short-form op at bit 20: either a GPR id or a
+// 4-byte-aligned offset into const space c0/c1/c16.
+void
+CodeEmitterNVC0::emitShortSrc2(const ValueRef &src)
+{
+   if (src.getFile() == FILE_MEMORY_CONST) {
+      switch (src.get()->reg.fileIndex) {
+      case 0:  code[0] |= 0x100; break;
+      case 1:  code[0] |= 0x200; break;
+      case 16: code[0] |= 0x300; break;
+      default:
+         assert(!"unsupported file index for short op");
+         break;
+      }
+      srcAddr32(src, 20, 2); // offset in units of 4 bytes
+   } else {
+      srcId(src, 20);
+      assert(src.getFile() == FILE_GPR);
+   }
+}
+
+// Emit a (possibly predicated) 8-byte NOP.
+void
+CodeEmitterNVC0::emitNOP(const Instruction *i)
+{
+   code[0] = 0x000001e4;
+   code[1] = 0x40000000;
+   emitPredicate(i);
+}
+
+// Emit fused/float multiply-add: dst = src0 * src1 + src2, with long and
+// short encodings; negation of either factor folds into a single sign bit.
+void
+CodeEmitterNVC0::emitFMAD(const Instruction *i)
+{
+   // sign of the product: neg on exactly one of the two factors
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         emitForm_A(i, HEX64(20000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(30000000, 00000000));
+
+         if (i->src(2).mod.neg())
+            code[0] |= 1 << 8;
+      }
+      roundMode_A(i);
+
+      if (neg1)
+         code[0] |= 1 << 9;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->ftz)
+         code[0] |= 1 << 6;
+   } else {
+      assert(!i->saturate && !i->src(2).mod.neg());
+      emitForm_S(i, (i->src(2).getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e,
+                 false);
+      if (neg1)
+         code[0] |= 1 << 4;
+   }
+}
+
+// Emit float multiply; supports a post-multiplication scale factor of
+// 2^postFactor (range -3..+3) in the long non-LIMM encoding.
+void
+CodeEmitterNVC0::emitFMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(i->postFactor >= -3 && i->postFactor <= 3);
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         assert(i->postFactor == 0); // constant folded, hopefully
+         emitForm_A(i, HEX64(30000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(58000000, 00000000));
+         roundMode_A(i);
+         code[1] |= ((i->postFactor > 0) ?
+                     (7 - i->postFactor) : (0 - i->postFactor)) << 17;
+      }
+      if (neg)
+         code[1] ^= 1 << 25; // aliases with LIMM sign bit
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+
+      if (i->dnz)
+         code[0] |= 1 << 7;
+      else
+      if (i->ftz)
+         code[0] |= 1 << 6;
+   } else {
+      assert(!neg && !i->saturate && !i->ftz && !i->postFactor);
+      emitForm_S(i, 0xa8, true);
+   }
+}
+
+// Emit integer multiply; signedness of source/dest and the mul-high subop
+// are encoded as separate flag bits.
+void
+CodeEmitterNVC0::emitUMUL(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (i->src(1).getFile() == FILE_IMMEDIATE) {
+         emitForm_A(i, HEX64(10000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000003));
+      }
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[0] |= 1 << 6;
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 5;
+      if (i->dType == TYPE_S32)
+         code[0] |= 1 << 7;
+   } else {
+      emitForm_S(i, i->src(1).getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true);
+
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 6;
+   }
+}
+
+// Emit float add/sub; OP_SUB is encoded by flipping the sign of src1
+// (combined with any existing neg modifier via XOR).
+void
+CodeEmitterNVC0::emitFADD(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         assert(!i->saturate);
+         emitForm_A(i, HEX64(28000000, 00000002));
+
+         code[0] |= i->src(0).mod.abs() << 7;
+         code[0] |= i->src(0).mod.neg() << 9;
+
+         // src1 modifiers apply to the immediate's own sign bit in code[1]
+         if (i->src(1).mod.abs())
+            code[1] &= 0xfdffffff;
+         if ((i->op == OP_SUB) != static_cast<bool>(i->src(1).mod.neg()))
+            code[1] ^= 0x02000000;
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000000));
+
+         roundMode_A(i);
+         if (i->saturate)
+            code[1] |= 1 << 17;
+
+         emitNegAbs12(i);
+         if (i->op == OP_SUB) code[0] ^= 1 << 8;
+      }
+      if (i->ftz)
+         code[0] |= 1 << 5;
+   } else {
+      assert(!i->saturate && i->op != OP_SUB &&
+             !i->src(0).mod.abs() &&
+             !i->src(1).mod.neg() && !i->src(1).mod.abs());
+
+      emitForm_S(i, 0x49, true);
+
+      if (i->src(0).mod.neg())
+         code[0] |= 1 << 7;
+   }
+}
+
+// Emit integer add/sub with optional carry in/out; subtraction again flips
+// the src1 sign bit, and negating both sources is rejected (would encode
+// add-plus-one).
+void
+CodeEmitterNVC0::emitUADD(const Instruction *i)
+{
+   uint32_t addOp = 0;
+
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+   assert(!i->src(0).mod.neg() || !i->src(1).mod.neg());
+
+   if (i->src(0).mod.neg())
+      addOp |= 0x200;
+   if (i->src(1).mod.neg())
+      addOp |= 0x100;
+   if (i->op == OP_SUB) {
+      addOp ^= 0x100;
+      assert(addOp != 0x300); // would be add-plus-one
+   }
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_U32)) {
+         emitForm_A(i, HEX64(08000000, 00000002));
+         if (i->defExists(1))
+            code[1] |= 1 << 26; // write carry
+      } else {
+         emitForm_A(i, HEX64(48000000, 00000003));
+         if (i->defExists(1))
+            code[1] |= 1 << 16; // write carry
+      }
+      code[0] |= addOp;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->flagsSrc >= 0) // add carry
+         code[0] |= 1 << 6;
+   } else {
+      assert(!(addOp & 0x100)); // short form cannot negate src1
+      emitForm_S(i, (addOp >> 3) |
+                 ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true);
+   }
+}
+
+// TODO: shl-add
+// Emit integer multiply-add: dst = src0 * src1 + src2, long form only;
+// supports signed types, saturation, carry in/out and mul-high.
+void
+CodeEmitterNVC0::emitIMAD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   emitForm_A(i, HEX64(20000000, 00000003));
+
+   if (isSignedType(i->dType))
+      code[0] |= 1 << 7;
+   if (isSignedType(i->sType))
+      code[0] |= 1 << 5;
+
+   code[1] |= i->saturate << 24;
+
+   if (i->flagsDef >= 0) code[1] |= 1 << 16;
+   if (i->flagsSrc >= 0) code[1] |= 1 << 23;
+
+   // sign of the addend and of the product, respectively
+   if (i->src(2).mod.neg()) code[0] |= 0x10;
+   if (i->src(1).mod.neg() ^
+       i->src(0).mod.neg()) code[0] |= 0x20;
+
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[0] |= 1 << 6;
+}
+
+// Emit the GK104+ partial-dot-product multiply-add (MADSP); the subOp bit
+// fields select the operand sub-word layouts, with SD as a special case.
+void
+CodeEmitterNVC0::emitMADSP(const Instruction *i)
+{
+   assert(targ->getChipset() >= NVISA_GK104_CHIPSET);
+
+   emitForm_A(i, HEX64(00000000, 00000003));
+
+   if (i->subOp == NV50_IR_SUBOP_MADSP_SD) {
+      code[1] |= 0x01800000;
+   } else {
+      // scatter the subOp bit groups into their encoding positions
+      code[0] |= (i->subOp & 0x00f) << 7;
+      code[0] |= (i->subOp & 0x0f0) << 1;
+      code[0] |= (i->subOp & 0x100) >> 3;
+      code[0] |= (i->subOp & 0x200) >> 2;
+      code[1] |= (i->subOp & 0xc00) << 13;
+   }
+
+   if (i->flagsDef >= 0)
+      code[1] |= 1 << 16;
+}
+
+// Emit integer sum-of-absolute-differences (32-bit only, long form).
+void
+CodeEmitterNVC0::emitISAD(const Instruction *i)
+{
+   assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
+   assert(i->encSize == 8);
+
+   emitForm_A(i, HEX64(38000000, 00000003));
+
+   if (i->dType == TYPE_S32)
+      code[0] |= 1 << 5;
+}
+
+// Emit bitwise NOT as a logic op with the operand duplicated into src1
+// (mutates the instruction, hence the non-const parameter).
+void
+CodeEmitterNVC0::emitNOT(Instruction *i)
+{
+   assert(i->encSize == 8);
+   i->setSrc(1, i->src(0));
+   emitForm_A(i, HEX64(68000000, 000001c3));
+}
+
+// Emit AND/OR/XOR (subOp selects which). Predicate-register destinations use
+// a dedicated PSETP-style encoding with an optional third operand; GPR
+// destinations use the regular long/short ALU forms.
+void
+CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->def(0).getFile() == FILE_PREDICATE) {
+      code[0] = 0x00000004 | (subOp << 30);
+      code[1] = 0x0c000000;
+
+      emitPredicate(i);
+
+      defId(i->def(0), 17);
+      srcId(i->src(0), 20);
+      if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 23;
+      srcId(i->src(1), 26);
+      if (i->src(1).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 29;
+
+      if (i->defExists(1)) {
+         defId(i->def(1), 14);
+      } else {
+         code[0] |= 7 << 14; // no 2nd def: predicate 7
+      }
+      // (a OP b) OP c
+      if (i->predSrc != 2 && i->srcExists(2)) {
+         code[1] |= subOp << 21;
+         srcId(i->src(2), 17);
+         if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 20;
+      } else {
+         code[1] |= 0x000e0000;
+      }
+   } else
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_U32)) {
+         emitForm_A(i, HEX64(38000000, 00000002));
+
+         if (i->flagsDef >= 0)
+            code[1] |= 1 << 26;
+      } else {
+         emitForm_A(i, HEX64(68000000, 00000003));
+
+         if (i->flagsDef >= 0)
+            code[1] |= 1 << 16;
+      }
+      code[0] |= subOp << 6;
+
+      if (i->flagsSrc >= 0) // carry
+         code[0] |= 1 << 5;
+
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
+      if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, (subOp << 5) |
+                 ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true);
+   }
+}
+
+// Emit population count (with optional per-source complement modifiers).
+void
+CodeEmitterNVC0::emitPOPC(const Instruction *i)
+{
+   emitForm_A(i, HEX64(54000000, 00000004));
+
+   if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
+   if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
+}
+
+// Emit bit-field insert.
+void
+CodeEmitterNVC0::emitINSBF(const Instruction *i)
+{
+   emitForm_A(i, HEX64(28000000, 00000003));
+}
+
+// Emit bit-field extract; signed dest selects sign extension, and the REV
+// subop selects bit reversal.
+void
+CodeEmitterNVC0::emitEXTBF(const Instruction *i)
+{
+   emitForm_A(i, HEX64(70000000, 00000003));
+
+   if (i->dType == TYPE_S32)
+      code[0] |= 1 << 5;
+   if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
+      code[0] |= 1 << 8;
+}
+
+// Emit byte permute; subOp selects the permute variant.
+void
+CodeEmitterNVC0::emitPERMT(const Instruction *i)
+{
+   emitForm_A(i, HEX64(24000000, 00000004));
+
+   code[0] |= i->subOp << 5;
+}
+
+// Emit shift left/right; arithmetic right shift for signed dest, and the
+// WRAP subop masks the shift amount instead of clamping it.
+void
+CodeEmitterNVC0::emitShift(const Instruction *i)
+{
+   if (i->op == OP_SHR) {
+      emitForm_A(i, HEX64(58000000, 00000003)
+                 | (isSignedType(i->dType) ? 0x20 : 0x00));
+   } else {
+      emitForm_A(i, HEX64(60000000, 00000003));
+   }
+
+   if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP)
+      code[0] |= 1 << 9;
+}
+
+// Emit the range-reduction pre-ops for sin/ex2 (OP_PRESIN/OP_PREEX2).
+void
+CodeEmitterNVC0::emitPreOp(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(60000000, 00000000));
+
+      if (i->op == OP_PREEX2)
+         code[0] |= 0x20;
+
+      if (i->src(0).mod.abs()) code[0] |= 1 << 6;
+      if (i->src(0).mod.neg()) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true);
+   }
+}
+
+// Emit a special-function (SFU) op — rcp/rsq/sin/cos/ex2/lg2 etc. — where
+// 'subOp' selects the function; src must be a GPR.
+void
+CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->encSize == 8) {
+      code[0] = 0x00000000 | (subOp << 26);
+      code[1] = 0xc8000000;
+
+      emitPredicate(i);
+
+      defId(i->def(0), 14);
+      srcId(i->src(0), 20);
+
+      assert(i->src(0).getFile() == FILE_GPR);
+
+      if (i->saturate) code[0] |= 1 << 5;
+
+      if (i->src(0).mod.abs()) code[0] |= 1 << 7;
+      if (i->src(0).mod.neg()) code[0] |= 1 << 9;
+   } else {
+      emitForm_S(i, 0x80000008 | (subOp << 26), true);
+
+      assert(!i->src(0).mod.neg()); // short form has no neg bit
+      if (i->src(0).mod.abs()) code[0] |= 1 << 30;
+   }
+}
+
+// Emit min/max for float and integer types (long form only).
+void
+CodeEmitterNVC0::emitMINMAX(const Instruction *i)
+{
+   uint64_t op;
+
+   assert(i->encSize == 8);
+
+   op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL;
+
+   if (i->ftz)
+      op |= 1 << 5;
+   else
+   if (!isFloatType(i->dType))
+      op |= isSignedType(i->dType) ? 0x23 : 0x03; // integer variant bits
+
+   emitForm_A(i, op);
+   emitNegAbs12(i);
+}
+
+// Encode the CVT rounding mode: direction at code[1] bits 17-18, with bit 7
+// of code[0] marking the round-to-integer (*_I) variants.
+void
+CodeEmitterNVC0::roundMode_C(const Instruction *i)
+{
+   switch (i->rnd) {
+   case ROUND_M:  code[1] |= 1 << 17; break;
+   case ROUND_P:  code[1] |= 2 << 17; break;
+   case ROUND_Z:  code[1] |= 3 << 17; break;
+   case ROUND_NI: code[0] |= 1 << 7; break;
+   case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break;
+   case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break;
+   case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break;
+   case ROUND_N: break;
+   default:
+      assert(!"invalid round mode");
+      break;
+   }
+}
+
+// Short-form CVT rounding mode: only the direction can be encoded (at bits
+// 16-17), so integer and non-integer variants collapse together.
+void
+CodeEmitterNVC0::roundMode_CS(const Instruction *i)
+{
+   switch (i->rnd) {
+   case ROUND_M:
+   case ROUND_MI: code[0] |= 1 << 16; break;
+   case ROUND_P:
+   case ROUND_PI: code[0] |= 2 << 16; break;
+   case ROUND_Z:
+   case ROUND_ZI: code[0] |= 3 << 16; break;
+   default:
+      break;
+   }
+}
+
+// Emit conversions and CVT-based unary ops (CEIL/FLOOR/TRUNC/SAT/ABS/NEG);
+// mutates i->rnd for the rounding ops, hence the non-const parameter.
+void
+CodeEmitterNVC0::emitCVT(Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+
+   // rounding ops are encoded as conversions with a fixed rounding mode
+   switch (i->op) {
+   case OP_CEIL:  i->rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: i->rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   default:
+      break;
+   }
+
+   const bool sat = (i->op == OP_SAT) || i->saturate;
+   const bool abs = (i->op == OP_ABS) || i->src(0).mod.abs();
+   const bool neg = (i->op == OP_NEG) || i->src(0).mod.neg();
+
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(10000000, 00000004));
+
+      roundMode_C(i);
+
+      // cvt u16 f32 sets high bits to 0, so we don't have to use Value::Size()
+      code[0] |= util_logbase2(typeSizeof(i->dType)) << 20;
+      code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
+
+      if (sat)
+         code[0] |= 0x20;
+      if (abs)
+         code[0] |= 1 << 6;
+      if (neg && i->op != OP_ABS)
+         code[0] |= 1 << 8;
+
+      if (i->ftz)
+         code[1] |= 1 << 23;
+
+      if (isSignedIntType(i->dType))
+         code[0] |= 0x080;
+      if (isSignedIntType(i->sType))
+         code[0] |= 0x200;
+
+      // conversion class: F2F / I2F / F2I / I2I
+      if (isFloatType(i->dType)) {
+         if (!isFloatType(i->sType))
+            code[1] |= 0x08000000;
+      } else {
+         if (isFloatType(i->sType))
+            code[1] |= 0x04000000;
+         else
+            code[1] |= 0x0c000000;
+      }
+   } else {
+      // short form: pick the base opcode by conversion class
+      if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) {
+         code[0] = 0x298;
+      } else
+      if (isFloatType(i->dType)) {
+         if (isFloatType(i->sType))
+            code[0] = 0x098;
+         else
+            code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      } else {
+         assert(isFloatType(i->sType));
+
+         code[0] = 0x288 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      }
+
+      if (neg) code[0] |= 1 << 16;
+      if (sat) code[0] |= 1 << 18;
+      if (abs) code[0] |= 1 << 19;
+
+      roundMode_CS(i);
+   }
+}
+
+// Emit comparison/set ops (SET and the SET_AND/OR/XOR combining variants);
+// handles F32/F64/integer source types and both GPR and predicate dests.
+void
+CodeEmitterNVC0::emitSET(const CmpInstruction *i)
+{
+   uint32_t hi;
+   uint32_t lo = 0;
+
+   if (i->sType == TYPE_F64)
+      lo = 0x1;
+   else
+   if (!isFloatType(i->sType))
+      lo = 0x3;
+
+   if (isFloatType(i->dType) || isSignedIntType(i->sType))
+      lo |= 0x20;
+
+   switch (i->op) {
+   case OP_SET_AND: hi = 0x10000000; break;
+   case OP_SET_OR:  hi = 0x10200000; break;
+   case OP_SET_XOR: hi = 0x10400000; break;
+   default:
+      hi = 0x100e0000;
+      break;
+   }
+   emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo);
+
+   if (i->op != OP_SET) // combining variants take the previous result as src2
+      srcId(i->src(2), 32 + 17);
+
+   if (i->def(0).getFile() == FILE_PREDICATE) {
+      // switch to the predicate-destination opcode variant
+      if (i->sType == TYPE_F32)
+         code[1] += 0x10000000;
+      else
+         code[1] += 0x08000000;
+
+      code[0] &= ~0xfc000; // clear GPR def field, re-encode at bit 17
+      defId(i->def(0), 17);
+      if (i->defExists(1))
+         defId(i->def(1), 14);
+      else
+         code[0] |= 0x1c000;
+   }
+
+   if (i->ftz)
+      code[1] |= 1 << 27;
+
+   emitCondCode(i->setCond, 32 + 23);
+   emitNegAbs12(i);
+}
+
+// Emit select-on-compare: dst = (src2 <setCond> 0) ? src0 : src1; a negated
+// src2 is folded by reversing the condition code instead.
+void
+CodeEmitterNVC0::emitSLCT(const CmpInstruction *i)
+{
+   uint64_t op;
+
+   switch (i->dType) {
+   case TYPE_S32:
+      op = HEX64(30000000, 00000023);
+      break;
+   case TYPE_U32:
+      op = HEX64(30000000, 00000003);
+      break;
+   case TYPE_F32:
+      op = HEX64(38000000, 00000000);
+      break;
+   default:
+      assert(!"invalid type for SLCT");
+      op = 0;
+      break;
+   }
+   emitForm_A(i, op);
+
+   CondCode cc = i->setCond;
+
+   if (i->src(2).mod.neg())
+      cc = reverseCondCode(cc);
+
+   emitCondCode(cc, 32 + 23);
+
+   if (i->ftz)
+      code[0] |= 1 << 5;
+}
+
+// Emit predicate-based select: dst = src2(pred) ? src0 : src1, with bit 52
+// negating the predicate.
+void CodeEmitterNVC0::emitSELP(const Instruction *i)
+{
+   emitForm_A(i, HEX64(20000000, 00000004));
+
+   if (i->cc == CC_NOT_P || i->src(2).mod & Modifier(NV50_IR_MOD_NOT))
+      code[1] |= 1 << 20;
+}
+
+// Emit a texture barrier; subOp is the number of outstanding tex results to
+// wait below, and an optional flags source adds a condition code.
+void CodeEmitterNVC0::emitTEXBAR(const Instruction *i)
+{
+   code[0] = 0x00000006 | (i->subOp << 26);
+   code[1] = 0xf0000000;
+   emitPredicate(i);
+   emitCondCode(i->flagsSrc >= 0 ? i->cc : CC_ALWAYS, 5);
+}
+
+// Emit the coverage-sampled-AA texture fetch; texture and sampler indices
+// are encoded directly in code[1].
+void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xd0000000;
+
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9; // only execute for live (non-helper) threads
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+}
+
+// True if the next instruction is a texture op that does not read this
+// one's result — in that case the two fetches can overlap ("t mode").
+static inline bool
+isNextIndependentTex(const TexInstruction *i)
+{
+   if (!i->next || !isTextureOp(i->next->op))
+      return false;
+   if (i->getDef(0)->interfers(i->next->getSrc(0)))
+      return false;
+   return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1));
+}
+
+// Emit a texture fetch (TEX/TXB/TXL/TXF/TXG/TXD): encodes dependency mode,
+// target dimensionality/array/shadow flags, tex/sampler indices, write mask
+// and optional lod-zero / offset / multisample bits.
+void
+CodeEmitterNVC0::emitTEX(const TexInstruction *i)
+{
+   code[0] = 0x00000006;
+
+   if (isNextIndependentTex(i))
+      code[0] |= 0x080; // t mode
+   else
+      code[0] |= 0x100; // p mode
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9;
+
+   switch (i->op) {
+   case OP_TEX: code[1] = 0x80000000; break;
+   case OP_TXB: code[1] = 0x84000000; break;
+   case OP_TXL: code[1] = 0x86000000; break;
+   case OP_TXF: code[1] = 0x90000000; break;
+   case OP_TXG: code[1] = 0xa0000000; break;
+   case OP_TXD: code[1] = 0xe0000000; break;
+   default:
+      assert(!"invalid texture op");
+      break;
+   }
+   // NOTE: the lod-zero bit has inverted sense for TXF
+   if (i->op == OP_TXF) {
+      if (!i->tex.levelZero)
+         code[1] |= 0x02000000;
+   } else
+   if (i->tex.levelZero) {
+      code[1] |= 0x02000000;
+   }
+
+   if (i->op != OP_TXD && i->tex.derivAll)
+      code[1] |= 1 << 13;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+
+   emitPredicate(i);
+
+   if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+   if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0)
+      code[1] |= 1 << 18; // in 1st source (with array index)
+
+   // texture target:
+   code[1] |= (i->tex.target.getDim() - 1) << 20;
+   if (i->tex.target.isCube())
+      code[1] += 2 << 20;
+   if (i->tex.target.isArray())
+      code[1] |= 1 << 19;
+   if (i->tex.target.isShadow())
+      code[1] |= 1 << 24;
+
+   const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+   if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) {
+      // lzero
+      if (i->op == OP_TXL)
+         code[1] &= ~(1 << 26);
+      else
+      if (i->op == OP_TXF)
+         code[1] &= ~(1 << 25);
+   }
+   if (i->tex.target == TEX_TARGET_2D_MS ||
+       i->tex.target == TEX_TARGET_2D_MS_ARRAY)
+      code[1] |= 1 << 23;
+
+   if (i->tex.useOffsets) // in vecSrc0.w
+      code[1] |= 1 << 22;
+
+   srcId(i, src1, 26);
+}
+
+// Emit a texture query (dimensions, type, sample positions, ...); the query
+// kind goes into code[1] bits 22-24.
+void
+CodeEmitterNVC0::emitTXQ(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xc0000000;
+
+   switch (i->tex.query) {
+   case TXQ_DIMS:            code[1] |= 0 << 22; break;
+   case TXQ_TYPE:            code[1] |= 1 << 22; break;
+   case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break;
+   case TXQ_FILTER:          code[1] |= 3 << 22; break;
+   case TXQ_LOD:             code[1] |= 4 << 22; break;
+   case TXQ_BORDER_COLOUR:   code[1] |= 5 << 22; break;
+   default:
+      assert(!"invalid texture query");
+      break;
+   }
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;
+   code[1] |= i->tex.s << 8;
+   if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0)
+      code[1] |= 1 << 18;
+
+   const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+   srcId(i, src1, 26);
+
+   emitPredicate(i);
+}
+
+// Emit a quad-lane op (cross-lane data exchange within a 2x2 quad): 'qOp'
+// is the per-lane operation pattern, 'laneMask' selects which lanes write.
+void
+CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
+{
+   code[0] = 0x00000000 | (laneMask << 6);
+   code[1] = 0x48000000 | qOp;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+   // src1 defaults to src0 if absent
+   srcId(i->srcExists(1) ? i->src(1) : i->src(0), 26);
+
+   if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
+      code[0] |= 1 << 9; // dall
+
+   emitPredicate(i);
+}
+
+// Emit control-flow instructions (branches, calls, exits, joins, quad
+// push/pop, breakpoints). Branch targets are encoded as 24-bit relative
+// offsets; builtin calls get relocations resolved at upload time.
+void
+CodeEmitterNVC0::emitFlow(const Instruction *i)
+{
+   const FlowInstruction *f = i->asFlow();
+
+   unsigned mask; // bit 0: predicate, bit 1: target
+
+   code[0] = 0x00000007;
+
+   switch (i->op) {
+   case OP_BRA:
+      code[1] = f->absolute ? 0x00000000 : 0x40000000;
+      if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+         code[0] |= 0x4000; // branch target from c[] space
+      mask = 3;
+      break;
+   case OP_CALL:
+      code[1] = f->absolute ? 0x10000000 : 0x50000000;
+      if (f->indirect)
+         code[0] |= 0x4000; // indirect calls always use c[] source
+      mask = 2;
+      break;
+
+   case OP_EXIT:    code[1] = 0x80000000; mask = 1; break;
+   case OP_RET:     code[1] = 0x90000000; mask = 1; break;
+   case OP_DISCARD: code[1] = 0x98000000; mask = 1; break;
+   case OP_BREAK:   code[1] = 0xa8000000; mask = 1; break;
+   case OP_CONT:    code[1] = 0xb0000000; mask = 1; break;
+
+   case OP_JOINAT:   code[1] = 0x60000000; mask = 2; break;
+   case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break;
+   case OP_PRECONT:  code[1] = 0x70000000; mask = 2; break;
+   case OP_PRERET:   code[1] = 0x78000000; mask = 2; break;
+
+   case OP_QUADON:  code[1] = 0xc0000000; mask = 0; break;
+   case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break;
+   case OP_BRKPT:   code[1] = 0xd0000000; mask = 0; break;
+   default:
+      assert(!"invalid flow operation");
+      return;
+   }
+
+   if (mask & 1) {
+      emitPredicate(i);
+      if (i->flagsSrc < 0)
+         code[0] |= 0x1e0; // CC_ALWAYS
+   }
+
+   if (!f)
+      return;
+
+   if (f->allWarp)
+      code[0] |= 1 << 15;
+   if (f->limit)
+      code[0] |= 1 << 16;
+
+   if (f->indirect) {
+      if (code[0] & 0x4000) {
+         // target address comes from constant memory
+         assert(i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST);
+         setAddress16(i->src(0));
+         code[1] |= i->getSrc(0)->reg.fileIndex << 10;
+         if (f->op == OP_BRA)
+            srcId(f->src(0).getIndirect(0), 20);
+      } else {
+         srcId(f, 0, 20);
+      }
+   }
+
+   if (f->op == OP_CALL) {
+      if (f->indirect) {
+         // nothing
+      } else
+      if (f->builtin) {
+         // built-in library routine: leave relocations for the final offset
+         assert(f->absolute);
+         uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin);
+         addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26);
+         addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6);
+      } else {
+         assert(!f->absolute);
+         int32_t pcRel = f->target.fn->binPos - (codeSize + 8);
+         code[0] |= (pcRel & 0x3f) << 26;
+         code[1] |= (pcRel >> 6) & 0x3ffff;
+      }
+   } else
+   if (mask & 2) {
+      int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      // currently we don't want absolute branches
+      assert(!f->absolute);
+      code[0] |= (pcRel & 0x3f) << 26;
+      code[1] |= (pcRel >> 6) & 0x3ffff;
+   }
+}
+
+// Emit a barrier instruction (BAR.SYNC / BAR.ARRIVE / BAR.RED.{AND,OR,POPC}).
+// src(0) = barrier id (GPR or immediate), src(1) = thread count (GPR or
+// immediate), src(2) = optional predicate input for the reduction variants.
+// A GPR def and/or a predicate def receive the reduction result.
+void
+CodeEmitterNVC0::emitBAR(const Instruction *i)
+{
+ Value *rDef = NULL, *pDef = NULL;
+
+ // opcode low bits select the barrier flavour
+ switch (i->subOp) {
+ case NV50_IR_SUBOP_BAR_ARRIVE: code[0] = 0x84; break;
+ case NV50_IR_SUBOP_BAR_RED_AND: code[0] = 0x24; break;
+ case NV50_IR_SUBOP_BAR_RED_OR: code[0] = 0x44; break;
+ case NV50_IR_SUBOP_BAR_RED_POPC: code[0] = 0x04; break;
+ default:
+ code[0] = 0x04;
+ assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC);
+ break;
+ }
+ code[1] = 0x50000000;
+
+ // default to "no destination": $r63 (bit-bucket GPR) and predicate 7
+ code[0] |= 63 << 14;
+ code[1] |= 7 << 21;
+
+ emitPredicate(i);
+
+ // barrier id
+ if (i->src(0).getFile() == FILE_GPR) {
+ srcId(i->src(0), 20);
+ } else {
+ ImmediateValue *imm = i->getSrc(0)->asImm();
+ assert(imm);
+ code[0] |= imm->reg.data.u32 << 20;
+ }
+
+ // thread count
+ if (i->src(1).getFile() == FILE_GPR) {
+ srcId(i->src(1), 26);
+ } else {
+ ImmediateValue *imm = i->getSrc(1)->asImm();
+ assert(imm);
+ code[0] |= imm->reg.data.u32 << 26;
+ code[1] |= imm->reg.data.u32 >> 6;
+ }
+
+ // optional predicate input for the reduction (skip if it is the
+ // instruction's own predicate source); 7 = "always true"
+ if (i->srcExists(2) && (i->predSrc != 2)) {
+ srcId(i->src(2), 32 + 17);
+ if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT))
+ code[1] |= 1 << 20;
+ } else {
+ code[1] |= 7 << 17;
+ }
+
+ // sort the (up to two) defs into a GPR def and a predicate def
+ if (i->defExists(0)) {
+ if (i->def(0).getFile() == FILE_GPR)
+ rDef = i->getDef(0);
+ else
+ pDef = i->getDef(0);
+
+ if (i->defExists(1)) {
+ if (i->def(1).getFile() == FILE_GPR)
+ rDef = i->getDef(1);
+ else
+ pDef = i->getDef(1);
+ }
+ }
+ // overwrite the bit-bucket defaults with real destinations
+ if (rDef) {
+ code[0] &= ~(63 << 14);
+ defId(rDef, 14);
+ }
+ if (pDef) {
+ code[1] &= ~(7 << 21);
+ defId(pDef, 32 + 21);
+ }
+}
+
+// Emit a primitive/vertex attribute fetch (PFETCH).
+// src(0) holds an immediate primitive index split across both code words;
+// src(1) is an optional indirect register added at bit 20.
+void
+CodeEmitterNVC0::emitPFETCH(const Instruction *i)
+{
+ uint32_t prim = i->src(0).get()->reg.data.u32;
+
+ // low 6 bits of the index go in word 0, the rest in word 1
+ code[0] = 0x00000006 | ((prim & 0x3f) << 26);
+ code[1] = 0x00000000 | (prim >> 6);
+
+ emitPredicate(i);
+
+ defId(i->def(0), 14);
+ srcId(i->src(1), 20);
+}
+
+// Emit a vertex/patch attribute fetch (VFETCH / ALD).
+// src(0) carries the attribute offset plus up to two indirect addresses
+// (offset register and vertex address register).
+void
+CodeEmitterNVC0::emitVFETCH(const Instruction *i)
+{
+ code[0] = 0x00000006;
+ code[1] = 0x06000000 | i->src(0).get()->reg.data.offset;
+
+ if (i->perPatch)
+ code[0] |= 0x100;
+ if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+ code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
+
+ emitPredicate(i);
+
+ // vector size in 32-bit words, encoded as (count - 1)
+ code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5;
+
+ defId(i->def(0), 14);
+ srcId(i->src(0).getIndirect(0), 20);
+ srcId(i->src(0).getIndirect(1), 26); // vertex address
+}
+
+// Emit an attribute export (EXPORT / AST): store src(1) to the shader
+// output at the offset given by src(0), with optional indirect offset and
+// vertex base address.
+void
+CodeEmitterNVC0::emitEXPORT(const Instruction *i)
+{
+ unsigned int size = typeSizeof(i->dType);
+
+ // vector size in 32-bit words, encoded as (count - 1)
+ code[0] = 0x00000006 | ((size / 4 - 1) << 5);
+ code[1] = 0x0a000000 | i->src(0).get()->reg.data.offset;
+
+ // offset must be naturally aligned (16-byte for 12-byte stores)
+ assert(!(code[1] & ((size == 12) ? 15 : (size - 1))));
+
+ if (i->perPatch)
+ code[0] |= 0x100;
+
+ emitPredicate(i);
+
+ assert(i->src(1).getFile() == FILE_GPR);
+
+ srcId(i->src(0).getIndirect(0), 20);
+ srcId(i->src(0).getIndirect(1), 32 + 17); // vertex base address
+ srcId(i->src(1), 26);
+}
+
+// Emit a geometry shader stream output op (EMIT / RESTART / EMIT+RESTART).
+// The hardware threads an opaque "secret address" through the ops:
+// src(0) is the previous one, def(0) receives the updated one.
+// src(1) selects the vertex stream (immediate or GPR).
+void
+CodeEmitterNVC0::emitOUT(const Instruction *i)
+{
+ code[0] = 0x00000006;
+ code[1] = 0x1c000000;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 14); // new secret address
+ srcId(i->src(0), 20); // old secret address, should be 0 initially
+
+ assert(i->src(0).getFile() == FILE_GPR);
+
+ if (i->op == OP_EMIT)
+ code[0] |= 1 << 5;
+ if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART)
+ code[0] |= 1 << 6;
+
+ // vertex stream
+ if (i->src(1).getFile() == FILE_IMMEDIATE) {
+ code[1] |= 0xc000;
+ code[0] |= SDATA(i->src(1)).u32 << 26;
+ } else {
+ srcId(i->src(1), 26);
+ }
+}
+
+// Encode the interpolation mode bits for an IPA instruction;
+// layout differs between the long (8-byte) and short encodings.
+void
+CodeEmitterNVC0::emitInterpMode(const Instruction *i)
+{
+ if (i->encSize == 8) {
+ code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID
+ } else {
+ // short encoding only supports PINTERP without multisample modes
+ if (i->getInterpMode() == NV50_IR_INTERP_SC)
+ code[0] |= 0x80;
+ assert(i->op == OP_PINTERP && i->getSampleMode() == 0);
+ }
+}
+
+// Emit a varying interpolation (LINTERP / PINTERP, i.e. IPA).
+// src(0) is the input attribute (offset + optional indirect address);
+// for PINTERP, src(1) is the perspective divisor (1/w).
+void
+CodeEmitterNVC0::emitINTERP(const Instruction *i)
+{
+ const uint32_t base = i->getSrc(0)->reg.data.offset;
+
+ if (i->encSize == 8) {
+ code[0] = 0x00000000;
+ code[1] = 0xc0000000 | (base & 0xffff);
+
+ if (i->saturate)
+ code[0] |= 1 << 5;
+
+ if (i->op == OP_PINTERP)
+ srcId(i->src(1), 26);
+ else
+ code[0] |= 0x3f << 26; // $r63: no perspective divisor
+
+ srcId(i->src(0).getIndirect(0), 20);
+ } else {
+ assert(i->op == OP_PINTERP);
+ code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26);
+ srcId(i->src(1), 20);
+ }
+ emitInterpMode(i);
+
+ emitPredicate(i);
+ defId(i->def(0), 14);
+
+ // sample/offset source, or 0x3f (none) if not using per-sample offsets
+ if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
+ srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 17);
+ else
+ code[1] |= 0x3f << 17;
+}
+
+// OR the size/signedness field for a load/store into code[0].
+// The encoding only distinguishes width and (for sub-word types) sign;
+// e.g. F32/U32/S32 all map to the same 32-bit value.
+void
+CodeEmitterNVC0::emitLoadStoreType(DataType ty)
+{
+ uint8_t val;
+
+ switch (ty) {
+ case TYPE_U8:
+ val = 0x00;
+ break;
+ case TYPE_S8:
+ val = 0x20;
+ break;
+ case TYPE_F16:
+ case TYPE_U16:
+ val = 0x40;
+ break;
+ case TYPE_S16:
+ val = 0x60;
+ break;
+ case TYPE_F32:
+ case TYPE_U32:
+ case TYPE_S32:
+ val = 0x80;
+ break;
+ case TYPE_F64:
+ case TYPE_U64:
+ case TYPE_S64:
+ val = 0xa0;
+ break;
+ case TYPE_B128:
+ val = 0xc0;
+ break;
+ default:
+ val = 0x80; // fall back to 32-bit so we still emit something sane
+ assert(!"invalid type");
+ break;
+ }
+ code[0] |= val;
+}
+
+// OR the cache-policy field for a load/store into code[0]
+// (CA = cache all levels, CG = global, CS = streaming, CV = volatile).
+void
+CodeEmitterNVC0::emitCachingMode(CacheMode c)
+{
+ uint32_t val;
+
+ switch (c) {
+ case CACHE_CA:
+// case CACHE_WB:
+ val = 0x000;
+ break;
+ case CACHE_CG:
+ val = 0x100;
+ break;
+ case CACHE_CS:
+ val = 0x200;
+ break;
+ case CACHE_CV:
+// case CACHE_WT:
+ val = 0x300;
+ break;
+ default:
+ val = 0;
+ assert(!"invalid caching mode");
+ break;
+ }
+ code[0] |= val;
+}
+
+// True if a load/store accesses global memory through a 64-bit
+// indirect address register (selects the E variant of the opcode).
+static inline bool
+uses64bitAddress(const Instruction *ldst)
+{
+ return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL &&
+ ldst->src(0).isIndirect(0) &&
+ ldst->getIndirect(0, 0)->reg.size == 8;
+}
+
+// Emit a memory store: src(0) is the address (selects ST/STL/STS by file),
+// src(1) the value; type and caching mode come from the instruction.
+void
+CodeEmitterNVC0::emitSTORE(const Instruction *i)
+{
+ uint32_t opc;
+
+ switch (i->src(0).getFile()) {
+ case FILE_MEMORY_GLOBAL: opc = 0x90000000; break;
+ case FILE_MEMORY_LOCAL: opc = 0xc8000000; break;
+ case FILE_MEMORY_SHARED: opc = 0xc9000000; break;
+ default:
+ assert(!"invalid memory file");
+ opc = 0;
+ break;
+ }
+ code[0] = 0x00000005;
+ code[1] = opc;
+
+ setAddressByFile(i->src(0));
+ srcId(i->src(1), 14);
+ srcId(i->src(0).getIndirect(0), 20);
+ if (uses64bitAddress(i))
+ code[1] |= 1 << 26; // 64-bit (E) addressing
+
+ emitPredicate(i);
+
+ emitLoadStoreType(i->dType);
+ emitCachingMode(i->cache);
+}
+
+// Emit a memory load: src(0) selects LD/LDL/LDS/LDC by memory file.
+// Direct 32-bit constant-buffer loads are emitted as a MOV instead.
+void
+CodeEmitterNVC0::emitLOAD(const Instruction *i)
+{
+ uint32_t opc;
+
+ code[0] = 0x00000005;
+
+ switch (i->src(0).getFile()) {
+ case FILE_MEMORY_GLOBAL: opc = 0x80000000; break;
+ case FILE_MEMORY_LOCAL: opc = 0xc0000000; break;
+ case FILE_MEMORY_SHARED: opc = 0xc1000000; break;
+ case FILE_MEMORY_CONST:
+ if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
+ emitMOV(i); // not sure if this is any better
+ return;
+ }
+ // LDC: constant buffer index goes into the opcode itself
+ opc = 0x14000000 | (i->src(0).get()->reg.fileIndex << 10);
+ code[0] = 0x00000006 | (i->subOp << 8);
+ break;
+ default:
+ assert(!"invalid memory file");
+ opc = 0;
+ break;
+ }
+ code[1] = opc;
+
+ defId(i->def(0), 14);
+
+ setAddressByFile(i->src(0));
+ srcId(i->src(0).getIndirect(0), 20);
+ if (uses64bitAddress(i))
+ code[1] |= 1 << 26; // 64-bit (E) addressing
+
+ emitPredicate(i);
+
+ emitLoadStoreType(i->dType);
+ emitCachingMode(i->cache);
+}
+
+// Map an IR system value (SV_*) to the hardware special-register index
+// used by S2R; indexed values (TID, CTAID, ...) add their component index.
+uint8_t
+CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
+{
+ switch (SDATA(ref).sv.sv) {
+ case SV_LANEID: return 0x00;
+ case SV_PHYSID: return 0x03;
+ case SV_VERTEX_COUNT: return 0x10;
+ case SV_INVOCATION_ID: return 0x11;
+ case SV_YDIR: return 0x12;
+ case SV_TID: return 0x21 + SDATA(ref).sv.index;
+ case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
+ case SV_NTID: return 0x29 + SDATA(ref).sv.index;
+ case SV_GRIDID: return 0x2c;
+ case SV_NCTAID: return 0x2d + SDATA(ref).sv.index;
+ case SV_LBASE: return 0x34;
+ case SV_SBASE: return 0x30;
+ case SV_CLOCK: return 0x50 + SDATA(ref).sv.index;
+ default:
+ assert(!"no sreg for system value");
+ return 0;
+ }
+}
+
+// Emit a move. Covers several distinct encodings depending on the
+// destination/source files: GPR/immediate -> predicate, system value
+// read (S2R), the generic long form, and the 4-byte short forms.
+void
+CodeEmitterNVC0::emitMOV(const Instruction *i)
+{
+ if (i->def(0).getFile() == FILE_PREDICATE) {
+ if (i->src(0).getFile() == FILE_GPR) {
+ // ISETP $p, $r, 0 style encoding to materialize a predicate
+ code[0] = 0xfc01c003;
+ code[1] = 0x1a8e0000;
+ srcId(i->src(0), 20);
+ } else {
+ code[0] = 0x0001c004;
+ code[1] = 0x0c0e0000;
+ if (i->src(0).getFile() == FILE_IMMEDIATE) {
+ code[0] |= 7 << 20;
+ if (!i->getSrc(0)->reg.data.u32)
+ code[0] |= 1 << 23; // invert: immediate 0 -> false
+ } else {
+ srcId(i->src(0), 20);
+ }
+ }
+ defId(i->def(0), 17);
+ emitPredicate(i);
+ } else
+ if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
+ uint8_t sr = getSRegEncoding(i->src(0));
+
+ if (i->encSize == 8) {
+ code[0] = 0x00000004 | (sr << 26);
+ code[1] = 0x2c000000;
+ } else {
+ code[0] = 0x40000008 | (sr << 20);
+ }
+ defId(i->def(0), 14);
+
+ emitPredicate(i);
+ } else
+ if (i->encSize == 8) {
+ uint64_t opc;
+
+ if (i->src(0).getFile() == FILE_IMMEDIATE)
+ opc = HEX64(18000000, 000001e2);
+ else
+ if (i->src(0).getFile() == FILE_PREDICATE)
+ opc = HEX64(080e0000, 1c000004);
+ else
+ opc = HEX64(28000000, 00000004);
+
+ opc |= i->lanes << 5;
+
+ emitForm_B(i, opc);
+ } else {
+ uint32_t imm;
+
+ if (i->src(0).getFile() == FILE_IMMEDIATE) {
+ imm = SDATA(i->src(0)).u32;
+ if (imm & 0xfff00000) {
+ // high-immediate form: low 20 bits must be clear
+ assert(!(imm & 0x000fffff));
+ code[0] = 0x00000318 | imm;
+ } else {
+ // NOTE(review): this assert looks tautological for unsigned imm
+ // (one side of the || is almost always true) — verify intent
+ assert(imm < 0x800 || ((int32_t)imm >= -0x800));
+ code[0] = 0x00000118 | (imm << 20);
+ }
+ } else {
+ code[0] = 0x0028;
+ emitShortSrc2(i->src(0));
+ }
+ defId(i->def(0), 14);
+
+ emitPredicate(i);
+ }
+}
+
+// Emit an atomic memory operation (ATOM with result, or RED reduction
+// when there is no destination). src(0) = memory address, src(1) = operand,
+// src(2) = compare value for CAS. EXCH and CAS always need the ATOM form.
+void
+CodeEmitterNVC0::emitATOM(const Instruction *i)
+{
+ const bool hasDst = i->defExists(0);
+ const bool casOrExch =
+ i->subOp == NV50_IR_SUBOP_ATOM_EXCH ||
+ i->subOp == NV50_IR_SUBOP_ATOM_CAS;
+
+ // NOTE(review): if dType is none of U64/U32/S32/F32, code[] is left
+ // unwritten here — presumably unreachable; confirm upstream.
+ if (i->dType == TYPE_U64) {
+ switch (i->subOp) {
+ case NV50_IR_SUBOP_ATOM_ADD:
+ code[0] = 0x205;
+ if (hasDst)
+ code[1] = 0x507e0000;
+ else
+ code[1] = 0x10000000;
+ break;
+ case NV50_IR_SUBOP_ATOM_EXCH:
+ code[0] = 0x305;
+ code[1] = 0x507e0000;
+ break;
+ case NV50_IR_SUBOP_ATOM_CAS:
+ code[0] = 0x325;
+ code[1] = 0x50000000;
+ break;
+ default:
+ assert(!"invalid u64 red op");
+ break;
+ }
+ } else
+ if (i->dType == TYPE_U32) {
+ switch (i->subOp) {
+ case NV50_IR_SUBOP_ATOM_EXCH:
+ code[0] = 0x105;
+ code[1] = 0x507e0000;
+ break;
+ case NV50_IR_SUBOP_ATOM_CAS:
+ code[0] = 0x125;
+ code[1] = 0x50000000;
+ break;
+ default:
+ // generic u32 op: sub-op encoded directly in the opcode
+ code[0] = 0x5 | (i->subOp << 5);
+ if (hasDst)
+ code[1] = 0x507e0000;
+ else
+ code[1] = 0x10000000;
+ break;
+ }
+ } else
+ if (i->dType == TYPE_S32) {
+ assert(i->subOp <= 2);
+ code[0] = 0x205 | (i->subOp << 5);
+ if (hasDst)
+ code[1] = 0x587e0000;
+ else
+ code[1] = 0x18000000;
+ } else
+ if (i->dType == TYPE_F32) {
+ assert(i->subOp == NV50_IR_SUBOP_ATOM_ADD);
+ code[0] = 0x205;
+ if (hasDst)
+ code[1] = 0x687e0000;
+ else
+ code[1] = 0x28000000;
+ }
+
+ emitPredicate(i);
+
+ srcId(i->src(1), 14);
+
+ if (hasDst)
+ defId(i->def(0), 32 + 11);
+ else
+ if (casOrExch)
+ code[1] |= 63 << 11; // ATOM form but discard result into $r63
+
+ if (hasDst || casOrExch) {
+ // ATOM form: 20-bit signed offset split across both code words
+ const int32_t offset = SDATA(i->src(0)).offset;
+ assert(offset < 0x80000 && offset >= -0x80000);
+ code[0] |= offset << 26;
+ code[1] |= (offset & 0x1ffc0) >> 6;
+ code[1] |= (offset & 0xe0000) << 6;
+ } else {
+ // RED form uses a full 32-bit address field
+ srcAddr32(i->src(0), 26, 0);
+ }
+ if (i->getIndirect(0, 0)) {
+ srcId(i->getIndirect(0, 0), 20);
+ if (i->getIndirect(0, 0)->reg.size == 8)
+ code[1] |= 1 << 26; // 64-bit addressing
+ } else {
+ code[0] |= 63 << 20; // no address register: $r63
+ }
+
+ if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
+ srcId(i->src(2), 32 + 17);
+}
+
+// Emit a memory barrier with CTA, GL (device) or SYS scope.
+void
+CodeEmitterNVC0::emitMEMBAR(const Instruction *i)
+{
+ switch (NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp)) {
+ case NV50_IR_SUBOP_MEMBAR_CTA: code[0] = 0x05; break;
+ case NV50_IR_SUBOP_MEMBAR_GL: code[0] = 0x25; break;
+ default:
+ code[0] = 0x45;
+ assert(NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) == NV50_IR_SUBOP_MEMBAR_SYS);
+ break;
+ }
+ code[1] = 0xe0000000;
+
+ emitPredicate(i);
+}
+
+// Emit a cache control op (CCTL); encoding differs for global vs.
+// local/shared memory addresses.
+void
+CodeEmitterNVC0::emitCCTL(const Instruction *i)
+{
+ code[0] = 0x00000005 | (i->subOp << 5);
+
+ if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+ code[1] = 0x98000000;
+ srcAddr32(i->src(0), 28, 2);
+ } else {
+ code[1] = 0xd0000000;
+ setAddress24(i->src(0));
+ }
+ if (uses64bitAddress(i))
+ code[1] |= 1 << 26; // 64-bit addressing
+ srcId(i->src(0).getIndirect(0), 20);
+
+ emitPredicate(i);
+
+ defId(i, 0, 14);
+}
+
+// Encode the SUCLAMP mode field: SD/PL/BL variant with a 0..4 size
+// selector maps to mode values 0..14; the 2D flag sets a bit in code[1].
+void
+CodeEmitterNVC0::emitSUCLAMPMode(uint16_t subOp)
+{
+ uint8_t m;
+ switch (subOp & ~NV50_IR_SUBOP_SUCLAMP_2D) {
+ case NV50_IR_SUBOP_SUCLAMP_SD(0, 1): m = 0; break;
+ case NV50_IR_SUBOP_SUCLAMP_SD(1, 1): m = 1; break;
+ case NV50_IR_SUBOP_SUCLAMP_SD(2, 1): m = 2; break;
+ case NV50_IR_SUBOP_SUCLAMP_SD(3, 1): m = 3; break;
+ case NV50_IR_SUBOP_SUCLAMP_SD(4, 1): m = 4; break;
+ case NV50_IR_SUBOP_SUCLAMP_PL(0, 1): m = 5; break;
+ case NV50_IR_SUBOP_SUCLAMP_PL(1, 1): m = 6; break;
+ case NV50_IR_SUBOP_SUCLAMP_PL(2, 1): m = 7; break;
+ case NV50_IR_SUBOP_SUCLAMP_PL(3, 1): m = 8; break;
+ case NV50_IR_SUBOP_SUCLAMP_PL(4, 1): m = 9; break;
+ case NV50_IR_SUBOP_SUCLAMP_BL(0, 1): m = 10; break;
+ case NV50_IR_SUBOP_SUCLAMP_BL(1, 1): m = 11; break;
+ case NV50_IR_SUBOP_SUCLAMP_BL(2, 1): m = 12; break;
+ case NV50_IR_SUBOP_SUCLAMP_BL(3, 1): m = 13; break;
+ case NV50_IR_SUBOP_SUCLAMP_BL(4, 1): m = 14; break;
+ default:
+ return; // unknown mode: leave the field untouched
+ }
+ code[0] |= m << 5;
+ if (subOp & NV50_IR_SUBOP_SUCLAMP_2D)
+ code[1] |= 1 << 16;
+}
+
+// Emit the surface address calculation helpers SUCLAMP / SUBFM / SUEAU.
+// For SUCLAMP, an immediate src(2) is temporarily detached so emitForm_A
+// does not reject it, then re-attached and encoded as a 6-bit field.
+void
+CodeEmitterNVC0::emitSUCalc(Instruction *i)
+{
+ ImmediateValue *imm = NULL;
+ uint64_t opc;
+
+ if (i->srcExists(2)) {
+ imm = i->getSrc(2)->asImm();
+ if (imm)
+ i->setSrc(2, NULL); // special case, make emitForm_A not assert
+ }
+
+ switch (i->op) {
+ case OP_SUCLAMP: opc = HEX64(58000000, 00000004); break;
+ case OP_SUBFM: opc = HEX64(5c000000, 00000004); break;
+ case OP_SUEAU: opc = HEX64(60000000, 00000004); break;
+ default:
+ assert(0);
+ return;
+ }
+ emitForm_A(i, opc);
+
+ if (i->op == OP_SUCLAMP) {
+ if (i->dType == TYPE_S32)
+ code[0] |= 1 << 9;
+ emitSUCLAMPMode(i->subOp);
+ }
+
+ if (i->op == OP_SUBFM && i->subOp == NV50_IR_SUBOP_SUBFM_3D)
+ code[1] |= 1 << 16;
+
+ // encode the predicate destination (SUEAU has none)
+ if (i->op != OP_SUEAU) {
+ if (i->def(0).getFile() == FILE_PREDICATE) { // p, #
+ code[0] |= 63 << 14;
+ code[1] |= i->getDef(0)->reg.data.id << 23;
+ } else
+ if (i->defExists(1)) { // r, p
+ assert(i->def(1).getFile() == FILE_PREDICATE);
+ code[1] |= i->getDef(1)->reg.data.id << 23;
+ } else { // r, #
+ code[1] |= 7 << 23;
+ }
+ }
+ if (imm) {
+ assert(i->op == OP_SUCLAMP);
+ i->setSrc(2, imm); // restore the source we detached above
+ code[1] |= (imm->reg.data.u32 & 0x3f) << 17; // sint6
+ }
+}
+
+// Encode the surface coordinate type (U32 is the default, value 0).
+void
+CodeEmitterNVC0::emitSUGType(DataType ty)
+{
+ switch (ty) {
+ case TYPE_S32: code[1] |= 1 << 13; break;
+ case TYPE_U8: code[1] |= 2 << 13; break;
+ case TYPE_S8: code[1] |= 3 << 13; break;
+ default:
+ assert(ty == TYPE_U32);
+ break;
+ }
+}
+
+// Encode a 16-bit constant-buffer reference (offset + buffer index) for
+// the format descriptor source of a surface op. Offset must be in the
+// low 16 bits and 4-byte aligned.
+void
+CodeEmitterNVC0::setSUConst16(const Instruction *i, const int s)
+{
+ const uint32_t offset = i->getSrc(s)->reg.data.offset;
+
+ assert(i->src(s).getFile() == FILE_MEMORY_CONST);
+ assert(offset == (offset & 0xfffc));
+
+ code[1] |= 1 << 21; // flag: source is c[], not a GPR
+ code[0] |= offset << 24;
+ code[1] |= offset >> 8;
+ code[1] |= i->getSrc(s)->reg.fileIndex << 8;
+}
+
+// Encode the optional predicate source of a surface op; 7 means
+// "always true". The instruction's own predicate source is skipped.
+void
+CodeEmitterNVC0::setSUPred(const Instruction *i, const int s)
+{
+ if (!i->srcExists(s) || (i->predSrc == s)) {
+ code[1] |= 0x7 << 17;
+ } else {
+ if (i->src(s).mod == Modifier(NV50_IR_MOD_NOT))
+ code[1] |= 1 << 20;
+ srcId(i->src(s), 32 + 17);
+ }
+}
+
+// Emit a bindless surface load (SULDGB, GK104+).
+// src(0) = address, src(1) = format descriptor (GPR or c[]),
+// src(2) = optional predicate.
+void
+CodeEmitterNVC0::emitSULDGB(const TexInstruction *i)
+{
+ code[0] = 0x5;
+ code[1] = 0xd4000000 | (i->subOp << 15);
+
+ emitLoadStoreType(i->dType);
+ emitSUGType(i->sType);
+ emitCachingMode(i->cache);
+
+ emitPredicate(i);
+ defId(i->def(0), 14); // destination
+ srcId(i->src(0), 20); // address
+ // format
+ if (i->src(1).getFile() == FILE_GPR)
+ srcId(i->src(1), 26);
+ else
+ setSUConst16(i, 1);
+ setSUPred(i, 2);
+}
+
+// Emit a bindless surface store (SUSTGB/SUSTGP, GK104+).
+// SUSTP encodes a component write mask instead of a load/store type.
+// src(0) = address, src(1) = format descriptor, src(2) = predicate,
+// src(3) = values to store.
+void
+CodeEmitterNVC0::emitSUSTGx(const TexInstruction *i)
+{
+ code[0] = 0x5;
+ code[1] = 0xdc000000 | (i->subOp << 15);
+
+ if (i->op == OP_SUSTP)
+ code[1] |= i->tex.mask << 22;
+ else
+ emitLoadStoreType(i->dType);
+ emitSUGType(i->sType);
+ emitCachingMode(i->cache);
+
+ emitPredicate(i);
+ srcId(i->src(0), 20); // address
+ // format
+ if (i->src(1).getFile() == FILE_GPR)
+ srcId(i->src(1), 26);
+ else
+ setSUConst16(i, 1);
+ srcId(i->src(3), 14); // values
+ setSUPred(i, 2);
+}
+
+// Scatter the packed video/vector sub-op fields (source component
+// selectors and destination selector) into code[1]; the field layout
+// depends on the vector width class (Vn = scalar / V2 / V4).
+void
+CodeEmitterNVC0::emitVectorSubOp(const Instruction *i)
+{
+ switch (NV50_IR_SUBOP_Vn(i->subOp)) {
+ case 0:
+ code[1] |= (i->subOp & 0x000f) << 12; // vsrc1
+ code[1] |= (i->subOp & 0x00e0) >> 5; // vsrc2
+ code[1] |= (i->subOp & 0x0100) << 7; // vsrc2
+ code[1] |= (i->subOp & 0x3c00) << 13; // vdst
+ break;
+ case 1:
+ code[1] |= (i->subOp & 0x000f) << 8; // v2src1
+ code[1] |= (i->subOp & 0x0010) << 11; // v2src1
+ code[1] |= (i->subOp & 0x01e0) >> 1; // v2src2
+ code[1] |= (i->subOp & 0x0200) << 6; // v2src2
+ code[1] |= (i->subOp & 0x3c00) << 2; // v4dst
+ code[1] |= (i->mask & 0x3) << 2;
+ break;
+ case 2:
+ code[1] |= (i->subOp & 0x000f) << 8; // v4src1
+ code[1] |= (i->subOp & 0x01e0) >> 1; // v4src2
+ code[1] |= (i->subOp & 0x3c00) << 2; // v4dst
+ code[1] |= (i->mask & 0x3) << 2;
+ code[1] |= (i->mask & 0xc) << 21;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+// Emit a video vector shift-left (VSHL); opcode and signedness bit
+// positions differ per vector width class.
+void
+CodeEmitterNVC0::emitVSHL(const Instruction *i)
+{
+ uint64_t opc = 0x4;
+
+ switch (NV50_IR_SUBOP_Vn(i->subOp)) {
+ case 0: opc |= 0xe8ULL << 56; break;
+ case 1: opc |= 0xb4ULL << 56; break;
+ case 2: opc |= 0x94ULL << 56; break;
+ default:
+ assert(0);
+ break;
+ }
+ if (NV50_IR_SUBOP_Vn(i->subOp) == 1) {
+ if (isSignedType(i->dType)) opc |= 1ULL << 0x2a;
+ if (isSignedType(i->sType)) opc |= (1 << 6) | (1 << 5);
+ } else {
+ if (isSignedType(i->dType)) opc |= 1ULL << 0x39;
+ if (isSignedType(i->sType)) opc |= 1 << 6;
+ }
+ emitForm_A(i, opc);
+ emitVectorSubOp(i);
+
+ if (i->saturate)
+ code[0] |= 1 << 9;
+ if (i->flagsDef >= 0)
+ code[1] |= 1 << 16;
+}
+
+// Encode one instruction into the output buffer, dispatching on its op.
+// On GK104+ (writeIssueDelays), every 64-byte group starts with a
+// scheduling control word into which each instruction's issue-delay
+// bits are packed. Returns false if the instruction cannot be encoded
+// or the output buffer is exhausted.
+bool
+CodeEmitterNVC0::emitInstruction(Instruction *insn)
+{
+ unsigned int size = insn->encSize;
+
+ // account for the sched control word that starts each 64-byte group
+ if (writeIssueDelays && !(codeSize & 0x3f))
+ size += 8;
+
+ if (!insn->encSize) {
+ ERROR("skipping unencodable instruction: "); insn->print();
+ return false;
+ } else
+ if (codeSize + size > codeSizeLimit) {
+ ERROR("code emitter output buffer too small\n");
+ return false;
+ }
+
+ if (writeIssueDelays) {
+ if (!(codeSize & 0x3f)) {
+ code[0] = 0x00000007; // cf issue delay "instruction"
+ code[1] = 0x20000000;
+ code += 2;
+ codeSize += 8;
+ }
+ // pack this instruction's 8 sched bits into the group's control word
+ const unsigned int id = (codeSize & 0x3f) / 8 - 1;
+ uint32_t *data = code - (id * 2 + 2);
+ if (id <= 2) {
+ data[0] |= insn->sched << (id * 8 + 4);
+ } else
+ if (id == 3) {
+ // slot 3 straddles the two 32-bit halves
+ data[0] |= insn->sched << 28;
+ data[1] |= insn->sched >> 4;
+ } else {
+ data[1] |= insn->sched << ((id - 4) * 8 + 4);
+ }
+ }
+
+ // assert that instructions with multiple defs don't corrupt registers
+ for (int d = 0; insn->defExists(d); ++d)
+ assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
+
+ switch (insn->op) {
+ case OP_MOV:
+ case OP_RDSV:
+ emitMOV(insn);
+ break;
+ case OP_NOP:
+ break;
+ case OP_LOAD:
+ emitLOAD(insn);
+ break;
+ case OP_STORE:
+ emitSTORE(insn);
+ break;
+ case OP_LINTERP:
+ case OP_PINTERP:
+ emitINTERP(insn);
+ break;
+ case OP_VFETCH:
+ emitVFETCH(insn);
+ break;
+ case OP_EXPORT:
+ emitEXPORT(insn);
+ break;
+ case OP_PFETCH:
+ emitPFETCH(insn);
+ break;
+ case OP_EMIT:
+ case OP_RESTART:
+ emitOUT(insn);
+ break;
+ case OP_ADD:
+ case OP_SUB:
+ if (isFloatType(insn->dType))
+ emitFADD(insn);
+ else
+ emitUADD(insn);
+ break;
+ case OP_MUL:
+ if (isFloatType(insn->dType))
+ emitFMUL(insn);
+ else
+ emitUMUL(insn);
+ break;
+ case OP_MAD:
+ case OP_FMA:
+ if (isFloatType(insn->dType))
+ emitFMAD(insn);
+ else
+ emitIMAD(insn);
+ break;
+ case OP_SAD:
+ emitISAD(insn);
+ break;
+ case OP_NOT:
+ emitNOT(insn);
+ break;
+ case OP_AND:
+ emitLogicOp(insn, 0);
+ break;
+ case OP_OR:
+ emitLogicOp(insn, 1);
+ break;
+ case OP_XOR:
+ emitLogicOp(insn, 2);
+ break;
+ case OP_SHL:
+ case OP_SHR:
+ emitShift(insn);
+ break;
+ case OP_SET:
+ case OP_SET_AND:
+ case OP_SET_OR:
+ case OP_SET_XOR:
+ emitSET(insn->asCmp());
+ break;
+ case OP_SELP:
+ emitSELP(insn);
+ break;
+ case OP_SLCT:
+ emitSLCT(insn->asCmp());
+ break;
+ case OP_MIN:
+ case OP_MAX:
+ emitMINMAX(insn);
+ break;
+ case OP_ABS:
+ case OP_NEG:
+ case OP_CEIL:
+ case OP_FLOOR:
+ case OP_TRUNC:
+ case OP_CVT:
+ case OP_SAT:
+ emitCVT(insn);
+ break;
+ case OP_RSQ:
+ emitSFnOp(insn, 5);
+ break;
+ case OP_RCP:
+ emitSFnOp(insn, 4);
+ break;
+ case OP_LG2:
+ emitSFnOp(insn, 3);
+ break;
+ case OP_EX2:
+ emitSFnOp(insn, 2);
+ break;
+ case OP_SIN:
+ emitSFnOp(insn, 1);
+ break;
+ case OP_COS:
+ emitSFnOp(insn, 0);
+ break;
+ case OP_PRESIN:
+ case OP_PREEX2:
+ emitPreOp(insn);
+ break;
+ case OP_TEX:
+ case OP_TXB:
+ case OP_TXL:
+ case OP_TXD:
+ case OP_TXF:
+ emitTEX(insn->asTex());
+ break;
+ case OP_TXQ:
+ emitTXQ(insn->asTex());
+ break;
+ case OP_TEXBAR:
+ emitTEXBAR(insn);
+ break;
+ case OP_SUBFM:
+ case OP_SUCLAMP:
+ case OP_SUEAU:
+ emitSUCalc(insn);
+ break;
+ case OP_MADSP:
+ emitMADSP(insn);
+ break;
+ case OP_SULDB:
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ emitSULDGB(insn->asTex());
+ else
+ ERROR("SULDB not yet supported on < nve4\n");
+ break;
+ case OP_SUSTB:
+ case OP_SUSTP:
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ emitSUSTGx(insn->asTex());
+ else
+ ERROR("SUSTx not yet supported on < nve4\n");
+ break;
+ case OP_ATOM:
+ emitATOM(insn);
+ break;
+ case OP_BRA:
+ case OP_CALL:
+ case OP_PRERET:
+ case OP_RET:
+ case OP_DISCARD:
+ case OP_EXIT:
+ case OP_PRECONT:
+ case OP_CONT:
+ case OP_PREBREAK:
+ case OP_BREAK:
+ case OP_JOINAT:
+ case OP_BRKPT:
+ case OP_QUADON:
+ case OP_QUADPOP:
+ emitFlow(insn);
+ break;
+ case OP_QUADOP:
+ emitQUADOP(insn, insn->subOp, insn->lanes);
+ break;
+ case OP_DFDX:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4);
+ break;
+ case OP_DFDY:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x5a : 0xa5, 0x5);
+ break;
+ case OP_POPCNT:
+ emitPOPC(insn);
+ break;
+ case OP_INSBF:
+ emitINSBF(insn);
+ break;
+ case OP_EXTBF:
+ emitEXTBF(insn);
+ break;
+ case OP_PERMT:
+ emitPERMT(insn);
+ break;
+ case OP_JOIN:
+ emitNOP(insn);
+ insn->join = 1;
+ break;
+ case OP_BAR:
+ emitBAR(insn);
+ break;
+ case OP_MEMBAR:
+ emitMEMBAR(insn);
+ break;
+ case OP_CCTL:
+ emitCCTL(insn);
+ break;
+ case OP_VSHL:
+ emitVSHL(insn);
+ break;
+ case OP_PHI:
+ case OP_UNION:
+ case OP_CONSTRAINT:
+ ERROR("operation should have been eliminated\n");
+ return false;
+ case OP_EXP:
+ case OP_LOG:
+ case OP_SQRT:
+ case OP_POW:
+ ERROR("operation should have been lowered\n");
+ return false;
+ default:
+ ERROR("unknown op\n");
+ return false;
+ }
+
+ // join flag only exists in the 8-byte encoding
+ if (insn->join) {
+ code[0] |= 0x10;
+ assert(insn->encSize == 8);
+ }
+
+ code += insn->encSize / 4;
+ codeSize += insn->encSize;
+ return true;
+}
+
+// Return the minimum encoding size (4 or 8 bytes) for an instruction.
+// Currently hardwired to 8 (note the "|| 1"); the remaining logic for
+// choosing the 4-byte short form is kept but effectively dead.
+uint32_t
+CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
+{
+ const Target::OpInfo &info = targ->getOpInfo(i);
+
+ // short encodings are disabled for now ("|| 1")
+ if (writeIssueDelays || info.minEncSize == 8 || 1)
+ return 8;
+
+ if (i->ftz || i->saturate || i->join)
+ return 8;
+ if (i->rnd != ROUND_N)
+ return 8;
+ if (i->predSrc >= 0 && i->op == OP_MAD)
+ return 8;
+
+ if (i->op == OP_PINTERP) {
+ if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work
+ return 8;
+ } else
+ if (i->op == OP_MOV && i->lanes != 0xf) {
+ return 8;
+ }
+
+ for (int s = 0; i->srcExists(s); ++s) {
+ if (i->src(s).isIndirect(0))
+ return 8;
+
+ if (i->src(s).getFile() == FILE_MEMORY_CONST) {
+ if (SDATA(i->src(s)).offset >= 0x100)
+ return 8;
+ if (i->getSrc(s)->reg.fileIndex > 1 &&
+ i->getSrc(s)->reg.fileIndex != 16)
+ return 8;
+ } else
+ if (i->src(s).getFile() == FILE_IMMEDIATE) {
+ if (i->dType == TYPE_F32) {
+ if (SDATA(i->src(s)).u32 >= 0x100)
+ return 8;
+ } else {
+ if (SDATA(i->src(s)).u32 > 0xff)
+ return 8;
+ }
+ }
+
+ if (i->op == OP_CVT)
+ continue;
+ if (i->src(s).mod != Modifier(0)) {
+ if (i->src(s).mod == Modifier(NV50_IR_MOD_ABS))
+ if (i->op != OP_RSQ)
+ return 8;
+ if (i->src(s).mod == Modifier(NV50_IR_MOD_NEG))
+ if (i->op != OP_ADD || s != 0)
+ return 8;
+ }
+ }
+
+ return 4;
+}
+
+// Simplified, erring on safe side.
+// Computes per-instruction issue-delay (sched) bytes for GK104+ by
+// tracking, per basic block, the cycle at which each register/predicate/
+// flag becomes readable/writable and when shared execution resources
+// (SFU, integer multiplier, TEX, LD/ST per memory file) free up.
+class SchedDataCalculator : public Pass
+{
+public:
+ SchedDataCalculator(const Target *targ) : targ(targ) { }
+
+private:
+ // Scoreboard: "ready at cycle N" values for registers and resources.
+ struct RegScores
+ {
+ struct Resource {
+ // NOTE(review): the two comments below look swapped (st[] described
+ // as LD-to-LD, ld[] as ST-to-ST) — verify against commitInsn(),
+ // which writes ld[] on loads and st[] on stores.
+ int st[DATA_FILE_COUNT]; // LD to LD delay 3
+ int ld[DATA_FILE_COUNT]; // ST to ST delay 3
+ int tex; // TEX to non-TEX delay 17 (0x11)
+ int sfu; // SFU to SFU delay 3 (except PRE-ops)
+ int imul; // integer MUL to MUL delay 3
+ } res;
+ // per-register ready cycles: 64 GPRs, 8 predicates, condition code
+ struct ScoreData {
+ int r[64];
+ int p[8];
+ int c;
+ } rd, wr;
+ int base;
+
+ // shift all scores so they are relative to a new base cycle
+ void rebase(const int base)
+ {
+ const int delta = this->base - base;
+ if (!delta)
+ return;
+ this->base = 0;
+
+ for (int i = 0; i < 64; ++i) {
+ rd.r[i] += delta;
+ wr.r[i] += delta;
+ }
+ for (int i = 0; i < 8; ++i) {
+ rd.p[i] += delta;
+ wr.p[i] += delta;
+ }
+ rd.c += delta;
+ wr.c += delta;
+
+ for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
+ res.ld[f] += delta;
+ res.st[f] += delta;
+ }
+ res.sfu += delta;
+ res.imul += delta;
+ res.tex += delta;
+ }
+ // reset all scores to zero
+ void wipe()
+ {
+ memset(&rd, 0, sizeof(rd));
+ memset(&wr, 0, sizeof(wr));
+ memset(&res, 0, sizeof(res));
+ }
+ int getLatest(const ScoreData& d) const
+ {
+ int max = 0;
+ for (int i = 0; i < 64; ++i)
+ if (d.r[i] > max)
+ max = d.r[i];
+ for (int i = 0; i < 8; ++i)
+ if (d.p[i] > max)
+ max = d.p[i];
+ if (d.c > max)
+ max = d.c;
+ return max;
+ }
+ inline int getLatestRd() const
+ {
+ return getLatest(rd);
+ }
+ inline int getLatestWr() const
+ {
+ return getLatest(wr);
+ }
+ // latest cycle at which anything (register or resource) becomes ready
+ inline int getLatest() const
+ {
+ const int a = getLatestRd();
+ const int b = getLatestWr();
+
+ int max = MAX2(a, b);
+ for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
+ max = MAX2(res.ld[f], max);
+ max = MAX2(res.st[f], max);
+ }
+ max = MAX2(res.sfu, max);
+ max = MAX2(res.imul, max);
+ max = MAX2(res.tex, max);
+ return max;
+ }
+ // element-wise maximum: merge a predecessor block's scoreboard
+ void setMax(const RegScores *that)
+ {
+ for (int i = 0; i < 64; ++i) {
+ rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
+ wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
+ }
+ for (int i = 0; i < 8; ++i) {
+ rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
+ wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
+ }
+ rd.c = MAX2(rd.c, that->rd.c);
+ wr.c = MAX2(wr.c, that->wr.c);
+
+ for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
+ res.ld[f] = MAX2(res.ld[f], that->res.ld[f]);
+ res.st[f] = MAX2(res.st[f], that->res.st[f]);
+ }
+ res.sfu = MAX2(res.sfu, that->res.sfu);
+ res.imul = MAX2(res.imul, that->res.imul);
+ res.tex = MAX2(res.tex, that->res.tex);
+ }
+ // debug: print all scores that are still in the future at @cycle
+ void print(int cycle)
+ {
+ for (int i = 0; i < 64; ++i) {
+ if (rd.r[i] > cycle)
+ INFO("rd $r%i @ %i\n", i, rd.r[i]);
+ if (wr.r[i] > cycle)
+ INFO("wr $r%i @ %i\n", i, wr.r[i]);
+ }
+ for (int i = 0; i < 8; ++i) {
+ if (rd.p[i] > cycle)
+ INFO("rd $p%i @ %i\n", i, rd.p[i]);
+ if (wr.p[i] > cycle)
+ INFO("wr $p%i @ %i\n", i, wr.p[i]);
+ }
+ if (rd.c > cycle)
+ INFO("rd $c @ %i\n", rd.c);
+ if (wr.c > cycle)
+ INFO("wr $c @ %i\n", wr.c);
+ if (res.sfu > cycle)
+ INFO("sfu @ %i\n", res.sfu);
+ if (res.imul > cycle)
+ INFO("imul @ %i\n", res.imul);
+ if (res.tex > cycle)
+ INFO("tex @ %i\n", res.tex);
+ }
+ };
+
+ RegScores *score; // for current BB
+ std::vector<RegScores> scoreBoards; // one scoreboard per BB
+ int cycle;
+ int prevData; // sched byte of the previous instruction
+ operation prevOp; // op of the previous instruction
+
+ const Target *targ;
+
+ bool visit(Function *);
+ bool visit(BasicBlock *);
+
+ void commitInsn(const Instruction *, int cycle);
+ int calcDelay(const Instruction *, int cycle) const;
+ void setDelay(Instruction *, int delay, Instruction *next);
+
+ void recordRd(const Value *, const int ready);
+ void recordWr(const Value *, const int ready);
+ void checkRd(const Value *, int cycle, int& delay) const;
+ void checkWr(const Value *, int cycle, int& delay) const;
+
+ int getCycles(const Instruction *, int origDelay) const;
+};
+
+// Assign the sched byte of @insn given the computed @delay and the
+// following instruction (NULL at the end of a BB). Special encodings:
+// 0xc2 for TEXBAR, 0x00 for joins, 0x04 for dual-issue with @next.
+void
+SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next)
+{
+ // make sure all outstanding work finishes before leaving the shader
+ if (insn->op == OP_EXIT || insn->op == OP_RET)
+ delay = MAX2(delay, 14);
+
+ if (insn->op == OP_TEXBAR) {
+ // TODO: except if results not used before EXIT
+ insn->sched = 0xc2;
+ } else
+ if (insn->op == OP_JOIN || insn->join) {
+ insn->sched = 0x00;
+ } else
+ if (delay >= 0 || prevData == 0x04 ||
+ !next || !targ->canDualIssue(insn, next)) {
+ insn->sched = static_cast<uint8_t>(MAX2(delay, 0));
+ if (prevOp == OP_EXPORT)
+ insn->sched |= 0x40;
+ else
+ insn->sched |= 0x20;
+ } else {
+ insn->sched = 0x04; // dual-issue
+ }
+
+ // remember the last non-dual-issued op for the EXPORT special case
+ if (prevData != 0x04 || prevOp != OP_EXPORT)
+ if (insn->sched != 0x04 || insn->op == OP_EXPORT)
+ prevOp = insn->op;
+
+ prevData = insn->sched;
+}
+
+// Convert an already-assigned sched byte back into an issue-cycle count.
+int
+SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
+{
+ if (insn->sched & 0x80) {
+ int c = (insn->sched & 0x0f) * 2 + 1;
+ if (insn->op == OP_TEXBAR && origDelay > 0)
+ c += origDelay;
+ return c;
+ }
+ if (insn->sched & 0x60)
+ return (insn->sched & 0x1f) + 1;
+ // 0x04 = dual-issued (free); anything else is the conservative maximum
+ return (insn->sched == 0x04) ? 0 : 32;
+}
+
+// Allocate and zero one scoreboard per basic block of the function.
+bool
+SchedDataCalculator::visit(Function *func)
+{
+ scoreBoards.resize(func->cfg.getSize());
+ for (size_t i = 0; i < scoreBoards.size(); ++i)
+ scoreBoards[i].wipe();
+ return true;
+}
+
+// Compute sched bytes for one basic block: seed the scoreboard from all
+// forward predecessors, walk the instructions pairing each with its
+// successor, then pick the last instruction's delay from the entry
+// instructions of the outgoing blocks.
+bool
+SchedDataCalculator::visit(BasicBlock *bb)
+{
+ Instruction *insn;
+ Instruction *next = NULL;
+
+ int cycle = 0;
+
+ prevData = 0x00;
+ prevOp = OP_NOP;
+ score = &scoreBoards.at(bb->getId());
+
+ // merge predecessor scoreboards (element-wise max)
+ for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+ // back branches will wait until all target dependencies are satisfied
+ if (ei.getType() == Graph::Edge::BACK) // sched would be uninitialized
+ continue;
+ BasicBlock *in = BasicBlock::get(ei.getNode());
+ if (in->getExit()) {
+ if (prevData != 0x04)
+ prevData = in->getExit()->sched;
+ prevOp = in->getExit()->op;
+ }
+ score->setMax(&scoreBoards.at(in->getId()));
+ }
+ // with multiple predecessors the previous op is ambiguous
+ if (bb->cfg.incidentCount() > 1)
+ prevOp = OP_NOP;
+
+#ifdef NVC0_DEBUG_SCHED_DATA
+ INFO("=== BB:%i initial scores\n", bb->getId());
+ score->print(cycle);
+#endif
+
+ // each instruction's delay depends on the *next* one's dependencies
+ for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
+ next = insn->next;
+
+ commitInsn(insn, cycle);
+ int delay = calcDelay(next, cycle);
+ setDelay(insn, delay, next);
+ cycle += getCycles(insn, delay);
+
+#ifdef NVC0_DEBUG_SCHED_DATA
+ INFO("cycle %i, sched %02x\n", cycle, insn->sched);
+ insn->print();
+ next->print();
+#endif
+ }
+ if (!insn)
+ return true;
+ commitInsn(insn, cycle);
+
+ // delay of the block's last instruction: worst case over successors
+ int bbDelay = -1;
+
+ for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+ BasicBlock *out = BasicBlock::get(ei.getNode());
+
+ if (ei.getType() != Graph::Edge::BACK) {
+ // only test the first instruction of the outgoing block
+ next = out->getEntry();
+ if (next)
+ bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
+ } else {
+ // wait until all dependencies are satisfied
+ const int regsFree = score->getLatest();
+ next = out->getFirst();
+ for (int c = cycle; next && c < regsFree; next = next->next) {
+ bbDelay = MAX2(bbDelay, calcDelay(next, c));
+ c += getCycles(next, bbDelay);
+ }
+ next = NULL;
+ }
+ }
+ // dual-issue across block boundaries only with a single successor
+ if (bb->cfg.outgoingCount() != 1)
+ next = NULL;
+ setDelay(insn, bbDelay, next);
+ cycle += getCycles(insn, bbDelay);
+
+ score->rebase(cycle); // common base for initializing out blocks' scores
+ return true;
+}
+
+#define NVE4_MAX_ISSUE_DELAY 0x1f
+// Compute how many cycles @insn must wait at @cycle before issuing:
+// the later of its source operands becoming readable and the execution
+// resource it uses (SFU / IMUL / TEX / LD / ST) becoming free.
+int
+SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const
+{
+ int delay = 0, ready = cycle;
+
+ for (int s = 0; insn->srcExists(s); ++s)
+ checkRd(insn->getSrc(s), cycle, delay);
+ // WAR & WAW don't seem to matter
+ // for (int s = 0; insn->srcExists(s); ++s)
+ // recordRd(insn->getSrc(s), cycle);
+
+ switch (Target::getOpClass(insn->op)) {
+ case OPCLASS_SFU:
+ ready = score->res.sfu;
+ break;
+ case OPCLASS_ARITH:
+ if (insn->op == OP_MUL && !isFloatType(insn->dType))
+ ready = score->res.imul;
+ break;
+ case OPCLASS_TEXTURE:
+ ready = score->res.tex;
+ break;
+ case OPCLASS_LOAD:
+ ready = score->res.ld[insn->src(0).getFile()];
+ break;
+ case OPCLASS_STORE:
+ ready = score->res.st[insn->src(0).getFile()];
+ break;
+ default:
+ break;
+ }
+ // non-TEX ops additionally wait for outstanding texture results
+ if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE)
+ ready = MAX2(ready, score->res.tex);
+
+ delay = MAX2(delay, ready - cycle);
+
+ // if can issue next cycle, delay is 0, not 1
+ return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY);
+}
+
+// Record the effects of issuing @insn at @cycle into the scoreboard:
+// when its results become readable and when each unit is free again.
+// The latency constants (4, 18, ...) are NVE4-specific issue intervals.
+void
+SchedDataCalculator::commitInsn(const Instruction *insn, int cycle)
+{
+   // cycle at which this instruction's results are available
+   const int ready = cycle + targ->getLatency(insn);
+
+   for (int d = 0; insn->defExists(d); ++d)
+      recordWr(insn->getDef(d), ready);
+   // WAR & WAW don't seem to matter
+   // for (int s = 0; insn->srcExists(s); ++s)
+   //    recordRd(insn->getSrc(s), cycle);
+
+   switch (Target::getOpClass(insn->op)) {
+   case OPCLASS_SFU:
+      score->res.sfu = cycle + 4;
+      break;
+   case OPCLASS_ARITH:
+      if (insn->op == OP_MUL && !isFloatType(insn->dType))
+         score->res.imul = cycle + 4;
+      break;
+   case OPCLASS_TEXTURE:
+      score->res.tex = cycle + 18;
+      break;
+   case OPCLASS_LOAD:
+      // constant-buffer loads don't occupy the LD/ST path
+      if (insn->src(0).getFile() == FILE_MEMORY_CONST)
+         break;
+      score->res.ld[insn->src(0).getFile()] = cycle + 4;
+      // a store to the same file must wait for this load to complete
+      score->res.st[insn->src(0).getFile()] = ready;
+      break;
+   case OPCLASS_STORE:
+      score->res.st[insn->src(0).getFile()] = cycle + 4;
+      // a load from the same file must wait for this store to complete
+      score->res.ld[insn->src(0).getFile()] = ready;
+      break;
+   case OPCLASS_OTHER:
+      // TEXBAR synchronizes with outstanding texture fetches
+      if (insn->op == OP_TEXBAR)
+         score->res.tex = cycle;
+      break;
+   default:
+      break;
+   }
+
+#ifdef NVC0_DEBUG_SCHED_DATA
+   score->print(cycle);
+#endif
+}
+
+// Raise @delay as needed so that a read of value @v at @cycle happens
+// only after its producing write has landed (RAW hazard check).
+void
+SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
+{
+   int ready = cycle;
+   int a, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      // wide values occupy size/4 consecutive 32-bit registers
+      a = v->reg.data.id;
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->rd.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->rd.p[v->reg.data.id]);
+      break;
+   case FILE_FLAGS:
+      ready = MAX2(ready, score->rd.c);
+      break;
+   case FILE_SHADER_INPUT:
+   case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
+   case FILE_MEMORY_LOCAL:
+   case FILE_MEMORY_CONST:
+   case FILE_MEMORY_SHARED:
+   case FILE_MEMORY_GLOBAL:
+   case FILE_SYSTEM_VALUE:
+      // TODO: any restrictions here ?
+      break;
+   case FILE_IMMEDIATE:
+      break;
+   default:
+      assert(0);
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+// Raise @delay as needed so that a write to value @v at @cycle happens
+// only after pending reads of it have issued (WAR hazard check).
+// NOTE(review): not called in this chunk — WAR/WAW are commented out as
+// "don't seem to matter" in calcDelay/commitInsn; kept for completeness.
+void
+SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
+{
+   int ready = cycle;
+   int a, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      a = v->reg.data.id;
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->wr.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->wr.p[v->reg.data.id]);
+      break;
+   default:
+      assert(v->reg.file == FILE_FLAGS);
+      ready = MAX2(ready, score->wr.c);
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+// Record a write of @v completing at @ready: updates the *read* scoreboard
+// (score->rd), i.e. the earliest cycle a consumer may read this value.
+void
+SchedDataCalculator::recordWr(const Value *v, const int ready)
+{
+   int a = v->reg.data.id;
+
+   if (v->reg.file == FILE_GPR) {
+      int b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->rd.r[r] = ready;
+   } else
+   // $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
+   if (v->reg.file == FILE_PREDICATE) {
+      score->rd.p[a] = ready + 4;
+   } else {
+      assert(v->reg.file == FILE_FLAGS);
+      score->rd.c = ready + 4;
+   }
+}
+
+// Record a read of @v issuing at @ready: updates the *write* scoreboard
+// (score->wr), i.e. the earliest cycle a later write may overwrite it.
+// Currently unused (WAR/WAW tracking is disabled), but kept symmetric
+// with recordWr.
+void
+SchedDataCalculator::recordRd(const Value *v, const int ready)
+{
+   int a = v->reg.data.id;
+
+   if (v->reg.file == FILE_GPR) {
+      int b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->wr.r[r] = ready;
+   } else
+   if (v->reg.file == FILE_PREDICATE) {
+      score->wr.p[a] = ready;
+   } else
+   if (v->reg.file == FILE_FLAGS) {
+      score->wr.c = ready;
+   }
+}
+
+// Entry point: run the scoreboard scheduling pass over @func, filling in
+// the per-instruction issue-delay (sched) fields for NVE4-class targets.
+bool
+calculateSchedDataNVC0(const Target *targ, Function *func)
+{
+   SchedDataCalculator calc(targ);
+   return calc.run(func, true, true);
+}
+
+// Hook run before emission: after the generic preparation, compute
+// software scheduling data when the target requires it (NVE4+).
+void
+CodeEmitterNVC0::prepareEmission(Function *func)
+{
+   CodeEmitter::prepareEmission(func);
+
+   if (targ->hasSWSched)
+      calculateSchedDataNVC0(targ, func);
+}
+
+// Construct an emitter for the given NVC0-family target. The code buffer
+// is attached later (code/codeSize start out empty); issue-delay words
+// are only emitted on targets with software scheduling (NVE4+).
+CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
+   : CodeEmitter(target),
+     targNVC0(target),
+     writeIssueDelays(target->hasSWSched)
+{
+   code = NULL;
+   codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
+}
+
+// Create a fresh NVC0 code emitter bound to this target and tag it with
+// the program type. Caller owns the returned emitter.
+CodeEmitter *
+TargetNVC0::createCodeEmitterNVC0(Program::Type type)
+{
+   CodeEmitterNVC0 *const emitter = new CodeEmitterNVC0(this);
+   emitter->setProgramType(type);
+   return emitter;
+}
+
+// Select the emitter implementation by chipset: GK110 and newer use a
+// different encoding; everything else in the NVC0 family shares one.
+CodeEmitter *
+TargetNVC0::getCodeEmitter(Program::Type type)
+{
+   if (chipset >= NVISA_GK110_CHIPSET)
+      return createCodeEmitterGK110(type);
+   return createCodeEmitterNVC0(type);
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
new file mode 100644
index 00000000000..3193ea668a3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -0,0 +1,2852 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+extern "C" {
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_util.h"
+}
+
+#include <set>
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_util.h"
+#include "codegen/nv50_ir_build_util.h"
+
+namespace tgsi {
+
+class Source;
+
+static nv50_ir::operation translateOpcode(uint opcode);
+static nv50_ir::DataFile translateFile(uint file);
+static nv50_ir::TexTarget translateTexture(uint texTarg);
+static nv50_ir::SVSemantic translateSysVal(uint sysval);
+
+// Thin wrapper around tgsi_full_instruction providing typed accessors for
+// sources, destinations, texture info and nv50_ir translations.
+class Instruction
+{
+public:
+   Instruction(const struct tgsi_full_instruction *inst) : insn(inst) { }
+
+   // Wrapper around a TGSI source register; @fsr is NULL when constructed
+   // from a bare tgsi_src_register (no indirect/dimension info available).
+   class SrcRegister
+   {
+   public:
+      SrcRegister(const struct tgsi_full_src_register *src)
+         : reg(src->Register),
+           fsr(src)
+      { }
+
+      SrcRegister(const struct tgsi_src_register& src) : reg(src), fsr(NULL) { }
+
+      SrcRegister(const struct tgsi_ind_register& ind)
+         : reg(tgsi_util_get_src_from_ind(&ind)),
+           fsr(NULL)
+      { }
+
+      // Convert a texture offset descriptor into an equivalent src register.
+      struct tgsi_src_register offsetToSrc(struct tgsi_texture_offset off)
+      {
+         struct tgsi_src_register reg;
+         memset(&reg, 0, sizeof(reg));
+         reg.Index = off.Index;
+         reg.File = off.File;
+         reg.SwizzleX = off.SwizzleX;
+         reg.SwizzleY = off.SwizzleY;
+         reg.SwizzleZ = off.SwizzleZ;
+         return reg;
+      }
+
+      SrcRegister(const struct tgsi_texture_offset& off) :
+         reg(offsetToSrc(off)),
+         fsr(NULL)
+      { }
+
+      uint getFile() const { return reg.File; }
+
+      bool is2D() const { return reg.Dimension; }
+
+      // @dim selects the 2nd dimension (1) vs. the register itself (0).
+      bool isIndirect(int dim) const
+      {
+         return (dim && fsr) ? fsr->Dimension.Indirect : reg.Indirect;
+      }
+
+      int getIndex(int dim) const
+      {
+         return (dim && fsr) ? fsr->Dimension.Index : reg.Index;
+      }
+
+      int getSwizzle(int chan) const
+      {
+         return tgsi_util_get_src_register_swizzle(&reg, chan);
+      }
+
+      nv50_ir::Modifier getMod(int chan) const;
+
+      SrcRegister getIndirect(int dim) const
+      {
+         assert(fsr && isIndirect(dim));
+         if (dim)
+            return SrcRegister(fsr->DimIndirect);
+         return SrcRegister(fsr->Indirect);
+      }
+
+      // Fetch channel @c of an immediate operand as raw 32-bit data.
+      uint32_t getValueU32(int c, const struct nv50_ir_prog_info *info) const
+      {
+         assert(reg.File == TGSI_FILE_IMMEDIATE);
+         assert(!reg.Absolute);
+         assert(!reg.Negate);
+         return info->immd.data[reg.Index * 4 + getSwizzle(c)];
+      }
+
+   private:
+      const struct tgsi_src_register reg;
+      const struct tgsi_full_src_register *fsr;
+   };
+
+   // Wrapper around a TGSI destination register, mirroring SrcRegister.
+   class DstRegister
+   {
+   public:
+      DstRegister(const struct tgsi_full_dst_register *dst)
+         : reg(dst->Register),
+           fdr(dst)
+      { }
+
+      DstRegister(const struct tgsi_dst_register& dst) : reg(dst), fdr(NULL) { }
+
+      uint getFile() const { return reg.File; }
+
+      bool is2D() const { return reg.Dimension; }
+
+      bool isIndirect(int dim) const
+      {
+         return (dim && fdr) ? fdr->Dimension.Indirect : reg.Indirect;
+      }
+
+      int getIndex(int dim) const
+      {
+         // was fdr->Dimension.Dimension, which is the 1-bit "another
+         // dimension follows" flag, not an index; use Index to match
+         // SrcRegister::getIndex
+         return (dim && fdr) ? fdr->Dimension.Index : reg.Index;
+      }
+
+      unsigned int getMask() const { return reg.WriteMask; }
+
+      bool isMasked(int chan) const { return !(getMask() & (1 << chan)); }
+
+      SrcRegister getIndirect(int dim) const
+      {
+         assert(fdr && isIndirect(dim));
+         if (dim)
+            return SrcRegister(fdr->DimIndirect);
+         return SrcRegister(fdr->Indirect);
+      }
+
+   private:
+      const struct tgsi_dst_register reg;
+      const struct tgsi_full_dst_register *fdr;
+   };
+
+   inline uint getOpcode() const { return insn->Instruction.Opcode; }
+
+   unsigned int srcCount() const { return insn->Instruction.NumSrcRegs; }
+   unsigned int dstCount() const { return insn->Instruction.NumDstRegs; }
+
+   // mask of used components of source s
+   unsigned int srcMask(unsigned int s) const;
+
+   SrcRegister getSrc(unsigned int s) const
+   {
+      assert(s < srcCount());
+      return SrcRegister(&insn->Src[s]);
+   }
+
+   DstRegister getDst(unsigned int d) const
+   {
+      assert(d < dstCount());
+      return DstRegister(&insn->Dst[d]);
+   }
+
+   SrcRegister getTexOffset(unsigned int i) const
+   {
+      assert(i < TGSI_FULL_MAX_TEX_OFFSETS);
+      return SrcRegister(insn->TexOffsets[i]);
+   }
+
+   unsigned int getNumTexOffsets() const { return insn->Texture.NumOffsets; }
+
+   // true if Dst[0] overlaps any source register (same file and index)
+   bool checkDstSrcAliasing() const;
+
+   inline nv50_ir::operation getOP() const {
+      return translateOpcode(getOpcode()); }
+
+   nv50_ir::DataType inferSrcType() const;
+   nv50_ir::DataType inferDstType() const;
+
+   nv50_ir::CondCode getSetCond() const;
+
+   nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const;
+
+   inline uint getLabel() { return insn->Label.Label; }
+
+   unsigned getSaturate() const { return insn->Instruction.Saturate; }
+
+   void print() const
+   {
+      tgsi_dump_instruction(insn, 1);
+   }
+
+private:
+   const struct tgsi_full_instruction *insn;
+};
+
+// Return the mask of components actually read from source @s, derived
+// from the opcode's semantics and Dst[0]'s write mask. The per-opcode
+// constants mirror the TGSI instruction definitions.
+unsigned int Instruction::srcMask(unsigned int s) const
+{
+   unsigned int mask = insn->Dst[0].Register.WriteMask;
+
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
+      // scalar op replicated; .w handled separately from .xyz
+      return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+   case TGSI_OPCODE_DP2:
+      return 0x3;
+   case TGSI_OPCODE_DP3:
+      return 0x7;
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_KILL_IF: /* WriteMask ignored */
+      return 0xf;
+   case TGSI_OPCODE_DST:
+      // src0 provides .yz, src1 provides .yw
+      return mask & (s ? 0xa : 0x6);
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SCS:
+      return 0x1;
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_UIF:
+      return 0x1;
+   case TGSI_OPCODE_LIT:
+      return 0xb;
+   case TGSI_OPCODE_TEX2:
+   case TGSI_OPCODE_TXB2:
+   case TGSI_OPCODE_TXL2:
+      // second source carries the extra coords/lod in .xy
+      return (s == 0) ? 0xf : 0x3;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+   {
+      const struct tgsi_instruction_texture *tex = &insn->Texture;
+
+      assert(insn->Instruction.Texture);
+
+      mask = 0x7;
+      if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
+          insn->Instruction.Opcode != TGSI_OPCODE_TXD)
+         mask |= 0x8; /* bias, lod or proj */
+
+      // trim to the coordinate count of the texture target
+      switch (tex->Texture) {
+      case TGSI_TEXTURE_1D:
+         mask &= 0x9;
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+         mask &= 0xd;
+         break;
+      case TGSI_TEXTURE_1D_ARRAY:
+      case TGSI_TEXTURE_2D:
+      case TGSI_TEXTURE_RECT:
+         mask &= 0xb;
+         break;
+      case TGSI_TEXTURE_CUBE_ARRAY:
+      case TGSI_TEXTURE_SHADOW2D_ARRAY:
+      case TGSI_TEXTURE_SHADOWCUBE:
+      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+         mask |= 0x8;
+         break;
+      default:
+         break;
+      }
+   }
+      return mask;
+   case TGSI_OPCODE_XPD:
+   {
+      // cross product: each result channel reads the other two channels
+      unsigned int x = 0;
+      if (mask & 1) x |= 0x6;
+      if (mask & 2) x |= 0x5;
+      if (mask & 4) x |= 0x3;
+      return x;
+   }
+   default:
+      break;
+   }
+
+   return mask;
+}
+
+// Translate the register's TGSI abs/negate flags into an nv50_ir Modifier.
+// NOTE(review): @chan is currently unused — TGSI modifiers apply to the
+// whole register, not per channel.
+nv50_ir::Modifier Instruction::SrcRegister::getMod(int chan) const
+{
+   nv50_ir::Modifier m(0);
+
+   if (reg.Absolute)
+      m = m | nv50_ir::Modifier(NV50_IR_MOD_ABS);
+   if (reg.Negate)
+      m = m | nv50_ir::Modifier(NV50_IR_MOD_NEG);
+   return m;
+}
+
+// Map a TGSI register file onto the corresponding nv50_ir DataFile.
+// Samplers have no register representation and map to FILE_NULL.
+static nv50_ir::DataFile translateFile(uint file)
+{
+   switch (file) {
+   case TGSI_FILE_CONSTANT:     return nv50_ir::FILE_MEMORY_CONST;
+   case TGSI_FILE_INPUT:        return nv50_ir::FILE_SHADER_INPUT;
+   case TGSI_FILE_OUTPUT:       return nv50_ir::FILE_SHADER_OUTPUT;
+   case TGSI_FILE_TEMPORARY:    return nv50_ir::FILE_GPR;
+   case TGSI_FILE_ADDRESS:      return nv50_ir::FILE_ADDRESS;
+   case TGSI_FILE_PREDICATE:    return nv50_ir::FILE_PREDICATE;
+   case TGSI_FILE_IMMEDIATE:    return nv50_ir::FILE_IMMEDIATE;
+   case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE;
+   case TGSI_FILE_RESOURCE:     return nv50_ir::FILE_MEMORY_GLOBAL;
+   case TGSI_FILE_SAMPLER:
+   case TGSI_FILE_NULL:
+   default:
+      return nv50_ir::FILE_NULL;
+   }
+}
+
+// Map a TGSI system-value semantic onto an nv50_ir SVSemantic.
+// Unknown semantics assert; the SV_CLOCK return is a dead fallback.
+static nv50_ir::SVSemantic translateSysVal(uint sysval)
+{
+   switch (sysval) {
+   case TGSI_SEMANTIC_FACE:       return nv50_ir::SV_FACE;
+   case TGSI_SEMANTIC_PSIZE:      return nv50_ir::SV_POINT_SIZE;
+   case TGSI_SEMANTIC_PRIMID:     return nv50_ir::SV_PRIMITIVE_ID;
+   case TGSI_SEMANTIC_INSTANCEID: return nv50_ir::SV_INSTANCE_ID;
+   case TGSI_SEMANTIC_VERTEXID:   return nv50_ir::SV_VERTEX_ID;
+   case TGSI_SEMANTIC_GRID_SIZE:  return nv50_ir::SV_NCTAID;
+   case TGSI_SEMANTIC_BLOCK_ID:   return nv50_ir::SV_CTAID;
+   case TGSI_SEMANTIC_BLOCK_SIZE: return nv50_ir::SV_NTID;
+   case TGSI_SEMANTIC_THREAD_ID:  return nv50_ir::SV_TID;
+   default:
+      assert(0);
+      return nv50_ir::SV_CLOCK;
+   }
+}
+
+// Expand one TGSI_TEXTURE_* -> nv50_ir TEX_TARGET_* mapping case.
+#define NV50_IR_TEX_TARG_CASE(a, b) \
+   case TGSI_TEXTURE_##a: return nv50_ir::TEX_TARGET_##b;
+
+// Map a TGSI texture target onto the nv50_ir texture target enum.
+static nv50_ir::TexTarget translateTexture(uint tex)
+{
+   switch (tex) {
+   NV50_IR_TEX_TARG_CASE(1D, 1D);
+   NV50_IR_TEX_TARG_CASE(2D, 2D);
+   NV50_IR_TEX_TARG_CASE(2D_MSAA, 2D_MS);
+   NV50_IR_TEX_TARG_CASE(3D, 3D);
+   NV50_IR_TEX_TARG_CASE(CUBE, CUBE);
+   NV50_IR_TEX_TARG_CASE(RECT, RECT);
+   NV50_IR_TEX_TARG_CASE(1D_ARRAY, 1D_ARRAY);
+   NV50_IR_TEX_TARG_CASE(2D_ARRAY, 2D_ARRAY);
+   NV50_IR_TEX_TARG_CASE(2D_ARRAY_MSAA, 2D_MS_ARRAY);
+   NV50_IR_TEX_TARG_CASE(CUBE_ARRAY, CUBE_ARRAY);
+   NV50_IR_TEX_TARG_CASE(SHADOW1D, 1D_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW2D, 2D_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWCUBE, CUBE_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW1D_ARRAY, 1D_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWCUBE_ARRAY, CUBE_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(BUFFER, BUFFER);
+
+   case TGSI_TEXTURE_UNKNOWN:
+   default:
+      assert(!"invalid texture target");
+      return nv50_ir::TEX_TARGET_2D;
+   }
+}
+
+// Infer the data type of this instruction's sources from its opcode:
+// unsigned, signed, or (the default) 32-bit float.
+nv50_ir::DataType Instruction::inferSrcType() const
+{
+   switch (getOpcode()) {
+   case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_UADD:
+   case TGSI_OPCODE_UDIV:
+   case TGSI_OPCODE_UMOD:
+   case TGSI_OPCODE_UMAD:
+   case TGSI_OPCODE_UMUL:
+   case TGSI_OPCODE_UMAX:
+   case TGSI_OPCODE_UMIN:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+   case TGSI_OPCODE_USHR:
+   case TGSI_OPCODE_UCMP:
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+      return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_IDIV:
+   case TGSI_OPCODE_IMAX:
+   case TGSI_OPCODE_IMIN:
+   case TGSI_OPCODE_IABS:
+   case TGSI_OPCODE_INEG:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_ISSG:
+   case TGSI_OPCODE_SAD: // not sure about SAD, but no one has a float version
+   case TGSI_OPCODE_MOD:
+   case TGSI_OPCODE_UARL:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      return nv50_ir::TYPE_S32;
+   default:
+      return nv50_ir::TYPE_F32;
+   }
+}
+
+// Infer the destination data type: conversions and float comparisons have
+// a dst type differing from their src type; otherwise they match.
+nv50_ir::DataType Instruction::inferDstType() const
+{
+   switch (getOpcode()) {
+   case TGSI_OPCODE_F2U: return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_F2I: return nv50_ir::TYPE_S32;
+   case TGSI_OPCODE_FSEQ:
+   case TGSI_OPCODE_FSGE:
+   case TGSI_OPCODE_FSLT:
+   case TGSI_OPCODE_FSNE:
+      // integer-result float comparisons produce 0 / ~0
+      return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_U2F:
+      return nv50_ir::TYPE_F32;
+   default:
+      return inferSrcType();
+   }
+}
+
+// Map a TGSI SET-style comparison opcode to its nv50_ir condition code.
+// Float SNE maps to CC_NEU (unordered-not-equal, NaN-aware); integer
+// USNE maps to plain CC_NE.
+nv50_ir::CondCode Instruction::getSetCond() const
+{
+   using namespace nv50_ir;
+
+   switch (getOpcode()) {
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_FSLT:
+      return CC_LT;
+   case TGSI_OPCODE_SLE:
+      return CC_LE;
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_FSGE:
+      return CC_GE;
+   case TGSI_OPCODE_SGT:
+      return CC_GT;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_FSEQ:
+      return CC_EQ;
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_FSNE:
+      return CC_NEU;
+   case TGSI_OPCODE_USNE:
+      return CC_NE;
+   case TGSI_OPCODE_SFL:
+      return CC_NEVER;
+   case TGSI_OPCODE_STR:
+   default:
+      return CC_ALWAYS;
+   }
+}
+
+#define NV50_IR_OPCODE_CASE(a, b) case TGSI_OPCODE_##a: return nv50_ir::OP_##b
+
+// Map a TGSI opcode to the base nv50_ir operation. Variants that share an
+// operation (e.g. all SET comparisons, all atomics) are distinguished later
+// via condition codes, data types, and sub-ops. Unknown opcodes map to NOP.
+static nv50_ir::operation translateOpcode(uint opcode)
+{
+   switch (opcode) {
+   NV50_IR_OPCODE_CASE(ARL, SHL);
+   NV50_IR_OPCODE_CASE(MOV, MOV);
+
+   NV50_IR_OPCODE_CASE(RCP, RCP);
+   NV50_IR_OPCODE_CASE(RSQ, RSQ);
+
+   NV50_IR_OPCODE_CASE(MUL, MUL);
+   NV50_IR_OPCODE_CASE(ADD, ADD);
+
+   NV50_IR_OPCODE_CASE(MIN, MIN);
+   NV50_IR_OPCODE_CASE(MAX, MAX);
+   NV50_IR_OPCODE_CASE(SLT, SET);
+   NV50_IR_OPCODE_CASE(SGE, SET);
+   NV50_IR_OPCODE_CASE(MAD, MAD);
+   NV50_IR_OPCODE_CASE(SUB, SUB);
+
+   NV50_IR_OPCODE_CASE(FLR, FLOOR);
+   NV50_IR_OPCODE_CASE(ROUND, CVT);
+   NV50_IR_OPCODE_CASE(EX2, EX2);
+   NV50_IR_OPCODE_CASE(LG2, LG2);
+   NV50_IR_OPCODE_CASE(POW, POW);
+
+   NV50_IR_OPCODE_CASE(ABS, ABS);
+
+   NV50_IR_OPCODE_CASE(COS, COS);
+   NV50_IR_OPCODE_CASE(DDX, DFDX);
+   NV50_IR_OPCODE_CASE(DDY, DFDY);
+   NV50_IR_OPCODE_CASE(KILL, DISCARD);
+
+   NV50_IR_OPCODE_CASE(SEQ, SET);
+   NV50_IR_OPCODE_CASE(SFL, SET);
+   NV50_IR_OPCODE_CASE(SGT, SET);
+   NV50_IR_OPCODE_CASE(SIN, SIN);
+   NV50_IR_OPCODE_CASE(SLE, SET);
+   NV50_IR_OPCODE_CASE(SNE, SET);
+   NV50_IR_OPCODE_CASE(STR, SET);
+   NV50_IR_OPCODE_CASE(TEX, TEX);
+   NV50_IR_OPCODE_CASE(TXD, TXD);
+   NV50_IR_OPCODE_CASE(TXP, TEX);
+
+   NV50_IR_OPCODE_CASE(BRA, BRA);
+   NV50_IR_OPCODE_CASE(CAL, CALL);
+   NV50_IR_OPCODE_CASE(RET, RET);
+   NV50_IR_OPCODE_CASE(CMP, SLCT);
+
+   NV50_IR_OPCODE_CASE(TXB, TXB);
+
+   NV50_IR_OPCODE_CASE(DIV, DIV);
+
+   NV50_IR_OPCODE_CASE(TXL, TXL);
+
+   NV50_IR_OPCODE_CASE(CEIL, CEIL);
+   NV50_IR_OPCODE_CASE(I2F, CVT);
+   NV50_IR_OPCODE_CASE(NOT, NOT);
+   NV50_IR_OPCODE_CASE(TRUNC, TRUNC);
+   NV50_IR_OPCODE_CASE(SHL, SHL);
+
+   NV50_IR_OPCODE_CASE(AND, AND);
+   NV50_IR_OPCODE_CASE(OR, OR);
+   NV50_IR_OPCODE_CASE(MOD, MOD);
+   NV50_IR_OPCODE_CASE(XOR, XOR);
+   NV50_IR_OPCODE_CASE(SAD, SAD);
+   NV50_IR_OPCODE_CASE(TXF, TXF);
+   NV50_IR_OPCODE_CASE(TXQ, TXQ);
+
+   NV50_IR_OPCODE_CASE(EMIT, EMIT);
+   NV50_IR_OPCODE_CASE(ENDPRIM, RESTART);
+
+   NV50_IR_OPCODE_CASE(KILL_IF, DISCARD);
+
+   NV50_IR_OPCODE_CASE(F2I, CVT);
+   NV50_IR_OPCODE_CASE(FSEQ, SET);
+   NV50_IR_OPCODE_CASE(FSGE, SET);
+   NV50_IR_OPCODE_CASE(FSLT, SET);
+   NV50_IR_OPCODE_CASE(FSNE, SET);
+   NV50_IR_OPCODE_CASE(IDIV, DIV);
+   NV50_IR_OPCODE_CASE(IMAX, MAX);
+   NV50_IR_OPCODE_CASE(IMIN, MIN);
+   NV50_IR_OPCODE_CASE(IABS, ABS);
+   NV50_IR_OPCODE_CASE(INEG, NEG);
+   NV50_IR_OPCODE_CASE(ISGE, SET);
+   NV50_IR_OPCODE_CASE(ISHR, SHR);
+   NV50_IR_OPCODE_CASE(ISLT, SET);
+   NV50_IR_OPCODE_CASE(F2U, CVT);
+   NV50_IR_OPCODE_CASE(U2F, CVT);
+   NV50_IR_OPCODE_CASE(UADD, ADD);
+   NV50_IR_OPCODE_CASE(UDIV, DIV);
+   NV50_IR_OPCODE_CASE(UMAD, MAD);
+   NV50_IR_OPCODE_CASE(UMAX, MAX);
+   NV50_IR_OPCODE_CASE(UMIN, MIN);
+   NV50_IR_OPCODE_CASE(UMOD, MOD);
+   NV50_IR_OPCODE_CASE(UMUL, MUL);
+   NV50_IR_OPCODE_CASE(USEQ, SET);
+   NV50_IR_OPCODE_CASE(USGE, SET);
+   NV50_IR_OPCODE_CASE(USHR, SHR);
+   NV50_IR_OPCODE_CASE(USLT, SET);
+   NV50_IR_OPCODE_CASE(USNE, SET);
+
+   NV50_IR_OPCODE_CASE(SAMPLE, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_B, TXB);
+   NV50_IR_OPCODE_CASE(SAMPLE_C, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_C_LZ, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_D, TXD);
+   NV50_IR_OPCODE_CASE(SAMPLE_L, TXL);
+   NV50_IR_OPCODE_CASE(SAMPLE_I, TXF);
+   NV50_IR_OPCODE_CASE(SAMPLE_I_MS, TXF);
+   NV50_IR_OPCODE_CASE(GATHER4, TXG);
+   NV50_IR_OPCODE_CASE(SVIEWINFO, TXQ);
+
+   NV50_IR_OPCODE_CASE(ATOMUADD, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMXCHG, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMCAS, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMAND, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMOR, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMXOR, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMUMIN, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMUMAX, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMIMIN, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMIMAX, ATOM);
+
+   NV50_IR_OPCODE_CASE(TEX2, TEX);
+   NV50_IR_OPCODE_CASE(TXB2, TXB);
+   NV50_IR_OPCODE_CASE(TXL2, TXL);
+
+   NV50_IR_OPCODE_CASE(END, EXIT);
+
+   default:
+      return nv50_ir::OP_NOP;
+   }
+}
+
+// Return the nv50_ir sub-op for opcodes whose base operation is shared
+// (memory barriers, atomics). Signed/unsigned min/max share a sub-op;
+// the data type (inferSrcType) disambiguates. Returns 0 when unused.
+static uint16_t opcodeToSubOp(uint opcode)
+{
+   switch (opcode) {
+   case TGSI_OPCODE_LFENCE:   return NV50_IR_SUBOP_MEMBAR(L, GL);
+   case TGSI_OPCODE_SFENCE:   return NV50_IR_SUBOP_MEMBAR(S, GL);
+   case TGSI_OPCODE_MFENCE:   return NV50_IR_SUBOP_MEMBAR(M, GL);
+   case TGSI_OPCODE_ATOMUADD: return NV50_IR_SUBOP_ATOM_ADD;
+   case TGSI_OPCODE_ATOMXCHG: return NV50_IR_SUBOP_ATOM_EXCH;
+   case TGSI_OPCODE_ATOMCAS:  return NV50_IR_SUBOP_ATOM_CAS;
+   case TGSI_OPCODE_ATOMAND:  return NV50_IR_SUBOP_ATOM_AND;
+   case TGSI_OPCODE_ATOMOR:   return NV50_IR_SUBOP_ATOM_OR;
+   case TGSI_OPCODE_ATOMXOR:  return NV50_IR_SUBOP_ATOM_XOR;
+   case TGSI_OPCODE_ATOMUMIN: return NV50_IR_SUBOP_ATOM_MIN;
+   case TGSI_OPCODE_ATOMIMIN: return NV50_IR_SUBOP_ATOM_MIN;
+   case TGSI_OPCODE_ATOMUMAX: return NV50_IR_SUBOP_ATOM_MAX;
+   case TGSI_OPCODE_ATOMIMAX: return NV50_IR_SUBOP_ATOM_MAX;
+   default:
+      return 0;
+   }
+}
+
+// Return true if Dst[0] names the same register as any source, in which
+// case the translator must buffer results to avoid overwriting inputs.
+bool Instruction::checkDstSrcAliasing() const
+{
+   if (insn->Dst[0].Register.Indirect) // no danger if indirect, using memory
+      return false;
+
+   for (int s = 0; s < TGSI_FULL_MAX_SRC_REGISTERS; ++s) {
+      // sources are packed; the first NULL file ends the list
+      if (insn->Src[s].Register.File == TGSI_FILE_NULL)
+         break;
+      if (insn->Src[s].Register.File == insn->Dst[0].Register.File &&
+          insn->Src[s].Register.Index == insn->Dst[0].Register.Index)
+         return true;
+   }
+   return false;
+}
+
+// Scans a TGSI token stream once, caching the instructions and filling in
+// nv50_ir_prog_info (I/O slots, system values, resources) for the
+// Converter pass that follows.
+class Source
+{
+public:
+   Source(struct nv50_ir_prog_info *);
+   ~Source();
+
+public:
+   bool scanSource();
+   // Number of used entries in a TGSI register file (highest index + 1).
+   unsigned fileSize(unsigned file) const { return scan.file_max[file] + 1; }
+
+public:
+   struct tgsi_shader_info scan;          // result of tgsi_scan_shader()
+   struct tgsi_full_instruction *insns;   // cached copy of all instructions
+   const struct tgsi_token *tokens;       // raw TGSI input
+   struct nv50_ir_prog_info *info;        // driver-visible program info (not owned)
+
+   nv50_ir::DynArray tempArrays;
+   nv50_ir::DynArray immdArrays;
+
+   typedef nv50_ir::BuildUtil::Location Location;
+   // these registers are per-subroutine, cannot be used for parameter passing
+   std::set<Location> locals;
+
+   // set when temporaries are indirectly addressed and must live in l[]
+   bool mainTempsInLMem;
+
+   // output index of gl_Position / CLIPVERTEX, -1 if none seen yet
+   int clipVertexOutput;
+
+   struct TextureView {
+      uint8_t target; // TGSI_TEXTURE_*
+   };
+   std::vector<TextureView> textureViews;
+
+   struct Resource {
+      uint8_t target; // TGSI_TEXTURE_*
+      bool raw;
+      uint8_t slot; // $surface index
+   };
+   std::vector<Resource> resources;
+
+private:
+   int inferSysValDirection(unsigned sn) const;
+   bool scanDeclaration(const struct tgsi_full_declaration *);
+   bool scanInstruction(const struct tgsi_full_instruction *);
+   void scanProperty(const struct tgsi_full_property *);
+   void scanImmediate(const struct tgsi_full_immediate *);
+
+   inline bool isEdgeFlagPassthrough(const Instruction&) const;
+};
+
+// Bind the scanner to the program info and its TGSI token stream,
+// optionally dumping the shader for debugging.
+Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
+{
+   tokens = (const struct tgsi_token *)info->bin.source;
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
+      tgsi_dump(tokens, 0);
+
+   mainTempsInLMem = FALSE;
+   // The destructor does "if (insns) FREE(insns)"; without this, destroying
+   // a Source whose scanSource() was never run would free an indeterminate
+   // pointer (undefined behavior).
+   insns = NULL;
+}
+
+// Release the cached instruction array and the immediate-data buffers
+// that scanSource() allocated into the (non-owned) prog info.
+Source::~Source()
+{
+   if (insns)
+      FREE(insns);
+
+   if (info->immd.data)
+      FREE(info->immd.data);
+   if (info->immd.type)
+      FREE(info->immd.type);
+}
+
+// Single pass over the TGSI tokens: runs tgsi_scan_shader, caches all
+// instructions, records declarations/properties/immediates, and finishes
+// by sizing generated clip-distance outputs and assigning I/O slots.
+// Returns false on allocation failure or failed slot assignment.
+bool Source::scanSource()
+{
+   unsigned insnCount = 0;
+   struct tgsi_parse_context parse;
+
+   tgsi_scan_shader(tokens, &scan);
+
+   insns = (struct tgsi_full_instruction *)MALLOC(scan.num_instructions *
+                                                  sizeof(insns[0]));
+   if (!insns)
+      return false;
+
+   clipVertexOutput = -1;
+
+   textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
+   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+
+   info->immd.bufSize = 0;
+
+   info->numInputs = scan.file_max[TGSI_FILE_INPUT] + 1;
+   info->numOutputs = scan.file_max[TGSI_FILE_OUTPUT] + 1;
+   info->numSysVals = scan.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
+
+   if (info->type == PIPE_SHADER_FRAGMENT) {
+      info->prop.fp.writesDepth = scan.writes_z;
+      info->prop.fp.usesDiscard = scan.uses_kill;
+   } else
+   if (info->type == PIPE_SHADER_GEOMETRY) {
+      info->prop.gp.instanceCount = 1; // default value
+   }
+
+   // NOTE(review): these two MALLOCs are unchecked; scanImmediate would
+   // write through NULL on OOM — consider checking like insns above.
+   info->immd.data = (uint32_t *)MALLOC(scan.immediate_count * 16);
+   info->immd.type = (ubyte *)MALLOC(scan.immediate_count * sizeof(ubyte));
+
+   tgsi_parse_init(&parse, tokens);
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         scanImmediate(&parse.FullToken.FullImmediate);
+         break;
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         scanDeclaration(&parse.FullToken.FullDeclaration);
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         insns[insnCount++] = parse.FullToken.FullInstruction;
+         scanInstruction(&parse.FullToken.FullInstruction);
+         break;
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         scanProperty(&parse.FullToken.FullProperty);
+         break;
+      default:
+         INFO("unknown TGSI token type: %d\n", parse.FullToken.Token.Type);
+         break;
+      }
+   }
+   tgsi_parse_free(&parse);
+
+   // indirectly addressed temporaries live in l[]: 16 bytes per vec4
+   if (mainTempsInLMem)
+      info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
+
+   if (info->io.genUserClip > 0) {
+      info->io.clipDistanceMask = (1 << info->io.genUserClip) - 1;
+
+      // append one CLIPDIST output per group of 4 generated clip planes
+      const unsigned int nOut = (info->io.genUserClip + 3) / 4;
+
+      for (unsigned int n = 0; n < nOut; ++n) {
+         unsigned int i = info->numOutputs++;
+         info->out[i].id = i;
+         info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
+         info->out[i].si = n;
+         info->out[i].mask = info->io.clipDistanceMask >> (n * 4);
+      }
+   }
+
+   return info->assignSlots(info) == 0;
+}
+
+// Record a TGSI shader property into the program info; properties with no
+// driver-side effect are deliberately ignored.
+void Source::scanProperty(const struct tgsi_full_property *prop)
+{
+   switch (prop->Property.PropertyName) {
+   case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+      info->prop.gp.outputPrim = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_GS_INPUT_PRIM:
+      info->prop.gp.inputPrim = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
+      info->prop.gp.maxVertices = prop->u[0].Data;
+      break;
+#if 0
+   case TGSI_PROPERTY_GS_INSTANCE_COUNT:
+      info->prop.gp.instanceCount = prop->u[0].Data;
+      break;
+#endif
+   case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+      info->prop.fp.separateFragData = TRUE;
+      break;
+   case TGSI_PROPERTY_FS_COORD_ORIGIN:
+   case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
+      // we don't care
+      break;
+   case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
+      // negative genUserClip = do not generate user clip-plane code
+      info->io.genUserClip = -1;
+      break;
+   default:
+      INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
+      break;
+   }
+}
+
+// Append one vec4 immediate (raw 32-bit data + declared type) to the
+// info->immd arrays sized from scan.immediate_count.
+void Source::scanImmediate(const struct tgsi_full_immediate *imm)
+{
+   const unsigned n = info->immd.count++;
+
+   assert(n < scan.immediate_count);
+
+   for (int c = 0; c < 4; ++c)
+      info->immd.data[n * 4 + c] = imm->u[c].Uint;
+
+   info->immd.type[n] = imm->Immediate.DataType;
+}
+
+// Return 1 if system value @sn is an input to this shader stage, 0 if it
+// is produced/forwarded (PRIMID is an input only in fragment shaders).
+int Source::inferSysValDirection(unsigned sn) const
+{
+   switch (sn) {
+   case TGSI_SEMANTIC_INSTANCEID:
+   case TGSI_SEMANTIC_VERTEXID:
+      return 1;
+#if 0
+   case TGSI_SEMANTIC_LAYER:
+   case TGSI_SEMANTIC_VIEWPORTINDEX:
+      return 0;
+#endif
+   case TGSI_SEMANTIC_PRIMID:
+      return (info->type == PIPE_SHADER_FRAGMENT) ? 1 : 0;
+   default:
+      return 0;
+   }
+}
+
+// Process one TGSI declaration: record semantics/interpolation for I/O
+// slots, mark per-subroutine locals, and capture resource/sampler-view
+// targets. Returns false only for an unhandled register file.
+bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
+{
+   unsigned i, c;
+   unsigned sn = TGSI_SEMANTIC_GENERIC;
+   unsigned si = 0;
+   const unsigned first = decl->Range.First, last = decl->Range.Last;
+
+   if (decl->Declaration.Semantic) {
+      sn = decl->Semantic.Name;
+      si = decl->Semantic.Index;
+   }
+
+   if (decl->Declaration.Local) {
+      // remember every (file, 2D-index, reg, chan) location as subroutine-local
+      for (i = first; i <= last; ++i) {
+         for (c = 0; c < 4; ++c) {
+            locals.insert(
+               Location(decl->Declaration.File, decl->Dim.Index2D, i, c));
+         }
+      }
+   }
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_INPUT:
+      if (info->type == PIPE_SHADER_VERTEX) {
+         // all vertex attributes are equal
+         for (i = first; i <= last; ++i) {
+            info->in[i].sn = TGSI_SEMANTIC_GENERIC;
+            info->in[i].si = i;
+         }
+      } else {
+         for (i = first; i <= last; ++i, ++si) {
+            info->in[i].id = i;
+            info->in[i].sn = sn;
+            info->in[i].si = si;
+            if (info->type == PIPE_SHADER_FRAGMENT) {
+               // translate interpolation mode
+               switch (decl->Interp.Interpolate) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  info->in[i].flat = 1;
+                  break;
+               case TGSI_INTERPOLATE_COLOR:
+                  info->in[i].sc = 1;
+                  break;
+               case TGSI_INTERPOLATE_LINEAR:
+                  info->in[i].linear = 1;
+                  break;
+               default:
+                  break;
+               }
+               if (decl->Interp.Centroid)
+                  info->in[i].centroid = 1;
+            }
+         }
+      }
+      break;
+   case TGSI_FILE_OUTPUT:
+      for (i = first; i <= last; ++i, ++si) {
+         switch (sn) {
+         case TGSI_SEMANTIC_POSITION:
+            if (info->type == PIPE_SHADER_FRAGMENT)
+               info->io.fragDepth = i;
+            else
+            // POSITION doubles as the clip vertex unless CLIPVERTEX is declared
+            if (clipVertexOutput < 0)
+               clipVertexOutput = i;
+            break;
+         case TGSI_SEMANTIC_COLOR:
+            if (info->type == PIPE_SHADER_FRAGMENT)
+               info->prop.fp.numColourResults++;
+            break;
+         case TGSI_SEMANTIC_EDGEFLAG:
+            info->io.edgeFlagOut = i;
+            break;
+         case TGSI_SEMANTIC_CLIPVERTEX:
+            clipVertexOutput = i;
+            break;
+         case TGSI_SEMANTIC_CLIPDIST:
+            // explicit clip distances disable generated user clipping
+            info->io.clipDistanceMask |=
+               decl->Declaration.UsageMask << (si * 4);
+            info->io.genUserClip = -1;
+            break;
+         default:
+            break;
+         }
+         info->out[i].id = i;
+         info->out[i].sn = sn;
+         info->out[i].si = si;
+      }
+      break;
+   case TGSI_FILE_SYSTEM_VALUE:
+      switch (sn) {
+      case TGSI_SEMANTIC_INSTANCEID:
+         info->io.instanceId = first;
+         break;
+      case TGSI_SEMANTIC_VERTEXID:
+         info->io.vertexId = first;
+         break;
+      default:
+         break;
+      }
+      for (i = first; i <= last; ++i, ++si) {
+         info->sv[i].sn = sn;
+         info->sv[i].si = si;
+         info->sv[i].input = inferSysValDirection(sn);
+      }
+      break;
+   case TGSI_FILE_RESOURCE:
+      for (i = first; i <= last; ++i) {
+         resources[i].target = decl->Resource.Resource;
+         resources[i].raw = decl->Resource.Raw;
+         resources[i].slot = i;
+      }
+      break;
+   case TGSI_FILE_SAMPLER_VIEW:
+      for (i = first; i <= last; ++i)
+         textureViews[i].target = decl->SamplerView.Resource;
+      break;
+   case TGSI_FILE_NULL:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_ADDRESS:
+   case TGSI_FILE_CONSTANT:
+   case TGSI_FILE_IMMEDIATE:
+   case TGSI_FILE_PREDICATE:
+   case TGSI_FILE_SAMPLER:
+      // nothing to record for these files at declaration time
+      break;
+   default:
+      ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File);
+      return false;
+   }
+   return true;
+}
+
+// True if @insn is "MOV edgeFlagOut, input" — a plain copy of an input to
+// the edge-flag output, which the driver can wire up directly.
+inline bool Source::isEdgeFlagPassthrough(const Instruction& insn) const
+{
+   return insn.getOpcode() == TGSI_OPCODE_MOV &&
+      insn.getDst(0).getIndex(0) == info->io.edgeFlagOut &&
+      insn.getSrc(0).getFile() == TGSI_FILE_INPUT;
+}
+
+// Per-instruction scan: accumulate input/output usage masks, detect
+// indirect temporary addressing (forces temps into l[]), global memory
+// access, barriers, and the edge-flag passthrough pattern.
+bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
+{
+   Instruction insn(inst);
+
+   if (insn.getOpcode() == TGSI_OPCODE_BARRIER)
+      info->numBarriers = 1;
+
+   if (insn.dstCount()) {
+      if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
+         Instruction::DstRegister dst = insn.getDst(0);
+
+         // indirect writes could hit any output -> mark them all used
+         if (dst.isIndirect(0))
+            for (unsigned i = 0; i < info->numOutputs; ++i)
+               info->out[i].mask = 0xf;
+         else
+            info->out[dst.getIndex(0)].mask |= dst.getMask();
+
+         // scalar semantics only ever use component x
+         if (info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PSIZE ||
+             info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PRIMID ||
+             info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_FOG)
+            info->out[dst.getIndex(0)].mask &= 1;
+
+         if (isEdgeFlagPassthrough(insn))
+            info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
+      } else
+      if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
+         if (insn.getDst(0).isIndirect(0))
+            mainTempsInLMem = TRUE;
+      }
+   }
+
+   for (unsigned s = 0; s < insn.srcCount(); ++s) {
+      Instruction::SrcRegister src = insn.getSrc(s);
+      if (src.getFile() == TGSI_FILE_TEMPORARY) {
+         if (src.isIndirect(0))
+            mainTempsInLMem = TRUE;
+      } else
+      if (src.getFile() == TGSI_FILE_RESOURCE) {
+         // track read (0x1) / write (0x2) access to global memory
+         if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
+            info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
+               0x1 : 0x2;
+      }
+      if (src.getFile() != TGSI_FILE_INPUT)
+         continue;
+      unsigned mask = insn.srcMask(s);
+
+      if (src.isIndirect(0)) {
+         // indirect reads could hit any input -> mark them all fully used
+         for (unsigned i = 0; i < info->numInputs; ++i)
+            info->in[i].mask = 0xf;
+      } else {
+         const int i = src.getIndex(0);
+         for (unsigned c = 0; c < 4; ++c) {
+            if (!(mask & (1 << c)))
+               continue;
+            int k = src.getSwizzle(c);
+            if (k <= TGSI_SWIZZLE_W)
+               info->in[i].mask |= 1 << k;
+         }
+         // clamp usage masks of semantics with fewer than 4 components
+         switch (info->in[i].sn) {
+         case TGSI_SEMANTIC_PSIZE:
+         case TGSI_SEMANTIC_PRIMID:
+         case TGSI_SEMANTIC_FOG:
+            info->in[i].mask &= 0x1;
+            break;
+         case TGSI_SEMANTIC_PCOORD:
+            info->in[i].mask &= 0x3;
+            break;
+         default:
+            break;
+         }
+      }
+   }
+   return true;
+}
+
+nv50_ir::TexInstruction::Target
+Instruction::getTexture(const tgsi::Source *code, int s) const
+{
+ // XXX: indirect access
+ unsigned int r;
+
+ switch (getSrc(s).getFile()) {
+ case TGSI_FILE_RESOURCE:
+ r = getSrc(s).getIndex(0);
+ return translateTexture(code->resources.at(r).target);
+ case TGSI_FILE_SAMPLER_VIEW:
+ r = getSrc(s).getIndex(0);
+ return translateTexture(code->textureViews.at(r).target);
+ default:
+ return translateTexture(insn->Texture.Texture);
+ }
+}
+
+} // namespace tgsi
+
+namespace {
+
+using namespace nv50_ir;
+
// Translates a scanned TGSI shader (tgsi::Source) into nv50 IR, using the
// inherited BuildUtil helpers to emit instructions.
class Converter : public BuildUtil
{
public:
   Converter(Program *, const tgsi::Source *);
   ~Converter();

   bool run();

private:
   // Per-subroutine state: the IR function and its value map.
   struct Subroutine
   {
      Subroutine(Function *f) : f(f) { }
      Function *f;
      ValueMap values;
   };

   Value *getVertexBase(int s);
   DataArray *getArrayForFile(unsigned file, int idx);
   Value *fetchSrc(int s, int c);
   Value *acquireDst(int d, int c);
   void storeDst(int d, int c, Value *);

   Value *fetchSrc(const tgsi::Instruction::SrcRegister src, int c, Value *ptr);
   void storeDst(const tgsi::Instruction::DstRegister dst, int c,
                 Value *val, Value *ptr);

   Value *applySrcMod(Value *, int s, int c);

   Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr);
   Symbol *srcToSym(tgsi::Instruction::SrcRegister, int c);
   Symbol *dstToSym(tgsi::Instruction::DstRegister, int c);

   bool handleInstruction(const struct tgsi_full_instruction *);
   void exportOutputs();
   inline Subroutine *getSubroutine(unsigned ip);
   inline Subroutine *getSubroutine(Function *);
   inline bool isEndOfSubroutine(uint ip);

   void loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask);

   // R,S,L,C,Dx,Dy encode TGSI sources for respective values (0xSf for auto)
   void setTexRS(TexInstruction *, unsigned int& s, int R, int S);
   void handleTEX(Value *dst0[4], int R, int S, int L, int C, int Dx, int Dy);
   void handleTXF(Value *dst0[4], int R, int L_M);
   void handleTXQ(Value *dst0[4], enum TexQuery);
   void handleLIT(Value *dst0[4]);
   void handleUserClipPlanes();

   Symbol *getResourceBase(int r);
   void getResourceCoords(std::vector<Value *>&, int r, int s);

   void handleLOAD(Value *dst0[4]);
   void handleSTORE();
   void handleATOM(Value *dst0[4], DataType, uint16_t subOp);

   Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr);

   void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork);

   Value *buildDot(int dim);

   // Pass that binds values live across subroutine calls to the functions'
   // in/out prototypes and updates the argument lists of call instructions.
   class BindArgumentsPass : public Pass {
   public:
      BindArgumentsPass(Converter &conv) : conv(conv) { }

   private:
      Converter &conv;
      Subroutine *sub;

      inline const Location *getValueLocation(Subroutine *, Value *);

      template<typename T> inline void
      updateCallArgs(Instruction *i, void (Instruction::*setArg)(int, Value *),
                     T (Function::*proto));

      template<typename T> inline void
      updatePrototype(BitSet *set, void (Function::*updateSet)(),
                      T (Function::*proto));

   protected:
      bool visit(Function *);
      bool visit(BasicBlock *bb) { return false; }
   };

private:
   const struct tgsi::Source *code;
   const struct nv50_ir_prog_info *info;

   struct {
      std::map<unsigned, Subroutine> map;
      Subroutine *cur;
   } sub;

   uint ip; // instruction pointer

   tgsi::Instruction tgsi; // the TGSI instruction currently being converted

   DataType dstTy;
   DataType srcTy;

   DataArray tData; // TGSI_FILE_TEMPORARY
   DataArray aData; // TGSI_FILE_ADDRESS
   DataArray pData; // TGSI_FILE_PREDICATE
   DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers)

   Value *zero;
   Value *fragCoord[4];
   Value *clipVtx[4]; // copy of the clip vertex, see storeDst()

   Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP)
   uint8_t vtxBaseValid;

   Stack condBBs;  // fork BB, then else clause BB
   Stack joinBBs;  // fork BB, for inserting join ops on ENDIF
   Stack loopBBs;  // loop headers
   Stack breakBBs; // end of / after loop
};
+
+Symbol *
+Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c)
+{
+ const int swz = src.getSwizzle(c);
+
+ return makeSym(src.getFile(),
+ src.is2D() ? src.getIndex(1) : 0,
+ src.isIndirect(0) ? -1 : src.getIndex(0), swz,
+ src.getIndex(0) * 16 + swz * 4);
+}
+
+Symbol *
+Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c)
+{
+ return makeSym(dst.getFile(),
+ dst.is2D() ? dst.getIndex(1) : 0,
+ dst.isIndirect(0) ? -1 : dst.getIndex(0), c,
+ dst.getIndex(0) * 16 + c * 4);
+}
+
+Symbol *
+Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
+{
+ Symbol *sym = new_Symbol(prog, tgsi::translateFile(tgsiFile));
+
+ sym->reg.fileIndex = fileIdx;
+
+ if (idx >= 0) {
+ if (sym->reg.file == FILE_SHADER_INPUT)
+ sym->setOffset(info->in[idx].slot[c] * 4);
+ else
+ if (sym->reg.file == FILE_SHADER_OUTPUT)
+ sym->setOffset(info->out[idx].slot[c] * 4);
+ else
+ if (sym->reg.file == FILE_SYSTEM_VALUE)
+ sym->setSV(tgsi::translateSysVal(info->sv[idx].sn), c);
+ else
+ sym->setOffset(address);
+ } else {
+ sym->setOffset(address);
+ }
+ return sym;
+}
+
+static inline uint8_t
+translateInterpMode(const struct nv50_ir_varying *var, operation& op)
+{
+ uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;
+
+ if (var->flat)
+ mode = NV50_IR_INTERP_FLAT;
+ else
+ if (var->linear)
+ mode = NV50_IR_INTERP_LINEAR;
+ else
+ if (var->sc)
+ mode = NV50_IR_INTERP_SC;
+
+ op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
+ ? OP_PINTERP : OP_LINTERP;
+
+ if (var->centroid)
+ mode |= NV50_IR_INTERP_CENTROID;
+
+ return mode;
+}
+
+Value *
+Converter::interpolate(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
+{
+ operation op;
+
+ // XXX: no way to know interpolation mode if we don't know what's accessed
+ const uint8_t mode = translateInterpMode(&info->in[ptr ? 0 :
+ src.getIndex(0)], op);
+
+ Instruction *insn = new_Instruction(func, op, TYPE_F32);
+
+ insn->setDef(0, getScratch());
+ insn->setSrc(0, srcToSym(src, c));
+ if (op == OP_PINTERP)
+ insn->setSrc(1, fragCoord[3]);
+ if (ptr)
+ insn->setIndirect(0, 0, ptr);
+
+ insn->setInterpolate(mode);
+
+ bb->insertTail(insn);
+ return insn->getDef(0);
+}
+
+Value *
+Converter::applySrcMod(Value *val, int s, int c)
+{
+ Modifier m = tgsi.getSrc(s).getMod(c);
+ DataType ty = tgsi.inferSrcType();
+
+ if (m & Modifier(NV50_IR_MOD_ABS))
+ val = mkOp1v(OP_ABS, ty, getScratch(), val);
+
+ if (m & Modifier(NV50_IR_MOD_NEG))
+ val = mkOp1v(OP_NEG, ty, getScratch(), val);
+
+ return val;
+}
+
+Value *
+Converter::getVertexBase(int s)
+{
+ assert(s < 5);
+ if (!(vtxBaseValid & (1 << s))) {
+ const int index = tgsi.getSrc(s).getIndex(1);
+ Value *rel = NULL;
+ if (tgsi.getSrc(s).isIndirect(1))
+ rel = fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL);
+ vtxBaseValid |= 1 << s;
+ vtxBase[s] = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(), mkImm(index), rel);
+ }
+ return vtxBase[s];
+}
+
+Value *
+Converter::fetchSrc(int s, int c)
+{
+ Value *res;
+ Value *ptr = NULL, *dimRel = NULL;
+
+ tgsi::Instruction::SrcRegister src = tgsi.getSrc(s);
+
+ if (src.isIndirect(0))
+ ptr = fetchSrc(src.getIndirect(0), 0, NULL);
+
+ if (src.is2D()) {
+ switch (src.getFile()) {
+ case TGSI_FILE_INPUT:
+ dimRel = getVertexBase(s);
+ break;
+ case TGSI_FILE_CONSTANT:
+ // on NVC0, this is valid and c{I+J}[k] == cI[(J << 16) + k]
+ if (src.isIndirect(1))
+ dimRel = fetchSrc(src.getIndirect(1), 0, 0);
+ break;
+ default:
+ break;
+ }
+ }
+
+ res = fetchSrc(src, c, ptr);
+
+ if (dimRel)
+ res->getInsn()->setIndirect(0, 1, dimRel);
+
+ return applySrcMod(res, s, c);
+}
+
+Converter::DataArray *
+Converter::getArrayForFile(unsigned file, int idx)
+{
+ switch (file) {
+ case TGSI_FILE_TEMPORARY:
+ return &tData;
+ case TGSI_FILE_PREDICATE:
+ return &pData;
+ case TGSI_FILE_ADDRESS:
+ return &aData;
+ case TGSI_FILE_OUTPUT:
+ assert(prog->getType() == Program::TYPE_FRAGMENT);
+ return &oData;
+ default:
+ assert(!"invalid/unhandled TGSI source file");
+ return NULL;
+ }
+}
+
// Fetch one component of a TGSI source operand as an IR value.
// ptr is the indirect address value, or NULL for direct access.
Value *
Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
{
   const int idx2d = src.is2D() ? src.getIndex(1) : 0;
   const int idx = src.getIndex(0);
   const int swz = src.getSwizzle(c);

   switch (src.getFile()) {
   case TGSI_FILE_IMMEDIATE:
      assert(!ptr);
      return loadImm(NULL, info->immd.data[idx * 4 + swz]);
   case TGSI_FILE_CONSTANT:
      return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
   case TGSI_FILE_INPUT:
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         // don't load masked inputs, won't be assigned a slot
         if (!ptr && !(info->in[idx].mask & (1 << swz)))
            return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
         // the front-face input is read as a system value
         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
         return interpolate(src, c, ptr);
      }
      return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
   case TGSI_FILE_OUTPUT:
      assert(!"load from output file");
      return NULL;
   case TGSI_FILE_SYSTEM_VALUE:
      assert(!ptr);
      return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
   default:
      // temporaries, predicates, addresses: read from the register array
      return getArrayForFile(src.getFile(), idx2d)->load(
         sub.cur->values, idx, swz, ptr);
   }
}
+
+Value *
+Converter::acquireDst(int d, int c)
+{
+ const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
+ const unsigned f = dst.getFile();
+ const int idx = dst.getIndex(0);
+ const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+
+ if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+ return NULL;
+
+ if (dst.isIndirect(0) ||
+ f == TGSI_FILE_SYSTEM_VALUE ||
+ (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT))
+ return getScratch();
+
+ return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c);
+}
+
// Store component c of destination d, applying the instruction's
// saturation mode to val first.
void
Converter::storeDst(int d, int c, Value *val)
{
   const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);

   switch (tgsi.getSaturate()) {
   case TGSI_SAT_NONE:
      break;
   case TGSI_SAT_ZERO_ONE:
      mkOp1(OP_SAT, dstTy, val, val);
      break;
   case TGSI_SAT_MINUS_PLUS_ONE:
      // clamp to [-1, +1]
      mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f));
      mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f));
      break;
   default:
      assert(!"invalid saturation mode");
      break;
   }

   Value *ptr = dst.isIndirect(0) ?
      fetchSrc(dst.getIndirect(0), 0, NULL) : NULL;

   // Keep a copy of the clip vertex for later user clip plane emission.
   if (info->io.genUserClip > 0 &&
       dst.getFile() == TGSI_FILE_OUTPUT &&
       !dst.isIndirect(0) && dst.getIndex(0) == code->clipVertexOutput) {
      mkMov(clipVtx[c], val);
      val = clipVtx[c];
   }

   storeDst(dst, c, val, ptr);
}
+
+void
+Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
+ Value *val, Value *ptr)
+{
+ const unsigned f = dst.getFile();
+ const int idx = dst.getIndex(0);
+ const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+
+ if (f == TGSI_FILE_SYSTEM_VALUE) {
+ assert(!ptr);
+ mkOp2(OP_WRSV, TYPE_U32, NULL, dstToSym(dst, c), val);
+ } else
+ if (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT) {
+ if (ptr || (info->out[idx].mask & (1 << c)))
+ mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val);
+ } else
+ if (f == TGSI_FILE_TEMPORARY ||
+ f == TGSI_FILE_PREDICATE ||
+ f == TGSI_FILE_ADDRESS ||
+ f == TGSI_FILE_OUTPUT) {
+ getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val);
+ } else {
+ assert(!"invalid dst file");
+ }
+}
+
// Iterate chan over the components of destination d of instruction inst
// that are not masked off.
#define FOR_EACH_DST_ENABLED_CHANNEL(d, chan, inst) \
   for (chan = 0; chan < 4; ++chan) \
      if (!inst.getDst(d).isMasked(chan))
+
+Value *
+Converter::buildDot(int dim)
+{
+ assert(dim > 0);
+
+ Value *src0 = fetchSrc(0, 0), *src1 = fetchSrc(1, 0);
+ Value *dotp = getScratch();
+
+ mkOp2(OP_MUL, TYPE_F32, dotp, src0, src1);
+
+ for (int c = 1; c < dim; ++c) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkOp3(OP_MAD, TYPE_F32, dotp, src0, src1, dotp);
+ }
+ return dotp;
+}
+
// Insert a JOIN at the head of the convergence block and a JOINAT before
// the exit of the fork block so diverged threads re-converge after
// conditional control flow.
void
Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork)
{
   FlowInstruction *join = new_FlowInstruction(func, OP_JOIN, NULL);
   join->fixed = 1; // must not be optimized away
   conv->insertHead(join);

   fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv);
   fork->insertBefore(fork->getExit(), fork->joinAt);
}
+
+void
+Converter::setTexRS(TexInstruction *tex, unsigned int& s, int R, int S)
+{
+ unsigned rIdx = 0, sIdx = 0;
+
+ if (R >= 0)
+ rIdx = tgsi.getSrc(R).getIndex(0);
+ if (S >= 0)
+ sIdx = tgsi.getSrc(S).getIndex(0);
+
+ tex->setTexture(tgsi.getTexture(code, R), rIdx, sIdx);
+
+ if (tgsi.getSrc(R).isIndirect(0)) {
+ tex->tex.rIndirectSrc = s;
+ tex->setSrc(s++, fetchSrc(tgsi.getSrc(R).getIndirect(0), 0, NULL));
+ }
+ if (S >= 0 && tgsi.getSrc(S).isIndirect(0)) {
+ tex->tex.sIndirectSrc = s;
+ tex->setSrc(s++, fetchSrc(tgsi.getSrc(S).getIndirect(0), 0, NULL));
+ }
+}
+
// Emit a texture query (TXQ) for the given query type.
void
Converter::handleTXQ(Value *dst0[4], enum TexQuery query)
{
   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
   tex->tex.query = query;
   unsigned int c, d;

   // collect the enabled destination components
   for (d = 0, c = 0; c < 4; ++c) {
      if (!dst0[c])
         continue;
      tex->tex.mask |= 1 << c;
      tex->setDef(d++, dst0[c]);
   }
   tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level

   setTexRS(tex, c, 1, -1); // resource is TGSI source 1, no sampler

   bb->insertTail(tex);
}
+
// Divide the coordinates in src selected by mask by the projection value
// (TGSI source 0, component w), writing the results to dst. Coordinates
// produced by OP_PINTERP are instead re-emitted with 1/proj as their
// perspective input, avoiding an explicit multiply.
void
Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
{
   Value *proj = fetchSrc(0, 3);
   Instruction *insn = proj->getUniqueInsn();
   int c;

   if (insn->op == OP_PINTERP) {
      // re-interpolate the projector linearly before taking its reciprocal
      bb->insertTail(insn = cloneForward(func, insn));
      insn->op = OP_LINTERP;
      insn->setInterpolate(NV50_IR_INTERP_LINEAR | insn->getSampleMode());
      insn->setSrc(1, NULL);
      proj = insn->getDef(0);
   }
   proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), proj);

   // PINTERP'ed coords: clone with 1/proj as the perspective divisor
   for (c = 0; c < 4; ++c) {
      if (!(mask & (1 << c)))
         continue;
      if ((insn = src[c]->getUniqueInsn())->op != OP_PINTERP)
         continue;
      mask &= ~(1 << c);

      bb->insertTail(insn = cloneForward(func, insn));
      insn->setInterpolate(NV50_IR_INTERP_PERSPECTIVE | insn->getSampleMode());
      insn->setSrc(1, proj);
      dst[c] = insn->getDef(0);
   }
   if (!mask)
      return;

   // remaining coords get an explicit multiply by 1/w
   proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), fetchSrc(0, 3));

   for (c = 0; c < 4; ++c)
      if (mask & (1 << c))
         dst[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), src[c], proj);
}
+
// order of nv50 ir sources: x y z layer lod/bias shadow
// order of TGSI TEX sources: x y z layer shadow lod/bias
// lowering will finally set the hw specific order (like array first on nvc0)
//
// R,S = TGSI source slots of resource/sampler; L,C,Dx,Dy encode the
// source slot in bits 7:4 and the component in bits 1:0 (see class decl).
void
Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
{
   Value *val;
   Value *arg[4], *src[8];
   Value *lod = NULL, *shd = NULL;
   unsigned int s, c, d;
   TexInstruction *texi = new_TexInstruction(func, tgsi.getOP());

   TexInstruction::Target tgt = tgsi.getTexture(code, R);

   // fetch the coordinate arguments
   for (s = 0; s < tgt.getArgCount(); ++s)
      arg[s] = src[s] = fetchSrc(0, s);

   if (texi->op == OP_TXL || texi->op == OP_TXB)
      lod = fetchSrc(L >> 4, L & 3);

   if (C == 0x0f)
      C = 0x00 | MAX2(tgt.getArgCount(), 2); // guess DC src

   if (tgt.isShadow())
      shd = fetchSrc(C >> 4, C & 3);

   if (texi->op == OP_TXD) {
      // explicit derivatives
      for (c = 0; c < tgt.getDim(); ++c) {
         texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c));
         texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c));
      }
   }

   // cube textures don't care about projection value, it's divided out
   if (tgsi.getOpcode() == TGSI_OPCODE_TXP && !tgt.isCube() && !tgt.isArray()) {
      unsigned int n = tgt.getDim();
      if (shd) {
         arg[n] = shd;
         ++n;
         assert(tgt.getDim() == tgt.getArgCount());
      }
      loadProjTexCoords(src, arg, (1 << n) - 1);
      if (shd)
         shd = src[n - 1];
   }

   if (tgt.isCube()) {
      // divide coordinates by the absolute value of the major axis
      for (c = 0; c < 3; ++c)
         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
      val = getScratch();
      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c)
         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
   }

   // hook up the enabled destination components
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
         texi->tex.mask |= 1 << c;
      } else {
         // NOTE: maybe hook up def too, for CSE
      }
   }
   for (s = 0; s < tgt.getArgCount(); ++s)
      texi->setSrc(s, src[s]);
   if (lod)
      texi->setSrc(s++, lod);
   if (shd)
      texi->setSrc(s++, shd);

   setTexRS(texi, s, R, S);

   if (tgsi.getOpcode() == TGSI_OPCODE_SAMPLE_C_LZ)
      texi->tex.levelZero = true;

   bb->insertTail(texi);
}
+
// 1st source: xyz = coordinates, w = lod/sample
// 2nd source: offset
//
// Emit a texel fetch (TXF); L_M encodes the TGSI source of the lod or
// multisample index (slot in bits 7:4, component in bits 1:0).
void
Converter::handleTXF(Value *dst[4], int R, int L_M)
{
   TexInstruction *texi = new_TexInstruction(func, tgsi.getOP());
   int ms;
   unsigned int c, d, s;

   texi->tex.target = tgsi.getTexture(code, R);

   ms = texi->tex.target.isMS() ? 1 : 0;
   texi->tex.levelZero = ms; /* MS textures don't have mip-maps */

   // hook up the enabled destination components
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
         texi->tex.mask |= 1 << c;
      }
   }
   for (c = 0; c < (texi->tex.target.getArgCount() - ms); ++c)
      texi->setSrc(c, fetchSrc(0, c));
   texi->setSrc(c++, fetchSrc(L_M >> 4, L_M & 3)); // lod or ms

   setTexRS(texi, c, R, -1);

   // constant texel offsets
   for (s = 0; s < tgsi.getNumTexOffsets(); ++s) {
      for (c = 0; c < 3; ++c) {
         texi->tex.offset[s][c] = tgsi.getTexOffset(s).getValueU32(c, info);
         if (texi->tex.offset[s][c])
            texi->tex.useOffsets = s + 1;
      }
   }

   bb->insertTail(texi);
}
+
// TGSI LIT: dst = (1, max(src.x, 0), lighting coefficient, 1),
// where z = (x > 0) ? pow(clamp(y), clamp(w)) : 0.
void
Converter::handleLIT(Value *dst0[4])
{
   Value *val0 = NULL;
   unsigned int mask = tgsi.getDst(0).getMask();

   if (mask & (1 << 0))
      loadImm(dst0[0], 1.0f);

   if (mask & (1 << 3))
      loadImm(dst0[3], 1.0f);

   // val0 = max(src.x, 0); also needed as the SLCT predicate for z
   if (mask & (3 << 1)) {
      val0 = getScratch();
      mkOp2(OP_MAX, TYPE_F32, val0, fetchSrc(0, 0), zero);
      if (mask & (1 << 1))
         mkMov(dst0[1], val0);
   }

   if (mask & (1 << 2)) {
      Value *src1 = fetchSrc(0, 1), *src3 = fetchSrc(0, 3);
      Value *val1 = getScratch(), *val3 = getScratch();

      // clamp the exponent to +/-127.999999 as specified for LIT
      Value *pos128 = loadImm(NULL, +127.999999f);
      Value *neg128 = loadImm(NULL, -127.999999f);

      mkOp2(OP_MAX, TYPE_F32, val1, src1, zero);
      mkOp2(OP_MAX, TYPE_F32, val3, src3, neg128);
      mkOp2(OP_MIN, TYPE_F32, val3, val3, pos128);
      mkOp2(OP_POW, TYPE_F32, val3, val1, val3);

      // z = (max(x, 0) > 0) ? val3 : 0
      mkCmp(OP_SLCT, CC_GT, TYPE_F32, dst0[2], val3, zero, val0);
   }
}
+
+static inline bool
+isResourceSpecial(const int r)
+{
+ return (r == TGSI_RESOURCE_GLOBAL ||
+ r == TGSI_RESOURCE_LOCAL ||
+ r == TGSI_RESOURCE_PRIVATE ||
+ r == TGSI_RESOURCE_INPUT);
+}
+
+static inline bool
+isResourceRaw(const struct tgsi::Source *code, const int r)
+{
+ return isResourceSpecial(r) || code->resources[r].raw;
+}
+
+static inline nv50_ir::TexTarget
+getResourceTarget(const struct tgsi::Source *code, int r)
+{
+ if (isResourceSpecial(r))
+ return nv50_ir::TEX_TARGET_BUFFER;
+ return tgsi::translateTexture(code->resources.at(r).target);
+}
+
// Return the base symbol for resource r, mapping the special TGSI
// resources onto the corresponding nv50 IR memory files.
Symbol *
Converter::getResourceBase(const int r)
{
   Symbol *sym = NULL;

   switch (r) {
   case TGSI_RESOURCE_GLOBAL:
      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
      break;
   case TGSI_RESOURCE_LOCAL:
      assert(prog->getType() == Program::TYPE_COMPUTE);
      sym = mkSymbol(nv50_ir::FILE_MEMORY_SHARED, 0, TYPE_U32,
                     info->prop.cp.sharedOffset);
      break;
   case TGSI_RESOURCE_PRIVATE:
      // private memory lives past the thread-local storage already in use
      sym = mkSymbol(nv50_ir::FILE_MEMORY_LOCAL, 0, TYPE_U32,
                     info->bin.tlsSpace);
      break;
   case TGSI_RESOURCE_INPUT:
      assert(prog->getType() == Program::TYPE_COMPUTE);
      sym = mkSymbol(nv50_ir::FILE_SHADER_INPUT, 0, TYPE_U32,
                     info->prop.cp.inputOffset);
      break;
   default:
      sym = new_Symbol(prog,
                       nv50_ir::FILE_MEMORY_GLOBAL, code->resources.at(r).slot);
      break;
   }
   return sym;
}
+
+void
+Converter::getResourceCoords(std::vector<Value *> &coords, int r, int s)
+{
+ const int arg =
+ TexInstruction::Target(getResourceTarget(code, r)).getArgCount();
+
+ for (int c = 0; c < arg; ++c)
+ coords.push_back(fetchSrc(s, c));
+
+ // NOTE: TGSI_RESOURCE_GLOBAL needs FILE_GPR; this is an nv50 quirk
+ if (r == TGSI_RESOURCE_LOCAL ||
+ r == TGSI_RESOURCE_PRIVATE ||
+ r == TGSI_RESOURCE_INPUT)
+ coords[0] = mkOp1v(OP_MOV, TYPE_U32, getScratch(4, FILE_ADDRESS),
+ coords[0]);
+}
+
// Split the 4-bit component access mask into at most two contiguous runs.
// comp[i] receives the first component of run i, size[i] its length; the
// number of runs (1 or 2) is returned. A 3-component run is split into a
// 1+2 or 2+1 pair so both parts are naturally aligned power-of-two sizes
// (12-byte accesses are not supported).
static inline int
partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
{
   int n = 0;

   while (mask) {
      if (mask & 1) {
         size[n]++;
      } else {
         if (size[n])
            // End of the first run: the second run starts past this gap.
            // Must include the comp[0] offset (e.g. mask 0b1010 needs
            // comp[1] == 3, not size[0] + 1 == 2).
            comp[n = 1] = comp[0] + size[0] + 1;
         else
            comp[n]++;
      }
      mask >>= 1;
   }
   if (size[0] == 3) {
      n = 1;
      size[0] = (comp[0] == 1) ? 1 : 2;
      size[1] = 3 - size[0];
      comp[1] = comp[0] + size[0];
   }
   return n + 1;
}
+
// For raw loads, granularity is 4 byte.
// Usage of the texture read mask on OP_SULDP is not allowed.
void
Converter::handleLOAD(Value *dst0[4])
{
   const int r = tgsi.getSrc(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, ldv, def;

   getResourceCoords(off, r, 1);

   if (isResourceRaw(code, r)) {
      uint8_t mask = 0;
      uint8_t comp[2] = { 0, 0 };
      uint8_t size[2] = { 0, 0 };

      Symbol *base = getResourceBase(r);

      // determine the base and size of the at most 2 load ops
      for (c = 0; c < 4; ++c)
         if (!tgsi.getDst(0).isMasked(c))
            mask |= 1 << (tgsi.getSrc(0).getSwizzle(c) - TGSI_SWIZZLE_X);

      int n = partitionLoadStore(comp, size, mask);

      src = off;

      def.resize(4); // index by component, the ones we need will be non-NULL
      for (c = 0; c < 4; ++c) {
         // load straight into the destination when the swizzle is identity
         if (dst0[c] && tgsi.getSrc(0).getSwizzle(c) == (TGSI_SWIZZLE_X + c))
            def[c] = dst0[c];
         else
         if (mask & (1 << c))
            def[c] = getScratch();
      }

      // nv50-style buffers are accessed via plain loads, not surface ops
      const bool useLd = isResourceSpecial(r) ||
         (info->io.nv50styleSurfaces &&
          code->resources[r].target == TGSI_TEXTURE_BUFFER);

      for (int i = 0; i < n; ++i) {
         ldv.assign(def.begin() + comp[i], def.begin() + comp[i] + size[i]);

         if (comp[i]) // adjust x component of source address if necessary
            src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file),
                            off[0], mkImm(comp[i] * 4));
         else
            src[0] = off[0];

         if (useLd) {
            Instruction *ld =
               mkLoad(typeOfSize(size[i] * 4), ldv[0], base, src[0]);
            for (size_t c = 1; c < ldv.size(); ++c)
               ld->setDef(c, ldv[c]);
         } else {
            mkTex(OP_SULDB, getResourceTarget(code, r), code->resources[r].slot,
                  0, ldv, src)->dType = typeOfSize(size[i] * 4);
         }
      }
   } else {
      def.resize(4);
      for (c = 0; c < 4; ++c) {
         if (!dst0[c] || tgsi.getSrc(0).getSwizzle(c) != (TGSI_SWIZZLE_X + c))
            def[c] = getScratch();
         else
            def[c] = dst0[c];
      }

      mkTex(OP_SULDP, getResourceTarget(code, r), code->resources[r].slot, 0,
            def, off);
   }
   // move swizzled components into their actual destinations
   FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
      if (dst0[c] != def[c])
         mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
}
+
// For formatted stores, the write mask on OP_SUSTP can be used.
// Raw stores have to be split.
void
Converter::handleSTORE()
{
   const int r = tgsi.getDst(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, dummy;

   getResourceCoords(off, r, 0);
   src = off;
   const int s = src.size();

   if (isResourceRaw(code, r)) {
      uint8_t comp[2] = { 0, 0 };
      uint8_t size[2] = { 0, 0 };

      // split the write mask into at most 2 contiguous store ops
      int n = partitionLoadStore(comp, size, tgsi.getDst(0).getMask());

      Symbol *base = getResourceBase(r);

      // nv50-style buffers are accessed via plain stores, not surface ops
      const bool useSt = isResourceSpecial(r) ||
         (info->io.nv50styleSurfaces &&
          code->resources[r].target == TGSI_TEXTURE_BUFFER);

      for (int i = 0; i < n; ++i) {
         if (comp[i]) // adjust x component of source address if necessary
            src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file),
                            off[0], mkImm(comp[i] * 4));
         else
            src[0] = off[0];

         const DataType stTy = typeOfSize(size[i] * 4);

         if (useSt) {
            Instruction *st =
               mkStore(OP_STORE, stTy, base, NULL, fetchSrc(1, comp[i]));
            for (c = 1; c < size[i]; ++c)
               st->setSrc(1 + c, fetchSrc(1, comp[i] + c));
            st->setIndirect(0, 0, src[0]);
         } else {
            // attach values to be stored
            src.resize(s + size[i]);
            for (c = 0; c < size[i]; ++c)
               src[s + c] = fetchSrc(1, comp[i] + c);
            mkTex(OP_SUSTB, getResourceTarget(code, r), code->resources[r].slot,
                  0, dummy, src)->setType(stTy);
         }
      }
   } else {
      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
         src.push_back(fetchSrc(1, c));

      mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
            dummy, src)->tex.mask = tgsi.getDst(0).getMask();
   }
}
+
// XXX: These only work on resources with the single-component u32/s32 formats.
// Therefore the result is replicated. This might not be intended by TGSI, but
// operating on more than 1 component would produce undefined results because
// they do not exist.
void
Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
{
   const int r = tgsi.getSrc(0).getIndex(0);
   std::vector<Value *> srcv;
   std::vector<Value *> defv;
   LValue *dst = getScratch();

   getResourceCoords(srcv, r, 1);

   if (isResourceSpecial(r)) {
      // memory-backed resources: plain OP_ATOM on the base symbol
      assert(r != TGSI_RESOURCE_INPUT);
      Instruction *insn;
      insn = mkOp2(OP_ATOM, ty, dst, getResourceBase(r), fetchSrc(2, 0));
      insn->subOp = subOp;
      if (subOp == NV50_IR_SUBOP_ATOM_CAS)
         insn->setSrc(2, fetchSrc(3, 0)); // compare-and-swap new value
      insn->setIndirect(0, 0, srcv.at(0));
   } else {
      // bound surfaces: reduction op through the surface unit
      operation op = isResourceRaw(code, r) ? OP_SUREDB : OP_SUREDP;
      TexTarget targ = getResourceTarget(code, r);
      int idx = code->resources[r].slot;
      defv.push_back(dst);
      srcv.push_back(fetchSrc(2, 0));
      if (subOp == NV50_IR_SUBOP_ATOM_CAS)
         srcv.push_back(fetchSrc(3, 0));
      TexInstruction *tex = mkTex(op, targ, idx, 0, defv, srcv);
      tex->subOp = subOp;
      tex->tex.mask = 1;
      tex->setType(ty);
   }

   for (int c = 0; c < 4; ++c)
      if (dst0[c])
         dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
}
+
+Converter::Subroutine *
+Converter::getSubroutine(unsigned ip)
+{
+ std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip);
+
+ if (it == sub.map.end())
+ it = sub.map.insert(std::make_pair(
+ ip, Subroutine(new Function(prog, "SUB", ip)))).first;
+
+ return &it->second;
+}
+
+Converter::Subroutine *
+Converter::getSubroutine(Function *f)
+{
+ unsigned ip = f->getLabel();
+ std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip);
+
+ if (it == sub.map.end())
+ it = sub.map.insert(std::make_pair(ip, Subroutine(f))).first;
+
+ return &it->second;
+}
+
+bool
+Converter::isEndOfSubroutine(uint ip)
+{
+ assert(ip < code->scan.num_instructions);
+ tgsi::Instruction insn(&code->insns[ip]);
+ return (insn.getOpcode() == TGSI_OPCODE_END ||
+ insn.getOpcode() == TGSI_OPCODE_ENDSUB ||
+ // does END occur at end of main or the very end ?
+ insn.getOpcode() == TGSI_OPCODE_BGNSUB);
+}
+
+bool
+Converter::handleInstruction(const struct tgsi_full_instruction *insn)
+{
+ Instruction *geni;
+
+ Value *dst0[4], *rDst0[4];
+ Value *src0, *src1, *src2;
+ Value *val0, *val1;
+ int c;
+
+ tgsi = tgsi::Instruction(insn);
+
+ bool useScratchDst = tgsi.checkDstSrcAliasing();
+
+ operation op = tgsi.getOP();
+ dstTy = tgsi.inferDstType();
+ srcTy = tgsi.inferSrcType();
+
+ unsigned int mask = tgsi.dstCount() ? tgsi.getDst(0).getMask() : 0;
+
+ if (tgsi.dstCount()) {
+ for (c = 0; c < 4; ++c) {
+ rDst0[c] = acquireDst(0, c);
+ dst0[c] = (useScratchDst && rDst0[c]) ? getScratch() : rDst0[c];
+ }
+ }
+
+ switch (tgsi.getOpcode()) {
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_UADD:
+ case TGSI_OPCODE_AND:
+ case TGSI_OPCODE_DIV:
+ case TGSI_OPCODE_IDIV:
+ case TGSI_OPCODE_UDIV:
+ case TGSI_OPCODE_MAX:
+ case TGSI_OPCODE_MIN:
+ case TGSI_OPCODE_IMAX:
+ case TGSI_OPCODE_IMIN:
+ case TGSI_OPCODE_UMAX:
+ case TGSI_OPCODE_UMIN:
+ case TGSI_OPCODE_MOD:
+ case TGSI_OPCODE_UMOD:
+ case TGSI_OPCODE_MUL:
+ case TGSI_OPCODE_UMUL:
+ case TGSI_OPCODE_OR:
+ case TGSI_OPCODE_POW:
+ case TGSI_OPCODE_SHL:
+ case TGSI_OPCODE_ISHR:
+ case TGSI_OPCODE_USHR:
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_XOR:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkOp2(op, dstTy, dst0[c], src0, src1);
+ }
+ break;
+ case TGSI_OPCODE_MAD:
+ case TGSI_OPCODE_UMAD:
+ case TGSI_OPCODE_SAD:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ mkOp3(op, dstTy, dst0[c], src0, src1, src2);
+ }
+ break;
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_ABS:
+ case TGSI_OPCODE_CEIL:
+ case TGSI_OPCODE_FLR:
+ case TGSI_OPCODE_TRUNC:
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_IABS:
+ case TGSI_OPCODE_INEG:
+ case TGSI_OPCODE_NOT:
+ case TGSI_OPCODE_DDX:
+ case TGSI_OPCODE_DDY:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp1(op, dstTy, dst0[c], fetchSrc(0, c));
+ break;
+ case TGSI_OPCODE_RSQ:
+ src0 = fetchSrc(0, 0);
+ val0 = getScratch();
+ mkOp1(OP_ABS, TYPE_F32, val0, src0);
+ mkOp1(OP_RSQ, TYPE_F32, val0, val0);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_ARL:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ mkCvt(OP_CVT, TYPE_S32, dst0[c], TYPE_F32, src0)->rnd = ROUND_M;
+ mkOp2(OP_SHL, TYPE_U32, dst0[c], dst0[c], mkImm(4));
+ }
+ break;
+ case TGSI_OPCODE_UARL:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp2(OP_SHL, TYPE_U32, dst0[c], fetchSrc(0, c), mkImm(4));
+ break;
+ case TGSI_OPCODE_EX2:
+ case TGSI_OPCODE_LG2:
+ val0 = mkOp1(op, TYPE_F32, getScratch(), fetchSrc(0, 0))->getDef(0);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp1(OP_MOV, TYPE_F32, dst0[c], val0);
+ break;
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_SIN:
+ val0 = getScratch();
+ if (mask & 7) {
+ mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 0));
+ mkOp1(op, TYPE_F32, val0, val0);
+ for (c = 0; c < 3; ++c)
+ if (dst0[c])
+ mkMov(dst0[c], val0);
+ }
+ if (dst0[3]) {
+ mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 3));
+ mkOp1(op, TYPE_F32, dst0[3], val0);
+ }
+ break;
+ case TGSI_OPCODE_SCS:
+ if (mask & 3) {
+ val0 = mkOp1v(OP_PRESIN, TYPE_F32, getSSA(), fetchSrc(0, 0));
+ if (dst0[0])
+ mkOp1(OP_COS, TYPE_F32, dst0[0], val0);
+ if (dst0[1])
+ mkOp1(OP_SIN, TYPE_F32, dst0[1], val0);
+ }
+ if (dst0[2])
+ loadImm(dst0[2], 0.0f);
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_EXP:
+ src0 = fetchSrc(0, 0);
+ val0 = mkOp1v(OP_FLOOR, TYPE_F32, getSSA(), src0);
+ if (dst0[1])
+ mkOp2(OP_SUB, TYPE_F32, dst0[1], src0, val0);
+ if (dst0[0])
+ mkOp1(OP_EX2, TYPE_F32, dst0[0], val0);
+ if (dst0[2])
+ mkOp1(OP_EX2, TYPE_F32, dst0[2], src0);
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_LOG:
+ src0 = mkOp1v(OP_ABS, TYPE_F32, getSSA(), fetchSrc(0, 0));
+ val0 = mkOp1v(OP_LG2, TYPE_F32, dst0[2] ? dst0[2] : getSSA(), src0);
+ if (dst0[0] || dst0[1])
+ val1 = mkOp1v(OP_FLOOR, TYPE_F32, dst0[0] ? dst0[0] : getSSA(), val0);
+ if (dst0[1]) {
+ mkOp1(OP_EX2, TYPE_F32, dst0[1], val1);
+ mkOp1(OP_RCP, TYPE_F32, dst0[1], dst0[1]);
+ mkOp2(OP_MUL, TYPE_F32, dst0[1], dst0[1], src0);
+ }
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_DP2:
+ val0 = buildDot(2);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DP3:
+ val0 = buildDot(3);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DP4:
+ val0 = buildDot(4);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DPH:
+ val0 = buildDot(3);
+ src1 = fetchSrc(1, 3);
+ mkOp2(OP_ADD, TYPE_F32, val0, val0, src1);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DST:
+ if (dst0[0])
+ loadImm(dst0[0], 1.0f);
+ if (dst0[1]) {
+ src0 = fetchSrc(0, 1);
+ src1 = fetchSrc(1, 1);
+ mkOp2(OP_MUL, TYPE_F32, dst0[1], src0, src1);
+ }
+ if (dst0[2])
+ mkMov(dst0[2], fetchSrc(0, 2));
+ if (dst0[3])
+ mkMov(dst0[3], fetchSrc(1, 3));
+ break;
+ case TGSI_OPCODE_LRP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ mkOp3(OP_MAD, TYPE_F32, dst0[c],
+ mkOp2v(OP_SUB, TYPE_F32, getSSA(), src1, src2), src0, src2);
+ }
+ break;
+ case TGSI_OPCODE_LIT:
+ handleLIT(dst0);
+ break;
+ case TGSI_OPCODE_XPD:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ if (c < 3) {
+ val0 = getSSA();
+ src0 = fetchSrc(1, (c + 1) % 3);
+ src1 = fetchSrc(0, (c + 2) % 3);
+ mkOp2(OP_MUL, TYPE_F32, val0, src0, src1);
+ mkOp1(OP_NEG, TYPE_F32, val0, val0);
+
+ src0 = fetchSrc(0, (c + 1) % 3);
+ src1 = fetchSrc(1, (c + 2) % 3);
+ mkOp3(OP_MAD, TYPE_F32, dst0[c], src0, src1, val0);
+ } else {
+ loadImm(dst0[c], 1.0f);
+ }
+ }
+ break;
+ case TGSI_OPCODE_ISSG:
+ case TGSI_OPCODE_SSG:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ val0 = getScratch();
+ val1 = getScratch();
+ mkCmp(OP_SET, CC_GT, srcTy, val0, src0, zero);
+ mkCmp(OP_SET, CC_LT, srcTy, val1, src0, zero);
+ if (srcTy == TYPE_F32)
+ mkOp2(OP_SUB, TYPE_F32, dst0[c], val0, val1);
+ else
+ mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0);
+ }
+ break;
+ case TGSI_OPCODE_UCMP:
+ case TGSI_OPCODE_CMP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ if (src1 == src2)
+ mkMov(dst0[c], src1);
+ else
+ mkCmp(OP_SLCT, (srcTy == TYPE_F32) ? CC_LT : CC_NE,
+ srcTy, dst0[c], src1, src2, src0);
+ }
+ break;
+ case TGSI_OPCODE_FRC:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ val0 = getScratch();
+ mkOp1(OP_FLOOR, TYPE_F32, val0, src0);
+ mkOp2(OP_SUB, TYPE_F32, dst0[c], src0, val0);
+ }
+ break;
+ case TGSI_OPCODE_ROUND:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F32, fetchSrc(0, c))
+ ->rnd = ROUND_NI;
+ break;
+ case TGSI_OPCODE_CLAMP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ val0 = getScratch();
+ mkOp2(OP_MIN, TYPE_F32, val0, src0, src1);
+ mkOp2(OP_MAX, TYPE_F32, dst0[c], val0, src2);
+ }
+ break;
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SFL:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_STR:
+ case TGSI_OPCODE_FSEQ:
+ case TGSI_OPCODE_FSGE:
+ case TGSI_OPCODE_FSLT:
+ case TGSI_OPCODE_FSNE:
+ case TGSI_OPCODE_ISGE:
+ case TGSI_OPCODE_ISLT:
+ case TGSI_OPCODE_USEQ:
+ case TGSI_OPCODE_USGE:
+ case TGSI_OPCODE_USLT:
+ case TGSI_OPCODE_USNE:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], src0, src1);
+ }
+ break;
+ case TGSI_OPCODE_KILL_IF:
+ val0 = new_LValue(func, FILE_PREDICATE);
+ for (c = 0; c < 4; ++c) {
+ mkCmp(OP_SET, CC_LT, TYPE_F32, val0, fetchSrc(0, c), zero);
+ mkOp(OP_DISCARD, TYPE_NONE, NULL)->setPredicate(CC_P, val0);
+ }
+ break;
+ case TGSI_OPCODE_KILL:
+ mkOp(OP_DISCARD, TYPE_NONE, NULL);
+ break;
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXL:
+ case TGSI_OPCODE_TXP:
+ // R S L C Dx Dy
+ handleTEX(dst0, 1, 1, 0x03, 0x0f, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_TXD:
+ handleTEX(dst0, 3, 3, 0x03, 0x0f, 0x10, 0x20);
+ break;
+ case TGSI_OPCODE_TEX2:
+ handleTEX(dst0, 2, 2, 0x03, 0x10, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_TXB2:
+ case TGSI_OPCODE_TXL2:
+ handleTEX(dst0, 2, 2, 0x10, 0x11, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_SAMPLE:
+ case TGSI_OPCODE_SAMPLE_B:
+ case TGSI_OPCODE_SAMPLE_D:
+ case TGSI_OPCODE_SAMPLE_L:
+ case TGSI_OPCODE_SAMPLE_C:
+ case TGSI_OPCODE_SAMPLE_C_LZ:
+ handleTEX(dst0, 1, 2, 0x30, 0x30, 0x30, 0x40);
+ break;
+ case TGSI_OPCODE_TXF:
+ handleTXF(dst0, 1, 0x03);
+ break;
+ case TGSI_OPCODE_SAMPLE_I:
+ handleTXF(dst0, 1, 0x03);
+ break;
+ case TGSI_OPCODE_SAMPLE_I_MS:
+ handleTXF(dst0, 1, 0x20);
+ break;
+ case TGSI_OPCODE_TXQ:
+ case TGSI_OPCODE_SVIEWINFO:
+ handleTXQ(dst0, TXQ_DIMS);
+ break;
+ case TGSI_OPCODE_F2I:
+ case TGSI_OPCODE_F2U:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c))->rnd = ROUND_Z;
+ break;
+ case TGSI_OPCODE_I2F:
+ case TGSI_OPCODE_U2F:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
+ break;
+ case TGSI_OPCODE_EMIT:
+ case TGSI_OPCODE_ENDPRIM:
+ // get vertex stream if specified (must be immediate)
+ src0 = tgsi.srcCount() ?
+ mkImm(tgsi.getSrc(0).getValueU32(0, info)) : zero;
+ mkOp1(op, TYPE_U32, NULL, src0)->fixed = 1;
+ break;
+ case TGSI_OPCODE_IF:
+ case TGSI_OPCODE_UIF:
+ {
+ BasicBlock *ifBB = new BasicBlock(func);
+
+ bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
+ condBBs.push(bb);
+ joinBBs.push(bb);
+
+ mkFlow(OP_BRA, NULL, CC_NOT_P, fetchSrc(0, 0))->setType(srcTy);
+
+ setPosition(ifBB, true);
+ }
+ break;
+ case TGSI_OPCODE_ELSE:
+ {
+ BasicBlock *elseBB = new BasicBlock(func);
+ BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p);
+
+ forkBB->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
+ condBBs.push(bb);
+
+ forkBB->getExit()->asFlow()->target.bb = elseBB;
+ if (!bb->isTerminated())
+ mkFlow(OP_BRA, NULL, CC_ALWAYS, NULL);
+
+ setPosition(elseBB, true);
+ }
+ break;
+ case TGSI_OPCODE_ENDIF:
+ {
+ BasicBlock *convBB = new BasicBlock(func);
+ BasicBlock *prevBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p);
+ BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(joinBBs.pop().u.p);
+
+ if (!bb->isTerminated()) {
+ // we only want join if none of the clauses ended with CONT/BREAK/RET
+ if (prevBB->getExit()->op == OP_BRA && joinBBs.getSize() < 6)
+ insertConvergenceOps(convBB, forkBB);
+ mkFlow(OP_BRA, convBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD);
+ }
+
+ if (prevBB->getExit()->op == OP_BRA) {
+ prevBB->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD);
+ prevBB->getExit()->asFlow()->target.bb = convBB;
+ }
+ setPosition(convBB, true);
+ }
+ break;
+ case TGSI_OPCODE_BGNLOOP:
+ {
+ BasicBlock *lbgnBB = new BasicBlock(func);
+ BasicBlock *lbrkBB = new BasicBlock(func);
+
+ loopBBs.push(lbgnBB);
+ breakBBs.push(lbrkBB);
+ if (loopBBs.getSize() > func->loopNestingBound)
+ func->loopNestingBound++;
+
+ mkFlow(OP_PREBREAK, lbrkBB, CC_ALWAYS, NULL);
+
+ bb->cfg.attach(&lbgnBB->cfg, Graph::Edge::TREE);
+ setPosition(lbgnBB, true);
+ mkFlow(OP_PRECONT, lbgnBB, CC_ALWAYS, NULL);
+ }
+ break;
+ case TGSI_OPCODE_ENDLOOP:
+ {
+ BasicBlock *loopBB = reinterpret_cast<BasicBlock *>(loopBBs.pop().u.p);
+
+ if (!bb->isTerminated()) {
+ mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ }
+ setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+ }
+ break;
+ case TGSI_OPCODE_BRK:
+ {
+ if (bb->isTerminated())
+ break;
+ BasicBlock *brkBB = reinterpret_cast<BasicBlock *>(breakBBs.peek().u.p);
+ mkFlow(OP_BREAK, brkBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&brkBB->cfg, Graph::Edge::CROSS);
+ }
+ break;
+ case TGSI_OPCODE_CONT:
+ {
+ if (bb->isTerminated())
+ break;
+ BasicBlock *contBB = reinterpret_cast<BasicBlock *>(loopBBs.peek().u.p);
+ mkFlow(OP_CONT, contBB, CC_ALWAYS, NULL);
+ contBB->explicitCont = true;
+ bb->cfg.attach(&contBB->cfg, Graph::Edge::BACK);
+ }
+ break;
+ case TGSI_OPCODE_BGNSUB:
+ {
+ Subroutine *s = getSubroutine(ip);
+ BasicBlock *entry = new BasicBlock(s->f);
+ BasicBlock *leave = new BasicBlock(s->f);
+
+ // multiple entrypoints possible, keep the graph connected
+ if (prog->getType() == Program::TYPE_COMPUTE)
+ prog->main->call.attach(&s->f->call, Graph::Edge::TREE);
+
+ sub.cur = s;
+ s->f->setEntry(entry);
+ s->f->setExit(leave);
+ setPosition(entry, true);
+ return true;
+ }
+ case TGSI_OPCODE_ENDSUB:
+ {
+ sub.cur = getSubroutine(prog->main);
+ setPosition(BasicBlock::get(sub.cur->f->cfg.getRoot()), true);
+ return true;
+ }
+ case TGSI_OPCODE_CAL:
+ {
+ Subroutine *s = getSubroutine(tgsi.getLabel());
+ mkFlow(OP_CALL, s->f, CC_ALWAYS, NULL);
+ func->call.attach(&s->f->call, Graph::Edge::TREE);
+ return true;
+ }
+ case TGSI_OPCODE_RET:
+ {
+ if (bb->isTerminated())
+ return true;
+ BasicBlock *leave = BasicBlock::get(func->cfgExit);
+
+ if (!isEndOfSubroutine(ip + 1)) {
+ // insert a PRERET at the entry if this is an early return
+ // (only needed for sharing code in the epilogue)
+ BasicBlock *pos = getBB();
+ setPosition(BasicBlock::get(func->cfg.getRoot()), false);
+ mkFlow(OP_PRERET, leave, CC_ALWAYS, NULL)->fixed = 1;
+ setPosition(pos, true);
+ }
+ mkFlow(OP_RET, NULL, CC_ALWAYS, NULL)->fixed = 1;
+ bb->cfg.attach(&leave->cfg, Graph::Edge::CROSS);
+ }
+ break;
+ case TGSI_OPCODE_END:
+ {
+ // attach and generate epilogue code
+ BasicBlock *epilogue = BasicBlock::get(func->cfgExit);
+ bb->cfg.attach(&epilogue->cfg, Graph::Edge::TREE);
+ setPosition(epilogue, true);
+ if (prog->getType() == Program::TYPE_FRAGMENT)
+ exportOutputs();
+ if (info->io.genUserClip > 0)
+ handleUserClipPlanes();
+ mkOp(OP_EXIT, TYPE_NONE, NULL)->terminator = 1;
+ }
+ break;
+ case TGSI_OPCODE_SWITCH:
+ case TGSI_OPCODE_CASE:
+ ERROR("switch/case opcode encountered, should have been lowered\n");
+ abort();
+ break;
+ case TGSI_OPCODE_LOAD:
+ handleLOAD(dst0);
+ break;
+ case TGSI_OPCODE_STORE:
+ handleSTORE();
+ break;
+ case TGSI_OPCODE_BARRIER:
+ geni = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0));
+ geni->fixed = 1;
+ geni->subOp = NV50_IR_SUBOP_BAR_SYNC;
+ break;
+ case TGSI_OPCODE_MFENCE:
+ case TGSI_OPCODE_LFENCE:
+ case TGSI_OPCODE_SFENCE:
+ geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
+ geni->fixed = 1;
+ geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
+ break;
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMXOR:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMIMAX:
+ handleATOM(dst0, dstTy, tgsi::opcodeToSubOp(tgsi.getOpcode()));
+ break;
+ default:
+ ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());
+ assert(0);
+ break;
+ }
+
+ if (tgsi.dstCount()) {
+ for (c = 0; c < 4; ++c) {
+ if (!dst0[c])
+ continue;
+ if (dst0[c] != rDst0[c])
+ mkMov(rDst0[c], dst0[c]);
+ storeDst(0, c, rDst0[c]);
+ }
+ }
+ vtxBaseValid = 0;
+
+ return true;
+}
+
+void
+Converter::handleUserClipPlanes()
+{
+   // Evaluate dot(clipVtx, userClipPlane[i]) for each enabled user clip
+   // plane and export the results to the trailing clip-distance outputs.
+   Value *res[8];
+   int n, i, c;
+
+   // Accumulate the dot product one vertex component at a time:
+   // res[i] = x * ucp.x, then += y/z/w terms via MAD.
+   for (c = 0; c < 4; ++c) {
+      for (i = 0; i < info->io.genUserClip; ++i) {
+         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
+                                TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
+         Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
+         if (c == 0)
+            res[i] = mkOp2v(OP_MUL, TYPE_F32, getScratch(), clipVtx[c], ucp);
+         else
+            mkOp3(OP_MAD, TYPE_F32, res[i], clipVtx[c], ucp, res[i]);
+      }
+   }
+
+   // The clip distances occupy the last (genUserClip + 3) / 4 output regs.
+   const int first = info->numOutputs - (info->io.genUserClip + 3) / 4;
+
+   for (i = 0; i < info->io.genUserClip; ++i) {
+      n = i / 4 + first;
+      c = i % 4;
+      Symbol *sym =
+         mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4);
+      mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]);
+   }
+}
+
+void
+Converter::exportOutputs()
+{
+   // Emit an EXPORT for every shader output component that was actually
+   // written (only components present in the output DataArray are stored).
+   for (unsigned int idx = 0; idx < info->numOutputs; ++idx) {
+      for (unsigned int comp = 0; comp < 4; ++comp) {
+         if (!oData.exists(sub.cur->values, idx, comp))
+            continue;
+         Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
+                                info->out[idx].slot[comp] * 4);
+         Value *val = oData.load(sub.cur->values, idx, comp, NULL);
+         if (!val)
+            continue;
+         mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+      }
+   }
+}
+
+Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir),
+      code(code),
+      tgsi(NULL),
+      tData(this), aData(this), pData(this), oData(this)
+{
+   info = code->info;
+
+   // Temporaries go to l[] memory when the scan pass flagged them
+   // (mainTempsInLMem); otherwise they live in GPRs.
+   const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR;
+
+   const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY);
+   const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE);
+   const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS);
+   const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT);
+
+   // Map each TGSI register file onto a DataArray of 4-component vectors.
+   tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0);
+   pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0);
+   aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_ADDRESS, 0);
+   oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0);
+
+   // Frequently used zero immediate, created once.
+   zero = mkImm((uint32_t)0);
+
+   vtxBaseValid = 0;
+}
+
+Converter::~Converter()
+{
+   // nothing to release explicitly; members clean up in their own destructors
+}
+
+inline const Converter::Location *
+Converter::BindArgumentsPass::getValueLocation(Subroutine *s, Value *v)
+{
+   // Look up the TGSI register location recorded for @v in subroutine @s;
+   // NULL if the value has no associated location.
+   ValueMap::l_iterator it = s->values.l.find(v);
+   if (it == s->values.l.end())
+      return NULL;
+   return &it->second;
+}
+
+// Rewire one CALL instruction: bind each formal input/output of the callee
+// (@proto, i.e. &Function::ins or &Function::outs) to the caller's value
+// backing the corresponding TGSI register, via @setArg (setSrc or setDef).
+template<typename T> inline void
+Converter::BindArgumentsPass::updateCallArgs(
+   Instruction *i, void (Instruction::*setArg)(int, Value *),
+   T (Function::*proto))
+{
+   Function *g = i->asFlow()->target.fn;
+   Subroutine *subg = conv.getSubroutine(g);
+
+   for (unsigned a = 0; a < (g->*proto).size(); ++a) {
+      Value *v = (g->*proto)[a].get();
+      // every prototype value must have a recorded TGSI location
+      const Converter::Location &l = *getValueLocation(subg, v);
+      Converter::DataArray *array = conv.getArrayForFile(l.array, l.arrayIdx);
+
+      (i->*setArg)(a, array->acquire(sub->values, l.i, l.c));
+   }
+}
+
+// Extend the function's input or output prototype (@proto) with every value
+// in @set that is backed by a TGSI register and is not subroutine-local.
+// @updateSet recomputes the live/def sets before they are inspected.
+template<typename T> inline void
+Converter::BindArgumentsPass::updatePrototype(
+   BitSet *set, void (Function::*updateSet)(), T (Function::*proto))
+{
+   (func->*updateSet)();
+
+   for (unsigned i = 0; i < set->getSize(); ++i) {
+      Value *v = func->getLValue(i);
+      const Converter::Location *l = getValueLocation(sub, v);
+
+      // only include values with a matching TGSI register
+      if (set->test(i) && l && !conv.code->locals.count(*l))
+         (func->*proto).push_back(v);
+   }
+}
+
+bool
+Converter::BindArgumentsPass::visit(Function *f)
+{
+   sub = conv.getSubroutine(f);
+
+   // Bind the sources/defs of every non-builtin CALL in this function to
+   // the caller-side values of the callee's TGSI registers.
+   for (ArrayList::Iterator bi = f->allBBlocks.iterator();
+        !bi.end(); bi.next()) {
+      for (Instruction *i = BasicBlock::get(bi)->getFirst();
+           i; i = i->next) {
+         if (i->op == OP_CALL && !i->asFlow()->builtin) {
+            updateCallArgs(i, &Instruction::setSrc, &Function::ins);
+            updateCallArgs(i, &Instruction::setDef, &Function::outs);
+         }
+      }
+   }
+
+   // main() of a non-compute program is never called, so it needs no
+   // input/output prototype.
+   if (func == prog->main && prog->getType() != Program::TYPE_COMPUTE)
+      return true;
+   // Values live at the entry become inputs, values defined at the exit
+   // become outputs.
+   updatePrototype(&BasicBlock::get(f->cfg.getRoot())->liveSet,
+                   &Function::buildLiveSets, &Function::ins);
+   updatePrototype(&BasicBlock::get(f->cfgExit)->defSet,
+                   &Function::buildDefSets, &Function::outs);
+
+   return true;
+}
+
+bool
+Converter::run()
+{
+   // Build the program skeleton (main with dedicated entry and exit blocks),
+   // then translate the TGSI instruction stream into it.
+   BasicBlock *entry = new BasicBlock(prog->main);
+   BasicBlock *leave = new BasicBlock(prog->main);
+
+   prog->main->setEntry(entry);
+   prog->main->setExit(leave);
+
+   setPosition(entry, true);
+   sub.cur = getSubroutine(prog->main);
+
+   // Reserve scratch values for the vertex position used by user clip
+   // plane evaluation (consumed in handleUserClipPlanes at END).
+   if (info->io.genUserClip > 0) {
+      for (int c = 0; c < 4; ++c)
+         clipVtx[c] = getScratch();
+   }
+
+   // Fragment shaders: precompute 1/position.w once at the top.
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      Symbol *sv = mkSysVal(SV_POSITION, 3);
+      fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv);
+      mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]);
+   }
+
+   // Translate every TGSI instruction in stream order.
+   for (ip = 0; ip < code->scan.num_instructions; ++ip) {
+      if (!handleInstruction(&code->insns[ip]))
+         return false;
+   }
+
+   // Resolve subroutine call arguments/results to concrete values.
+   if (!BindArgumentsPass(*this).run(prog))
+      return false;
+
+   return true;
+}
+
+} // unnamed namespace
+
+namespace nv50_ir {
+
+bool
+Program::makeFromTGSI(struct nv50_ir_prog_info *info)
+{
+   // First pass: scan the TGSI tokens to gather usage/resource info.
+   tgsi::Source src(info);
+   if (!src.scanSource())
+      return false;
+   tlsSize = info->bin.tlsSpace;
+
+   // Second pass: translate the instructions into nv50 IR.
+   Converter builder(this, &src);
+   return builder.run();
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
new file mode 100644
index 00000000000..3f8d00a1c99
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_graph.h"
+#include <limits>
+#include <list>
+#include <stack>
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+Graph::Graph() : root(NULL), size(0), sequence(0)
+{
+   // an empty graph: no root, no nodes, traversal sequence starts at 0
+}
+
+Graph::~Graph()
+{
+   // Cut every node (deleting its edges); the nodes themselves are owned
+   // elsewhere and are not freed here.
+   for (IteratorRef it = safeIteratorDFS(); !it->end(); it->next())
+      reinterpret_cast<Node *>(it->get())->cut();
+}
+
+void Graph::insert(Node *node)
+{
+   // the first node ever inserted becomes the root
+   if (root == NULL)
+      root = node;
+
+   node->graph = this;
+   ++size;
+}
+
+void Graph::Edge::unlink()
+{
+   // An edge sits on two circular doubly-linked lists: index 0 is the
+   // origin's outgoing-edge list, index 1 the target's incident-edge list.
+   // Splice ourselves out of both and fix the list heads.
+   if (origin) {
+      prev[0]->next[0] = next[0];
+      next[0]->prev[0] = prev[0];
+      // if we were the head, advance it (NULL when we were the only edge)
+      if (origin->out == this)
+         origin->out = (next[0] == this) ? NULL : next[0];
+
+      --origin->outCount;
+   }
+   if (target) {
+      prev[1]->next[1] = next[1];
+      next[1]->prev[1] = prev[1];
+      if (target->in == this)
+         target->in = (next[1] == this) ? NULL : next[1];
+
+      --target->inCount;
+   }
+}
+
+// Human-readable name of the edge classification, for debug output.
+const char *Graph::Edge::typeStr() const
+{
+   switch (type) {
+   case TREE:    return "tree";
+   case FORWARD: return "forward";
+   case BACK:    return "back";
+   case CROSS:   return "cross";
+   case DUMMY:   return "dummy";
+   default:      return "unk"; // UNKNOWN or out-of-range
+   }
+}
+
+Graph::Node::Node(void *priv)
+   : data(priv), in(NULL), out(NULL), graph(NULL),
+     visited(0), inCount(0), outCount(0)
+{
+   // a fresh node belongs to no graph until attached or inserted
+}
+
+// Create an edge (this -> node) of the given kind and link it into both
+// endpoints' circular edge lists.
+void Graph::Node::attach(Node *node, Edge::Type kind)
+{
+   Edge *edge = new Edge(this, node, kind);
+
+   // insert at the head of our circular outgoing-edge list
+   if (this->out) {
+      edge->next[0] = this->out;
+      edge->prev[0] = this->out->prev[0];
+      edge->prev[0]->next[0] = edge;
+      this->out->prev[0] = edge;
+   }
+   this->out = edge;
+
+   // and at the head of the target's circular incident-edge list
+   if (node->in) {
+      edge->next[1] = node->in;
+      edge->prev[1] = node->in->prev[1];
+      edge->prev[1]->next[1] = edge;
+      node->in->prev[1] = edge;
+   }
+   node->in = edge;
+
+   ++this->outCount;
+   ++node->inCount;
+
+   // adopt whichever endpoint is not yet part of a graph
+   assert(graph || node->graph);
+   if (!node->graph)
+      graph->insert(node);
+   if (!graph)
+      node->graph->insert(this);
+
+   // caller did not know the edge kind: reclassify the whole graph
+   if (kind == Edge::UNKNOWN)
+      graph->classifyEdges();
+}
+
+bool Graph::Node::detach(Graph::Node *node)
+{
+   // Find the outgoing edge leading to @node; deleting it unlinks it from
+   // both adjacency lists (Edge destructor calls unlink()).
+   for (EdgeIterator ei = this->outgoing(); !ei.end(); ei.next()) {
+      if (ei.getNode() == node) {
+         delete ei.getEdge();
+         return true;
+      }
+   }
+   ERROR("no such node attached\n");
+   return false;
+}
+
+// Cut a node from the graph, deleting all attached edges.
+void Graph::Node::cut()
+{
+   // each delete unlinks the edge, advancing out/in until both lists drain
+   while (out)
+      delete out;
+   while (in)
+      delete in;
+
+   if (graph) {
+      if (graph->root == this)
+         graph->root = NULL;
+      // NOTE(review): graph->size is not decremented here — confirm callers
+      // do not rely on size after cutting nodes
+      graph = NULL;
+   }
+}
+
+Graph::Edge::Edge(Node *org, Node *tgt, Type kind)
+{
+   // an unattached edge is a singleton circular list in both directions
+   origin = org;
+   target = tgt;
+   type = kind;
+   next[0] = prev[0] = this;
+   next[1] = prev[1] = this;
+}
+
+// Iterative DFS from @node, not expanding past @term and ignoring BACK and
+// DUMMY edges; returns true iff `this` is encountered.
+bool
+Graph::Node::reachableBy(const Node *node, const Node *term) const
+{
+   std::stack<const Node *> stack;
+   const Node *pos = NULL;
+   const int seq = graph->nextSequence();
+
+   stack.push(node);
+
+   while (!stack.empty()) {
+      pos = stack.top();
+      stack.pop();
+
+      if (pos == this)
+         return true;
+      if (pos == term)
+         continue;
+
+      for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
+         if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
+            continue;
+         // visit() returns true only on first encounter with this sequence
+         if (ei.getNode()->visit(seq))
+            stack.push(ei.getNode());
+      }
+   }
+   // hits return from inside the loop, so this is the not-found case
+   return pos == this;
+}
+
+// Depth-first traversal of a graph. The node sequence is computed eagerly
+// in the constructor, so later changes to the graph's edges do not affect
+// an existing iterator (this is what makes safeIteratorDFS safe).
+class DFSIterator : public Iterator
+{
+public:
+   DFSIterator(Graph *graph, const bool preorder)
+   {
+      unsigned int seq = graph->nextSequence();
+
+      nodes = new Graph::Node * [graph->getSize() + 1];
+      count = 0;
+      pos = 0;
+      nodes[graph->getSize()] = 0; // sentinel
+
+      if (graph->getRoot()) {
+         graph->getRoot()->visit(seq);
+         search(graph->getRoot(), preorder, seq);
+      }
+   }
+
+   ~DFSIterator()
+   {
+      // delete[] on NULL is a no-op; the former "if (nodes)" guard was
+      // redundant (nodes is always allocated in the constructor anyway)
+      delete[] nodes;
+   }
+
+   // Recursive DFS; emits @node before (preorder) or after (postorder) its
+   // unvisited successors. NOTE(review): recursion depth equals the DFS
+   // depth of the graph — presumably fine for shader CFGs.
+   void search(Graph::Node *node, const bool preorder, const int sequence)
+   {
+      if (preorder)
+         nodes[count++] = node;
+
+      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
+         if (ei.getNode()->visit(sequence))
+            search(ei.getNode(), preorder, sequence);
+
+      if (!preorder)
+         nodes[count++] = node;
+   }
+
+   virtual bool end() const { return pos >= count; }
+   virtual void next() { if (pos < count) ++pos; }
+   virtual void *get() const { return nodes[pos]; }
+   virtual void reset() { pos = 0; }
+
+protected:
+   Graph::Node **nodes;
+   int count;
+   int pos;
+};
+
+IteratorRef Graph::iteratorDFS(bool preorder)
+{
+   // hand ownership of the freshly built iterator to the ref wrapper
+   DFSIterator *dfs = new DFSIterator(this, preorder);
+   return IteratorRef(dfs);
+}
+
+IteratorRef Graph::safeIteratorDFS(bool preorder)
+{
+   // DFSIterator snapshots the node order in its constructor, so the plain
+   // DFS iterator is already safe against edge modifications.
+   return this->iteratorDFS(preorder);
+}
+
+// CFG traversal that emits a node only after all of its forward (non-back)
+// incident edges have been seen; targets reached via CROSS edges are
+// buffered and flushed once the current chain of blocks is exhausted.
+class CFGIterator : public Iterator
+{
+public:
+   CFGIterator(Graph *graph)
+   {
+      nodes = new Graph::Node * [graph->getSize() + 1];
+      count = 0;
+      pos = 0;
+      nodes[graph->getSize()] = 0; // sentinel
+
+      // TODO: argh, use graph->sequence instead of tag and just raise it by > 1
+      for (IteratorRef it = graph->iteratorDFS(); !it->end(); it->next())
+         reinterpret_cast<Graph::Node *>(it->get())->tag = 0;
+
+      if (graph->getRoot())
+         search(graph->getRoot(), graph->nextSequence());
+   }
+
+   ~CFGIterator()
+   {
+      // delete[] on NULL is a no-op; the former "if (nodes)" guard was
+      // redundant (nodes is always allocated in the constructor)
+      delete[] nodes;
+   }
+
+   virtual void *get() const { return nodes[pos]; }
+   virtual bool end() const { return pos >= count; }
+   virtual void next() { if (pos < count) ++pos; }
+   virtual void reset() { pos = 0; }
+
+private:
+   void search(Graph::Node *node, const int sequence)
+   {
+      Stack bb, cross;
+
+      bb.push(node);
+
+      while (bb.getSize()) {
+         node = reinterpret_cast<Graph::Node *>(bb.pop().u.p);
+         assert(node);
+         if (!node->visit(sequence))
+            continue;
+         node->tag = 0;
+
+         for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) {
+            switch (ei.getType()) {
+            case Graph::Edge::TREE:
+            case Graph::Edge::FORWARD:
+            case Graph::Edge::DUMMY:
+               // push a successor once all its forward incident edges were seen
+               if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd())
+                  bb.push(ei.getNode());
+               break;
+            case Graph::Edge::BACK:
+               continue;
+            case Graph::Edge::CROSS:
+               // buffer the node the first time it is reached via CROSS
+               if (++(ei.getNode()->tag) == 1)
+                  cross.push(ei.getNode());
+               break;
+            default:
+               assert(!"unknown edge kind in CFG");
+               break;
+            }
+         }
+         nodes[count++] = node;
+
+         // current chain exhausted: continue with buffered cross targets
+         if (bb.getSize() == 0)
+            cross.moveTo(bb);
+      }
+   }
+
+private:
+   Graph::Node **nodes;
+   int count;
+   int pos;
+};
+
+IteratorRef Graph::iteratorCFG()
+{
+   // hand ownership of the freshly built iterator to the ref wrapper
+   CFGIterator *cfg = new CFGIterator(this);
+   return IteratorRef(cfg);
+}
+
+IteratorRef Graph::safeIteratorCFG()
+{
+   // CFGIterator snapshots the node order in its constructor, so the plain
+   // CFG iterator is already safe against edge modifications.
+   return this->iteratorCFG();
+}
+
+// Reclassify every edge of the graph as TREE/FORWARD/BACK/CROSS.
+void Graph::classifyEdges()
+{
+   int seq;
+
+   // reset the visit sequence and tag of all reachable nodes so that
+   // classifyDFS sees them as unvisited (sequence 0)
+   for (IteratorRef it = iteratorDFS(true); !it->end(); it->next()) {
+      Node *node = reinterpret_cast<Node *>(it->get());
+      node->visit(0);
+      node->tag = 0;
+   }
+
+   classifyDFS(root, (seq = 0));
+
+   // continue the global sequence after the numbers handed out by the DFS
+   sequence = seq;
+}
+
+// Recursive DFS used by classifyEdges. Assigns each node a sequence number
+// on first visit; tag == 1 marks nodes on the current DFS path, so an edge
+// to a tagged node is a BACK edge, to an earlier untagged node a CROSS edge.
+// Both outgoing and incident edges are followed so the whole connected
+// component is classified even against edge direction.
+void Graph::classifyDFS(Node *curr, int& seq)
+{
+   Graph::Edge *edge;
+   Graph::Node *node;
+
+   curr->visit(++seq);
+   curr->tag = 1;
+
+   for (edge = curr->out; edge; edge = edge->next[0]) {
+      node = edge->target;
+      if (edge->type == Edge::DUMMY)
+         continue;
+
+      if (node->getSequence() == 0) {
+         edge->type = Edge::TREE;
+         classifyDFS(node, seq);
+      } else
+      if (node->getSequence() > curr->getSequence()) {
+         edge->type = Edge::FORWARD;
+      } else {
+         edge->type = node->tag ? Edge::BACK : Edge::CROSS;
+      }
+   }
+
+   for (edge = curr->in; edge; edge = edge->next[1]) {
+      node = edge->origin;
+      if (edge->type == Edge::DUMMY)
+         continue;
+
+      if (node->getSequence() == 0) {
+         edge->type = Edge::TREE;
+         classifyDFS(node, seq);
+      } else
+      if (node->getSequence() > curr->getSequence()) {
+         edge->type = Edge::FORWARD;
+      } else {
+         edge->type = node->tag ? Edge::BACK : Edge::CROSS;
+      }
+   }
+
+   // leaving the DFS path
+   curr->tag = 0;
+}
+
+// Dijkstra-style search from @a to @b over outgoing edges; the cost of
+// leaving a node is weight[node->tag] (@weight is indexed by Node::tag).
+// Returns the lightest accumulated weight, or -1 if no path was found.
+int
+Graph::findLightestPathWeight(Node *a, Node *b, const std::vector<int> &weight)
+{
+   std::vector<int> path(weight.size(), std::numeric_limits<int>::max());
+   std::list<Node *> nodeList;
+   const int seq = nextSequence();
+
+   path[a->tag] = 0;
+   for (Node *c = a; c && c != b;) {
+      const int p = path[c->tag] + weight[c->tag];
+      for (EdgeIterator ei = c->outgoing(); !ei.end(); ei.next()) {
+         Node *t = ei.getNode();
+         if (t->getSequence() < seq) {
+            // first discovery: add to the frontier list
+            if (path[t->tag] == std::numeric_limits<int>::max())
+               nodeList.push_front(t);
+            if (p < path[t->tag])
+               path[t->tag] = p;
+         }
+      }
+      c->visit(seq);
+
+      // Pick the cheapest frontier node and drop the just-visited one.
+      // BUGFIX: the previous code did "n = nodeList.erase(n); --n;" inside a
+      // for loop, which decrements begin() (undefined behaviour) whenever the
+      // erased element was the first in the list.
+      Node *next = NULL;
+      std::list<Node *>::iterator n = nodeList.begin();
+      while (n != nodeList.end()) {
+         if (!next || path[(*n)->tag] < path[next->tag])
+            next = *n;
+         if (*n == c)
+            n = nodeList.erase(n); // erase returns the following iterator
+         else
+            ++n;
+      }
+      c = next;
+   }
+   if (path[b->tag] == std::numeric_limits<int>::max())
+      return -1;
+   return path[b->tag];
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
new file mode 100644
index 00000000000..b0981ff6943
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_GRAPH_H__
+#define __NV50_IR_GRAPH_H__
+
+#include "codegen/nv50_ir_util.h"
+#include <vector>
+
+namespace nv50_ir {
+
+#define ITER_NODE(x) reinterpret_cast<Graph::Node *>((x).get())
+#define ITER_EDGE(x) reinterpret_cast<Graph::Edge *>((x).get())
+
+// A connected graph. Nodes and edges are intrusive: each Edge lives on two
+// circular doubly-linked lists (the origin's outgoing list and the target's
+// incident list). The Graph does not own its nodes.
+class Graph
+{
+public:
+   class Node;
+
+   class Edge
+   {
+   public:
+      enum Type
+      {
+         UNKNOWN,
+         TREE,
+         FORWARD,
+         BACK,
+         CROSS, // e.g. loop break
+         DUMMY
+      };
+
+      // FIX: parameter names now match the definition — the first argument
+      // is the origin, the second the target (the old "dst, src" naming was
+      // reversed and misleading).
+      Edge(Node *org, Node *tgt, Type kind);
+      ~Edge() { unlink(); }
+
+      inline Node *getOrigin() const { return origin; }
+      inline Node *getTarget() const { return target; }
+
+      inline Type getType() const { return type; }
+      const char *typeStr() const;
+
+   private:
+      Node *origin;
+      Node *target;
+
+      Type type;
+      Edge *next[2]; // next edge outgoing/incident from/to origin/target
+      Edge *prev[2];
+
+      void unlink();
+
+      friend class Graph;
+   };
+
+   // Iterates the circular edge list of one node; @d selects direction
+   // (0 = outgoing of origin, 1 = incident of target).
+   class EdgeIterator : public Iterator
+   {
+   public:
+      EdgeIterator() : e(0), t(0), d(0), rev(false) { }
+      EdgeIterator(Graph::Edge *first, int dir, bool reverse)
+         : d(dir), rev(reverse)
+      {
+         t = e = ((rev && first) ? first->prev[d] : first);
+      }
+
+      virtual void next()
+      {
+         Graph::Edge *n = (rev ? e->prev[d] : e->next[d]);
+         e = (n == t ? NULL : n); // stop when wrapped around to the start
+      }
+      virtual bool end() const { return !e; }
+      virtual void *get() const { return e; }
+
+      // node at the far end of the current edge
+      inline Node *getNode() const { assert(e); return d ?
+                                     e->origin : e->target; }
+      inline Edge *getEdge() const { return e; }
+      inline Edge::Type getType() { return e ? e->getType() : Edge::UNKNOWN; }
+
+   private:
+      Graph::Edge *e;
+      Graph::Edge *t;
+      int d;
+      bool rev;
+   };
+
+   class Node
+   {
+   public:
+      Node(void *);
+      ~Node() { cut(); }
+
+      void attach(Node *, Edge::Type);
+      bool detach(Node *);
+      void cut();
+
+      inline EdgeIterator outgoing(bool reverse = false) const;
+      inline EdgeIterator incident(bool reverse = false) const;
+
+      inline Node *parent() const; // returns NULL if count(incident edges) != 1
+
+      bool reachableBy(const Node *node, const Node *term) const;
+
+      inline bool visit(int);
+      inline int getSequence() const;
+
+      inline int incidentCountFwd() const; // count of incident non-back edges
+      inline int incidentCount() const { return inCount; }
+      inline int outgoingCount() const { return outCount; }
+
+      Graph *getGraph() const { return graph; }
+
+      void *data;
+
+   private:
+      Edge *in;
+      Edge *out;
+      Graph *graph;
+
+      int visited; // sequence number of the last visit()
+
+      int16_t inCount;
+      int16_t outCount;
+   public:
+      int tag; // for temporary use
+
+      friend class Graph;
+   };
+
+public:
+   Graph();
+   ~Graph(); // does *not* free the nodes (make it an option ?)
+
+   inline Node *getRoot() const { return root; }
+
+   inline unsigned int getSize() const { return size; }
+
+   inline int nextSequence();
+
+   void insert(Node *node); // attach to or set as root
+
+   IteratorRef iteratorDFS(bool preorder = true);
+   IteratorRef iteratorCFG();
+
+   // safe iterators are unaffected by changes to the *edges* of the graph
+   IteratorRef safeIteratorDFS(bool preorder = true);
+   IteratorRef safeIteratorCFG();
+
+   void classifyEdges();
+
+   // @weights: indexed by Node::tag
+   int findLightestPathWeight(Node *, Node *, const std::vector<int>& weights);
+
+private:
+   void classifyDFS(Node *, int&);
+
+private:
+   Node *root;
+   unsigned int size;
+   int sequence;
+};
+
+int Graph::nextSequence()
+{
+   // every traversal gets a fresh sequence number for visited-marking
+   sequence += 1;
+   return sequence;
+}
+
+Graph::Node *Graph::Node::parent() const
+{
+   // only defined when there is exactly one incident edge (tree-like graph)
+   if (inCount != 1)
+      return NULL;
+   assert(in);
+   return in->origin;
+}
+
+bool Graph::Node::visit(int v)
+{
+   // returns true only on the first visit with sequence number @v
+   const bool firstVisit = (visited != v);
+   if (firstVisit)
+      visited = v;
+   return firstVisit;
+}
+
+int Graph::Node::getSequence() const
+{
+   // sequence number recorded by the most recent visit()
+   return visited;
+}
+
+Graph::EdgeIterator Graph::Node::outgoing(bool reverse) const
+{
+   // iterate the circular list of edges originating at this node
+   return EdgeIterator(out, 0, reverse);
+}
+
+Graph::EdgeIterator Graph::Node::incident(bool reverse) const
+{
+   // iterate the circular list of edges targeting this node
+   return EdgeIterator(in, 1, reverse);
+}
+
+int Graph::Node::incidentCountFwd() const
+{
+   // number of incident edges that are not BACK edges
+   int count = 0;
+   for (EdgeIterator ei = incident(); !ei.end(); ei.next())
+      count += (ei.getType() != Edge::BACK) ? 1 : 0;
+   return count;
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_GRAPH_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
new file mode 100644
index 00000000000..255324fec40
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_INLINES_H__
+#define __NV50_IR_INLINES_H__
+
+// Condition code for the comparison with swapped operands (a CC b -> b CC a):
+// exchanges bits 0 and 2 of the 3-bit condition mask, preserving bit 1
+// (equality) and any bits above the mask.
+static inline CondCode reverseCondCode(CondCode cc)
+{
+   static const uint8_t ccRev[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
+   return static_cast<CondCode>(ccRev[cc & 7] | (cc & ~7));
+}
+
+// Logical negation of a condition code: flips all three low mask bits,
+// leaving bits above the mask untouched.
+static inline CondCode inverseCondCode(CondCode cc)
+{
+   return static_cast<CondCode>(cc ^ 7);
+}
+
+// True for the c[]/s[]/l[] memory files (relies on enum ordering).
+static inline bool isMemoryFile(DataFile f)
+{
+   return (f >= FILE_MEMORY_CONST && f <= FILE_MEMORY_LOCAL);
+}
+
+// contrary to asTex(), this will never include SULD/SUST
+static inline bool isTextureOp(operation op)
+{
+   return (op >= OP_TEX && op <= OP_TEXPREP);
+}
+
+// True for surface load/store/atomic ops (SULDB..SULEA enum range).
+static inline bool isSurfaceOp(operation op)
+{
+   return (op >= OP_SULDB && op <= OP_SULEA);
+}
+
+// Size in bytes of one value of the given data type (0 for TYPE_NONE and
+// anything without a defined storage size).
+static inline unsigned int typeSizeof(DataType ty)
+{
+   switch (ty) {
+   case TYPE_U8:
+   case TYPE_S8:
+      return 1;
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return 2;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return 4;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:
+      return 8;
+   case TYPE_B96:
+      return 12;
+   case TYPE_B128:
+      return 16;
+   default:
+      return 0;
+   }
+}
+
+// log2 of typeSizeof(); note that B96 is rounded up and reports 4, the
+// same as B128. 8-bit and unknown types report 0.
+static inline unsigned int typeSizeofLog2(DataType ty)
+{
+   switch (ty) {
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return 1;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return 2;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:
+      return 3;
+   case TYPE_B96:
+   case TYPE_B128:
+      return 4;
+   case TYPE_U8:
+   case TYPE_S8:
+   default:
+      return 0;
+   }
+}
+
+// Inverse of typeSizeof(): pick the data type of the requested byte size.
+// @flt selects a float type where one exists, @sgn a signed integer type;
+// 12 and 16 bytes only have the untyped B96/B128 forms.
+static inline DataType typeOfSize(unsigned int size,
+                                  bool flt = false, bool sgn = false)
+{
+   switch (size) {
+   case 1:  return sgn ? TYPE_S8 : TYPE_U8;
+   case 2:  return flt ? TYPE_F16 : (sgn ? TYPE_S16 : TYPE_U16);
+   case 4:  return flt ? TYPE_F32 : (sgn ? TYPE_S32 : TYPE_U32);
+   case 8:  return flt ? TYPE_F64 : (sgn ? TYPE_S64 : TYPE_U64);
+   case 12: return TYPE_B96;
+   case 16: return TYPE_B128;
+   default:
+      return TYPE_NONE;
+   }
+}
+
+// True for F16/F32/F64 (relies on enum ordering).
+static inline bool isFloatType(DataType ty)
+{
+   return (ty >= TYPE_F16 && ty <= TYPE_F64);
+}
+
+// True only for the 8/16/32-bit signed integer types (not S64).
+static inline bool isSignedIntType(DataType ty)
+{
+   return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32);
+}
+
+// True for any type with a sign (floats included); unsigned integers,
+// untyped bitfields and TYPE_NONE are the only unsigned cases.
+static inline bool isSignedType(DataType ty)
+{
+   switch (ty) {
+   case TYPE_NONE:
+   case TYPE_U8:
+   case TYPE_U16:
+   case TYPE_U32:
+   case TYPE_B96:
+   case TYPE_B128:
+      return false;
+   default:
+      return true;
+   }
+}
+
+// Map an unsigned integer type to its signed counterpart; any other type
+// (already signed, float, untyped) is returned unchanged.
+static inline DataType intTypeToSigned(DataType ty)
+{
+   switch (ty) {
+   case TYPE_U32: return TYPE_S32;
+   case TYPE_U16: return TYPE_S16;
+   case TYPE_U8: return TYPE_S8;
+   default:
+      return ty;
+   }
+}
+
+// The source slot providing the indirect address for dimension @dim,
+// or NULL if this reference is not indirect in that dimension.
+const ValueRef *ValueRef::getIndirect(int dim) const
+{
+   return isIndirect(dim) ? &insn->src(indirect[dim]) : NULL;
+}
+
+// Register file of the referenced value (FILE_NULL for an unset ref).
+DataFile ValueRef::getFile() const
+{
+   return value ? value->reg.file : FILE_NULL;
+}
+
+// Size in bytes of the referenced value (0 for an unset ref).
+unsigned int ValueRef::getSize() const
+{
+   return value ? value->reg.size : 0;
+}
+
+// Representative of the value's coalescing equivalence class ('join').
+Value *ValueRef::rep() const
+{
+   assert(value);
+   return value->join;
+}
+
+Value *ValueDef::rep() const
+{
+   assert(value);
+   return value->join;
+}
+
+DataFile ValueDef::getFile() const
+{
+   return value ? value->reg.file : FILE_NULL;
+}
+
+unsigned int ValueDef::getSize() const
+{
+   return value ? value->reg.size : 0;
+}
+
+// Replace the defined value by SSA value @lval, remembering the pre-SSA
+// LValue in 'origin' so preSSA() can recover it.
+void ValueDef::setSSA(LValue *lval)
+{
+   origin = value->asLValue();
+   set(lval);
+}
+
+// The LValue this def wrote before SSA conversion (see setSSA()).
+const LValue *ValueDef::preSSA() const
+{
+   return origin;
+}
+
+// Instruction of the first recorded definition, or NULL if undefined.
+Instruction *Value::getInsn() const
+{
+   return defs.empty() ? NULL : defs.front()->getInsn();
+}
+
+// Like getInsn(), but for values whose def list may contain defs of other
+// coalesced values: return the instruction that defines *this* value.
+Instruction *Value::getUniqueInsn() const
+{
+   if (defs.empty())
+      return NULL;
+
+   // after regalloc, the definitions of coalesced values are linked
+   if (join != this) {
+      for (DefCIterator it = defs.begin(); it != defs.end(); ++it)
+         if ((*it)->get() == this)
+            return (*it)->getInsn();
+      // should be unreachable and trigger assertion at the end
+   }
+#ifdef DEBUG
+   // sanity check: a value without a fixed register must have exactly one def
+   if (reg.data.id < 0) {
+      int n = 0;
+      for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it)
+         if ((*it)->get() == this) // don't count joined values
+            ++n;
+      if (n > 1)
+         WARN("value %%%i not uniquely defined\n", id); // return NULL ?
+   }
+#endif
+   assert(defs.front()->get() == this);
+   return defs.front()->getInsn();
+}
+
+// True if the defs of this instruction have register-allocation
+// constraints (multiple defs, or a UNION whose def must match its srcs).
+inline bool Instruction::constrainedDefs() const
+{
+   return defExists(1) || op == OP_UNION;
+}
+
+// The value providing the indirect address of source @s in dimension
+// @dim, or NULL if that source is not indirect.
+Value *Instruction::getIndirect(int s, int dim) const
+{
+   return srcs[s].isIndirect(dim) ? getSrc(srcs[s].indirect[dim]) : NULL;
+}
+
+// The predicate source value, or NULL if the instruction is unpredicated.
+Value *Instruction::getPredicate() const
+{
+   return (predSrc >= 0) ? getSrc(predSrc) : NULL;
+}
+
+// Attach (or, with @val == NULL, detach) a condition-flags def.
+// @d is only used the first time; afterwards the recorded slot is reused.
+void Instruction::setFlagsDef(int d, Value *val)
+{
+   if (val) {
+      if (flagsDef < 0)
+         flagsDef = d;
+      setDef(flagsDef, val);
+   } else {
+      if (flagsDef >= 0) {
+         setDef(flagsDef, NULL);
+         flagsDef = -1;
+      }
+   }
+}
+
+// Attach a condition-flags source in slot @s.
+void Instruction::setFlagsSrc(int s, Value *val)
+{
+   flagsSrc = s;
+   setSrc(flagsSrc, val);
+}
+
+// The value indirectly addressing the texture resource, or NULL.
+Value *TexInstruction::getIndirectR() const
+{
+   return tex.rIndirectSrc >= 0 ? getSrc(tex.rIndirectSrc) : NULL;
+}
+
+// The value indirectly addressing the sampler, or NULL.
+// Fixed copy-paste bug from getIndirectR(): this accessor queried
+// tex.rIndirectSrc, returning the resource handle instead of the sampler.
+Value *TexInstruction::getIndirectS() const
+{
+   return tex.sIndirectSrc >= 0 ? getSrc(tex.sIndirectSrc) : NULL;
+}
+
+// Downcast to CmpInstruction if the op is a comparison (SET_AND..SLCT,
+// excluding SELP, which takes a boolean rather than performing a compare).
+CmpInstruction *Instruction::asCmp()
+{
+   if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP)
+      return static_cast<CmpInstruction *>(this);
+   return NULL;
+}
+
+const CmpInstruction *Instruction::asCmp() const
+{
+   if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP)
+      return static_cast<const CmpInstruction *>(this);
+   return NULL;
+}
+
+// Downcast to FlowInstruction for control-flow ops (OP_BRA..OP_JOIN).
+FlowInstruction *Instruction::asFlow()
+{
+   if (op >= OP_BRA && op <= OP_JOIN)
+      return static_cast<FlowInstruction *>(this);
+   return NULL;
+}
+
+// Downcast to FlowInstruction for control-flow ops (OP_BRA..OP_JOIN).
+// Fixed to match the non-const overload: the upper bound was OP_JOINAT,
+// which precedes OP_JOIN in the operation enum, so the const overload
+// failed to recognize OP_JOIN instructions as flow instructions.
+const FlowInstruction *Instruction::asFlow() const
+{
+   if (op >= OP_BRA && op <= OP_JOIN)
+      return static_cast<const FlowInstruction *>(this);
+   return NULL;
+}
+
+// Downcast to TexInstruction; unlike isTextureOp() this range also
+// includes the surface ops (OP_TEX..OP_SULEA).
+TexInstruction *Instruction::asTex()
+{
+   if (op >= OP_TEX && op <= OP_SULEA)
+      return static_cast<TexInstruction *>(this);
+   return NULL;
+}
+
+const TexInstruction *Instruction::asTex() const
+{
+   if (op >= OP_TEX && op <= OP_SULEA)
+      return static_cast<const TexInstruction *>(this);
+   return NULL;
+}
+
+// Deep-clone @obj, but map each of its sources onto itself first so the
+// clone reuses the original source values instead of duplicating them.
+static inline Instruction *cloneForward(Function *ctx, Instruction *obj)
+{
+   DeepClonePolicy<Function> pol(ctx);
+
+   for (int i = 0; obj->srcExists(i); ++i)
+      pol.set(obj->getSrc(i), obj->getSrc(i));
+
+   return obj->clone(pol);
+}
+
+// XXX: use a virtual function so we're really really safe ?
+LValue *Value::asLValue()
+{
+ if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS)
+ return static_cast<LValue *>(this);
+ return NULL;
+}
+
+Symbol *Value::asSym()
+{
+ if (reg.file >= FILE_MEMORY_CONST)
+ return static_cast<Symbol *>(this);
+ return NULL;
+}
+
+const Symbol *Value::asSym() const
+{
+ if (reg.file >= FILE_MEMORY_CONST)
+ return static_cast<const Symbol *>(this);
+ return NULL;
+}
+
+void Symbol::setOffset(int32_t offset)
+{
+ reg.data.offset = offset;
+}
+
+void Symbol::setAddress(Symbol *base, int32_t offset)
+{
+ baseSym = base;
+ reg.data.offset = offset;
+}
+
+void Symbol::setSV(SVSemantic sv, uint32_t index)
+{
+ reg.data.sv.sv = sv;
+ reg.data.sv.index = index;
+}
+
+ImmediateValue *Value::asImm()
+{
+ if (reg.file == FILE_IMMEDIATE)
+ return static_cast<ImmediateValue *>(this);
+ return NULL;
+}
+
+const ImmediateValue *Value::asImm() const
+{
+ if (reg.file == FILE_IMMEDIATE)
+ return static_cast<const ImmediateValue *>(this);
+ return NULL;
+}
+
+Value *Value::get(Iterator &it)
+{
+ return reinterpret_cast<Value *>(it.get());
+}
+
+// True if @by can reach this block in the CFG without passing through
+// @term (delegates to the underlying graph node query).
+bool BasicBlock::reachableBy(const BasicBlock *by, const BasicBlock *term)
+{
+   return cfg.reachableBy(&by->cfg, &term->cfg);
+}
+
+// Recover the BasicBlock from a generic container iterator.
+BasicBlock *BasicBlock::get(Iterator &iter)
+{
+   return reinterpret_cast<BasicBlock *>(iter.get());
+}
+
+// Recover the BasicBlock from its CFG node (node->data holds the owner).
+BasicBlock *BasicBlock::get(Graph::Node *node)
+{
+   assert(node);
+   return reinterpret_cast<BasicBlock *>(node->data);
+}
+
+// Recover the Function from its call-graph node.
+Function *Function::get(Graph::Node *node)
+{
+   assert(node);
+   return reinterpret_cast<Function *>(node->data);
+}
+
+// Look up an LValue of this function by its id.
+LValue *Function::getLValue(int id)
+{
+   assert((unsigned int)id < (unsigned int)allLValues.getSize());
+   return reinterpret_cast<LValue *>(allLValues.get(id));
+}
+
+#endif // __NV50_IR_INLINES_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 00000000000..56eaad3bbf9
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1101 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// nv50 doesn't support 32 bit integer multiplication
+//
+// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
+// -------------------
+// al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
+// ah*bh 00 00 ( carry1) << 16 + ( carry2)
+// al*bl
+// ah*bl 00
+//
+// fffe0001 + fffe0001
+// Expand a 32/64-bit integer MUL (see diagram above) into half-width
+// MUL/MAD/shift operations, since nv50 only multiplies 16 bits natively.
+// Deletes @mul on success; returns false (leaving it alone) for types it
+// cannot handle. i[2..5] are retyped to the half type at the end.
+static bool
+expandIntegerMUL(BuildUtil *bld, Instruction *mul)
+{
+   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+
+   DataType fTy = mul->sType; // full type
+   DataType hTy;
+   switch (fTy) {
+   case TYPE_S32: hTy = TYPE_S16; break;
+   case TYPE_U32: hTy = TYPE_U16; break;
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return false;
+   }
+   unsigned int fullSize = typeSizeof(fTy);
+   unsigned int halfSize = typeSizeof(hTy);
+
+   Instruction *i[9];
+
+   bld->setPosition(mul, true);
+
+   Value *a[2], *b[2];
+   Value *c[2];
+   Value *t[4];
+   for (int j = 0; j < 4; ++j)
+      t[j] = bld->getSSA(fullSize);
+
+   // split sources into halves
+   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+
+   // t3 = al*bl + ((al*bh + ah*bl) << halfBits)  -- the LO32 diagram above
+   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
+   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+
+   if (highResult) {
+      Value *r[3];
+      // NOTE(review): for 64-bit source types halfSize == 4, making this
+      // (1 << 32) on int, which is undefined — TODO confirm MUL_HIGH is
+      // only requested for 32-bit types.
+      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
+      c[0] = bld->getSSA(1, FILE_FLAGS);
+      c[1] = bld->getSSA(1, FILE_FLAGS);
+      for (int j = 0; j < 3; ++j)
+         r[j] = bld->getSSA(fullSize);
+
+      // (al*bh + ah*bl) >> halfBits, +1 carry correction, then + ah*bh
+      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
+      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
+      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
+      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+
+      // set carry defs / sources
+      i[3]->setFlagsDef(1, c[0]);
+      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+      i[6]->setPredicate(CC_C, c[0]);
+      i[5]->setFlagsSrc(3, c[1]);
+   } else {
+      bld->mkMov(mul->getDef(0), t[3]);
+   }
+   delete_Instruction(bld->getProgram(), mul);
+
+   // the arithmetic core operates on half-width sources
+   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
+      if (i[j])
+         i[j]->sType = hTy;
+
+   return true;
+}
+
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV2 3
+
+// UL UR LL LR
+#define QUADOP(q, r, s, t) \
+ ((QOP_##q << 6) | (QOP_##r << 4) | \
+ (QOP_##s << 2) | (QOP_##t << 0))
+
+// Post-register-allocation legalization for nv50: removes no-ops,
+// emulates PRERET on chipsets < NVA0, replaces immediate-zero operands
+// with $r63 and splits 64-bit operations.
+class NV50LegalizePostRA : public Pass
+{
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   void handlePRERET(FlowInstruction *);
+   void replaceZero(Instruction *);
+
+   LValue *r63; // GPR 63, used to stand in for immediate 0 (set in visit(Function))
+};
+
+bool
+NV50LegalizePostRA::visit(Function *fn)
+{
+   Program *prog = fn->getProgram();
+
+   // $r63 replaces immediate-zero operands (see replaceZero)
+   r63 = new_LValue(fn, FILE_GPR);
+   r63->reg.data.id = 63;
+
+   // this is actually per-program, but we can do it all on visiting main()
+   std::list<Instruction *> *outWrites =
+      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+
+   if (outWrites) {
+      // re-attach the exports removed by propagateWriteToOutput: make the
+      // instruction that computed the stored value write the output directly
+      for (std::list<Instruction *>::iterator it = outWrites->begin();
+           it != outWrites->end(); ++it)
+         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
+      // instructions will be deleted on exit
+      outWrites->clear();
+   }
+
+   return true;
+}
+
+// Substitute $r63 for every immediate-zero source of @i.
+void
+NV50LegalizePostRA::replaceZero(Instruction *i)
+{
+   int s = 0;
+   while (i->srcExists(s)) {
+      const ImmediateValue *zero = i->getSrc(s)->asImm();
+      if (zero != NULL && zero->reg.data.u64 == 0)
+         i->setSrc(s, r63);
+      ++s;
+   }
+}
+
+// Emulate PRERET: jump to the target and call to the origin from there
+//
+// WARNING: atm only works if BBs are affected by at most a single PRERET
+//
+// BB:0
+// preret BB:3
+// (...)
+// BB:3
+// (...)
+// --->
+// BB:0
+// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
+// (...)
+// BB:3
+// bra BB:3 + n1 (skip the call)
+// call BB:0 + n2 (skip bra at beginning of BB:0)
+// (...)
+void
+NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
+{
+   BasicBlock *bbE = pre->bb;     // block containing the PRERET
+   BasicBlock *bbT = pre->target.bb; // block the PRERET points at
+
+   // turn the original PRERET into the fixed branch at the head of bbE
+   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
+   bbE->remove(pre);
+   bbE->insertHead(pre);
+
+   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
+   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
+
+   // insert in reverse so the final order is: skip, then call
+   bbT->insertHead(call);
+   bbT->insertHead(skip);
+
+   // NOTE: maybe split blocks to prevent the instructions from moving ?
+
+   // subOps distinguish the three emulation roles for the emitter
+   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
+   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
+}
+
+bool
+NV50LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      next = i->next; // grab successor first: i may be removed below
+      if (i->isNop()) {
+         bb->remove(i);
+      } else
+      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
+         handlePRERET(i->asFlow());
+      } else {
+         // TODO: We will want to do this before register allocation,
+         // since have to use a $c register for the carry flag.
+         if (typeSizeof(i->dType) == 8) {
+            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
+            if (hi)
+               next = hi; // revisit the newly created high-half instruction
+         }
+
+         // ops that can encode immediates, or that write $aX, keep their 0s
+         if (i->op != OP_MOV && i->op != OP_PFETCH &&
+             i->op != OP_BAR &&
+             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
+            replaceZero(i);
+      }
+   }
+   // NOTE(review): both paths below return true, so this guard is
+   // currently redundant.
+   if (!bb->getEntry())
+      return true;
+
+   return true;
+}
+
+// Pre-RA (SSA-form) legalization for nv50: expands integer DIV/MOD/MUL,
+// rewrites $aX address-register definitions into the supported forms, and
+// optionally queues output writes for propagation (finished post-RA).
+class NV50LegalizeSSA : public Pass
+{
+public:
+   NV50LegalizeSSA(Program *);
+
+   virtual bool visit(BasicBlock *bb);
+
+private:
+   void propagateWriteToOutput(Instruction *);
+   void handleDIV(Instruction *);
+   void handleMOD(Instruction *);
+   void handleMUL(Instruction *);
+   void handleAddrDef(Instruction *);
+
+   inline bool isARL(const Instruction *) const;
+
+   BuildUtil bld;
+
+   // exports removed from their BBs, re-attached in NV50LegalizePostRA;
+   // NULL when output-write propagation is disabled
+   std::list<Instruction *> *outWrites;
+};
+
+NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
+{
+ bld.setProgram(prog);
+
+ if (prog->optLevel >= 2 &&
+ (prog->getType() == Program::TYPE_GEOMETRY ||
+ prog->getType() == Program::TYPE_VERTEX))
+ outWrites =
+ reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+ else
+ outWrites = NULL;
+}
+
+// Queue the export @st for later fusion with the instruction defining its
+// stored value (done post-RA); bail out for cases that cannot be fused.
+void
+NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
+{
+   // only direct stores of a single-use value can be propagated
+   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
+      return;
+
+   // check def instruction can store
+   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
+
+   // TODO: move exports (if beneficial) in common opt pass
+   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
+      return;
+   for (int s = 0; di->srcExists(s); ++s)
+      if (di->src(s).getFile() == FILE_IMMEDIATE)
+         return;
+
+   // We cannot set defs to non-lvalues before register allocation, so
+   // save & remove (to save registers) the exports and replace later.
+   outWrites->push_back(st);
+   st->bb->remove(st);
+}
+
+// Does @i have the canonical address-load form "SHL $gpr, 0" ?
+bool
+NV50LegalizeSSA::isARL(const Instruction *i) const
+{
+   if (i->op != OP_SHL)
+      return false;
+   if (i->src(0).getFile() != FILE_GPR)
+      return false;
+
+   ImmediateValue shift;
+   return i->src(1).getImmediate(shift) && shift.isInteger(0);
+}
+
+// Rewrite an instruction that defines an address register ($aX) so that it
+// matches one of the two forms the hardware supports.
+void
+NV50LegalizeSSA::handleAddrDef(Instruction *i)
+{
+   Instruction *arl;
+
+   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
+
+   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
+   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
+         return;
+      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
+         return;
+   }
+
+   // turn $a sources into $r sources (can't operate on $a)
+   for (int s = 0; i->srcExists(s); ++s) {
+      Value *a = i->getSrc(s);
+      Value *r;
+      if (a->reg.file == FILE_ADDRESS) {
+         if (a->getInsn() && isARL(a->getInsn())) {
+            // the $a value is itself an ARL: use the GPR it was loaded from
+            i->setSrc(s, a->getInsn()->getSrc(0));
+         } else {
+            bld.setPosition(i, false);
+            r = bld.getSSA();
+            bld.mkMov(r, a);
+            i->setSrc(s, r);
+         }
+      }
+   }
+   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
+      return;
+
+   // turn result back into $a: make i define a GPR and append "SHL $a, gpr, 0"
+   bld.setPosition(i, true);
+   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
+   i->setDef(0, arl->getSrc(0));
+}
+
+// Expand 32-bit (or larger) integer MUL/MAD via expandIntegerMUL; MAD is
+// first split into MUL + ADD. The original predicate is transferred to the
+// final instruction of the expansion.
+void
+NV50LegalizeSSA::handleMUL(Instruction *mul)
+{
+   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
+      return;
+   Value *def = mul->getDef(0);
+   Value *pred = mul->getPredicate();
+   CondCode cc = mul->cc;
+   if (pred)
+      mul->setPredicate(CC_ALWAYS, NULL);
+
+   if (mul->op == OP_MAD) {
+      // split MAD into MUL (new insn) + ADD (reusing the MAD in place)
+      Instruction *add = mul;
+      bld.setPosition(add, false);
+      Value *res = cloneShallow(func, mul->getDef(0));
+      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
+      add->op = OP_ADD;
+      add->setSrc(0, mul->getDef(0));
+      add->setSrc(1, add->getSrc(2));
+      for (int s = 2; add->srcExists(s); ++s)
+         add->setSrc(s, NULL);
+      mul->subOp = add->subOp; // e.g. MUL_HIGH belongs to the multiply
+      add->subOp = 0;
+   }
+   expandIntegerMUL(&bld, mul);
+   if (pred)
+      def->getInsn()->setPredicate(cc, pred);
+}
+
+// Use f32 division: first compute an approximate result, use it to reduce
+// the dividend, which should then be representable as f32, divide the reduced
+// dividend, and add the quotients.
+void
+NV50LegalizeSSA::handleDIV(Instruction *div)
+{
+   const DataType ty = div->sType;
+
+   if (ty != TYPE_U32 && ty != TYPE_S32)
+      return;
+
+   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
+
+   bld.setPosition(div, false);
+
+   Value *a, *af = bld.getSSA();
+   Value *b, *bf = bld.getSSA();
+
+   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
+   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
+
+   if (isSignedType(ty)) {
+      // operate on magnitudes; the sign is fixed up at the end
+      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      a = bld.getSSA();
+      b = bld.getSSA();
+      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
+      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
+   } else {
+      a = div->getSrc(0);
+      b = div->getSrc(1);
+   }
+
+   // NOTE(review): the integer -2 nudges the reciprocal's mantissa down,
+   // presumably to guarantee an underestimated first quotient — confirm
+   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
+   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
+
+   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
+
+   // get error of 1st result
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
+
+   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
+
+   // divide the (small) residual dividend and combine the quotients
+   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
+      ->rnd = ROUND_Z;
+   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
+
+   // correction: if modulus >= divisor, add 1
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
+   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
+   if (!isSignedType(ty)) {
+      // SET yields 0/-1, so "q - s" adds the +1 correction
+      div->op = OP_SUB;
+      div->setSrc(0, q);
+      div->setSrc(1, s);
+   } else {
+      t = q;
+      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
+      s = bld.getSSA();
+      t = bld.getSSA();
+      // fix the sign
+      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
+         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
+      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
+      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
+
+      div->op = OP_UNION;
+      div->setSrc(0, s);
+      div->setSrc(1, t);
+   }
+}
+
+// Lower integer MOD as a - (a / b) * b, reusing handleDIV for the divide
+// and expandIntegerMUL for the multiply.
+void
+NV50LegalizeSSA::handleMOD(Instruction *mod)
+{
+   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
+      return;
+   bld.setPosition(mod, false);
+
+   Value *q = bld.getSSA();
+   Value *m = bld.getSSA();
+
+   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
+   handleDIV(q->getInsn());
+
+   bld.setPosition(mod, false);
+   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
+
+   // mod becomes: src0 - q * src1
+   mod->op = OP_SUB;
+   mod->setSrc(1, m);
+}
+
+// Legalize one basic block: queue output writes, expand DIV/MOD/MUL/MAD,
+// and rewrite any instruction defining an address register.
+// skipping PHIs (don't pass them to handleAddrDef) !
+bool
+NV50LegalizeSSA::visit(BasicBlock *bb)
+{
+   for (Instruction *i = bb->getEntry(); i; ) {
+      // handlers may replace or delete @i, so grab the successor first
+      Instruction *const after = i->next;
+
+      switch (i->op) {
+      case OP_EXPORT:
+         if (outWrites)
+            propagateWriteToOutput(i);
+         break;
+      case OP_DIV:
+         handleDIV(i);
+         break;
+      case OP_MOD:
+         handleMOD(i);
+         break;
+      case OP_MAD:
+      case OP_MUL:
+         handleMUL(i);
+         break;
+      default:
+         break;
+      }
+
+      if (i->defExists(0) && i->getDef(0)->reg.file == FILE_ADDRESS)
+         handleAddrDef(i);
+
+      i = after;
+   }
+   return true;
+}
+
+// Pre-SSA lowering for nv50: rewrites system-value reads/writes, texture
+// variants (TXB/TXL/TXD), comparisons/selects and flow pseudo-ops into
+// sequences the hardware supports.
+class NV50LoweringPreSSA : public Pass
+{
+public:
+   NV50LoweringPreSSA(Program *);
+
+private:
+   virtual bool visit(Instruction *);
+   virtual bool visit(Function *);
+
+   bool handleRDSV(Instruction *);
+   bool handleWRSV(Instruction *);
+
+   bool handleEXPORT(Instruction *);
+
+   bool handleDIV(Instruction *);
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *);
+
+   bool handleSET(Instruction *);
+   bool handleSLCT(CmpInstruction *);
+   bool handleSELP(Instruction *);
+
+   bool handleTEX(TexInstruction *);
+   bool handleTXB(TexInstruction *); // I really
+   bool handleTXL(TexInstruction *); // hate
+   bool handleTXD(TexInstruction *); // these 3
+
+   bool handleCALL(Instruction *);
+   bool handlePRECONT(Instruction *);
+   bool handleCONT(Instruction *);
+
+   void checkPredicate(Instruction *);
+
+private:
+   const Target *const targ;
+
+   BuildUtil bld;
+
+   Value *tid; // thread id, materialized in $r0 for compute programs
+};
+
+NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
+   targ(prog->getTarget()), tid(NULL)
+{
+   bld.setProgram(prog);
+}
+
+bool
+NV50LoweringPreSSA::visit(Function *f)
+{
+   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
+
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      Value *arg = new_LValue(func, FILE_GPR);
+      arg->reg.data.id = 0;
+      f->ins.push_back(arg);
+
+      // copy it to a fresh value so $r0 itself stays untouched
+      bld.setPosition(root, false);
+      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
+   }
+
+   return true;
+}
+
+// Common texture-argument fixups: dref/lod ordering, integer layer index,
+// and cube-array coordinate preparation via OP_TEXPREP.
+bool
+NV50LoweringPreSSA::handleTEX(TexInstruction *i)
+{
+   const int arg = i->tex.target.getArgCount();
+   const int dref = arg;
+   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+
+   // dref comes before bias/lod
+   if (i->tex.target.isShadow())
+      if (i->op == OP_TXB || i->op == OP_TXL)
+         i->swapSources(dref, lod);
+
+   // array index must be converted to u32
+   if (i->tex.target.isArray()) {
+      Value *layer = i->getSrc(arg - 1);
+      LValue *src = new_LValue(func, FILE_GPR);
+      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+      // clamp to the maximum layer index
+      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
+      i->setSrc(arg - 1, src);
+
+      if (i->tex.target.isCube()) {
+         // let TEXPREP fold the cube face into 2d-array coordinates
+         std::vector<Value *> acube, a2d;
+         int c;
+
+         acube.resize(4);
+         for (c = 0; c < 4; ++c)
+            acube[c] = i->getSrc(c);
+         a2d.resize(4);
+         for (c = 0; c < 3; ++c)
+            a2d[c] = new_LValue(func, FILE_GPR);
+         a2d[3] = NULL;
+
+         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
+                   a2d, acube)->asTex()->tex.mask = 0x7;
+
+         // replace the coordinates and close the gap in the source list
+         for (c = 0; c < 3; ++c)
+            i->setSrc(c, a2d[c]);
+         i->setSrc(c, NULL);
+         for (; i->srcExists(c + 1); ++c)
+            i->setSrc(c, i->getSrc(c + 1));
+
+         i->tex.target = i->tex.target.isShadow() ?
+            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
+      }
+   }
+
+   // texel offsets are 3 immediate fields in the instruction,
+   // nv50 cannot do textureGatherOffsets
+   assert(i->tex.useOffsets <= 1);
+
+   return true;
+}
+
+// Bias must be equal for all threads of a quad or lod calculation will fail.
+//
+// The lanes of a quad are grouped by the bit in the condition register they
+// have set, which is selected by differing bias values.
+// Move the input values for TEX into a new register set for each group and
+// execute TEX only for a specific group.
+// We always need to use 4 new registers for the inputs/outputs because the
+// implicitly calculated derivatives must be correct.
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+   // one condition code per quad lane group (see comment above)
+   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
+   int l, d;
+
+   handleTEX(i);
+   Value *bias = i->getSrc(i->tex.target.getArgCount());
+   if (bias->isUniform())
+      return true;
+
+   // build a bitmask: bit l set iff this lane's bias equals lane l's bias
+   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+                                 bld.loadImm(NULL, 1));
+   bld.setPosition(cond, false);
+
+   for (l = 1; l < 4; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *bit = bld.getSSA();
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      Value *imm = bld.loadImm(NULL, (1 << l));
+      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+      cond->setSrc(l, bit);
+   }
+   // move the mask into $c flags so it can predicate the TEX copies
+   Value *flags = bld.getScratch(1, FILE_FLAGS);
+   bld.setPosition(cond, true);
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+
+   // one predicated TEX per lane group
+   Instruction *tex[4];
+   for (l = 0; l < 4; ++l) {
+      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+      bld.insert(tex[l]);
+   }
+
+   // merge the per-group results with predicated moves + OP_UNION
+   Value *res[4][4];
+   for (d = 0; i->defExists(d); ++d)
+      res[0][d] = tex[0]->getDef(d);
+   for (l = 1; l < 4; ++l) {
+      for (d = 0; tex[l]->defExists(d); ++d) {
+         res[l][d] = cloneShallow(func, res[0][d]);
+         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+      }
+   }
+
+   for (d = 0; i->defExists(d); ++d) {
+      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+      for (l = 0; l < 4; ++l)
+         dst->setSrc(l, res[l][d]);
+   }
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+   handleTEX(i);
+   Value *lod = i->getSrc(i->tex.target.getArgCount());
+   if (lod->isUniform())
+      return true;
+
+   // isolate the TEX in its own BB; lanes branch to it group by group
+   BasicBlock *currBB = i->bb;
+   BasicBlock *texiBB = i->bb->splitBefore(i, false);
+   BasicBlock *joinBB = i->bb->splitAfter(i);
+
+   bld.setPosition(currBB, true);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   for (int l = 0; l <= 3; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      bld.setPosition(currBB, true);
+      // lanes whose LOD matches lane l's take the branch to the TEX block
+      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+      if (l <= 2) {
+         // remaining lanes fall through to the next comparison
+         BasicBlock *laneBB = new BasicBlock(func);
+         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+         currBB = laneBB;
+      }
+   }
+   bld.setPosition(joinBB, false);
+   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+   return true;
+}
+
+// Lower TXD: emulate explicit derivatives by running TEX once per lane,
+// with coordinates adjusted via quadops so the hardware's implicit
+// derivative calculation reproduces the requested dPdx/dPdy.
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+   // per-lane quadop pairs applying dPdx (col 0) and dPdy (col 1)
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4];
+   Value *crd[3];
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   handleTEX(i);
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l; // only lane l's result is valid
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   // merge the per-lane results
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+// For a SET with float destination: compute the integer 0/-1 result, then
+// append ABS + CVT so the final value is 0.0f / 1.0f.
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType == TYPE_F32) {
+      bld.setPosition(i, true);
+      i->dType = TYPE_U32;
+      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
+      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
+   }
+   return true;
+}
+
+// Lower SLCT (select by comparing src2 against 0) into a SET that writes
+// $c flags, plus two oppositely-predicated moves merged by OP_UNION.
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   Value *pred = bld.getScratch(1, FILE_FLAGS);
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // XXX: these probably shouldn't be immediates in the first place ...
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.setPosition(i, true);
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+
+   // repurpose i itself as the flag-producing comparison: src2 != 0
+   bld.setPosition(i, false);
+   i->op = OP_SET;
+   i->setFlagsDef(0, pred);
+   i->dType = TYPE_U8;
+   i->setSrc(0, i->getSrc(2));
+   i->setSrc(2, NULL);
+   i->setSrc(1, bld.loadImm(NULL, 0));
+
+   return true;
+}
+
+// Lower SELP (select on predicate src2) into two oppositely predicated MOVs
+// joined by OP_UNION; the original instruction is deleted.
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // immediates cannot be moved under a predicate; load them into regs first
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// Lower a system-value write to an EXPORT store of the corresponding
+// shader-output slot; fails (returns false) for non-writeable $sreg values.
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+
+   // these are all shader outputs, $sreg are not writeable
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// For compute programs, append the cached thread-id value as an extra call
+// argument so callees can derive SV_TID (tid is set up elsewhere in the pass).
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      i->setSrc(i->srcCount(), tid);
+   }
+   return true;
+}
+
+// PRECONT has no hardware equivalent here; simply drop it.
+bool
+NV50LoweringPreSSA::handlePRECONT(Instruction *i)
+{
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// Loop "continue" is expressed as a plain branch to the loop header.
+bool
+NV50LoweringPreSSA::handleCONT(Instruction *i)
+{
+   i->op = OP_BRA;
+   return true;
+}
+
+// Lower a system-value read. Values at addresses >= 0x400 stay as $sreg
+// moves; the rest are synthesized from interpolants, shared memory (compute
+// block/grid info) or the packed thread-id register.
+bool
+NV50LoweringPreSSA::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+   Value *def = i->getDef(0);
+   SVSemantic sv = sym->reg.data.sv.sv;
+   int idx = sym->reg.data.sv.index;
+
+   if (addr >= 0x400) // mov $sreg
+      return true;
+
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
+      if (i->dType == TYPE_F32) {
+         // map the face bit to +1.0f / -1.0f (0xbf800000 is -1.0f)
+         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+      }
+      break;
+   case SV_NCTAID:
+   case SV_CTAID:
+   case SV_NTID:
+      // out-of-range components have trivial constant values; the others are
+      // read as u16 from the shared-memory area the driver fills in
+      if ((sv == SV_NCTAID && idx >= 2) ||
+          (sv == SV_NTID && idx >= 3)) {
+         bld.mkMov(def, bld.mkImm(1));
+      } else if (sv == SV_CTAID && idx >= 2) {
+         bld.mkMov(def, bld.mkImm(0));
+      } else {
+         Value *x = bld.getSSA(2);
+         bld.mkOp1(OP_LOAD, TYPE_U16, x,
+                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+      }
+      break;
+   case SV_TID:
+      // tid is packed as x:[15:0], y:[25:16], z:[31:26]
+      if (idx == 0) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
+      } else if (idx == 1) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
+         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
+      } else if (idx == 2) {
+         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
+      } else {
+         bld.mkMov(def, bld.mkImm(0));
+      }
+      break;
+   default:
+      bld.mkFetch(i->getDef(0), i->dType,
+                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Float division is lowered to multiplication by the reciprocal; integer
+// division is left untouched here (handled elsewhere).
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true;
+   bld.setPosition(i, false);
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   i->op = OP_MUL;
+   i->setSrc(1, rcp->getDef(0));
+   return true;
+}
+
+// sqrt(x) is computed as x * rsq(x).
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+                                bld.getSSA(), i->getSrc(0));
+   i->op = OP_MUL;
+   i->setSrc(1, rsq->getDef(0));
+
+   return true;
+}
+
+// pow(x, y) = 2^(y * lg2(x)); PREEX2 preconditions the EX2 operand as the
+// hardware requires.
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+   LValue *val = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
+   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
+
+   i->op = OP_EX2;
+   i->setSrc(0, val);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+// Fragment-shader outputs live in fixed GPRs: turn the EXPORT into a final
+// MOV into the GPR whose id matches the output offset. Indirect stores are
+// not handled yet (returns false).
+bool
+NV50LoweringPreSSA::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      if (i->getIndirect(0, 0)) {
+         // TODO: redirect to l[] here, load to GPRs at exit
+         return false;
+      } else {
+         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
+
+         i->op = OP_MOV;
+         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+         i->src(0).set(i->src(1));
+         i->setSrc(1, NULL);
+         i->setDef(0, new_LValue(func, FILE_GPR));
+         i->getDef(0)->reg.data.id = id;
+
+         // make sure RA keeps this fixed register available
+         prog->maxGPR = MAX2(prog->maxGPR, id);
+      }
+   }
+   return true;
+}
+
+// Set flags according to predicate and make the instruction read $cX.
+// If the predicate value is not already in FILE_FLAGS, insert a
+// SET $c = (pred != 0) and repredicate the instruction on that.
+void
+NV50LoweringPreSSA::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *cdst;
+
+   if (!pred || pred->reg.file == FILE_FLAGS)
+      return;
+   cdst = bld.getSSA(1, FILE_FLAGS);
+
+   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);
+
+   insn->setPredicate(insn->cc, cdst);
+}
+
+//
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// Pre-SSA per-instruction dispatcher: normalizes predicates to $c and routes
+// each opcode to its dedicated lowering handler.
+bool
+NV50LoweringPreSSA::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXB:
+      return handleTXB(i->asTex());
+   case OP_TXL:
+      return handleTXL(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_EX2:
+      // hardware needs the operand preconditioned by PREEX2
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_SET:
+      return handleSET(i);
+   case OP_SLCT:
+      return handleSLCT(i->asCmp());
+   case OP_SELP:
+      return handleSELP(i);
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_CALL:
+      return handleCALL(i);
+   case OP_PRECONT:
+      return handlePRECONT(i);
+   case OP_CONT:
+      return handleCONT(i);
+   default:
+      break;
+   }
+   return true;
+}
+
+// Run the stage-appropriate legalization pass. targetPriv carries a
+// std::list<Instruction *> allocated at the SSA stage and freed after the
+// post-RA stage (presumably shared state between those passes -- see the
+// NV50LegalizeSSA/PostRA implementations).
+bool
+TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
+{
+   bool ret = false;
+
+   if (stage == CG_STAGE_PRE_SSA) {
+      NV50LoweringPreSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_SSA) {
+      if (!prog->targetPriv)
+         prog->targetPriv = new std::list<Instruction *>();
+      NV50LegalizeSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_POST_RA) {
+      NV50LegalizePostRA pass;
+      ret = pass.run(prog, false, true);
+      if (prog->targetPriv)
+         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+   }
+   return ret;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
new file mode 100644
index 00000000000..8d94dd1ce5d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -0,0 +1,1597 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+#include <limits>
+
+namespace nv50_ir {
+
+// Per-lane quad operations; QUADOP packs one 2-bit op per lane of a quad.
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV2 3
+
+// UL UR LL LR
+#define QUADOP(q, r, s, t) \
+   ((QOP_##q << 6) | (QOP_##r << 4) | \
+    (QOP_##s << 2) | (QOP_##t << 0))
+
+// SSA-stage legalization for nvc0: replaces operations the hardware lacks
+// (integer div/mod, double rcp/rsq) with builtin-library calls or sequences.
+class NVC0LegalizeSSA : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Function *);
+
+   // we want to insert calls to the builtin library only after optimization
+   void handleDIV(Instruction *); // integer division, modulus
+   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
+
+private:
+   BuildUtil bld;
+};
+
+// Replace integer DIV/MOD with a call into the builtin library: operands go
+// to fixed regs $r0/$r1, the result (quotient in $r0, remainder in $r1) is
+// moved back, and clobbered regs/predicates are declared for RA.
+void
+NVC0LegalizeSSA::handleDIV(Instruction *i)
+{
+   FlowInstruction *call;
+   int builtin;
+   Value *def[2];
+
+   bld.setPosition(i, false);
+   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
+   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
+   switch (i->dType) {
+   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
+   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
+   default:
+      // other integer types are not expected here; leave i untouched
+      return;
+   }
+   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
+   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
+   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
+   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
+
+   call->fixed = 1;
+   call->absolute = call->builtin = 1;
+   call->target.builtin = builtin;
+   delete_Instruction(prog, i);
+}
+
+// Double-precision RCP/RSQ lowering is not implemented yet.
+void
+NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
+{
+   // TODO
+}
+
+// Bind the builder to this function's program before walking its blocks.
+bool
+NVC0LegalizeSSA::visit(Function *fn)
+{
+   bld.setProgram(fn->getProgram());
+   return true;
+}
+
+// Scan the block for non-F32 DIV/MOD and F64 RCP/RSQ and lower them.
+// 'next' is cached because handleDIV deletes the current instruction.
+bool
+NVC0LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *next;
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+      if (i->dType == TYPE_F32)
+         continue;
+      switch (i->op) {
+      case OP_DIV:
+      case OP_MOD:
+         handleDIV(i);
+         break;
+      case OP_RCP:
+      case OP_RSQ:
+         if (i->dType == TYPE_F64)
+            handleRCPRSQ(i);
+         break;
+      default:
+         break;
+      }
+   }
+   return true;
+}
+
+// Post-RA legalization for nvc0: removes pseudo ops, substitutes the $r63
+// zero register, fixes up control flow (CONT/JOIN), and on >= GK104 inserts
+// texture result barriers (TEXBAR).
+class NVC0LegalizePostRA : public Pass
+{
+public:
+   NVC0LegalizePostRA(const Program *);
+
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   void replaceZero(Instruction *);
+   bool tryReplaceContWithBra(BasicBlock *);
+   void propagateJoin(BasicBlock *);
+
+   // one first-use of a texture result that needs a barrier before it
+   struct TexUse
+   {
+      TexUse(Instruction *use, const Instruction *tex)
+         : insn(use), tex(tex), level(-1) { }
+      Instruction *insn;
+      const Instruction *tex; // or split / mov
+      int level;              // barrier level (outstanding TEXes allowed)
+   };
+   // min/max outstanding-TEX counts for a basic block
+   struct Limits
+   {
+      Limits() { }
+      Limits(int min, int max) : min(min), max(max) { }
+      int min, max;
+   };
+   bool insertTextureBarriers(Function *);
+   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
+   void findFirstUses(const Instruction *tex, const Instruction *def,
+                      std::list<TexUse>&);
+   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
+                            const BasicBlock *term,
+                            std::list<TexUse>&);
+   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
+   const Instruction *recurseDef(const Instruction *);
+
+private:
+   LValue *rZero;          // stand-in for the hardware zero register
+   LValue *carry;          // $c carry flag used when splitting 64-bit ops
+   const bool needTexBar;  // true on >= GKxxx (chipset >= 0xe0)
+};
+
+// Texture barriers are only required on Kepler and up (chipset >= 0xe0).
+NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
+   : rZero(NULL),
+     carry(NULL),
+     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
+{
+}
+
+// Whether 'early' executes before 'later' on every path: same block compares
+// serial numbers, otherwise falls back to CFG block dominance.
+bool
+NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
+                                    const Instruction *early) const
+{
+   if (early->bb == later->bb)
+      return early->serial < later->serial;
+   return later->bb->dominatedBy(early->bb);
+}
+
+// Add 'usei' to the first-use list, keeping the list minimal: skip it if an
+// already-recorded use dominates it, and drop recorded uses it dominates.
+void
+NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
+                              Instruction *usei, const Instruction *insn)
+{
+   bool add = true;
+   for (std::list<TexUse>::iterator it = uses.begin();
+        it != uses.end();) {
+      if (insnDominatedBy(usei, it->insn)) {
+         add = false;
+         break;
+      }
+      if (insnDominatedBy(it->insn, usei))
+         it = uses.erase(it);
+      else
+         ++it;
+   }
+   if (add)
+      uses.push_back(TexUse(usei, insn));
+}
+
+// Find instructions that (re)define a register merged with the tex result
+// (WAW hazard): follow self-MOV chains, recurse through value-combining
+// pseudo ops, and record anything else reachable between tex and 'term'.
+void
+NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
+                                        Instruction *insn,
+                                        const BasicBlock *term,
+                                        std::list<TexUse> &uses)
+{
+   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
+      insn = insn->getSrc(0)->getUniqueInsn();
+
+   if (!insn || !insn->bb->reachableBy(texi->bb, term))
+      return;
+
+   switch (insn->op) {
+   /* Values not connected to the tex's definition through any of these should
+    * not be conflicting.
+    */
+   case OP_SPLIT:
+   case OP_MERGE:
+   case OP_PHI:
+   case OP_UNION:
+      /* recurse again */
+      for (int s = 0; insn->srcExists(s); ++s)
+         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
+                             uses);
+      break;
+   default:
+      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
+      addTexUse(uses, insn, texi);
+      break;
+   }
+}
+
+// Collect the first real (machine-code) uses of a TEX's results, looking
+// through pseudo ops (SPLIT/MERGE/PHI/UNION) and non-final self-MOVs, and
+// additionally checking PHI/UNION inputs for WAW conflicts.
+void
+NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
+                                  const Instruction *insn,
+                                  std::list<TexUse> &uses)
+{
+   for (int d = 0; insn->defExists(d); ++d) {
+      Value *v = insn->getDef(d);
+      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
+         Instruction *usei = (*u)->getInsn();
+
+         if (usei->op == OP_PHI || usei->op == OP_UNION) {
+            // need a barrier before WAW cases
+            for (int s = 0; usei->srcExists(s); ++s) {
+               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
+               if (defi && &usei->src(s) != *u)
+                  findOverwritingDefs(texi, defi, usei->bb, uses);
+            }
+         }
+
+         if (usei->op == OP_SPLIT ||
+             usei->op == OP_MERGE ||
+             usei->op == OP_PHI ||
+             usei->op == OP_UNION) {
+            // these uses don't manifest in the machine code
+            findFirstUses(texi, usei, uses);
+         } else
+         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
+             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
+            findFirstUses(texi, usei, uses);
+         } else {
+            addTexUse(uses, usei, insn);
+         }
+      }
+   }
+}
+
+// Texture barriers:
+// This pass is a bit long and ugly and can probably be optimized.
+//
+// 1. obtain a list of TEXes and their outputs' first use(s)
+// 2. calculate the barrier level of each first use (minimal number of TEXes,
+//    over all paths, between the TEX and the use in question)
+// 3. for each barrier, if all paths from the source TEX to that barrier
+//    contain a barrier of lesser level, it can be culled
+bool
+NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
+{
+   std::list<TexUse> *uses;
+   std::vector<Instruction *> texes;
+   std::vector<int> bbFirstTex;
+   std::vector<int> bbFirstUse;
+   std::vector<int> texCounts;
+   std::vector<TexUse> useVec;
+   ArrayList insns;
+
+   fn->orderInstructions(insns);
+
+   texCounts.resize(fn->allBBlocks.getSize(), 0);
+   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
+   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
+
+   // tag BB CFG nodes by their id for later
+   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
+      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
+      if (bb)
+         bb->cfg.tag = bb->getId();
+   }
+
+   // gather the first uses for each TEX
+   for (int i = 0; i < insns.getSize(); ++i) {
+      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
+      if (isTextureOp(tex->op)) {
+         texes.push_back(tex);
+         if (!texCounts.at(tex->bb->getId()))
+            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
+         texCounts[tex->bb->getId()]++;
+      }
+   }
+   insns.clear();
+   if (texes.empty())
+      return false;
+   uses = new std::list<TexUse>[texes.size()];
+   if (!uses)
+      return false;
+   for (size_t i = 0; i < texes.size(); ++i)
+      findFirstUses(texes[i], texes[i], uses[i]);
+
+   // determine the barrier level at each use
+   for (size_t i = 0; i < texes.size(); ++i) {
+      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
+           ++u) {
+         BasicBlock *tb = texes[i]->bb;
+         BasicBlock *ub = u->insn->bb;
+         if (tb == ub) {
+            // same block: level = number of TEXes issued in between
+            u->level = 0;
+            for (size_t j = i + 1; j < texes.size() &&
+                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         } else {
+            // different block: lightest-path TEX count, adjusted for the
+            // partial origin and destination blocks below
+            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
+                                                      &ub->cfg, texCounts);
+            if (u->level < 0) {
+               WARN("Failed to find path TEX -> TEXBAR\n");
+               u->level = 0;
+               continue;
+            }
+            // this counted all TEXes in the origin block, correct that
+            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
+            // and did not count the TEXes in the destination block, add those
+            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
+                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         }
+         assert(u->level >= 0);
+         useVec.push_back(*u);
+      }
+   }
+   delete[] uses;
+   uses = NULL;
+
+   // insert the barriers
+   for (size_t i = 0; i < useVec.size(); ++i) {
+      Instruction *prev = useVec[i].insn->prev;
+      if (useVec[i].level < 0)
+         continue;
+      if (prev && prev->op == OP_TEXBAR) {
+         // merge with an immediately preceding barrier (keep the lower level)
+         if (prev->subOp > useVec[i].level)
+            prev->subOp = useVec[i].level;
+         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
+      } else {
+         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
+         bar->fixed = 1;
+         bar->subOp = useVec[i].level;
+         // make use explicit to ease latency calculation
+         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
+         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
+      }
+   }
+
+   if (fn->getProgram()->optLevel < 3) {
+      if (uses)
+         delete[] uses;
+      return true;
+   }
+
+   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
+
+   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitS.resize(fn->allBBlocks.getSize());
+
+   // cull unneeded barriers (should do that earlier, but for simplicity)
+   IteratorRef bi = fn->cfg.iteratorCFG();
+   // first calculate min/max outstanding TEXes for each BB
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      int min = 0;
+      int max = std::numeric_limits<int>::max();
+      for (Instruction *i = bb->getFirst(); i; i = i->next) {
+         if (isTextureOp(i->op)) {
+            min++;
+            if (max < std::numeric_limits<int>::max())
+               max++;
+         } else
+         if (i->op == OP_TEXBAR) {
+            min = MIN2(min, i->subOp);
+            max = MIN2(max, i->subOp);
+         }
+      }
+      // limits when looking at an isolated block
+      limitS[bb->getId()].min = min;
+      limitS[bb->getId()].max = max;
+   }
+   // propagate the min/max values
+   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
+      for (bi->reset(); !bi->end(); bi->next()) {
+         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+         BasicBlock *bb = BasicBlock::get(n);
+         const int bbId = bb->getId();
+         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
+            BasicBlock *in = BasicBlock::get(ei.getNode());
+            const int inId = in->getId();
+            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
+            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
+         }
+         // I just hope this is correct ...
+         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
+            // no barrier
+            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
+            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
+         } else {
+            // block contained a barrier
+            limitB[bbId].min = MIN2(limitS[bbId].max,
+                                    limitT[bbId].min + limitS[bbId].min);
+            limitB[bbId].max = MIN2(limitS[bbId].max,
+                                    limitT[bbId].max + limitS[bbId].min);
+         }
+      }
+   }
+   // finally delete unnecessary barriers
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      Instruction *prev = NULL;
+      Instruction *next;
+      int max = limitT[bb->getId()].max;
+      for (Instruction *i = bb->getFirst(); i; i = next) {
+         next = i->next;
+         if (i->op == OP_TEXBAR) {
+            if (i->subOp >= max) {
+               // barrier allows at least as many outstanding TEXes as can
+               // be pending here -> it never blocks, remove it
+               delete_Instruction(prog, i);
+               i = NULL;
+            } else {
+               max = i->subOp;
+               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
+                  delete_Instruction(prog, prev);
+                  prev = NULL;
+               }
+            }
+         } else
+         if (isTextureOp(i->op)) {
+            max++;
+         }
+         if (i && !i->isNop())
+            prev = i;
+      }
+   }
+   if (uses)
+      delete[] uses;
+   return true;
+}
+
+// Insert TEXBARs where required and set up the fixed zero-register and
+// carry-flag values used while rewriting instructions in the blocks.
+bool
+NVC0LegalizePostRA::visit(Function *fn)
+{
+   if (needTexBar)
+      insertTextureBarriers(fn);
+
+   rZero = new_LValue(fn, FILE_GPR);
+   carry = new_LValue(fn, FILE_FLAGS);
+
+   // the zero register is the one past the last allocatable GPR ($r63/$r255)
+   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
+   carry->reg.data.id = 0;
+
+   return true;
+}
+
+// Replace immediate-zero sources with the hardware zero register
+// (SUCLAMP's 3rd source must stay an immediate).
+void
+NVC0LegalizePostRA::replaceZero(Instruction *i)
+{
+   for (int s = 0; i->srcExists(s); ++s) {
+      if (s == 2 && i->op == OP_SUCLAMP)
+         continue;
+      ImmediateValue *imm = i->getSrc(s)->asImm();
+      if (imm && imm->reg.data.u64 == 0)
+         i->setSrc(s, rZero);
+   }
+}
+
+// replace CONT with BRA for single unconditional continue
+// Applies when the loop header 'bb' has exactly one back edge whose source
+// ends in an unpredicated CONT; the PRECONT marker is removed as well.
+bool
+NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
+{
+   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
+      return false;
+   Graph::EdgeIterator ei = bb->cfg.incident();
+   if (ei.getType() != Graph::Edge::BACK)
+      ei.next();
+   if (ei.getType() != Graph::Edge::BACK)
+      return false;
+   BasicBlock *contBB = BasicBlock::get(ei.getNode());
+
+   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
+       contBB->getExit()->getPredicate())
+      return false;
+   contBB->getExit()->op = OP_BRA;
+   bb->remove(bb->getEntry()); // delete PRECONT
+
+   ei.next();
+   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
+   return true;
+}
+
+// replace branches to join blocks with join ops
+// BRAs targeting a block that starts with JOIN become JOINs themselves
+// (marked with limit=1 so they are not propagated again); the leading JOIN
+// in 'bb' is then removed.
+void
+NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
+{
+   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
+      return;
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+      Instruction *exit = in->getExit();
+      if (!exit) {
+         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
+         // there should always be a terminator instruction
+         WARN("inserted missing terminator in BB:%i\n", in->getId());
+      } else
+      if (exit->op == OP_BRA) {
+         exit->op = OP_JOIN;
+         exit->asFlow()->limit = 1; // must-not-propagate marker
+      }
+   }
+   bb->remove(bb->getEntry());
+}
+
+// Per-block post-RA cleanup: drop no-ops, split 64-bit operations, replace
+// zero immediates by $r63, and normalize CONT/JOIN control flow.
+bool
+NVC0LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      next = i->next;
+      if (i->op == OP_EMIT || i->op == OP_RESTART) {
+         if (!i->getDef(0)->refCount())
+            i->setDef(0, NULL);
+         if (i->src(0).getFile() == FILE_IMMEDIATE)
+            i->setSrc(0, rZero); // initial value must be 0
+      } else
+      if (i->isNop()) {
+         bb->remove(i);
+      } else {
+         // TODO: Move this to before register allocation for operations that
+         // need the $c register !
+         if (typeSizeof(i->dType) == 8) {
+            Instruction *hi;
+            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
+            if (hi)
+               next = hi; // re-visit the newly created high-half instruction
+         }
+
+         if (i->op != OP_MOV && i->op != OP_PFETCH)
+            replaceZero(i);
+      }
+   }
+   if (!bb->getEntry())
+      return true;
+
+   if (!tryReplaceContWithBra(bb))
+      propagateJoin(bb);
+
+   return true;
+}
+
+// Main nvc0/nve4 lowering pass: rewrites TGSI-level operations (TEX variants,
+// atomics, surface ops, system values, exports) into hardware-level form.
+class NVC0LoweringPass : public Pass
+{
+public:
+   NVC0LoweringPass(Program *);
+
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Instruction *);
+
+   bool handleRDSV(Instruction *);
+   bool handleWRSV(Instruction *);
+   bool handleEXPORT(Instruction *);
+   bool handleOUT(Instruction *);
+   bool handleDIV(Instruction *);
+   bool handleMOD(Instruction *);
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *);
+   bool handleTEX(TexInstruction *);
+   bool handleTXD(TexInstruction *);
+   bool handleTXQ(TexInstruction *);
+   bool handleManualTXD(TexInstruction *);
+   bool handleATOM(Instruction *);
+   bool handleCasExch(Instruction *, bool needCctl);
+   void handleSurfaceOpNVE4(TexInstruction *);
+
+   void checkPredicate(Instruction *);
+
+   void readTessCoord(LValue *dst, int c);
+
+   // helpers that load driver-provided info from the constant buffer
+   Value *loadResInfo32(Value *ptr, uint32_t off);
+   Value *loadMsInfo32(Value *ptr, uint32_t off);
+   Value *loadTexHandle(Value *ptr, unsigned int slot);
+
+   void adjustCoordinatesMS(TexInstruction *);
+   void processSurfaceCoordsNVE4(TexInstruction *);
+
+private:
+   const Target *const targ;
+
+   BuildUtil bld;
+
+   Symbol *gMemBase;
+   LValue *gpEmitAddress; // running output offset for geometry shader EMITs
+};
+
+// Bind the builder to the program and cache its target.
+NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
+{
+   bld.setProgram(prog);
+   gMemBase = NULL;
+}
+
+// For geometry shaders, initialize the emit-address counter to 0 at entry
+// and copy it to $r0 at exit (expected there by the epilogue/hardware).
+bool
+NVC0LoweringPass::visit(Function *fn)
+{
+   if (prog->getType() == Program::TYPE_GEOMETRY) {
+      assert(!strncmp(fn->getName(), "MAIN", 4));
+      // TODO: when we generate actual functions pass this value along somehow
+      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
+      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
+      if (fn->cfgExit) {
+         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
+         bld.mkMovToReg(0, gpEmitAddress);
+      }
+   }
+   return true;
+}
+
+// Nothing to do per block; lowering happens per instruction.
+bool
+NVC0LoweringPass::visit(BasicBlock *bb)
+{
+   return true;
+}
+
+// Load the texture handle for 'slot' (optionally indexed by 'ptr') from the
+// driver's resource-info constant buffer.
+inline Value *
+NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
+{
+   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint32_t off = prog->driver->io.texBindBase + slot * 4;
+   return bld.
+      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
+}
+
+// move array source to first slot, convert to u16, add indirections
+bool
+NVC0LoweringPass::handleTEX(TexInstruction *i)
+{
+ const int dim = i->tex.target.getDim() + i->tex.target.isCube();
+ const int arg = i->tex.target.getArgCount();
+ const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
+
+ if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
+ if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
+ WARN("indirect TEX not implemented\n");
+ }
+ if (i->tex.r == i->tex.s) {
+ i->tex.r += prog->driver->io.texBindBase / 4;
+ i->tex.s = 0; // only a single cX[] value possible here
+ } else {
+ Value *hnd = bld.getScratch();
+ Value *rHnd = loadTexHandle(NULL, i->tex.r);
+ Value *sHnd = loadTexHandle(NULL, i->tex.s);
+
+ bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
+
+ i->tex.r = 0; // not used for indirect tex
+ i->tex.s = 0;
+ i->setIndirectR(hnd);
+ }
+ if (i->tex.target.isArray()) {
+ LValue *layer = new_LValue(func, FILE_GPR);
+ Value *src = i->getSrc(lyr);
+ const int sat = (i->op == OP_TXF) ? 1 : 0;
+ DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
+ bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
+ for (int s = dim; s >= 1; --s)
+ i->setSrc(s, i->getSrc(s - 1));
+ i->setSrc(0, layer);
+ }
+ } else
+ // (nvc0) generate and move the tsc/tic/array source to the front
+ if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
+ LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+ Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
+ for (int s = dim; s >= 1; --s)
+ i->setSrc(s, i->getSrc(s - 1));
+ i->setSrc(0, arrayIndex);
+
+ Value *ticRel = i->getIndirectR();
+ Value *tscRel = i->getIndirectS();
+
+ if (arrayIndex) {
+ int sat = (i->op == OP_TXF) ? 1 : 0;
+ DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
+ bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
+ } else {
+ bld.loadImm(src, 0);
+ }
+
+ if (ticRel) {
+ i->setSrc(i->tex.rIndirectSrc, NULL);
+ bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
+ }
+ if (tscRel) {
+ i->setSrc(i->tex.sIndirectSrc, NULL);
+ bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
+ }
+
+ i->setSrc(0, src);
+ }
+
+ // offset is last source (lod 1st, dc 2nd)
+ if (i->tex.useOffsets) {
+ uint32_t value = 0;
+ int n, c;
+ int s = i->srcCount(0xff, true);
+ if (i->srcExists(s)) // move potential predicate out of the way
+ i->moveSources(s, 1);
+ for (n = 0; n < i->tex.useOffsets; ++n)
+ for (c = 0; c < 3; ++c)
+ value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
+ i->setSrc(s, bld.loadImm(NULL, value));
+ }
+
+ if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
+ //
+ // If TEX requires more than 4 sources, the 2nd register tuple must be
+ // aligned to 4, even if it consists of just a single 4-byte register.
+ //
+ // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
+ //
+ int s = i->srcCount(0xff, true);
+ if (s > 4 && s < 7) {
+ if (i->srcExists(s)) // move potential predicate out of the way
+ i->moveSources(s, 7 - s);
+ while (s < 7)
+ i->setSrc(s++, bld.loadImm(NULL, 0));
+ }
+ }
+
+ return true;
+}
+
+// Emulate TXD with quad operations: for each of the 4 lanes, broadcast that
+// lane's coordinates, apply its dPdx/dPdy via per-lane quadops, run a TEX,
+// and keep only that lane's result; the four results are then UNIONed.
+bool
+NVC0LoweringPass::handleManualTXD(TexInstruction *i)
+{
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4];
+   Value *crd[3];
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results (fixed MOV restricted to lane l)
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l;
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   // merge the per-lane results into the original defs
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+// Lower TXD: append dPdx/dPdy as extra sources when the hardware form
+// supports it, otherwise fall back to the manual quadop emulation.
+bool
+NVC0LoweringPass::handleTXD(TexInstruction *txd)
+{
+   int dim = txd->tex.target.getDim();
+   int arg = txd->tex.target.getArgCount();
+
+   handleTEX(txd);
+   while (txd->srcExists(arg))
+      ++arg;
+
+   txd->tex.derivAll = true;
+   if (dim > 2 ||
+       txd->tex.target.isCube() ||
+       arg > 4 ||
+       txd->tex.target.isShadow())
+      return handleManualTXD(txd);
+
+   // interleave the derivatives after the regular arguments
+   for (int c = 0; c < dim; ++c) {
+      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
+      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
+      txd->dPdx[c].set(NULL);
+      txd->dPdy[c].set(NULL);
+   }
+   return true;
+}
+
+// TXQ needs no lowering yet; indirect indices remain a TODO.
+bool
+NVC0LoweringPass::handleTXQ(TexInstruction *txq)
+{
+   // TODO: indirect resource/sampler index
+   return true;
+}
+
+// Atomics only operate on global memory: rebase local/shared atomics onto
+// g[] using the SV_LBASE/SV_SBASE window addresses.
+bool
+NVC0LoweringPass::handleATOM(Instruction *atom)
+{
+   SVSemantic sv;
+
+   switch (atom->src(0).getFile()) {
+   case FILE_MEMORY_LOCAL:
+      sv = SV_LBASE;
+      break;
+   case FILE_MEMORY_SHARED:
+      sv = SV_SBASE;
+      break;
+   default:
+      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
+      return true;
+   }
+   Value *base =
+      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
+   Value *ptr = atom->getIndirect(0, 0);
+
+   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
+   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+   if (ptr)
+      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
+   atom->setIndirect(0, 0, base);
+
+   return true;
+}
+
+// Fix up compare-and-swap / exchange atomics: optionally invalidate caches
+// with a CCTL first, and merge CAS's compare+swap operands into the 64-bit
+// register pair the hardware expects.
+bool
+NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
+{
+   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
+       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
+      return false;
+   bld.setPosition(cas, true);
+
+   if (needCctl) {
+      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
+      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
+      cctl->fixed = 1;
+      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
+      if (cas->isPredicated())
+         cctl->setPredicate(cas->cc, cas->getPredicate());
+   }
+
+   if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
+      // should be set to the high part of the double reg or bad things will
+      // happen elsewhere in the universe.
+      // Also, it sometimes returns the new value instead of the old one
+      // under mysterious circumstances.
+      Value *dreg = bld.getSSA(8);
+      bld.setPosition(cas, false);
+      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
+      cas->setSrc(1, dreg);
+   }
+
+   return true;
+}
+
+// Load a 32-bit word from the driver's resource-info constant buffer at
+// byte offset 'off' within the surface-info area, with optional indirect
+// address 'ptr' (may be NULL).
+inline Value *
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+{
+   uint8_t b = prog->driver->io.resInfoCBSlot;
+   off += prog->driver->io.suInfoBase;
+   return bld.
+      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
+}
+
+// Load a 32-bit word from the driver's multisample-info constant buffer at
+// byte offset 'off', with optional indirect address 'ptr' (may be NULL).
+inline Value *
+NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
+{
+   uint8_t b = prog->driver->io.msInfoCBSlot;
+   off += prog->driver->io.msInfoBase;
+   return bld.
+      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
+}
+
+/* On nvc0, surface info is obtained via the surface binding points passed
+ * to the SULD/SUST instructions.
+ * On nve4, surface info is stored in c[] and is used by various special
+ * instructions, e.g. for clamping coordinates or generating an address.
+ * They couldn't just have added an equivalent to TIC now, couldn't they ?
+ */
+#define NVE4_SU_INFO_ADDR 0x00
+#define NVE4_SU_INFO_FMT 0x04
+#define NVE4_SU_INFO_DIM_X 0x08
+#define NVE4_SU_INFO_PITCH 0x0c
+#define NVE4_SU_INFO_DIM_Y 0x10
+#define NVE4_SU_INFO_ARRAY 0x14
+#define NVE4_SU_INFO_DIM_Z 0x18
+#define NVE4_SU_INFO_UNK1C 0x1c
+#define NVE4_SU_INFO_WIDTH 0x20
+#define NVE4_SU_INFO_HEIGHT 0x24
+#define NVE4_SU_INFO_DEPTH 0x28
+#define NVE4_SU_INFO_TARGET 0x2c
+#define NVE4_SU_INFO_CALL 0x30
+#define NVE4_SU_INFO_RAW_X 0x34
+#define NVE4_SU_INFO_MS_X 0x38
+#define NVE4_SU_INFO_MS_Y 0x3c
+
+#define NVE4_SU_INFO__STRIDE 0x40
+
+#define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
+#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
+#define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
+
+// Pick the SUCLAMP subop to use for coordinate component c of a surface
+// access, based on the coordinate layout of the access's texture target.
+static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
+{
+   switch (su->tex.target.getEnum()) {
+   case TEX_TARGET_BUFFER:
+      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
+   case TEX_TARGET_1D_ARRAY:
+      // the layer index (c == 1) is clamped differently from x
+      return (c == 1) ? NV50_IR_SUBOP_SUCLAMP_PL(0, 2)
+                      : NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+   case TEX_TARGET_2D:
+   case TEX_TARGET_2D_MS:
+      return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
+   case TEX_TARGET_RECT:
+   case TEX_TARGET_1D:
+   case TEX_TARGET_2D_ARRAY:
+   case TEX_TARGET_2D_MS_ARRAY:
+   case TEX_TARGET_3D:
+   case TEX_TARGET_CUBE:
+   case TEX_TARGET_CUBE_ARRAY:
+      return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+// Rewrite a multisampled surface access into its non-MS equivalent:
+// shift x/y left by per-surface factors from c[] (presumably log2 of the
+// sample counts per dimension — TODO confirm), add the per-sample x/y
+// offsets from the MS info table, then drop the sample-index source and
+// demote the target to the corresponding non-MS target type.
+// Non-MS targets are left untouched.
+void
+NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
+{
+   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
+   const int arg = tex->tex.target.getArgCount();
+
+   if (tex->tex.target == TEX_TARGET_2D_MS)
+      tex->tex.target = TEX_TARGET_2D;
+   else
+   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
+      tex->tex.target = TEX_TARGET_2D_ARRAY;
+   else
+      return;
+
+   Value *x = tex->getSrc(0);
+   Value *y = tex->getSrc(1);
+   Value *s = tex->getSrc(arg - 1); // sample index is the last argument
+
+   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
+
+   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+
+   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
+   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
+
+   // index the MS info table: 8 bytes (an x/y offset pair) per sample,
+   // sample index masked to 3 bits
+   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
+   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
+
+   Value *dx = loadMsInfo32(ts, 0x0);
+   Value *dy = loadMsInfo32(ts, 0x4);
+
+   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
+   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
+
+   tex->setSrc(0, tx);
+   tex->setSrc(1, ty);
+   tex->moveSources(arg, -1); // drop the now-consumed sample index source
+}
+
+// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
+// They're computed from the coordinates using the surface info in c[] space.
+// On return the instruction's sources are: src0 = 64-bit address,
+// src1 = 32-bit format info (0 for raw access), src2 = bounds predicate.
+void
+NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
+{
+   Instruction *insn;
+   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
+   const bool raw =
+      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
+   const int idx = su->tex.r;
+   const int dim = su->tex.target.getDim();
+   const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
+   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
+   int c;
+   Value *zero = bld.mkImm(0);
+   Value *p1 = NULL;
+   Value *v;
+   Value *src[3];
+   Value *bf, *eau, *off;
+   Value *addr, *pred;
+
+   off = bld.getScratch(4);
+   bf = bld.getScratch(4);
+   addr = bld.getSSA(8);
+   pred = bld.getScratch(1, FILE_PREDICATE);
+
+   bld.setPosition(su, false);
+
+   adjustCoordinatesMS(su);
+
+   // calculate clamped coordinates
+   for (c = 0; c < arg; ++c) {
+      src[c] = bld.getScratch();
+      if (c == 0 && raw)
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+      else
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
+         ->subOp = getSuClampSubOp(su, c);
+   }
+   for (; c < 3; ++c)
+      src[c] = zero;
+
+   // set predicate output: the flag output of the clamp (resp. the layer
+   // clamp for arrays) feeds the bounds predicate
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      src[0]->getInsn()->setFlagsDef(1, pred);
+   } else
+   if (su->tex.target.isArray()) {
+      p1 = bld.getSSA(1, FILE_PREDICATE);
+      src[dim]->getInsn()->setFlagsDef(1, p1);
+   }
+
+   // calculate pixel offset
+   if (dim == 1) {
+      if (su->tex.target != TEX_TARGET_BUFFER)
+         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
+   } else
+   if (dim == 3) {
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
+         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
+         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
+   } else {
+      assert(dim == 2);
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
+         ->subOp = su->tex.target.isArray() ?
+         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+   }
+
+   // calculate effective address part 1
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      if (raw) {
+         bf = src[0];
+      } else {
+         // scale the element index by the format's byte size
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
+            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
+      }
+   } else {
+      Value *y = src[1];
+      Value *z = src[2];
+      uint16_t subOp = 0;
+
+      switch (dim) {
+      case 1:
+         y = zero;
+         z = zero;
+         break;
+      case 2:
+         z = off;
+         if (!su->tex.target.isArray()) {
+            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+            subOp = NV50_IR_SUBOP_SUBFM_3D;
+         }
+         break;
+      default:
+         subOp = NV50_IR_SUBOP_SUBFM_3D;
+         assert(dim == 3);
+         break;
+      }
+      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
+      insn->subOp = subOp;
+      insn->setFlagsDef(1, pred);
+   }
+
+   // part 2
+   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      eau = v;
+   } else {
+      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
+   }
+   // add array layer offset
+   if (su->tex.target.isArray()) {
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+      if (dim == 1)
+         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
+            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
+      else
+         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
+            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
+      // combine predicates
+      assert(p1);
+      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
+   }
+
+   if (atom) {
+      Value *lo = bf;
+      if (su->tex.target == TEX_TARGET_BUFFER) {
+         lo = zero;
+         bld.mkMov(off, bf);
+      }
+      // bf == g[] address & 0xff
+      // eau == g[] address >> 8
+      bld.mkOp3(OP_PERMT, TYPE_U32,  bf, lo, bld.loadImm(NULL, 0x6540), eau);
+      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
+   } else
+   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
+      // Convert from u32 to u8 address format, which is what the library code
+      // doing SULDP currently uses.
+      // XXX: can SUEAU do this ?
+      // XXX: does it matter that we don't mask high bytes in bf ?
+      // Grrr.
+      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
+      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
+   }
+
+   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
+
+   if (atom && su->tex.target == TEX_TARGET_BUFFER)
+      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
+
+   // let's just set it 0 for raw access and hope it works
+   v = raw ?
+      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+
+   // get rid of old coordinate sources, make space for fmt info and predicate
+   su->moveSources(arg, 3 - arg);
+   // set 64 bit address and 32-bit format sources
+   su->setSrc(0, addr);
+   su->setSrc(1, v);
+   su->setSrc(2, pred);
+}
+
+// Lower a surface access (SULD/SUST/SURED) for nve4: compute the generic
+// address, format and predicate sources, then either replace a formatted
+// load (OP_SULDP) with an indirect call into the driver's library blob,
+// turn a reduction into a predicated global OP_ATOM, or just fix up the
+// source type for the remaining store/raw-load cases.
+void
+NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
+{
+   processSurfaceCoordsNVE4(su);
+
+   // Who do we hate more ? The person who decided that nvc0's SULD doesn't
+   // have to support conversion or the person who decided that, in OpenCL,
+   // you don't have to specify the format here like you do in OpenGL ?
+
+   if (su->op == OP_SULDP) {
+      // We don't patch shaders. Ever.
+      // You get an indirect call to our library blob here.
+      // But at least it's uniform.
+      FlowInstruction *call;
+      LValue *p[3];
+      LValue *r[5];
+      uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
+
+      // the library code expects its inputs in fixed GPRs/predicates
+      for (int i = 0; i < 4; ++i)
+         (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
+      for (int i = 0; i < 3; ++i)
+         (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
+      (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
+
+      bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
+      bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
+      bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
+      bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
+      bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
+
+      call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
+
+      // call target address is read from the per-surface info in c[]
+      call->indirect = 1;
+      call->absolute = 1;
+      call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
+                                   prog->driver->io.suInfoBase + base));
+      call->setSrc(1, r[2]);
+      call->setSrc(2, r[4]);
+      for (int i = 0; i < 3; ++i)
+         call->setSrc(3 + i, p[i]);
+      for (int i = 0; i < 4; ++i) {
+         call->setDef(i, r[i]);
+         bld.mkMov(su->getDef(i), r[i]);
+      }
+      call->setDef(4, p[1]);
+      delete_Instruction(bld.getProgram(), su);
+      // su was just deleted; returning here avoids the use-after-free the
+      // fall-through code below would otherwise commit (it read su->op and
+      // wrote su->sType on the freed instruction)
+      return;
+   }
+
+   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
+      // FIXME: for out of bounds access, destination value will be undefined !
+      Value *pred = su->getSrc(2);
+      CondCode cc = CC_NOT_P;
+      if (su->getPredicate()) {
+         // combine the bounds predicate with the instruction's own predicate
+         pred = bld.getScratch(1, FILE_PREDICATE);
+         cc = su->cc;
+         if (cc == CC_NOT_P) {
+            bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
+         } else {
+            bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
+            pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
+         }
+      }
+      Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
+      red->subOp = su->subOp;
+      if (!gMemBase)
+         gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
+      red->setSrc(0, gMemBase);
+      red->setSrc(1, su->getSrc(3));
+      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
+         red->setSrc(2, su->getSrc(4));
+      red->setIndirect(0, 0, su->getSrc(0));
+      red->setPredicate(cc, pred);
+      delete_Instruction(bld.getProgram(), su);
+      handleCasExch(red, true);
+   } else {
+      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
+   }
+}
+
+// Lower a write to a system value: $sregs are not writeable, so replace
+// the WRSV with an OP_EXPORT store to the output address the target maps
+// the system value to. Returns false (unhandled) when the target reports
+// no valid output address (>= 0x400).
+bool
+NVC0LoweringPass::handleWRSV(Instruction *i)
+{
+   Instruction *st;
+   Symbol *sym;
+   uint32_t addr;
+
+   // must replace, $sreg are not writeable
+   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
+                    i->getSrc(1));
+   st->perPatch = i->perPatch;
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Read tessellation coordinate component c (0 = u, 1 = v, 2 = w) into dst.
+// u and v are fetched from fixed shader-output slots (0x2f0/0x2f4) indexed
+// by the lane id; w is not stored and is reconstructed as 1.0 - u - v.
+void
+NVC0LoweringPass::readTessCoord(LValue *dst, int c)
+{
+   Value *laneid = bld.getSSA();
+   Value *x, *y;
+
+   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
+
+   if (c == 0) {
+      x = dst;
+      y = NULL;
+   } else
+   if (c == 1) {
+      x = NULL;
+      y = dst;
+   } else {
+      // need both u and v to compute w
+      assert(c == 2);
+      x = bld.getSSA();
+      y = bld.getSSA();
+   }
+   if (x)
+      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
+   if (y)
+      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
+
+   if (c == 2) {
+      // w = 1.0 - (u + v)
+      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
+      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
+   }
+}
+
+// Lower a system-value read. System values with no input address mapping
+// (>= 0x400) stay as plain $sreg moves (with a special case for the unused
+// 4th vector component). Values with an input address are turned into the
+// appropriate interp/fetch/load sequence and the RDSV is removed.
+bool
+NVC0LoweringPass::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   const SVSemantic sv = sym->reg.data.sv.sv;
+   Value *vtx = NULL;
+   Instruction *ld;
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+
+   if (addr >= 0x400) {
+      // mov $sreg
+      if (sym->reg.data.sv.index == 3) {
+         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
+         i->op = OP_MOV;
+         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
+      }
+      return true;
+   }
+
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+   {
+      Value *face = i->getDef(0);
+      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
+      if (i->dType == TYPE_F32) {
+         // turn the integer front-facing flag into +1.0f / -1.0f
+         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+      }
+   }
+      break;
+   case SV_TESS_COORD:
+      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
+      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
+      break;
+   case SV_NTID:
+   case SV_NCTAID:
+   case SV_GRIDID:
+      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
+      if (sym->reg.data.sv.index == 3) {
+         i->op = OP_MOV;
+         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
+         return true;
+      }
+      // on gk104+ these come from the driver's grid info constant buffer
+      addr += prog->driver->prop.cp.gridInfoBase;
+      bld.mkLoad(TYPE_U32, i->getDef(0),
+                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
+      break;
+   default:
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
+      ld = bld.mkFetch(i->getDef(0), i->dType,
+                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
+      ld->perPatch = i->perPatch;
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Lower a floating-point DIV into src0 * rcp(src1); integer division is
+// handled elsewhere and left untouched here.
+bool
+NVC0LoweringPass::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true;
+   // emit the reciprocal immediately before the instruction being lowered
+   bld.setPosition(i, false);
+   Instruction *recip =
+      bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   i->setSrc(1, recip->getDef(0));
+   i->op = OP_MUL;
+   return true;
+}
+
+// Lower f32 MOD into src0 - src1 * trunc(src0 / src1), approximating the
+// division with RCP + MUL. Other types are left untouched.
+bool
+NVC0LoweringPass::handleMOD(Instruction *i)
+{
+   if (i->dType != TYPE_F32)
+      return true;
+   // t = trunc(src0 * rcp(src1)) * src1
+   LValue *t = bld.getScratch();
+   bld.mkOp1(OP_RCP, TYPE_F32, t, i->getSrc(1));
+   bld.mkOp2(OP_MUL, TYPE_F32, t, i->getSrc(0), t);
+   bld.mkOp1(OP_TRUNC, TYPE_F32, t, t);
+   bld.mkOp2(OP_MUL, TYPE_F32, t, i->getSrc(1), t);
+   // result = src0 - t
+   i->op = OP_SUB;
+   i->setSrc(1, t);
+   return true;
+}
+
+// Lower SQRT into src0 * rsq(src0).
+bool
+NVC0LoweringPass::handleSQRT(Instruction *i)
+{
+   Value *rsq =
+      bld.mkOp1v(OP_RSQ, TYPE_F32, bld.getSSA(), i->getSrc(0));
+   i->setSrc(1, rsq);
+   i->op = OP_MUL;
+
+   return true;
+}
+
+// Lower POW into exp2(src1 * log2(src0)), going through the PREEX2 fixup
+// the hardware EX2 requires.
+bool
+NVC0LoweringPass::handlePOW(Instruction *i)
+{
+   LValue *v = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, v, i->getSrc(0));
+   // dnz: flush denormal/NaN products to zero in the multiply
+   bld.mkOp2(OP_MUL, TYPE_F32, v, i->getSrc(1), v)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, v, v);
+
+   i->setSrc(0, v);
+   i->setSrc(1, NULL);
+   i->op = OP_EX2;
+
+   return true;
+}
+
+// Lower an EXPORT. Fragment shader outputs become final MOVs into fixed
+// result GPRs (derived from the output offset); geometry shader exports
+// get the current emit address as an extra indirect source. Returns false
+// for indirect FP outputs, which are not handled here.
+bool
+NVC0LoweringPass::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      int id = i->getSrc(0)->reg.data.offset / 4;
+
+      if (i->src(0).isIndirect(0)) // TODO, ugly
+         return false;
+      i->op = OP_MOV;
+      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+      i->src(0).set(i->src(1));
+      i->setSrc(1, NULL);
+      i->setDef(0, new_LValue(func, FILE_GPR));
+      i->getDef(0)->reg.data.id = id;
+
+      prog->maxGPR = MAX2(prog->maxGPR, id);
+   } else
+   if (prog->getType() == Program::TYPE_GEOMETRY) {
+      i->setIndirect(0, 1, gpEmitAddress);
+   }
+   return true;
+}
+
+// Lower geometry shader EMIT/RESTART. A RESTART directly following an EMIT
+// is folded into it (EMIT_RESTART subop); otherwise the instruction is
+// rewritten to consume and update the running emit address.
+bool
+NVC0LoweringPass::handleOUT(Instruction *i)
+{
+   if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
+      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
+      delete_Instruction(prog, i);
+   } else {
+      assert(gpEmitAddress);
+      i->setDef(0, gpEmitAddress);
+      // keep the stream id (if any) as src1, emit address becomes src0
+      if (i->srcExists(0))
+         i->setSrc(1, i->getSrc(0));
+      i->setSrc(0, gpEmitAddress);
+   }
+   return true;
+}
+
+// Generate a binary predicate if an instruction is predicated by
+// e.g. an f32 value: compare the non-predicate value against 0 into a new
+// FILE_PREDICATE register and predicate the instruction on that instead.
+void
+NVC0LoweringPass::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *pdst;
+
+   if (!pred || pred->reg.file == FILE_PREDICATE)
+      return;
+   pdst = new_LValue(func, FILE_PREDICATE);
+
+   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
+   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
+
+   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, bld.mkImm(0), pred);
+
+   insn->setPredicate(insn->cc, pdst);
+}
+
+//
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// Main per-instruction dispatcher of the lowering pass: routes each opcode
+// to its handler and performs the small rewrites inline.
+bool
+NVC0LoweringPass::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+
+   // make sure the predicate (if any) lives in a predicate register
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_TXQ:
+     return handleTXQ(i->asTex());
+   case OP_EX2:
+      // hardware EX2 needs its input preprocessed by PREEX2
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_MOD:
+      return handleMOD(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_EMIT:
+   case OP_RESTART:
+      return handleOUT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_LOAD:
+      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+         if (prog->getType() == Program::TYPE_COMPUTE) {
+            // compute shader inputs live in c0[]
+            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+            i->getSrc(0)->reg.fileIndex = 0;
+         } else {
+            i->op = OP_VFETCH;
+            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+         }
+      }
+      break;
+   case OP_ATOM:
+   {
+      // g[] atomics need a cache control op, see handleCasExch
+      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
+      handleATOM(i);
+      handleCasExch(i, cctl);
+   }
+      break;
+   case OP_SULDB:
+   case OP_SULDP:
+   case OP_SUSTB:
+   case OP_SUSTP:
+   case OP_SUREDB:
+   case OP_SUREDP:
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+         handleSurfaceOpNVE4(i->asTex());
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+// Run the legalization pass appropriate for the given code generation
+// stage; unknown stages report failure.
+bool
+TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
+{
+   switch (stage) {
+   case CG_STAGE_PRE_SSA: {
+      NVC0LoweringPass pass(prog);
+      return pass.run(prog, false, true);
+   }
+   case CG_STAGE_SSA: {
+      NVC0LegalizeSSA pass;
+      return pass.run(prog, false, true);
+   }
+   case CG_STAGE_POST_RA: {
+      NVC0LegalizePostRA pass(prog);
+      return pass.run(prog, false, true);
+   }
+   default:
+      return false;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
new file mode 100644
index 00000000000..99bd2bf1a4d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -0,0 +1,2464 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+#include "codegen/nv50_ir_build_util.h"
+
+extern "C" {
+#include "util/u_math.h"
+}
+
+namespace nv50_ir {
+
+// Whether this instruction has no effect at the machine level and can be
+// dropped: pseudo ops (PHI/SPLIT/MERGE/CONSTRAINT), non-fixed NOPs,
+// instructions whose (register-allocated) result is unused, and MOV/UNION
+// whose destination already equals the source(s).
+bool
+Instruction::isNop() const
+{
+   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
+      return true;
+   if (terminator || join) // XXX: should terminator imply flow ?
+      return false;
+   if (op == OP_ATOM)
+      return false;
+   if (!fixed && op == OP_NOP)
+      return true;
+
+   // def 0 not assigned a register: result unused
+   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
+      for (int d = 1; defExists(d); ++d)
+         if (def(d).rep()->reg.data.id >= 0)
+            WARN("part of vector result is unused !\n");
+      return true;
+   }
+
+   if (op == OP_MOV || op == OP_UNION) {
+      if (!getDef(0)->equals(getSrc(0)))
+         return false;
+      if (op == OP_UNION)
+         if (!def(0).rep()->equals(getSrc(1)))
+            return false;
+      return true;
+   }
+
+   return false;
+}
+
+// Whether this instruction can be removed without changing observable
+// behavior: it has no memory/export/system-value side effect, none of its
+// definitions are referenced or bound to a fixed register, it does not
+// affect control flow, and it is not marked fixed.
+bool Instruction::isDead() const
+{
+   if (op == OP_STORE ||
+       op == OP_EXPORT ||
+       op == OP_ATOM ||
+       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
+       op == OP_WRSV)
+      return false;
+
+   for (int d = 0; defExists(d); ++d)
+      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
+         return false;
+
+   if (terminator || asFlow())
+      return false;
+   if (fixed)
+      return false;
+
+   return true;
+}
+
+// =============================================================================
+
+// Pass that forward-propagates plain register MOVs; see the comment on
+// CopyPropagation::visit for the exact conditions.
+class CopyPropagation : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+// Propagate all MOVs forward to make subsequent optimization easier, except if
+// the sources stem from a phi, in which case we don't want to mess up potential
+// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
+// Only unpredicated, non-fixed MOVs between LValues of the same register
+// file, whose def has no fixed register assignment, are propagated.
+bool
+CopyPropagation::visit(BasicBlock *bb)
+{
+   Instruction *mov, *si, *next;
+
+   for (mov = bb->getEntry(); mov; mov = next) {
+      next = mov->next; // mov may be deleted below
+      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
+         continue;
+      if (mov->getPredicate())
+         continue;
+      if (mov->def(0).getFile() != mov->src(0).getFile())
+         continue;
+      si = mov->getSrc(0)->getInsn();
+      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
+         // propagate
+         mov->def(0).replace(mov->getSrc(0), false);
+         delete_Instruction(prog, mov);
+      }
+   }
+   return true;
+}
+
+// =============================================================================
+
+// Pass that folds loads (const-space, immediates, attribute/shared loads)
+// directly into the memory-operand slot of their users, swapping sources
+// first where that makes folding possible.
+class LoadPropagation : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   void checkSwapSrc01(Instruction *);
+
+   // predicates classifying the kinds of defining loads we can fold
+   bool isCSpaceLoad(Instruction *);
+   bool isImmd32Load(Instruction *);
+   bool isAttribOrSharedLoad(Instruction *);
+};
+
+// Is this a load from constant (c[]) space ?
+bool
+LoadPropagation::isCSpaceLoad(Instruction *ld)
+{
+   if (!ld || ld->op != OP_LOAD)
+      return false;
+   return ld->src(0).getFile() == FILE_MEMORY_CONST;
+}
+
+// Is this a MOV of a 32-bit immediate value ?
+bool
+LoadPropagation::isImmd32Load(Instruction *ld)
+{
+   return ld && ld->op == OP_MOV && typeSizeof(ld->dType) == 4 &&
+      ld->src(0).getFile() == FILE_IMMEDIATE;
+}
+
+// Is this a vertex-attribute fetch or a load from shader input / shared
+// memory ?
+bool
+LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
+{
+   return ld &&
+      (ld->op == OP_VFETCH ||
+       (ld->op == OP_LOAD &&
+        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
+         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
+}
+
+// For commutative instructions (and SET/SLCT, whose condition code can be
+// adjusted), swap src0 and src1 so that the operand most likely foldable
+// into the instruction's memory/immediate slot ends up in src1:
+// c[] loads first, then 32-bit immediates, then attribute/shared loads.
+void
+LoadPropagation::checkSwapSrc01(Instruction *insn)
+{
+   if (!prog->getTarget()->getOpInfo(insn).commutative)
+      if (insn->op != OP_SET && insn->op != OP_SLCT)
+         return;
+   if (insn->src(1).getFile() != FILE_GPR)
+      return;
+
+   Instruction *i0 = insn->getSrc(0)->getInsn();
+   Instruction *i1 = insn->getSrc(1)->getInsn();
+
+   if (isCSpaceLoad(i0)) {
+      if (!isCSpaceLoad(i1))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else
+   if (isImmd32Load(i0)) {
+      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else
+   if (isAttribOrSharedLoad(i1)) {
+      if (!isAttribOrSharedLoad(i0))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else {
+      return;
+   }
+
+   // a swap happened: fix up the condition code accordingly
+   if (insn->op == OP_SET)
+      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
+   else
+   if (insn->op == OP_SLCT)
+      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
+}
+
+// Fold defining loads/immediate MOVs into the sources of their users where
+// the target allows it, deleting the load once it has no remaining uses.
+bool
+LoadPropagation::visit(BasicBlock *bb)
+{
+   const Target *targ = prog->getTarget();
+   Instruction *next;
+
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
+         continue;
+
+      if (i->srcExists(1))
+         checkSwapSrc01(i);
+
+      for (int s = 0; i->srcExists(s); ++s) {
+         Instruction *ld = i->getSrc(s)->getInsn();
+
+         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
+            continue;
+         if (!targ->insnCanLoad(i, s, ld))
+            continue;
+
+         // propagate !
+         i->setSrc(s, ld->getSrc(0));
+         if (ld->src(0).isIndirect(0))
+            i->setIndirect(s, 0, ld->getIndirect(0, 0));
+
+         if (ld->getDef(0)->refCount() == 0)
+            delete_Instruction(prog, ld);
+      }
+   }
+   return true;
+}
+
+// =============================================================================
+
+// Evaluate constant expressions.
+class ConstantFolding : public Pass
+{
+public:
+   // repeatedly run the pass until no more folds happen (max 2 iterations)
+   bool foldAll(Program *);
+
+private:
+   virtual bool visit(BasicBlock *);
+
+   // fold an instruction with two immediate sources
+   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
+   // fold/simplify an instruction with one immediate source (src s)
+   void opnd(Instruction *, ImmediateValue&, int s);
+
+   // fold single-source float ops (NEG, RCP, SIN, ...)
+   void unary(Instruction *, const ImmediateValue&);
+
+   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
+
+   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
+   CmpInstruction *findOriginForTestWithZero(Value *);
+
+   // number of folds performed in the current run
+   unsigned int foldCount;
+
+   BuildUtil bld;
+};
+
+// TODO: remember generated immediates and only revisit these
+// Run constant folding until a fixed point (bounded at 2 iterations, since
+// a fold may create new immediate sources for other instructions).
+bool
+ConstantFolding::foldAll(Program *prog)
+{
+   unsigned int iterCount = 0;
+   do {
+      foldCount = 0;
+      if (!run(prog))
+         return false;
+   } while (foldCount && ++iterCount < 2);
+   return true;
+}
+
+// Dispatch each instruction to expr() when both of its first two sources
+// are immediates, or to opnd() when only one is. MOVs and CALLs are never
+// folded.
+bool
+ConstantFolding::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next; // i may be rewritten, but is never deleted here
+      if (i->op == OP_MOV || i->op == OP_CALL)
+         continue;
+
+      ImmediateValue src0, src1;
+
+      if (i->srcExists(1) &&
+          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
+         expr(i, src0, src1);
+      else
+      if (i->srcExists(0) && i->src(0).getImmediate(src0))
+         opnd(i, src0, 0);
+      else
+      if (i->srcExists(1) && i->src(1).getImmediate(src1))
+         opnd(i, src1, 1);
+   }
+   return true;
+}
+
+// Walk back through value-preserving ops (MOV, and NEG/ABS/CVT that keep
+// the type) from 'value' to the OP_SET that produced it, if any.
+// Returns NULL when the chain is broken (type change, unhandled op, or a
+// value with no defining instruction).
+CmpInstruction *
+ConstantFolding::findOriginForTestWithZero(Value *value)
+{
+   if (!value)
+      return NULL;
+   Instruction *insn = value->getInsn();
+
+   while (insn && insn->op != OP_SET) {
+      Instruction *next = NULL;
+      switch (insn->op) {
+      case OP_NEG:
+      case OP_ABS:
+      case OP_CVT:
+         next = insn->getSrc(0)->getInsn();
+         // bail out on undefined sources too, not just type changes: the
+         // original dereferenced next->dType without checking for NULL
+         if (!next || insn->sType != next->dType)
+            return NULL;
+         break;
+      case OP_MOV:
+         next = insn->getSrc(0)->getInsn();
+         break;
+      default:
+         return NULL;
+      }
+      insn = next;
+   }
+   return insn ? insn->asCmp() : NULL;
+}
+
+// Apply this modifier's ABS/NEG/SAT/NOT bits directly to an immediate,
+// interpreting it according to its register type.
+void
+Modifier::applyTo(ImmediateValue& imm) const
+{
+   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
+      return;
+   switch (imm.reg.type) {
+   case TYPE_F32:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.f32 = -imm.reg.data.f32;
+      if (bits & NV50_IR_MOD_SAT) {
+         // clamp to [0.0, 1.0]
+         if (imm.reg.data.f32 < 0.0f)
+            imm.reg.data.f32 = 0.0f;
+         else
+         if (imm.reg.data.f32 > 1.0f)
+            imm.reg.data.f32 = 1.0f;
+      }
+      assert(!(bits & NV50_IR_MOD_NOT));
+      break;
+
+   case TYPE_S8: // NOTE: will be extended
+   case TYPE_S16:
+   case TYPE_S32:
+   case TYPE_U8: // NOTE: treated as signed
+   case TYPE_U16:
+   case TYPE_U32:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
+            imm.reg.data.s32 : -imm.reg.data.s32;
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.s32 = -imm.reg.data.s32;
+      if (bits & NV50_IR_MOD_NOT)
+         imm.reg.data.s32 = ~imm.reg.data.s32;
+      break;
+
+   case TYPE_F64:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.f64 = fabs(imm.reg.data.f64);
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.f64 = -imm.reg.data.f64;
+      if (bits & NV50_IR_MOD_SAT) {
+         // clamp to [0.0, 1.0]
+         if (imm.reg.data.f64 < 0.0)
+            imm.reg.data.f64 = 0.0;
+         else
+         if (imm.reg.data.f64 > 1.0)
+            imm.reg.data.f64 = 1.0;
+      }
+      assert(!(bits & NV50_IR_MOD_NOT));
+      break;
+
+   default:
+      assert(!"invalid/unhandled type");
+      imm.reg.data.u64 = 0;
+      break;
+   }
+}
+
+// Return the standalone operation equivalent to this modifier: one single
+// bit maps to the matching op, no bits to a plain MOV, and any combination
+// of bits requires a full CVT.
+operation
+Modifier::getOp() const
+{
+   switch (bits) {
+   case NV50_IR_MOD_ABS: return OP_ABS;
+   case NV50_IR_MOD_NEG: return OP_NEG;
+   case NV50_IR_MOD_SAT: return OP_SAT;
+   case NV50_IR_MOD_NOT: return OP_NOT;
+   case 0:
+      return OP_MOV;
+   default:
+      return OP_CVT;
+   }
+}
+
+// Fold an instruction whose first two sources are both immediates: compute
+// the result on the host and rewrite the instruction into a MOV of the
+// result (or, for MAD/FMA, an ADD of src2 and the folded product).
+// Unsupported opcode/type combinations simply return without changing i.
+void
+ConstantFolding::expr(Instruction *i,
+                      ImmediateValue &imm0, ImmediateValue &imm1)
+{
+   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
+   struct Storage res;
+
+   memset(&res.data, 0, sizeof(res.data));
+
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA:
+   case OP_MUL:
+      if (i->dnz && i->dType == TYPE_F32) {
+         // dnz: denormals/NaNs are flushed to zero before multiplying
+         if (!isfinite(a->data.f32))
+            a->data.f32 = 0.0f;
+         if (!isfinite(b->data.f32))
+            b->data.f32 = 0.0f;
+      }
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
+      case TYPE_S32:
+      case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_DIV:
+      // note: division by zero folds to 0 here (res was zeroed above)
+      if (b->data.u32 == 0)
+         break;
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
+      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
+      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_ADD:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
+      case TYPE_S32:
+      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_POW:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
+      default:
+         return;
+      }
+      break;
+   case OP_MAX:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
+      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
+      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
+      default:
+         return;
+      }
+      break;
+   case OP_MIN:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
+      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
+      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
+      default:
+         return;
+      }
+      break;
+   case OP_AND:
+      res.data.u64 = a->data.u64 & b->data.u64;
+      break;
+   case OP_OR:
+      res.data.u64 = a->data.u64 | b->data.u64;
+      break;
+   case OP_XOR:
+      res.data.u64 = a->data.u64 ^ b->data.u64;
+      break;
+   case OP_SHL:
+      res.data.u32 = a->data.u32 << b->data.u32;
+      break;
+   case OP_SHR:
+      switch (i->dType) {
+      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
+      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_SLCT:
+      // only foldable when both selectees are equal
+      if (a->data.u32 != b->data.u32)
+         return;
+      res.data.u32 = a->data.u32;
+      break;
+   default:
+      return;
+   }
+   ++foldCount;
+
+   i->src(0).mod = Modifier(0);
+   i->src(1).mod = Modifier(0);
+
+   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
+   i->setSrc(1, NULL);
+
+   i->getSrc(0)->reg.data = res.data;
+
+   if (i->op == OP_MAD || i->op == OP_FMA) {
+      // MAD/FMA with folded product becomes src2 + result; if src2 is an
+      // immediate as well, recurse to fold the remaining ADD
+      i->op = OP_ADD;
+
+      i->setSrc(1, i->getSrc(0));
+      i->src(1).mod = i->src(2).mod;
+      i->setSrc(0, i->getSrc(2));
+      i->setSrc(2, NULL);
+
+      ImmediateValue src0;
+      if (i->src(0).getImmediate(src0))
+         expr(i, src0, *i->getSrc(1)->asImm());
+   } else {
+      i->op = OP_MOV;
+   }
+}
+
+// Fold a single-source f32 operation whose operand is the immediate @imm
+// into an OP_MOV of the computed constant. Non-f32 results are left alone.
+void
+ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
+{
+   if (i->dType != TYPE_F32)
+      return;
+
+   const float x = imm.reg.data.f32;
+   float value;
+
+   switch (i->op) {
+   case OP_NEG: value = -x; break;
+   case OP_ABS: value = fabsf(x); break;
+   case OP_RCP: value = 1.0f / x; break;
+   case OP_RSQ: value = 1.0f / sqrtf(x); break;
+   case OP_LG2: value = log2f(x); break;
+   case OP_EX2: value = exp2f(x); break;
+   case OP_SIN: value = sinf(x); break;
+   case OP_COS: value = cosf(x); break;
+   case OP_SQRT: value = sqrtf(x); break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      // these should be handled in subsequent OP_SIN/COS/EX2
+      value = x;
+      break;
+   default:
+      return;
+   }
+
+   i->op = OP_MOV;
+   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), value));
+   i->src(0).mod = Modifier(0);
+}
+
+// Collapse chains of f32 multiplies when source @s of @mul2 is the
+// immediate @imm2:
+//   a = mul r, imm1;  d = mul a, imm2  ->  d = mul r, (imm1 * imm2)
+// or, where the target supports it, fold the constant into a hardware
+// post-multiply factor (mul_x_imm) on the other MUL.
+void
+ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
+                                        const int s, ImmediateValue& imm2)
+{
+   const int t = s ? 0 : 1; // index of the non-immediate source of mul2
+   Instruction *insn;
+   Instruction *mul1 = NULL; // mul1 before mul2
+   int e = 0; // exponent of the post-multiply factor, filled in by target
+   float f = imm2.reg.data.f32;
+   ImmediateValue imm1;
+
+   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
+
+   // case 1: the non-immediate operand is a single-use f32 MUL feeding us
+   if (mul2->getSrc(t)->refCount() == 1) {
+      insn = mul2->getSrc(t)->getInsn();
+      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
+         mul1 = insn;
+      if (mul1 && !mul1->saturate) {
+         int s1;
+
+         if (mul1->src(s1 = 0).getImmediate(imm1) ||
+             mul1->src(s1 = 1).getImmediate(imm1)) {
+            bld.setPosition(mul1, false);
+            // a = mul r, imm1
+            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
+            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
+            mul1->src(s1).mod = Modifier(0);
+            mul2->def(0).replace(mul1->getDef(0), false);
+         } else
+         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
+            // c = mul a, b
+            // d = mul c, imm -> d = mul_x_imm a, b
+            mul1->postFactor = e;
+            mul2->def(0).replace(mul1->getDef(0), false);
+            if (f < 0)
+               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
+         }
+         // mul2 is now dead; its saturate flag moves to the survivor
+         mul1->saturate = mul2->saturate;
+         return;
+      }
+   }
+   // case 2: mul2's result has a single user; try to push our immediate
+   // into that user as a post-multiply factor
+   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
+      // b = mul a, imm
+      // d = mul b, c -> d = mul_x_imm a, c
+      int s2, t2;
+      insn = mul2->getDef(0)->uses.front()->getInsn();
+      if (!insn)
+         return;
+      // NOTE: mul1/mul2 are re-purposed here: mul1 = the original mul2,
+      // mul2 = its (candidate) consumer
+      mul1 = mul2;
+      mul2 = NULL;
+      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
+      t2 = s2 ? 0 : 1;
+      // consumer must be a plain f32 MUL without mods or a 2nd immediate
+      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
+         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
+            mul2 = insn;
+      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
+         mul2->postFactor = e;
+         mul2->setSrc(s2, mul1->src(t));
+         if (f < 0)
+            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
+      }
+   }
+}
+
+// Simplify or strength-reduce instruction @i given that its source @s is
+// the immediate @imm0 (the other source index is @t). May rewrite @i in
+// place or replace it with an emitted sequence and delete it.
+void
+ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
+{
+   const int t = !s;
+   const operation op = i->op; // remembered to detect a change for foldCount
+
+   switch (i->op) {
+   case OP_MUL:
+      if (i->dType == TYPE_F32)
+         tryCollapseChainedMULs(i, s, imm0);
+
+      if (imm0.isInteger(0)) {
+         // x * 0 -> 0
+         i->op = OP_MOV;
+         i->setSrc(0, new_ImmediateValue(prog, 0u));
+         i->src(0).mod = Modifier(0);
+         i->setSrc(1, NULL);
+      } else
+      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
+         // x * (+/-1) -> (-)x, becomes MOV/NEG/... via the modifier's op
+         if (imm0.isNegative())
+            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
+         i->op = i->src(t).mod.getOp();
+         if (s == 0) {
+            i->setSrc(0, i->getSrc(1));
+            i->src(0).mod = i->src(1).mod;
+            i->src(1).mod = 0;
+         }
+         if (i->op != OP_CVT)
+            i->src(0).mod = 0;
+         i->setSrc(1, NULL);
+      } else
+      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
+         // x * (+/-2) -> (-)(x + x)
+         if (imm0.isNegative())
+            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
+         i->op = OP_ADD;
+         i->setSrc(s, i->getSrc(t));
+         i->src(s).mod = i->src(t).mod;
+      } else
+      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
+         // integer x * 2^k -> x << k
+         i->op = OP_SHL;
+         imm0.applyLog2();
+         i->setSrc(0, i->getSrc(t));
+         i->src(0).mod = i->src(t).mod;
+         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
+         i->src(1).mod = 0;
+      }
+      break;
+   case OP_ADD:
+      if (i->usesFlags())
+         break;
+      if (imm0.isInteger(0)) {
+         // x + 0 -> x (kept as CVT if a source modifier must be applied)
+         if (s == 0) {
+            i->setSrc(0, i->getSrc(1));
+            i->src(0).mod = i->src(1).mod;
+         }
+         i->setSrc(1, NULL);
+         i->op = i->src(0).mod.getOp();
+         if (i->op != OP_CVT)
+            i->src(0).mod = Modifier(0);
+      }
+      break;
+
+   case OP_DIV:
+      // only handle integer division by a constant divisor
+      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
+         break;
+      bld.setPosition(i, false);
+      if (imm0.reg.data.u32 == 0) {
+         // division by zero: leave as-is
+         break;
+      } else
+      if (imm0.reg.data.u32 == 1) {
+         i->op = OP_MOV;
+         i->setSrc(1, NULL);
+      } else
+      if (i->dType == TYPE_U32 && imm0.isPow2()) {
+         i->op = OP_SHR;
+         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
+      } else
+      if (i->dType == TYPE_U32) {
+         // unsigned division by invariant integer via multiplication
+         // (Granlund-Montgomery magic number method)
+         Instruction *mul;
+         Value *tA, *tB;
+         const uint32_t d = imm0.reg.data.u32;
+         uint32_t m;
+         int r, sh; // renamed from 'r, s' to avoid shadowing parameter @s
+         uint32_t l = util_logbase2(d);
+         if (((uint32_t)1 << l) < d)
+            ++l; // l = ceil(log2(d))
+         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
+         r = l ? 1 : 0;
+         sh = l ? (l - 1) : 0;
+
+         tA = bld.getSSA();
+         tB = bld.getSSA();
+         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
+                         bld.loadImm(NULL, m));
+         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
+         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
+         tA = bld.getSSA();
+         if (r)
+            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
+         else
+            tA = tB;
+         tB = sh ? bld.getSSA() : i->getDef(0);
+         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
+         if (sh)
+            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(sh));
+
+         delete_Instruction(prog, i);
+      } else
+      if (imm0.reg.data.s32 == -1) {
+         i->op = OP_NEG;
+         i->setSrc(1, NULL);
+      } else {
+         // signed division by constant, magic number method
+         LValue *tA, *tB;
+         LValue *tD;
+         const int32_t d = imm0.reg.data.s32;
+         int32_t m;
+         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
+         if ((1 << l) < abs(d))
+            ++l;
+         if (!l)
+            l = 1;
+         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
+
+         tA = bld.getSSA();
+         tB = bld.getSSA();
+         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
+                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
+         if (l > 1)
+            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
+         else
+            tB = tA;
+         tA = bld.getSSA();
+         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
+         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
+         // round towards zero: correct the quotient by the sign indicator
+         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
+         if (d < 0)
+            // BUGFIX: negate the sign-corrected quotient tD, not the
+            // uncorrected shift result tB (which left the SUB dead and
+            // the result off by one for negative dividends)
+            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);
+
+         delete_Instruction(prog, i);
+      }
+      break;
+
+   case OP_MOD:
+      // unsigned x % 2^k -> x & (2^k - 1)
+      if (i->sType == TYPE_U32 && imm0.isPow2()) {
+         bld.setPosition(i, false);
+         i->op = OP_AND;
+         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
+      }
+      break;
+
+   case OP_SET: // TODO: SET_AND,OR,XOR
+   {
+      // fold a comparison of a SET result against 0 into a single SET
+      // with an adjusted condition code
+      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
+      CondCode cc, ccZ;
+      if (i->src(t).mod != Modifier(0))
+         return;
+      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+         return;
+      cc = si->setCond;
+      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+      if (s == 0)
+         ccZ = reverseCondCode(ccZ);
+      // map "(bool) ccZ 0" onto the original condition (bool is never < 0)
+      switch (ccZ) {
+      case CC_LT: cc = CC_FL; break; // bool < 0 : never
+      case CC_GE: cc = CC_TR; break; // bool >= 0 : always
+      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 : !cond
+      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 : !cond
+      case CC_GT: break; // bool > 0 : cond
+      case CC_NE: break; // bool != 0 : cond
+      default:
+         return;
+      }
+      i->asCmp()->setCond = cc;
+      i->setSrc(0, si->src(0));
+      i->setSrc(1, si->src(1));
+      i->sType = si->sType;
+   }
+   break;
+
+   case OP_SHL:
+   {
+      if (s != 1 || i->src(0).mod != Modifier(0))
+         break;
+      // try to concatenate shifts: (x << a) << b -> x << (a + b)
+      Instruction *si = i->getSrc(0)->getInsn();
+      if (!si || si->op != OP_SHL)
+         break;
+      ImmediateValue imm1;
+      if (si->src(1).getImmediate(imm1)) {
+         bld.setPosition(i, false);
+         i->setSrc(0, si->getSrc(0));
+         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+      }
+   }
+   break;
+
+   case OP_ABS:
+   case OP_NEG:
+   case OP_LG2:
+   case OP_RCP:
+   case OP_SQRT:
+   case OP_RSQ:
+   case OP_PRESIN:
+   case OP_SIN:
+   case OP_COS:
+   case OP_PREEX2:
+   case OP_EX2:
+      unary(i, imm0);
+      break;
+   default:
+      return;
+   }
+   if (i->op != op)
+      foldCount++;
+}
+
+// =============================================================================
+
+// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
+class ModifierFolding : public Pass
+{
+private:
+   // Per-basic-block pass body; always returns true (keep iterating).
+   virtual bool visit(BasicBlock *);
+};
+
+// For each instruction in @bb, try to absorb ABS/NEG/NOT producer
+// instructions of its sources into the corresponding ValueRef modifiers,
+// and fold a trailing OP_SAT into its producer's saturate flag.
+bool
+ModifierFolding::visit(BasicBlock *bb)
+{
+   const Target *target = prog->getTarget();
+
+   Instruction *i, *next, *mi;
+   Modifier mod;
+
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      // disabled: dead branch kept from upstream experimentation
+      if (0 && i->op == OP_SUB) {
+         // turn "sub" into "add neg" (do we really want this ?)
+         i->op = OP_ADD;
+         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
+      }
+
+      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+         mi = i->getSrc(s)->getInsn();
+         // skip predicated producers and heavily shared values
+         if (!mi ||
+             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
+            continue;
+         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
+            // s32 -> u32: only ABS/NEG into ADD/MUL is known safe here
+            if ((i->op != OP_ADD &&
+                 i->op != OP_MUL) ||
+                (mi->op != OP_ABS &&
+                 mi->op != OP_NEG))
+               continue;
+         } else
+         if (i->sType != mi->dType) {
+            continue;
+         }
+         // Modifier(op) yields 0 unless mi is a modifier-style op
+         if ((mod = Modifier(mi->op)) == Modifier(0))
+            continue;
+         // combine with any modifier already on mi's own source
+         mod *= mi->src(0).mod;
+
+         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
+            // abs neg [abs] = abs
+            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
+         } else
+         if ((i->op == OP_NEG) && mod.neg()) {
+            assert(s == 0);
+            // neg as both opcode and modifier on same insn is prohibited
+            // neg neg abs = abs, neg neg = identity
+            mod = mod & Modifier(~NV50_IR_MOD_NEG);
+            i->op = mod.getOp();
+            mod = mod & Modifier(~NV50_IR_MOD_ABS);
+            if (mod == Modifier(0))
+               i->op = OP_MOV;
+         }
+
+         if (target->isModSupported(i, s, mod)) {
+            // bypass mi: read its source directly with the folded modifier
+            i->setSrc(s, mi->getSrc(0));
+            i->src(s).mod *= mod;
+         }
+      }
+
+      // fold OP_SAT into its producer's saturate flag where supported
+      if (i->op == OP_SAT) {
+         mi = i->getSrc(0)->getInsn();
+         if (mi &&
+             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
+            mi->saturate = 1;
+            mi->setDef(0, i->getDef(0));
+            delete_Instruction(prog, i);
+         }
+      }
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// MUL + ADD -> MAD/FMA
+// MIN/MAX(a, a) -> a, etc.
+// SLCT(a, b, const) -> cc(const) ? a : b
+// RCP(RCP(a)) -> a
+// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
+class AlgebraicOpt : public Pass
+{
+private:
+   // dispatches on opcode to the handlers below; always returns true
+   virtual bool visit(BasicBlock *);
+
+   void handleABS(Instruction *);       // ABS(SUB(a,b)) -> SAD(a,b,0)
+   bool handleADD(Instruction *);       // ADD -> MAD/SAD where profitable
+   bool tryADDToMADOrSAD(Instruction *, operation toOp);
+   void handleMINMAX(Instruction *);    // MIN/MAX(a, a) -> a
+   void handleRCP(Instruction *);       // RCP(RCP(a)) -> a
+   void handleSLCT(Instruction *);      // SLCT with known condition -> MOV
+   void handleLOGOP(Instruction *);     // AND/OR/XOR of SETs -> SET_op
+   void handleCVT(Instruction *);       // F2I(NEG(SET ...)) -> integer SET
+   void handleSUCLAMP(Instruction *);   // fold ADD immediate into SUCLAMP
+
+   BuildUtil bld;
+};
+
+// Turn ABS(SUB(a, b)) (or ABS(ADD(a, NEG(b)))) into SAD(a, b, 0) on
+// targets that support OP_SAD for the type.
+void
+AlgebraicOpt::handleABS(Instruction *abs)
+{
+   Instruction *sub = abs->getSrc(0)->getInsn();
+   DataType ty;
+   if (!sub ||
+       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
+      return;
+   // expect not to have mods yet, if we do, bail
+   if (sub->src(0).mod || sub->src(1).mod)
+      return;
+   // hidden conversion ?
+   ty = intTypeToSigned(sub->dType);
+   if (abs->dType != abs->sType || ty != abs->sType)
+      return;
+
+   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
+       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
+       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
+      return;
+
+   Value *src0 = sub->getSrc(0);
+   Value *src1 = sub->getSrc(1);
+
+   if (sub->op == OP_ADD) {
+      // an ADD only qualifies if one operand comes from a matching NEG
+      Instruction *neg = sub->getSrc(1)->getInsn();
+      if (neg && neg->op != OP_NEG) {
+         neg = sub->getSrc(0)->getInsn();
+         src0 = sub->getSrc(1);
+      }
+      if (!neg || neg->op != OP_NEG ||
+          neg->dType != neg->sType || neg->sType != ty)
+         return;
+      src1 = neg->getSrc(0);
+   }
+
+   // found ABS(SUB(a, b)): rewrite abs in place as SAD(a, b, 0)
+   abs->moveSources(1, 2); // move sources >=1 up by 2
+   abs->op = OP_SAD;
+   abs->setType(sub->dType);
+   abs->setSrc(0, src0);
+   abs->setSrc(1, src1);
+   bld.setPosition(abs, false);
+   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
+}
+
+// Try to promote an ADD of two GPR values to MAD (preferred) or SAD.
+// Returns true if the instruction was changed.
+bool
+AlgebraicOpt::handleADD(Instruction *add)
+{
+   if (add->getSrc(0)->reg.file != FILE_GPR ||
+       add->getSrc(1)->reg.file != FILE_GPR)
+      return false;
+
+   const Target *targ = prog->getTarget();
+
+   if (targ->isOpSupported(OP_MAD, add->dType) &&
+       tryADDToMADOrSAD(add, OP_MAD))
+      return true;
+   if (targ->isOpSupported(OP_SAD, add->dType) &&
+       tryADDToMADOrSAD(add, OP_SAD))
+      return true;
+   return false;
+}
+
+// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
+// ADD(MUL(a,b), c) -> MAD(a,b,c)
+// Returns true if @add was rewritten as @toOp.
+bool
+AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
+{
+   Value *src0 = add->getSrc(0);
+   Value *src1 = add->getSrc(1);
+   Value *src;
+   int s;
+   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
+   // modifiers other than NEG (none at all for SAD) disqualify the fold
+   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
+   Modifier mod[4];
+
+   // pick the single-use source produced by the matching MUL/SAD
+   if (src0->refCount() == 1 &&
+       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
+      s = 0;
+   else
+   if (src1->refCount() == 1 &&
+       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
+      s = 1;
+   else
+      return false;
+
+   // both producers must live in the same BB as the add
+   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
+       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
+      return false;
+
+   src = add->getSrc(s);
+
+   // a post-multiply factor cannot be carried into MAD/SAD
+   if (src->getInsn()->postFactor)
+      return false;
+   if (toOp == OP_SAD) {
+      // SAD is only absorbable if its addend is the constant 0
+      ImmediateValue imm;
+      if (!src->getInsn()->src(2).getImmediate(imm))
+         return false;
+      if (!imm.isInteger(0))
+         return false;
+   }
+
+   mod[0] = add->src(0).mod;
+   mod[1] = add->src(1).mod;
+   mod[2] = src->getUniqueInsn()->src(0).mod;
+   mod[3] = src->getUniqueInsn()->src(1).mod;
+
+   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
+      return false;
+
+   add->op = toOp;
+   add->subOp = src->getInsn()->subOp; // potentially mul-high
+
+   // the other add operand becomes the MAD/SAD addend
+   add->setSrc(2, add->src(s ? 0 : 1));
+
+   add->setSrc(0, src->getInsn()->getSrc(0));
+   // fold the sign of the absorbed source into the first factor
+   add->src(0).mod = mod[2] ^ mod[s];
+   add->setSrc(1, src->getInsn()->getSrc(1));
+   add->src(1).mod = mod[3];
+
+   return true;
+}
+
+// MIN/MAX(a, a) -> a (kept as a CVT when the value cannot simply be
+// forwarded, so the common source modifier is still applied).
+void
+AlgebraicOpt::handleMINMAX(Instruction *minmax)
+{
+   Value *a = minmax->getSrc(0);
+
+   if (a != minmax->getSrc(1) || a->reg.file != FILE_GPR)
+      return;
+
+   if (minmax->src(0).mod != minmax->src(1).mod) {
+      // TODO:
+      // min(x, -x) = -abs(x)
+      // min(x, -abs(x)) = -abs(x)
+      // min(x, abs(x)) = x
+      // max(x, -abs(x)) = x
+      // max(x, abs(x)) = abs(x)
+      // max(x, -x) = abs(x)
+      return;
+   }
+
+   if (minmax->def(0).mayReplace(minmax->src(0))) {
+      // forward the source and drop the instruction entirely
+      minmax->def(0).replace(minmax->src(0), false);
+      minmax->bb->remove(minmax);
+   } else {
+      // keep a conversion that applies the (identical) source modifier
+      minmax->op = OP_CVT;
+      minmax->setSrc(1, NULL);
+   }
+}
+
+// RCP(RCP(a)) -> a, with the two source modifiers combined into the
+// resulting MOV/NEG/ABS-style operation.
+void
+AlgebraicOpt::handleRCP(Instruction *rcp)
+{
+   Instruction *inner = rcp->getSrc(0)->getUniqueInsn();
+   if (!inner || inner->op != OP_RCP)
+      return;
+
+   const Modifier mod = rcp->src(0).mod * inner->src(0).mod;
+   rcp->op = mod.getOp();
+   rcp->setSrc(0, inner->getSrc(0));
+}
+
+// SLCT(a, b, const) -> MOV of the statically selected source;
+// SLCT(a, a, c) -> MOV a.
+void
+AlgebraicOpt::handleSLCT(Instruction *slct)
+{
+   Value *pred = slct->getSrc(2);
+
+   if (pred->reg.file == FILE_IMMEDIATE) {
+      // condition is a compile-time constant: pick the winning source now
+      if (pred->asImm()->compare(slct->asCmp()->setCond, 0.0f))
+         slct->setSrc(0, slct->getSrc(1));
+   } else if (slct->getSrc(0) != slct->getSrc(1)) {
+      return;
+   }
+   slct->op = OP_MOV;
+   slct->setSrc(1, NULL);
+   slct->setSrc(2, NULL);
+}
+
+// AND/OR(a, a) -> a, and AND/OR/XOR of two SET results -> a SET_AND/
+// SET_OR/SET_XOR chained through a predicate register.
+void
+AlgebraicOpt::handleLOGOP(Instruction *logop)
+{
+   Value *src0 = logop->getSrc(0);
+   Value *src1 = logop->getSrc(1);
+
+   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
+      return;
+
+   if (src0 == src1) {
+      // x & x == x | x == x (XOR would be 0, not handled here)
+      if ((logop->op == OP_AND || logop->op == OP_OR) &&
+          logop->def(0).mayReplace(logop->src(0))) {
+         logop->def(0).replace(logop->src(0), false);
+         delete_Instruction(prog, logop);
+      }
+   } else {
+      // try AND(SET, SET) -> SET_AND(SET)
+      Instruction *set0 = src0->getInsn();
+      Instruction *set1 = src1->getInsn();
+
+      if (!set0 || set0->fixed || !set1 || set1->fixed)
+         return;
+      // make set1 the plain OP_SET (set0 may already be a SET_* chain)
+      if (set1->op != OP_SET) {
+         Instruction *xchg = set0;
+         set0 = set1;
+         set1 = xchg;
+         if (set1->op != OP_SET)
+            return;
+      }
+      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
+                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
+      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
+         return;
+      if (set0->op != OP_SET &&
+          set0->op != OP_SET_AND &&
+          set0->op != OP_SET_OR &&
+          set0->op != OP_SET_XOR)
+         return;
+      // only worthwhile if at least one SET result dies here
+      if (set0->getDef(0)->refCount() > 1 &&
+          set1->getDef(0)->refCount() > 1)
+         return;
+      if (set0->getPredicate() || set1->getPredicate())
+         return;
+      // check that they don't source each other
+      for (int s = 0; s < 2; ++s)
+         if (set0->getSrc(s) == set1->getDef(0) ||
+             set1->getSrc(s) == set0->getDef(0))
+            return;
+
+      // clone and re-emit: set0 produces a predicate consumed by set1
+      set0 = cloneForward(func, set0);
+      set1 = cloneShallow(func, set1);
+      logop->bb->insertAfter(logop, set1);
+      logop->bb->insertAfter(logop, set0);
+
+      set0->dType = TYPE_U8;
+      set0->getDef(0)->reg.file = FILE_PREDICATE;
+      set0->getDef(0)->reg.size = 1;
+      set1->setSrc(2, set0->getDef(0));
+      set1->op = redOp;
+      set1->setDef(0, logop->getDef(0));
+      delete_Instruction(prog, logop);
+   }
+}
+
+// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
+// nv50:
+// F2I(NEG(I2F(ABS(SET))))
+// Replace F2I(NEG(SET with f32 result)) — or the longer nv50 chain
+// F2I(NEG(I2F(ABS(SET)))) — by a single integer-result SET.
+void
+AlgebraicOpt::handleCVT(Instruction *cvt)
+{
+   // only the exact f32 -> s32 conversion without modifiers qualifies
+   if (cvt->sType != TYPE_F32 ||
+       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
+      return;
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
+      return;
+   if (insn->src(0).mod != Modifier(0))
+      return;
+   insn = insn->getSrc(0)->getInsn();
+
+   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
+   if (insn && insn->op == OP_CVT &&
+       insn->dType == TYPE_F32 &&
+       insn->sType == TYPE_S32) {
+      // nv50 pattern: walk back through ABS to the integer SET
+      insn = insn->getSrc(0)->getInsn();
+      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
+          insn->src(0).mod)
+         return;
+      insn = insn->getSrc(0)->getInsn();
+      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
+         return;
+   } else
+   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
+      return;
+   }
+
+   // re-emit the SET with an integer destination in place of the CVT
+   Instruction *bset = cloneShallow(func, insn);
+   bset->dType = TYPE_U32;
+   bset->setDef(0, cvt->getDef(0));
+   cvt->bb->insertAfter(cvt, bset);
+   delete_Instruction(prog, cvt);
+}
+
+// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
+// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm
+// (only when the summed immediate fits the signed 6-bit field)
+void
+AlgebraicOpt::handleSUCLAMP(Instruction *insn)
+{
+   ImmediateValue imm;
+   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
+   int s;
+   Instruction *add;
+
+   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
+
+   // look for ADD (TODO: only count references by non-SUCLAMP)
+   if (insn->getSrc(0)->refCount() > 1)
+      return;
+   add = insn->getSrc(0)->getInsn();
+   if (!add || add->op != OP_ADD ||
+       (add->dType != TYPE_U32 &&
+        add->dType != TYPE_S32))
+      return;
+
+   // look for immediate
+   for (s = 0; s < 2; ++s)
+      if (add->src(s).getImmediate(imm))
+         break;
+   if (s >= 2)
+      return;
+   s = s ? 0 : 1; // s now indexes the non-immediate ADD operand
+   // determine if immediate fits
+   val += imm.reg.data.s32;
+   if (val > 31 || val < -32)
+      return;
+   // determine if other addend fits
+   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
+      return;
+
+   bld.setPosition(insn, false); // make sure bld is init'ed
+   // replace sources: fold the constant into the clamp offset field
+   insn->setSrc(2, bld.mkImm(val));
+   insn->setSrc(0, add->getSrc(s));
+}
+
+// Walk @bb once and dispatch each instruction to the matching algebraic
+// simplification handler. Handlers may delete the current instruction, so
+// the successor is captured before dispatching.
+bool
+AlgebraicOpt::visit(BasicBlock *bb)
+{
+   Instruction *nextInsn;
+
+   for (Instruction *insn = bb->getEntry(); insn; insn = nextInsn) {
+      nextInsn = insn->next;
+
+      switch (insn->op) {
+      case OP_ABS:
+         handleABS(insn);
+         break;
+      case OP_ADD:
+         handleADD(insn);
+         break;
+      case OP_RCP:
+         handleRCP(insn);
+         break;
+      case OP_MIN:
+      case OP_MAX:
+         handleMINMAX(insn);
+         break;
+      case OP_SLCT:
+         handleSLCT(insn);
+         break;
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         handleLOGOP(insn);
+         break;
+      case OP_CVT:
+         handleCVT(insn);
+         break;
+      case OP_SUCLAMP:
+         handleSUCLAMP(insn);
+         break;
+      default:
+         break;
+      }
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// Rewrite the address symbol of @ldst to carry @offset, cloning the
+// symbol first if it is still shared with other instructions.
+static inline void
+updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
+{
+   Value *sym = ldst->getSrc(0);
+   if (sym->reg.data.offset == offset)
+      return;
+   if (sym->refCount() > 1) {
+      ldst->setSrc(0, cloneShallow(fn, sym));
+      sym = ldst->getSrc(0);
+   }
+   sym->reg.data.offset = offset;
+}
+
+// Combine loads and stores, forward stores to loads where possible.
+class MemoryOpt : public Pass
+{
+private:
+   // One tracked load or store: its address (base/indirect/offset),
+   // covered byte size, and the instruction it stands for.
+   class Record
+   {
+   public:
+      Record *next;
+      Instruction *insn;
+      const Value *rel[2];   // indirect address sources, if any
+      const Value *base;
+      int32_t offset;
+      int8_t fileIndex;
+      uint8_t size;          // bytes covered by the access
+      bool locked;           // store may no longer be eliminated/merged
+      Record *prev;
+
+      bool overlaps(const Instruction *ldst) const;
+
+      inline void link(Record **);
+      inline void unlink(Record **);
+      inline void set(const Instruction *ldst);
+   };
+
+public:
+   MemoryOpt();
+
+   // per-file lists of still-valid load/store records
+   Record *loads[DATA_FILE_COUNT];
+   Record *stores[DATA_FILE_COUNT];
+
+   MemoryPool recordPool;
+
+private:
+   virtual bool visit(BasicBlock *);
+   bool runOpt(BasicBlock *);
+
+   Record **getList(const Instruction *);
+
+   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
+
+   // merge @insn into load/store instruction from @rec
+   bool combineLd(Record *rec, Instruction *ld);
+   bool combineSt(Record *rec, Instruction *st);
+
+   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
+   bool replaceLdFromSt(Instruction *ld, Record *stRec);
+   bool replaceStFromSt(Instruction *restrict st, Record *stRec);
+
+   void addRecord(Instruction *ldst);
+   void purgeRecords(Instruction *const st, DataFile);
+   void lockStores(Instruction *const ld);
+   void reset();
+
+private:
+   Record *prevRecord;
+};
+
+// Empty record tables; the pool hands out Record-sized chunks.
+MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
+{
+   for (int i = 0; i < DATA_FILE_COUNT; ++i)
+      loads[i] = stores[i] = NULL;
+   prevRecord = NULL;
+}
+
+// Return every tracked record to the pool and clear both tables.
+void
+MemoryOpt::reset()
+{
+   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
+      for (Record *it = loads[i]; it;) {
+         Record *gone = it;
+         it = it->next;
+         recordPool.release(gone);
+      }
+      loads[i] = NULL;
+
+      for (Record *it = stores[i]; it;) {
+         Record *gone = it;
+         it = it->next;
+         recordPool.release(gone);
+      }
+      stores[i] = NULL;
+   }
+}
+
+// Merge load @ld into the adjacent earlier load @rec->insn, widening the
+// record's access and moving @ld's definitions over. Returns false if the
+// combined access would be unsupported or unaligned.
+bool
+MemoryOpt::combineLd(Record *rec, Instruction *ld)
+{
+   int32_t offRc = rec->offset;
+   int32_t offLd = ld->getSrc(0)->reg.data.offset;
+   int sizeRc = rec->size;
+   int sizeLd = typeSizeof(ld->dType);
+   int size = sizeRc + sizeLd;
+   int d, j;
+
+   if (!prog->getTarget()->
+       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
+      return false;
+   // no unaligned loads
+   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
+       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
+      return false;
+
+   assert(sizeRc + sizeLd <= 16 && offRc != offLd);
+
+   // count the defs of the recorded load
+   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
+
+   if (offLd < offRc) {
+      // @ld covers the lower bytes: shift the record's defs upwards
+      int sz;
+      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
+      // d: nr of definitions in ld
+      // j: nr of definitions in rec->insn, move:
+      for (d = d + j - 1; j > 0; --j, --d)
+         rec->insn->setDef(d, rec->insn->getDef(j - 1));
+
+      // re-point the record's address symbol at the lower offset
+      if (rec->insn->getSrc(0)->refCount() > 1)
+         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
+      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
+
+      d = 0;
+   } else {
+      d = j;
+   }
+   // move definitions of @ld to @rec->insn
+   for (j = 0; sizeLd; ++j, ++d) {
+      sizeLd -= ld->getDef(j)->reg.size;
+      rec->insn->setDef(d, ld->getDef(j));
+   }
+
+   rec->size = size;
+   rec->insn->getSrc(0)->reg.size = size;
+   rec->insn->setType(typeOfSize(size));
+
+   delete_Instruction(prog, ld);
+
+   return true;
+}
+
+// Merge the recorded adjacent store @rec->insn into @st, producing one
+// wider store whose sources are the concatenation of both value lists.
+// Returns false if the widened access is unsupported or unaligned.
+bool
+MemoryOpt::combineSt(Record *rec, Instruction *st)
+{
+   int32_t offRc = rec->offset;
+   int32_t offSt = st->getSrc(0)->reg.data.offset;
+   int sizeRc = rec->size;
+   int sizeSt = typeSizeof(st->dType);
+   int s = sizeSt / 4;
+   int size = sizeRc + sizeSt;
+   int j, k;
+   Value *src[4]; // no modifiers in ValueRef allowed for st
+   Value *extra[3];
+
+   if (!prog->getTarget()->
+       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
+      return false;
+   if (size == 8 && MIN2(offRc, offSt) & 0x7)
+      return false;
+
+   st->takeExtraSources(0, extra); // save predicate and indirect address
+
+   if (offRc < offSt) {
+      // record covers the lower bytes: its values go first
+      // save values from @st
+      for (s = 0; sizeSt; ++s) {
+         sizeSt -= st->getSrc(s + 1)->reg.size;
+         src[s] = st->getSrc(s + 1);
+      }
+      // set record's values as low sources of @st
+      for (j = 1; sizeRc; ++j) {
+         sizeRc -= rec->insn->getSrc(j)->reg.size;
+         st->setSrc(j, rec->insn->getSrc(j));
+      }
+      // set saved values as high sources of @st
+      for (k = j, j = 0; j < s; ++j)
+         st->setSrc(k++, src[j]);
+
+      updateLdStOffset(st, offRc, func);
+   } else {
+      // @st covers the lower bytes: append the record's values after them
+      for (j = 1; sizeSt; ++j)
+         sizeSt -= st->getSrc(j)->reg.size;
+      for (s = 1; sizeRc; ++j, ++s) {
+         sizeRc -= rec->insn->getSrc(s)->reg.size;
+         st->setSrc(j, rec->insn->getSrc(s));
+      }
+      rec->offset = offSt;
+   }
+   st->putExtraSources(0, extra); // restore pointer and predicate
+
+   // @st supersedes the recorded store
+   delete_Instruction(prog, rec->insn);
+   rec->insn = st;
+   rec->size = size;
+   rec->insn->getSrc(0)->reg.size = size;
+   rec->insn->setType(typeOfSize(size));
+   return true;
+}
+
+// Capture the address (base symbol, indirect sources, offset, file index)
+// and the access size of @ldst into this record.
+void
+MemoryOpt::Record::set(const Instruction *ldst)
+{
+   const Symbol *mem = ldst->getSrc(0)->asSym();
+
+   rel[0] = ldst->getIndirect(0, 0);
+   rel[1] = ldst->getIndirect(0, 1);
+   base = mem->getBase();
+   offset = mem->reg.data.offset;
+   fileIndex = mem->reg.fileIndex;
+   size = typeSizeof(ldst->sType);
+}
+
+// Insert this record at the head of the doubly linked @list.
+void
+MemoryOpt::Record::link(Record **list)
+{
+   prev = NULL;
+   next = *list;
+   if (next)
+      next->prev = this;
+   *list = this;
+}
+
+// Detach this record from the doubly linked @list (the record itself is
+// not released here).
+void
+MemoryOpt::Record::unlink(Record **list)
+{
+   if (prev)
+      prev->next = next;
+   else
+      *list = next;
+   if (next)
+      next->prev = prev;
+}
+
+// Select the record list (loads or stores, per data file) that @insn
+// belongs to, based on its opcode.
+MemoryOpt::Record **
+MemoryOpt::getList(const Instruction *insn)
+{
+   const DataFile file = insn->src(0).getFile();
+   const bool isLoad = insn->op == OP_LOAD || insn->op == OP_VFETCH;
+
+   return isLoad ? &loads[file] : &stores[file];
+}
+
+// Start tracking load/store @i in the appropriate record list.
+void
+MemoryOpt::addRecord(Instruction *i)
+{
+   Record *rec = reinterpret_cast<Record *>(recordPool.allocate());
+
+   rec->link(getList(i));
+   rec->set(i);
+   rec->insn = i;
+   rec->locked = false;
+}
+
+// Find a tracked access that @insn can be merged with or replaced by.
+// Sets @isAdj to true when the match is adjacent (combinable) rather than
+// overlapping (replaceable). Adjacent candidates are only returned as a
+// fallback (rec), exact/contained matches return immediately.
+MemoryOpt::Record *
+MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
+{
+   const Symbol *sym = insn->getSrc(0)->asSym();
+   const int size = typeSizeof(insn->sType);
+   Record *rec = NULL;
+   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
+
+   for (; it; it = it->next) {
+      // locked stores may still satisfy loads, but nothing else
+      if (it->locked && insn->op != OP_LOAD)
+         continue;
+      // must be the same 16-byte chunk with identical indirect addressing
+      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
+          it->rel[0] != insn->getIndirect(0, 0) ||
+          it->fileIndex != sym->reg.fileIndex ||
+          it->rel[1] != insn->getIndirect(0, 1))
+         continue;
+
+      if (it->offset < sym->reg.data.offset) {
+         if (it->offset + it->size >= sym->reg.data.offset) {
+            isAdj = (it->offset + it->size == sym->reg.data.offset);
+            if (!isAdj)
+               return it;
+            if (!(it->offset & 0x7))
+               rec = it;
+         }
+      } else {
+         isAdj = it->offset != sym->reg.data.offset;
+         if (size <= it->size && !isAdj)
+            return it;
+         else
+         if (!(sym->reg.data.offset & 0x7))
+            if (it->offset - size <= sym->reg.data.offset)
+               rec = it;
+      }
+   }
+   return rec;
+}
+
+// Forward the values of the recorded store @rec to load @ld and remove
+// the load. Requires the store's source layout to line up exactly with
+// the load's definitions; returns false otherwise.
+bool
+MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
+{
+   Instruction *st = rec->insn;
+   int32_t offSt = rec->offset;
+   int32_t offLd = ld->getSrc(0)->reg.data.offset;
+   int d, s;
+
+   // walk store sources (src 0 is the address) up to the load's offset
+   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
+      offSt += st->getSrc(s)->reg.size;
+   if (offSt != offLd)
+      return false;
+
+   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
+      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
+         return false;
+      if (st->getSrc(s)->reg.file != FILE_GPR)
+         return false;
+      ld->def(d).replace(st->src(s), false);
+   }
+   ld->bb->remove(ld);
+   return true;
+}
+
+// Replace load @ldE by re-using the definitions of the earlier recorded
+// load @rec->insn covering the same bytes, then delete @ldE. Returns
+// false if the definitions do not line up size-for-size.
+bool
+MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
+{
+   Instruction *ldR = rec->insn;
+   int32_t offR = rec->offset;
+   int32_t offE = ldE->getSrc(0)->reg.data.offset;
+   int dR, dE;
+
+   assert(offR <= offE);
+   // skip the recorded load's defs below @ldE's starting offset
+   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
+      offR += ldR->getDef(dR)->reg.size;
+   if (offR != offE)
+      return false;
+
+   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
+      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
+         return false;
+      ldE->def(dE).replace(ldR->getDef(dR), false);
+   }
+
+   delete_Instruction(prog, ldE);
+   return true;
+}
+
+// Fold the overlapping recorded store @rec->insn into @st: @st's values
+// win where the two ranges overlap, the record's values are kept where
+// they are not covered. The record then tracks @st.
+bool
+MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
+{
+   const Instruction *const ri = rec->insn;
+   Value *extra[3];
+
+   int32_t offS = st->getSrc(0)->reg.data.offset;
+   int32_t offR = rec->offset;
+   int32_t endS = offS + typeSizeof(st->dType);
+   int32_t endR = offR + typeSizeof(ri->dType);
+
+   rec->size = MAX2(endS, endR) - MIN2(offS, offR);
+
+   st->takeExtraSources(0, extra);
+
+   if (offR < offS) {
+      // recorded store starts lower: splice its leading (and possibly
+      // trailing) values around @st's values
+      Value *vals[10];
+      int s, n;
+      int k = 0;
+      // get non-replaced sources of ri
+      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
+         vals[k++] = ri->getSrc(s);
+      n = s;
+      // get replaced sources of st
+      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
+         vals[k++] = st->getSrc(s);
+      // skip replaced sources of ri
+      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
+      // get non-replaced sources after values covered by st
+      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
+         vals[k++] = ri->getSrc(s);
+      assert((unsigned int)k <= Elements(vals));
+      for (s = 0; s < k; ++s)
+         st->setSrc(s + 1, vals[s]);
+      st->setSrc(0, ri->getSrc(0));
+   } else
+   if (endR > endS) {
+      // recorded store extends past @st: append its trailing values
+      int j, s;
+      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
+      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
+      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
+         st->setSrc(s++, ri->getSrc(j));
+   }
+   st->putExtraSources(0, extra);
+
+   delete_Instruction(prog, rec->insn);
+
+   rec->insn = st;
+   rec->offset = st->getSrc(0)->reg.data.offset;
+
+   st->setType(typeOfSize(rec->size));
+
+   return true;
+}
+
+// Whether this record's byte range may intersect the access of @ldst.
+// With indirect addressing only equal base symbols are assumed to alias.
+bool
+MemoryOpt::Record::overlaps(const Instruction *ldst) const
+{
+   Record that;
+   that.set(ldst);
+
+   if (fileIndex != that.fileIndex)
+      return false;
+
+   if (rel[0] || that.rel[0])
+      return base == that.base;
+
+   return offset < that.offset + that.size &&
+          that.offset < offset + size;
+}
+
+// We must not eliminate stores that affect the result of @ld if
+// we find later stores to the same location, and we may no longer
+// merge them with later stores.
+// The stored value can, however, still be used to determine the value
+// returned by future loads.
+void
+MemoryOpt::lockStores(Instruction *const ld)
+{
+   // lock every unlocked store whose range intersects the load's
+   const DataFile file = ld->src(0).getFile();
+
+   for (Record *rec = stores[file]; rec; rec = rec->next) {
+      if (rec->locked)
+         continue;
+      if (rec->overlaps(ld))
+         rec->locked = true;
+   }
+}
+
+// Prior loads from the location of @st are no longer valid.
+// Stores to the location of @st may no longer be used to derive
+// the value at it nor be coalesced into later stores.
+void
+MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
+{
+   // with @st given, invalidate file records overlapping its range;
+   // with @st == NULL, flush every record of file @f
+   if (st)
+      f = st->src(0).getFile();
+
+   for (Record *r = loads[f]; r; r = r->next) {
+      if (!st || r->overlaps(st))
+         r->unlink(&loads[f]);
+   }
+   for (Record *r = stores[f]; r; r = r->next) {
+      if (!st || r->overlaps(st))
+         r->unlink(&stores[f]);
+   }
+}
+
+// Run the per-BB optimization twice when the first pass made progress:
+// one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
+// where 96 bit memory operations are forbidden.
+bool
+MemoryOpt::visit(BasicBlock *bb)
+{
+   bool progress = runOpt(bb);
+   if (progress)
+      progress = runOpt(bb);
+   return progress;
+}
+
+// One linear scan over @bb: classify each instruction as load-like,
+// store-like or barrier-like, then try to eliminate it via an earlier
+// recorded access or combine it with an adjacent one.
+bool
+MemoryOpt::runOpt(BasicBlock *bb)
+{
+ Instruction *ldst, *next;
+ Record *rec;
+ bool isAdjacent = true;
+
+ for (ldst = bb->getEntry(); ldst; ldst = next) {
+ bool keep = true;
+ bool isLoad = true;
+ next = ldst->next;
+
+ if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
+ if (ldst->isDead()) {
+ // might have been produced by earlier optimization
+ delete_Instruction(prog, ldst);
+ continue;
+ }
+ } else
+ if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+ isLoad = false;
+ } else {
+ // Not a memory access: barrier-like operations invalidate records.
+ // TODO: maybe have all fixed ops act as barrier ?
+ if (ldst->op == OP_CALL ||
+ ldst->op == OP_BAR ||
+ ldst->op == OP_MEMBAR) {
+ purgeRecords(NULL, FILE_MEMORY_LOCAL);
+ purgeRecords(NULL, FILE_MEMORY_GLOBAL);
+ purgeRecords(NULL, FILE_MEMORY_SHARED);
+ purgeRecords(NULL, FILE_SHADER_OUTPUT);
+ } else
+ if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
+ if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+ // NOTE(review): global atomics purge local/shared too — presumably
+ // because aliasing cannot be ruled out; confirm.
+ purgeRecords(NULL, FILE_MEMORY_LOCAL);
+ purgeRecords(NULL, FILE_MEMORY_GLOBAL);
+ purgeRecords(NULL, FILE_MEMORY_SHARED);
+ } else {
+ purgeRecords(NULL, ldst->src(0).getFile());
+ }
+ } else
+ if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
+ purgeRecords(NULL, FILE_SHADER_OUTPUT);
+ }
+ continue;
+ }
+ if (ldst->getPredicate()) // TODO: handle predicated ld/st
+ continue;
+
+ if (isLoad) {
+ DataFile file = ldst->src(0).getFile();
+
+ // if ld l[]/g[] look for previous store to eliminate the reload
+ if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
+ // TODO: shared memory ?
+ rec = findRecord(ldst, false, isAdjacent);
+ if (rec && !isAdjacent)
+ keep = !replaceLdFromSt(ldst, rec);
+ }
+
+ // or look for ld from the same location and replace this one
+ rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
+ if (rec) {
+ if (!isAdjacent)
+ keep = !replaceLdFromLd(ldst, rec);
+ else
+ // or combine a previous load with this one
+ keep = !combineLd(rec, ldst);
+ }
+ // a load that stays prevents overlapping earlier stores from dying
+ if (keep)
+ lockStores(ldst);
+ } else {
+ rec = findRecord(ldst, false, isAdjacent);
+ if (rec) {
+ if (!isAdjacent)
+ keep = !replaceStFromSt(ldst, rec);
+ else
+ keep = !combineSt(rec, ldst);
+ }
+ // a surviving store invalidates overlapping earlier records
+ if (keep)
+ purgeRecords(ldst, DATA_FILE_COUNT);
+ }
+ if (keep)
+ addRecord(ldst);
+ }
+ reset();
+
+ return true;
+}
+
+// =============================================================================
+
+// Turn control flow into predicated instructions (after register allocation !).
+// TODO:
+// Could move this to before register allocation on NVC0 and also handle nested
+// constructs.
+class FlatteningPass : public Pass
+{
+private:
+ virtual bool visit(BasicBlock *);
+
+ // convert a simple if/else diamond into predicated straight-line code
+ bool tryPredicateConditional(BasicBlock *);
+ // predicate all instructions of a block and drop its terminating flow
+ void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
+ // replace a jump to a trivial block by that block's flow instruction
+ void tryPropagateBranch(BasicBlock *);
+ inline bool isConstantCondition(Value *pred);
+ inline bool mayPredicate(const Instruction *, const Value *pred) const;
+ inline void removeFlow(Instruction *);
+};
+
+// True if @pred is produced by a plain 2-source OP_SET whose operands are
+// immediates or non-indirect constant-buffer values, i.e. the branch
+// condition is uniform/cheap and a shorter predication limit applies.
+bool
+FlatteningPass::isConstantCondition(Value *pred)
+{
+ Instruction *insn = pred->getUniqueInsn();
+ assert(insn);
+ if (insn->op != OP_SET || insn->srcExists(2))
+ return false;
+
+ for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
+ Instruction *ld = insn->getSrc(s)->getUniqueInsn();
+ DataFile file;
+ if (ld) {
+ // source comes from another instruction: only simple direct loads
+ // or moves keep the condition "constant"
+ if (ld->op != OP_MOV && ld->op != OP_LOAD)
+ return false;
+ if (ld->src(0).isIndirect(0))
+ return false;
+ file = ld->src(0).getFile();
+ } else {
+ file = insn->src(s).getFile();
+ // catch $r63 on NVC0
+ if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
+ file = FILE_IMMEDIATE;
+ }
+ if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
+ return false;
+ }
+ return true;
+}
+
+// Delete a BRA or JOIN flow instruction (other flow ops are left alone),
+// and free its predicate register if this was the last use.
+void
+FlatteningPass::removeFlow(Instruction *insn)
+{
+ FlowInstruction *term = insn ? insn->asFlow() : NULL;
+ if (!term)
+ return;
+ Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
+
+ if (term->op == OP_BRA) {
+ // TODO: this might get more difficult when we get arbitrary BRAs
+ if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
+ return;
+ } else
+ if (term->op != OP_JOIN)
+ return;
+
+ Value *pred = term->getPredicate();
+
+ delete_Instruction(prog, term);
+
+ if (pred && pred->refCount() == 0) {
+ // NOTE(review): assumes a dead predicate always has a defining
+ // instruction (getUniqueInsn() != NULL) — confirm.
+ Instruction *pSet = pred->getUniqueInsn();
+ pred->join->reg.data.id = -1; // deallocate
+ if (pSet->isDead())
+ delete_Instruction(prog, pSet);
+ }
+}
+
+// Apply (cc, pred) to every real instruction of @bb, then drop the flow
+// instruction that terminates the block.
+void
+FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
+{
+ Instruction *insn = bb->getEntry();
+ for (; insn; insn = insn->next) {
+  if (insn->isNop())
+   continue;
+  assert(!insn->getPredicate());
+  insn->setPredicate(cc, pred);
+ }
+ removeFlow(bb->getExit());
+}
+
+// Whether @insn can safely be executed under predicate @pred.
+bool
+FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+ if (insn->isPseudo())
+  return true;
+ // TODO: calls where we don't know which registers are modified
+
+ if (!prog->getTarget()->mayPredicate(insn, pred))
+  return false;
+
+ // The instruction must not overwrite its own predicate.
+ bool writesPred = false;
+ for (int d = 0; insn->defExists(d); ++d) {
+  if (insn->getDef(d)->equals(pred)) {
+   writesPred = true;
+   break;
+  }
+ }
+ return !writesPred;
+}
+
+// If we jump to BRA/RET/EXIT, replace the jump with it.
+// NOTE: We do not update the CFG anymore here !
+//
+// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
+// BB:0
+// @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
+// BB1:
+// bra BB:3
+// BB2:
+// ...
+// BB3:
+// ...
+void
+FlatteningPass::tryPropagateBranch(BasicBlock *bb)
+{
+ // walk trailing unconditional branches of @bb
+ for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
+ BasicBlock *bf = i->asFlow()->target.bb;
+
+ // only consider targets consisting of a single instruction
+ if (bf->getInsnCount() != 1)
+ continue;
+
+ FlowInstruction *bra = i->asFlow();
+ FlowInstruction *rep = bf->getExit()->asFlow();
+
+ if (!rep || rep->getPredicate())
+ continue;
+ if (rep->op != OP_BRA &&
+ rep->op != OP_JOIN &&
+ rep->op != OP_EXIT)
+ continue;
+
+ // TODO: If there are multiple branches to @rep, only the first would
+ // be replaced, so only remove them after this pass is done ?
+ // Also, need to check all incident blocks for fall-through exits and
+ // add the branch there.
+ bra->op = rep->op;
+ bra->target.bb = rep->target.bb;
+ // the replaced flow op can go away if we were its sole predecessor
+ if (bf->cfg.incidentCount() == 1)
+ bf->remove(rep);
+ }
+}
+
+// Per-block entry point: first try to predicate a simple conditional;
+// otherwise try to fold a trailing JOIN into the preceding instruction
+// (setting its join flag), then propagate trivial branches.
+bool
+FlatteningPass::visit(BasicBlock *bb)
+{
+ if (tryPredicateConditional(bb))
+ return true;
+
+ // try to attach join to previous instruction
+ Instruction *insn = bb->getExit();
+ if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
+ insn = insn->prev;
+ // the preceding instruction must be one that supports the join flag
+ if (insn && !insn->getPredicate() &&
+ !insn->asFlow() &&
+ insn->op != OP_TEXBAR &&
+ !isTextureOp(insn->op) && // probably just nve4
+ !isSurfaceOp(insn->op) && // not confirmed
+ insn->op != OP_LINTERP && // probably just nve4
+ insn->op != OP_PINTERP && // probably just nve4
+ ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
+ typeSizeof(insn->dType) <= 4) &&
+ !insn->isNop()) {
+ insn->join = 1;
+ bb->remove(bb->getExit());
+ return true;
+ }
+ }
+
+ tryPropagateBranch(bb);
+
+ return true;
+}
+
+// If @bb ends a simple if/else diamond, predicate the instructions of the
+// taken/not-taken blocks and remove the branch/join, turning control flow
+// into straight-line predicated code.
+bool
+FlatteningPass::tryPredicateConditional(BasicBlock *bb)
+{
+ BasicBlock *bL = NULL, *bR = NULL;
+ // limit: maximum number of instructions we are willing to predicate
+ unsigned int nL = 0, nR = 0, limit = 12;
+ Instruction *insn;
+ unsigned int mask;
+
+ // mask bit 0: predicate first successor, bit 1: second successor
+ mask = bb->initiatesSimpleConditional();
+ if (!mask)
+ return false;
+
+ assert(bb->getExit());
+ Value *pred = bb->getExit()->getPredicate();
+ assert(pred);
+
+ // a cheap/uniform condition makes a real branch more attractive
+ if (isConstantCondition(pred))
+ limit = 4;
+
+ Graph::EdgeIterator ei = bb->cfg.outgoing();
+
+ if (mask & 1) {
+ bL = BasicBlock::get(ei.getNode());
+ for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
+ if (!mayPredicate(insn, pred))
+ return false;
+ if (nL > limit)
+ return false; // too long, do a real branch
+ }
+ ei.next();
+
+ if (mask & 2) {
+ bR = BasicBlock::get(ei.getNode());
+ for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
+ if (!mayPredicate(insn, pred))
+ return false;
+ if (nR > limit)
+ return false; // too long, do a real branch
+ }
+
+ // "else" side gets the inverse condition
+ if (bL)
+ predicateInstructions(bL, pred, bb->getExit()->cc);
+ if (bR)
+ predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
+
+ if (bb->joinAt) {
+ bb->remove(bb->joinAt);
+ bb->joinAt = NULL;
+ }
+ removeFlow(bb->getExit()); // delete the branch/join at the fork point
+
+ // remove potential join operations at the end of the conditional
+ if (prog->getTarget()->joinAnterior) {
+ bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
+ if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
+ removeFlow(bb->getEntry());
+ }
+
+ return true;
+}
+
+// =============================================================================
+
+// Common subexpression elimination. Stupid O^2 implementation.
+class LocalCSE : public Pass
+{
+private:
+ virtual bool visit(BasicBlock *);
+
+ // replace *ptr by an equivalent earlier instruction @i, if possible
+ inline bool tryReplace(Instruction **, Instruction *);
+
+ // candidate instructions of the current block, bucketed by opcode
+ DLList ops[OP_LAST + 1];
+};
+
+// Hoist identical computations out of all predecessors via their phi node.
+class GlobalCSE : public Pass
+{
+private:
+ virtual bool visit(BasicBlock *);
+};
+
+// Whether this instruction performs the same operation as @that (same op,
+// types, condition code and all modifier state) — sources/defs are NOT
+// compared here, see isResultEqual().
+bool
+Instruction::isActionEqual(const Instruction *that) const
+{
+ if (this->op != that->op ||
+ this->dType != that->dType ||
+ this->sType != that->sType)
+ return false;
+ if (this->cc != that->cc)
+ return false;
+
+ if (this->asTex()) {
+ // compare complete texture state
+ if (memcmp(&this->asTex()->tex,
+ &that->asTex()->tex,
+ sizeof(this->asTex()->tex)))
+ return false;
+ } else
+ if (this->asCmp()) {
+ if (this->asCmp()->setCond != that->asCmp()->setCond)
+ return false;
+ } else
+ if (this->asFlow()) {
+ // flow instructions are never considered equal
+ return false;
+ } else {
+ if (this->ipa != that->ipa ||
+ this->lanes != that->lanes ||
+ this->perPatch != that->perPatch)
+ return false;
+ if (this->postFactor != that->postFactor)
+ return false;
+ }
+
+ if (this->subOp != that->subOp ||
+ this->saturate != that->saturate ||
+ this->rnd != that->rnd ||
+ this->ftz != that->ftz ||
+ this->dnz != that->dnz ||
+ this->cache != that->cache ||
+ this->mask != that->mask)
+ return false;
+
+ return true;
+}
+
+// Whether this instruction is guaranteed to produce the same result as
+// @that: same action, same predicate source and identical def/src lists.
+// Memory loads only qualify when reading from read-only files.
+bool
+Instruction::isResultEqual(const Instruction *that) const
+{
+ unsigned int d, s;
+
+ // NOTE: location of discard only affects tex with liveOnly and quadops
+ if (!this->defExists(0) && this->op != OP_DISCARD)
+ return false;
+
+ if (!isActionEqual(that))
+ return false;
+
+ if (this->predSrc != that->predSrc)
+ return false;
+
+ // both def lists must match element-wise and have equal length
+ for (d = 0; this->defExists(d); ++d) {
+ if (!that->defExists(d) ||
+ !this->getDef(d)->equals(that->getDef(d), false))
+ return false;
+ }
+ if (that->defExists(d))
+ return false;
+
+ for (s = 0; this->srcExists(s); ++s) {
+ if (!that->srcExists(s))
+ return false;
+ if (this->src(s).mod != that->src(s).mod)
+ return false;
+ if (!this->getSrc(s)->equals(that->getSrc(s), true))
+ return false;
+ }
+ if (that->srcExists(s))
+ return false;
+
+ if (op == OP_LOAD || op == OP_VFETCH) {
+ // only loads from immutable files can be assumed repeatable
+ switch (src(0).getFile()) {
+ case FILE_MEMORY_CONST:
+ case FILE_SHADER_INPUT:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// pull through common expressions from different in-blocks
+bool
+GlobalCSE::visit(BasicBlock *bb)
+{
+ Instruction *phi, *next, *ik;
+ int s;
+
+ // TODO: maybe do this with OP_UNION, too
+
+ // for each phi: if every incoming value is computed by an equivalent,
+ // single-use instruction, hoist one copy into this block and drop the phi
+ for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
+ next = phi->next;
+ if (phi->getSrc(0)->refCount() > 1)
+ continue;
+ ik = phi->getSrc(0)->getInsn();
+ if (!ik)
+ continue; // probably a function input
+ for (s = 1; phi->srcExists(s); ++s) {
+ if (phi->getSrc(s)->refCount() > 1)
+ break;
+ if (!phi->getSrc(s)->getInsn() ||
+ !phi->getSrc(s)->getInsn()->isResultEqual(ik))
+ break;
+ }
+ // only hoist if the loop above ran through all sources
+ if (!phi->srcExists(s)) {
+ Instruction *entry = bb->getEntry();
+ ik->bb->remove(ik);
+ // keep a leading OP_JOIN at the head of the block
+ if (!entry || entry->op != OP_JOIN)
+ bb->insertHead(ik);
+ else
+ bb->insertAfter(entry, ik);
+ ik->setDef(0, phi->getDef(0));
+ delete_Instruction(prog, phi);
+ }
+ }
+
+ return true;
+}
+
+// If *ptr computes the same result as the earlier instruction @i, redirect
+// all uses of *ptr's defs to @i's, delete *ptr and clear the pointer.
+bool
+LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
+{
+ Instruction *old = *ptr;
+
+ // TODO: maybe relax this later (causes trouble with OP_UNION)
+ if (i->isPredicated())
+  return false;
+ if (!old->isResultEqual(i))
+  return false;
+
+ int d = 0;
+ while (old->defExists(d)) {
+  old->def(d).replace(i->getDef(d), false);
+  ++d;
+ }
+ delete_Instruction(prog, old);
+ *ptr = NULL;
+ return true;
+}
+
+// Local CSE over one basic block: repeatedly scan the block, replacing
+// each instruction by an earlier equivalent one, until nothing changes.
+bool
+LocalCSE::visit(BasicBlock *bb)
+{
+ unsigned int replaced;
+
+ do {
+ Instruction *ir, *next;
+
+ replaced = 0;
+
+ // will need to know the order of instructions
+ int serial = 0;
+ for (ir = bb->getFirst(); ir; ir = ir->next)
+ ir->serial = serial++;
+
+ for (ir = bb->getEntry(); ir; ir = next) {
+ int s;
+ Value *src = NULL;
+
+ next = ir->next;
+
+ // fixed instructions are only recorded as potential matches
+ if (ir->fixed) {
+ ops[ir->op].insert(ir);
+ continue;
+ }
+
+ // pick the source with the fewest uses to minimize the scan below
+ for (s = 0; ir->srcExists(s); ++s)
+ if (ir->getSrc(s)->asLValue())
+ if (!src || ir->getSrc(s)->refCount() < src->refCount())
+ src = ir->getSrc(s);
+
+ if (src) {
+ // look for an earlier instruction in this block sharing @src
+ for (Value::UseIterator it = src->uses.begin();
+ it != src->uses.end(); ++it) {
+ Instruction *ik = (*it)->getInsn();
+ if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
+ if (tryReplace(&ir, ik))
+ break;
+ }
+ } else {
+ // no lvalue sources: fall back to scanning same-opcode candidates
+ DLLIST_FOR_EACH(&ops[ir->op], iter)
+ {
+ Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
+ if (tryReplace(&ir, ik))
+ break;
+ }
+ }
+
+ // tryReplace() NULLs ir on success
+ if (ir)
+ ops[ir->op].insert(ir);
+ else
+ ++replaced;
+ }
+ for (unsigned int i = 0; i <= OP_LAST; ++i)
+ ops[i].clear();
+
+ } while (replaced);
+
+ return true;
+}
+
+// =============================================================================
+
+// Remove computations of unused values.
+class DeadCodeElim : public Pass
+{
+public:
+ // repeatedly run DCE over @prog until no more instructions die
+ bool buryAll(Program *);
+
+private:
+ virtual bool visit(BasicBlock *);
+
+ void checkSplitLoad(Instruction *ld); // for partially dead loads
+
+ // number of instructions removed in the current iteration
+ unsigned int deadCount;
+};
+
+// Iterate dead-code elimination to a fixed point: removing one
+// instruction can render its operand producers dead as well.
+bool
+DeadCodeElim::buryAll(Program *prog)
+{
+ for (;;) {
+  deadCount = 0;
+  if (!this->run(prog, false, false))
+   return false;
+  if (!deadCount)
+   return true;
+ }
+}
+
+bool
+DeadCodeElim::visit(BasicBlock *bb)
+{
+ Instruction *next;
+
+ for (Instruction *i = bb->getFirst(); i; i = next) {
+ next = i->next;
+ if (i->isDead()) {
+ ++deadCount;
+ delete_Instruction(prog, i);
+ } else
+ if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
+ // multi-def load: maybe only part of the result is used
+ checkSplitLoad(i);
+ } else
+ if (i->defExists(0) && !i->getDef(0)->refCount()) {
+ // keep the operation but drop its unused result
+ if (i->op == OP_ATOM ||
+ i->op == OP_SUREDP ||
+ i->op == OP_SUREDB)
+ i->setDef(0, NULL);
+ }
+ }
+ return true;
+}
+
+// Split a vector load whose leading/trailing components are dead into at
+// most two narrower loads covering only the live definitions.
+void
+DeadCodeElim::checkSplitLoad(Instruction *ld1)
+{
+ Instruction *ld2 = NULL; // can get at most 2 loads
+ Value *def1[4];
+ Value *def2[4];
+ int32_t addr1, addr2;
+ int32_t size1, size2;
+ int d, n1, n2;
+ uint32_t mask = 0xffffffff;
+
+ // clear mask bits of dead, unallocated defs
+ for (d = 0; ld1->defExists(d); ++d)
+ if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
+ mask &= ~(1 << d);
+ if (mask == 0xffffffff)
+ return;
+
+ addr1 = ld1->getSrc(0)->reg.data.offset;
+ n1 = n2 = 0;
+ size1 = size2 = 0;
+ // first run of live defs -> load 1 (skipping dead leading components
+ // by advancing addr1)
+ for (d = 0; ld1->defExists(d); ++d) {
+ if (mask & (1 << d)) {
+ // stop at misaligned offsets once we have started collecting
+ if (size1 && (addr1 & 0x7))
+ break;
+ def1[n1] = ld1->getDef(d);
+ size1 += def1[n1++]->reg.size;
+ } else
+ if (!n1) {
+ addr1 += ld1->getDef(d)->reg.size;
+ } else {
+ break;
+ }
+ }
+ // remaining live defs -> load 2
+ for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
+ if (mask & (1 << d)) {
+ def2[n2] = ld1->getDef(d);
+ size2 += def2[n2++]->reg.size;
+ } else {
+ assert(!n2);
+ addr2 += ld1->getDef(d)->reg.size;
+ }
+ }
+
+ // shrink the original load in place
+ updateLdStOffset(ld1, addr1, func);
+ ld1->setType(typeOfSize(size1));
+ for (d = 0; d < 4; ++d)
+ ld1->setDef(d, (d < n1) ? def1[d] : NULL);
+
+ if (!n2)
+ return;
+
+ // emit the second load right after the first
+ ld2 = cloneShallow(func, ld1);
+ updateLdStOffset(ld2, addr2, func);
+ ld2->setType(typeOfSize(size2));
+ for (d = 0; d < 4; ++d)
+ ld2->setDef(d, (d < n2) ? def2[d] : NULL);
+
+ ld1->bb->insertAfter(ld1, ld2);
+}
+
+// =============================================================================
+
+// Instantiate pass class @n and invoke entry point @f on this Program,
+// but only at optimization level >= @l.
+#define RUN_PASS(l, n, f) \
+ if (level >= (l)) { \
+ if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
+ INFO("PEEPHOLE: %s\n", #n); \
+ n pass; \
+ if (!pass.f(this)) \
+ return false; \
+ }
+
+// Optimization pipeline run while the program is still in SSA form.
+// Pass order matters (see the inline note on ModifierFolding).
+bool
+Program::optimizeSSA(int level)
+{
+ RUN_PASS(1, DeadCodeElim, buryAll);
+ RUN_PASS(1, CopyPropagation, run);
+ RUN_PASS(2, GlobalCSE, run);
+ RUN_PASS(1, LocalCSE, run);
+ RUN_PASS(2, AlgebraicOpt, run);
+ RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
+ RUN_PASS(1, ConstantFolding, foldAll);
+ RUN_PASS(1, LoadPropagation, run);
+ RUN_PASS(2, MemoryOpt, run);
+ RUN_PASS(2, LocalCSE, run);
+ RUN_PASS(0, DeadCodeElim, buryAll);
+
+ return true;
+}
+
+// Optimizations that must run after register allocation (flattening
+// turns control flow into predication on physical registers).
+bool
+Program::optimizePostRA(int level)
+{
+ RUN_PASS(2, FlatteningPass, run);
+ return true;
+}
+
+}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
new file mode 100644
index 00000000000..ee39b3c5880
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+namespace nv50_ir {
+
+// Categories of printed text; each selects an entry in the colour tables.
+enum TextStyle
+{
+ TXT_DEFAULT,
+ TXT_GPR,
+ TXT_REGISTER,
+ TXT_FLAGS,
+ TXT_MEM,
+ TXT_IMMD,
+ TXT_BRA,
+ TXT_INSN
+};
+
+// ANSI escape sequences, indexed by TextStyle.
+static const char *_colour[8] =
+{
+ "\x1b[00m",
+ "\x1b[34m",
+ "\x1b[35m",
+ "\x1b[35m",
+ "\x1b[36m",
+ "\x1b[33m",
+ "\x1b[37m",
+ "\x1b[32m"
+};
+
+// colourless variant, used when NV50_PROG_DEBUG_NO_COLORS is set
+static const char *_nocolour[8] =
+{
+ "", "", "", "", "", "", "", ""
+};
+
+// active table, selected by init_colours()
+static const char **colour;
+
+// Select the colour table: plain output when the user set
+// NV50_PROG_DEBUG_NO_COLORS in the environment, ANSI colours otherwise.
+static void init_colours()
+{
+ const bool plain = getenv("NV50_PROG_DEBUG_NO_COLORS") != NULL;
+ colour = plain ? _nocolour : _colour;
+}
+
+// Mnemonics indexed by operation code (used as operationStr[op] below);
+// order must match the opcode enumeration.
+const char *operationStr[OP_LAST + 1] =
+{
+ "nop",
+ "phi",
+ "union",
+ "split",
+ "merge",
+ "consec",
+ "mov",
+ "ld",
+ "st",
+ "add",
+ "sub",
+ "mul",
+ "div",
+ "mod",
+ "mad",
+ "fma",
+ "sad",
+ "abs",
+ "neg",
+ "not",
+ "and",
+ "or",
+ "xor",
+ "shl",
+ "shr",
+ "max",
+ "min",
+ "sat",
+ "ceil",
+ "floor",
+ "trunc",
+ "cvt",
+ "set and",
+ "set or",
+ "set xor",
+ "set",
+ "selp",
+ "slct",
+ "rcp",
+ "rsq",
+ "lg2",
+ "sin",
+ "cos",
+ "ex2",
+ "exp",
+ "log",
+ "presin",
+ "preex2",
+ "sqrt",
+ "pow",
+ "bra",
+ "call",
+ "ret",
+ "cont",
+ "break",
+ "preret",
+ "precont",
+ "prebreak",
+ "brkpt",
+ "joinat",
+ "join",
+ "discard",
+ "exit",
+ "membar",
+ "vfetch",
+ "pfetch",
+ "export",
+ "linterp",
+ "pinterp",
+ "emit",
+ "restart",
+ "tex",
+ "texbias",
+ "texlod",
+ "texfetch",
+ "texquery",
+ "texgrad",
+ "texgather",
+ "texcsaa",
+ "texprep",
+ "suldb",
+ "suldp",
+ "sustb",
+ "sustp",
+ "suredb",
+ "suredp",
+ "sulea",
+ "subfm",
+ "suclamp",
+ "sueau",
+ "madsp",
+ "texbar",
+ "dfdx",
+ "dfdy",
+ "rdsv",
+ "wrsv",
+ "quadop",
+ "quadon",
+ "quadpop",
+ "popcnt",
+ "insbf",
+ "extbf",
+ "permt",
+ "atom",
+ "bar",
+ "vadd",
+ "vavg",
+ "vmin",
+ "vmax",
+ "vsad",
+ "vset",
+ "vshr",
+ "vshl",
+ "vsel",
+ "cctl",
+ "(invalid)"
+};
+
+// sub-operation names for OP_ATOM / OP_SUREDP, indexed by subOp
+static const char *atomSubOpStr[] =
+{
+ "add", "min", "max", "inc", "dec", "and", "or", "xor", "cas", "exch"
+};
+
+// indexed by DataType
+static const char *DataTypeStr[] =
+{
+ "-",
+ "u8", "s8",
+ "u16", "s16",
+ "u32", "s32",
+ "u64", "s64",
+ "f16", "f32", "f64",
+ "b96", "b128"
+};
+
+// indexed by RoundMode (rnd field)
+static const char *RoundModeStr[] =
+{
+ "", "rm", "rz", "rp", "rni", "rmi", "rzi", "rpi"
+};
+
+// indexed by CondCode
+static const char *CondCodeStr[] =
+{
+ "never",
+ "lt",
+ "eq",
+ "le",
+ "gt",
+ "ne",
+ "ge",
+ "",
+ "(invalid)",
+ "ltu",
+ "equ",
+ "leu",
+ "gtu",
+ "neu",
+ "geu",
+ "",
+ "no",
+ "nc",
+ "ns",
+ "na",
+ "a",
+ "s",
+ "c",
+ "o"
+};
+
+// system value names, indexed by semantic (SV_*)
+static const char *SemanticStr[SV_LAST + 1] =
+{
+ "POSITION",
+ "VERTEX_ID",
+ "INSTANCE_ID",
+ "INVOCATION_ID",
+ "PRIMITIVE_ID",
+ "VERTEX_COUNT",
+ "LAYER",
+ "VIEWPORT_INDEX",
+ "Y_DIR",
+ "FACE",
+ "POINT_SIZE",
+ "POINT_COORD",
+ "CLIP_DISTANCE",
+ "SAMPLE_INDEX",
+ "TESS_FACTOR",
+ "TESS_COORD",
+ "TID",
+ "CTAID",
+ "NTID",
+ "GRIDID",
+ "NCTAID",
+ "LANEID",
+ "PHYSID",
+ "NPHYSID",
+ "CLOCK",
+ "LBASE",
+ "SBASE",
+ "?",
+ "(INVALID)"
+};
+
+// interpolation mode names, indexed by the 4-bit ipa field
+static const char *interpStr[16] =
+{
+ "pass",
+ "mul",
+ "flat",
+ "sc",
+ "cent pass",
+ "cent mul",
+ "cent flat",
+ "cent sc",
+ "off pass",
+ "off mul",
+ "off flat",
+ "off sc",
+ "samp pass",
+ "samp mul",
+ "samp flat",
+ "samp sc"
+};
+
+// Append formatted text at buf[pos]; relies on local `buf`, `pos`, `size`.
+// NOTE(review): snprintf returns the would-be length, so `pos` can exceed
+// `size` on truncation, after which `size - pos` underflows (size_t) and
+// &buf[pos] is past the buffer — confirm that BUFSZ is always large enough.
+#define PRINT(args...) \
+ do { \
+ pos += snprintf(&buf[pos], size - pos, args); \
+ } while(0)
+
+// Like PRINT but prepends a single space when @cond holds.
+#define SPACE_PRINT(cond, args...) \
+ do { \
+ if (cond) \
+ buf[pos++] = ' '; \
+ pos += snprintf(&buf[pos], size - pos, args); \
+ } while(0)
+
+// Append a single space if room remains.
+#define SPACE() \
+ do { \
+ if (pos < size) \
+ buf[pos++] = ' '; \
+ } while(0)
+
+// Print the active modifier names ("not"/"sat"/"neg"/"abs"), space
+// separated; returns the number of characters appended.
+int Modifier::print(char *buf, size_t size) const
+{
+ size_t pos = 0;
+
+ if (bits)
+ PRINT("%s", colour[TXT_INSN]);
+
+ // remember where the text starts so separators are only inserted
+ // between modifiers, not before the first one
+ size_t base = pos;
+
+ if (bits & NV50_IR_MOD_NOT)
+ PRINT("not");
+ if (bits & NV50_IR_MOD_SAT)
+ SPACE_PRINT(pos > base && pos < size, "sat");
+ if (bits & NV50_IR_MOD_NEG)
+ SPACE_PRINT(pos > base && pos < size, "neg");
+ if (bits & NV50_IR_MOD_ABS)
+ SPACE_PRINT(pos > base && pos < size, "abs");
+
+ return pos;
+}
+
+// Print an lvalue: '$' + file letter + physical register id when a register
+// has been allocated (join->reg.data.id >= 0), '%' + SSA value id otherwise.
+// A postfix letter encodes the access size.
+int LValue::print(char *buf, size_t size, DataType ty) const
+{
+ const char *postFix = "";
+ size_t pos = 0;
+ int idx = join->reg.data.id >= 0 ? join->reg.data.id : id;
+ char p = join->reg.data.id >= 0 ? '$' : '%';
+ char r;
+ int col = TXT_DEFAULT;
+
+ switch (reg.file) {
+ case FILE_GPR:
+ r = 'r'; col = TXT_GPR;
+ if (reg.size == 2) {
+ // 16-bit halves of an allocated 32-bit register: h/l halves
+ if (p == '$') {
+ postFix = (idx & 1) ? "h" : "l";
+ idx /= 2;
+ } else {
+ postFix = "s";
+ }
+ } else
+ if (reg.size == 8) {
+ postFix = "d";
+ } else
+ if (reg.size == 16) {
+ postFix = "q";
+ } else
+ if (reg.size == 12) {
+ postFix = "t";
+ }
+ break;
+ case FILE_PREDICATE:
+ r = 'p'; col = TXT_REGISTER;
+ if (reg.size == 2)
+ postFix = "d";
+ else
+ if (reg.size == 4)
+ postFix = "q";
+ break;
+ case FILE_FLAGS:
+ r = 'c'; col = TXT_FLAGS;
+ break;
+ case FILE_ADDRESS:
+ r = 'a'; col = TXT_REGISTER;
+ break;
+ default:
+ assert(!"invalid file for lvalue");
+ r = '?';
+ break;
+ }
+
+ PRINT("%s%c%c%i%s", colour[col], p, r, idx, postFix);
+
+ return pos;
+}
+
+// Print an immediate as a literal of type @ty; 64-bit and unrecognized
+// types fall back to raw 64-bit hex. Returns characters appended.
+int ImmediateValue::print(char *buf, size_t size, DataType ty) const
+{
+ size_t pos = 0;
+
+ PRINT("%s", colour[TXT_IMMD]);
+
+ switch (ty) {
+ case TYPE_F32: PRINT("%f", reg.data.f32); break;
+ case TYPE_F64: PRINT("%f", reg.data.f64); break;
+ case TYPE_U8: PRINT("0x%02x", reg.data.u8); break;
+ case TYPE_S8: PRINT("%i", reg.data.s8); break;
+ case TYPE_U16: PRINT("0x%04x", reg.data.u16); break;
+ case TYPE_S16: PRINT("%i", reg.data.s16); break;
+ case TYPE_U32: PRINT("0x%08x", reg.data.u32); break;
+ case TYPE_S32: PRINT("%i", reg.data.s32); break;
+ case TYPE_U64:
+ case TYPE_S64:
+ default:
+ // Space before PRIx64 is required: in C++11, "literal"MACRO is
+ // parsed as a user-defined-literal suffix and fails to compile.
+ PRINT("0x%016" PRIx64, reg.data.u64);
+ break;
+ }
+ return pos;
+}
+
+// Convenience overload: print without address/dimension relocation values.
+int Symbol::print(char *buf, size_t size, DataType ty) const
+{
+ return print(buf, size, NULL, NULL, ty);
+}
+
+// Print a memory symbol as file-letter[index][rel +/- offset]; system
+// values get the sv[NAME:index] form. @rel/@dimRel are optional indirect
+// address and dimension-index values.
+int Symbol::print(char *buf, size_t size,
+ Value *rel, Value *dimRel, DataType ty) const
+{
+ size_t pos = 0;
+ char c;
+
+ if (ty == TYPE_NONE)
+ ty = typeOfSize(reg.size);
+
+ if (reg.file == FILE_SYSTEM_VALUE) {
+ PRINT("%ssv[%s%s:%i%s", colour[TXT_MEM],
+ colour[TXT_REGISTER],
+ SemanticStr[reg.data.sv.sv], reg.data.sv.index, colour[TXT_MEM]);
+ if (rel) {
+ PRINT("%s+", colour[TXT_DEFAULT]);
+ pos += rel->print(&buf[pos], size - pos);
+ }
+ PRINT("%s]", colour[TXT_MEM]);
+ return pos;
+ }
+
+ // single-letter prefix identifying the memory space
+ switch (reg.file) {
+ case FILE_MEMORY_CONST: c = 'c'; break;
+ case FILE_SHADER_INPUT: c = 'a'; break;
+ case FILE_SHADER_OUTPUT: c = 'o'; break;
+ case FILE_MEMORY_GLOBAL: c = 'g'; break;
+ case FILE_MEMORY_SHARED: c = 's'; break;
+ case FILE_MEMORY_LOCAL: c = 'l'; break;
+ default:
+ assert(!"invalid file");
+ c = '?';
+ break;
+ }
+
+ // only constant buffers carry a meaningful file index (c0[], c1[], ...)
+ if (c == 'c')
+ PRINT("%s%c%i[", colour[TXT_MEM], c, reg.fileIndex);
+ else
+ PRINT("%s%c[", colour[TXT_MEM], c);
+
+ if (dimRel) {
+ pos += dimRel->print(&buf[pos], size - pos, TYPE_S32);
+ PRINT("%s][", colour[TXT_MEM]);
+ }
+
+ if (rel) {
+ pos += rel->print(&buf[pos], size - pos);
+ PRINT("%s%c", colour[TXT_DEFAULT], (reg.data.offset < 0) ? '-' : '+');
+ } else {
+ // negative offsets only make sense relative to an indirect address
+ assert(reg.data.offset >= 0);
+ }
+ PRINT("%s0x%x%s]", colour[TXT_IMMD], abs(reg.data.offset), colour[TXT_MEM]);
+
+ return pos;
+}
+
+// Render the whole instruction (predicate, opcode + modifiers, defs,
+// sources) into a local buffer and emit it through INFO().
+void
+Instruction::print() const
+{
+ #define BUFSZ 512
+
+ const size_t size = BUFSZ;
+
+ char buf[BUFSZ];
+ int s, d;
+ size_t pos = 0;
+
+ PRINT("%s", colour[TXT_INSN]);
+
+ if (join)
+ PRINT("join ");
+
+ // print "@[not]pX " or "@cc " predicate prefix
+ if (predSrc >= 0) {
+ const size_t pre = pos;
+ if (getSrc(predSrc)->reg.file == FILE_PREDICATE) {
+ if (cc == CC_NOT_P)
+ PRINT("not");
+ } else {
+ PRINT("%s", CondCodeStr[cc]);
+ }
+ if (pos > pre)
+ SPACE();
+ pos += getSrc(predSrc)->print(&buf[pos], BUFSZ - pos);
+ PRINT(" %s", colour[TXT_INSN]);
+ }
+
+ if (saturate)
+ PRINT("sat ");
+
+ if (asFlow()) {
+ // flow instructions: opcode + flags + branch/call target
+ PRINT("%s", operationStr[op]);
+ if (asFlow()->indirect)
+ PRINT(" ind");
+ if (asFlow()->absolute)
+ PRINT(" abs");
+ if (op == OP_CALL && asFlow()->builtin) {
+ PRINT(" %sBUILTIN:%i", colour[TXT_BRA], asFlow()->target.builtin);
+ } else
+ if (op == OP_CALL && asFlow()->target.fn) {
+ PRINT(" %s%s:%i", colour[TXT_BRA],
+ asFlow()->target.fn->getName(),
+ asFlow()->target.fn->getLabel());
+ } else
+ if (asFlow()->target.bb)
+ PRINT(" %sBB:%i", colour[TXT_BRA], asFlow()->target.bb->getId());
+ } else {
+ // ordinary instructions: opcode, sub-op, tex state, type suffixes
+ PRINT("%s ", operationStr[op]);
+ if (op == OP_LINTERP || op == OP_PINTERP)
+ PRINT("%s ", interpStr[ipa]);
+ switch (op) {
+ case OP_SUREDP:
+ case OP_ATOM:
+ if (subOp < Elements(atomSubOpStr))
+ PRINT("%s ", atomSubOpStr[subOp]);
+ break;
+ default:
+ if (subOp)
+ PRINT("(SUBOP:%u) ", subOp);
+ break;
+ }
+ if (perPatch)
+ PRINT("patch ");
+ if (asTex())
+ PRINT("%s %s$r%u $s%u %s", asTex()->tex.target.getName(),
+ colour[TXT_MEM], asTex()->tex.r, asTex()->tex.s,
+ colour[TXT_INSN]);
+ if (postFactor)
+ PRINT("x2^%i ", postFactor);
+ PRINT("%s%s", dnz ? "dnz " : (ftz ? "ftz " : ""), DataTypeStr[dType]);
+ }
+
+ if (rnd != ROUND_N)
+ PRINT(" %s", RoundModeStr[rnd]);
+
+ // multiple defs are grouped in braces; '#' marks "no destination"
+ if (defExists(1))
+ PRINT(" {");
+ for (d = 0; defExists(d); ++d) {
+ SPACE();
+ pos += getDef(d)->print(&buf[pos], size - pos);
+ }
+ if (d > 1)
+ PRINT(" %s}", colour[TXT_INSN]);
+ else
+ if (!d && !asFlow())
+ PRINT(" %s#", colour[TXT_INSN]);
+
+ if (asCmp())
+ PRINT(" %s%s", colour[TXT_INSN], CondCodeStr[asCmp()->setCond]);
+
+ if (sType != dType)
+ PRINT(" %s%s", colour[TXT_INSN], DataTypeStr[sType]);
+
+ // sources; the predicate source and address registers consumed as
+ // pointers were already folded into other parts of the output
+ for (s = 0; srcExists(s); ++s) {
+ if (s == predSrc || src(s).usedAsPtr)
+ continue;
+ const size_t pre = pos;
+ SPACE();
+ pos += src(s).mod.print(&buf[pos], BUFSZ - pos);
+ if (pos > pre + 1)
+ SPACE();
+ if (src(s).isIndirect(0) || src(s).isIndirect(1))
+ pos += getSrc(s)->asSym()->print(&buf[pos], BUFSZ - pos,
+ getIndirect(s, 0),
+ getIndirect(s, 1));
+ else
+ pos += getSrc(s)->print(&buf[pos], BUFSZ - pos, sType);
+ }
+ if (exit)
+ PRINT("%s exit", colour[TXT_INSN]);
+
+ PRINT("%s", colour[TXT_DEFAULT]);
+
+ // ensure termination even if output was truncated
+ buf[MIN2(pos, BUFSZ - 1)] = 0;
+
+ INFO("%s (%u)\n", buf, encSize);
+}
+
+// Pass that dumps a function/program: header per function, CFG info per
+// block, and each instruction prefixed with a running serial number.
+class PrintPass : public Pass
+{
+public:
+ PrintPass() : serial(0) { }
+
+ virtual bool visit(Function *);
+ virtual bool visit(BasicBlock *);
+ virtual bool visit(Instruction *);
+
+private:
+ // instruction counter, continuous across blocks
+ int serial;
+};
+
+// Print the function header: "name:label (out ... , in ...)".
+bool
+PrintPass::visit(Function *fn)
+{
+ // NOTE(review): 16 bytes per value name; longer names get truncated
+ // by the value print routines — confirm this is acceptable.
+ char str[16];
+
+ INFO("\n%s:%i (", fn->getName(), fn->getLabel());
+
+ if (!fn->outs.empty())
+ INFO("out");
+ for (std::deque<ValueRef>::iterator it = fn->outs.begin();
+ it != fn->outs.end();
+ ++it) {
+ it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size));
+ INFO(" %s", str);
+ }
+
+ if (!fn->ins.empty())
+ INFO("%s%sin", colour[TXT_DEFAULT], fn->outs.empty() ? "" : ", ");
+ for (std::deque<ValueDef>::iterator it = fn->ins.begin();
+ it != fn->ins.end();
+ ++it) {
+ it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size));
+ INFO(" %s", str);
+ }
+ INFO("%s)\n", colour[TXT_DEFAULT]);
+
+ return true;
+}
+
+// Print the block header: id, instruction count, immediate dominator,
+// dominance frontier and outgoing CFG edges.
+bool
+PrintPass::visit(BasicBlock *bb)
+{
+#if 0
+ INFO("---\n");
+ for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next())
+ INFO(" <- BB:%i (%s)\n",
+ BasicBlock::get(ei.getNode())->getId(),
+ ei.getEdge()->typeStr());
+#endif
+ INFO("BB:%i (%u instructions) - ", bb->getId(), bb->getInsnCount());
+
+ if (bb->idom())
+ INFO("idom = BB:%i, ", bb->idom()->getId());
+
+ INFO("df = { ");
+ for (DLList::Iterator df = bb->getDF().iterator(); !df.end(); df.next())
+ INFO("BB:%i ", BasicBlock::get(df)->getId());
+
+ INFO("}\n");
+
+ for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next())
+ INFO(" -> BB:%i (%s)\n",
+ BasicBlock::get(ei.getNode())->getId(),
+ ei.getEdge()->typeStr());
+
+ return true;
+}
+
+// Print one instruction, prefixed with its serial number in the function.
+bool
+PrintPass::visit(Instruction *insn)
+{
+ INFO("%3i: ", serial);
+ ++serial;
+ insn->print();
+ return true;
+}
+
+// Dump this function only (colours must already be initialized, e.g.
+// via Program::print()).
+void
+Function::print()
+{
+ PrintPass pass;
+ pass.run(this, true, false);
+}
+
+// Dump the whole program; (re)selects the colour tables first.
+void
+Program::print()
+{
+ PrintPass pass;
+ init_colours();
+ pass.run(this, true, false);
+}
+
+// Dump the live interval of every lvalue in this function that has one.
+void
+Function::printLiveIntervals() const
+{
+ INFO("printing live intervals ...\n");
+
+ for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next()) {
+ const Value *lval = Value::get(it)->asLValue();
+ if (lval && !lval->livei.isEmpty()) {
+ INFO("livei(%%%i): ", lval->id);
+ lval->livei.print();
+ }
+ }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
new file mode 100644
index 00000000000..d65003ce4eb
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -0,0 +1,2050 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+#include <stack>
+#include <limits>
+
+namespace nv50_ir {
+
+#define MAX_REGISTER_FILE_SIZE 256
+
+// Tracks which registers of each file are occupied during allocation.
+// Granularity is "units" of (1 << unit[f]) bytes; one bit per unit.
+class RegisterSet
+{
+public:
+   RegisterSet(const Target *);
+
+   void init(const Target *);
+   void reset(DataFile, bool resetMax = false);
+
+   void periodicMask(DataFile f, uint32_t lock, uint32_t unlock);
+   void intersect(DataFile f, const RegisterSet *);
+
+   bool assign(int32_t& reg, DataFile f, unsigned int size);
+   void release(DataFile f, int32_t reg, unsigned int size);
+   void occupy(DataFile f, int32_t reg, unsigned int size);
+   void occupy(const Value *);
+   void occupyMask(DataFile f, int32_t reg, uint8_t mask);
+   bool isOccupied(DataFile f, int32_t reg, unsigned int size) const;
+   bool testOccupy(const Value *);
+   bool testOccupy(DataFile f, int32_t reg, unsigned int size);
+
+   inline int getMaxAssigned(DataFile f) const { return fill[f]; }
+
+   inline unsigned int getFileSize(DataFile f, uint8_t regSize) const
+   {
+      // on chipsets < 0xc0 only half the GPR range is usable for 2-byte regs
+      if (restrictedGPR16Range && f == FILE_GPR && regSize == 2)
+         return (last[f] + 1) / 2;
+      return last[f] + 1;
+   }
+
+   inline unsigned int units(DataFile f, unsigned int size) const
+   {
+      return size >> unit[f];
+   }
+   // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary)
+   inline unsigned int idToBytes(const Value *v) const
+   {
+      return v->reg.data.id * MIN2(v->reg.size, 4);
+   }
+   inline unsigned int idToUnits(const Value *v) const
+   {
+      return units(v->reg.file, idToBytes(v));
+   }
+   inline int bytesToId(Value *v, unsigned int bytes) const
+   {
+      if (v->reg.size < 4)
+         return units(v->reg.file, bytes);
+      return bytes / 4;
+   }
+   inline int unitsToId(DataFile f, int u, uint8_t size) const
+   {
+      if (u < 0)
+         return -1;
+      return (size < 4) ? u : ((u << unit[f]) / 4);
+   }
+
+   void print() const;
+
+private:
+   BitSet bits[LAST_REGISTER_FILE + 1]; // 1 bit per allocation unit
+
+   int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity
+
+   int last[LAST_REGISTER_FILE + 1]; // highest valid unit index per file
+   int fill[LAST_REGISTER_FILE + 1]; // high-water mark of assigned units
+
+   const bool restrictedGPR16Range;
+};
+
+// Clear the occupancy bitmap of file f; optionally also reset the
+// high-water mark of assigned units.
+void
+RegisterSet::reset(DataFile f, bool resetMax)
+{
+   bits[f].fill(0);
+   if (resetMax)
+      fill[f] = -1;
+}
+
+// Query the target for the size and allocation granularity of each
+// register file up to FILE_ADDRESS and allocate the occupancy bitmaps.
+void
+RegisterSet::init(const Target *targ)
+{
+   for (unsigned int rf = 0; rf <= FILE_ADDRESS; ++rf) {
+      DataFile f = static_cast<DataFile>(rf);
+      last[rf] = targ->getFileSize(f) - 1;
+      unit[rf] = targ->getFileUnit(f);
+      fill[rf] = -1;
+      assert(last[rf] < MAX_REGISTER_FILE_SIZE);
+      bits[rf].allocate(last[rf] + 1, true);
+   }
+}
+
+RegisterSet::RegisterSet(const Target *targ)
+  : restrictedGPR16Range(targ->getChipset() < 0xc0)
+{
+   init(targ);
+   for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i)
+      reset(static_cast<DataFile>(i));
+}
+
+// Apply a repeating 32-bit lock/unlock pattern to the occupancy bits.
+void
+RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock)
+{
+   bits[f].periodicMask32(lock, unlock);
+}
+
+// NOTE(review): despite the name this ORs the other set's *occupancy* bits
+// into ours, i.e. it intersects the sets of free registers.
+void
+RegisterSet::intersect(DataFile f, const RegisterSet *set)
+{
+   bits[f] |= set->bits[f];
+}
+
+void
+RegisterSet::print() const
+{
+   INFO("GPR:");
+   bits[FILE_GPR].print();
+   INFO("\n");
+}
+
+// Find a free range of 'size' units in file f, store its start in reg and
+// update the high-water mark. Returns false if no free range exists.
+bool
+RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size)
+{
+   reg = bits[f].findFreeRange(size);
+   if (reg < 0)
+      return false;
+   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
+   return true;
+}
+
+bool
+RegisterSet::isOccupied(DataFile f, int32_t reg, unsigned int size) const
+{
+   return bits[f].testRange(reg, size);
+}
+
+void
+RegisterSet::occupy(const Value *v)
+{
+   occupy(v->reg.file, idToUnits(v), v->reg.size >> unit[v->reg.file]);
+}
+
+// Occupy individual units selected by mask, relative to reg's 32-unit block.
+void
+RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask)
+{
+   bits[f].setMask(reg & ~31, static_cast<uint32_t>(mask) << (reg % 32));
+}
+
+void
+RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size)
+{
+   bits[f].setRange(reg, size);
+
+   INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size);
+
+   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
+}
+
+bool
+RegisterSet::testOccupy(const Value *v)
+{
+   return testOccupy(v->reg.file,
+                     idToUnits(v), v->reg.size >> unit[v->reg.file]);
+}
+
+// Occupy the range only if it is currently free; returns whether it was.
+bool
+RegisterSet::testOccupy(DataFile f, int32_t reg, unsigned int size)
+{
+   if (isOccupied(f, reg, size))
+      return false;
+   occupy(f, reg, size);
+   return true;
+}
+
+void
+RegisterSet::release(DataFile f, int32_t reg, unsigned int size)
+{
+   bits[f].clrRange(reg, size);
+
+   INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size);
+}
+
+// Top-level register allocation driver: runs phi lowering, call argument
+// binding, live interval construction and constraint insertion over each
+// function before colouring.
+class RegAlloc
+{
+public:
+   RegAlloc(Program *program) : prog(program), sequence(0) { }
+
+   bool exec();
+   bool execFunc();
+
+private:
+   // lowers phi nodes by inserting MOVs in the predecessor blocks
+   class PhiMovesPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+      inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p);
+   };
+
+   // binds call inputs/outputs to the callee's registers
+   class ArgumentMovesPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+   };
+
+   // computes the live interval of every LValue
+   class BuildIntervalsPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+      void collectLiveValues(BasicBlock *);
+      void addLiveRange(Value *, const BasicBlock *, int end);
+   };
+
+   // inserts MOVs/constraints so that values which must end up in
+   // consecutive or identical registers can be coalesced later
+   class InsertConstraintsPass : public Pass {
+   public:
+      bool exec(Function *func);
+   private:
+      virtual bool visit(BasicBlock *);
+
+      bool insertConstraintMoves();
+
+      void condenseDefs(Instruction *);
+      void condenseSrcs(Instruction *, const int first, const int last);
+
+      void addHazard(Instruction *i, const ValueRef *src);
+      void textureMask(TexInstruction *);
+      void addConstraint(Instruction *, int s, int n);
+      bool detectConflict(Instruction *, int s);
+
+      // target specific functions, TODO: put in subclass or Target
+      void texConstraintNV50(TexInstruction *);
+      void texConstraintNVC0(TexInstruction *);
+      void texConstraintNVE0(TexInstruction *);
+
+      std::list<Instruction *> constrList;
+
+      const Target *targ;
+   };
+
+   bool buildLiveSets(BasicBlock *);
+
+private:
+   Program *prog;
+   Function *func;
+
+   // instructions in control flow / chronological order
+   ArrayList insns;
+
+   int sequence; // for manual passes through CFG
+};
+
+typedef std::pair<Value *, Value *> ValuePair;
+
+// Inserts spill/unspill code for values that did not get a register and
+// manages the stack slots used for spilling.
+class SpillCodeInserter
+{
+public:
+   SpillCodeInserter(Function *fn) : func(fn), stackSize(0), stackBase(0) { }
+
+   bool run(const std::list<ValuePair>&);
+
+   Symbol *assignSlot(const Interval&, const unsigned int size);
+   inline int32_t getStackSize() const { return stackSize; }
+
+private:
+   Function *func;
+
+   // a stack slot that may be shared by values with disjoint live ranges
+   struct SpillSlot
+   {
+      Interval occup;
+      std::list<Value *> residents; // needed to recalculate occup
+      Symbol *sym;
+      int32_t offset;
+      inline uint8_t size() const { return sym->reg.size; }
+   };
+   std::list<SpillSlot> slots;
+   int32_t stackSize;
+   int32_t stackBase;
+
+   LValue *unspill(Instruction *usei, LValue *, Value *slot);
+   void spill(Instruction *defi, Value *slot, LValue *);
+};
+
+// Extend val's live interval inside bb: from its defining instruction (or
+// the block entry if it is defined outside bb) up to 'end' (exclusive).
+void
+RegAlloc::BuildIntervalsPass::addLiveRange(Value *val,
+                                           const BasicBlock *bb,
+                                           int end)
+{
+   Instruction *insn = val->getUniqueInsn();
+
+   if (!insn)
+      insn = bb->getFirst();
+
+   assert(bb->getFirst()->serial <= bb->getExit()->serial);
+   assert(bb->getExit()->serial + 1 >= end);
+
+   int begin = insn->serial;
+   if (begin < bb->getEntry()->serial || begin > bb->getExit()->serial)
+      begin = bb->getEntry()->serial;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "%%%i <- live range [%i(%i), %i)\n",
+            val->id, begin, insn->serial, end);
+
+   if (begin != end) // empty ranges are only added as hazards for fixed regs
+      val->livei.extend(begin, end);
+}
+
+// Does the edge p->b need to be split with a new block ? True when b has
+// multiple predecessors and p has two tree/forward edges out, i.e. the
+// edge is critical and phi moves could not be placed in p alone.
+bool
+RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)
+{
+   if (b->cfg.incidentCount() <= 1)
+      return false;
+
+   int n = 0;
+   for (Graph::EdgeIterator ei = p->cfg.outgoing(); !ei.end(); ei.next())
+      if (ei.getType() == Graph::Edge::TREE ||
+          ei.getType() == Graph::Edge::FORWARD)
+         ++n;
+   return (n == 2);
+}
+
+// For each operand of each PHI in b, generate a new value by inserting a MOV
+// at the end of the block it is coming from and replace the operand with its
+// result. This eliminates liveness conflicts and enables us to let values be
+// copied to the right register if such a conflict exists nonetheless.
+//
+// These MOVs are also crucial in making sure the live intervals of phi
+// sources are extended until the end of the loop, since they are not
+// included in the live-in sets.
+bool
+RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
+{
+   Instruction *phi, *mov;
+   BasicBlock *pb, *pn;
+
+   std::stack<BasicBlock *> stack;
+
+   // split critical edges: insert a new block between bb and any
+   // predecessor that also branches elsewhere
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = BasicBlock::get(ei.getNode());
+      assert(pb);
+      if (needNewElseBlock(bb, pb))
+         stack.push(pb);
+   }
+   while (!stack.empty()) {
+      pb = stack.top();
+      pn = new BasicBlock(func);
+      stack.pop();
+
+      pb->cfg.detach(&bb->cfg);
+      pb->cfg.attach(&pn->cfg, Graph::Edge::TREE);
+      pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD);
+
+      assert(pb->getExit()->op != OP_CALL);
+      if (pb->getExit()->asFlow()->target.bb == bb)
+         pb->getExit()->asFlow()->target.bb = pn;
+   }
+
+   // insert MOVs (phi->src(j) should stem from j-th in-BB)
+   int j = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = BasicBlock::get(ei.getNode());
+      if (!pb->isTerminated())
+         pb->insertTail(new_FlowInstruction(func, OP_BRA, bb));
+
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         mov = new_Instruction(func, OP_MOV, TYPE_U32);
+
+         mov->setSrc(0, phi->getSrc(j));
+         mov->setDef(0, new_LValue(func, phi->getDef(0)->asLValue()));
+         phi->setSrc(j, mov->getDef(0));
+
+         pb->insertBefore(pb->getExit(), mov);
+      }
+      ++j;
+   }
+
+   return true;
+}
+
+bool
+RegAlloc::ArgumentMovesPass::visit(BasicBlock *bb)
+{
+   // Bind function call inputs/outputs to the same physical register
+   // the callee uses, inserting moves as appropriate for the case a
+   // conflict arises.
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      FlowInstruction *cal = i->asFlow();
+      // TODO: Handle indirect calls.
+      // Right now they should only be generated for builtins.
+      if (!cal || cal->op != OP_CALL || cal->builtin || cal->indirect)
+         continue;
+      RegisterSet clobberSet(prog->getTarget());
+
+      // Bind input values: copy each argument into a temporary pinned to
+      // the register id of the callee's corresponding input.
+      for (int s = cal->indirect ? 1 : 0; cal->srcExists(s); ++s) {
+         const int t = cal->indirect ? (s - 1) : s;
+         LValue *tmp = new_LValue(func, cal->getSrc(s)->asLValue());
+         tmp->reg.data.id = cal->target.fn->ins[t].rep()->reg.data.id;
+
+         Instruction *mov =
+            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
+         mov->setDef(0, tmp);
+         mov->setSrc(0, cal->getSrc(s));
+         cal->setSrc(s, tmp);
+
+         bb->insertBefore(cal, mov);
+      }
+
+      // Bind output values.
+      for (int d = 0; cal->defExists(d); ++d) {
+         LValue *tmp = new_LValue(func, cal->getDef(d)->asLValue());
+         tmp->reg.data.id = cal->target.fn->outs[d].rep()->reg.data.id;
+
+         Instruction *mov =
+            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
+         mov->setSrc(0, tmp);
+         mov->setDef(0, cal->getDef(d));
+         cal->setDef(d, tmp);
+
+         bb->insertAfter(cal, mov);
+         clobberSet.occupy(tmp);
+      }
+
+      // Bind clobbered values: add extra defs on the call for registers
+      // the callee clobbers that are not already covered by outputs.
+      for (std::deque<Value *>::iterator it = cal->target.fn->clobbers.begin();
+           it != cal->target.fn->clobbers.end();
+           ++it) {
+         if (clobberSet.testOccupy(*it)) {
+            Value *tmp = new_LValue(func, (*it)->asLValue());
+            tmp->reg.data.id = (*it)->reg.data.id;
+            cal->setDef(cal->defCount(), tmp);
+         }
+      }
+   }
+
+   // Update the clobber set of the function.
+   if (BasicBlock::get(func->cfgExit) == bb) {
+      func->buildDefSets();
+      for (unsigned int i = 0; i < bb->defSet.getSize(); ++i)
+         if (bb->defSet.test(i))
+            func->clobbers.push_back(func->getLValue(i));
+   }
+
+   return true;
+}
+
+// Build the set of live-in variables of bb, recursing depth-first into
+// successors (guarded by the CFG visit sequence) and then walking the
+// block's instructions backwards.
+bool
+RegAlloc::buildLiveSets(BasicBlock *bb)
+{
+   Function *f = bb->getFunction();
+   BasicBlock *bn;
+   Instruction *i;
+   unsigned int s, d;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "buildLiveSets(BB:%i)\n", bb->getId());
+
+   bb->liveSet.allocate(func->allLValues.getSize(), false);
+
+   // start with the union of the live-in sets of all successors
+   int n = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      bn = BasicBlock::get(ei.getNode());
+      if (bn == bb)
+         continue;
+      if (bn->cfg.visit(sequence))
+         if (!buildLiveSets(bn))
+            return false;
+      if (n++ || bb->liveSet.marker)
+         bb->liveSet |= bn->liveSet;
+      else
+         bb->liveSet = bn->liveSet;
+   }
+   if (!n && !bb->liveSet.marker)
+      bb->liveSet.fill(0);
+   bb->liveSet.marker = true;
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("BB:%i live set of out blocks:\n", bb->getId());
+      bb->liveSet.print();
+   }
+
+   // if (!bb->getEntry())
+   //   return true;
+
+   // the function's outputs are live at the exit block
+   if (bb == BasicBlock::get(f->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = f->outs.begin();
+           it != f->outs.end(); ++it) {
+         assert(it->get()->asLValue());
+         bb->liveSet.set(it->get()->id);
+      }
+   }
+
+   // walk backwards: definitions kill liveness, uses create it
+   for (i = bb->getExit(); i && i != bb->getEntry()->prev; i = i->prev) {
+      for (d = 0; i->defExists(d); ++d)
+         bb->liveSet.clr(i->getDef(d)->id);
+      for (s = 0; i->srcExists(s); ++s)
+         if (i->getSrc(s)->asLValue())
+            bb->liveSet.set(i->getSrc(s)->id);
+   }
+   for (i = bb->getPhi(); i && i->op == OP_PHI; i = i->next)
+      bb->liveSet.clr(i->getDef(0)->id);
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("BB:%i live set after propagation:\n", bb->getId());
+      bb->liveSet.print();
+   }
+
+   return true;
+}
+
+// Initialize bb->liveSet to the union of the live-in sets of its
+// (non-dummy) successors; blocks without successors start empty.
+void
+RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb)
+{
+   BasicBlock *bbA = NULL, *bbB = NULL;
+
+   if (bb->cfg.outgoingCount()) {
+      // trickery to save a loop of OR'ing liveSets
+      // aliasing works fine with BitSet::setOr
+      for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+         if (ei.getType() == Graph::Edge::DUMMY)
+            continue;
+         if (bbA) {
+            bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet);
+            bbA = bb;
+         } else {
+            bbA = bbB;
+         }
+         bbB = BasicBlock::get(ei.getNode());
+      }
+      bb->liveSet.setOr(&bbB->liveSet, bbA ? &bbA->liveSet : NULL);
+   } else
+   if (bb->cfg.incidentCount()) {
+      bb->liveSet.fill(0);
+   }
+}
+
+// Turn live sets into per-value live intervals by walking bb's
+// instructions backwards, starting from the block's live-out set.
+bool
+RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb)
+{
+   collectLiveValues(bb);
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "BuildIntervals(BB:%i)\n", bb->getId());
+
+   // go through out blocks and delete phi sources that do not originate from
+   // the current block from the live set
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+
+      for (Instruction *i = out->getPhi(); i && i->op == OP_PHI; i = i->next) {
+         bb->liveSet.clr(i->getDef(0)->id);
+
+         for (int s = 0; i->srcExists(s); ++s) {
+            assert(i->src(s).getInsn());
+            if (i->getSrc(s)->getUniqueInsn()->bb == bb) // XXX: reachableBy ?
+               bb->liveSet.set(i->getSrc(s)->id);
+            else
+               bb->liveSet.clr(i->getSrc(s)->id);
+         }
+      }
+   }
+
+   // remaining live-outs are live until end
+   if (bb->getExit()) {
+      for (unsigned int j = 0; j < bb->liveSet.getSize(); ++j)
+         if (bb->liveSet.test(j))
+            addLiveRange(func->getLValue(j), bb, bb->getExit()->serial + 1);
+   }
+
+   for (Instruction *i = bb->getExit(); i && i->op != OP_PHI; i = i->prev) {
+      for (int d = 0; i->defExists(d); ++d) {
+         bb->liveSet.clr(i->getDef(d)->id);
+         if (i->getDef(d)->reg.data.id >= 0) // add hazard for fixed regs
+            i->getDef(d)->livei.extend(i->serial, i->serial);
+      }
+
+      for (int s = 0; i->srcExists(s); ++s) {
+         if (!i->getSrc(s)->asLValue())
+            continue;
+         if (!bb->liveSet.test(i->getSrc(s)->id)) {
+            bb->liveSet.set(i->getSrc(s)->id);
+            addLiveRange(i->getSrc(s), bb, i->serial);
+         }
+      }
+   }
+
+   if (bb == BasicBlock::get(func->cfg.getRoot())) {
+      for (std::deque<ValueDef>::iterator it = func->ins.begin();
+           it != func->ins.end(); ++it) {
+         if (it->get()->reg.data.id >= 0) // add hazard for fixed regs
+            it->get()->livei.extend(0, 1);
+      }
+   }
+
+   return true;
+}
+
+
+// selectors for doCoalesce(): which kinds of operand joins to attempt
+#define JOIN_MASK_PHI (1 << 0)
+#define JOIN_MASK_UNION (1 << 1)
+#define JOIN_MASK_MOV (1 << 2)
+#define JOIN_MASK_TEX (1 << 3)
+
+// Graph-colouring register allocator over a register interference graph
+// (RIG); values that cannot be coloured are handed to SpillCodeInserter.
+class GCRA
+{
+public:
+   GCRA(Function *, SpillCodeInserter&);
+   ~GCRA();
+
+   bool allocateRegisters(ArrayList& insns);
+
+   void printNodeInfo() const;
+
+private:
+   // one RIG node per representative LValue; edges mark interference
+   class RIG_Node : public Graph::Node
+   {
+   public:
+      RIG_Node();
+
+      void init(const RegisterSet&, LValue *);
+
+      void addInterference(RIG_Node *);
+      void addRegPreference(RIG_Node *);
+
+      inline LValue *getValue() const
+      {
+         return reinterpret_cast<LValue *>(data);
+      }
+      inline void setValue(LValue *lval) { data = lval; }
+
+      inline uint8_t getCompMask() const
+      {
+         return ((1 << colors) - 1) << (reg & 7);
+      }
+
+      static inline RIG_Node *get(const Graph::EdgeIterator& ei)
+      {
+         return static_cast<RIG_Node *>(ei.getNode());
+      }
+
+   public:
+      uint32_t degree;
+      uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable
+      uint16_t colors; // number of allocation units the value occupies
+
+      DataFile f;
+      int32_t reg; // assigned register (in units), -1 while unassigned
+
+      float weight; // spill cost; lower means better spill candidate
+
+      // list pointers for simplify() phase
+      RIG_Node *next;
+      RIG_Node *prev;
+
+      // union of the live intervals of all coalesced values (we want to retain
+      // the separate intervals for testing interference of compound values)
+      Interval livei;
+
+      std::list<RIG_Node *> prefRegs;
+   };
+
+private:
+   inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; }
+
+   void buildRIG(ArrayList&);
+   bool coalesce(ArrayList&);
+   bool doCoalesce(ArrayList&, unsigned int mask);
+   void calculateSpillWeights();
+   void simplify();
+   bool selectRegisters();
+   void cleanup(const bool success);
+
+   void simplifyEdge(RIG_Node *, RIG_Node *);
+   void simplifyNode(RIG_Node *);
+
+   bool coalesceValues(Value *, Value *, bool force);
+   void resolveSplitsAndMerges();
+   void makeCompound(Instruction *, bool isSplit);
+
+   inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&);
+
+   inline void insertOrderedTail(std::list<RIG_Node *>&, RIG_Node *);
+   void checkList(std::list<RIG_Node *>&);
+
+private:
+   std::stack<uint32_t> stack; // node ids pushed by simplify(), popped by select
+
+   // list headers for simplify() phase
+   RIG_Node lo[2];
+   RIG_Node hi;
+
+   Graph RIG;
+   RIG_Node *nodes;
+   unsigned int nodeCount;
+
+   Function *func;
+   Program *prog;
+
+   static uint8_t relDegree[17][17];
+
+   RegisterSet regs;
+
+   // need to fixup register id for participants of OP_MERGE/SPLIT
+   std::list<Instruction *> merges;
+   std::list<Instruction *> splits;
+
+   SpillCodeInserter& spill;
+   std::list<ValuePair> mustSpill;
+};
+
+uint8_t GCRA::relDegree[17][17];
+
+GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this)
+{
+   colors = 0;
+}
+
+// Debug dump of every coloured RIG node: file/reg, colours, spill weight,
+// degree and the ids of all interfering values.
+void
+GCRA::printNodeInfo() const
+{
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      if (!nodes[i].colors)
+         continue;
+      INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X",
+           i,
+           nodes[i].f,nodes[i].reg,nodes[i].colors,
+           nodes[i].weight,
+           nodes[i].degree, nodes[i].degreeLimit);
+
+      for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next())
+         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
+      for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next())
+         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
+      INFO("\n");
+   }
+}
+
+// Initialize a RIG node from an LValue: colours = allocation units needed,
+// values with a pre-assigned register are pinned (noSpill/fixedReg), and
+// the spill weight stays infinite until calculateSpillWeights().
+void
+GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
+{
+   setValue(lval);
+   if (lval->reg.data.id >= 0)
+      lval->noSpill = lval->fixedReg = 1;
+
+   colors = regs.units(lval->reg.file, lval->reg.size);
+   f = lval->reg.file;
+   reg = -1;
+   if (lval->reg.data.id >= 0)
+      reg = regs.idToUnits(lval);
+
+   weight = std::numeric_limits<float>::infinity();
+   degree = 0;
+   degreeLimit = regs.getFileSize(f, lval->reg.size);
+
+   livei.insert(lval->livei);
+}
+
+// Try to put dst and src into the same register by joining their values.
+// Unless 'force' is set, fail on differing files or sizes, conflicting
+// fixed registers, or overlapping live intervals.
+bool
+GCRA::coalesceValues(Value *dst, Value *src, bool force)
+{
+   LValue *rep = dst->join->asLValue();
+   LValue *val = src->join->asLValue();
+
+   // make the value with a fixed register the representative
+   if (!force && val->reg.data.id >= 0) {
+      rep = src->join->asLValue();
+      val = dst->join->asLValue();
+   }
+   RIG_Node *nRep = &nodes[rep->id];
+   RIG_Node *nVal = &nodes[val->id];
+
+   if (src->reg.file != dst->reg.file) {
+      if (!force)
+         return false;
+      WARN("forced coalescing of values in different files !\n");
+   }
+   if (!force && dst->reg.size != src->reg.size)
+      return false;
+
+   if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) {
+      if (force) {
+         if (val->reg.data.id >= 0)
+            WARN("forced coalescing of values in different fixed regs !\n");
+      } else {
+         if (val->reg.data.id >= 0)
+            return false;
+         // make sure that there is no overlap with the fixed register of rep
+         for (ArrayList::Iterator it = func->allLValues.iterator();
+              !it.end(); it.next()) {
+            Value *reg = reinterpret_cast<Value *>(it.get())->asLValue();
+            assert(reg);
+            if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei))
+               return false;
+         }
+      }
+   }
+
+   if (!force && nRep->livei.overlaps(nVal->livei))
+      return false;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n",
+            rep->id, rep->reg.data.id, val->id);
+
+   // set join pointer of all values joined with val
+   for (Value::DefIterator def = val->defs.begin(); def != val->defs.end();
+        ++def)
+      (*def)->get()->join = rep;
+   assert(rep->join == rep && val->join == rep);
+
+   // add val's definitions to rep and extend the live interval of its RIG node
+   rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end());
+   nRep->livei.unify(nVal->livei);
+   return true;
+}
+
+// Run the coalescing phases; which join masks apply depends on the target
+// chipset (pre-nvc0 also coalesces texture arguments).
+bool
+GCRA::coalesce(ArrayList& insns)
+{
+   bool ret = doCoalesce(insns, JOIN_MASK_PHI);
+   if (!ret)
+      return false;
+   switch (func->getProgram()->getTarget()->getChipset() & ~0xf) {
+   case 0x50:
+   case 0x80:
+   case 0x90:
+   case 0xa0:
+      ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX);
+      break;
+   case 0xc0:
+   case 0xd0:
+   case 0xe0:
+      ret = doCoalesce(insns, JOIN_MASK_UNION);
+      break;
+   default:
+      break;
+   }
+   if (!ret)
+      return false;
+   return doCoalesce(insns, JOIN_MASK_MOV);
+}
+
+// Build the 8-bit mask of register slots a component occupies inside a
+// compound of compSize units, given its base offset and size in units;
+// the pattern is replicated so any aligned placement can be tested.
+static inline uint8_t makeCompMask(int compSize, int base, int size)
+{
+   uint8_t m = ((1 << size) - 1) << base;
+
+   switch (compSize) {
+   case 1:
+      return 0xff;
+   case 2:
+      m |= (m << 2);
+      return (m << 4) | m;
+   case 3:
+   case 4:
+      return (m << 4) | m;
+   default:
+      assert(compSize <= 8);
+      return m;
+   }
+}
+
+// Used when coalescing moves. The non-compound value will become one, e.g.:
+// mov b32 $r0 $r2 / merge b64 $r0d { $r0 $r1 }
+// split b64 { $r0 $r1 } $r0d / mov b64 $r0d f64 $r2d
+static inline void copyCompound(Value *dst, Value *src)
+{
+   LValue *ldst = dst->asLValue();
+   LValue *lsrc = src->asLValue();
+
+   // copy from the compound side to the non-compound side
+   if (ldst->compound && !lsrc->compound) {
+      LValue *swap = lsrc;
+      lsrc = ldst;
+      ldst = swap;
+   }
+
+   ldst->compound = lsrc->compound;
+   ldst->compMask = lsrc->compMask;
+}
+
+// Mark the representative and all components of a MERGE/SPLIT as compound
+// values and compute each component's position mask inside the compound.
+void
+GCRA::makeCompound(Instruction *insn, bool split)
+{
+   LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue();
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("makeCompound(split = %i): ", split);
+      insn->print();
+   }
+
+   const unsigned int size = getNode(rep)->colors;
+   unsigned int base = 0;
+
+   if (!rep->compound)
+      rep->compMask = 0xff;
+   rep->compound = 1;
+
+   for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) {
+      LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue();
+
+      val->compound = 1;
+      if (!val->compMask)
+         val->compMask = 0xff;
+      val->compMask &= makeCompMask(size, base, getNode(val)->colors);
+      assert(val->compMask);
+
+      INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n",
+               rep->id, rep->compMask, val->id, val->compMask);
+
+      base += getNode(val)->colors;
+   }
+   assert(base == size);
+}
+
+// One coalescing sweep over all instructions; 'mask' selects which kinds
+// of operations (phi, union/merge/split, mov, tex) get their operands
+// joined. Returns false only on a failed (mandatory) phi coalesce.
+bool
+GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
+{
+   int c, n;
+
+   for (n = 0; n < insns.getSize(); ++n) {
+      Instruction *i;
+      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(n));
+
+      switch (insn->op) {
+      case OP_PHI:
+         if (!(mask & JOIN_MASK_PHI))
+            break;
+         for (c = 0; insn->srcExists(c); ++c)
+            if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) {
+               // this is bad
+               ERROR("failed to coalesce phi operands\n");
+               return false;
+            }
+         break;
+      case OP_UNION:
+      case OP_MERGE:
+         if (!(mask & JOIN_MASK_UNION))
+            break;
+         for (c = 0; insn->srcExists(c); ++c)
+            coalesceValues(insn->getDef(0), insn->getSrc(c), true);
+         if (insn->op == OP_MERGE) {
+            merges.push_back(insn);
+            if (insn->srcExists(1))
+               makeCompound(insn, false);
+         }
+         break;
+      case OP_SPLIT:
+         if (!(mask & JOIN_MASK_UNION))
+            break;
+         splits.push_back(insn);
+         for (c = 0; insn->defExists(c); ++c)
+            coalesceValues(insn->getSrc(0), insn->getDef(c), true);
+         makeCompound(insn, true);
+         break;
+      case OP_MOV:
+         if (!(mask & JOIN_MASK_MOV))
+            break;
+         i = NULL;
+         if (!insn->getDef(0)->uses.empty())
+            i = insn->getDef(0)->uses.front()->getInsn();
+         // if this is a constraint-move there will only be a single use
+         if (i && i->op == OP_MERGE) // do we really still need this ?
+            break;
+         i = insn->getSrc(0)->getUniqueInsn();
+         if (i && !i->constrainedDefs()) {
+            if (coalesceValues(insn->getDef(0), insn->getSrc(0), false))
+               copyCompound(insn->getSrc(0), insn->getDef(0));
+         }
+         break;
+      case OP_TEX:
+      case OP_TXB:
+      case OP_TXL:
+      case OP_TXF:
+      case OP_TXQ:
+      case OP_TXD:
+      case OP_TXG:
+      case OP_TEXCSAA:
+         if (!(mask & JOIN_MASK_TEX))
+            break;
+         for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)
+            coalesceValues(insn->getDef(c), insn->getSrc(c), true);
+         break;
+      default:
+         break;
+      }
+   }
+   return true;
+}
+
+// Record interference between this node and 'node', raising both degrees
+// by the relative degree of the respective colour counts.
+void
+GCRA::RIG_Node::addInterference(RIG_Node *node)
+{
+   this->degree += relDegree[node->colors][colors];
+   node->degree += relDegree[colors][node->colors];
+
+   this->attach(node, Graph::Edge::CROSS);
+}
+
+// Remember a node whose register we would prefer to share.
+void
+GCRA::RIG_Node::addRegPreference(RIG_Node *node)
+{
+   prefRegs.push_back(node);
+}
+
+// Construct the allocator for one function and precompute the relative
+// degree table used when accounting interference between nodes with
+// different colour counts.
+GCRA::GCRA(Function *fn, SpillCodeInserter& spill) :
+   func(fn),
+   regs(fn->getProgram()->getTarget()),
+   spill(spill)
+{
+   prog = func->getProgram();
+
+   // defensively initialize: ~GCRA() reads 'nodes', which would otherwise
+   // be indeterminate if this object is destroyed before allocation runs
+   nodes = NULL;
+   nodeCount = 0;
+
+   // initialize relative degrees array - i takes away from j
+   for (int i = 1; i <= 16; ++i)
+      for (int j = 1; j <= 16; ++j)
+         relDegree[i][j] = j * ((i + j - 1) / j);
+}
+
+GCRA::~GCRA()
+{
+   // NOTE(review): 'nodes' is not initialized by the constructor in this
+   // file; if a GCRA can be destroyed before the node array is allocated,
+   // this reads an indeterminate pointer — confirm, or NULL-init it.
+   if (nodes)
+      delete[] nodes;
+}
+
+// Debug check: the list must contain only self-representative values,
+// sorted by ascending live interval start.
+void
+GCRA::checkList(std::list<RIG_Node *>& lst)
+{
+   GCRA::RIG_Node *prev = NULL;
+
+   for (std::list<RIG_Node *>::iterator it = lst.begin();
+        it != lst.end();
+        ++it) {
+      assert((*it)->getValue()->join == (*it)->getValue());
+      if (prev)
+         assert(prev->livei.begin() <= (*it)->livei.begin());
+      prev = *it;
+   }
+}
+
+// Insert node into list keeping it sorted by live interval start; search
+// from the tail since nodes mostly arrive in order already.
+void
+GCRA::insertOrderedTail(std::list<RIG_Node *>& list, RIG_Node *node)
+{
+   if (node->livei.isEmpty())
+      return;
+   // only the intervals of joined values don't necessarily arrive in order
+   std::list<RIG_Node *>::iterator prev, it;
+   for (it = list.end(); it != list.begin(); it = prev) {
+      prev = it;
+      --prev;
+      if ((*prev)->livei.begin() <= node->livei.begin())
+         break;
+   }
+   list.insert(it, node);
+}
+
+// Build the register interference graph: scan the nodes in order of live
+// interval start, keeping an 'active' list of intervals that may still
+// overlap the current one; overlapping same-file nodes interfere.
+void
+GCRA::buildRIG(ArrayList& insns)
+{
+   std::list<RIG_Node *> values, active;
+
+   for (std::deque<ValueDef>::iterator it = func->ins.begin();
+        it != func->ins.end(); ++it)
+      insertOrderedTail(values, getNode(it->get()->asLValue()));
+
+   for (int i = 0; i < insns.getSize(); ++i) {
+      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(i));
+      for (int d = 0; insn->defExists(d); ++d)
+         if (insn->getDef(d)->rep() == insn->getDef(d))
+            insertOrderedTail(values, getNode(insn->getDef(d)->asLValue()));
+   }
+   checkList(values);
+
+   while (!values.empty()) {
+      RIG_Node *cur = values.front();
+
+      for (std::list<RIG_Node *>::iterator it = active.begin();
+           it != active.end();) {
+         RIG_Node *node = *it;
+
+         if (node->livei.end() <= cur->livei.begin()) {
+            it = active.erase(it);
+         } else {
+            if (node->f == cur->f && node->livei.overlaps(cur->livei))
+               cur->addInterference(node);
+            ++it;
+         }
+      }
+      values.pop_front();
+      active.push_back(cur);
+   }
+}
+
+// Compute spill weights (refcount^2 / live range extent) for spillable
+// nodes and partition them into the trivially colourable lists (lo[0]:
+// regs <= 4 bytes, lo[1]: larger) and the potential spill list (hi).
+void
+GCRA::calculateSpillWeights()
+{
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      RIG_Node *const n = &nodes[i];
+      if (!nodes[i].colors || nodes[i].livei.isEmpty())
+         continue;
+      if (nodes[i].reg >= 0) {
+         // update max reg
+         regs.occupy(n->f, n->reg, n->colors);
+         continue;
+      }
+      LValue *val = nodes[i].getValue();
+
+      if (!val->noSpill) {
+         int rc = 0;
+         for (Value::DefIterator it = val->defs.begin();
+              it != val->defs.end();
+              ++it)
+            rc += (*it)->get()->refCount();
+
+         nodes[i].weight =
+            (float)rc * (float)rc / (float)nodes[i].livei.extent();
+      }
+
+      if (nodes[i].degree < nodes[i].degreeLimit) {
+         int l = 0;
+         if (val->reg.size > 4)
+            l = 1;
+         DLLIST_ADDHEAD(&lo[l], &nodes[i]);
+      } else {
+         DLLIST_ADDHEAD(&hi, &nodes[i]);
+      }
+   }
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+      printNodeInfo();
+}
+
+// Account for the removal of node a from the RIG: lower b's degree and,
+// if b just became trivially colourable, move it to the proper lo list.
+void
+GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b)
+{
+   bool move = b->degree >= b->degreeLimit;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC,
+            "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n",
+            a->getValue()->id, a->degree, a->degreeLimit,
+            b->getValue()->id, b->degree, b->degreeLimit);
+
+   b->degree -= relDegree[a->colors][b->colors];
+
+   move = move && b->degree < b->degreeLimit;
+   if (move && !DLLIST_EMPTY(b)) {
+      int l = (b->getValue()->reg.size > 4) ? 1 : 0;
+      DLLIST_DEL(b);
+      DLLIST_ADDTAIL(&lo[l], b);
+   }
+}
+
+// Remove a node from the RIG and push it on the select stack; its
+// neighbours' degrees are updated via simplifyEdge().
+void
+GCRA::simplifyNode(RIG_Node *node)
+{
+   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
+      simplifyEdge(node, RIG_Node::get(ei));
+
+   for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
+      simplifyEdge(node, RIG_Node::get(ei));
+
+   DLLIST_DEL(node);
+   stack.push(node->getValue()->id);
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n",
+            node->getValue()->id,
+            (node->degree < node->degreeLimit) ? "" : "(spill)");
+}
+
+// Simplify phase: repeatedly remove trivially colourable nodes (small
+// registers first); when only constrained nodes remain, optimistically
+// remove the one with the lowest weight/degree ratio as spill candidate.
+void
+GCRA::simplify()
+{
+   for (;;) {
+      if (!DLLIST_EMPTY(&lo[0])) {
+         do {
+            simplifyNode(lo[0].next);
+         } while (!DLLIST_EMPTY(&lo[0]));
+      } else
+      if (!DLLIST_EMPTY(&lo[1])) {
+         simplifyNode(lo[1].next);
+      } else
+      if (!DLLIST_EMPTY(&hi)) {
+         RIG_Node *best = hi.next;
+         float bestScore = best->weight / (float)best->degree;
+         // spill candidate
+         for (RIG_Node *it = best->next; it != &hi; it = it->next) {
+            float score = it->weight / (float)it->degree;
+            if (score < bestScore) {
+               best = it;
+               bestScore = score;
+            }
+         }
+         // infinite score means only no-spill values are left
+         if (isinf(bestScore)) {
+            ERROR("no viable spill candidates left\n");
+            break;
+         }
+         simplifyNode(best);
+      } else {
+         break;
+      }
+   }
+}
+
+// Mark the register range of an already-coloured neighbour as occupied;
+// for compound values only the sub-registers whose components actually
+// have overlapping live intervals are blocked.
+void
+GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei)
+{
+   const RIG_Node *intf = RIG_Node::get(ei);
+
+   if (intf->reg < 0)
+      return;
+   const LValue *vA = node->getValue();
+   const LValue *vB = intf->getValue();
+
+   const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7);
+
+   if (vA->compound | vB->compound) {
+      // NOTE: this only works for >aligned< register tuples !
+      for (Value::DefCIterator D = vA->defs.begin(); D != vA->defs.end(); ++D) {
+         for (Value::DefCIterator d = vB->defs.begin(); d != vB->defs.end(); ++d) {
+            const LValue *vD = (*D)->get()->asLValue();
+            const LValue *vd = (*d)->get()->asLValue();
+
+            if (!vD->livei.overlaps(vd->livei)) {
+               INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n",
+                        vD->id, vd->id);
+               continue;
+            }
+
+            uint8_t mask = vD->compound ? vD->compMask : ~0;
+            if (vd->compound) {
+               assert(vB->compound);
+               mask &= vd->compMask & vB->compMask;
+            } else {
+               mask &= intfMask;
+            }
+
+            INFO_DBG(prog->dbgFlags, REG_ALLOC,
+                     "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n",
+                     vD->id,
+                     vD->compound ? vD->compMask : 0xff,
+                     vd->id,
+                     vd->compound ? vd->compMask : intfMask,
+                     vB->compMask, intf->reg & ~7, mask);
+            if (mask)
+               regs.occupyMask(node->f, intf->reg & ~7, mask);
+         }
+      }
+   } else {
+      INFO_DBG(prog->dbgFlags, REG_ALLOC,
+               "(%%%i) X (%%%i): $r%i + %u\n",
+               vA->id, vB->id, intf->reg, intf->colors);
+      regs.occupy(node->f, intf->reg, intf->colors);
+   }
+}
+
+// SELECT phase: pop nodes from the simplify stack (reverse removal order)
+// and assign each a register not occupied by its already coloured
+// neighbours. Nodes that cannot be coloured are queued in mustSpill.
+// Returns false if any node must be spilled.
+bool
+GCRA::selectRegisters()
+{
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n");
+
+   while (!stack.empty()) {
+      RIG_Node *node = &nodes[stack.top()];
+      stack.pop();
+
+      regs.reset(node->f);
+
+      INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n",
+               node->getValue()->id, node->colors);
+
+      // block all units taken by interfering, already coloured nodes
+      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
+         checkInterference(node, ei);
+      for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
+         checkInterference(node, ei);
+
+      // try preferred registers first (move-related nodes) to eliminate MOVs
+      if (!node->prefRegs.empty()) {
+         for (std::list<RIG_Node *>::const_iterator it = node->prefRegs.begin();
+              it != node->prefRegs.end();
+              ++it) {
+            if ((*it)->reg >= 0 &&
+                regs.testOccupy(node->f, (*it)->reg, node->colors)) {
+               node->reg = (*it)->reg;
+               break;
+            }
+         }
+      }
+      if (node->reg >= 0)
+         continue;
+      LValue *lval = node->getValue();
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         regs.print();
+      bool ret = regs.assign(node->reg, node->f, node->colors);
+      if (ret) {
+         INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
+         lval->compMask = node->getCompMask();
+      } else {
+         INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n",
+                  lval->id, lval->reg.size);
+         Symbol *slot = NULL;
+         // only GPRs get a memory slot; other files are "spilled" to GPRs
+         if (lval->reg.file == FILE_GPR)
+            slot = spill.assignSlot(node->livei, lval->reg.size);
+         mustSpill.push_back(ValuePair(lval, slot));
+      }
+   }
+   if (!mustSpill.empty())
+      return false;
+   // success: translate allocation units back into register ids
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      LValue *lval = nodes[i].getValue();
+      if (nodes[i].reg >= 0 && nodes[i].colors > 0)
+         lval->reg.data.id =
+            regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size);
+   }
+   return true;
+}
+
+// Top-level driver for one colouring attempt: build RIG nodes for all
+// LValues, coalesce, build the interference graph, simplify, then select.
+// On failure, spill code is inserted and the caller is expected to retry.
+bool
+GCRA::allocateRegisters(ArrayList& insns)
+{
+   bool ret;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC,
+            "allocateRegisters to %u instructions\n", insns.getSize());
+
+   nodeCount = func->allLValues.getSize();
+   nodes = new RIG_Node[nodeCount];
+   // NOTE(review): operator new throws on failure, so this check is
+   // likely vestigial (would only trigger with a nothrow new override)
+   if (!nodes)
+      return false;
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      // allLValues may contain holes (deleted values), hence the check
+      LValue *lval = reinterpret_cast<LValue *>(func->allLValues.get(i));
+      if (lval) {
+         nodes[i].init(regs, lval);
+         RIG.insert(&nodes[i]);
+      }
+   }
+
+   // coalesce first, we use only 1 RIG node for a group of joined values
+   ret = coalesce(insns);
+   if (!ret)
+      goto out;
+
+   if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+      func->printLiveIntervals();
+
+   buildRIG(insns);
+   calculateSpillWeights();
+   simplify();
+
+   ret = selectRegisters();
+   if (!ret) {
+      INFO_DBG(prog->dbgFlags, REG_ALLOC,
+               "selectRegisters failed, inserting spill code ...\n");
+      regs.reset(FILE_GPR, true);
+      spill.run(mustSpill);
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         func->print();
+   } else {
+      prog->maxGPR = std::max(prog->maxGPR, regs.getMaxAssigned(FILE_GPR));
+   }
+
+out:
+   cleanup(ret);
+   return ret;
+}
+
+// Release per-attempt state. On success, propagate register ids from join
+// representatives to their coalesced members and resolve split/merge ops;
+// on failure, undo the coalescing so the next attempt starts clean.
+void
+GCRA::cleanup(const bool success)
+{
+   mustSpill.clear();
+
+   for (ArrayList::Iterator it = func->allLValues.iterator();
+        !it.end(); it.next()) {
+      LValue *lval = reinterpret_cast<LValue *>(it.get());
+
+      lval->livei.clear();
+
+      lval->compound = 0;
+      lval->compMask = 0;
+
+      // representatives (join == self) need no fix-up
+      if (lval->join == lval)
+         continue;
+
+      if (success) {
+         // coalesced values share the representative's register
+         lval->reg.data.id = lval->join->reg.data.id;
+      } else {
+         // un-coalesce: give the value back its own defs and identity
+         for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
+              ++d)
+            lval->join->defs.remove(*d);
+         lval->join = lval;
+      }
+   }
+
+   if (success)
+      resolveSplitsAndMerges();
+   splits.clear(); // avoid duplicate entries on next coalesce pass
+   merges.clear();
+
+   delete[] nodes;
+   nodes = NULL;
+}
+
+// Find (or create) a local-memory spill slot of 'size' bytes whose current
+// occupancy does not overlap 'livei'. Existing slots are reused when the
+// live interval fits; otherwise the stack is grown and a new Symbol made.
+// Returns the slot's symbol (address already offset by tlsBase if the
+// function has no stack pointer).
+Symbol *
+SpillCodeInserter::assignSlot(const Interval &livei, const unsigned int size)
+{
+   SpillSlot slot;
+   int32_t offsetBase = stackSize;
+   int32_t offset;
+   std::list<SpillSlot>::iterator pos = slots.end(), it = slots.begin();
+
+   // align the search start to the slot size
+   if (offsetBase % size)
+      offsetBase += size - (offsetBase % size);
+
+   slot.sym = NULL;
+
+   // scan existing slots for a size-aligned window free during 'livei'
+   for (offset = offsetBase; offset < stackSize; offset += size) {
+      const int32_t entryEnd = offset + size;
+      while (it != slots.end() && it->offset < offset)
+         ++it;
+      if (it == slots.end()) // no slots left
+         break;
+      std::list<SpillSlot>::iterator bgn = it;
+
+      while (it != slots.end() && it->offset < entryEnd) {
+         // FIX: dropped stray it->occup.print() here; it emitted debug
+         // output unconditionally, unlike all other diagnostics which are
+         // gated on NV50_IR_DEBUG_REG_ALLOC
+         if (it->occup.overlaps(livei))
+            break;
+         ++it;
+      }
+      if (it == slots.end() || it->offset >= entryEnd) {
+         // fits
+         for (; bgn != slots.end() && bgn->offset < entryEnd; ++bgn) {
+            bgn->occup.insert(livei);
+            if (bgn->size() == size)
+               slot.sym = bgn->sym;
+         }
+         break;
+      }
+   }
+   if (!slot.sym) {
+      // no reusable slot: extend the stack and create a new symbol
+      stackSize = offset + size;
+      slot.offset = offset;
+      slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL);
+      if (!func->stackPtr)
+         offset += func->tlsBase;
+      slot.sym->setAddress(NULL, offset);
+      slot.sym->reg.size = size;
+      slots.insert(pos, slot)->occup.insert(livei);
+   }
+   return slot.sym;
+}
+
+// Insert a store of 'lval' to its spill location right after defining
+// instruction 'defi'. A memory slot becomes an OP_STORE; a register slot
+// (spill-to-GPR) becomes an OP_CVT copy.
+void
+SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
+{
+   const DataType ty = typeOfSize(slot->reg.size);
+
+   Instruction *st;
+   if (slot->reg.file == FILE_MEMORY_LOCAL) {
+      st = new_Instruction(func, OP_STORE, ty);
+      st->setSrc(0, slot);
+      st->setSrc(1, lval);
+      // the spilled value itself must not be chosen for spilling again
+      lval->noSpill = 1;
+   } else {
+      st = new_Instruction(func, OP_CVT, ty);
+      st->setDef(0, slot);
+      st->setSrc(0, lval);
+   }
+   defi->bb->insertAfter(defi, st);
+}
+
+// Reload a spilled value immediately before 'usei'. A fresh clone of
+// 'lval' is defined by the reload (OP_LOAD from memory, or OP_CVT from a
+// spill register) and returned so the use can be redirected to it.
+LValue *
+SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
+{
+   const DataType ty = typeOfSize(slot->reg.size);
+
+   // new value per reload keeps the live range short
+   lval = cloneShallow(func, lval);
+
+   Instruction *ld;
+   if (slot->reg.file == FILE_MEMORY_LOCAL) {
+      lval->noSpill = 1;
+      ld = new_Instruction(func, OP_LOAD, ty);
+   } else {
+      ld = new_Instruction(func, OP_CVT, ty);
+   }
+   ld->setDef(0, lval);
+   ld->setSrc(0, slot);
+
+   usei->bb->insertBefore(usei, ld);
+   return lval;
+}
+
+// Insert spill/reload code for every (value, slot) pair in 'lst'.
+// For each def of a spilled value: rewrite all uses to freshly reloaded
+// clones, then store the def to the slot (or, for PHI defs, write the
+// slot directly / delete the PHI when spilling to memory).
+bool
+SpillCodeInserter::run(const std::list<ValuePair>& lst)
+{
+   for (std::list<ValuePair>::const_iterator it = lst.begin(); it != lst.end();
+        ++it) {
+      LValue *lval = it->first->asLValue();
+      Symbol *mem = it->second ? it->second->asSym() : NULL;
+
+      for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
+           ++d) {
+         // no memory slot means we spill into a register instead
+         Value *slot = mem ?
+            static_cast<Value *>(mem) : new_LValue(func, FILE_GPR);
+         Value *tmp = NULL;
+         Instruction *last = NULL;
+
+         LValue *dval = (*d)->get()->asLValue();
+         Instruction *defi = (*d)->getInsn();
+
+         // handle uses first or they'll contain the spill stores
+         while (!dval->uses.empty()) {
+            ValueRef *u = dval->uses.front();
+            Instruction *usei = u->getInsn();
+            assert(usei);
+            if (usei->op == OP_PHI) {
+               // PHIs read the slot itself (register) or nothing (memory)
+               tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot;
+               last = NULL;
+            } else
+            if (!last || usei != last->next) { // TODO: sort uses
+               // reuse the previous reload only for directly adjacent uses
+               tmp = unspill(usei, dval, slot);
+               last = usei;
+            }
+            u->set(tmp);
+         }
+
+         assert(defi);
+         if (defi->op == OP_PHI) {
+            // erase-then-decrement keeps the DefIterator valid for ++d
+            d = lval->defs.erase(d);
+            --d;
+            if (slot->reg.file == FILE_MEMORY_LOCAL)
+               delete_Instruction(func->getProgram(), defi);
+            else
+               defi->setDef(0, slot);
+         } else {
+            spill(defi, slot, dval);
+         }
+      }
+
+   }
+
+   // TODO: We're not trying to reuse old slots in a potential next iteration.
+   // We have to update the slots' livei intervals to be able to do that.
+   stackBase = stackSize;
+   slots.clear();
+   return true;
+}
+
+// Allocate registers for every function, visiting the call graph in DFS
+// order so each callee's TLS requirements are known before its callers.
+bool
+RegAlloc::exec()
+{
+   IteratorRef fi = prog->calls.iteratorDFS(false);
+   while (!fi->end()) {
+      func = Function::get(reinterpret_cast<Graph::Node *>(fi->get()));
+
+      // this function's TLS area begins where the previous one ended
+      func->tlsBase = prog->tlsSize;
+      if (!execFunc())
+         return false;
+      prog->tlsSize += func->tlsSize;
+      fi->next();
+   }
+   return true;
+}
+
+// Run register allocation for a single function: insert constraints and
+// PHI/argument moves, then iterate (build live sets -> intervals -> GCRA)
+// up to 3 times, inserting spill code between failed attempts.
+bool
+RegAlloc::execFunc()
+{
+   InsertConstraintsPass insertConstr;
+   PhiMovesPass insertPhiMoves;
+   ArgumentMovesPass insertArgMoves;
+   BuildIntervalsPass buildIntervals;
+   SpillCodeInserter insertSpills(func);
+
+   GCRA gcra(func, insertSpills);
+
+   unsigned int i, retries;
+   bool ret;
+
+   if (!func->ins.empty()) {
+      // Insert a nop at the entry so inputs only used by the first instruction
+      // don't count as having an empty live range.
+      Instruction *nop = new_Instruction(func, OP_NOP, TYPE_NONE);
+      BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
+   }
+
+   ret = insertConstr.exec(func);
+   if (!ret)
+      goto out;
+
+   ret = insertPhiMoves.run(func);
+   if (!ret)
+      goto out;
+
+   ret = insertArgMoves.run(func);
+   if (!ret)
+      goto out;
+
+   // TODO: need to fix up spill slot usage ranges to support > 1 retry
+   for (retries = 0; retries < 3; ++retries) {
+      if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC))
+         INFO("Retry: %i\n", retries);
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         func->print();
+
+      // spilling to registers may add live ranges, need to rebuild everything
+      ret = true;
+      // iterate to a fixed point; loopNestingBound bounds the number of
+      // passes needed for liveness to converge across back edges
+      for (sequence = func->cfg.nextSequence(), i = 0;
+           ret && i <= func->loopNestingBound;
+           sequence = func->cfg.nextSequence(), ++i)
+         ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot()));
+      if (!ret)
+         break;
+      func->orderInstructions(this->insns);
+
+      ret = buildIntervals.run(func);
+      if (!ret)
+         break;
+      ret = gcra.allocateRegisters(insns);
+      if (ret)
+         break; // success
+   }
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret);
+
+   func->tlsSize = insertSpills.getStackSize();
+out:
+   return ret;
+}
+
+// TODO: check if modifying Instruction::join here breaks anything
+// After successful colouring, give the components of SPLIT defs and MERGE
+// srcs their final register ids: consecutive registers starting at the id
+// of the compound value, and detach them from their join representative.
+void
+GCRA::resolveSplitsAndMerges()
+{
+   for (std::list<Instruction *>::iterator it = splits.begin();
+        it != splits.end();
+        ++it) {
+      Instruction *split = *it;
+      // byte offset of the compound source's register
+      unsigned int reg = regs.idToBytes(split->getSrc(0));
+      for (int d = 0; split->defExists(d); ++d) {
+         Value *v = split->getDef(d);
+         v->reg.data.id = regs.bytesToId(v, reg);
+         v->join = v;
+         reg += v->reg.size;
+      }
+   }
+   splits.clear();
+
+   for (std::list<Instruction *>::iterator it = merges.begin();
+        it != merges.end();
+        ++it) {
+      Instruction *merge = *it;
+      // byte offset of the compound destination's register
+      unsigned int reg = regs.idToBytes(merge->getDef(0));
+      for (int s = 0; merge->srcExists(s); ++s) {
+         Value *v = merge->getSrc(s);
+         v->reg.data.id = regs.bytesToId(v, reg);
+         v->join = v;
+         reg += v->reg.size;
+      }
+   }
+   merges.clear();
+}
+
+// Entry point: run graph-colouring register allocation over the program.
+bool Program::registerAllocation()
+{
+   RegAlloc allocator(this);
+   return allocator.exec();
+}
+
+// Collect register constraints for the whole function, then insert the
+// moves needed to make conflicting constraints satisfiable.
+bool
+RegAlloc::InsertConstraintsPass::exec(Function *ir)
+{
+   constrList.clear();
+
+   if (!run(ir, true, true))
+      return false;
+   return insertConstraintMoves();
+}
+
+// TODO: make part of texture insn
+// Shrink a texture op's write mask to the components whose defs are
+// actually used, and compact the remaining defs to the front.
+void
+RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
+{
+   Value *def[4];
+   int c, k, d;
+   uint8_t mask = 0;
+
+   // k indexes existing defs (one per set mask bit), c the component
+   for (d = 0, k = 0, c = 0; c < 4; ++c) {
+      if (!(tex->tex.mask & (1 << c)))
+         continue;
+      if (tex->getDef(k)->refCount()) {
+         mask |= 1 << c;
+         def[d++] = tex->getDef(k);
+      }
+      ++k;
+   }
+   tex->tex.mask = mask;
+
+   // keep live defs densely packed, clear the rest
+   for (c = 0; c < d; ++c)
+      tex->setDef(c, def[c]);
+   for (; c < 4; ++c)
+      tex->setDef(c, NULL);
+}
+
+// Decide whether source 's' of constraint op 'cst' needs an extra MOV:
+// true if the value is used by another instruction, occurs again in this
+// constraint, or its defining instruction also has constrained defs.
+bool
+RegAlloc::InsertConstraintsPass::detectConflict(Instruction *cst, int s)
+{
+   Value *v = cst->getSrc(s);
+
+   // current register allocation can't handle it if a value participates in
+   // multiple constraints
+   Value::UseIterator it = v->uses.begin();
+   for (; it != v->uses.end(); ++it)
+      if ((*it)->getInsn() != cst)
+         return true;
+
+   // can start at s + 1 because detectConflict is called on all sources
+   for (int c = s + 1; cst->srcExists(c); ++c)
+      if (cst->getSrc(c) == v)
+         return true;
+
+   Instruction *defi = v->getInsn();
+   if (!defi)
+      return true;
+   return defi->constrainedDefs();
+}
+
+// Route sources s..s+n-1 of 'i' through an OP_CONSTRAINT op so the RA can
+// force them into consecutive registers. An existing identical constraint
+// in a dominating block is reused instead of creating a new one.
+void
+RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n)
+{
+   Instruction *cst;
+   int d;
+
+   // first, look for an existing identical constraint op
+   for (std::list<Instruction *>::iterator it = constrList.begin();
+        it != constrList.end();
+        ++it) {
+      cst = (*it);
+      // constrList is in insertion order; once a candidate no longer
+      // dominates 'i', none of the following will either
+      if (!i->bb->dominatedBy(cst->bb))
+         break;
+      for (d = 0; d < n; ++d)
+         if (cst->getSrc(d) != i->getSrc(d + s))
+            break;
+      if (d >= n) {
+         // full match: reuse the existing constraint's defs
+         for (d = 0; d < n; ++d, ++s)
+            i->setSrc(s, cst->getDef(d));
+         return;
+      }
+   }
+   cst = new_Instruction(func, OP_CONSTRAINT, i->dType);
+
+   for (d = 0; d < n; ++s, ++d) {
+      cst->setDef(d, new_LValue(func, FILE_GPR));
+      cst->setSrc(d, i->getSrc(s));
+      i->setSrc(s, cst->getDef(d));
+   }
+   i->bb->insertBefore(i, cst);
+
+   constrList.push_back(cst);
+}
+
+// Add a dummy use of the pointer source of >= 8 byte loads after the load
+// to prevent it from being assigned a register which overlapping the load's
+// destination, which would produce random corruptions.
+void
+RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src)
+{
+   // a NOP reading the pointer keeps it live past the load itself
+   Instruction *hzd = new_Instruction(func, OP_NOP, TYPE_NONE);
+   hzd->setSrc(0, src->get());
+   i->bb->insertAfter(i, hzd);
+}
+
+// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q
+// Replace the leading run of GPR defs of 'insn' with a single wide value,
+// followed by an OP_SPLIT that recreates the individual components.
+void
+RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
+{
+   uint8_t size = 0;
+   int n;
+   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n)
+      size += insn->getDef(n)->reg.size;
+   if (n < 2)
+      return; // nothing to condense
+   LValue *lval = new_LValue(func, FILE_GPR);
+   lval->reg.size = size;
+
+   // the split consumes the wide value and defines the original components
+   Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size));
+   split->setSrc(0, lval);
+   for (int d = 0; d < n; ++d) {
+      split->setDef(d, insn->getDef(d));
+      insn->setDef(d, NULL);
+   }
+   insn->setDef(0, lval);
+
+   // shift any remaining (non-GPR) defs down behind the wide def
+   for (int k = 1, d = n; insn->defExists(d); ++d, ++k) {
+      insn->setDef(k, insn->getDef(d));
+      insn->setDef(d, NULL);
+   }
+   // carry over predicate if any (mainly for OP_UNION uses)
+   split->setPredicate(insn->cc, insn->getPredicate());
+
+   insn->bb->insertAfter(insn, split);
+   constrList.push_back(split);
+}
+// Replace sources a..b of 'insn' with a single wide value produced by an
+// OP_MERGE of the original sources (inverse of condenseDefs).
+void
+RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
+                                              const int a, const int b)
+{
+   uint8_t size = 0;
+   if (a >= b)
+      return; // fewer than two sources, nothing to merge
+   for (int s = a; s <= b; ++s)
+      size += insn->getSrc(s)->reg.size;
+   if (!size)
+      return;
+   LValue *lval = new_LValue(func, FILE_GPR);
+   lval->reg.size = size;
+
+   // detach predicate/indirect sources so the index shuffling below
+   // doesn't disturb them
+   Value *save[3];
+   insn->takeExtraSources(0, save);
+
+   Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size));
+   merge->setDef(0, lval);
+   for (int s = a, i = 0; s <= b; ++s, ++i) {
+      merge->setSrc(i, insn->getSrc(s));
+      insn->setSrc(s, NULL);
+   }
+   insn->setSrc(a, lval);
+
+   // close the gap left by the merged sources
+   for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) {
+      insn->setSrc(k, insn->getSrc(s));
+      insn->setSrc(s, NULL);
+   }
+   insn->bb->insertBefore(insn, merge);
+
+   insn->putExtraSources(0, save);
+
+   constrList.push_back(merge);
+}
+
+// NVE0 (Kepler) texture source/def layout: sources are packed into at
+// most two register quads.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
+{
+   if (isTextureOp(tex->op))
+      textureMask(tex);
+   condenseDefs(tex);
+
+   if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
+      // surface stores: data operand starts at source 3
+      condenseSrcs(tex, 3, (3 + typeSizeof(tex->dType) / 4) - 1);
+   } else
+   if (isTextureOp(tex->op)) {
+      int n = tex->srcCount(0xff, true);
+      if (n > 4) {
+         condenseSrcs(tex, 0, 3);
+         if (n > 5) // NOTE: first call modified positions already
+            condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
+      } else
+      if (n > 1) {
+         condenseSrcs(tex, 0, n - 1);
+      }
+   }
+}
+
+// NVC0 (Fermi) texture layout: 's' coordinate-like sources first, then
+// 'n' auxiliary sources, each group condensed into consecutive registers.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
+{
+   int n, s;
+
+   textureMask(tex);
+
+   if (tex->op == OP_TXQ) {
+      s = tex->srcCount(0xff);
+      n = 0;
+   } else {
+      s = tex->tex.target.getArgCount();
+      // layer-style extra argument for indirect handles on non-array targets
+      if (!tex->tex.target.isArray() &&
+          (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
+         ++s;
+      if (tex->op == OP_TXD && tex->tex.useOffsets)
+         ++s;
+      n = tex->srcCount(0xff) - s;
+      assert(n <= 4);
+   }
+
+   if (s > 1)
+      condenseSrcs(tex, 0, s - 1);
+   if (n > 1) // NOTE: first call modified positions already
+      condenseSrcs(tex, 1, n);
+
+   condenseDefs(tex);
+}
+
+// NV50 (Tesla) texture layout: source and def register blocks must have
+// the same size, so pad the shorter side with dummy values before
+// condensing both.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex)
+{
+   // temporarily strip the predicate so it doesn't interfere with the
+   // source shuffling done by condenseSrcs
+   Value *pred = tex->getPredicate();
+   if (pred)
+      tex->setPredicate(tex->cc, NULL);
+
+   textureMask(tex);
+
+   assert(tex->defExists(0) && tex->srcExists(0));
+   // make src and def count match
+   int c;
+   for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
+      if (!tex->srcExists(c))
+         tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
+      if (!tex->defExists(c))
+         tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
+   }
+   if (pred)
+      tex->setPredicate(tex->cc, pred);
+   condenseDefs(tex);
+   condenseSrcs(tex, 0, c - 1);
+}
+
+// Insert constraint markers for instructions whose multiple sources must be
+// located in consecutive registers.
+bool
+RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
+{
+   TexInstruction *tex;
+   Instruction *next;
+   int s, size;
+
+   targ = bb->getProgram()->getTarget();
+
+   // 'next' is cached because the constraint helpers insert instructions
+   // around 'i'
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      if ((tex = i->asTex())) {
+         // texture operand layout rules differ per chip generation
+         switch (targ->getChipset() & ~0xf) {
+         case 0x50:
+         case 0x80:
+         case 0x90:
+         case 0xa0:
+            texConstraintNV50(tex);
+            break;
+         case 0xc0:
+         case 0xd0:
+            texConstraintNVC0(tex);
+            break;
+         case 0xe0:
+         case NVISA_GK110_CHIPSET:
+            texConstraintNVE0(tex);
+            break;
+         default:
+            break;
+         }
+      } else
+      if (i->op == OP_EXPORT || i->op == OP_STORE) {
+         // stored data (sources from 1 on) must be contiguous registers
+         for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) {
+            assert(i->srcExists(s));
+            size -= i->getSrc(s)->reg.size;
+         }
+         condenseSrcs(i, 1, s - 1);
+      } else
+      if (i->op == OP_LOAD || i->op == OP_VFETCH) {
+         condenseDefs(i);
+         // wide indirect loads: keep the pointer live so it doesn't get
+         // a register overlapping the destination (see addHazard)
+         if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
+            addHazard(i, i->src(0).getIndirect(0));
+      } else
+      if (i->op == OP_UNION ||
+          i->op == OP_MERGE ||
+          i->op == OP_SPLIT) {
+         constrList.push_back(i);
+      }
+   }
+   return true;
+}
+
+// Insert extra moves so that, if multiple register constraints on a value are
+// in conflict, these conflicts can be resolved.
+bool
+RegAlloc::InsertConstraintsPass::insertConstraintMoves()
+{
+   for (std::list<Instruction *>::iterator it = constrList.begin();
+        it != constrList.end();
+        ++it) {
+      Instruction *cst = *it;
+      Instruction *mov;
+
+      // NOTE: the "&& 0" intentionally disables this branch; the SPLIT
+      // handling is kept for reference but never executed
+      if (cst->op == OP_SPLIT && 0) {
+         // spilling splits is annoying, just make sure they're separate
+         for (int d = 0; cst->defExists(d); ++d) {
+            if (!cst->getDef(d)->refCount())
+               continue;
+            LValue *lval = new_LValue(func, cst->def(d).getFile());
+            const uint8_t size = cst->def(d).getSize();
+            lval->reg.size = size;
+
+            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+            mov->setSrc(0, lval);
+            mov->setDef(0, cst->getDef(d));
+            cst->setDef(d, mov->getSrc(0));
+            cst->bb->insertAfter(cst, mov);
+
+            cst->getSrc(0)->asLValue()->noSpill = 1;
+            mov->getSrc(0)->asLValue()->noSpill = 1;
+         }
+      } else
+      if (cst->op == OP_MERGE || cst->op == OP_UNION) {
+         for (int s = 0; cst->srcExists(s); ++s) {
+            const uint8_t size = cst->src(s).getSize();
+
+            // undefined source: give it a defining NOP so liveness works
+            if (!cst->getSrc(s)->defs.size()) {
+               mov = new_Instruction(func, OP_NOP, typeOfSize(size));
+               mov->setDef(0, cst->getSrc(s));
+               cst->bb->insertBefore(cst, mov);
+               continue;
+            }
+            assert(cst->getSrc(s)->defs.size() == 1); // still SSA
+
+            Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+            // catch some cases where don't really need MOVs
+            if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs())
+               continue;
+
+            // insert a copy so the constrained position gets its own value
+            LValue *lval = new_LValue(func, cst->src(s).getFile());
+            lval->reg.size = size;
+
+            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+            mov->setDef(0, lval);
+            mov->setSrc(0, cst->getSrc(s));
+            cst->setSrc(s, mov->getDef(0));
+            cst->bb->insertBefore(cst, mov);
+
+            cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
+
+            // UNION sources are conditionally written: the copy must carry
+            // the producer's predicate
+            if (cst->op == OP_UNION)
+               mov->setPredicate(defi->cc, defi->getPredicate());
+         }
+      }
+   }
+
+   return true;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
new file mode 100644
index 00000000000..2e432349f24
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Converts nv50 IR generated from TGSI to SSA form.
+
+// DominatorTree implements an algorithm for finding immediate dominators,
+// as described by T. Lengauer & R. Tarjan.
+class DominatorTree : public Graph
+{
+public:
+   DominatorTree(Graph *cfg);
+   ~DominatorTree() { }
+
+   bool dominates(BasicBlock *, BasicBlock *);
+
+   void findDominanceFrontiers();
+
+private:
+   void build();
+   void buildDFS(Node *);
+
+   // path compression over the ancestor forest (Lengauer-Tarjan EVAL/LINK)
+   void squash(int);
+   inline void link(int, int);
+   inline int eval(int);
+
+   void debugPrint();
+
+   Graph *cfg;
+
+   Node **vert;      // CFG nodes indexed by DFS number
+   int *data;        // packed working arrays, see accessor macros below
+   const int count;  // number of CFG nodes
+
+   // each macro addresses one of 5 arrays of length 'count' inside data[]
+   #define SEMI(i)     (data[(i) + 0 * count])
+   #define ANCESTOR(i) (data[(i) + 1 * count])
+   #define PARENT(i)   (data[(i) + 2 * count])
+   #define LABEL(i)    (data[(i) + 3 * count])
+   #define DOM(i)      (data[(i) + 4 * count])
+};
+
+// Dump the Lengauer-Tarjan working arrays for every vertex.
+void DominatorTree::debugPrint()
+{
+   for (int i = 0; i < count; ++i) {
+      INFO("SEMI(%i) = %i\n", i, SEMI(i));
+      INFO("ANCESTOR(%i) = %i\n", i, ANCESTOR(i));
+      INFO("PARENT(%i) = %i\n", i, PARENT(i));
+      INFO("LABEL(%i) = %i\n", i, LABEL(i));
+      INFO("DOM(%i) = %i\n", i, DOM(i));
+   }
+}
+
+// Number the CFG nodes in DFS order, run the dominator computation, and
+// free the working arrays again: the resulting tree lives in the
+// BasicBlocks' 'dom' graph nodes, not in this scratch data.
+DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph),
+                                               count(cfg->getSize())
+{
+   int i = 0;
+
+   vert = new Node * [count];
+   data = new int[5 * count];
+
+   for (IteratorRef it = cfg->iteratorDFS(true); !it->end(); it->next(), ++i) {
+      vert[i] = reinterpret_cast<Node *>(it->get());
+      vert[i]->tag = i; // store the DFS number on the node itself
+      LABEL(i) = i;
+      SEMI(i) = ANCESTOR(i) = -1;
+   }
+
+   build();
+
+   delete[] vert;
+   delete[] data;
+}
+
+// Record DFS parents; SEMI(v) >= 0 doubles as the "visited" flag since it
+// was initialized to -1 for all vertices.
+void DominatorTree::buildDFS(Graph::Node *node)
+{
+   SEMI(node->tag) = node->tag;
+
+   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) {
+      if (SEMI(ei.getNode()->tag) < 0) {
+         buildDFS(ei.getNode());
+         PARENT(ei.getNode()->tag) = node->tag;
+      }
+   }
+}
+
+// Path compression: make v's ancestor chain point (nearly) at the root
+// while propagating the minimum-semidominator label down the path.
+void DominatorTree::squash(int v)
+{
+   if (ANCESTOR(ANCESTOR(v)) >= 0) {
+      squash(ANCESTOR(v));
+
+      if (SEMI(LABEL(ANCESTOR(v))) < SEMI(LABEL(v)))
+         LABEL(v) = LABEL(ANCESTOR(v));
+      ANCESTOR(v) = ANCESTOR(ANCESTOR(v));
+   }
+}
+
+// EVAL of Lengauer-Tarjan: return the vertex with minimal semidominator
+// on the ancestor path of v (v itself when it is a forest root).
+int DominatorTree::eval(int v)
+{
+   if (ANCESTOR(v) >= 0) {
+      squash(v);
+      return LABEL(v);
+   }
+   return v;
+}
+
+// LINK of Lengauer-Tarjan: make v the forest ancestor of w.
+void DominatorTree::link(int v, int w)
+{
+   ANCESTOR(w) = v;
+}
+
+// Lengauer-Tarjan: compute semidominators in reverse DFS order, derive
+// immediate dominators from them, then materialize the dominator tree by
+// attaching each BasicBlock's 'dom' node under its idom's node.
+void DominatorTree::build()
+{
+   DLList *bucket = new DLList[count];
+   Node *nv, *nw;
+   int p, u, v, w;
+
+   buildDFS(cfg->getRoot());
+
+   // phases 1+2: semidominators and implicit idoms, highest DFS number first
+   for (w = count - 1; w >= 1; --w) {
+      nw = vert[w];
+      assert(nw->tag == w);
+      for (Graph::EdgeIterator ei = nw->incident(); !ei.end(); ei.next()) {
+         nv = ei.getNode();
+         v = nv->tag;
+         u = eval(v);
+         if (SEMI(u) < SEMI(w))
+            SEMI(w) = SEMI(u);
+      }
+      p = PARENT(w);
+      bucket[SEMI(w)].insert(nw);
+      link(p, w);
+
+      for (DLList::Iterator it = bucket[p].iterator(); !it.end(); it.erase()) {
+         v = reinterpret_cast<Node *>(it.get())->tag;
+         u = eval(v);
+         DOM(v) = (SEMI(u) < SEMI(v)) ? u : p;
+      }
+   }
+   // phase 3: resolve deferred idoms in DFS order
+   for (w = 1; w < count; ++w) {
+      if (DOM(w) != SEMI(w))
+         DOM(w) = DOM(DOM(w));
+   }
+   DOM(0) = 0;
+
+   // build the tree: repeatedly attach nodes whose idom is already attached
+   insert(&BasicBlock::get(cfg->getRoot())->dom);
+   do {
+      p = 0;
+      for (v = 1; v < count; ++v) {
+         nw = &BasicBlock::get(vert[DOM(v)])->dom; // FIX: stray ';;' removed
+         nv = &BasicBlock::get(vert[v])->dom;
+         if (nw->getGraph() && !nv->getGraph()) {
+            ++p;
+            nw->attach(nv, Graph::Edge::TREE);
+         }
+      }
+   } while (p);
+
+   delete[] bucket;
+}
+
+#undef SEMI
+#undef ANCESTOR
+#undef PARENT
+#undef LABEL
+#undef DOM
+
+// Compute each block's dominance frontier (Cytron et al.):
+// DF(bb) = { CFG successors not immediately dominated by bb } union
+//          { frontier members of dom-tree children not idom'd by bb }.
+void DominatorTree::findDominanceFrontiers()
+{
+   BasicBlock *bb;
+
+   // DFS over the dominator tree processes children before their parents'
+   // DF-up step needs them
+   for (IteratorRef dtIt = iteratorDFS(false); !dtIt->end(); dtIt->next()) {
+      EdgeIterator succIt, chldIt;
+
+      bb = BasicBlock::get(reinterpret_cast<Node *>(dtIt->get()));
+      bb->getDF().clear();
+
+      // DF-local: direct CFG successors
+      for (succIt = bb->cfg.outgoing(); !succIt.end(); succIt.next()) {
+         BasicBlock *dfLocal = BasicBlock::get(succIt.getNode());
+         if (dfLocal->idom() != bb)
+            bb->getDF().insert(dfLocal);
+      }
+
+      // DF-up: inherited from dominator-tree children
+      for (chldIt = bb->dom.outgoing(); !chldIt.end(); chldIt.next()) {
+         BasicBlock *cb = BasicBlock::get(chldIt.getNode());
+
+         DLList::Iterator dfIt = cb->getDF().iterator();
+         for (; !dfIt.end(); dfIt.next()) {
+            BasicBlock *dfUp = BasicBlock::get(dfIt);
+            if (dfUp->idom() != bb)
+               bb->getDF().insert(dfUp);
+         }
+      }
+   }
+}
+
+// liveIn(bb) = usedBeforeAssigned(bb) U (liveOut(bb) - assigned(bb))
+void
+Function::buildLiveSetsPreSSA(BasicBlock *bb, const int seq)
+{
+   Function *f = bb->getFunction();
+   BitSet usedBeforeAssigned(allLValues.getSize(), true);
+   BitSet assigned(allLValues.getSize(), true);
+
+   bb->liveSet.allocate(allLValues.getSize(), false);
+
+   // liveOut(bb) = union of liveIn over all (recursively processed)
+   // successors; self-loops are skipped
+   int n = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+      if (out == bb)
+         continue;
+      if (out->cfg.visit(seq))
+         buildLiveSetsPreSSA(out, seq);
+      if (!n++)
+         bb->liveSet = out->liveSet;
+      else
+         bb->liveSet |= out->liveSet;
+   }
+   if (!n && !bb->liveSet.marker)
+      bb->liveSet.fill(0);
+   bb->liveSet.marker = true;
+
+   // scan forward to find values read before being (re)assigned in bb
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      for (int s = 0; i->srcExists(s); ++s)
+         if (i->getSrc(s)->asLValue() && !assigned.test(i->getSrc(s)->id))
+            usedBeforeAssigned.set(i->getSrc(s)->id);
+      for (int d = 0; i->defExists(d); ++d)
+         assigned.set(i->getDef(d)->id);
+   }
+
+   // function outputs count as uses at the exit block
+   if (bb == BasicBlock::get(f->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = f->outs.begin();
+           it != f->outs.end(); ++it) {
+         if (!assigned.test(it->get()->id))
+            usedBeforeAssigned.set(it->get()->id);
+      }
+   }
+
+   // liveIn(bb) = usedBeforeAssigned(bb) U (liveOut(bb) - assigned(bb))
+   bb->liveSet.andNot(assigned);
+   bb->liveSet |= usedBeforeAssigned;
+}
+
+// Compute, for each block, the set of values defined on some path from
+// the entry to (and including) that block, by recursing over predecessors.
+void
+Function::buildDefSetsPreSSA(BasicBlock *bb, const int seq)
+{
+   bb->defSet.allocate(allLValues.getSize(), !bb->liveSet.marker);
+   bb->liveSet.marker = true;
+
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+
+      if (in->cfg.visit(seq))
+         buildDefSetsPreSSA(in, seq);
+
+      bb->defSet |= in->defSet;
+   }
+
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      for (int d = 0; i->defExists(d); ++d)
+         bb->defSet.set(i->getDef(d)->id);
+   }
+}
+
+// Renames pre-SSA values to fresh SSA values along the dominator tree,
+// keeping a stack of current definitions per original value.
+class RenamePass
+{
+public:
+   RenamePass(Function *);
+   ~RenamePass();
+
+   bool run();
+   void search(BasicBlock *);
+
+   // current SSA definition of a pre-SSA value, or NULL if none yet
+   inline LValue *getStackTop(Value *);
+
+   LValue *mkUndefined(Value *);
+
+private:
+   Stack *stack;   // one definition stack per pre-SSA value id
+   Function *func;
+   Program *prog;
+};
+
+// Convert every function of the program to SSA form; stop at the first
+// failure.
+bool
+Program::convertToSSA()
+{
+   ArrayList::Iterator fi = allFuncs.iterator();
+   for (; !fi.end(); fi.next()) {
+      Function *fn = reinterpret_cast<Function *>(fi.get());
+      if (!fn->convertToSSA())
+         return false;
+   }
+   return true;
+}
+
+// XXX: add edge from entry to exit ?
+
+// Efficiently Computing Static Single Assignment Form and
+// the Control Dependence Graph,
+// R. Cytron, J. Ferrante, B. K. Rosen, M. N. Wegman, F. K. Zadeck
+bool
+Function::convertToSSA()
+{
+   // 0. calculate live in variables (for pruned SSA)
+   buildLiveSets();
+
+   // 1. create the dominator tree
+   domTree = new DominatorTree(&cfg);
+   reinterpret_cast<DominatorTree *>(domTree)->findDominanceFrontiers();
+
+   // 2. insert PHI functions
+   DLList workList;
+   LValue *lval;
+   BasicBlock *bb;
+   int var;
+   int iterCount = 0;
+   // single allocation holds both per-BB counters; 'work' aliases the
+   // second half, so only hasAlready is deleted below
+   int *hasAlready = new int[allBBlocks.getSize() * 2];
+   int *work = &hasAlready[allBBlocks.getSize()];
+
+   memset(hasAlready, 0, allBBlocks.getSize() * 2 * sizeof(int));
+
+   // for each variable
+   for (var = 0; var < allLValues.getSize(); ++var) {
+      if (!allLValues.get(var))
+         continue;
+      lval = reinterpret_cast<Value *>(allLValues.get(var))->asLValue();
+      if (!lval || lval->defs.empty())
+         continue;
+      // iterCount doubles as the per-variable marker value, so the
+      // hasAlready/work arrays never need to be cleared between variables
+      ++iterCount;
+
+      // TODO: don't add phi functions for values that aren't used outside
+      //  the BB they're defined in
+
+      // gather blocks with assignments to lval in workList
+      for (Value::DefIterator d = lval->defs.begin();
+           d != lval->defs.end(); ++d) {
+         bb = ((*d)->getInsn() ? (*d)->getInsn()->bb : NULL);
+         if (!bb)
+            continue; // instruction likely been removed but not XXX deleted
+
+         if (work[bb->getId()] == iterCount)
+            continue;
+         work[bb->getId()] = iterCount;
+         workList.insert(bb);
+      }
+
+      // for each block in workList, insert a phi for lval in the block's
+      // dominance frontier (if we haven't already done so)
+      for (DLList::Iterator wI = workList.iterator(); !wI.end(); wI.erase()) {
+         bb = BasicBlock::get(wI);
+
+         DLList::Iterator dfIter = bb->getDF().iterator();
+         for (; !dfIter.end(); dfIter.next()) {
+            Instruction *phi;
+            BasicBlock *dfBB = BasicBlock::get(dfIter);
+
+            if (hasAlready[dfBB->getId()] >= iterCount)
+               continue;
+            hasAlready[dfBB->getId()] = iterCount;
+
+            // pruned SSA: don't need a phi if the value is not live-in
+            if (!dfBB->liveSet.test(lval->id))
+               continue;
+
+            phi = new_Instruction(this, OP_PHI, typeOfSize(lval->reg.size));
+            dfBB->insertTail(phi);
+
+            // one source per predecessor; renamed later by RenamePass
+            phi->setDef(0, lval);
+            for (int s = 0; s < dfBB->cfg.incidentCount(); ++s)
+               phi->setSrc(s, lval);
+
+            // a phi is also a def, so its block joins the work list
+            if (work[dfBB->getId()] < iterCount) {
+               work[dfBB->getId()] = iterCount;
+               wI.insert(dfBB);
+            }
+         }
+      }
+   }
+   delete[] hasAlready;
+
+   RenamePass rename(this);
+   return rename.run();
+}
+
+// One definition stack per pre-SSA value id.
+RenamePass::RenamePass(Function *fn) : func(fn), prog(fn->getProgram())
+{
+   stack = new Stack[func->allLValues.getSize()];
+}
+
+RenamePass::~RenamePass()
+{
+   // delete[] on a null pointer is a no-op, no guard needed
+   delete[] stack;
+}
+
+// Return the current SSA definition of 'val', or NULL when none has been
+// pushed yet (value not defined on the path from the entry).
+LValue *
+RenamePass::getStackTop(Value *val)
+{
+   Stack &s = stack[val->id];
+   return s.getSize() ? reinterpret_cast<LValue *>(s.peek().u.p) : 0;
+}
+
+// Create a value for a use without reaching definition: a fresh LValue
+// defined by a NOP inserted at the very top of the entry block.
+LValue *
+RenamePass::mkUndefined(Value *val)
+{
+   LValue *lval = val->asLValue();
+   assert(lval);
+   LValue *ud = new_LValue(func, lval);
+   Instruction *nop = new_Instruction(func, OP_NOP, typeOfSize(lval->reg.size));
+   nop->setDef(0, ud);
+   BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
+   return ud;
+}
+
+bool RenamePass::run()
+{
+   // the constructor's stack allocation might not have happened
+   if (!stack)
+      return false;
+   // rename along the dominator tree starting at the entry block
+   search(BasicBlock::get(func->domTree->getRoot()));
+   return true;
+}
+
+// Go through BBs in dominance order, create new values for each definition,
+// and replace all sources with their current new values.
+//
+// NOTE: The values generated for function inputs/outputs have no connection
+// to their corresponding outputs/inputs in other functions. Only allocation
+// of physical registers will establish this connection.
+//
+void RenamePass::search(BasicBlock *bb)
+{
+   LValue *lval, *ssa;
+   int d, s;
+   const Target *targ = prog->getTarget();
+
+   // Put current definitions for function inputs values on the stack.
+   // They can be used before any redefinitions are pushed.
+   if (bb == BasicBlock::get(func->cfg.getRoot())) {
+      for (std::deque<ValueDef>::iterator it = func->ins.begin();
+           it != func->ins.end(); ++it) {
+         lval = it->get()->asLValue();
+         assert(lval);
+
+         // inputs get a fresh SSA value in the target's native file,
+         // preserving size and fixed register id
+         ssa = new_LValue(func, targ->nativeFile(lval->reg.file));
+         ssa->reg.size = lval->reg.size;
+         ssa->reg.data.id = lval->reg.data.id;
+
+         it->setSSA(ssa);
+         stack[lval->id].push(ssa);
+      }
+   }
+
+   for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+      // PHI sources get definitions from the passes through the incident BBs,
+      // so skip them here.
+      if (stmt->op != OP_PHI) {
+         for (s = 0; stmt->srcExists(s); ++s) {
+            lval = stmt->getSrc(s)->asLValue();
+            if (!lval)
+               continue;
+            // Values on the stack created in previously visited blocks, and
+            // function inputs, will be valid because they dominate this one.
+            lval = getStackTop(lval);
+            if (!lval)
+               lval = mkUndefined(stmt->getSrc(s));
+            stmt->setSrc(s, lval);
+         }
+      }
+      // every def gets a new SSA value, pushed as the current definition
+      for (d = 0; stmt->defExists(d); ++d) {
+         lval = stmt->def(d).get()->asLValue();
+         assert(lval);
+         stmt->def(d).setSSA(
+            new_LValue(func, targ->nativeFile(lval->reg.file)));
+         stmt->def(d).get()->reg.size = lval->reg.size;
+         stmt->def(d).get()->reg.data.id = lval->reg.data.id;
+         stack[lval->id].push(stmt->def(d).get());
+      }
+   }
+
+   // Update sources of PHI ops corresponding to this BB in outgoing BBs.
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      Instruction *phi;
+      int p = 0;
+      BasicBlock *sb = BasicBlock::get(ei.getNode());
+
+      // which predecessor of sb is bb ?
+      for (Graph::EdgeIterator ei = sb->cfg.incident(); !ei.end(); ei.next()) {
+         if (ei.getNode() == &bb->cfg)
+            break;
+         ++p;
+      }
+      assert(p < sb->cfg.incidentCount());
+
+      for (phi = sb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         lval = getStackTop(phi->getSrc(p));
+         if (!lval)
+            lval = mkUndefined(phi->getSrc(p));
+         phi->setSrc(p, lval);
+      }
+   }
+
+   // Visit the BBs we dominate.
+   for (Graph::EdgeIterator ei = bb->dom.outgoing(); !ei.end(); ei.next())
+      search(BasicBlock::get(ei.getNode()));
+
+   // Update function outputs to the last definitions of their pre-SSA values.
+   // I hope they're unique, i.e. that we get PHIs for all of them ...
+   if (bb == BasicBlock::get(func->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = func->outs.begin();
+           it != func->outs.end(); ++it) {
+         lval = it->get()->asLValue();
+         if (!lval)
+            continue;
+         lval = getStackTop(lval);
+         if (!lval)
+            lval = mkUndefined(it->get());
+         it->set(lval);
+      }
+   }
+
+   // Pop the values we created in this block from the stack because we will
+   // return to blocks that we do not dominate.
+   for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+      // NOPs inserted by mkUndefined never pushed anything
+      if (stmt->op == OP_NOP)
+         continue;
+      for (d = 0; stmt->defExists(d); ++d)
+         stack[stmt->def(d).preSSA()->id].pop();
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
new file mode 100644
index 00000000000..443acfcd812
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Number of sources consumed by each operation, indexed by the
+// operation enum (must be kept in sync with nv50_ir.h).
+const uint8_t Target::operationSrcNr[OP_LAST + 1] =
+{
+   0, 0,                   // NOP, PHI
+   0, 0, 0, 0,             // UNION, SPLIT, MERGE, CONSTRAINT
+   1, 1, 2,                // MOV, LOAD, STORE
+   2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD
+   1, 1, 1,                // ABS, NEG, NOT
+   2, 2, 2, 2, 2,          // AND, OR, XOR, SHL, SHR
+   2, 2, 1,                // MAX, MIN, SAT
+   1, 1, 1, 1,             // CEIL, FLOOR, TRUNC, CVT
+   3, 3, 3, 2, 3, 3,       // SET_AND,OR,XOR, SET, SELP, SLCT
+   1, 1, 1, 1, 1, 1,       // RCP, RSQ, LG2, SIN, COS, EX2
+   1, 1, 1, 1, 1, 2,       // EXP, LOG, PRESIN, PREEX2, SQRT, POW
+   0, 0, 0, 0, 0,          // BRA, CALL, RET, CONT, BREAK,
+   0, 0, 0,                // PRERET,CONT,BREAK
+   0, 0, 0, 0, 0, 0,       // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR
+   1, 1, 2, 1, 2,          // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP
+   1, 1,                   // EMIT, RESTART
+   1, 1, 1,                // TEX, TXB, TXL,
+   1, 1, 1, 1, 1, 2,       // TXF, TXQ, TXD, TXG, TEXCSAA, TEXPREP
+   1, 1, 2, 2, 2, 2, 2,    // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA
+   3, 3, 3, 3,             // SUBFM, SUCLAMP, SUEAU, MADSP
+   0,                      // TEXBAR
+   1, 1,                   // DFDX, DFDY
+   1, 2, 2, 0, 0,          // RDSV, WRSV, QUADOP, QUADON, QUADPOP
+   2, 3, 2, 3,             // POPCNT, INSBF, EXTBF, PERMT
+   2, 2,                   // ATOM, BAR
+   2, 2, 2, 2, 3, 2,       // VADD, VAVG, VMIN, VMAX, VSAD, VSET,
+   2, 2, 2, 1,             // VSHR, VSHL, VSEL, CCTL
+   0
+};
+
+// Coarse classification of each operation (used e.g. for latency and
+// legality queries), indexed by the operation enum.
+const OpClass Target::operationClass[OP_LAST + 1] =
+{
+   // NOP; PHI; UNION, SPLIT, MERGE, CONSTRAINT
+   OPCLASS_OTHER,
+   OPCLASS_PSEUDO,
+   OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO,
+   // MOV; LOAD; STORE
+   OPCLASS_MOVE,
+   OPCLASS_LOAD,
+   OPCLASS_STORE,
+   // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD
+   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
+   OPCLASS_ARITH, OPCLASS_ARITH,
+   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
+   // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR
+   OPCLASS_CONVERT, OPCLASS_CONVERT,
+   OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
+   OPCLASS_SHIFT, OPCLASS_SHIFT,
+   // MAX, MIN
+   OPCLASS_COMPARE, OPCLASS_COMPARE,
+   // SAT, CEIL, FLOOR, TRUNC; CVT
+   OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT,
+   OPCLASS_CONVERT,
+   // SET(AND,OR,XOR); SELP, SLCT
+   OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE,
+   OPCLASS_COMPARE, OPCLASS_COMPARE,
+   // RCP, RSQ, LG2, SIN, COS; EX2, EXP, LOG, PRESIN, PREEX2; SQRT, POW
+   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
+   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
+   OPCLASS_SFU, OPCLASS_SFU,
+   // BRA, CALL, RET; CONT, BREAK, PRE(RET,CONT,BREAK); BRKPT, JOINAT, JOIN
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   // DISCARD, EXIT
+   OPCLASS_FLOW, OPCLASS_FLOW,
+   // MEMBAR
+   OPCLASS_CONTROL,
+   // VFETCH, PFETCH, EXPORT
+   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
+   // LINTERP, PINTERP
+   OPCLASS_SFU, OPCLASS_SFU,
+   // EMIT, RESTART
+   OPCLASS_CONTROL, OPCLASS_CONTROL,
+   // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TEXCSAA; TEXPREP
+   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
+   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
+   OPCLASS_TEXTURE,
+   // SULDB, SULDP, SUSTB, SUSTP; SUREDB, SUREDP, SULEA
+   // NOTE(review): the 3rd entry (SUSTB) is OPCLASS_ATOMIC, breaking the
+   // pattern of its neighbours -- confirm this is intentional.
+   OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_ATOMIC, OPCLASS_SURFACE,
+   OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_SURFACE,
+   // SUBFM, SUCLAMP, SUEAU, MADSP
+   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
+   // TEXBAR
+   OPCLASS_OTHER,
+   // DFDX, DFDY, RDSV, WRSV; QUADOP, QUADON, QUADPOP
+   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
+   OPCLASS_OTHER, OPCLASS_CONTROL, OPCLASS_CONTROL,
+   // POPCNT, INSBF, EXTBF, PERMT
+   OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD,
+   // ATOM, BAR
+   OPCLASS_ATOMIC, OPCLASS_CONTROL,
+   // VADD, VAVG, VMIN, VMAX
+   OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR,
+   // VSAD, VSET, VSHR, VSHL
+   OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR,
+   // VSEL, CCTL
+   OPCLASS_VECTOR, OPCLASS_CONTROL,
+   OPCLASS_PSEUDO // LAST
+};
+
+
+extern Target *getTargetNVC0(unsigned int chipset);
+extern Target *getTargetNV50(unsigned int chipset);
+
+// Instantiate the Target subclass matching @chipset (masked down to the
+// architecture generation). Returns 0 (NULL) for unsupported chipsets,
+// so callers must check the result before use.
+Target *Target::create(unsigned int chipset)
+{
+   switch (chipset & 0xf0) {
+   case 0xc0:
+   case 0xd0:
+   case 0xe0:
+   case NVISA_GK110_CHIPSET:
+      return getTargetNVC0(chipset);
+   case 0x50:
+   case 0x80:
+   case 0x90:
+   case 0xa0:
+      return getTargetNV50(chipset);
+   default:
+      ERROR("unsupported target: NV%x\n", chipset);
+      return 0;
+   }
+}
+
+// Counterpart to Target::create().
+void Target::destroy(Target *targ)
+{
+   delete targ;
+}
+
+// Initialize all members: relocInfo in particular is read by addReloc()
+// and returned by getRelocInfo(), so leaving it uninitialized (as the
+// old code did) was undefined behaviour when no relocation was added.
+// code/codeSize are (re)set by setCodeLocation() before emission.
+CodeEmitter::CodeEmitter(const Target *target) : targ(target),
+   code(NULL), codeSize(0), codeSizeLimit(0), relocInfo(NULL)
+{
+}
+
+// Point the emitter at the output buffer @ptr of @size bytes and reset
+// the emitted-byte counter.
+void
+CodeEmitter::setCodeLocation(void *ptr, uint32_t size)
+{
+   code = reinterpret_cast<uint32_t *>(ptr);
+   codeSize = 0;
+   codeSizeLimit = size;
+}
+
+// Dump the emitted machine code as hex words, 8 per line.
+// NOTE(review): assumes 'code' has been advanced past the last emitted
+// word by the emitter, so the buffer start is recovered by stepping
+// back codeSize bytes -- confirm against the emitter implementations.
+void
+CodeEmitter::printBinary() const
+{
+   uint32_t *bin = code - codeSize / 4;
+   INFO("program binary (%u bytes)", codeSize);
+   for (unsigned int pos = 0; pos < codeSize / 4; ++pos) {
+      if ((pos % 8) == 0)
+         INFO("\n");
+      INFO("%08x ", bin[pos]);
+   }
+   INFO("\n");
+}
+
+// Number of NVE4 scheduling bundles covering @size bytes of code; the
+// caller adds 8 bytes of scheduling data per bundle and aligns to 64,
+// i.e. each bundle holds up to 56 bytes of instructions.
+static inline uint32_t sizeToBundlesNVE4(uint32_t size)
+{
+   return (size + 55) / 56;
+}
+
+// Lay out all functions of @prog: assign each function its binary
+// position and accumulate the total binary size, inserting room for
+// software scheduling data where the target requires it.
+void
+CodeEmitter::prepareEmission(Program *prog)
+{
+   for (ArrayList::Iterator fi = prog->allFuncs.iterator();
+        !fi.end(); fi.next()) {
+      Function *func = reinterpret_cast<Function *>(fi.get());
+      func->binPos = prog->binSize;
+      prepareEmission(func);
+
+      // adjust sizes & positions for scheduling info:
+      if (prog->getTarget()->hasSWSched) {
+         uint32_t adjPos = func->binPos;
+         BasicBlock *bb = NULL;
+         for (int i = 0; i < func->bbCount; ++i) {
+            bb = func->bbArray[i];
+            int32_t adjSize = bb->binSize;
+            // don't count the partial bundle shared with the previous BB
+            if (adjPos % 64) {
+               adjSize -= 64 - adjPos % 64;
+               if (adjSize < 0)
+                  adjSize = 0;
+            }
+            // add 8 bytes of scheduling data per bundle (see above)
+            adjSize = bb->binSize + sizeToBundlesNVE4(adjSize) * 8;
+            bb->binPos = adjPos;
+            bb->binSize = adjSize;
+            adjPos += adjSize;
+         }
+         if (bb)
+            func->binSize = adjPos - func->binPos;
+      }
+
+      prog->binSize += func->binSize;
+   }
+}
+
+// Allocate the BB emission array for @func and lay out all of its
+// basic blocks in CFG iteration order.
+void
+CodeEmitter::prepareEmission(Function *func)
+{
+   func->bbCount = 0;
+   func->bbArray = new BasicBlock * [func->cfg.getSize()];
+
+   BasicBlock::get(func->cfg.getRoot())->binPos = func->binPos;
+
+   for (IteratorRef it = func->cfg.iteratorCFG(); !it->end(); it->next())
+      prepareEmission(BasicBlock::get(*it));
+}
+
+// Compute the encoded size of @bb, append it to the function's emission
+// order, and delete branches from already-laid-out BBs that would just
+// fall through to @bb. Instructions may be 4 or 8 bytes; 4-byte ops
+// must come in aligned pairs, which drives the grouping logic below.
+void
+CodeEmitter::prepareEmission(BasicBlock *bb)
+{
+   Instruction *i, *next;
+   Function *func = bb->getFunction();
+   int j;
+   unsigned int nShort;
+
+   // find the last BB already laid out that contains any code
+   for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j);
+
+   for (; j >= 0; --j) {
+      BasicBlock *in = func->bbArray[j];
+      Instruction *exit = in->getExit();
+
+      // a branch to the immediately following BB is a no-op: remove it
+      // and shift all later BBs back by its 8 bytes
+      if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) {
+         in->binSize -= 8;
+         func->binSize -= 8;
+
+         for (++j; j < func->bbCount; ++j)
+            func->bbArray[j]->binPos -= 8;
+
+         in->remove(exit);
+      }
+      bb->binPos = in->binPos + in->binSize;
+      if (in->binSize) // no more no-op branches to bb
+         break;
+   }
+   func->bbArray[func->bbCount++] = bb;
+
+   if (!bb->getExit())
+      return;
+
+   // determine encoding size, try to group short instructions
+   nShort = 0;
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      // drop MEMBAR if the target has no encoding for it
+      if (i->op == OP_MEMBAR && !targ->isOpSupported(OP_MEMBAR, TYPE_NONE)) {
+         bb->remove(i);
+         continue;
+      }
+
+      i->encSize = getMinEncodingSize(i);
+      if (next && i->encSize < 8)
+         ++nShort;
+      else
+      if ((nShort & 1) && next && getMinEncodingSize(next) == 4) {
+         // unpaired short op: try to move a pairable short op next to it
+         if (i->isCommutationLegal(i->next)) {
+            bb->permuteAdjacent(i, next);
+            next->encSize = 4;
+            next = i;
+            i = i->prev;
+            ++nShort;
+         } else
+         if (i->isCommutationLegal(i->prev) && next->next) {
+            bb->permuteAdjacent(i->prev, i);
+            next->encSize = 4;
+            next = next->next;
+            bb->binSize += 4;
+            ++nShort;
+         } else {
+            // cannot pair: widen both to full 8-byte encodings
+            i->encSize = 8;
+            i->prev->encSize = 8;
+            bb->binSize += 4;
+            nShort = 0;
+         }
+      } else {
+         i->encSize = 8;
+         if (nShort & 1) {
+            // widen the dangling short op to keep 8-byte alignment
+            i->prev->encSize = 8;
+            bb->binSize += 4;
+         }
+         nShort = 0;
+      }
+      bb->binSize += i->encSize;
+   }
+
+   // the BB exit instruction must use the full 8-byte encoding
+   if (bb->getExit()->encSize == 4) {
+      assert(nShort);
+      bb->getExit()->encSize = 8;
+      bb->binSize += 4;
+
+      if ((bb->getExit()->prev->encSize == 4) && !(nShort & 1)) {
+         bb->binSize += 8;
+         bb->getExit()->prev->encSize = 8;
+      }
+   }
+   assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 8));
+
+   func->binSize += bb->binSize;
+}
+
+// Export one (label, code offset) symbol per function into
+// info->bin.syms so the driver can resolve inter-function calls.
+void
+Program::emitSymbolTable(struct nv50_ir_prog_info *info)
+{
+   unsigned int n = 0, nMax = allFuncs.getSize();
+
+   // NOTE(review): the MALLOC result is used unchecked -- confirm the
+   // surrounding code tolerates OOM here.
+   info->bin.syms =
+      (struct nv50_ir_prog_symbol *)MALLOC(nMax * sizeof(*info->bin.syms));
+
+   for (ArrayList::Iterator fi = allFuncs.iterator();
+        !fi.end();
+        fi.next(), ++n) {
+      Function *f = (Function *)fi.get();
+      assert(n < nMax);
+
+      info->bin.syms[n].label = f->getLabel();
+      info->bin.syms[n].offset = f->binPos;
+   }
+
+   info->bin.numSyms = n;
+}
+
+// Encode the program into a freshly allocated buffer (this->code) and
+// fill in relocation and symbol information in @info.
+// Returns false if there is nothing to emit or allocation fails.
+// FIX: the CodeEmitter was leaked on both early-return paths.
+bool
+Program::emitBinary(struct nv50_ir_prog_info *info)
+{
+   CodeEmitter *emit = target->getCodeEmitter(progType);
+
+   emit->prepareEmission(this);
+
+   if (dbgFlags & NV50_IR_DEBUG_BASIC)
+      this->print();
+
+   if (!binSize) {
+      code = NULL;
+      delete emit;
+      return false;
+   }
+   code = reinterpret_cast<uint32_t *>(MALLOC(binSize));
+   if (!code) {
+      delete emit;
+      return false;
+   }
+   emit->setCodeLocation(code, binSize);
+
+   for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
+      Function *fn = reinterpret_cast<Function *>(fi.get());
+
+      // prepareEmission laid the functions out back to back
+      assert(emit->getCodeSize() == fn->binPos);
+
+      for (int b = 0; b < fn->bbCount; ++b)
+         for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next)
+            emit->emitInstruction(i);
+   }
+   info->bin.relocData = emit->getRelocInfo();
+
+   emitSymbolTable(info);
+
+   // the nvc0 driver will print the binary itself together with the header
+   if ((dbgFlags & NV50_IR_DEBUG_BASIC) && getTarget()->getChipset() < 0xc0)
+      emit->printBinary();
+
+   delete emit;
+   return true;
+}
+
+#define RELOC_ALLOC_INCREMENT 8
+
+// Record a relocation of type @ty against word @w of the instruction
+// currently being emitted: @data is the addend, @m the bit mask within
+// the word, @s the (signed) bit shift. The entry array grows in chunks
+// of RELOC_ALLOC_INCREMENT. Returns false on allocation failure.
+bool
+CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m,
+                      int s)
+{
+   unsigned int n = relocInfo ? relocInfo->count : 0;
+
+   // grow (or create) the flexible entry array when the current chunk
+   // is exhausted
+   if (!(n % RELOC_ALLOC_INCREMENT)) {
+      size_t size = sizeof(RelocInfo) + n * sizeof(RelocEntry);
+      relocInfo = reinterpret_cast<RelocInfo *>(
+         REALLOC(relocInfo, n ? size : 0,
+                 size + RELOC_ALLOC_INCREMENT * sizeof(RelocEntry)));
+      if (!relocInfo)
+         return false;
+      if (n == 0)
+         memset(relocInfo, 0, sizeof(RelocInfo));
+   }
+   ++relocInfo->count;
+
+   relocInfo->entry[n].data = data;
+   relocInfo->entry[n].mask = m;
+   relocInfo->entry[n].offset = codeSize + w * 4; // byte offset into code
+   relocInfo->entry[n].bitPos = s;
+   relocInfo->entry[n].type = ty;
+
+   return true;
+}
+
+// Patch one relocation into @binary: add the base address selected by
+// this entry's type to the stored addend, shift the result into place
+// and merge it under the mask.
+void
+RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const
+{
+   uint32_t value = 0;
+
+   switch (type) {
+   case TYPE_CODE: value = info->codePos; break;
+   case TYPE_BUILTIN: value = info->libPos; break;
+   case TYPE_DATA: value = info->dataPos; break;
+   default:
+      assert(0);
+      break;
+   }
+   value += data;
+   // a negative bitPos means a right shift
+   value = (bitPos < 0) ? (value >> -bitPos) : (value << bitPos);
+
+   binary[offset / 4] &= ~mask;
+   binary[offset / 4] |= value & mask;
+}
+
+} // namespace nv50_ir
+
+
+#include "codegen/nv50_ir_driver.h"
+
+extern "C" {
+
+// C entry point used by the driver after upload: apply all recorded
+// relocations to @code given the final code / builtin-library / data
+// base addresses.
+void
+nv50_ir_relocate_code(void *relocData, uint32_t *code,
+                      uint32_t codePos,
+                      uint32_t libPos,
+                      uint32_t dataPos)
+{
+   nv50_ir::RelocInfo *info = reinterpret_cast<nv50_ir::RelocInfo *>(relocData);
+
+   info->codePos = codePos;
+   info->libPos = libPos;
+   info->dataPos = dataPos;
+
+   for (unsigned int i = 0; i < info->count; ++i)
+      info->entry[i].apply(code, info);
+}
+
+// C entry point: fetch the built-in code library for @chipset so the
+// driver can upload it once and share it between programs.
+// NOTE(review): Target::create() can return NULL for unsupported
+// chipsets and targ is dereferenced unchecked -- confirm callers only
+// pass supported chipsets.
+void
+nv50_ir_get_target_library(uint32_t chipset,
+                           const uint32_t **code, uint32_t *size)
+{
+   nv50_ir::Target *targ = nv50_ir::Target::create(chipset);
+   targ->getBuiltinCode(code, size);
+   nv50_ir::Target::destroy(targ);
+}
+
+}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
new file mode 100644
index 00000000000..9913ca13ddd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_TARGET_H__
+#define __NV50_IR_TARGET_H__
+
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+struct RelocInfo;
+
+struct RelocEntry
+{
+ enum Type
+ {
+ TYPE_CODE,
+ TYPE_BUILTIN,
+ TYPE_DATA
+ };
+
+ uint32_t data;
+ uint32_t mask;
+ uint32_t offset;
+ int8_t bitPos;
+ Type type;
+
+ inline void apply(uint32_t *binary, const RelocInfo *info) const;
+};
+
+struct RelocInfo
+{
+ uint32_t codePos;
+ uint32_t libPos;
+ uint32_t dataPos;
+
+ uint32_t count;
+
+ RelocEntry entry[0];
+};
+
+class CodeEmitter
+{
+public:
+ CodeEmitter(const Target *);
+ virtual ~CodeEmitter() { }
+
+ // returns whether the instruction was encodable and written
+ virtual bool emitInstruction(Instruction *) = 0;
+
+ virtual uint32_t getMinEncodingSize(const Instruction *) const = 0;
+
+ void setCodeLocation(void *, uint32_t size);
+ inline void *getCodeLocation() const { return code; }
+ inline uint32_t getCodeSize() const { return codeSize; }
+
+ bool addReloc(RelocEntry::Type, int w, uint32_t data, uint32_t m,
+ int s);
+
+ inline void *getRelocInfo() const { return relocInfo; }
+
+ void prepareEmission(Program *);
+ virtual void prepareEmission(Function *);
+ virtual void prepareEmission(BasicBlock *);
+
+ void printBinary() const;
+
+protected:
+ const Target *targ;
+
+ uint32_t *code;
+ uint32_t codeSize;
+ uint32_t codeSizeLimit;
+
+ RelocInfo *relocInfo;
+};
+
+
+enum OpClass
+{
+ OPCLASS_MOVE = 0,
+ OPCLASS_LOAD = 1,
+ OPCLASS_STORE = 2,
+ OPCLASS_ARITH = 3,
+ OPCLASS_SHIFT = 4,
+ OPCLASS_SFU = 5,
+ OPCLASS_LOGIC = 6,
+ OPCLASS_COMPARE = 7,
+ OPCLASS_CONVERT = 8,
+ OPCLASS_ATOMIC = 9,
+ OPCLASS_TEXTURE = 10,
+ OPCLASS_SURFACE = 11,
+ OPCLASS_FLOW = 12,
+ OPCLASS_PSEUDO = 14,
+ OPCLASS_VECTOR = 15,
+ OPCLASS_BITFIELD = 16,
+ OPCLASS_CONTROL = 17,
+ OPCLASS_OTHER = 18
+};
+
+class Target
+{
+public:
+ Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { }
+ virtual ~Target() { }
+
+ static Target *create(uint32_t chipset);
+ static void destroy(Target *);
+
+ // 0x50 and 0x84 to 0xaf for nv50
+ // 0xc0 to 0xdf for nvc0
+ inline uint32_t getChipset() const { return chipset; }
+
+ virtual CodeEmitter *getCodeEmitter(Program::Type) = 0;
+
+ // Drivers should upload this so we can use it from all programs.
+ // The address chosen is supplied to the relocation routine.
+ virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0;
+
+ virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) { }
+
+ virtual bool runLegalizePass(Program *, CGStage stage) const = 0;
+
+public:
+ struct OpInfo
+ {
+ OpInfo *variants;
+ operation op;
+ uint16_t srcTypes;
+ uint16_t dstTypes;
+ uint32_t immdBits;
+ uint8_t srcNr;
+ uint8_t srcMods[3];
+ uint8_t dstMods;
+ uint8_t srcFiles[3];
+ uint8_t dstFiles;
+ unsigned int minEncSize : 4;
+ unsigned int vector : 1;
+ unsigned int predicate : 1;
+ unsigned int commutative : 1;
+ unsigned int pseudo : 1;
+ unsigned int flow : 1;
+ unsigned int hasDest : 1;
+ unsigned int terminator : 1;
+ };
+
+ inline const OpInfo& getOpInfo(const Instruction *) const;
+ inline const OpInfo& getOpInfo(const operation) const;
+
+ inline DataFile nativeFile(DataFile f) const;
+
+ virtual bool insnCanLoad(const Instruction *insn, int s,
+ const Instruction *ld) const = 0;
+ virtual bool isOpSupported(operation, DataType) const = 0;
+ virtual bool isAccessSupported(DataFile, DataType) const = 0;
+ virtual bool isModSupported(const Instruction *,
+ int s, Modifier) const = 0;
+ virtual bool isSatSupported(const Instruction *) const = 0;
+ virtual bool isPostMultiplySupported(operation op, float f,
+ int& e) const { return false; }
+ virtual bool mayPredicate(const Instruction *,
+ const Value *) const = 0;
+
+ // whether @insn can be issued together with @next (order matters)
+ virtual bool canDualIssue(const Instruction *insn,
+ const Instruction *next) const { return false; }
+ virtual int getLatency(const Instruction *) const { return 1; }
+ virtual int getThroughput(const Instruction *) const { return 1; }
+
+ virtual unsigned int getFileSize(DataFile) const = 0;
+ virtual unsigned int getFileUnit(DataFile) const = 0;
+
+ virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0;
+
+public:
+ const bool joinAnterior; // true if join is executed before the op
+ const bool hasSWSched; // true if code should provide scheduling data
+
+ static const uint8_t operationSrcNr[OP_LAST + 1];
+ static const OpClass operationClass[OP_LAST + 1];
+
+ static inline uint8_t getOpSrcNr(operation op)
+ {
+ return operationSrcNr[op];
+ }
+ static inline OpClass getOpClass(operation op)
+ {
+ return operationClass[op];
+ }
+
+protected:
+ uint32_t chipset;
+
+ DataFile nativeFileMap[DATA_FILE_COUNT];
+
+ OpInfo opInfo[OP_LAST + 1];
+};
+
+// Look up the op info for an instruction; out-of-range ops clamp to the
+// OP_LAST (pseudo) entry.
+const Target::OpInfo& Target::getOpInfo(const Instruction *insn) const
+{
+   return opInfo[MIN2(insn->op, OP_LAST)];
+}
+
+// Unchecked variant: @op must be a valid operation.
+const Target::OpInfo& Target::getOpInfo(const operation op) const
+{
+   return opInfo[op];
+}
+
+// Map an abstract register file to the file the hardware actually uses
+// (e.g. FILE_PREDICATE -> FILE_FLAGS on nv50).
+inline DataFile Target::nativeFile(DataFile f) const
+{
+   return nativeFileMap[f];
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_TARGET_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
new file mode 100644
index 00000000000..ade9be0300c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// Factory used by Target::create() for NV50-family chipsets.
+Target *getTargetNV50(unsigned int chipset)
+{
+   return new TargetNV50(chipset);
+}
+
+// Target(true, false): joins execute before the instruction on nv50,
+// and no software scheduling data is required.
+TargetNV50::TargetNV50(unsigned int card) : Target(true, false)
+{
+   chipset = card;
+
+   // sysval locations are unknown until parseDriverInfo() runs
+   wposMask = 0;
+   for (unsigned int i = 0; i <= SV_LAST; ++i)
+      sysvalLocation[i] = ~0;
+
+   initOpInfo();
+}
+
+#if 0
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// TODO
+static const uint32_t nvc0_builtin_code[] =
+{
+};
+
+static const uint16_t nvc0_builtin_offsets[NV50_BUILTIN_COUNT] =
+{
+};
+#endif
+
+// No built-in code library exists for nv50 yet (see the disabled stub
+// above), so report an empty one.
+void
+TargetNV50::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *code = NULL;
+   *size = 0;
+}
+
+// Offset of @builtin within the built-in library; trivially 0 while the
+// library is empty.
+uint32_t
+TargetNV50::getBuiltinOffset(int builtin) const
+{
+   return 0;
+}
+
+struct opProperties
+{
+ operation op;
+ unsigned int mNeg : 4;
+ unsigned int mAbs : 4;
+ unsigned int mNot : 4;
+ unsigned int mSat : 4;
+ unsigned int fConst : 3;
+ unsigned int fShared : 3;
+ unsigned int fAttrib : 3;
+ unsigned int fImm : 3;
+};
+
+static const struct opProperties _initProps[] =
+{
+ // neg abs not sat c[] s[], a[], imm
+ { OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
+ { OP_SUB, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+ { OP_MUL, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+ { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+ { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+ { OP_MAD, 0x7, 0x0, 0x0, 0x0, 0x6, 0x1, 0x1, 0x0 }, // special constraint
+ { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+ { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+ { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x0, 0x1, 0x1, 0x0 },
+ { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+ { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+ { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+ { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+ { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+ { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+ { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+ { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+};
+
+// Build the per-operation info table (opInfo[]): source counts, allowed
+// source/destination files and modifiers, commutativity, minimal
+// encoding size, etc. Every op defaults to a predicable GPR-only F32
+// op; the bitmask tables and _initProps[] then refine per-op properties.
+void TargetNV50::initOpInfo()
+{
+   unsigned int i, j;
+
+   // bitmasks indexed by operation number
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN
+      0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
+   };
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF
+      0x00010e40, 0x00000040, 0x00000498, 0x00000000
+   };
+   static const operation noDestList[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP,
+      OP_SUREDB, OP_BAR
+   };
+   static const operation noPredList[] =
+   {
+      OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT
+   };
+
+   // predicates live in the condition-code flags file on nv50
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_PREDICATE] = FILE_FLAGS;
+
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0xffffffff;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
+      opInfo[noDestList[i]].hasDest = 0;
+   for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
+      opInfo[noPredList[i]].predicate = 0;
+
+   // apply the per-op modifier / source-file properties
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fShared & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_SHARED;
+         if (prop->fAttrib & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_SHADER_INPUT;
+         if (prop->fImm & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+      }
+      // bit 3 of mSat marks a saturate modifier on the destination
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Size of each register / memory file, expressed in that file's
+// addressing units (see getFileUnit).
+unsigned int
+TargetNV50::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:          return 0;
+   case FILE_GPR:           return 256; // in 16-bit units **
+   case FILE_PREDICATE:     return 0;
+   case FILE_FLAGS:         return 4;
+   case FILE_ADDRESS:       return 4;
+   case FILE_IMMEDIATE:     return 0;
+   case FILE_MEMORY_CONST:  return 65536;
+   case FILE_SHADER_INPUT:  return 0x200;
+   case FILE_SHADER_OUTPUT: return 0x200;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL:  return 48 << 10;
+   case FILE_SYSTEM_VALUE:  return 16;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+   // ** only first 128 units encodable for 16-bit regs
+}
+
+// log2 of the addressing unit of @file, presumably: GPRs/address regs
+// are counted in 2-byte units (matches getFileSize's "16-bit units"),
+// system values in 4-byte units -- TODO confirm against callers.
+unsigned int
+TargetNV50::getFileUnit(DataFile file) const
+{
+   if (file == FILE_GPR || file == FILE_ADDRESS)
+      return 1;
+   if (file == FILE_SYSTEM_VALUE)
+      return 2;
+   return 0;
+}
+
+// Address at which system value @sym can be read. Most locations come
+// from sysvalLocation[], filled in by parseDriverInfo(); the compute
+// block/thread IDs use fixed offsets.
+uint32_t
+TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   switch (sym->reg.data.sv.sv) {
+   case SV_FACE:
+      return 0x3fc;
+   case SV_POSITION:
+   {
+      // WPOS components may be sparsely assigned; skip only the
+      // components before this one that are present in wposMask
+      uint32_t addr = sysvalLocation[sym->reg.data.sv.sv];
+      for (int c = 0; c < sym->reg.data.sv.index; ++c)
+         if (wposMask & (1 << c))
+            addr += 4;
+      return addr;
+   }
+   case SV_NCTAID:
+      return 0x8 + 2 * sym->reg.data.sv.index;
+   case SV_CTAID:
+      return 0xc + 2 * sym->reg.data.sv.index;
+   case SV_NTID:
+      return 0x2 + 2 * sym->reg.data.sv.index;
+   case SV_TID:
+      return 0;
+   default:
+      return sysvalLocation[sym->reg.data.sv.sv];
+   }
+}
+
+// long: rrr, arr, rcr, acr, rrc, arc, gcr, grr
+// short: rr, ar, rc, gr
+// immd: ri, gi
+// Whether instruction @i can read load @ld's source directly as its
+// source @s, i.e. whether the resulting operand combination is
+// encodable on nv50 (see the long/short/immd format comment above).
+// FIX: 'case 0x0d' fell through into 'default: return false;', so the
+// geometry-program exception could never take effect; add the break.
+bool
+TargetNV50::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src(0).getFile();
+
+   // immediate sources cannot be combined with predication / flag defs
+   if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0))
+      return false;
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+   if (s == 2 && i->src(1).getFile() != FILE_GPR)
+      return false;
+
+   // NOTE: don't rely on flagsDef
+   for (int d = 0; i->defExists(d); ++d)
+      if (i->def(d).getFile() == FILE_FLAGS)
+         return false;
+
+   // encode each source's file in 2 bits and check the combination
+   // against those the hardware can encode
+   unsigned mode = 0;
+
+   for (int z = 0; z < Target::operationSrcNr[i->op]; ++z) {
+      DataFile zf = (z == s) ? sf : i->src(z).getFile();
+      switch (zf) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (z * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (z * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (z * 2);
+         break; // previously fell through to default (a harmless no-op)
+      default:
+         break;
+      }
+   }
+
+   switch (mode) {
+   case 0x00:
+   case 0x01:
+   case 0x03:
+   case 0x08:
+   case 0x09:
+   case 0x0c:
+   case 0x20:
+   case 0x21:
+      break;
+   case 0x0d:
+      if (ld->bb->getProgram()->getType() != Program::TYPE_GEOMETRY)
+         return false;
+      break; // BUGFIX: missing break made this case always return false
+   default:
+      return false;
+   }
+
+   uint8_t ldSize;
+
+   if ((i->op == OP_MUL || i->op == OP_MAD) && !isFloatType(i->dType)) {
+      // 32-bit MUL will be split into 16-bit MULs
+      if (ld->src(0).isIndirect(0))
+         return false;
+      if (sf == FILE_IMMEDIATE)
+         return false;
+      ldSize = 2;
+   } else {
+      ldSize = typeSizeof(ld->dType);
+   }
+
+   if (sf == FILE_IMMEDIATE)
+      return true;
+
+
+   // Check if memory access is encodable:
+
+   if (ldSize < 4 && sf == FILE_SHADER_INPUT) // no < 4-byte aligned a[] access
+      return false;
+   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * ldSize))
+      return false;
+
+   if (ld->src(0).isIndirect(0)) {
+      // at most one source of @i may be indirect
+      for (int z = 0; i->srcExists(z); ++z)
+         if (i->src(z).isIndirect(0))
+            return false;
+
+      // s[] access only possible in CP, $aX always applies
+      if (sf == FILE_MEMORY_SHARED)
+         return true;
+      if (!ld->bb) // can't check type ...
+         return false;
+      Program::Type pt = ld->bb->getProgram()->getType();
+
+      // $aX applies to c[] only in VP, FP, GP if p[] is not accessed
+      if (pt == Program::TYPE_COMPUTE)
+         return false;
+      if (pt == Program::TYPE_GEOMETRY) {
+         if (sf == FILE_MEMORY_CONST)
+            return i->src(s).getFile() != FILE_SHADER_INPUT;
+         return sf == FILE_SHADER_INPUT;
+      }
+      return sf == FILE_MEMORY_CONST;
+   }
+   return true;
+}
+
+// Whether a single load/store of type @ty from/to @file is possible
+// (accesses wider than 32 bit only work on local/global memory).
+bool
+TargetNV50::isAccessSupported(DataFile file, DataType ty) const
+{
+   if (ty == TYPE_B96 || ty == TYPE_NONE)
+      return false;
+   if (typeSizeof(ty) > 4)
+      return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL);
+   return true;
+}
+
+// Whether @op with data type @ty has a native encoding on this chipset;
+// unsupported operations must be lowered to other ops.
+bool
+TargetNV50::isOpSupported(operation op, DataType ty) const
+{
+   // fp64 only from NVA0 on
+   if (ty == TYPE_F64 && chipset < 0xa0)
+      return false;
+
+   switch (op) {
+   case OP_PRERET:
+      return chipset >= 0xa0;
+   case OP_TXG:
+      return chipset >= 0xa3;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+   case OP_SLCT:
+   case OP_SELP:
+   case OP_POPCNT:
+   case OP_INSBF:
+   case OP_EXTBF:
+   case OP_EXIT: // want exit modifier instead (on NOP if required)
+   case OP_MEMBAR:
+      return false;
+   case OP_SAD:
+      return ty == TYPE_S32;
+   default:
+      return true;
+   }
+}
+
+// Whether source modifier @mod is encodable on source @s of @insn.
+// Integer-typed instructions accept modifiers only on a restricted set
+// of ops; the final check consults the per-op srcMods table.
+bool
+TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_ADD:
+         // only one of the two sources may carry a negate
+         if (insn->src(s ? 0 : 1).mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src(1).mod.neg() ? false : true;
+         break;
+      case OP_SET:
+         if (insn->sType != TYPE_F32)
+            return false;
+         break;
+      default:
+         return false;
+      }
+   }
+   if (s > 3)
+      return false;
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// Whether @insn may be given predicate @pred: not if it is already
+// predicated, reads the flags, or has an immediate source.
+bool
+TargetNV50::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (insn->getPredicate() || insn->flagsSrc >= 0)
+      return false;
+   for (int s = 0; insn->srcExists(s); ++s)
+      if (insn->src(s).getFile() == FILE_IMMEDIATE)
+         return false;
+   return opInfo[insn->op].predicate;
+}
+
+bool
+TargetNV50::isSatSupported(const Instruction *insn) const
+{
+ if (insn->op == OP_CVT)
+ return true;
+ if (insn->dType != TYPE_F32)
+ return false;
+ return opInfo[insn->op].dstMods & NV50_IR_MOD_SAT;
+}
+
+// Estimated result latency of @i in cycles; memory loads are far slower
+// than everything else.
+int TargetNV50::getLatency(const Instruction *i) const
+{
+   // TODO: tune these values
+   if (i->op == OP_LOAD) {
+      switch (i->src(0).getFile()) {
+      case FILE_MEMORY_LOCAL:
+      case FILE_MEMORY_GLOBAL:
+         return 100; // really 400 to 800
+      default:
+         return 22;
+      }
+   }
+   return 22;
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+// Inverse throughput of @i: cycles to issue it for a full warp (see the
+// comment above). SFU ops are 4x slower than plain F32/int ALU ops,
+// fp64 8x slower.
+int TargetNV50::getThroughput(const Instruction *i) const
+{
+   // TODO: tune these values
+   if (i->dType == TYPE_F32) {
+      switch (i->op) {
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+         return 16;
+      default:
+         return 4;
+      }
+   } else
+   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
+      return 4;
+   } else
+   if (i->dType == TYPE_F64) {
+      return 32;
+   } else {
+      return 1;
+   }
+}
+
+// Map a varying's TGSI semantic to the corresponding SV_* location
+// (slot index * 4 bytes) in @locs; for WPOS also record the component
+// mask in @masks if given.
+static void
+recordLocation(uint16_t *locs, uint8_t *masks,
+               const struct nv50_ir_varying *var)
+{
+   uint16_t addr = var->slot[0] * 4;
+
+   switch (var->sn) {
+   case TGSI_SEMANTIC_POSITION: locs[SV_POSITION] = addr; break;
+   case TGSI_SEMANTIC_INSTANCEID: locs[SV_INSTANCE_ID] = addr; break;
+   case TGSI_SEMANTIC_VERTEXID: locs[SV_VERTEX_ID] = addr; break;
+   case TGSI_SEMANTIC_PRIMID: locs[SV_PRIMITIVE_ID] = addr; break;
+   case NV50_SEMANTIC_LAYER: locs[SV_LAYER] = addr; break;
+   case NV50_SEMANTIC_VIEWPORTINDEX: locs[SV_VIEWPORT_INDEX] = addr; break;
+   default:
+      break;
+   }
+   if (var->sn == TGSI_SEMANTIC_POSITION && masks)
+      masks[0] = var->mask;
+}
+
+// Collect the locations the driver assigned to inputs/outputs/sysvals
+// into sysvalLocation[]; make sure WPOS has a valid location since
+// codegen may need it even if the driver did not assign one.
+void
+TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info)
+{
+   unsigned int i;
+   for (i = 0; i < info->numOutputs; ++i)
+      recordLocation(sysvalLocation, NULL, &info->out[i]);
+   for (i = 0; i < info->numInputs; ++i)
+      recordLocation(sysvalLocation, &wposMask, &info->in[i]);
+   for (i = 0; i < info->numSysVals; ++i)
+      recordLocation(sysvalLocation, NULL, &info->sv[i]);
+
+   if (sysvalLocation[SV_POSITION] >= 0x200) {
+      // not assigned by driver, but we need it internally
+      wposMask = 0x8;
+      sysvalLocation[SV_POSITION] = 0;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
new file mode 100644
index 00000000000..0cbf180d048
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+#define NVC0_BUILTIN_COUNT 4
+
+// Target implementation for the Tesla (NV50/G8x..GT2xx) ISA.
+// NOTE(review): this header defines NVC0_BUILTIN_* constants with the same
+// names/values as nv50_ir_target_nvc0.h — confirm the duplication stays in
+// sync, and that the header is never double-included (it has no guard).
+class TargetNV50 : public Target
+{
+public:
+   TargetNV50(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   // returns pointer/size of the prebuilt builtin-function code blob
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   // records sysval locations assigned by the driver (see recordLocation)
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *);
+
+   // capability queries used by lowering and legalization passes
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isAccessSupported(DataFile, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   // scheduling estimates (cycles)
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   // byte offset of a NVC0_BUILTIN_* function inside the builtin code blob
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+
+   // byte addresses of system values, filled in by parseDriverInfo()
+   uint16_t sysvalLocation[SV_LAST + 1];
+   // component mask of the position input (fragment wpos)
+   uint8_t wposMask;
+};
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
new file mode 100644
index 00000000000..47e9c558d35
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Factory entry point: create the Fermi/Kepler target for the given chipset.
+Target *getTargetNVC0(unsigned int chipset)
+{
+   return new TargetNVC0(chipset);
+}
+
+// card >= 0xe4 toggles the second Target constructor argument — presumably
+// a Kepler-scheduling/ISA-variant flag; confirm against Target's ctor.
+TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
+{
+   chipset = card;
+   initOpInfo();
+}
+
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// laziness -> will just hardcode everything for the time being
+
+#include "target_lib_nvc0.asm.h"
+#include "target_lib_nve4.asm.h"
+#include "target_lib_nvf0.asm.h"
+
+// Hand out the prebuilt builtin-function blob matching this chipset's
+// generation (nvc0 / nve4 / nvf0).
+void
+TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   const unsigned int gen = chipset & 0xf0;
+
+   if (gen == 0xe0) {
+      *code = (const uint32_t *)&nve4_builtin_code[0];
+      *size = sizeof(nve4_builtin_code);
+   } else
+   if (gen == 0xf0) {
+      *code = (const uint32_t *)&nvf0_builtin_code[0];
+      *size = sizeof(nvf0_builtin_code);
+   } else {
+      *code = (const uint32_t *)&nvc0_builtin_code[0];
+      *size = sizeof(nvc0_builtin_code);
+   }
+}
+
+// Byte offset of the requested NVC0_BUILTIN_* function inside the blob
+// returned by getBuiltinCode() for this chipset generation.
+uint32_t
+TargetNVC0::getBuiltinOffset(int builtin) const
+{
+   assert(builtin < NVC0_BUILTIN_COUNT);
+
+   const unsigned int gen = chipset & 0xf0;
+   if (gen == 0xe0)
+      return nve4_builtin_offsets[builtin];
+   if (gen == 0xf0)
+      return nvf0_builtin_offsets[builtin];
+   return nvc0_builtin_offsets[builtin];
+}
+
+// Per-opcode capability bits. In mNeg/mAbs/mNot/fConst/fImmd, bit s refers
+// to source operand s (so 0x3 = sources 0 and 1, 0x7 = sources 0..2).
+// In mSat and fImmd, bit 3 (0x8) has a special meaning: destination
+// saturation support, resp. support for a full 32-bit (long) immediate.
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg : 4;
+   unsigned int mAbs : 4;
+   unsigned int mNot : 4;
+   unsigned int mSat : 4;
+   unsigned int fConst : 3;
+   unsigned int fImmd : 4; // last bit indicates if full immediate is supported
+};
+
+static const struct opProperties _initProps[] =
+{
+   //           neg  abs  not  sat  c[]  imm
+   { OP_ADD,    0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_SUB,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
+   { OP_MADSP,  0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
+   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SLCT,   0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
+   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_COS,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_SIN,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_EX2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_INSBF,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
+   { OP_PERMT,  0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   // saturate only:
+   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   // nve4 ops:
+   { OP_SULDB,   0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUSTB,   0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUSTP,   0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUCLAMP, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SUBFM,   0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_SUEAU,   0x0, 0x0, 0x0, 0x0, 0x6, 0x2 }
+};
+
+// Populate the opInfo[] capability table: default every opcode to a plain
+// F32 GPR op, then apply the flag bitmasks and the _initProps overrides.
+void TargetNVC0::initOpInfo()
+{
+   unsigned int i, j;
+
+   // bit i of word i/32 set => opcode i is commutative in its first sources
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
+      0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
+   };
+
+   // bit i set => opcode i has a 4-byte (short) encoding available
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
+      0x0670ca00, 0x00000000, 0x00000000, 0x00000000
+   };
+
+   // opcodes that do not produce a destination value
+   static const operation noDest[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP,
+      OP_SUREDB, OP_BAR
+   };
+
+   // opcodes that cannot take a predicate
+   static const operation noPred[] =
+   {
+      OP_CALL, OP_PRERET, OP_QUADON, OP_QUADPOP,
+      OP_JOINAT, OP_PREBREAK, OP_PRECONT, OP_BRKPT
+   };
+
+   // nvc0 has no separate address registers; map them onto GPRs
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
+
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
+      opInfo[noDest[i]].hasDest = 0;
+   for (i = 0; i < sizeof(noPred) / sizeof(noPred[0]); ++i)
+      opInfo[noPred[i]].predicate = 0;
+
+   // apply the per-opcode modifier/file/immediate overrides
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fImmd & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+         // bit 3 = full 32-bit immediate supported (loop-invariant; harmless
+         // to re-evaluate each iteration)
+         if (prop->fImmd & 8)
+            opInfo[prop->op].immdBits = 0xffffffff;
+      }
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Number of addressable entries (registers) or bytes in the given file.
+unsigned int
+TargetNVC0::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:          return 0;
+   // GK110+ exposes 255 GPRs per thread, earlier chips 63
+   case FILE_GPR:           return (chipset >= NVISA_GK110_CHIPSET) ? 255 : 63;
+   case FILE_PREDICATE:     return 7;
+   case FILE_FLAGS:         return 1;
+   case FILE_ADDRESS:       return 0;
+   case FILE_IMMEDIATE:     return 0;
+   case FILE_MEMORY_CONST:  return 65536;
+   case FILE_SHADER_INPUT:  return 0x400;
+   case FILE_SHADER_OUTPUT: return 0x400;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL:  return 48 << 10;
+   case FILE_SYSTEM_VALUE:  return 32;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+}
+
+// Addressing unit of a file as a log2 byte shift — presumably registers and
+// system values are indexed in 4-byte units, everything else in bytes.
+unsigned int
+TargetNVC0::getFileUnit(DataFile file) const
+{
+   switch (file) {
+   case FILE_GPR:
+   case FILE_ADDRESS:
+   case FILE_SYSTEM_VALUE:
+      return 2;
+   default:
+      return 0;
+   }
+}
+
+// Byte address of a system value in the shader's input/output space.
+// Returns ~0 / 0xffffffff for values not addressable on this chipset.
+uint32_t
+TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   const int idx = sym->reg.data.sv.index;
+   const SVSemantic sv = sym->reg.data.sv.sv;
+
+   // primitive id lives at a different address on the input side
+   const bool isInput = shaderFile == FILE_SHADER_INPUT;
+   const bool kepler = getChipset() >= NVISA_GK104_CHIPSET;
+
+   switch (sv) {
+   case SV_POSITION:       return 0x070 + idx * 4;
+   case SV_INSTANCE_ID:    return 0x2f8;
+   case SV_VERTEX_ID:      return 0x2fc;
+   case SV_PRIMITIVE_ID:   return isInput ? 0x060 : 0x040;
+   case SV_LAYER:          return 0x064;
+   case SV_VIEWPORT_INDEX: return 0x068;
+   case SV_POINT_SIZE:     return 0x06c;
+   case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
+   case SV_POINT_COORD:    return 0x2e0 + idx * 4;
+   case SV_FACE:           return 0x3fc;
+   case SV_TESS_FACTOR:    return 0x000 + idx * 4;
+   case SV_TESS_COORD:     return 0x2f0 + idx * 4;
+   // compute grid values are only memory-mapped on Kepler
+   case SV_NTID:           return kepler ? (0x00 + idx * 4) : ~0;
+   case SV_NCTAID:         return kepler ? (0x0c + idx * 4) : ~0;
+   case SV_GRIDID:         return kepler ? 0x18 : ~0;
+   default:
+      return 0xffffffff;
+   }
+}
+
+// Can instruction 'i' consume the value produced by load/immediate 'ld'
+// directly in source slot 's' (i.e. can 'ld' be folded into 'i') ?
+bool
+TargetNVC0::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src(0).getFile();
+
+   // immediate 0 can be represented by GPR $r63/$r255
+   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
+      return (!i->isPseudo() &&
+              !i->asTex() &&
+              i->op != OP_EXPORT && i->op != OP_STORE);
+
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+
+   // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
+   if (ld->src(0).isIndirect(0))
+      return false;
+
+   // only one non-GPR source is allowed overall: reject if any other source
+   // already comes from memory/immediate (a zero immediate is fine, as above)
+   for (int k = 0; i->srcExists(k); ++k) {
+      if (i->src(k).getFile() == FILE_IMMEDIATE) {
+         if (k == 2 && i->op == OP_SUCLAMP) // special case
+            continue;
+         if (i->getSrc(k)->reg.data.u64 != 0)
+            return false;
+      } else
+      if (i->src(k).getFile() != FILE_GPR &&
+          i->src(k).getFile() != FILE_PREDICATE) {
+         return false;
+      }
+   }
+
+   // not all instructions support full 32 bit immediates
+   if (sf == FILE_IMMEDIATE) {
+      Storage &reg = ld->getSrc(0)->asImm()->reg;
+
+      if (opInfo[i->op].immdBits != 0xffffffff) {
+         // short immediate only: f32 must fit in the high 20 bits,
+         // integers must fit in a signed 20-bit field
+         if (i->sType == TYPE_F32) {
+            if (reg.data.u32 & 0xfff)
+               return false;
+         } else
+         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            // with u32, 0xfffff counts as 0xffffffff as well
+            if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
+               return false;
+         }
+      } else
+      if (i->op == OP_MAD || i->op == OP_FMA) {
+         // requires src == dst, cannot decide before RA
+         // (except if we implement more constraints)
+         if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
+            return false;
+      } else
+      if (i->op == OP_ADD && i->sType == TYPE_F32) {
+         // add f32 LIMM cannot saturate
+         if (i->saturate && (reg.data.u32 & 0xfff))
+            return false;
+      }
+   }
+
+   return true;
+}
+
+// Can a single memory access of type 'ty' be performed on file 'file' ?
+bool
+TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
+{
+   if (ty == TYPE_NONE)
+      return false;
+   if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
+      return typeSizeof(ty) <= 8;
+   if (ty == TYPE_B96)
+      return false;
+   if (getChipset() >= 0xf0) {
+      // XXX: find wide vfetch/export
+      if (ty == TYPE_B128)
+         return false;
+      if (ty == TYPE_U64)
+         return false;
+   }
+   return true;
+}
+
+// Is (op, dType) natively available, or does it need lowering ?
+bool
+TargetNVC0::isOpSupported(operation op, DataType ty) const
+{
+   switch (op) {
+   case OP_MAD:
+   case OP_FMA:
+      // fused/ordinary multiply-add only exists for f32 here
+      return ty == TYPE_F32;
+   case OP_SAD:
+      return ty == TYPE_S32 || ty == TYPE_U32;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+      // always lowered to instruction sequences / builtins
+      return false;
+   default:
+      return true;
+   }
+}
+
+// Can source 's' of 'insn' carry modifier 'mod' (neg/abs/not) natively ?
+bool
+TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      // integer ops: only a restricted set accepts modifiers at all
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_SET:
+         if (insn->sType != TYPE_F32)
+            return false;
+         break;
+      case OP_ADD:
+         // integer add: no abs, and only one source may be negated (SUB form)
+         if (mod.abs())
+            return false;
+         if (insn->src(s ? 0 : 1).mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src(1).mod.neg() ? false : true;
+         break;
+      default:
+         return false;
+      }
+   }
+   // NOTE(review): only s > 3 is rejected, then srcMods[s] is indexed —
+   // confirm opInfo's srcMods array has at least 4 entries.
+   if (s > 3)
+      return false;
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// An instruction can take predicate 'pred' only if it supports predication
+// at all and does not already carry one.
+bool
+TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   return insn->getPredicate() ? false : opInfo[insn->op].predicate;
+}
+
+// Does 'insn' support a saturate destination modifier ?
+bool
+TargetNVC0::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+   if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
+      return false;
+
+   if (insn->dType == TYPE_U32)
+      return (insn->op == OP_ADD) || (insn->op == OP_MAD);
+
+   // add f32 LIMM cannot saturate
+   if (insn->op == OP_ADD && insn->sType == TYPE_F32) {
+      if (insn->getSrc(1)->asImm() &&
+          insn->getSrc(1)->reg.data.u32 & 0xfff)
+         return false;
+   }
+
+   return insn->dType == TYPE_F32;
+}
+
+// Can a multiply by 'f' be folded into a post-multiply field ? On success
+// 'e' receives the exponent; only exact powers of two with |e| <= 3 qualify.
+// NOTE(review): f == 0 makes log2f return -inf, and converting that to int
+// is undefined — presumably callers never pass 0; confirm.
+bool
+TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
+{
+   if (op != OP_MUL)
+      return false;
+   f = fabsf(f);
+   e = static_cast<int>(log2f(f));
+   if (e < -3 || e > 3)
+      return false;
+   return f == exp2f(static_cast<float>(e));
+}
+
+// TODO: better values
+// this could be more precise, e.g. depending on the issue-to-read/write delay
+// of the depending instruction, but it's good enough
+// Approximate issue-to-result latency in cycles; used by the scheduler.
+// The former trailing 'return 32;' was unreachable (both branches return)
+// and has been removed.
+int TargetNVC0::getLatency(const Instruction *i) const
+{
+   if (chipset >= 0xe4) {
+      if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
+         return 20;
+      switch (i->op) {
+      case OP_LINTERP:
+      case OP_PINTERP:
+         return 15;
+      case OP_LOAD:
+         // constant buffer hits are much faster than other memory
+         if (i->src(0).getFile() == FILE_MEMORY_CONST)
+            return 9;
+         // fall through
+      case OP_VFETCH:
+         return 24;
+      default:
+         if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
+            return 17;
+         if (i->op == OP_MUL && i->dType != TYPE_F32)
+            return 15;
+         return 9;
+      }
+   } else {
+      if (i->op == OP_LOAD) {
+         // uncached (CV) loads go all the way to memory
+         if (i->cache == CACHE_CV)
+            return 700;
+         return 48;
+      }
+      return 24;
+   }
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+// Inverse throughput: cycles to issue 'i' for a full 32-thread warp.
+int TargetNVC0::getThroughput(const Instruction *i) const
+{
+   // TODO: better values
+   if (i->dType == TYPE_F32) {
+      switch (i->op) {
+      case OP_ADD:
+      case OP_MUL:
+      case OP_MAD:
+      case OP_FMA:
+         return 1;
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_MIN:
+      case OP_MAX:
+         return 2;
+      // transcendentals (and anything unlisted) use the slow SFU path
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+      default:
+         return 8;
+      }
+   } else
+   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
+      switch (i->op) {
+      case OP_ADD:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+      case OP_NOT:
+         return 1;
+      // everything else, including the explicitly listed ops, costs 2
+      case OP_MUL:
+      case OP_MAD:
+      case OP_CVT:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_SHL:
+      case OP_SHR:
+      case OP_NEG:
+      case OP_ABS:
+      case OP_MIN:
+      case OP_MAX:
+      default:
+         return 2;
+      }
+   } else
+   if (i->dType == TYPE_F64) {
+      return 2;
+   } else {
+      return 1;
+   }
+}
+
+// Can instructions 'a' and 'b' be issued in the same cycle (dual issue) ?
+// 'a' is the first of the pair.
+bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
+{
+   const OpClass clA = operationClass[a->op];
+   const OpClass clB = operationClass[b->op];
+
+   if (getChipset() >= 0xe4) {
+      // not texturing
+      // not if the 2nd instruction isn't necessarily executed
+      if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
+         return false;
+      // anything with MOV
+      if (a->op == OP_MOV || b->op == OP_MOV)
+         return true;
+      if (clA == clB) {
+         // only F32 arith or integer additions
+         if (clA != OPCLASS_ARITH)
+            return false;
+         return (a->dType == TYPE_F32 || a->op == OP_ADD ||
+                 b->dType == TYPE_F32 || b->op == OP_ADD);
+      }
+      // nothing with TEXBAR
+      if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
+         return false;
+      // no loads and stores accessing the same space
+      if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
+          (clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
+         if (a->src(0).getFile() == b->src(0).getFile())
+            return false;
+      // no > 32-bit ops
+      if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
+          typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
+         return false;
+      return true;
+   } else {
+      return false; // info not needed (yet)
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
new file mode 100644
index 00000000000..7831af5069b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+#define NVC0_BUILTIN_COUNT 4
+
+// Target implementation for Fermi/Kepler (NVC0/NVE4/NVF0) ISAs.
+class TargetNVC0 : public Target
+{
+public:
+   TargetNVC0(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   // generation-specific emitter factories
+   CodeEmitter *createCodeEmitterNVC0(Program::Type);
+   CodeEmitter *createCodeEmitterGK110(Program::Type);
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   // returns pointer/size of the prebuilt builtin-function code blob
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   // capability queries used by lowering and legalization passes
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isAccessSupported(DataFile, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool isPostMultiplySupported(operation, float, int& e) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   // scheduling estimates (cycles)
+   virtual bool canDualIssue(const Instruction *, const Instruction *) const;
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   // byte offset of a NVC0_BUILTIN_* function inside the builtin code blob
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+};
+
+// computes instruction scheduling info; defined in the emitter sources
+bool calculateSchedDataNVC0(const Target *, Function *);
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
new file mode 100644
index 00000000000..895977710ca
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
@@ -0,0 +1,390 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_util.h"
+
+namespace nv50_ir {
+
+// Delete all items and restore the list to its empty (self-linked) state.
+void DLList::clear()
+{
+   for (Item *next, *item = head.next; item != &head; item = next) {
+      next = item->next;
+      delete item;
+   }
+   head.next = head.prev = &head;
+}
+
+// Remove and delete the item at the iterator's current position, then
+// advance to the next item. No-op when already at the end sentinel.
+void
+DLList::Iterator::erase()
+{
+   Item *rem = pos;
+
+   if (rem == term)
+      return;
+   pos = pos->next;
+
+   DLLIST_DEL(rem);
+   delete rem;
+}
+
+// Unlink the current item and prepend it to 'dest', advancing the iterator.
+// Must not be called at the end sentinel or with dest being this list.
+void DLList::Iterator::moveToList(DLList& dest)
+{
+   Item *item = pos;
+
+   assert(term != &dest.head);
+   assert(pos != term);
+
+   pos = pos->next;
+
+   DLLIST_DEL(item);
+   DLLIST_ADDHEAD(&dest.head, item);
+}
+
+// Insert a new item holding 'data' immediately after the current position.
+// If positioned at the sentinel, the new item becomes the iteration
+// terminator. Always returns true (allocation failure would throw).
+bool
+DLList::Iterator::insert(void *data)
+{
+   Item *ins = new Item(data);
+
+   ins->next = pos->next;
+   ins->prev = pos;
+   pos->next->prev = ins;
+   pos->next = ins;
+
+   if (pos == term)
+      term = ins;
+
+   return true;
+}
+
+// Append this stack's items on top of 'that' and empty this stack.
+// 'that' is grown (resize) until it can hold the combined contents.
+void
+Stack::moveTo(Stack& that)
+{
+   unsigned int newSize = this->size + that.size;
+
+   while (newSize > that.limit)
+      that.resize();
+   memcpy(&that.array[that.size], &array[0], this->size * sizeof(Item));
+
+   that.size = newSize;
+   this->size = 0;
+}
+
+// Copy constructor: deep-copies all ranges of 'that' via insert().
+Interval::Interval(const Interval& that) : head(NULL), tail(NULL)
+{
+   this->insert(that);
+}
+
+// Destructor: frees all owned Range nodes.
+Interval::~Interval()
+{
+   clear();
+}
+
+// Free every Range node and reset to the empty interval.
+void
+Interval::clear()
+{
+   for (Range *next, *r = head; r; r = next) {
+      next = r->next;
+      delete r;
+   }
+   head = tail = NULL;
+}
+
+// Add the half-open range [a, b) to the interval, merging with any
+// overlapping existing range. Ranges are kept sorted by begin position.
+bool
+Interval::extend(int a, int b)
+{
+   Range *r, **nextp = &head;
+
+   // NOTE: we need empty intervals for fixed registers
+   // if (a == b)
+   //   return false;
+   assert(a <= b);
+
+   for (r = head; r; r = r->next) {
+      if (b < r->bgn)
+         break; // insert before
+      if (a > r->end) {
+         // insert after
+         nextp = &r->next;
+         continue;
+      }
+
+      // overlap: grow this range to cover [a, b), then let coalesce()
+      // absorb any following ranges that now touch it
+      if (a < r->bgn) {
+         r->bgn = a;
+         if (b > r->end)
+            r->end = b;
+         r->coalesce(&tail);
+         return true;
+      }
+      if (b > r->end) {
+         r->end = b;
+         r->coalesce(&tail);
+         return true;
+      }
+      // fully contained in an existing range, nothing to do
+      assert(a >= r->bgn);
+      assert(b <= r->end);
+      return true;
+   }
+
+   // no overlap: link a fresh range at the insertion point
+   (*nextp) = new Range(a, b);
+   (*nextp)->next = r;
+
+   // re-find the last range to keep 'tail' up to date
+   for (r = (*nextp); r->next; r = r->next);
+   tail = r;
+   return true;
+}
+
+// Does any half-open range [bgn, end) of this interval contain 'pos' ?
+// Ranges are sorted by begin, so stop once one starts beyond pos.
+bool Interval::contains(int pos) const
+{
+   for (const Range *r = head; r; r = r->next) {
+      if (r->bgn > pos)
+         break;
+      if (r->end > pos)
+         return true;
+   }
+   return false;
+}
+
+// Do this interval and 'that' share any position ? Both range lists are
+// sorted by begin, so a simultaneous linear walk over the two lists finds
+// any intersection in O(n + m).
+// (A disabled '#else' variant used to live here doing the O(n*m) pairwise
+// comparison, but it referenced a nonexistent variable 'iv' and would not
+// have compiled; it has been removed.)
+bool Interval::overlaps(const Interval &that) const
+{
+   Range *a = this->head;
+   Range *b = that.head;
+
+   while (a && b) {
+      if (b->bgn < a->end &&
+          b->end > a->bgn)
+         return true;
+      // advance whichever list is behind
+      if (a->end <= b->bgn)
+         a = a->next;
+      else
+         b = b->next;
+   }
+   return false;
+}
+
+// Merge a copy of all of that's ranges into this interval ('that' is kept).
+void Interval::insert(const Interval &that)
+{
+   for (Range *r = that.head; r; r = r->next)
+      this->extend(r->bgn, r->end);
+}
+
+// Merge all of that's ranges into this interval, consuming (freeing) them
+// and leaving 'that' empty.
+void Interval::unify(Interval &that)
+{
+   assert(this != &that);
+   for (Range *next, *r = that.head; r; r = next) {
+      next = r->next;
+      this->extend(r->bgn, r->end);
+      delete r;
+   }
+   // reset both list pointers; the original only cleared head, leaving
+   // 'tail' dangling on freed memory
+   that.head = NULL;
+   that.tail = NULL;
+}
+
+// Total number of positions covered, i.e. the sum of (end - bgn) over all
+// half-open ranges. Fixed: the original accumulated (bgn - end), which
+// produced the negated total.
+int Interval::length() const
+{
+   int len = 0;
+   for (Range *r = head; r; r = r->next)
+      len += r->end - r->bgn;
+   return len;
+}
+
+// Debug dump: print all ranges as half-open "[bgn end)" pairs on one line.
+void Interval::print() const
+{
+   if (!head)
+      return;
+   INFO("[%i %i)", head->bgn, head->end);
+   for (const Range *r = head->next; r; r = r->next)
+      INFO(" [%i %i)", r->bgn, r->end);
+   INFO("\n");
+}
+
+// this &= ~set, word-wise over set's words; this must be at least as large.
+void
+BitSet::andNot(const BitSet &set)
+{
+   assert(data && set.data);
+   assert(size >= set.size);
+   for (unsigned int i = 0; i < (set.size + 31) / 32; ++i)
+      data[i] &= ~set.data[i];
+}
+
+// this |= set, word-wise over set's words; this must be at least as large.
+BitSet& BitSet::operator|=(const BitSet &set)
+{
+   assert(data && set.data);
+   assert(size >= set.size);
+   for (unsigned int i = 0; i < (set.size + 31) / 32; ++i)
+      data[i] |= set.data[i];
+   return *this;
+}
+
+// Grow or shrink the bit array to hold nBits bits, preserving existing
+// contents; newly added words are zeroed. Returns false (and empties the
+// set) if reallocation fails.
+bool BitSet::resize(unsigned int nBits)
+{
+   if (!data || !nBits)
+      return allocate(nBits, true);
+   const unsigned int p = (size + 31) / 32; // old word count
+   const unsigned int n = (nBits + 31) / 32; // new word count
+   if (n == p)
+      return true;
+
+   data = (uint32_t *)REALLOC(data, 4 * p, 4 * n);
+   if (!data) {
+      size = 0;
+      return false;
+   }
+   // Fixed: the original cleared from &data[4 * p + 4], indexing uint32_t
+   // words as if they were bytes (plus an off-by-one) — an out-of-bounds
+   // write that also left the genuinely new words uninitialized.
+   if (n > p)
+      memset(&data[p], 0, (n - p) * 4);
+
+   size = nBits;
+   return true;
+}
+
+// (Re)allocate storage for nBits bits. If 'zero' is set, the whole set is
+// cleared; otherwise only the last word is reset.
+// NOTE(review): the comment says "clear unused bits", but the code zeroes
+// the entire last word, discarding its valid bits too — confirm callers
+// with zero == false rely only on popCount-style behavior.
+bool BitSet::allocate(unsigned int nBits, bool zero)
+{
+   // existing storage is reused unless it is too small
+   if (data && size < nBits) {
+      FREE(data);
+      data = NULL;
+   }
+   size = nBits;
+
+   if (!data)
+      data = reinterpret_cast<uint32_t *>(CALLOC((size + 31) / 32, 4));
+
+   if (zero)
+      memset(data, 0, (size + 7) / 8);
+   else
+   if (nBits)
+      data[(size + 31) / 32 - 1] = 0; // clear unused bits (e.g. for popCount)
+
+   return data;
+}
+
+// Number of set bits in the whole set; zero words are skipped cheaply.
+unsigned int BitSet::popCount() const
+{
+   const unsigned int nWords = (size + 31) / 32;
+   unsigned int total = 0;
+
+   for (unsigned int w = 0; w < nWords; ++w) {
+      const uint32_t bits = data[w];
+      if (bits)
+         total += util_bitcount(bits);
+   }
+   return total;
+}
+
+// Set every word to 'val', then clear the bits beyond 'size' in the last
+// word so stray bits do not affect e.g. popCount().
+// Fixed: the original masked data[i] with i == word count (one past the
+// last word, an out-of-bounds write) and shifted 0xffffffff by 32 when
+// size is a multiple of 32 (undefined behavior). When size % 32 == 0 there
+// are no unused bits, so no masking is needed at all.
+void BitSet::fill(uint32_t val)
+{
+   unsigned int i;
+   for (i = 0; i < (size + 31) / 32; ++i)
+      data[i] = val;
+   if (val && (size % 32))
+      data[i - 1] &= ~(0xffffffff << (size % 32)); // BE ?
+}
+
+// this = *pA | *pB; with pB == NULL it degenerates to a plain copy of *pA.
+// All sets are assumed to have (at least) this set's word count.
+void BitSet::setOr(BitSet *pA, BitSet *pB)
+{
+   if (!pB) {
+      *this = *pA;
+   } else {
+      for (unsigned int i = 0; i < (size + 31) / 32; ++i)
+         data[i] = pA->data[i] | pB->data[i];
+   }
+}
+
+// Find the start position of 'count' consecutive clear bits, aligned to
+// the (rounded-up) allocation size; returns -1 if none fits within 'size'.
+// NOTE(review): 'm' is built from the unrounded count before the else
+// branch rounds count up to 8/16/32 — so for e.g. count == 20 only the
+// first 20 bits of a 32-aligned slot are tested. Confirm whether that is
+// intended (allocations only occupy the original count) or a latent bug.
+// Also note (1 << count) is undefined for count >= 32 on entry.
+int BitSet::findFreeRange(unsigned int count) const
+{
+   const uint32_t m = (1 << count) - 1;
+   int pos = size;
+   unsigned int i;
+   const unsigned int end = (size + 31) / 32;
+
+   if (count == 1) {
+      // any single clear bit
+      for (i = 0; i < end; ++i) {
+         pos = ffs(~data[i]) - 1;
+         if (pos >= 0)
+            break;
+      }
+   } else
+   if (count == 2) {
+      // even-aligned pair of clear bits
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            uint32_t b = data[i] | (data[i] >> 1) | 0xaaaaaaaa;
+            pos = ffs(~b) - 1;
+            if (pos >= 0)
+               break;
+         }
+      }
+   } else
+   if (count == 4 || count == 3) {
+      // 4-aligned nibble of clear bits (3 is rounded up to 4)
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            uint32_t b =
+               (data[i] >> 0) | (data[i] >> 1) |
+               (data[i] >> 2) | (data[i] >> 3) | 0xeeeeeeee;
+            pos = ffs(~b) - 1;
+            if (pos >= 0)
+               break;
+         }
+      }
+   } else {
+      // larger requests are rounded up to the next power-of-two slot size
+      if (count <= 8)
+         count = 8;
+      else
+      if (count <= 16)
+         count = 16;
+      else
+         count = 32;
+
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            for (pos = 0; pos < 32; pos += count)
+               if (!(data[i] & (m << pos)))
+                  break;
+            if (pos < 32)
+               break;
+         }
+      }
+   }
+   // convert word-relative position to absolute bit index
+   pos += i * 32;
+
+   return ((pos + count) <= size) ? pos : -1;
+}
+
+// Debug dump: print the indices of all set bits, 16 per line.
+void BitSet::print() const
+{
+   unsigned int n = 0;
+   INFO("BitSet of size %u:\n", size);
+   for (unsigned int i = 0; i < (size + 31) / 32; ++i) {
+      uint32_t bits = data[i];
+      while (bits) {
+         int pos = ffs(bits) - 1;
+         bits &= ~(1 << pos);
+         INFO(" %i", i * 32 + pos);
+         ++n;
+         if ((n % 16) == 0)
+            INFO("\n");
+      }
+   }
+   if (n % 16)
+      INFO("\n");
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
new file mode 100644
index 00000000000..a4ea9d981e0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
@@ -0,0 +1,788 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_UTIL_H__
+#define __NV50_IR_UTIL_H__
+
+#include <new>
+#include <assert.h>
+#include <stdio.h>
+#include <memory>
+#include <map>
+
+#ifndef NDEBUG
+# include <typeinfo>
+#endif
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+// Logging helpers (GNU variadic-macro style); all output goes through
+// gallium's debug_printf.
+#define ERROR(args...) debug_printf("ERROR: " args)
+#define WARN(args...) debug_printf("WARNING: " args)
+#define INFO(args...) debug_printf(args)
+
+// Print only when the debug mask m has the NV50_IR_DEBUG_<f> bit set.
+#define INFO_DBG(m, f, args...) \
+ do { \
+ if (m & NV50_IR_DEBUG_##f) \
+ debug_printf(args); \
+ } while(0)
+
+// Unconditional abort -- for unrecoverable internal errors.
+#define FATAL(args...) \
+ do { \
+ fprintf(stderr, args); \
+ abort(); \
+ } while(0)
+
+
+// Placement-new allocators: IR objects are carved out of per-Program
+// MemoryPools (mem_<class>) instead of the general heap.
+#define NV50_IR_FUNC_ALLOC_OBJ_DEF(obj, f, args...) \
+ new ((f)->getProgram()->mem_##obj.allocate()) obj(f, args)
+
+#define new_Instruction(f, args...) \
+ NV50_IR_FUNC_ALLOC_OBJ_DEF(Instruction, f, args)
+#define new_CmpInstruction(f, args...) \
+ NV50_IR_FUNC_ALLOC_OBJ_DEF(CmpInstruction, f, args)
+#define new_TexInstruction(f, args...) \
+ NV50_IR_FUNC_ALLOC_OBJ_DEF(TexInstruction, f, args)
+#define new_FlowInstruction(f, args...) \
+ NV50_IR_FUNC_ALLOC_OBJ_DEF(FlowInstruction, f, args)
+
+#define new_LValue(f, args...) \
+ NV50_IR_FUNC_ALLOC_OBJ_DEF(LValue, f, args)
+
+
+// Same, but the pool lives directly on the Program.
+#define NV50_IR_PROG_ALLOC_OBJ_DEF(obj, p, args...) \
+ new ((p)->mem_##obj.allocate()) obj(p, args)
+
+#define new_Symbol(p, args...) \
+ NV50_IR_PROG_ALLOC_OBJ_DEF(Symbol, p, args)
+#define new_ImmediateValue(p, args...) \
+ NV50_IR_PROG_ALLOC_OBJ_DEF(ImmediateValue, p, args)
+
+
+// Matching destruction: the Program returns the object to its pool.
+#define delete_Instruction(p, insn) (p)->releaseInstruction(insn)
+#define delete_Value(p, val) (p)->releaseValue(val)
+
+
+namespace nv50_ir {
+
+// Abstract forward iterator over untyped (void *) elements; concrete
+// implementations are provided by the containers in this header.
+class Iterator
+{
+public:
+ virtual ~Iterator() { };
+ virtual void next() = 0;
+ virtual void *get() const = 0;
+ virtual bool end() const = 0; // if true, get will return 0
+ virtual void reset() { assert(0); } // only for graph iterators
+};
+
+// Owning handle for a heap-allocated Iterator.
+// NOTE(review): std::auto_ptr is deprecated since C++11; std::unique_ptr
+// would be the replacement once the codebase allows it.
+typedef std::auto_ptr<Iterator> IteratorRef;
+
+// Iterator that can also modify the sequence it walks.
+class ManipIterator : public Iterator
+{
+public:
+ virtual bool insert(void *) = 0; // insert after current position
+ virtual void erase() = 0;
+};
+
+// WARNING: do not use a->prev/next for __item or __list
+
+// Unlink __item from its ring and leave it self-linked.
+#define DLLIST_DEL(__item) \
+ do { \
+ (__item)->prev->next = (__item)->next; \
+ (__item)->next->prev = (__item)->prev; \
+ (__item)->next = (__item); \
+ (__item)->prev = (__item); \
+ } while(0)
+
+// Insert __item just before the sentinel, i.e. at the tail of the list.
+#define DLLIST_ADDTAIL(__list, __item) \
+ do { \
+ (__item)->next = (__list); \
+ (__item)->prev = (__list)->prev; \
+ (__list)->prev->next = (__item); \
+ (__list)->prev = (__item); \
+ } while(0)
+
+// Insert __item just after the sentinel, i.e. at the head of the list.
+#define DLLIST_ADDHEAD(__list, __item) \
+ do { \
+ (__item)->prev = (__list); \
+ (__item)->next = (__list)->next; \
+ (__list)->next->prev = (__item); \
+ (__list)->next = (__item); \
+ } while(0)
+
+// Splice all items of __listB into __listA; ty is the item pointer type,
+// needed to declare the temporary holding __listB's old tail.
+#define DLLIST_MERGE(__listA, __listB, ty) \
+ do { \
+ ty prevB = (__listB)->prev; \
+ (__listA)->prev->next = (__listB); \
+ (__listB)->prev->next = (__listA); \
+ (__listB)->prev = (__listA)->prev; \
+ (__listA)->prev = prevB; \
+ } while(0)
+
+// A list is empty when the sentinel links to itself.
+#define DLLIST_EMPTY(__list) ((__list)->next == (__list))
+
+#define DLLIST_FOR_EACH(list, it) \
+ for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next())
+
+// Doubly-linked circular list of void * payloads. `head` is a sentinel
+// Item (data == NULL) that is never returned by iteration; real items are
+// heap-allocated on insert and freed by clear()/erase().
+class DLList
+{
+public:
+ class Item
+ {
+ public:
+ // A fresh item is self-linked until spliced into a list.
+ Item(void *priv) : next(this), prev(this), data(priv) { }
+
+ public:
+ Item *next;
+ Item *prev;
+ void *data;
+ };
+
+ DLList() : head(0) { }
+ ~DLList() { clear(); }
+
+ // Prepend data (must be non-NULL: NULL marks the sentinel).
+ inline void insertHead(void *data)
+ {
+ Item *item = new Item(data);
+
+ assert(data);
+
+ item->prev = &head;
+ item->next = head.next;
+ head.next->prev = item;
+ head.next = item;
+ }
+
+ // Append data (must be non-NULL).
+ inline void insertTail(void *data)
+ {
+ Item *item = new Item(data);
+
+ assert(data);
+
+ DLLIST_ADDTAIL(&head, item);
+ }
+
+ inline void insert(void *data) { insertTail(data); }
+
+ void clear();
+
+ // Bidirectional-capable iterator; `rev` fixes the walking direction at
+ // construction time. Terminates when it reaches the sentinel again.
+ class Iterator : public ManipIterator
+ {
+ public:
+ Iterator(Item *head, bool r) : rev(r), pos(r ? head->prev : head->next),
+ term(head) { }
+
+ virtual void next() { if (!end()) pos = rev ? pos->prev : pos->next; }
+ virtual void *get() const { return pos->data; }
+ virtual bool end() const { return pos == term; }
+
+ // caution: if you're at end-2 and erase it, then do next, you're at end
+ virtual void erase();
+ virtual bool insert(void *data);
+
+ // move item to another list; no consistency with its iterators though
+ void moveToList(DLList&);
+
+ private:
+ const bool rev;
+ Item *pos;
+ Item *term;
+
+ friend class DLList;
+ };
+
+ inline void erase(Iterator& pos)
+ {
+ pos.erase();
+ }
+
+ Iterator iterator()
+ {
+ return Iterator(&head, false);
+ }
+
+ Iterator revIterator()
+ {
+ return Iterator(&head, true);
+ }
+
+private:
+ Item head;
+};
+
+// Growable LIFO stack of POD items (pointer / int / unsigned / float /
+// double stored in a union). Storage grows by doubling via gallium's
+// REALLOC; items are plain data and are never destructed.
+class Stack
+{
+public:
+ class Item {
+ public:
+ union {
+ void *p;
+ int i;
+ unsigned int u;
+ float f;
+ double d;
+ } u;
+
+ Item() { memset(&u, 0, sizeof(u)); }
+ };
+
+ Stack() : size(0), limit(0), array(0) { }
+ ~Stack() { if (array) FREE(array); }
+
+ inline void push(int i) { Item data; data.u.i = i; push(data); }
+ inline void push(unsigned int u) { Item data; data.u.u = u; push(data); }
+ inline void push(void *p) { Item data; data.u.p = p; push(data); }
+ inline void push(float f) { Item data; data.u.f = f; push(data); }
+
+ inline void push(Item data)
+ {
+ if (size == limit)
+ resize();
+ array[size++] = data;
+ }
+
+ // Popping an empty stack asserts (debug builds) and returns a zeroed Item.
+ inline Item pop()
+ {
+ if (!size) {
+ Item data;
+ assert(0);
+ return data;
+ }
+ return array[--size];
+ }
+
+ inline unsigned int getSize() { return size; }
+
+ inline Item& peek() { assert(size); return array[size - 1]; }
+
+ // Drop all items; with releaseStorage the backing memory is freed too.
+ void clear(bool releaseStorage = false)
+ {
+ if (releaseStorage && array) {
+ FREE(array);
+ // Null the pointer: leaving it dangling would make a later
+ // resize() REALLOC freed memory and the destructor free it twice.
+ array = NULL;
+ }
+ size = limit = 0;
+ }
+
+ void moveTo(Stack&); // move all items to target (not like push(pop()))
+
+private:
+ // Double the capacity (starting at 4 items) preserving contents.
+ void resize()
+ {
+ unsigned int sizeOld, sizeNew;
+
+ sizeOld = limit * sizeof(Item);
+ limit = MAX2(4, limit + limit);
+ sizeNew = limit * sizeof(Item);
+
+ array = (Item *)REALLOC(array, sizeOld, sizeNew);
+ }
+
+ unsigned int size; // number of valid items
+ unsigned int limit; // allocated capacity in items
+ Item *array;
+};
+
+// Sparse auto-growing array of union items. Indexing past the end with the
+// non-const operator[] grows the storage (to the next power of two above
+// the index); note that REALLOC leaves the new tail uninitialized, so
+// callers must write an element before reading it.
+class DynArray
+{
+public:
+ class Item
+ {
+ public:
+ union {
+ uint32_t u32;
+ void *p;
+ };
+ };
+
+ DynArray() : data(NULL), size(0) { }
+
+ ~DynArray() { if (data) FREE(data); }
+
+ inline Item& operator[](unsigned int i)
+ {
+ if (i >= size)
+ resize(i);
+ return data[i];
+ }
+
+ // Const access is unchecked: i must already be < size.
+ inline const Item operator[](unsigned int i) const
+ {
+ return data[i];
+ }
+
+ // Grow capacity (doubling from a minimum of 8) until index fits.
+ void resize(unsigned int index)
+ {
+ const unsigned int oldSize = size * sizeof(Item);
+
+ if (!size)
+ size = 8;
+ while (size <= index)
+ size <<= 1;
+
+ data = (Item *)REALLOC(data, oldSize, size * sizeof(Item));
+ }
+
+ void clear()
+ {
+ FREE(data);
+ data = NULL;
+ size = 0;
+ }
+
+private:
+ Item *data;
+ unsigned int size;
+};
+
+// Array-backed list that hands out stable integer ids. Removed slots keep
+// their id on a free stack (`ids`) for reuse by later insertions; a slot is
+// live iff its stored pointer is non-NULL.
+class ArrayList
+{
+public:
+ ArrayList() : size(0) { }
+
+ // Store item and return its id: a recycled one if available, else a
+ // fresh index at the end.
+ void insert(void *item, int& id)
+ {
+ id = ids.getSize() ? ids.pop().u.i : size++;
+ data[id].p = item;
+ }
+
+ // Free the slot, recycle its id, and invalidate the caller's handle.
+ void remove(int& id)
+ {
+ const unsigned int uid = id;
+ assert(uid < size && data[id].p);
+ ids.push(uid);
+ data[uid].p = NULL;
+ id = -1;
+ }
+
+ // Highest id ever handed out + 1 (includes removed slots).
+ inline int getSize() const { return size; }
+
+ inline void *get(unsigned int id) { assert(id < size); return data[id].p; }
+
+ // Iterates live (non-NULL) slots only, skipping removed ones.
+ class Iterator : public nv50_ir::Iterator
+ {
+ public:
+ Iterator(const ArrayList *array) : pos(0), data(array->data)
+ {
+ size = array->getSize();
+ if (size)
+ nextValid();
+ }
+
+ void nextValid() { while ((pos < size) && !data[pos].p) ++pos; }
+
+ void next() { if (pos < size) { ++pos; nextValid(); } }
+ void *get() const { assert(pos < size); return data[pos].p; }
+ bool end() const { return pos >= size; }
+
+ private:
+ unsigned int pos;
+ unsigned int size;
+ const DynArray& data;
+
+ friend class ArrayList;
+ };
+
+ Iterator iterator() const { return Iterator(this); }
+
+ void clear()
+ {
+ data.clear();
+ ids.clear(true);
+ size = 0;
+ }
+
+private:
+ DynArray data;
+ Stack ids;
+ unsigned int size;
+};
+
+// A set of integer positions kept as a sorted singly-linked list of
+// disjoint [bgn, end) ranges (used for live ranges in register
+// allocation). `tail` caches the last range for O(1) end().
+class Interval
+{
+public:
+ Interval() : head(0), tail(0) { }
+ Interval(const Interval&);
+ ~Interval();
+
+ bool extend(int, int);
+ void insert(const Interval&);
+ void unify(Interval&); // clears source interval
+ void clear();
+
+ // begin()/end() return -1 for an empty interval.
+ inline int begin() const { return head ? head->bgn : -1; }
+ inline int end() const { checkTail(); return tail ? tail->end : -1; }
+ inline bool isEmpty() const { return !head; }
+ bool overlaps(const Interval&) const;
+ bool contains(int pos) const;
+
+ // Total span including gaps; length() (defined elsewhere) is presumably
+ // the covered size only -- see the .cpp.
+ inline int extent() const { return end() - begin(); }
+ int length() const;
+
+ void print() const;
+
+ inline void checkTail() const;
+
+private:
+ class Range
+ {
+ public:
+ Range(int a, int b) : next(0), bgn(a), end(b) { }
+
+ Range *next;
+ int bgn;
+ int end;
+
+ // Merge this range with all following ranges it now touches or
+ // overlaps, updating *ptail if this becomes the last range.
+ void coalesce(Range **ptail)
+ {
+ Range *rnn;
+
+ while (next && end >= next->bgn) {
+ assert(bgn <= next->bgn);
+ rnn = next->next;
+ end = MAX2(end, next->end);
+ delete next;
+ next = rnn;
+ }
+ if (!next)
+ *ptail = this;
+ }
+ };
+
+ Range *head;
+ Range *tail;
+};
+
+// Fixed-size bit vector stored as an array of 32-bit words ((size + 31) /
+// 32 words allocated for `size` bits). Single-bit and intra-word range
+// operations are inline; word-count-dependent operations live in the .cpp.
+class BitSet
+{
+public:
+ BitSet() : marker(false), data(0), size(0) { }
+ BitSet(unsigned int nBits, bool zero) : marker(false), data(0), size(0)
+ {
+ allocate(nBits, zero);
+ }
+ ~BitSet()
+ {
+ if (data)
+ FREE(data);
+ }
+
+ bool allocate(unsigned int nBits, bool zero);
+ bool resize(unsigned int nBits); // keep old data, zero additional bits
+
+ inline unsigned int getSize() const { return size; }
+
+ void fill(uint32_t val);
+
+ void setOr(BitSet *, BitSet *); // second BitSet may be NULL
+
+ inline void set(unsigned int i)
+ {
+ assert(i < size);
+ data[i / 32] |= 1 << (i % 32);
+ }
+ // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+ inline void setRange(unsigned int i, unsigned int n)
+ {
+ assert((i + n) <= size && (((i % 32) + n) <= 32));
+ data[i / 32] |= ((1 << n) - 1) << (i % 32);
+ }
+ // OR an arbitrary mask into the word containing bit i.
+ inline void setMask(unsigned int i, uint32_t m)
+ {
+ assert(i < size);
+ data[i / 32] |= m;
+ }
+
+ inline void clr(unsigned int i)
+ {
+ assert(i < size);
+ data[i / 32] &= ~(1 << (i % 32));
+ }
+ // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+ inline void clrRange(unsigned int i, unsigned int n)
+ {
+ assert((i + n) <= size && (((i % 32) + n) <= 32));
+ data[i / 32] &= ~(((1 << n) - 1) << (i % 32));
+ }
+
+ inline bool test(unsigned int i) const
+ {
+ assert(i < size);
+ return data[i / 32] & (1 << (i % 32));
+ }
+ // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+ // True if ANY bit in the range is set.
+ inline bool testRange(unsigned int i, unsigned int n) const
+ {
+ assert((i + n) <= size && (((i % 32) + n) <= 32));
+ return data[i / 32] & (((1 << n) - 1) << (i % 32));
+ }
+
+ // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size).
+ int findFreeRange(unsigned int size) const;
+
+ BitSet& operator|=(const BitSet&);
+
+ // Copy contents; both sets must be allocated and the same size. The
+ // copy is byte-granular, so up to 7 trailing bits of the last word may
+ // be left stale in the destination -- harmless, they are past `size`.
+ BitSet& operator=(const BitSet& set)
+ {
+ assert(data && set.data);
+ assert(size == set.size);
+ memcpy(data, set.data, (set.size + 7) / 8);
+ return *this;
+ }
+
+ void andNot(const BitSet&);
+
+ // bits = (bits | setMask) & ~clrMask, applied to every 32-bit word
+ inline void periodicMask32(uint32_t setMask, uint32_t clrMask)
+ {
+ for (unsigned int i = 0; i < (size + 31) / 32; ++i)
+ data[i] = (data[i] | setMask) & ~clrMask;
+ }
+
+ unsigned int popCount() const;
+
+ void print() const;
+
+public:
+ bool marker; // for user
+
+private:
+ uint32_t *data;
+ unsigned int size;
+};
+
+// Debug-only invariant check: `tail` must point at the last Range in the
+// list. Compiles to nothing unless NV50_DEBUG_PROG_RA is enabled.
+void Interval::checkTail() const
+{
+#if NV50_DEBUG & NV50_DEBUG_PROG_RA
+ // NOTE(review): dereferences head without a NULL check, so an empty
+ // interval would crash here when the check is enabled -- confirm end()
+ // is never reached on an empty interval in debug builds.
+ Range *r = head;
+ while (r->next)
+ r = r->next;
+ assert(tail == r);
+#endif
+}
+
+// Pool allocator for fixed-size objects. Memory is obtained in chunks of
+// (1 << objStepLog2) objects; chunk pointers are kept in allocArray, which
+// itself grows 32 entries at a time. Released objects form an intrusive
+// free list threaded through their first word, so objSize must be at
+// least sizeof(void *).
+class MemoryPool
+{
+private:
+ // Grow the chunk-pointer array by nr entries (id = current entry count).
+ inline bool enlargeAllocationsArray(const unsigned int id, unsigned int nr)
+ {
+ const unsigned int size = sizeof(uint8_t *) * id;
+ const unsigned int incr = sizeof(uint8_t *) * nr;
+
+ uint8_t **alloc = (uint8_t **)REALLOC(allocArray, size, size + incr);
+ if (!alloc)
+ return false;
+ allocArray = alloc;
+ return true;
+ }
+
+ // Allocate the next chunk of objects (and, every 32 chunks, more room
+ // in the chunk-pointer array).
+ inline bool enlargeCapacity()
+ {
+ const unsigned int id = count >> objStepLog2;
+
+ uint8_t *const mem = (uint8_t *)MALLOC(objSize << objStepLog2);
+ if (!mem)
+ return false;
+
+ if (!(id % 32)) {
+ if (!enlargeAllocationsArray(id, 32)) {
+ FREE(mem);
+ return false;
+ }
+ }
+ allocArray[id] = mem;
+ return true;
+ }
+
+public:
+ MemoryPool(unsigned int size, unsigned int incr) : objSize(size),
+ objStepLog2(incr)
+ {
+ allocArray = NULL;
+ released = NULL;
+ count = 0;
+ }
+
+ // Frees whole chunks only; individual objects need no destruction.
+ ~MemoryPool()
+ {
+ unsigned int allocCount = (count + (1 << objStepLog2) - 1) >> objStepLog2;
+ for (unsigned int i = 0; i < allocCount && allocArray[i]; ++i)
+ FREE(allocArray[i]);
+ if (allocArray)
+ FREE(allocArray);
+ }
+
+ // Returns a recycled object if available, otherwise carves the next
+ // slot out of the current chunk. NULL on allocation failure.
+ void *allocate()
+ {
+ void *ret;
+ const unsigned int mask = (1 << objStepLog2) - 1;
+
+ if (released) {
+ ret = released;
+ released = *(void **)released;
+ return ret;
+ }
+
+ if (!(count & mask))
+ if (!enlargeCapacity())
+ return NULL;
+
+ ret = allocArray[count >> objStepLog2] + (count & mask) * objSize;
+ ++count;
+ return ret;
+ }
+
+ // Push ptr onto the free list (overwrites its first pointer-sized word).
+ void release(void *ptr)
+ {
+ *(void **)ptr = released;
+ released = ptr;
+ }
+
+private:
+ uint8_t **allocArray; // array (list) of MALLOC allocations
+
+ void *released; // list of released objects
+
+ unsigned int count; // highest allocated object
+
+ const unsigned int objSize;
+ const unsigned int objStepLog2;
+};
+
+/**
+ * Composite object cloning policy.
+ *
+ * Encapsulates how sub-objects are to be handled (if at all) when a
+ * composite object is being cloned.
+ */
+template<typename C>
+class ClonePolicy
+{
+protected:
+ C *c; // context (e.g. the Function/Program the clones belong to)
+
+public:
+ ClonePolicy(C *c) : c(c) {}
+
+ C *context() { return c; }
+
+ // Return the clone of obj, creating it (via obj->clone(*this)) on
+ // first request; subclasses decide whether results are memoized.
+ template<typename T> T *get(T *obj)
+ {
+ void *clone = lookup(obj);
+ if (!clone)
+ clone = obj->clone(*this);
+ return reinterpret_cast<T *>(clone);
+ }
+
+ // Record obj -> clone so later get(obj) calls return the same clone.
+ template<typename T> void set(const T *obj, T *clone)
+ {
+ insert(obj, clone);
+ }
+
+protected:
+ virtual void *lookup(void *obj) = 0;
+ virtual void insert(const void *obj, void *clone) = 0;
+};
+
+/**
+ * Shallow non-recursive cloning policy.
+ *
+ * Objects cloned with the "shallow" policy don't clone their
+ * children recursively, instead, the new copy shares its children
+ * with the original object.
+ */
+template<typename C>
+class ShallowClonePolicy : public ClonePolicy<C>
+{
+public:
+ ShallowClonePolicy(C *c) : ClonePolicy<C>(c) {}
+
+protected:
+ // "Lookup" always succeeds with the object itself, so children are
+ // shared rather than cloned.
+ virtual void *lookup(void *obj)
+ {
+ return obj;
+ }
+
+ // Nothing to record: no mapping is maintained.
+ virtual void insert(const void *obj, void *clone)
+ {
+ }
+};
+
+// Convenience wrapper: clone obj without recursing into its children.
+template<typename C, typename T>
+inline T *cloneShallow(C *c, T *obj)
+{
+ ShallowClonePolicy<C> pol(c);
+ return obj->clone(pol);
+}
+
+/**
+ * Recursive cloning policy.
+ *
+ * Objects cloned with the "deep" policy clone their children
+ * recursively, keeping track of what has already been cloned to
+ * avoid making several new copies of the same object.
+ */
+template<typename C>
+class DeepClonePolicy : public ClonePolicy<C>
+{
+public:
+ DeepClonePolicy(C *c) : ClonePolicy<C>(c) {}
+
+private:
+ std::map<const void *, void *> map;
+
+protected:
+ // operator[] default-constructs a NULL entry for unseen keys -- that
+ // NULL is exactly how get() detects "not cloned yet".
+ virtual void *lookup(void *obj)
+ {
+ return map[obj];
+ }
+
+ virtual void insert(const void *obj, void *clone)
+ {
+ map[obj] = clone;
+ }
+};
+
+// Bidirectional map: every (s, t) pair is stored in both directions.
+// `r` (right) is the const S -> T view, `l` (left) the const T -> S view.
+// The reference members make the implicit copy assignment unavailable;
+// copy construction rebinds the views to the new object's own maps.
+template<typename S, typename T>
+struct bimap
+{
+ std::map<S, T> forth;
+ std::map<T, S> back;
+
+public:
+ bimap() : l(back), r(forth) { }
+ bimap(const bimap<S, T> &m)
+ : forth(m.forth), back(m.back), l(back), r(forth) { }
+
+ void insert(const S &s, const T &t)
+ {
+ forth.insert(std::make_pair(s, t));
+ back.insert(std::make_pair(t, s));
+ }
+
+ typedef typename std::map<T, S>::const_iterator l_iterator;
+ const std::map<T, S> &l;
+ typedef typename std::map<S, T>::const_iterator r_iterator;
+ const std::map<S, T> &r;
+};
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_UTIL_H__
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm
new file mode 100644
index 00000000000..f40becc0b88
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm
@@ -0,0 +1,96 @@
+//
+// DIV U32
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE: 22 / 14 * 8 bytes
+//
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+ret
+//
+// DIV S32, like DIV U32 after taking ABS(inputs)
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+//
+set $p2 0x1 lt s32 $r0 0x0
+set $p3 0x1 lt s32 $r1 0x0 xor $p2
+cvt s32 $r0 abs s32 $r0
+cvt s32 $r1 abs s32 $r1
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p3 cvt s32 $r0 neg s32 $r0
+$p2 cvt s32 $r1 neg s32 $r1
+ret
+//
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 9 * 8 bytes
+//
+nop
+ret
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 14 * 8 bytes
+//
+nop
+ret
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h
new file mode 100644
index 00000000000..37905049f50
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h
@@ -0,0 +1,112 @@
+
+static const uint32_t nvc0_builtin_code[] =
+{
+ 0x04009c03,
+ 0x78000000,
+ 0x7c209cdd,
+ 0x0010dd18,
+ 0x08309c03,
+ 0x60000000,
+ 0x05605c18,
+ 0x0810dc2a,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0000dde4,
+ 0x28000000,
+ 0x08001c43,
+ 0x50000000,
+ 0x05609c18,
+ 0x0010430d,
+ 0x0811dc03,
+ 0x1b0e0000,
+ 0x08104103,
+ 0x48000000,
+ 0x04000002,
+ 0x08000000,
+ 0x0811c003,
+ 0x1b0e0000,
+ 0x08104103,
+ 0x48000000,
+ 0x040000ac,
+ 0x90001dff,
+ 0xfc05dc23,
+ 0x188e0000,
+ 0xfc17dc23,
+ 0x18c40000,
+ 0x03301e18,
+ 0x07305e18,
+ 0x04009c03,
+ 0x78000000,
+ 0x7c209cdd,
+ 0x0010dd18,
+ 0x08309c03,
+ 0x60000000,
+ 0x05605c18,
+ 0x0810dc2a,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0810dc03,
+ 0x50000000,
+ 0x0c209c43,
+ 0x20040000,
+ 0x0000dde4,
+ 0x28000000,
+ 0x08001c43,
+ 0x50000000,
+ 0x05609c18,
+ 0x0010430d,
+ 0x0811dc03,
+ 0x1b0e0000,
+ 0x08104103,
+ 0x48000000,
+ 0x04000002,
+ 0x08000000,
+ 0x0811c003,
+ 0x1b0e0000,
+ 0x08104103,
+ 0x48000000,
+ 0x040000ac,
+ 0x01700e18,
+ 0x05704a18,
+ 0x90001dff,
+ 0x00001c08,
+ 0x90001dff,
+ 0x00001c08,
+ 0x90001dff,
+};
+
+static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+ 0x0000,
+ 0x00b0,
+ 0x0180,
+ 0x0188
+};
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm
new file mode 100644
index 00000000000..5adc9ff38a5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm
@@ -0,0 +1,698 @@
+//
+// DIV U32
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE: 22 / 14 * 8 bytes
+//
+sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
+bfind u32 $r2 $r1
+long xor b32 $r2 $r2 0x1f
+long mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+long cvt u32 $r1 neg u32 $r1
+long mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+long cvt u32 $r2 neg u32 $r1
+long add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+long ret
+//
+// DIV S32, like DIV U32 after taking ABS(inputs)
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+//
+set $p2 0x1 lt s32 $r0 0x0
+set $p3 0x1 lt s32 $r1 0x0 xor $p2
+sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
+long cvt s32 $r0 abs s32 $r0
+long cvt s32 $r1 abs s32 $r1
+bfind u32 $r2 $r1
+long xor b32 $r2 $r2 0x1f
+long mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+long cvt u32 $r2 neg u32 $r1
+long add $r1 (mul u32 $r1 u32 $r0) $r3
+sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+long $p0 add b32 $r0 $r0 0x1
+long $p3 cvt s32 $r0 neg s32 $r0
+sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
+$p2 cvt s32 $r1 neg s32 $r1
+long ret
+//
+// SULDP [for each format]
+// $r4d: address
+// $r2: surface info (format)
+// $p0: access predicate
+// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
+//
+// RGBA32
+$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
+long ret
+// RGBA16_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 u16 1 $r1
+cvt rn f32 $r2 u16 0 $r1
+mul f32 $r3 $r3 0x37800074
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt rn f32 $r1 u16 1 $r0
+mul f32 $r2 $r2 0x37800074
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x37800074
+mul f32 $r0 $r0 0x37800074
+long ret
+// RGBA16_SNORM
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 s16 1 $r1
+cvt rn f32 $r2 s16 0 $r1
+mul f32 $r3 $r3 0x38000187
+cvt rn f32 $r1 s16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x38000187
+cvt rn f32 $r0 s16 0 $r0
+mul f32 $r1 $r1 0x38000187
+mul f32 $r0 $r0 0x38000187
+long ret
+// RGBA16_SINT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt s32 $r3 s16 1 $r1
+cvt s32 $r2 s16 0 $r1
+cvt s32 $r1 s16 1 $r0
+cvt s32 $r0 s16 0 $r0
+long ret
+// RGBA16_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt u32 $r3 u16 1 $r1
+cvt u32 $r2 u16 0 $r1
+cvt u32 $r1 u16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u16 0 $r0
+long ret
+// RGBA16_FLOAT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt f32 $r3 f16 $r1 1
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt f32 $r2 f16 $r1 0
+cvt f32 $r1 f16 $r0 1
+cvt f32 $r0 f16 $r0 0
+long ret
+// RG32_FLOAT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x3f800000
+long ret
+// RG32_xINT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x00000001
+long ret
+// RGB10A2_UNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0a0a
+long mov b32 $r3 0x3f800000
+ext u32 $r2 $r0 0x0a14
+long and b32 $r0 $r0 0x3ff
+cvt rn f32 $r2 u16 0 $r2
+cvt rn f32 $r1 u16 0 $r1
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3a802007
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x3a802007
+mul f32 $r0 $r0 0x3a802007
+long ret
+// RGB10A2_UINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0a0a
+long mov b32 $r3 0x00000001
+ext u32 $r2 $r0 0x0a14
+long and b32 $r0 $r0 0x3ff
+long ret
+// RGBA8_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 u8 3 $r0
+cvt rn f32 $r2 u8 2 $r0
+mul f32 $r3 $r3 0x3b808081
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt rn f32 $r1 u8 1 $r0
+mul f32 $r2 $r2 0x3b808081
+cvt rn f32 $r0 u8 0 $r0
+mul f32 $r1 $r1 0x3b808081
+mul f32 $r0 $r0 0x3b808081
+long ret
+// RGBA8_SNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 s8 3 $r0
+cvt rn f32 $r2 s8 2 $r0
+mul f32 $r3 $r3 0x3c010204
+cvt rn f32 $r1 s8 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3c010204
+cvt rn f32 $r0 s8 0 $r0
+mul f32 $r1 $r1 0x3c010204
+mul f32 $r0 $r0 0x3c010204
+long ret
+// RGBA8_SINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt s32 $r3 s8 3 $r0
+cvt s32 $r2 s8 2 $r0
+cvt s32 $r1 s8 1 $r0
+cvt s32 $r0 s8 0 $r0
+long ret
+// RGBA8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt u32 $r3 u8 3 $r0
+cvt u32 $r2 u8 2 $r0
+cvt u32 $r1 u8 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u8 0 $r0
+long ret
+// R5G6B5_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0605
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x3f800000
+ext u32 $r2 $r0 0x050b
+long and b32 $r0 $r0 0x1f
+cvt rn f32 $r2 u8 0 $r2
+cvt rn f32 $r1 u8 0 $r1
+mul f32 $r2 $r2 0x3d042108
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r1 $r1 0x3c820821
+mul f32 $r0 $r0 0x3d042108
+long ret
+// R5G5B5X1_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ext u32 $r1 $r0 0x0505
+ext u32 $r2 $r0 0x050a
+long and b32 $r0 $r0 0x1f
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r2 u8 0 $r2
+cvt rn f32 $r1 u8 0 $r1
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3d042108
+mul f32 $r1 $r1 0x3d042108
+mul f32 $r0 $r0 0x3d042108
+long ret
+// RG16_UNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r1 u16 1 $r0
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x37800074
+mul f32 $r0 $r0 0x37800074
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x3f800000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long ret
+// RG16_SNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r1 s16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mov b32 $r2 0x00000000
+cvt rn f32 $r0 s16 0 $r0
+mul f32 $r1 $r1 0x38000187
+mul f32 $r0 $r0 0x38000187
+long ret
+// RG16_SINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x00000001
+cvt s32 $r1 s16 1 $r0
+mov b32 $r2 0x00000000
+cvt s32 $r0 s16 0 $r0
+long ret
+// RG16_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x00000001
+cvt u32 $r1 u16 1 $r0
+mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u16 0 $r0
+long ret
+// RG16_FLOAT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt f32 $r1 f16 $r0 1
+mov b32 $r2 0x00000000
+cvt f32 $r0 f16 $r0 0
+long ret
+// R32_FLOAT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R32_xINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// RG8_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r1 u8 1 $r0
+mov b32 $r2 0x00000000
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r1 $r1 0x3b808081
+mul f32 $r0 $r0 0x3b808081
+long ret
+// RG8_SNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r1 s8 1 $r0
+long mov b32 $r2 0x00000000
+cvt rn f32 $r0 s8 0 $r0
+mul f32 $r1 $r1 0x3c010204
+mul f32 $r0 $r0 0x3c010204
+long ret
+// RG8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+cvt u32 $r1 u8 1 $r0
+long mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u8 0 $r0
+long ret
+// RG8_SINT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt s32 $r1 s8 1 $r0
+long mov b32 $r2 0x00000000
+cvt s32 $r0 s8 0 $r0
+long ret
+// R16_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r0 u16 0 $r0
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+mul f32 $r0 $r0 0x37800074
+long ret
+// R16_SNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 s16 0 $r0
+long mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r1 0x00000000
+mul f32 $r0 $r0 0x38000187
+long ret
+// R16_SINT
+$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R16_UINT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R16_FLOAT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long mov b32 $r2 0x00000000
+cvt f32 $r0 f16 $r0 0
+mov b32 $r1 0x00000000
+long ret
+// R8_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 u8 0 $r0
+mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r0 $r0 0x3b808081
+mov b32 $r1 0x00000000
+long ret
+// R8_SNORM
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 s8 0 $r0
+mov b32 $r2 0x00000000
+mul f32 $r0 $r0 0x3c010204
+mov b32 $r1 0x00000000
+long ret
+// R8_SINT
+$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long ret
+// R11G11B10_FLOAT TODO
+$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long nop
+long ret
+//
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 9 * 8 bytes
+//
+long nop
+long ret
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 14 * 8 bytes
+//
+long nop
+long ret
+//
+// Trap handler.
+// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
+// Low 32 bytes of l[] memory shouldn't be used if resumeability is required.
+//
+// Trap info:
+// 0x000: mutex
+// 0x004: PC
+// 0x008: trapstat
+// 0x00c: warperr
+// 0x010: tidx
+// 0x014: tidy
+// 0x018: tidz
+// 0x01c: ctaidx
+// 0x020: ctaidy
+// 0x024: ctaidz
+// 0x030: $r0q
+// 0x130: $flags
+// 0x140: s[]
+//
+st b128 wb l[0x00] $r0q
+// check state of the warp and continue if it didn't cause the trap
+long mov b32 $r1 $trapstat
+long mov b32 $r3 $warperr
+mov $r2 $flags mask 0xffff
+and b32 0 $c $r1 $r3
+e $c bra #end_cont
+// spill control flow stack to l[]
+long mov b32 $r3 16
+spill_cfstack:
+preret #end_exit
+sub b32 $r3 $c $r3 0x1
+lg $c bra #spill_cfstack
+// retrieve pointer to trap info
+mov b32 $r0 c0[0x1900]
+mov b32 $r1 c0[0x1904]
+// we only let a single faulting thread store its state
+mov b32 $r3 0x1
+exch b32 $r3 g[$r0d] $r3
+joinat #end_exit
+set $p0 0x1 eq u32 $r3 0x1
+join $p0 nop
+// store $c and $p registers
+st b32 wb g[$r0d+0x130] $r2
+// store $trapstat and $warperr
+long mov b32 $r2 $trapstat
+long mov b32 $r3 $warperr
+st b64 wb g[$r0d+0x8] $r2d
+// store registers
+st b128 wb g[$r0d+0x40] $r4q
+st b128 wb g[$r0d+0x50] $r8q
+st b128 wb g[$r0d+0x60] $r12q
+st b128 wb g[$r0d+0x70] $r16q
+st b128 wb g[$r0d+0x80] $r20q
+st b128 wb g[$r0d+0x90] $r24q
+st b128 wb g[$r0d+0xa0] $r28q
+st b128 wb g[$r0d+0xb0] $r32q
+st b128 wb g[$r0d+0xc0] $r36q
+st b128 wb g[$r0d+0xd0] $r40q
+st b128 wb g[$r0d+0xe0] $r44q
+st b128 wb g[$r0d+0xf0] $r48q
+st b128 wb g[$r0d+0x100] $r52q
+st b128 wb g[$r0d+0x110] $r56q
+st b128 wb g[$r0d+0x120] $r60q
+ld b64 $r2d cs l[0x0]
+st b64 wb g[$r0d+0x30] $r2d
+ld b64 $r2d cs l[0x8]
+st b64 wb g[$r0d+0x38] $r2d
+// store thread id
+long mov b32 $r2 $tidx
+long mov b32 $r3 $tidy
+st b64 wb g[$r0d+0x10] $r2d
+long mov b32 $r2 $tidz
+long mov b32 $r3 $ctaidx
+st b64 wb g[$r0d+0x18] $r2d
+long mov b32 $r2 $ctaidy
+long mov b32 $r3 $ctaidz
+st b64 wb g[$r0d+0x20] $r2d
+// store shared memory (in reverse order so $r0d is base again at the end)
+long mov b32 $r3 $smemsz
+sub b32 $r3 $c $r3 0x4
+s $c bra #shared_done
+add b32 $r0 $c $r0 $r3
+add b32 $r1 $r1 0x0 $c
+shared_loop:
+long ld b32 $r2 s[$r3]
+long st b32 wb g[$r0d+0x140] $r2
+sub b32 $r0 $c $r0 0x4
+sub b32 $r1 $r1 0x0 $c
+sub b32 $r3 $c $r3 0x4
+lg $c bra #shared_loop
+shared_done:
+// search the stack for trap entry to retrieve PC
+mov b32 $r0 c0[0x1908]
+mov b32 $r1 c0[0x190c]
+membar sys
+// invalidate caches so we can read stack entries via g[]
+cctl ivall 0 l[0]
+cctl ivall 0 g[$r0d]
+// get offsets
+mov b32 $r2 $physid
+ext u32 $r3 $r2 0x0814 // MP id
+ext u32 $r2 $r2 0x0608 // warp id
+mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
+mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
+add b32 $r2 $r2 $r3 // MP + warp offset
+add b32 $r0 $c $r0 $r2
+add b32 $r1 $r1 0x0 $c
+search_cstack:
+mov b32 $r3 c0[0x1918] // cstack size
+ld u8 $r2 cv g[$r0d+0x8]
+set $p0 0x1 eq u32 $r2 0xa
+$p0 bra #entry_found
+add b32 $r0 $c $r0 0x10
+add b32 $r1 $r1 0x0 $c
+sub b32 $r3 $c $r3 0x10
+lg $c bra #search_cstack
+bra #end_exit
+entry_found:
+// load PC (may be unaligned and spread out)
+ld b32 $r2 cv g[$r0d]
+mov b32 $r0 c0[0x1900]
+mov b32 $r1 c0[0x1904]
+st b32 wb g[$r0d+0x4] $r2
+join nop
+// invalidate caches and exit
+end_exit:
+cctl ivall 0 g[0]
+bpt pause 0x0
+rtt terminate
+end_cont:
+bpt pause 0x0
+mov $flags $r2 mask 0xffff
+ld b128 $r0q cs l[0x00]
+rtt
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h
new file mode 100644
index 00000000000..53fa12c51fd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h
@@ -0,0 +1,592 @@
+
+// Assembled from target_lib_nve4.asm by envyas -m nvc0 -V nve4 -W.
+
+static const uint64_t nve4_builtin_code[] =
+{
+ 0x2282828042804287ULL,
+ 0x7800000004009c03ULL,
+ 0x380000007c209c82ULL,
+ 0x180000000400dde2ULL,
+ 0x6000000008309c03ULL,
+ 0x1c00000005205d04ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x2282828282828287ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x2042c28280428047ULL,
+ 0x200400000c209c43ULL,
+ 0x280000000000dde4ULL,
+ 0x5000000008001c43ULL,
+ 0x1c00000005209d04ULL,
+ 0x2006000000105c03ULL,
+ 0x1b0e00000811dc03ULL,
+ 0x4800000008104103ULL,
+ 0x220282e20042c287ULL,
+ 0x0800000004000002ULL,
+ 0x1b0e00000811c003ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x9000000000001de7ULL,
+ 0x188e0000fc05dc23ULL,
+ 0x18c40000fc17dc23ULL,
+ 0x2280428042828207ULL,
+ 0x1c00000001201ec4ULL,
+ 0x1c00000005205ec4ULL,
+ 0x7800000004009c03ULL,
+ 0x380000007c209c82ULL,
+ 0x180000000400dde2ULL,
+ 0x6000000008309c03ULL,
+ 0x1c00000005205d04ULL,
+ 0x2282828282828287ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x2282804280428287ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x280000000000dde4ULL,
+ 0x5000000008001c43ULL,
+ 0x1c00000005209d04ULL,
+ 0x2006000000105c03ULL,
+ 0x22028042c28042c7ULL,
+ 0x1b0e00000811dc03ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x1b0e00000811c003ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x1c00000001200f84ULL,
+ 0x22c200428042e047ULL,
+ 0x1c00000005204b84ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004c5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009c5ULL,
+ 0xd4004000084007c5ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084004c5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009c5ULL,
+ 0xd4004000084007c5ULL,
+ 0x1900000004a0dc04ULL,
+ 0x1800000004a09c04ULL,
+ 0x30de0001d030dc02ULL,
+ 0x2000000000000007ULL,
+ 0x1900000000a05c04ULL,
+ 0x30de0001d0209c02ULL,
+ 0x1800000000a01c04ULL,
+ 0x30de0001d0105c02ULL,
+ 0x30de0001d0001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1900000004a0de04ULL,
+ 0x1800000004a09e04ULL,
+ 0x30e000061c30dc02ULL,
+ 0x1900000000a05e04ULL,
+ 0x2000000000000007ULL,
+ 0x30e000061c209c02ULL,
+ 0x1800000000a01e04ULL,
+ 0x30e000061c105c02ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1d00000004a0de84ULL,
+ 0x1c00000004a09e84ULL,
+ 0x1d00000000a05e84ULL,
+ 0x1c00000000a01e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1d00000004a0dc04ULL,
+ 0x1c00000004a09c04ULL,
+ 0x1d00000000a05c04ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1100000004a0dc04ULL,
+ 0x2000000000000007ULL,
+ 0x1000000004a09c04ULL,
+ 0x1100000000a05c04ULL,
+ 0x1000000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084007a5ULL,
+ 0x1800000000009de2ULL,
+ 0x18fe00000000dde2ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084007a5ULL,
+ 0x1800000000009de2ULL,
+ 0x180000000400dde2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x7000c02828005c03ULL,
+ 0x18fe00000000dde2ULL,
+ 0x7000c02850009c03ULL,
+ 0x3800000ffc001c02ULL,
+ 0x1800000008a09c04ULL,
+ 0x1800000004a05c04ULL,
+ 0x2000000000000007ULL,
+ 0x30ea00801c209c02ULL,
+ 0x1800000000a01c04ULL,
+ 0x30ea00801c105c02ULL,
+ 0x30ea00801c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x7000c02828005c03ULL,
+ 0x180000000400dde2ULL,
+ 0x7000c02850009c03ULL,
+ 0x3800000ffc001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x198000000020dc04ULL,
+ 0x1900000000209c04ULL,
+ 0x30ee02020430dc02ULL,
+ 0x2000000000000007ULL,
+ 0x1880000000205c04ULL,
+ 0x30ee020204209c02ULL,
+ 0x1800000000201c04ULL,
+ 0x30ee020204105c02ULL,
+ 0x30ee020204001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x198000000020de04ULL,
+ 0x1900000000209e04ULL,
+ 0x30f004081030dc02ULL,
+ 0x1880000000205e04ULL,
+ 0x2000000000000007ULL,
+ 0x30f0040810209c02ULL,
+ 0x1800000000201e04ULL,
+ 0x30f0040810105c02ULL,
+ 0x30f0040810001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x1d8000000020de84ULL,
+ 0x1d00000000209e84ULL,
+ 0x1c80000000205e84ULL,
+ 0x1c00000000201e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x1d8000000020dc04ULL,
+ 0x1d00000000209c04ULL,
+ 0x1c80000000205c04ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000201c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x7000c01814005c03ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x7000c0142c009c03ULL,
+ 0x380000007c001c02ULL,
+ 0x1800000008209c04ULL,
+ 0x1800000004205c04ULL,
+ 0x30f4108420209c02ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30f2082084105c02ULL,
+ 0x30f4108420001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x2000000000000007ULL,
+ 0x7000c01414005c03ULL,
+ 0x7000c01428009c03ULL,
+ 0x380000007c001c02ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000008209c04ULL,
+ 0x1800000004205c04ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30f4108420209c02ULL,
+ 0x30f4108420105c02ULL,
+ 0x30f4108420001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x1900000000a05c04ULL,
+ 0x1800000000a01c04ULL,
+ 0x30de0001d0105c02ULL,
+ 0x30de0001d0001c02ULL,
+ 0x1800000000009de2ULL,
+ 0x18fe00000000dde2ULL,
+ 0x2000000000000007ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1900000000a05e04ULL,
+ 0x2000000000000007ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000a01e04ULL,
+ 0x30e000061c105c02ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1d00000000a05e84ULL,
+ 0x1800000000009de2ULL,
+ 0x1c00000000a01e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1d00000000a05c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x2000000000000007ULL,
+ 0x1100000000a05c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1000000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1880000000205c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30ee020204105c02ULL,
+ 0x30ee020204001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1880000000205e04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000201e04ULL,
+ 0x30f0040810105c02ULL,
+ 0x30f0040810001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x1c80000000205c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000201c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x2000000000000007ULL,
+ 0x1c80000000205e84ULL,
+ 0x1800000000009de2ULL,
+ 0x1c00000000201e84ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000a01c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x30de0001d0001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000a01e04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1800000000005de2ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400465ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400965ULL,
+ 0xd400400008400765ULL,
+ 0x2000000000000007ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1000000000a01c04ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000201c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x30ee020204001c02ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000201e04ULL,
+ 0x1800000000009de2ULL,
+ 0x30f0040810001c02ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400425ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400925ULL,
+ 0xd400400008400725ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x2000000000000007ULL,
+ 0x9000000000001de7ULL,
+ 0xd40040000840c485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd40040000840c985ULL,
+ 0xd40040000840c785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0xc800000003f01cc5ULL,
+ 0x2c00000100005c04ULL,
+ 0x2c0000010800dc04ULL,
+ 0x3000c3fffff09c04ULL,
+ 0x680100000c1fdc03ULL,
+ 0x4000000a60001c47ULL,
+ 0x180000004000dde2ULL,
+ 0x78000009c0000007ULL,
+ 0x0c0000000430dd02ULL,
+ 0x4003ffffa0001ca7ULL,
+ 0x2800406400001de4ULL,
+ 0x2800406410005de4ULL,
+ 0x180000000400dde2ULL,
+ 0x547e18000000dd05ULL,
+ 0x60000008e0000007ULL,
+ 0x190ec0000431dc03ULL,
+ 0x40000000000001f4ULL,
+ 0x94000004c0009c85ULL,
+ 0x2c00000100009c04ULL,
+ 0x2c0000010800dc04ULL,
+ 0x9400000020009ca5ULL,
+ 0x9400000100011cc5ULL,
+ 0x9400000140021cc5ULL,
+ 0x9400000180031cc5ULL,
+ 0x94000001c0041cc5ULL,
+ 0x9400000200051cc5ULL,
+ 0x9400000240061cc5ULL,
+ 0x9400000280071cc5ULL,
+ 0x94000002c0081cc5ULL,
+ 0x9400000300091cc5ULL,
+ 0x94000003400a1cc5ULL,
+ 0x94000003800b1cc5ULL,
+ 0x94000003c00c1cc5ULL,
+ 0x94000004000d1cc5ULL,
+ 0x94000004400e1cc5ULL,
+ 0x94000004800f1cc5ULL,
+ 0xc000000003f09ea5ULL,
+ 0x94000000c0009ca5ULL,
+ 0xc000000023f09ea5ULL,
+ 0x94000000e0009ca5ULL,
+ 0x2c00000084009c04ULL,
+ 0x2c0000008800dc04ULL,
+ 0x9400000040009ca5ULL,
+ 0x2c0000008c009c04ULL,
+ 0x2c0000009400dc04ULL,
+ 0x9400000060009ca5ULL,
+ 0x2c00000098009c04ULL,
+ 0x2c0000009c00dc04ULL,
+ 0x9400000080009ca5ULL,
+ 0x2c000000c800dc04ULL,
+ 0x0c0000001030dd02ULL,
+ 0x4000000100001ea7ULL,
+ 0x480100000c001c03ULL,
+ 0x0800000000105c42ULL,
+ 0xc100000000309c85ULL,
+ 0x9400000500009c85ULL,
+ 0x0c00000010001d02ULL,
+ 0x0800000000105d42ULL,
+ 0x0c0000001030dd02ULL,
+ 0x4003ffff40001ca7ULL,
+ 0x2800406420001de4ULL,
+ 0x2800406430005de4ULL,
+ 0xe000000000001c45ULL,
+ 0xd000000003ffdcc5ULL,
+ 0x9c000000000fdcc5ULL,
+ 0x2c0000000c009c04ULL,
+ 0x7000c0205020dc03ULL,
+ 0x7000c01820209c03ULL,
+ 0x5000406450209c03ULL,
+ 0x500040644030dc03ULL,
+ 0x480000000c209c03ULL,
+ 0x4801000008001c03ULL,
+ 0x0800000000105c42ULL,
+ 0x280040646000dde4ULL,
+ 0x8400000020009f05ULL,
+ 0x190ec0002821dc03ULL,
+ 0x40000000800001e7ULL,
+ 0x0c00000040001c02ULL,
+ 0x0800000000105c42ULL,
+ 0x0c0000004030dd02ULL,
+ 0x00029dff0ffc5cbfULL,
+ 0x8400000000009f85ULL,
+ 0x2800406400001de4ULL,
+ 0x2800406410005de4ULL,
+ 0x9400000010009c85ULL,
+ 0x4000000000001df4ULL,
+ 0x9800000003ffdcc5ULL,
+ 0xd000000000008007ULL,
+ 0xa000000000004007ULL,
+ 0xd000000000008007ULL,
+ 0x3400c3fffc201c04ULL,
+ 0xc000000003f01ec5ULL,
+ 0xa000000000000007ULL
+};
+
+static const uint16_t nve4_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+ 0x0000,
+ 0x00f0,
+ 0x0f08,
+ 0x0f18,
+};
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
new file mode 100644
index 00000000000..d10b6b07693
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
@@ -0,0 +1,13 @@
+
+static const uint64_t nvf0_builtin_code[] =
+{
+ 0x19000000001c003cULL,
+};
+
+static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+ 0,
+ 0,
+ 0,
+ 0
+};
diff --git a/src/gallium/drivers/nouveau/nouveau_mm.c b/src/gallium/drivers/nouveau/nouveau_mm.c
index 439c0fb2255..87f2f460a9d 100644
--- a/src/gallium/drivers/nouveau/nouveau_mm.c
+++ b/src/gallium/drivers/nouveau/nouveau_mm.c
@@ -296,4 +296,3 @@ nouveau_mm_destroy(struct nouveau_mman *cache)
FREE(cache);
}
-
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index d129a55b387..5b35ee47c12 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -85,7 +85,7 @@ nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
struct nouveau_device *dev = nouveau_screen(pscreen)->device;
struct nouveau_bo *bo = 0;
int ret;
-
+
ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
if (ret) {
debug_printf("%s: ref name 0x%08x failed with %d\n",
@@ -106,7 +106,7 @@ nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
{
whandle->stride = stride;
- if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+ if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
return nouveau_bo_name_get(bo, &whandle->handle) == 0;
} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
whandle->handle = bo->handle;
diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c
index 8e08cab4b2d..7deea881a25 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -27,7 +27,7 @@
#include "nouveau_context.h"
#include "nouveau_video.h"
-#include "nouveau/nouveau_buffer.h"
+#include "nouveau_buffer.h"
#include "util/u_video.h"
#include "util/u_format.h"
#include "util/u_sampler.h"
diff --git a/src/gallium/drivers/nouveau/nv30/nv01_2d.xml.h b/src/gallium/drivers/nouveau/nv30/nv01_2d.xml.h
new file mode 100644
index 00000000000..afae00cd0a2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv01_2d.xml.h
@@ -0,0 +1,1416 @@
+#ifndef _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV01_2D_XML
+#define _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV01_2D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- /home/skeggsb/git/envytools/rnndb/nv_objects.xml ( 794 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/copyright.xml ( 6452 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv_m2mf.xml ( 2696 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv_object.xml ( 12672 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nvchipsets.xml ( 3617 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv_defs.xml ( 4437 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv50_defs.xml ( 5468 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nvc0_m2mf.xml ( 2687 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv01_2d.xml ( 32584 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv04_dvd.xml ( 3000 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv03_3d.xml ( 5209 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv04_3d.xml ( 17759 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv_3ddefs.xml ( 16394 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv10_3d.xml ( 18437 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv20_3d.xml ( 21107 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv30-40_3d.xml ( 31987 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv50_2d.xml ( 11113 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv50_3d.xml ( 65233 bytes, from 2011-11-30 05:49:35)
+- /home/skeggsb/git/envytools/rnndb/nv50_compute.xml ( 14012 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nv84_crypt.xml ( 2071 bytes, from 2011-11-30 05:49:35)
+- /home/skeggsb/git/envytools/rnndb/nv31_mpeg.xml ( 2269 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/nvc0_3d.xml ( 52547 bytes, from 2011-11-30 05:49:35)
+- /home/skeggsb/git/envytools/rnndb/nvc0_compute.xml ( 10865 bytes, from 2011-10-22 08:01:09)
+- /home/skeggsb/git/envytools/rnndb/blob_nvc0_pcopy.xml ( 4516 bytes, from 2011-10-22 08:01:09)
+
+Copyright (C) 2006-2011 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV01_BETA_DMA_NOTIFY 0x00000180
+
+#define NV01_BETA_BETA_1D31 0x00000300
+
+
+#define NV04_BETA4_DMA_NOTIFY 0x00000180
+
+#define NV04_BETA4_BETA_FACTOR 0x00000300
+
+
+#define NV01_CHROMA_DMA_NOTIFY 0x00000180
+
+#define NV01_CHROMA_COLOR_FORMAT 0x00000300
+#define NV01_CHROMA_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV01_CHROMA_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV01_CHROMA_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV01_CHROMA_COLOR 0x00000304
+
+
+#define NV01_PATTERN_DMA_NOTIFY 0x00000180
+
+#define NV01_PATTERN_COLOR_FORMAT 0x00000300
+#define NV01_PATTERN_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV01_PATTERN_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV01_PATTERN_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV01_PATTERN_MONOCHROME_FORMAT 0x00000304
+#define NV01_PATTERN_MONOCHROME_FORMAT_CGA6 0x00000001
+#define NV01_PATTERN_MONOCHROME_FORMAT_LE 0x00000002
+
+#define NV01_PATTERN_MONOCHROME_SHAPE 0x00000308
+#define NV01_PATTERN_MONOCHROME_SHAPE_8X8 0x00000000
+#define NV01_PATTERN_MONOCHROME_SHAPE_64X1 0x00000001
+#define NV01_PATTERN_MONOCHROME_SHAPE_1X64 0x00000002
+
+#define NV04_PATTERN_PATTERN_SELECT 0x0000030c
+#define NV04_PATTERN_PATTERN_SELECT_MONO 0x00000001
+#define NV04_PATTERN_PATTERN_SELECT_COLOR 0x00000002
+
+#define NV01_PATTERN_MONOCHROME_COLOR(i0) (0x00000310 + 0x4*(i0))
+#define NV01_PATTERN_MONOCHROME_COLOR__ESIZE 0x00000004
+#define NV01_PATTERN_MONOCHROME_COLOR__LEN 0x00000002
+
+#define NV01_PATTERN_MONOCHROME_PATTERN(i0) (0x00000318 + 0x4*(i0))
+#define NV01_PATTERN_MONOCHROME_PATTERN__ESIZE 0x00000004
+#define NV01_PATTERN_MONOCHROME_PATTERN__LEN 0x00000002
+
+#define NV04_PATTERN_PATTERN_Y8(i0) (0x00000400 + 0x4*(i0))
+#define NV04_PATTERN_PATTERN_Y8__ESIZE 0x00000004
+#define NV04_PATTERN_PATTERN_Y8__LEN 0x00000010
+#define NV04_PATTERN_PATTERN_Y8_Y0__MASK 0x000000ff
+#define NV04_PATTERN_PATTERN_Y8_Y0__SHIFT 0
+#define NV04_PATTERN_PATTERN_Y8_Y1__MASK 0x0000ff00
+#define NV04_PATTERN_PATTERN_Y8_Y1__SHIFT 8
+#define NV04_PATTERN_PATTERN_Y8_Y2__MASK 0x00ff0000
+#define NV04_PATTERN_PATTERN_Y8_Y2__SHIFT 16
+#define NV04_PATTERN_PATTERN_Y8_Y3__MASK 0xff000000
+#define NV04_PATTERN_PATTERN_Y8_Y3__SHIFT 24
+
+#define NV04_PATTERN_PATTERN_R5G6B5(i0) (0x00000500 + 0x4*(i0))
+#define NV04_PATTERN_PATTERN_R5G6B5__ESIZE 0x00000004
+#define NV04_PATTERN_PATTERN_R5G6B5__LEN 0x00000020
+#define NV04_PATTERN_PATTERN_R5G6B5_B0__MASK 0x0000001f
+#define NV04_PATTERN_PATTERN_R5G6B5_B0__SHIFT 0
+#define NV04_PATTERN_PATTERN_R5G6B5_G0__MASK 0x000007e0
+#define NV04_PATTERN_PATTERN_R5G6B5_G0__SHIFT 5
+#define NV04_PATTERN_PATTERN_R5G6B5_R0__MASK 0x0000f800
+#define NV04_PATTERN_PATTERN_R5G6B5_R0__SHIFT 11
+#define NV04_PATTERN_PATTERN_R5G6B5_B1__MASK 0x001f0000
+#define NV04_PATTERN_PATTERN_R5G6B5_B1__SHIFT 16
+#define NV04_PATTERN_PATTERN_R5G6B5_G1__MASK 0x07e00000
+#define NV04_PATTERN_PATTERN_R5G6B5_G1__SHIFT 21
+#define NV04_PATTERN_PATTERN_R5G6B5_R1__MASK 0xf8000000
+#define NV04_PATTERN_PATTERN_R5G6B5_R1__SHIFT 27
+
+#define NV04_PATTERN_PATTERN_X1R5G5B5(i0) (0x00000600 + 0x4*(i0))
+#define NV04_PATTERN_PATTERN_X1R5G5B5__ESIZE 0x00000004
+#define NV04_PATTERN_PATTERN_X1R5G5B5__LEN 0x00000020
+#define NV04_PATTERN_PATTERN_X1R5G5B5_B0__MASK 0x0000001f
+#define NV04_PATTERN_PATTERN_X1R5G5B5_B0__SHIFT 0
+#define NV04_PATTERN_PATTERN_X1R5G5B5_G0__MASK 0x000003e0
+#define NV04_PATTERN_PATTERN_X1R5G5B5_G0__SHIFT 5
+#define NV04_PATTERN_PATTERN_X1R5G5B5_R0__MASK 0x00007c00
+#define NV04_PATTERN_PATTERN_X1R5G5B5_R0__SHIFT 10
+#define NV04_PATTERN_PATTERN_X1R5G5B5_B1__MASK 0x001f0000
+#define NV04_PATTERN_PATTERN_X1R5G5B5_B1__SHIFT 16
+#define NV04_PATTERN_PATTERN_X1R5G5B5_G1__MASK 0x03e00000
+#define NV04_PATTERN_PATTERN_X1R5G5B5_G1__SHIFT 21
+#define NV04_PATTERN_PATTERN_X1R5G5B5_R1__MASK 0x7c000000
+#define NV04_PATTERN_PATTERN_X1R5G5B5_R1__SHIFT 26
+
+#define NV04_PATTERN_PATTERN_X8R8G8B8(i0) (0x00000700 + 0x4*(i0))
+#define NV04_PATTERN_PATTERN_X8R8G8B8__ESIZE 0x00000004
+#define NV04_PATTERN_PATTERN_X8R8G8B8__LEN 0x00000040
+#define NV04_PATTERN_PATTERN_X8R8G8B8_B__MASK 0x000000ff
+#define NV04_PATTERN_PATTERN_X8R8G8B8_B__SHIFT 0
+#define NV04_PATTERN_PATTERN_X8R8G8B8_G__MASK 0x0000ff00
+#define NV04_PATTERN_PATTERN_X8R8G8B8_G__SHIFT 8
+#define NV04_PATTERN_PATTERN_X8R8G8B8_R__MASK 0x00ff0000
+#define NV04_PATTERN_PATTERN_X8R8G8B8_R__SHIFT 16
+
+
+#define NV01_CLIP_DMA_NOTIFY 0x00000180
+
+#define NV01_CLIP_POINT 0x00000300
+#define NV01_CLIP_POINT_X__MASK 0x0000ffff
+#define NV01_CLIP_POINT_X__SHIFT 0
+#define NV01_CLIP_POINT_Y__MASK 0xffff0000
+#define NV01_CLIP_POINT_Y__SHIFT 16
+
+#define NV01_CLIP_SIZE 0x00000304
+#define NV01_CLIP_SIZE_W__MASK 0x0000ffff
+#define NV01_CLIP_SIZE_W__SHIFT 0
+#define NV01_CLIP_SIZE_H__MASK 0xffff0000
+#define NV01_CLIP_SIZE_H__SHIFT 16
+
+
+#define NV01_ROP_DMA_NOTIFY 0x00000180
+
+#define NV01_ROP_ROP 0x00000300
+
+
+#define NV04_SURFACE_2D_DMA_NOTIFY 0x00000180
+
+#define NV04_SURFACE_2D_DMA_IMAGE_SOURCE 0x00000184
+
+#define NV04_SURFACE_2D_DMA_IMAGE_DESTIN 0x00000188
+
+
+#define NV50_SURFACE_2D_SRC_LINEAR 0x00000200
+
+#define NV50_SURFACE_2D_SRC_TILE_MODE 0x00000204
+
+#define NV50_SURFACE_2D_SRC_WIDTH 0x00000208
+
+#define NV50_SURFACE_2D_SRC_HEIGHT 0x0000020c
+
+#define NV50_SURFACE_2D_UNK0210 0x00000210
+
+#define NV50_SURFACE_2D_UNK0214 0x00000214
+
+#define NV50_SURFACE_2D_DST_LINEAR 0x00000218
+
+#define NV50_SURFACE_2D_DST_TILE_MODE 0x0000021c
+
+#define NV50_SURFACE_2D_DST_WIDTH 0x00000220
+
+#define NV50_SURFACE_2D_DST_HEIGHT 0x00000224
+
+#define NV50_SURFACE_2D_UNK0228 0x00000228
+
+#define NV50_SURFACE_2D_UNK022C 0x0000022c
+
+#define NV50_SURFACE_2D_OFFSET_SOURCE_HIGH 0x00000230
+
+#define NV50_SURFACE_2D_OFFSET_DESTIN_HIGH 0x00000234
+
+#define NV04_SURFACE_2D_FORMAT 0x00000300
+#define NV04_SURFACE_2D_FORMAT_Y8 0x00000001
+#define NV04_SURFACE_2D_FORMAT_X1R5G5B5_Z1R5G5B5 0x00000002
+#define NV04_SURFACE_2D_FORMAT_X1R5G5B5_X1R5G5B5 0x00000003
+#define NV04_SURFACE_2D_FORMAT_R5G6B5 0x00000004
+#define NV04_SURFACE_2D_FORMAT_Y16 0x00000005
+#define NV04_SURFACE_2D_FORMAT_X8R8G8B8_Z8R8G8B8 0x00000006
+#define NV04_SURFACE_2D_FORMAT_X8R8G8B8_X8R8G8B8 0x00000007
+#define NV04_SURFACE_2D_FORMAT_X1A7R8G8B8_Z1A7R8G8B8 0x00000008
+#define NV04_SURFACE_2D_FORMAT_X1A7R8G8B8_X1A7R8G8B8 0x00000009
+#define NV04_SURFACE_2D_FORMAT_A8R8G8B8 0x0000000a
+#define NV04_SURFACE_2D_FORMAT_Y32 0x0000000b
+
+#define NV04_SURFACE_2D_PITCH 0x00000304
+#define NV04_SURFACE_2D_PITCH_SOURCE__MASK 0x0000ffff
+#define NV04_SURFACE_2D_PITCH_SOURCE__SHIFT 0
+#define NV04_SURFACE_2D_PITCH_DESTIN__MASK 0xffff0000
+#define NV04_SURFACE_2D_PITCH_DESTIN__SHIFT 16
+
+#define NV04_SURFACE_2D_OFFSET_SOURCE 0x00000308
+
+#define NV04_SURFACE_2D_OFFSET_DESTIN 0x0000030c
+
+
+#define NV04_SURFACE_SWZ_DMA_NOTIFY 0x00000180
+
+#define NV04_SURFACE_SWZ_DMA_IMAGE 0x00000184
+
+#define NV04_SURFACE_SWZ_FORMAT 0x00000300
+#define NV04_SURFACE_SWZ_FORMAT_COLOR__MASK 0x000000ff
+#define NV04_SURFACE_SWZ_FORMAT_COLOR__SHIFT 0
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_Y8 0x00000001
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5 0x00000002
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X1R5G5B5_X1R5G5B5 0x00000003
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_R5G6B5 0x00000004
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_Y16 0x00000005
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8 0x00000006
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X8R8G8B8_X8R8G8B8 0x00000007
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8 0x00000008
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8 0x00000009
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_A8R8G8B8 0x0000000a
+#define NV04_SURFACE_SWZ_FORMAT_COLOR_Y32 0x0000000b
+#define NV04_SURFACE_SWZ_FORMAT_BASE_SIZE_U__MASK 0x00ff0000
+#define NV04_SURFACE_SWZ_FORMAT_BASE_SIZE_U__SHIFT 16
+#define NV04_SURFACE_SWZ_FORMAT_BASE_SIZE_V__MASK 0xff000000
+#define NV04_SURFACE_SWZ_FORMAT_BASE_SIZE_V__SHIFT 24
+
+#define NV04_SURFACE_SWZ_OFFSET 0x00000304
+
+
+#define NV01_POINT_PATCH 0x0000010c
+
+#define NV01_POINT_DMA_NOTIFY 0x00000180
+
+#define NV01_POINT_CLIP 0x00000184
+
+#define NV01_POINT_PATTERN 0x00000188
+
+#define NV01_POINT_ROP 0x0000018c
+
+#define NV01_POINT_BETA 0x00000190
+
+#define NV01_POINT_SURFACE_DST 0x00000194
+
+#define NV01_POINT_OPERATION 0x000002fc
+#define NV01_POINT_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_POINT_OPERATION_ROP_AND 0x00000001
+#define NV01_POINT_OPERATION_BLEND_AND 0x00000002
+#define NV01_POINT_OPERATION_SRCCOPY 0x00000003
+#define NV01_POINT_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_POINT_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_POINT_COLOR_FORMAT 0x00000300
+#define NV01_POINT_COLOR_FORMAT_X16A8Y8 0x00000001
+#define NV01_POINT_COLOR_FORMAT_X24Y8 0x00000002
+#define NV01_POINT_COLOR_FORMAT_X16A1R5G5B5 0x00000003
+#define NV01_POINT_COLOR_FORMAT_X17R5G5B5 0x00000004
+#define NV01_POINT_COLOR_FORMAT_A8R8G8B8 0x00000005
+#define NV01_POINT_COLOR_FORMAT_X8R8G8B8 0x00000006
+#define NV01_POINT_COLOR_FORMAT_A16Y16 0x00000007
+#define NV01_POINT_COLOR_FORMAT_X16Y16 0x00000008
+
+#define NV01_POINT_COLOR 0x00000304
+
+#define NV01_POINT_POINT(i0) (0x00000400 + 0x4*(i0))
+#define NV01_POINT_POINT__ESIZE 0x00000004
+#define NV01_POINT_POINT__LEN 0x00000020
+#define NV01_POINT_POINT_X__MASK 0x0000ffff
+#define NV01_POINT_POINT_X__SHIFT 0
+#define NV01_POINT_POINT_Y__MASK 0xffff0000
+#define NV01_POINT_POINT_Y__SHIFT 16
+
+#define NV01_POINT_POINT32_X(i0) (0x00000480 + 0x8*(i0))
+#define NV01_POINT_POINT32_X__ESIZE 0x00000008
+#define NV01_POINT_POINT32_X__LEN 0x00000010
+
+#define NV01_POINT_POINT32_Y(i0) (0x00000484 + 0x8*(i0))
+#define NV01_POINT_POINT32_Y__ESIZE 0x00000008
+#define NV01_POINT_POINT32_Y__LEN 0x00000010
+
+#define NV01_POINT_CPOINT_COLOR(i0) (0x00000500 + 0x8*(i0))
+#define NV01_POINT_CPOINT_COLOR__ESIZE 0x00000008
+#define NV01_POINT_CPOINT_COLOR__LEN 0x00000010
+
+#define NV01_POINT_CPOINT_POINT(i0) (0x00000504 + 0x8*(i0))
+#define NV01_POINT_CPOINT_POINT__ESIZE 0x00000008
+#define NV01_POINT_CPOINT_POINT__LEN 0x00000010
+#define NV01_POINT_CPOINT_POINT_X__MASK 0x0000ffff
+#define NV01_POINT_CPOINT_POINT_X__SHIFT 0
+#define NV01_POINT_CPOINT_POINT_Y__MASK 0xffff0000
+#define NV01_POINT_CPOINT_POINT_Y__SHIFT 16
+
+
+#define NV01_LINE_PATCH 0x0000010c
+
+#define NV01_LINE_DMA_NOTIFY 0x00000180
+
+#define NV01_LINE_CLIP 0x00000184
+
+#define NV01_LINE_PATTERN 0x00000188
+
+#define NV04_LIN_PATTERN 0x00000188
+
+#define NV01_LINE_ROP 0x0000018c
+
+#define NV01_LINE_BETA 0x00000190
+
+#define NV01_LINE_SURFACE_DST 0x00000194
+
+
+#define NV04_LIN_BETA4 0x00000194
+
+#define NV04_LIN_SURFACE 0x00000198
+
+#define NV01_LINE_OPERATION 0x000002fc
+#define NV01_LINE_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_LINE_OPERATION_ROP_AND 0x00000001
+#define NV01_LINE_OPERATION_BLEND_AND 0x00000002
+#define NV01_LINE_OPERATION_SRCCOPY 0x00000003
+#define NV01_LINE_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_LINE_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_LINE_COLOR_FORMAT 0x00000300
+#define NV01_LINE_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV01_LINE_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV01_LINE_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV01_LINE_COLOR 0x00000304
+
+#define NV01_LINE_LINE_POINT0(i0) (0x00000400 + 0x8*(i0))
+#define NV01_LINE_LINE_POINT0__ESIZE 0x00000008
+#define NV01_LINE_LINE_POINT0__LEN 0x00000010
+#define NV01_LINE_LINE_POINT0_X__MASK 0x0000ffff
+#define NV01_LINE_LINE_POINT0_X__SHIFT 0
+#define NV01_LINE_LINE_POINT0_Y__MASK 0xffff0000
+#define NV01_LINE_LINE_POINT0_Y__SHIFT 16
+
+#define NV01_LINE_LINE_POINT1(i0) (0x00000404 + 0x8*(i0))
+#define NV01_LINE_LINE_POINT1__ESIZE 0x00000008
+#define NV01_LINE_LINE_POINT1__LEN 0x00000010
+#define NV01_LINE_LINE_POINT1_X__MASK 0x0000ffff
+#define NV01_LINE_LINE_POINT1_X__SHIFT 0
+#define NV01_LINE_LINE_POINT1_Y__MASK 0xffff0000
+#define NV01_LINE_LINE_POINT1_Y__SHIFT 16
+
+#define NV01_LINE_LINE32_POINT0_X(i0) (0x00000480 + 0x10*(i0))
+#define NV01_LINE_LINE32_POINT0_X__ESIZE 0x00000010
+#define NV01_LINE_LINE32_POINT0_X__LEN 0x00000008
+
+#define NV01_LINE_LINE32_POINT0_Y(i0) (0x00000484 + 0x10*(i0))
+#define NV01_LINE_LINE32_POINT0_Y__ESIZE 0x00000010
+#define NV01_LINE_LINE32_POINT0_Y__LEN 0x00000008
+
+#define NV01_LINE_LINE32_POINT1_X(i0) (0x00000488 + 0x10*(i0))
+#define NV01_LINE_LINE32_POINT1_X__ESIZE 0x00000010
+#define NV01_LINE_LINE32_POINT1_X__LEN 0x00000008
+
+#define NV01_LINE_LINE32_POINT1_Y(i0) (0x0000048c + 0x10*(i0))
+#define NV01_LINE_LINE32_POINT1_Y__ESIZE 0x00000010
+#define NV01_LINE_LINE32_POINT1_Y__LEN 0x00000008
+
+#define NV01_LINE_POLYLINE(i0) (0x00000500 + 0x4*(i0))
+#define NV01_LINE_POLYLINE__ESIZE 0x00000004
+#define NV01_LINE_POLYLINE__LEN 0x00000020
+#define NV01_LINE_POLYLINE_X__MASK 0x0000ffff
+#define NV01_LINE_POLYLINE_X__SHIFT 0
+#define NV01_LINE_POLYLINE_Y__MASK 0xffff0000
+#define NV01_LINE_POLYLINE_Y__SHIFT 16
+
+#define NV01_LINE_POLYLINE32_POINT_X(i0) (0x00000580 + 0x8*(i0))
+#define NV01_LINE_POLYLINE32_POINT_X__ESIZE 0x00000008
+#define NV01_LINE_POLYLINE32_POINT_X__LEN 0x00000010
+
+#define NV01_LINE_POLYLINE32_POINT_Y(i0) (0x00000584 + 0x8*(i0))
+#define NV01_LINE_POLYLINE32_POINT_Y__ESIZE 0x00000008
+#define NV01_LINE_POLYLINE32_POINT_Y__LEN 0x00000010
+
+#define NV01_LINE_CPOLYLINE_COLOR(i0) (0x00000600 + 0x8*(i0))
+#define NV01_LINE_CPOLYLINE_COLOR__ESIZE 0x00000008
+#define NV01_LINE_CPOLYLINE_COLOR__LEN 0x00000010
+
+#define NV01_LINE_CPOLYLINE_POINT(i0) (0x00000604 + 0x8*(i0))
+#define NV01_LINE_CPOLYLINE_POINT__ESIZE 0x00000008
+#define NV01_LINE_CPOLYLINE_POINT__LEN 0x00000010
+#define NV01_LINE_CPOLYLINE_POINT_X__MASK 0x0000ffff
+#define NV01_LINE_CPOLYLINE_POINT_X__SHIFT 0
+#define NV01_LINE_CPOLYLINE_POINT_Y__MASK 0xffff0000
+#define NV01_LINE_CPOLYLINE_POINT_Y__SHIFT 16
+
+
+#define NV01_TRI_PATCH 0x0000010c
+
+#define NV01_TRI_DMA_NOTIFY 0x00000180
+
+#define NV01_TRI_CLIP 0x00000184
+
+#define NV01_TRI_PATTERN 0x00000188
+
+#define NV04_TRI_PATTERN 0x00000188
+
+#define NV01_TRI_ROP 0x0000018c
+
+#define NV01_TRI_BETA 0x00000190
+
+#define NV01_TRI_SURFACE_DST 0x00000194
+
+
+#define NV04_TRI_BETA4 0x00000194
+
+#define NV04_TRI_SURFACE 0x00000198
+
+#define NV01_TRI_OPERATION 0x000002fc
+#define NV01_TRI_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_TRI_OPERATION_ROP_AND 0x00000001
+#define NV01_TRI_OPERATION_BLEND_AND 0x00000002
+#define NV01_TRI_OPERATION_SRCCOPY 0x00000003
+#define NV01_TRI_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_TRI_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_TRI_COLOR_FORMAT 0x00000300
+#define NV01_TRI_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV01_TRI_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV01_TRI_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV01_TRI_COLOR 0x00000304
+
+#define NV01_TRI_TRIANGLE_POINT0 0x00000310
+#define NV01_TRI_TRIANGLE_POINT0_X__MASK 0x0000ffff
+#define NV01_TRI_TRIANGLE_POINT0_X__SHIFT 0
+#define NV01_TRI_TRIANGLE_POINT0_Y__MASK 0xffff0000
+#define NV01_TRI_TRIANGLE_POINT0_Y__SHIFT 16
+
+#define NV01_TRI_TRIANGLE_POINT1 0x00000314
+#define NV01_TRI_TRIANGLE_POINT1_X__MASK 0x0000ffff
+#define NV01_TRI_TRIANGLE_POINT1_X__SHIFT 0
+#define NV01_TRI_TRIANGLE_POINT1_Y__MASK 0xffff0000
+#define NV01_TRI_TRIANGLE_POINT1_Y__SHIFT 16
+
+#define NV01_TRI_TRIANGLE_POINT2 0x00000318
+#define NV01_TRI_TRIANGLE_POINT2_X__MASK 0x0000ffff
+#define NV01_TRI_TRIANGLE_POINT2_X__SHIFT 0
+#define NV01_TRI_TRIANGLE_POINT2_Y__MASK 0xffff0000
+#define NV01_TRI_TRIANGLE_POINT2_Y__SHIFT 16
+
+#define NV01_TRI_TRIANGLE32_POINT0_X 0x00000320
+
+#define NV01_TRI_TRIANGLE32_POINT0_Y 0x00000324
+
+#define NV01_TRI_TRIANGLE32_POINT1_X 0x00000328
+
+#define NV01_TRI_TRIANGLE32_POINT1_Y 0x0000032c
+
+#define NV01_TRI_TRIANGLE32_POINT2_X 0x00000330
+
+#define NV01_TRI_TRIANGLE32_POINT2_Y 0x00000334
+
+#define NV01_TRI_TRIMESH(i0) (0x00000400 + 0x4*(i0))
+#define NV01_TRI_TRIMESH__ESIZE 0x00000004
+#define NV01_TRI_TRIMESH__LEN 0x00000020
+#define NV01_TRI_TRIMESH_X__MASK 0x0000ffff
+#define NV01_TRI_TRIMESH_X__SHIFT 0
+#define NV01_TRI_TRIMESH_Y__MASK 0xffff0000
+#define NV01_TRI_TRIMESH_Y__SHIFT 16
+
+#define NV01_TRI_TRIMESH32_POINT_X(i0) (0x00000480 + 0x8*(i0))
+#define NV01_TRI_TRIMESH32_POINT_X__ESIZE 0x00000008
+#define NV01_TRI_TRIMESH32_POINT_X__LEN 0x00000010
+
+#define NV01_TRI_TRIMESH32_POINT_Y(i0) (0x00000484 + 0x8*(i0))
+#define NV01_TRI_TRIMESH32_POINT_Y__ESIZE 0x00000008
+#define NV01_TRI_TRIMESH32_POINT_Y__LEN 0x00000010
+
+#define NV01_TRI_CTRIANGLE_COLOR(i0) (0x00000500 + 0x10*(i0))
+#define NV01_TRI_CTRIANGLE_COLOR__ESIZE 0x00000010
+#define NV01_TRI_CTRIANGLE_COLOR__LEN 0x00000008
+
+#define NV01_TRI_CTRIANGLE_POINT0(i0) (0x00000504 + 0x10*(i0))
+#define NV01_TRI_CTRIANGLE_POINT0__ESIZE 0x00000010
+#define NV01_TRI_CTRIANGLE_POINT0__LEN 0x00000008
+#define NV01_TRI_CTRIANGLE_POINT0_X__MASK 0x0000ffff
+#define NV01_TRI_CTRIANGLE_POINT0_X__SHIFT 0
+#define NV01_TRI_CTRIANGLE_POINT0_Y__MASK 0xffff0000
+#define NV01_TRI_CTRIANGLE_POINT0_Y__SHIFT 16
+
+#define NV01_TRI_CTRIANGLE_POINT1(i0) (0x00000508 + 0x10*(i0))
+#define NV01_TRI_CTRIANGLE_POINT1__ESIZE 0x00000010
+#define NV01_TRI_CTRIANGLE_POINT1__LEN 0x00000008
+#define NV01_TRI_CTRIANGLE_POINT1_X__MASK 0x0000ffff
+#define NV01_TRI_CTRIANGLE_POINT1_X__SHIFT 0
+#define NV01_TRI_CTRIANGLE_POINT1_Y__MASK 0xffff0000
+#define NV01_TRI_CTRIANGLE_POINT1_Y__SHIFT 16
+
+#define NV01_TRI_CTRIANGLE_POINT2(i0) (0x0000050c + 0x10*(i0))
+#define NV01_TRI_CTRIANGLE_POINT2__ESIZE 0x00000010
+#define NV01_TRI_CTRIANGLE_POINT2__LEN 0x00000008
+#define NV01_TRI_CTRIANGLE_POINT2_X__MASK 0x0000ffff
+#define NV01_TRI_CTRIANGLE_POINT2_X__SHIFT 0
+#define NV01_TRI_CTRIANGLE_POINT2_Y__MASK 0xffff0000
+#define NV01_TRI_CTRIANGLE_POINT2_Y__SHIFT 16
+
+#define NV01_TRI_CTRIMESH_COLOR(i0) (0x00000580 + 0x8*(i0))
+#define NV01_TRI_CTRIMESH_COLOR__ESIZE 0x00000008
+#define NV01_TRI_CTRIMESH_COLOR__LEN 0x00000010
+
+#define NV01_TRI_CTRIMESH_POINT(i0) (0x00000584 + 0x8*(i0))
+#define NV01_TRI_CTRIMESH_POINT__ESIZE 0x00000008
+#define NV01_TRI_CTRIMESH_POINT__LEN 0x00000010
+#define NV01_TRI_CTRIMESH_POINT_X__MASK 0x0000ffff
+#define NV01_TRI_CTRIMESH_POINT_X__SHIFT 0
+#define NV01_TRI_CTRIMESH_POINT_Y__MASK 0xffff0000
+#define NV01_TRI_CTRIMESH_POINT_Y__SHIFT 16
+
+
+#define NV01_RECT_PATCH 0x0000010c
+
+#define NV01_RECT_DMA_NOTIFY 0x00000180
+
+#define NV01_RECT_CLIP 0x00000184
+
+#define NV01_RECT_PATTERN 0x00000188
+
+#define NV04_RECT_PATTERN 0x00000188
+
+#define NV01_RECT_ROP 0x0000018c
+
+#define NV01_RECT_BETA 0x00000190
+
+#define NV01_RECT_SURFACE_DST 0x00000194
+
+
+#define NV04_RECT_BETA4 0x00000194
+
+#define NV04_RECT_SURFACE 0x00000198
+
+#define NV01_RECT_OPERATION 0x000002fc
+#define NV01_RECT_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_RECT_OPERATION_ROP_AND 0x00000001
+#define NV01_RECT_OPERATION_BLEND_AND 0x00000002
+#define NV01_RECT_OPERATION_SRCCOPY 0x00000003
+#define NV01_RECT_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_RECT_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_RECT_COLOR_FORMAT 0x00000300
+#define NV01_RECT_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV01_RECT_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV01_RECT_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV01_RECT_COLOR 0x00000304
+
+#define NV01_RECT_RECTANGLE_POINT(i0) (0x00000400 + 0x8*(i0))
+#define NV01_RECT_RECTANGLE_POINT__ESIZE 0x00000008
+#define NV01_RECT_RECTANGLE_POINT__LEN 0x00000010
+#define NV01_RECT_RECTANGLE_POINT_X__MASK 0x0000ffff
+#define NV01_RECT_RECTANGLE_POINT_X__SHIFT 0
+#define NV01_RECT_RECTANGLE_POINT_Y__MASK 0xffff0000
+#define NV01_RECT_RECTANGLE_POINT_Y__SHIFT 16
+
+#define NV01_RECT_RECTANGLE_SIZE(i0) (0x00000404 + 0x8*(i0))
+#define NV01_RECT_RECTANGLE_SIZE__ESIZE 0x00000008
+#define NV01_RECT_RECTANGLE_SIZE__LEN 0x00000010
+#define NV01_RECT_RECTANGLE_SIZE_W__MASK 0x0000ffff
+#define NV01_RECT_RECTANGLE_SIZE_W__SHIFT 0
+#define NV01_RECT_RECTANGLE_SIZE_H__MASK 0xffff0000
+#define NV01_RECT_RECTANGLE_SIZE_H__SHIFT 16
+
+
+#define NV01_BLIT_PATCH 0x0000010c
+
+
+#define NV15_BLIT_WAIT_FOR_IDLE 0x00000108
+
+#define NV15_BLIT_FLIP_SET_READ 0x00000120
+
+#define NV15_BLIT_FLIP_SET_WRITE 0x00000124
+
+#define NV15_BLIT_FLIP_MAX 0x00000128
+
+#define NV15_BLIT_FLIP_INCR_WRITE 0x0000012c
+
+#define NV15_BLIT_FLIP_WAIT 0x00000130
+
+#define NV15_BLIT_FLIP_CRTC_INCR_READ 0x00000134
+
+#define NV01_BLIT_DMA_NOTIFY 0x00000180
+
+#define NV01_BLIT_COLOR_KEY 0x00000184
+
+#define NV04_BLIT_COLOR_KEY 0x00000184
+
+#define NV01_BLIT_CLIP 0x00000188
+
+#define NV01_BLIT_PATTERN 0x0000018c
+
+#define NV04_BLIT_PATTERN 0x0000018c
+
+#define NV01_BLIT_ROP 0x00000190
+
+#define NV01_BLIT_BETA 0x00000194
+
+
+#define NV01_BLIT_SURFACE_SRC 0x00000198
+
+#define NV01_BLIT_SURFACE_DST 0x0000019c
+
+
+#define NV04_BLIT_BETA4 0x00000198
+
+#define NV04_BLIT_SURFACES 0x0000019c
+
+#define NV01_BLIT_OPERATION 0x000002fc
+#define NV01_BLIT_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_BLIT_OPERATION_ROP_AND 0x00000001
+#define NV01_BLIT_OPERATION_BLEND_AND 0x00000002
+#define NV01_BLIT_OPERATION_SRCCOPY 0x00000003
+#define NV01_BLIT_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_BLIT_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_BLIT_POINT_IN 0x00000300
+#define NV01_BLIT_POINT_IN_X__MASK 0x0000ffff
+#define NV01_BLIT_POINT_IN_X__SHIFT 0
+#define NV01_BLIT_POINT_IN_Y__MASK 0xffff0000
+#define NV01_BLIT_POINT_IN_Y__SHIFT 16
+
+#define NV01_BLIT_POINT_OUT 0x00000304
+#define NV01_BLIT_POINT_OUT_X__MASK 0x0000ffff
+#define NV01_BLIT_POINT_OUT_X__SHIFT 0
+#define NV01_BLIT_POINT_OUT_Y__MASK 0xffff0000
+#define NV01_BLIT_POINT_OUT_Y__SHIFT 16
+
+#define NV01_BLIT_SIZE 0x00000308
+#define NV01_BLIT_SIZE_W__MASK 0x0000ffff
+#define NV01_BLIT_SIZE_W__SHIFT 0
+#define NV01_BLIT_SIZE_H__MASK 0xffff0000
+#define NV01_BLIT_SIZE_H__SHIFT 16
+
+
+#define NV04_INDEX_PATCH 0x0000010c
+
+#define NV04_INDEX_DMA_NOTIFY 0x00000180
+
+#define NV04_INDEX_DMA_LUT 0x00000184
+
+#define NV04_INDEX_COLOR_KEY 0x00000188
+
+#define NV04_INDEX_CLIP 0x0000018c
+
+#define NV04_INDEX_PATTERN 0x00000190
+
+#define NV04_INDEX_ROP 0x00000194
+
+#define NV04_INDEX_BETA 0x00000198
+
+#define NV04_INDEX_BETA4 0x0000019c
+
+#define NV04_INDEX_SURFACE 0x000001a0
+
+#define NV05_INDEX_SURFACE 0x000001a0
+
+#define NV05_INDEX_COLOR_CONVERSION 0x000003e0
+
+#define NV04_INDEX_OPERATION 0x000003e4
+
+#define NV04_INDEX_COLOR_FORMAT 0x000003e8
+
+#define NV04_INDEX_INDEX_FORMAT 0x000003ec
+
+#define NV04_INDEX_LUT_OFFSET 0x000003f0
+
+#define NV04_INDEX_POINT 0x000003f4
+
+#define NV04_INDEX_SIZE_OUT 0x000003f8
+
+#define NV04_INDEX_SIZE_IN 0x000003fc
+
+#define NV04_INDEX_COLOR(i0) (0x00000400 + 0x4*(i0))
+#define NV04_INDEX_COLOR__ESIZE 0x00000004
+#define NV04_INDEX_COLOR__LEN 0x00000700
+
+
+#define NV10_IFC_WAIT_FOR_IDLE 0x00000108
+
+#define NV01_IFC_PATCH 0x0000010c
+
+#define NV01_IFC_DMA_NOTIFY 0x00000180
+
+#define NV01_IFC_COLOR_KEY 0x00000184
+
+#define NV04_IFC_COLOR_KEY 0x00000184
+
+#define NV01_IFC_CLIP 0x00000188
+
+#define NV01_IFC_PATTERN 0x0000018c
+
+#define NV04_IFC_PATTERN 0x0000018c
+
+#define NV01_IFC_ROP 0x00000190
+
+#define NV01_IFC_BETA 0x00000194
+
+
+#define NV01_IFC_SURFACE_DST 0x00000198
+
+
+#define NV04_IFC_BETA4 0x00000198
+
+#define NV04_IFC_SURFACE 0x0000019c
+
+#define NV05_IFC_COLOR_CONVERSION 0x000002f8
+
+#define NV01_IFC_OPERATION 0x000002fc
+#define NV01_IFC_OPERATION_SRCCOPY_AND 0x00000000
+#define NV01_IFC_OPERATION_ROP_AND 0x00000001
+#define NV01_IFC_OPERATION_BLEND_AND 0x00000002
+#define NV01_IFC_OPERATION_SRCCOPY 0x00000003
+#define NV01_IFC_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV01_IFC_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV01_IFC_COLOR_FORMAT 0x00000300
+#define NV01_IFC_COLOR_FORMAT_R5G6G5 0x00000001
+#define NV01_IFC_COLOR_FORMAT_A1R5G5B5 0x00000002
+#define NV01_IFC_COLOR_FORMAT_X1R5G5B5 0x00000003
+#define NV01_IFC_COLOR_FORMAT_A8R8G8B8 0x00000004
+#define NV01_IFC_COLOR_FORMAT_X8R8G8B8 0x00000005
+
+#define NV01_IFC_POINT 0x00000304
+#define NV01_IFC_POINT_X__MASK 0x0000ffff
+#define NV01_IFC_POINT_X__SHIFT 0
+#define NV01_IFC_POINT_Y__MASK 0xffff0000
+#define NV01_IFC_POINT_Y__SHIFT 16
+
+#define NV01_IFC_SIZE_OUT 0x00000308
+#define NV01_IFC_SIZE_OUT_W__MASK 0x0000ffff
+#define NV01_IFC_SIZE_OUT_W__SHIFT 0
+#define NV01_IFC_SIZE_OUT_H__MASK 0xffff0000
+#define NV01_IFC_SIZE_OUT_H__SHIFT 16
+
+#define NV01_IFC_SIZE_IN 0x0000030c
+#define NV01_IFC_SIZE_IN_W__MASK 0x0000ffff
+#define NV01_IFC_SIZE_IN_W__SHIFT 0
+#define NV01_IFC_SIZE_IN_H__MASK 0xffff0000
+#define NV01_IFC_SIZE_IN_H__SHIFT 16
+
+#define NV01_IFC_COLOR(i0) (0x00000400 + 0x4*(i0))
+#define NV01_IFC_COLOR__ESIZE 0x00000004
+#define NV01_IFC_COLOR__LEN 0x00000020
+
+#define NV04_IFC_COLOR(i0) (0x00000400 + 0x4*(i0))
+#define NV04_IFC_COLOR__ESIZE 0x00000004
+#define NV04_IFC_COLOR__LEN 0x00000700
+
+
+#define NV03_SIFC_PATCH 0x0000010c
+
+#define NV03_SIFC_DMA_NOTIFY 0x00000180
+
+#define NV03_SIFC_COLOR_KEY 0x00000184
+
+#define NV04_SIFC_COLOR_KEY 0x00000184
+
+#define NV03_SIFC_PATTERN 0x00000188
+
+#define NV04_SIFC_PATTERN 0x00000188
+
+#define NV03_SIFC_ROP 0x0000018c
+
+#define NV03_SIFC_BETA 0x00000190
+
+
+#define NV03_SIFC_SURFACE_DST 0x00000194
+
+
+#define NV04_SIFC_BETA4 0x00000194
+
+#define NV04_SIFC_SURFACE 0x00000198
+
+#define NV05_SIFC_COLOR_CONVERSION 0x000002f8
+
+#define NV03_SIFC_OPERATION 0x000002fc
+
+#define NV03_SIFC_COLOR_FORMAT 0x00000300
+
+#define NV03_SIFC_SIZE_IN 0x00000304
+#define NV03_SIFC_SIZE_IN_W__MASK 0x0000ffff
+#define NV03_SIFC_SIZE_IN_W__SHIFT 0
+#define NV03_SIFC_SIZE_IN_H__MASK 0xffff0000
+#define NV03_SIFC_SIZE_IN_H__SHIFT 16
+
+#define NV03_SIFC_DX_DU 0x00000308
+
+#define NV03_SIFC_DY_DV 0x0000030c
+
+#define NV03_SIFC_CLIP_POINT 0x00000310
+#define NV03_SIFC_CLIP_POINT_X__MASK 0x0000ffff
+#define NV03_SIFC_CLIP_POINT_X__SHIFT 0
+#define NV03_SIFC_CLIP_POINT_Y__MASK 0xffff0000
+#define NV03_SIFC_CLIP_POINT_Y__SHIFT 16
+
+#define NV03_SIFC_CLIP_SIZE 0x00000314
+#define NV03_SIFC_CLIP_SIZE_W__MASK 0x0000ffff
+#define NV03_SIFC_CLIP_SIZE_W__SHIFT 0
+#define NV03_SIFC_CLIP_SIZE_H__MASK 0xffff0000
+#define NV03_SIFC_CLIP_SIZE_H__SHIFT 16
+
+#define NV03_SIFC_POINT12D4 0x00000318
+#define NV03_SIFC_POINT12D4_X__MASK 0x0000ffff
+#define NV03_SIFC_POINT12D4_X__SHIFT 0
+#define NV03_SIFC_POINT12D4_Y__MASK 0xffff0000
+#define NV03_SIFC_POINT12D4_Y__SHIFT 16
+
+#define NV03_SIFC_COLOR(i0) (0x00000400 + 0x4*(i0))
+#define NV03_SIFC_COLOR__ESIZE 0x00000004
+#define NV03_SIFC_COLOR__LEN 0x00000700
+
+
+#define NV10_SIFM_WAIT_FOR_IDLE 0x00000108
+
+#define NV03_SIFM_DMA_NOTIFY 0x00000180
+
+#define NV03_SIFM_DMA_IMAGE 0x00000184
+
+#define NV03_SIFM_PATTERN 0x00000188
+
+#define NV04_SIFM_PATTERN 0x00000188
+
+#define NV03_SIFM_ROP 0x0000018c
+
+#define NV03_SIFM_BETA 0x00000190
+
+
+#define NV03_SIFM_SURFACE_DST 0x00000194
+
+
+#define NV04_SIFM_BETA4 0x00000194
+
+#define NV04_SIFM_SURFACE 0x00000198
+
+#define NV05_SIFM_SURFACE 0x00000198
+
+#define NV05_SIFM_COLOR_CONVERSION 0x000002fc
+#define NV05_SIFM_COLOR_CONVERSION_DITHER 0x00000000
+#define NV05_SIFM_COLOR_CONVERSION_TRUNCATE 0x00000001
+#define NV05_SIFM_COLOR_CONVERSION_SUBTR_TRUNCATE 0x00000002
+
+#define NV03_SIFM_COLOR_FORMAT 0x00000300
+#define NV03_SIFM_COLOR_FORMAT_A1R5G5B5 0x00000001
+#define NV03_SIFM_COLOR_FORMAT_X1R5G5B5 0x00000002
+#define NV03_SIFM_COLOR_FORMAT_A8R8G8B8 0x00000003
+#define NV03_SIFM_COLOR_FORMAT_X8R8G8B8 0x00000004
+#define NV03_SIFM_COLOR_FORMAT_V8YB8U8YA8 0x00000005
+#define NV03_SIFM_COLOR_FORMAT_YB8V8YA8U8 0x00000006
+#define NV03_SIFM_COLOR_FORMAT_R5G6B5 0x00000007
+#define NV03_SIFM_COLOR_FORMAT_Y8 0x00000008
+#define NV03_SIFM_COLOR_FORMAT_AY8 0x00000009
+
+#define NV03_SIFM_OPERATION 0x00000304
+#define NV03_SIFM_OPERATION_SRCCOPY_AND 0x00000000
+#define NV03_SIFM_OPERATION_ROP_AND 0x00000001
+#define NV03_SIFM_OPERATION_BLEND_AND 0x00000002
+#define NV03_SIFM_OPERATION_SRCCOPY 0x00000003
+#define NV03_SIFM_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV03_SIFM_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV03_SIFM_CLIP_POINT 0x00000308
+#define NV03_SIFM_CLIP_POINT_X__MASK 0x0000ffff
+#define NV03_SIFM_CLIP_POINT_X__SHIFT 0
+#define NV03_SIFM_CLIP_POINT_Y__MASK 0xffff0000
+#define NV03_SIFM_CLIP_POINT_Y__SHIFT 16
+
+#define NV03_SIFM_CLIP_SIZE 0x0000030c
+#define NV03_SIFM_CLIP_SIZE_W__MASK 0x0000ffff
+#define NV03_SIFM_CLIP_SIZE_W__SHIFT 0
+#define NV03_SIFM_CLIP_SIZE_H__MASK 0xffff0000
+#define NV03_SIFM_CLIP_SIZE_H__SHIFT 16
+
+#define NV03_SIFM_OUT_POINT 0x00000310
+#define NV03_SIFM_OUT_POINT_X__MASK 0x0000ffff
+#define NV03_SIFM_OUT_POINT_X__SHIFT 0
+#define NV03_SIFM_OUT_POINT_Y__MASK 0xffff0000
+#define NV03_SIFM_OUT_POINT_Y__SHIFT 16
+
+#define NV03_SIFM_OUT_SIZE 0x00000314
+#define NV03_SIFM_OUT_SIZE_W__MASK 0x0000ffff
+#define NV03_SIFM_OUT_SIZE_W__SHIFT 0
+#define NV03_SIFM_OUT_SIZE_H__MASK 0xffff0000
+#define NV03_SIFM_OUT_SIZE_H__SHIFT 16
+
+#define NV03_SIFM_DU_DX 0x00000318
+
+#define NV03_SIFM_DV_DY 0x0000031c
+
+#define NV03_SIFM_SIZE 0x00000400
+#define NV03_SIFM_SIZE_W__MASK 0x0000ffff
+#define NV03_SIFM_SIZE_W__SHIFT 0
+#define NV03_SIFM_SIZE_H__MASK 0xffff0000
+#define NV03_SIFM_SIZE_H__SHIFT 16
+
+#define NV03_SIFM_FORMAT 0x00000404
+#define NV03_SIFM_FORMAT_PITCH__MASK 0x0000ffff
+#define NV03_SIFM_FORMAT_PITCH__SHIFT 0
+#define NV03_SIFM_FORMAT_ORIGIN__MASK 0x00ff0000
+#define NV03_SIFM_FORMAT_ORIGIN__SHIFT 16
+#define NV03_SIFM_FORMAT_ORIGIN_CENTER 0x00010000
+#define NV03_SIFM_FORMAT_ORIGIN_CORNER 0x00020000
+#define NV03_SIFM_FORMAT_FILTER__MASK 0xff000000
+#define NV03_SIFM_FORMAT_FILTER__SHIFT 24
+#define NV03_SIFM_FORMAT_FILTER_POINT_SAMPLE 0x00000000
+#define NV03_SIFM_FORMAT_FILTER_BILINEAR 0x01000000
+
+#define NV03_SIFM_OFFSET 0x00000408
+
+#define NV03_SIFM_POINT 0x0000040c
+#define NV03_SIFM_POINT_U__MASK 0x0000ffff
+#define NV03_SIFM_POINT_U__SHIFT 0
+#define NV03_SIFM_POINT_V__MASK 0xffff0000
+#define NV03_SIFM_POINT_V__SHIFT 16
+
+
+#define NV50_SIFM_OFFSET_HIGH 0x00000410
+
+#define NV50_SIFM_SRC_LINEAR 0x00000414
+
+#define NV50_SIFM_SRC_TILE_MODE 0x00000418
+
+
+#define NV03_GDI_DMA_NOTIFY 0x00000180
+
+#define NV03_GDI_PATTERN 0x00000184
+
+#define NV03_GDI_ROP 0x00000188
+
+#define NV03_GDI_BETA 0x0000019c
+
+#define NV03_GDI_SURFACE_DST 0x00000190
+
+#define NV03_GDI_OPERATION 0x000002fc
+
+#define NV03_GDI_COLOR_FORMAT 0x00000300
+
+#define NV03_GDI_MONOCHROME_FORMAT 0x00000304
+
+#define NV03_GDI_COLOR1_A 0x000003fc
+
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT(i0) (0x00000400 + 0x8*(i0))
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT__ESIZE 0x00000008
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT__LEN 0x00000040
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT_Y__MASK 0x0000ffff
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT_Y__SHIFT 0
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT_X__MASK 0xffff0000
+#define NV03_GDI_UNCLIPPED_RECTANGLE_POINT_X__SHIFT 16
+
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE(i0) (0x00000404 + 0x8*(i0))
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE__ESIZE 0x00000008
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE__LEN 0x00000040
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE_H__MASK 0x0000ffff
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE_H__SHIFT 0
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE_W__MASK 0xffff0000
+#define NV03_GDI_UNCLIPPED_RECTANGLE_SIZE_W__SHIFT 16
+
+#define NV03_GDI_CLIP_POINT0_B 0x000007f4
+#define NV03_GDI_CLIP_POINT0_B_L__MASK 0x0000ffff
+#define NV03_GDI_CLIP_POINT0_B_L__SHIFT 0
+#define NV03_GDI_CLIP_POINT0_B_T__MASK 0xffff0000
+#define NV03_GDI_CLIP_POINT0_B_T__SHIFT 16
+
+#define NV03_GDI_CLIP_POINT1_B 0x000007f8
+#define NV03_GDI_CLIP_POINT1_B_R__MASK 0x0000ffff
+#define NV03_GDI_CLIP_POINT1_B_R__SHIFT 0
+#define NV03_GDI_CLIP_POINT1_B_B__MASK 0xffff0000
+#define NV03_GDI_CLIP_POINT1_B_B__SHIFT 16
+
+#define NV03_GDI_COLOR1_B 0x000007fc
+
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0(i0) (0x00000800 + 0x8*(i0))
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0__ESIZE 0x00000008
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0__LEN 0x00000040
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0_L__MASK 0x0000ffff
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0_L__SHIFT 0
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0_T__MASK 0xffff0000
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_0_T__SHIFT 16
+
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1(i0) (0x00000804 + 0x8*(i0))
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1__ESIZE 0x00000008
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1__LEN 0x00000040
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1_R__MASK 0x0000ffff
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1_R__SHIFT 0
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1_B__MASK 0xffff0000
+#define NV03_GDI_CLIPPED_RECTANGLE_POINT_1_B__SHIFT 16
+
+#define NV03_GDI_CLIP_C_POINT0 0x00000bec
+#define NV03_GDI_CLIP_C_POINT0_L__MASK 0x0000ffff
+#define NV03_GDI_CLIP_C_POINT0_L__SHIFT 0
+#define NV03_GDI_CLIP_C_POINT0_T__MASK 0xffff0000
+#define NV03_GDI_CLIP_C_POINT0_T__SHIFT 16
+
+#define NV03_GDI_CLIP_C_POINT1 0x00000bf0
+#define NV03_GDI_CLIP_C_POINT1_R__MASK 0x0000ffff
+#define NV03_GDI_CLIP_C_POINT1_R__SHIFT 0
+#define NV03_GDI_CLIP_C_POINT1_B__MASK 0xffff0000
+#define NV03_GDI_CLIP_C_POINT1_B__SHIFT 16
+
+#define NV03_GDI_COLOR1_C 0x00000bf4
+
+#define NV03_GDI_SIZE_C 0x00000bf8
+#define NV03_GDI_SIZE_C_W__MASK 0x0000ffff
+#define NV03_GDI_SIZE_C_W__SHIFT 0
+#define NV03_GDI_SIZE_C_H__MASK 0xffff0000
+#define NV03_GDI_SIZE_C_H__SHIFT 16
+
+#define NV03_GDI_POINT_C 0x00000bfc
+#define NV03_GDI_POINT_C_X__MASK 0x0000ffff
+#define NV03_GDI_POINT_C_X__SHIFT 0
+#define NV03_GDI_POINT_C_Y__MASK 0xffff0000
+#define NV03_GDI_POINT_C_Y__SHIFT 16
+
+#define NV03_GDI_MONOCHROME_COLOR1_C(i0) (0x00000c00 + 0x4*(i0))
+#define NV03_GDI_MONOCHROME_COLOR1_C__ESIZE 0x00000004
+#define NV03_GDI_MONOCHROME_COLOR1_C__LEN 0x00000080
+
+#define NV03_GDI_CLIP_D_POINT0 0x00000fe8
+#define NV03_GDI_CLIP_D_POINT0_L__MASK 0x0000ffff
+#define NV03_GDI_CLIP_D_POINT0_L__SHIFT 0
+#define NV03_GDI_CLIP_D_POINT0_T__MASK 0xffff0000
+#define NV03_GDI_CLIP_D_POINT0_T__SHIFT 16
+
+#define NV03_GDI_CLIP_D_POINT1 0x00000fec
+#define NV03_GDI_CLIP_D_POINT1_R__MASK 0x0000ffff
+#define NV03_GDI_CLIP_D_POINT1_R__SHIFT 0
+#define NV03_GDI_CLIP_D_POINT1_B__MASK 0xffff0000
+#define NV03_GDI_CLIP_D_POINT1_B__SHIFT 16
+
+#define NV03_GDI_COLOR1_D 0x00000ff0
+
+#define NV03_GDI_SIZE_IN_D 0x00000ff4
+#define NV03_GDI_SIZE_IN_D_W__MASK 0x0000ffff
+#define NV03_GDI_SIZE_IN_D_W__SHIFT 0
+#define NV03_GDI_SIZE_IN_D_H__MASK 0xffff0000
+#define NV03_GDI_SIZE_IN_D_H__SHIFT 16
+
+#define NV03_GDI_SIZE_OUT_D 0x00000ff8
+#define NV03_GDI_SIZE_OUT_D_W__MASK 0x0000ffff
+#define NV03_GDI_SIZE_OUT_D_W__SHIFT 0
+#define NV03_GDI_SIZE_OUT_D_H__MASK 0xffff0000
+#define NV03_GDI_SIZE_OUT_D_H__SHIFT 16
+
+#define NV03_GDI_POINT_D 0x00000ffc
+#define NV03_GDI_POINT_D_X__MASK 0x0000ffff
+#define NV03_GDI_POINT_D_X__SHIFT 0
+#define NV03_GDI_POINT_D_Y__MASK 0xffff0000
+#define NV03_GDI_POINT_D_Y__SHIFT 16
+
+#define NV03_GDI_MONOCHROME_COLOR1_D(i0) (0x00001000 + 0x4*(i0))
+#define NV03_GDI_MONOCHROME_COLOR1_D__ESIZE 0x00000004
+#define NV03_GDI_MONOCHROME_COLOR1_D__LEN 0x00000080
+
+#define NV03_GDI_CLIP_E_POINT0 0x000013e4
+#define NV03_GDI_CLIP_E_POINT0_L__MASK 0x0000ffff
+#define NV03_GDI_CLIP_E_POINT0_L__SHIFT 0
+#define NV03_GDI_CLIP_E_POINT0_T__MASK 0xffff0000
+#define NV03_GDI_CLIP_E_POINT0_T__SHIFT 16
+
+#define NV03_GDI_CLIP_E_POINT1 0x000013e8
+#define NV03_GDI_CLIP_E_POINT1_R__MASK 0x0000ffff
+#define NV03_GDI_CLIP_E_POINT1_R__SHIFT 0
+#define NV03_GDI_CLIP_E_POINT1_B__MASK 0xffff0000
+#define NV03_GDI_CLIP_E_POINT1_B__SHIFT 16
+
+#define NV03_GDI_COLOR0_E 0x000013ec
+
+#define NV03_GDI_COLOR1_E 0x000013f0
+
+#define NV03_GDI_SIZE_IN_E 0x000013f4
+#define NV03_GDI_SIZE_IN_E_W__MASK 0x0000ffff
+#define NV03_GDI_SIZE_IN_E_W__SHIFT 0
+#define NV03_GDI_SIZE_IN_E_H__MASK 0xffff0000
+#define NV03_GDI_SIZE_IN_E_H__SHIFT 16
+
+#define NV03_GDI_SIZE_OUT_E 0x000013f8
+#define NV03_GDI_SIZE_OUT_E_W__MASK 0x0000ffff
+#define NV03_GDI_SIZE_OUT_E_W__SHIFT 0
+#define NV03_GDI_SIZE_OUT_E_H__MASK 0xffff0000
+#define NV03_GDI_SIZE_OUT_E_H__SHIFT 16
+
+#define NV03_GDI_POINT_E 0x000013fc
+#define NV03_GDI_POINT_E_X__MASK 0x0000ffff
+#define NV03_GDI_POINT_E_X__SHIFT 0
+#define NV03_GDI_POINT_E_Y__MASK 0xffff0000
+#define NV03_GDI_POINT_E_Y__SHIFT 16
+
+#define NV03_GDI_MONOCHROME_COLOR01_E(i0) (0x00001400 + 0x4*(i0))
+#define NV03_GDI_MONOCHROME_COLOR01_E__ESIZE 0x00000004
+#define NV03_GDI_MONOCHROME_COLOR01_E__LEN 0x00000080
+
+
+#define NV04_GDI_PATCH 0x0000010c
+
+#define NV04_GDI_DMA_NOTIFY 0x00000180
+
+#define NV04_GDI_DMA_FONTS 0x00000184
+
+#define NV04_GDI_PATTERN 0x00000188
+
+#define NV04_GDI_ROP 0x0000018c
+
+#define NV04_GDI_BETA 0x00000190
+
+#define NV04_GDI_BETA4 0x00000194
+
+#define NV04_GDI_SURFACE 0x00000198
+
+#define NV04_GDI_OPERATION 0x000002fc
+#define NV04_GDI_OPERATION_SRCCOPY_AND 0x00000000
+#define NV04_GDI_OPERATION_ROP_AND 0x00000001
+#define NV04_GDI_OPERATION_BLEND_AND 0x00000002
+#define NV04_GDI_OPERATION_SRCCOPY 0x00000003
+#define NV04_GDI_OPERATION_SRCCOPY_PREMULT 0x00000004
+#define NV04_GDI_OPERATION_BLEND_PREMULT 0x00000005
+
+#define NV04_GDI_COLOR_FORMAT 0x00000300
+#define NV04_GDI_COLOR_FORMAT_A16R5G6B5 0x00000001
+#define NV04_GDI_COLOR_FORMAT_X16A1R5G5B5 0x00000002
+#define NV04_GDI_COLOR_FORMAT_A8R8G8B8 0x00000003
+
+#define NV04_GDI_MONOCHROME_FORMAT 0x00000304
+#define NV04_GDI_MONOCHROME_FORMAT_CGA6 0x00000001
+#define NV04_GDI_MONOCHROME_FORMAT_LE 0x00000002
+
+#define NV04_GDI_COLOR1_A 0x000003fc
+
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT(i0) (0x00000400 + 0x8*(i0))
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT__ESIZE 0x00000008
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT__LEN 0x00000020
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT_Y__MASK 0x0000ffff
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT_Y__SHIFT 0
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT_X__MASK 0xffff0000
+#define NV04_GDI_UNCLIPPED_RECTANGLE_POINT_X__SHIFT 16
+
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE(i0) (0x00000404 + 0x8*(i0))
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE__ESIZE 0x00000008
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE__LEN 0x00000020
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE_H__MASK 0x0000ffff
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE_H__SHIFT 0
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE_W__MASK 0xffff0000
+#define NV04_GDI_UNCLIPPED_RECTANGLE_SIZE_W__SHIFT 16
+
+#define NV04_GDI_CLIP_B_POINT0 0x000005f4
+#define NV04_GDI_CLIP_B_POINT0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIP_B_POINT0_L__SHIFT 0
+#define NV04_GDI_CLIP_B_POINT0_T__MASK 0xffff0000
+#define NV04_GDI_CLIP_B_POINT0_T__SHIFT 16
+
+#define NV04_GDI_CLIP_B_POINT1 0x000005f8
+#define NV04_GDI_CLIP_B_POINT1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIP_B_POINT1_R__SHIFT 0
+#define NV04_GDI_CLIP_B_POINT1_B__MASK 0xffff0000
+#define NV04_GDI_CLIP_B_POINT1_B__SHIFT 16
+
+#define NV04_GDI_COLOR1_B 0x000005fc
+
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0(i0) (0x00000600 + 0x8*(i0))
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0__ESIZE 0x00000008
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0__LEN 0x00000020
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0_L__SHIFT 0
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0_T__MASK 0xffff0000
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_0_T__SHIFT 16
+
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1(i0) (0x00000604 + 0x8*(i0))
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1__ESIZE 0x00000008
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1__LEN 0x00000020
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1_R__SHIFT 0
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1_B__MASK 0xffff0000
+#define NV04_GDI_CLIPPED_RECTANGLE_POINT_1_B__SHIFT 16
+
+#define NV04_GDI_CLIP_C_POINT0 0x000007ec
+#define NV04_GDI_CLIP_C_POINT0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIP_C_POINT0_L__SHIFT 0
+#define NV04_GDI_CLIP_C_POINT0_T__MASK 0xffff0000
+#define NV04_GDI_CLIP_C_POINT0_T__SHIFT 16
+
+#define NV04_GDI_CLIP_C_POINT1 0x000007f0
+#define NV04_GDI_CLIP_C_POINT1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIP_C_POINT1_R__SHIFT 0
+#define NV04_GDI_CLIP_C_POINT1_B__MASK 0xffff0000
+#define NV04_GDI_CLIP_C_POINT1_B__SHIFT 16
+
+#define NV04_GDI_COLOR1_C 0x000007f4
+
+#define NV04_GDI_SIZE_C 0x000007f8
+#define NV04_GDI_SIZE_C_W__MASK 0x0000ffff
+#define NV04_GDI_SIZE_C_W__SHIFT 0
+#define NV04_GDI_SIZE_C_H__MASK 0xffff0000
+#define NV04_GDI_SIZE_C_H__SHIFT 16
+
+#define NV04_GDI_POINT_C 0x000007fc
+#define NV04_GDI_POINT_C_X__MASK 0x0000ffff
+#define NV04_GDI_POINT_C_X__SHIFT 0
+#define NV04_GDI_POINT_C_Y__MASK 0xffff0000
+#define NV04_GDI_POINT_C_Y__SHIFT 16
+
+#define NV04_GDI_MONOCHROME_COLOR1_C(i0) (0x00000800 + 0x4*(i0))
+#define NV04_GDI_MONOCHROME_COLOR1_C__ESIZE 0x00000004
+#define NV04_GDI_MONOCHROME_COLOR1_C__LEN 0x00000080
+
+#define NV04_GDI_CLIP_E_POINT0 0x00000be4
+#define NV04_GDI_CLIP_E_POINT0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIP_E_POINT0_L__SHIFT 0
+#define NV04_GDI_CLIP_E_POINT0_T__MASK 0xffff0000
+#define NV04_GDI_CLIP_E_POINT0_T__SHIFT 16
+
+#define NV04_GDI_CLIP_E_POINT1 0x00000be8
+#define NV04_GDI_CLIP_E_POINT1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIP_E_POINT1_R__SHIFT 0
+#define NV04_GDI_CLIP_E_POINT1_B__MASK 0xffff0000
+#define NV04_GDI_CLIP_E_POINT1_B__SHIFT 16
+
+#define NV04_GDI_COLOR0_E 0x00000bec
+
+#define NV04_GDI_COLOR1_E 0x00000bf0
+
+#define NV04_GDI_SIZE_IN_E 0x00000bf4
+#define NV04_GDI_SIZE_IN_E_W__MASK 0x0000ffff
+#define NV04_GDI_SIZE_IN_E_W__SHIFT 0
+#define NV04_GDI_SIZE_IN_E_H__MASK 0xffff0000
+#define NV04_GDI_SIZE_IN_E_H__SHIFT 16
+
+#define NV04_GDI_SIZE_OUT_E 0x00000bf8
+#define NV04_GDI_SIZE_OUT_E_W__MASK 0x0000ffff
+#define NV04_GDI_SIZE_OUT_E_W__SHIFT 0
+#define NV04_GDI_SIZE_OUT_E_H__MASK 0xffff0000
+#define NV04_GDI_SIZE_OUT_E_H__SHIFT 16
+
+#define NV04_GDI_POINT_E 0x00000bfc
+#define NV04_GDI_POINT_E_X__MASK 0x0000ffff
+#define NV04_GDI_POINT_E_X__SHIFT 0
+#define NV04_GDI_POINT_E_Y__MASK 0xffff0000
+#define NV04_GDI_POINT_E_Y__SHIFT 16
+
+#define NV04_GDI_MONOCHROME_COLOR01_E(i0) (0x00000c00 + 0x4*(i0))
+#define NV04_GDI_MONOCHROME_COLOR01_E__ESIZE 0x00000004
+#define NV04_GDI_MONOCHROME_COLOR01_E__LEN 0x00000080
+
+#define NV04_GDI_FONT_F 0x00000ff0
+#define NV04_GDI_FONT_F_OFFSET__MASK 0x0fffffff
+#define NV04_GDI_FONT_F_OFFSET__SHIFT 0
+#define NV04_GDI_FONT_F_PITCH__MASK 0xf0000000
+#define NV04_GDI_FONT_F_PITCH__SHIFT 28
+
+#define NV04_GDI_CLIP_F_POINT0 0x00000ff4
+#define NV04_GDI_CLIP_F_POINT0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIP_F_POINT0_L__SHIFT 0
+#define NV04_GDI_CLIP_F_POINT0_T__MASK 0xffff0000
+#define NV04_GDI_CLIP_F_POINT0_T__SHIFT 16
+
+#define NV04_GDI_CLIP_F_POINT1 0x00000ff8
+#define NV04_GDI_CLIP_F_POINT1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIP_F_POINT1_R__SHIFT 0
+#define NV04_GDI_CLIP_F_POINT1_B__MASK 0xffff0000
+#define NV04_GDI_CLIP_F_POINT1_B__SHIFT 16
+
+#define NV04_GDI_COLOR1_F 0x00000ffc
+
+#define NV04_GDI_CHARACTER_COLOR1_F(i0) (0x00001000 + 0x4*(i0))
+#define NV04_GDI_CHARACTER_COLOR1_F__ESIZE 0x00000004
+#define NV04_GDI_CHARACTER_COLOR1_F__LEN 0x00000100
+#define NV04_GDI_CHARACTER_COLOR1_F_INDEX__MASK 0x000000ff
+#define NV04_GDI_CHARACTER_COLOR1_F_INDEX__SHIFT 0
+#define NV04_GDI_CHARACTER_COLOR1_F_X__MASK 0x000fff00
+#define NV04_GDI_CHARACTER_COLOR1_F_X__SHIFT 8
+#define NV04_GDI_CHARACTER_COLOR1_F_Y__MASK 0xfff00000
+#define NV04_GDI_CHARACTER_COLOR1_F_Y__SHIFT 20
+
+#define NV04_GDI_FONT_G 0x000017f0
+#define NV04_GDI_FONT_G_OFFSET__MASK 0x0fffffff
+#define NV04_GDI_FONT_G_OFFSET__SHIFT 0
+#define NV04_GDI_FONT_G_PITCH__MASK 0xf0000000
+#define NV04_GDI_FONT_G_PITCH__SHIFT 28
+
+#define NV04_GDI_CLIP_G_POINT0 0x000017f4
+#define NV04_GDI_CLIP_G_POINT0_L__MASK 0x0000ffff
+#define NV04_GDI_CLIP_G_POINT0_L__SHIFT 0
+#define NV04_GDI_CLIP_G_POINT0_T__MASK 0xffff0000
+#define NV04_GDI_CLIP_G_POINT0_T__SHIFT 16
+
+#define NV04_GDI_CLIP_G_POINT1 0x000017f8
+#define NV04_GDI_CLIP_G_POINT1_R__MASK 0x0000ffff
+#define NV04_GDI_CLIP_G_POINT1_R__SHIFT 0
+#define NV04_GDI_CLIP_G_POINT1_B__MASK 0xffff0000
+#define NV04_GDI_CLIP_G_POINT1_B__SHIFT 16
+
+#define NV04_GDI_COLOR1_G 0x000017fc
+
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT(i0) (0x00001800 + 0x8*(i0))
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT__ESIZE 0x00000008
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT__LEN 0x00000100
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT_X__MASK 0x0000ffff
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT_X__SHIFT 0
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT_Y__MASK 0xffff0000
+#define NV04_GDI_CHARACTER_COLOR1_G_POINT_Y__SHIFT 16
+
+#define NV04_GDI_CHARACTER_COLOR1_G_INDEX(i0) (0x00001804 + 0x8*(i0))
+#define NV04_GDI_CHARACTER_COLOR1_G_INDEX__ESIZE 0x00000008
+#define NV04_GDI_CHARACTER_COLOR1_G_INDEX__LEN 0x00000100
+
+
+#define NV10_TEXUPLOAD_WAIT_FOR_IDLE 0x00000108
+
+#define NV10_TEXUPLOAD_DMA_NOTIFY 0x00000180
+
+#define NV10_TEXUPLOAD_SURFACE 0x00000184
+
+#define NV10_TEXUPLOAD_COLOR_FORMAT 0x00000300
+
+#define NV10_TEXUPLOAD_POINT 0x00000304
+#define NV10_TEXUPLOAD_POINT_X__MASK 0x0000ffff
+#define NV10_TEXUPLOAD_POINT_X__SHIFT 0
+#define NV10_TEXUPLOAD_POINT_Y__MASK 0xffff0000
+#define NV10_TEXUPLOAD_POINT_Y__SHIFT 16
+
+#define NV10_TEXUPLOAD_SIZE 0x00000308
+#define NV10_TEXUPLOAD_SIZE_W__MASK 0x0000ffff
+#define NV10_TEXUPLOAD_SIZE_W__SHIFT 0
+#define NV10_TEXUPLOAD_SIZE_H__MASK 0xffff0000
+#define NV10_TEXUPLOAD_SIZE_H__SHIFT 16
+
+#define NV10_TEXUPLOAD_CLIP_HORIZONTAL 0x0000030c
+#define NV10_TEXUPLOAD_CLIP_HORIZONTAL_X__MASK 0x0000ffff
+#define NV10_TEXUPLOAD_CLIP_HORIZONTAL_X__SHIFT 0
+#define NV10_TEXUPLOAD_CLIP_HORIZONTAL_W__MASK 0xffff0000
+#define NV10_TEXUPLOAD_CLIP_HORIZONTAL_W__SHIFT 16
+
+#define NV10_TEXUPLOAD_CLIP_VERTICAL 0x00000310
+#define NV10_TEXUPLOAD_CLIP_VERTICAL_Y__MASK 0x0000ffff
+#define NV10_TEXUPLOAD_CLIP_VERTICAL_Y__SHIFT 0
+#define NV10_TEXUPLOAD_CLIP_VERTICAL_H__MASK 0xffff0000
+#define NV10_TEXUPLOAD_CLIP_VERTICAL_H__SHIFT 16
+
+#define NV10_TEXUPLOAD_COLOR(i0) (0x00000400 + 0x4*(i0))
+#define NV10_TEXUPLOAD_COLOR__ESIZE 0x00000004
+#define NV10_TEXUPLOAD_COLOR__LEN 0x00000700
+
+
+#endif /* _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV01_2D_XML */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
new file mode 100644
index 00000000000..447f4b3b7ae
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30-40_3d.xml.h
@@ -0,0 +1,2045 @@
+#ifndef _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV30_40_3D_XML
+#define _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV30_40_3D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- /home/skeggsb/git/envytools/rnndb/nv30-40_3d.xml ( 32450 bytes, from 2012-02-10 02:41:48)
+- /home/skeggsb/git/envytools/rnndb/copyright.xml ( 6452 bytes, from 2011-06-30 00:27:55)
+- /home/skeggsb/git/envytools/rnndb/nv_3ddefs.xml ( 16394 bytes, from 2011-06-30 00:27:55)
+- /home/skeggsb/git/envytools/rnndb/nv_defs.xml ( 4437 bytes, from 2011-06-30 00:27:55)
+- /home/skeggsb/git/envytools/rnndb/nv_object.xml ( 12672 bytes, from 2011-07-13 22:28:24)
+- /home/skeggsb/git/envytools/rnndb/nvchipsets.xml ( 3617 bytes, from 2011-07-17 23:19:55)
+- /home/skeggsb/git/envytools/rnndb/nv50_defs.xml ( 5468 bytes, from 2011-07-08 05:22:51)
+
+Copyright (C) 2006-2012 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV30_3D_FLIP_SET_READ 0x00000120
+
+#define NV30_3D_FLIP_SET_WRITE 0x00000124
+
+#define NV30_3D_FLIP_MAX 0x00000128
+
+#define NV30_3D_FLIP_INCR_WRITE 0x0000012c
+
+#define NV30_3D_FLIP_WAIT 0x00000130
+
+#define NV30_3D_DMA_NOTIFY 0x00000180
+
+#define NV30_3D_DMA_TEXTURE0 0x00000184
+
+#define NV30_3D_DMA_TEXTURE1 0x00000188
+
+#define NV30_3D_DMA_COLOR1 0x0000018c
+
+#define NV30_3D_DMA_UNK190 0x00000190
+
+#define NV30_3D_DMA_COLOR0 0x00000194
+
+#define NV30_3D_DMA_ZETA 0x00000198
+
+#define NV30_3D_DMA_VTXBUF0 0x0000019c
+
+#define NV30_3D_DMA_VTXBUF1 0x000001a0
+
+#define NV30_3D_DMA_FENCE 0x000001a4
+
+#define NV30_3D_DMA_QUERY 0x000001a8
+
+#define NV30_3D_DMA_UNK1AC 0x000001ac
+
+#define NV30_3D_DMA_UNK1B0 0x000001b0
+
+#define NV40_3D_DMA_COLOR2 0x000001b4
+
+#define NV40_3D_DMA_COLOR3 0x000001b8
+
+#define NV30_3D_RT_HORIZ 0x00000200
+#define NV30_3D_RT_HORIZ_X__MASK 0x0000ffff
+#define NV30_3D_RT_HORIZ_X__SHIFT 0
+#define NV30_3D_RT_HORIZ_W__MASK 0xffff0000
+#define NV30_3D_RT_HORIZ_W__SHIFT 16
+
+#define NV30_3D_RT_VERT 0x00000204
+#define NV30_3D_RT_VERT_Y__MASK 0x0000ffff
+#define NV30_3D_RT_VERT_Y__SHIFT 0
+#define NV30_3D_RT_VERT_H__MASK 0xffff0000
+#define NV30_3D_RT_VERT_H__SHIFT 16
+
+#define NV30_3D_RT_FORMAT 0x00000208
+#define NV30_3D_RT_FORMAT_COLOR__MASK 0x0000001f
+#define NV30_3D_RT_FORMAT_COLOR__SHIFT 0
+#define NV30_3D_RT_FORMAT_COLOR_R5G6B5 0x00000003
+#define NV30_3D_RT_FORMAT_COLOR_X8R8G8B8 0x00000005
+#define NV30_3D_RT_FORMAT_COLOR_A8R8G8B8 0x00000008
+#define NV30_3D_RT_FORMAT_COLOR_B8 0x00000009
+#define NV30_3D_RT_FORMAT_COLOR_A16B16G16R16_FLOAT 0x0000000b
+#define NV30_3D_RT_FORMAT_COLOR_A32B32G32R32_FLOAT 0x0000000c
+#define NV30_3D_RT_FORMAT_COLOR_R32_FLOAT 0x0000000d
+#define NV30_3D_RT_FORMAT_COLOR_UNK0D 0x0000000d
+#define NV30_3D_RT_FORMAT_COLOR_X8B8G8R8 0x0000000f
+#define NV30_3D_RT_FORMAT_COLOR_A8B8G8R8 0x00000010
+#define NV30_3D_RT_FORMAT_ZETA__MASK 0x000000e0
+#define NV30_3D_RT_FORMAT_ZETA__SHIFT 5
+#define NV30_3D_RT_FORMAT_ZETA_Z16 0x00000020
+#define NV30_3D_RT_FORMAT_ZETA_Z24S8 0x00000040
+#define NV30_3D_RT_FORMAT_TYPE__MASK 0x00000f00
+#define NV30_3D_RT_FORMAT_TYPE__SHIFT 8
+#define NV30_3D_RT_FORMAT_TYPE_LINEAR 0x00000100
+#define NV30_3D_RT_FORMAT_TYPE_SWIZZLED 0x00000200
+#define NV30_3D_RT_FORMAT_LOG2_WIDTH__MASK 0x00ff0000
+#define NV30_3D_RT_FORMAT_LOG2_WIDTH__SHIFT 16
+#define NV30_3D_RT_FORMAT_LOG2_HEIGHT__MASK 0xff000000
+#define NV30_3D_RT_FORMAT_LOG2_HEIGHT__SHIFT 24
+
+#define NV30_3D_COLOR0_PITCH 0x0000020c
+#define NV30_3D_COLOR0_PITCH_COLOR0__MASK 0x0000ffff
+#define NV30_3D_COLOR0_PITCH_COLOR0__SHIFT 0
+#define NV30_3D_COLOR0_PITCH_ZETA__MASK 0xffff0000
+#define NV30_3D_COLOR0_PITCH_ZETA__SHIFT 16
+
+#define NV40_3D_COLOR0_PITCH 0x0000020c
+
+#define NV30_3D_COLOR0_OFFSET 0x00000210
+
+#define NV30_3D_ZETA_OFFSET 0x00000214
+
+#define NV30_3D_COLOR1_OFFSET 0x00000218
+
+#define NV30_3D_COLOR1_PITCH 0x0000021c
+
+#define NV30_3D_RT_ENABLE 0x00000220
+#define NV30_3D_RT_ENABLE_COLOR0 0x00000001
+#define NV30_3D_RT_ENABLE_COLOR1 0x00000002
+#define NV40_3D_RT_ENABLE_COLOR2 0x00000004
+#define NV40_3D_RT_ENABLE_COLOR3 0x00000008
+#define NV30_3D_RT_ENABLE_MRT 0x00000010
+
+#define NV40_3D_ZETA_PITCH 0x0000022c
+
+#define NV30_3D_HIERZ_PITCH 0x0000022c
+
+#define NV30_3D_HIERZ_OFFSET 0x00000230
+
+#define NV30_3D_TEX_UNITS_ENABLE 0x0000023c
+#define NV30_3D_TEX_UNITS_ENABLE_TX0 0x00000001
+#define NV30_3D_TEX_UNITS_ENABLE_TX1 0x00000002
+#define NV30_3D_TEX_UNITS_ENABLE_TX2 0x00000004
+#define NV30_3D_TEX_UNITS_ENABLE_TX3 0x00000008
+#define NV30_3D_TEX_UNITS_ENABLE_TX4 0x00000010
+#define NV30_3D_TEX_UNITS_ENABLE_TX5 0x00000020
+#define NV30_3D_TEX_UNITS_ENABLE_TX6 0x00000040
+#define NV30_3D_TEX_UNITS_ENABLE_TX7 0x00000080
+
+#define NV30_3D_TEX_MATRIX_ENABLE(i0) (0x00000240 + 0x4*(i0))
+#define NV30_3D_TEX_MATRIX_ENABLE__ESIZE 0x00000004
+#define NV30_3D_TEX_MATRIX_ENABLE__LEN 0x00000008
+
+#define NV40_3D_COLOR2_PITCH 0x00000280
+
+#define NV40_3D_COLOR3_PITCH 0x00000284
+
+#define NV40_3D_COLOR2_OFFSET 0x00000288
+
+#define NV40_3D_COLOR3_OFFSET 0x0000028c
+
+#define NV30_3D_VIEWPORT_TX_ORIGIN 0x000002b8
+#define NV30_3D_VIEWPORT_TX_ORIGIN_X__MASK 0x0000ffff
+#define NV30_3D_VIEWPORT_TX_ORIGIN_X__SHIFT 0
+#define NV30_3D_VIEWPORT_TX_ORIGIN_Y__MASK 0xffff0000
+#define NV30_3D_VIEWPORT_TX_ORIGIN_Y__SHIFT 16
+
+#define NV30_3D_VIEWPORT_CLIP_MODE 0x000002bc
+
+#define NV30_3D_VIEWPORT_CLIP_HORIZ(i0) (0x000002c0 + 0x8*(i0))
+#define NV30_3D_VIEWPORT_CLIP_HORIZ__ESIZE 0x00000008
+#define NV30_3D_VIEWPORT_CLIP_HORIZ__LEN 0x00000008
+#define NV30_3D_VIEWPORT_CLIP_HORIZ_L__MASK 0x0000ffff
+#define NV30_3D_VIEWPORT_CLIP_HORIZ_L__SHIFT 0
+#define NV30_3D_VIEWPORT_CLIP_HORIZ_R__MASK 0xffff0000
+#define NV30_3D_VIEWPORT_CLIP_HORIZ_R__SHIFT 16
+
+#define NV30_3D_VIEWPORT_CLIP_VERT(i0) (0x000002c4 + 0x8*(i0))
+#define NV30_3D_VIEWPORT_CLIP_VERT__ESIZE 0x00000008
+#define NV30_3D_VIEWPORT_CLIP_VERT__LEN 0x00000008
+#define NV30_3D_VIEWPORT_CLIP_VERT_T__MASK 0x0000ffff
+#define NV30_3D_VIEWPORT_CLIP_VERT_T__SHIFT 0
+#define NV30_3D_VIEWPORT_CLIP_VERT_D__MASK 0xffff0000
+#define NV30_3D_VIEWPORT_CLIP_VERT_D__SHIFT 16
+
+#define NV30_3D_DITHER_ENABLE 0x00000300
+
+#define NV30_3D_ALPHA_FUNC_ENABLE 0x00000304
+
+#define NV30_3D_ALPHA_FUNC_FUNC 0x00000308
+#define NV30_3D_ALPHA_FUNC_FUNC_NEVER 0x00000200
+#define NV30_3D_ALPHA_FUNC_FUNC_LESS 0x00000201
+#define NV30_3D_ALPHA_FUNC_FUNC_EQUAL 0x00000202
+#define NV30_3D_ALPHA_FUNC_FUNC_LEQUAL 0x00000203
+#define NV30_3D_ALPHA_FUNC_FUNC_GREATER 0x00000204
+#define NV30_3D_ALPHA_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NV30_3D_ALPHA_FUNC_FUNC_GEQUAL 0x00000206
+#define NV30_3D_ALPHA_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NV30_3D_ALPHA_FUNC_REF 0x0000030c
+
+#define NV30_3D_BLEND_FUNC_ENABLE 0x00000310
+
+#define NV30_3D_BLEND_FUNC_SRC 0x00000314
+#define NV30_3D_BLEND_FUNC_SRC_RGB__MASK 0x0000ffff
+#define NV30_3D_BLEND_FUNC_SRC_RGB__SHIFT 0
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ZERO 0x00000000
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE 0x00000001
+#define NV30_3D_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301
+#define NV30_3D_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303
+#define NV30_3D_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305
+#define NV30_3D_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307
+#define NV30_3D_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308
+#define NV30_3D_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002
+#define NV30_3D_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003
+#define NV30_3D_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA__MASK 0xffff0000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA__SHIFT 16
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE 0x00010000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x03000000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x03020000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x03040000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x03060000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x03070000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x03080000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x80010000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x80030000
+#define NV30_3D_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000
+
+#define NV30_3D_BLEND_FUNC_DST 0x00000318
+#define NV30_3D_BLEND_FUNC_DST_RGB__MASK 0x0000ffff
+#define NV30_3D_BLEND_FUNC_DST_RGB__SHIFT 0
+#define NV30_3D_BLEND_FUNC_DST_RGB_ZERO 0x00000000
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE 0x00000001
+#define NV30_3D_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301
+#define NV30_3D_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303
+#define NV30_3D_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305
+#define NV30_3D_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307
+#define NV30_3D_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308
+#define NV30_3D_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002
+#define NV30_3D_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003
+#define NV30_3D_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004
+#define NV30_3D_BLEND_FUNC_DST_ALPHA__MASK 0xffff0000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA__SHIFT 16
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE 0x00010000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x03000000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x03020000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x03040000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x03060000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x03070000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x03080000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x80010000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x80030000
+#define NV30_3D_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000
+
+#define NV30_3D_BLEND_COLOR 0x0000031c
+#define NV30_3D_BLEND_COLOR_B__MASK 0x000000ff
+#define NV30_3D_BLEND_COLOR_B__SHIFT 0
+#define NV30_3D_BLEND_COLOR_G__MASK 0x0000ff00
+#define NV30_3D_BLEND_COLOR_G__SHIFT 8
+#define NV30_3D_BLEND_COLOR_R__MASK 0x00ff0000
+#define NV30_3D_BLEND_COLOR_R__SHIFT 16
+#define NV30_3D_BLEND_COLOR_A__MASK 0xff000000
+#define NV30_3D_BLEND_COLOR_A__SHIFT 24
+
+#define NV30_3D_BLEND_EQUATION 0x00000320
+#define NV30_3D_BLEND_EQUATION_FUNC_ADD 0x00008006
+#define NV30_3D_BLEND_EQUATION_MIN 0x00008007
+#define NV30_3D_BLEND_EQUATION_MAX 0x00008008
+#define NV30_3D_BLEND_EQUATION_FUNC_SUBTRACT 0x0000800a
+#define NV30_3D_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NV40_3D_BLEND_EQUATION 0x00000320
+#define NV40_3D_BLEND_EQUATION_RGB__MASK 0x0000ffff
+#define NV40_3D_BLEND_EQUATION_RGB__SHIFT 0
+#define NV40_3D_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006
+#define NV40_3D_BLEND_EQUATION_RGB_MIN 0x00008007
+#define NV40_3D_BLEND_EQUATION_RGB_MAX 0x00008008
+#define NV40_3D_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a
+#define NV40_3D_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b
+#define NV40_3D_BLEND_EQUATION_ALPHA__MASK 0xffff0000
+#define NV40_3D_BLEND_EQUATION_ALPHA__SHIFT 16
+#define NV40_3D_BLEND_EQUATION_ALPHA_FUNC_ADD 0x80060000
+#define NV40_3D_BLEND_EQUATION_ALPHA_MIN 0x80070000
+#define NV40_3D_BLEND_EQUATION_ALPHA_MAX 0x80080000
+#define NV40_3D_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x800a0000
+#define NV40_3D_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x800b0000
+
+#define NV30_3D_COLOR_MASK 0x00000324
+#define NV30_3D_COLOR_MASK_B 0x000000ff
+#define NV30_3D_COLOR_MASK_G 0x0000ff00
+#define NV30_3D_COLOR_MASK_R 0x00ff0000
+#define NV30_3D_COLOR_MASK_A 0xff000000
+
+#define NV30_3D_STENCIL(i0) (0x00000328 + 0x20*(i0))
+#define NV30_3D_STENCIL__ESIZE 0x00000020
+#define NV30_3D_STENCIL__LEN 0x00000002
+
+#define NV30_3D_STENCIL_ENABLE(i0) (0x00000328 + 0x20*(i0))
+
+#define NV30_3D_STENCIL_MASK(i0) (0x0000032c + 0x20*(i0))
+
+#define NV30_3D_STENCIL_FUNC_FUNC(i0) (0x00000330 + 0x20*(i0))
+#define NV30_3D_STENCIL_FUNC_FUNC_NEVER 0x00000200
+#define NV30_3D_STENCIL_FUNC_FUNC_LESS 0x00000201
+#define NV30_3D_STENCIL_FUNC_FUNC_EQUAL 0x00000202
+#define NV30_3D_STENCIL_FUNC_FUNC_LEQUAL 0x00000203
+#define NV30_3D_STENCIL_FUNC_FUNC_GREATER 0x00000204
+#define NV30_3D_STENCIL_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NV30_3D_STENCIL_FUNC_FUNC_GEQUAL 0x00000206
+#define NV30_3D_STENCIL_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NV30_3D_STENCIL_FUNC_REF(i0) (0x00000334 + 0x20*(i0))
+
+#define NV30_3D_STENCIL_FUNC_MASK(i0) (0x00000338 + 0x20*(i0))
+
+#define NV30_3D_STENCIL_OP_FAIL(i0) (0x0000033c + 0x20*(i0))
+#define NV30_3D_STENCIL_OP_FAIL_ZERO 0x00000000
+#define NV30_3D_STENCIL_OP_FAIL_INVERT 0x0000150a
+#define NV30_3D_STENCIL_OP_FAIL_KEEP 0x00001e00
+#define NV30_3D_STENCIL_OP_FAIL_REPLACE 0x00001e01
+#define NV30_3D_STENCIL_OP_FAIL_INCR 0x00001e02
+#define NV30_3D_STENCIL_OP_FAIL_DECR 0x00001e03
+#define NV30_3D_STENCIL_OP_FAIL_INCR_WRAP 0x00008507
+#define NV30_3D_STENCIL_OP_FAIL_DECR_WRAP 0x00008508
+
+#define NV30_3D_STENCIL_OP_ZFAIL(i0) (0x00000340 + 0x20*(i0))
+#define NV30_3D_STENCIL_OP_ZFAIL_ZERO 0x00000000
+#define NV30_3D_STENCIL_OP_ZFAIL_INVERT 0x0000150a
+#define NV30_3D_STENCIL_OP_ZFAIL_KEEP 0x00001e00
+#define NV30_3D_STENCIL_OP_ZFAIL_REPLACE 0x00001e01
+#define NV30_3D_STENCIL_OP_ZFAIL_INCR 0x00001e02
+#define NV30_3D_STENCIL_OP_ZFAIL_DECR 0x00001e03
+#define NV30_3D_STENCIL_OP_ZFAIL_INCR_WRAP 0x00008507
+#define NV30_3D_STENCIL_OP_ZFAIL_DECR_WRAP 0x00008508
+
+#define NV30_3D_STENCIL_OP_ZPASS(i0) (0x00000344 + 0x20*(i0))
+#define NV30_3D_STENCIL_OP_ZPASS_ZERO 0x00000000
+#define NV30_3D_STENCIL_OP_ZPASS_INVERT 0x0000150a
+#define NV30_3D_STENCIL_OP_ZPASS_KEEP 0x00001e00
+#define NV30_3D_STENCIL_OP_ZPASS_REPLACE 0x00001e01
+#define NV30_3D_STENCIL_OP_ZPASS_INCR 0x00001e02
+#define NV30_3D_STENCIL_OP_ZPASS_DECR 0x00001e03
+#define NV30_3D_STENCIL_OP_ZPASS_INCR_WRAP 0x00008507
+#define NV30_3D_STENCIL_OP_ZPASS_DECR_WRAP 0x00008508
+
+#define NV30_3D_SHADE_MODEL 0x00000368
+#define NV30_3D_SHADE_MODEL_FLAT 0x00001d00
+#define NV30_3D_SHADE_MODEL_SMOOTH 0x00001d01
+
+#define NV30_3D_FOG_ENABLE 0x0000036c
+
+#define NV30_3D_FOG_COLOR 0x00000370
+#define NV30_3D_FOG_COLOR_R__MASK 0x000000ff
+#define NV30_3D_FOG_COLOR_R__SHIFT 0
+#define NV30_3D_FOG_COLOR_G__MASK 0x0000ff00
+#define NV30_3D_FOG_COLOR_G__SHIFT 8
+#define NV30_3D_FOG_COLOR_B__MASK 0x00ff0000
+#define NV30_3D_FOG_COLOR_B__SHIFT 16
+#define NV30_3D_FOG_COLOR_A__MASK 0xff000000
+#define NV30_3D_FOG_COLOR_A__SHIFT 24
+
+#define NV40_3D_MRT_COLOR_MASK 0x00000370
+#define NV40_3D_MRT_COLOR_MASK_BUFFER1_A 0x00000010
+#define NV40_3D_MRT_COLOR_MASK_BUFFER1_R 0x00000020
+#define NV40_3D_MRT_COLOR_MASK_BUFFER1_G 0x00000040
+#define NV40_3D_MRT_COLOR_MASK_BUFFER1_B 0x00000080
+#define NV40_3D_MRT_COLOR_MASK_BUFFER2_A 0x00000100
+#define NV40_3D_MRT_COLOR_MASK_BUFFER2_R 0x00000200
+#define NV40_3D_MRT_COLOR_MASK_BUFFER2_G 0x00000400
+#define NV40_3D_MRT_COLOR_MASK_BUFFER2_B 0x00000800
+#define NV40_3D_MRT_COLOR_MASK_BUFFER3_A 0x00001000
+#define NV40_3D_MRT_COLOR_MASK_BUFFER3_R 0x00002000
+#define NV40_3D_MRT_COLOR_MASK_BUFFER3_G 0x00004000
+#define NV40_3D_MRT_COLOR_MASK_BUFFER3_B 0x00008000
+
+#define NV30_3D_COLOR_LOGIC_OP_ENABLE 0x00000374
+
+#define NV30_3D_COLOR_LOGIC_OP_OP 0x00000378
+#define NV30_3D_COLOR_LOGIC_OP_OP_CLEAR 0x00001500
+#define NV30_3D_COLOR_LOGIC_OP_OP_AND 0x00001501
+#define NV30_3D_COLOR_LOGIC_OP_OP_AND_REVERSE 0x00001502
+#define NV30_3D_COLOR_LOGIC_OP_OP_COPY 0x00001503
+#define NV30_3D_COLOR_LOGIC_OP_OP_AND_INVERTED 0x00001504
+#define NV30_3D_COLOR_LOGIC_OP_OP_NOOP 0x00001505
+#define NV30_3D_COLOR_LOGIC_OP_OP_XOR 0x00001506
+#define NV30_3D_COLOR_LOGIC_OP_OP_OR 0x00001507
+#define NV30_3D_COLOR_LOGIC_OP_OP_NOR 0x00001508
+#define NV30_3D_COLOR_LOGIC_OP_OP_EQUIV 0x00001509
+#define NV30_3D_COLOR_LOGIC_OP_OP_INVERT 0x0000150a
+#define NV30_3D_COLOR_LOGIC_OP_OP_OR_REVERSE 0x0000150b
+#define NV30_3D_COLOR_LOGIC_OP_OP_COPY_INVERTED 0x0000150c
+#define NV30_3D_COLOR_LOGIC_OP_OP_OR_INVERTED 0x0000150d
+#define NV30_3D_COLOR_LOGIC_OP_OP_NAND 0x0000150e
+#define NV30_3D_COLOR_LOGIC_OP_OP_SET 0x0000150f
+
+#define NV30_3D_NORMALIZE_ENABLE 0x0000037c
+
+#define NV35_3D_DEPTH_BOUNDS_TEST_ENABLE 0x00000380
+
+#define NV35_3D_DEPTH_BOUNDS_TEST_ZMIN 0x00000384
+
+#define NV35_3D_DEPTH_BOUNDS_TEST_ZMAX 0x00000388
+
+#define NV30_3D_COLOR_MATERIAL 0x00000390
+#define NV30_3D_COLOR_MATERIAL_FRONT_EMISSION_ENABLE 0x00000001
+#define NV30_3D_COLOR_MATERIAL_FRONT_AMBIENT_ENABLE 0x00000004
+#define NV30_3D_COLOR_MATERIAL_FRONT_DIFFUSE_ENABLE 0x00000010
+#define NV30_3D_COLOR_MATERIAL_FRONT_SPECULAR_ENABLE 0x00000040
+#define NV30_3D_COLOR_MATERIAL_BACK_EMISSION_ENABLE 0x00000100
+#define NV30_3D_COLOR_MATERIAL_BACK_AMBIENT_ENABLE 0x00000400
+#define NV30_3D_COLOR_MATERIAL_BACK_DIFFUSE_ENABLE 0x00001000
+#define NV30_3D_COLOR_MATERIAL_BACK_SPECULAR_ENABLE 0x00004000
+
+#define NV30_3D_DEPTH_RANGE_NEAR 0x00000394
+
+#define NV30_3D_DEPTH_RANGE_FAR 0x00000398
+
+#define NV30_3D_COLOR_MATERIAL_FRONT 0x000003a0
+
+
+#define NV30_3D_COLOR_MATERIAL_FRONT_R 0x000003a0
+
+#define NV30_3D_COLOR_MATERIAL_FRONT_G 0x000003a4
+
+#define NV30_3D_COLOR_MATERIAL_FRONT_B 0x000003a8
+
+#define NV30_3D_COLOR_MATERIAL_FRONT_A 0x000003ac
+
+#define NV40_3D_MIPMAP_ROUNDING 0x000003b0
+#define NV40_3D_MIPMAP_ROUNDING_MODE__MASK 0x00100000
+#define NV40_3D_MIPMAP_ROUNDING_MODE__SHIFT 20
+#define NV40_3D_MIPMAP_ROUNDING_MODE_UP 0x00000000
+#define NV40_3D_MIPMAP_ROUNDING_MODE_DOWN 0x00100000
+
+#define NV30_3D_LINE_WIDTH 0x000003b8
+
+#define NV30_3D_LINE_SMOOTH_ENABLE 0x000003bc
+
+
+
+#define NV30_3D_TEX_GEN_MODE(i0, i1) (0x00000400 + 0x10*(i0) + 0x4*(i1))
+#define NV30_3D_TEX_GEN_MODE__ESIZE 0x00000004
+#define NV30_3D_TEX_GEN_MODE__LEN 0x00000004
+#define NV30_3D_TEX_GEN_MODE_FALSE 0x00000000
+#define NV30_3D_TEX_GEN_MODE_EYE_LINEAR 0x00002400
+#define NV30_3D_TEX_GEN_MODE_OBJECT_LINEAR 0x00002401
+#define NV30_3D_TEX_GEN_MODE_SPHERE_MAP 0x00002402
+#define NV30_3D_TEX_GEN_MODE_NORMAL_MAP 0x00008511
+#define NV30_3D_TEX_GEN_MODE_REFLECTION_MAP 0x00008512
+
+#define NV30_3D_MODELVIEW_MATRIX(i0) (0x00000480 + 0x4*(i0))
+#define NV30_3D_MODELVIEW_MATRIX__ESIZE 0x00000004
+#define NV30_3D_MODELVIEW_MATRIX__LEN 0x00000010
+
+#define NV30_3D_INVERSE_MODELVIEW_MATRIX(i0) (0x00000580 + 0x4*(i0))
+#define NV30_3D_INVERSE_MODELVIEW_MATRIX__ESIZE 0x00000004
+#define NV30_3D_INVERSE_MODELVIEW_MATRIX__LEN 0x0000000c
+
+#define NV30_3D_PROJECTION_MATRIX(i0) (0x00000680 + 0x4*(i0))
+#define NV30_3D_PROJECTION_MATRIX__ESIZE 0x00000004
+#define NV30_3D_PROJECTION_MATRIX__LEN 0x00000010
+
+
+#define NV30_3D_TEX_MATRIX(i0, i1) (0x000006c0 + 0x40*(i0) + 0x4*(i1))
+#define NV30_3D_TEX_MATRIX__ESIZE 0x00000004
+#define NV30_3D_TEX_MATRIX__LEN 0x00000010
+
+#define NV30_3D_SCISSOR_HORIZ 0x000008c0
+#define NV30_3D_SCISSOR_HORIZ_X__MASK 0x0000ffff
+#define NV30_3D_SCISSOR_HORIZ_X__SHIFT 0
+#define NV30_3D_SCISSOR_HORIZ_W__MASK 0xffff0000
+#define NV30_3D_SCISSOR_HORIZ_W__SHIFT 16
+
+#define NV30_3D_SCISSOR_VERT 0x000008c4
+#define NV30_3D_SCISSOR_VERT_Y__MASK 0x0000ffff
+#define NV30_3D_SCISSOR_VERT_Y__SHIFT 0
+#define NV30_3D_SCISSOR_VERT_H__MASK 0xffff0000
+#define NV30_3D_SCISSOR_VERT_H__SHIFT 16
+
+#define NV30_3D_FOG_COORD_DIST 0x000008c8
+
+#define NV30_3D_FOG_MODE 0x000008cc
+
+#define NV30_3D_FOG_EQUATION_CONSTANT 0x000008d0
+
+#define NV30_3D_FOG_EQUATION_LINEAR 0x000008d4
+
+#define NV30_3D_FOG_EQUATION_QUADRATIC 0x000008d8
+
+#define NV30_3D_FP_ACTIVE_PROGRAM 0x000008e4
+#define NV30_3D_FP_ACTIVE_PROGRAM_DMA0 0x00000001
+#define NV30_3D_FP_ACTIVE_PROGRAM_DMA1 0x00000002
+#define NV30_3D_FP_ACTIVE_PROGRAM_OFFSET__MASK 0xfffffffc
+#define NV30_3D_FP_ACTIVE_PROGRAM_OFFSET__SHIFT 2
+
+
+#define NV30_3D_RC_COLOR0 0x000008ec
+#define NV30_3D_RC_COLOR0_B__MASK 0x000000ff
+#define NV30_3D_RC_COLOR0_B__SHIFT 0
+#define NV30_3D_RC_COLOR0_G__MASK 0x0000ff00
+#define NV30_3D_RC_COLOR0_G__SHIFT 8
+#define NV30_3D_RC_COLOR0_R__MASK 0x00ff0000
+#define NV30_3D_RC_COLOR0_R__SHIFT 16
+#define NV30_3D_RC_COLOR0_A__MASK 0xff000000
+#define NV30_3D_RC_COLOR0_A__SHIFT 24
+
+#define NV30_3D_RC_COLOR1 0x000008f0
+#define NV30_3D_RC_COLOR1_B__MASK 0x000000ff
+#define NV30_3D_RC_COLOR1_B__SHIFT 0
+#define NV30_3D_RC_COLOR1_G__MASK 0x0000ff00
+#define NV30_3D_RC_COLOR1_G__SHIFT 8
+#define NV30_3D_RC_COLOR1_R__MASK 0x00ff0000
+#define NV30_3D_RC_COLOR1_R__SHIFT 16
+#define NV30_3D_RC_COLOR1_A__MASK 0xff000000
+#define NV30_3D_RC_COLOR1_A__SHIFT 24
+
+#define NV30_3D_RC_FINAL0 0x000008f4
+#define NV30_3D_RC_FINAL0_D_INPUT__MASK 0x0000000f
+#define NV30_3D_RC_FINAL0_D_INPUT__SHIFT 0
+#define NV30_3D_RC_FINAL0_D_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL0_D_INPUT_CONSTANT_COLOR0 0x00000001
+#define NV30_3D_RC_FINAL0_D_INPUT_CONSTANT_COLOR1 0x00000002
+#define NV30_3D_RC_FINAL0_D_INPUT_FOG 0x00000003
+#define NV30_3D_RC_FINAL0_D_INPUT_PRIMARY_COLOR 0x00000004
+#define NV30_3D_RC_FINAL0_D_INPUT_SECONDARY_COLOR 0x00000005
+#define NV30_3D_RC_FINAL0_D_INPUT_TEXTURE0 0x00000008
+#define NV30_3D_RC_FINAL0_D_INPUT_TEXTURE1 0x00000009
+#define NV30_3D_RC_FINAL0_D_INPUT_TEXTURE2 0x0000000a
+#define NV30_3D_RC_FINAL0_D_INPUT_TEXTURE3 0x0000000b
+#define NV30_3D_RC_FINAL0_D_INPUT_SPARE0 0x0000000c
+#define NV30_3D_RC_FINAL0_D_INPUT_SPARE1 0x0000000d
+#define NV30_3D_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0000000e
+#define NV30_3D_RC_FINAL0_D_INPUT_E_TIMES_F 0x0000000f
+#define NV30_3D_RC_FINAL0_D_COMPONENT_USAGE__MASK 0x00000010
+#define NV30_3D_RC_FINAL0_D_COMPONENT_USAGE__SHIFT 4
+#define NV30_3D_RC_FINAL0_D_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL0_D_COMPONENT_USAGE_ALPHA 0x00000010
+#define NV30_3D_RC_FINAL0_D_MAPPING__MASK 0x000000e0
+#define NV30_3D_RC_FINAL0_D_MAPPING__SHIFT 5
+#define NV30_3D_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT 0x00000020
+#define NV30_3D_RC_FINAL0_D_MAPPING_EXPAND_NORMAL 0x00000040
+#define NV30_3D_RC_FINAL0_D_MAPPING_EXPAND_NEGATE 0x00000060
+#define NV30_3D_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL 0x00000080
+#define NV30_3D_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE 0x000000a0
+#define NV30_3D_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY 0x000000c0
+#define NV30_3D_RC_FINAL0_D_MAPPING_SIGNED_NEGATE 0x000000e0
+#define NV30_3D_RC_FINAL0_C_INPUT__MASK 0x00000f00
+#define NV30_3D_RC_FINAL0_C_INPUT__SHIFT 8
+#define NV30_3D_RC_FINAL0_C_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL0_C_INPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_FINAL0_C_INPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_FINAL0_C_INPUT_FOG 0x00000300
+#define NV30_3D_RC_FINAL0_C_INPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_FINAL0_C_INPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_FINAL0_C_INPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_FINAL0_C_INPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_FINAL0_C_INPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_FINAL0_C_INPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_FINAL0_C_INPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_FINAL0_C_INPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_FINAL0_C_INPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_FINAL0_C_COMPONENT_USAGE__MASK 0x00001000
+#define NV30_3D_RC_FINAL0_C_COMPONENT_USAGE__SHIFT 12
+#define NV30_3D_RC_FINAL0_C_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL0_C_COMPONENT_USAGE_ALPHA 0x00001000
+#define NV30_3D_RC_FINAL0_C_MAPPING__MASK 0x0000e000
+#define NV30_3D_RC_FINAL0_C_MAPPING__SHIFT 13
+#define NV30_3D_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT 0x00002000
+#define NV30_3D_RC_FINAL0_C_MAPPING_EXPAND_NORMAL 0x00004000
+#define NV30_3D_RC_FINAL0_C_MAPPING_EXPAND_NEGATE 0x00006000
+#define NV30_3D_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL 0x00008000
+#define NV30_3D_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE 0x0000a000
+#define NV30_3D_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY 0x0000c000
+#define NV30_3D_RC_FINAL0_C_MAPPING_SIGNED_NEGATE 0x0000e000
+#define NV30_3D_RC_FINAL0_B_INPUT__MASK 0x000f0000
+#define NV30_3D_RC_FINAL0_B_INPUT__SHIFT 16
+#define NV30_3D_RC_FINAL0_B_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL0_B_INPUT_CONSTANT_COLOR0 0x00010000
+#define NV30_3D_RC_FINAL0_B_INPUT_CONSTANT_COLOR1 0x00020000
+#define NV30_3D_RC_FINAL0_B_INPUT_FOG 0x00030000
+#define NV30_3D_RC_FINAL0_B_INPUT_PRIMARY_COLOR 0x00040000
+#define NV30_3D_RC_FINAL0_B_INPUT_SECONDARY_COLOR 0x00050000
+#define NV30_3D_RC_FINAL0_B_INPUT_TEXTURE0 0x00080000
+#define NV30_3D_RC_FINAL0_B_INPUT_TEXTURE1 0x00090000
+#define NV30_3D_RC_FINAL0_B_INPUT_TEXTURE2 0x000a0000
+#define NV30_3D_RC_FINAL0_B_INPUT_TEXTURE3 0x000b0000
+#define NV30_3D_RC_FINAL0_B_INPUT_SPARE0 0x000c0000
+#define NV30_3D_RC_FINAL0_B_INPUT_SPARE1 0x000d0000
+#define NV30_3D_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000e0000
+#define NV30_3D_RC_FINAL0_B_INPUT_E_TIMES_F 0x000f0000
+#define NV30_3D_RC_FINAL0_B_COMPONENT_USAGE__MASK 0x00100000
+#define NV30_3D_RC_FINAL0_B_COMPONENT_USAGE__SHIFT 20
+#define NV30_3D_RC_FINAL0_B_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL0_B_COMPONENT_USAGE_ALPHA 0x00100000
+#define NV30_3D_RC_FINAL0_B_MAPPING__MASK 0x00e00000
+#define NV30_3D_RC_FINAL0_B_MAPPING__SHIFT 21
+#define NV30_3D_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT 0x00200000
+#define NV30_3D_RC_FINAL0_B_MAPPING_EXPAND_NORMAL 0x00400000
+#define NV30_3D_RC_FINAL0_B_MAPPING_EXPAND_NEGATE 0x00600000
+#define NV30_3D_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL 0x00800000
+#define NV30_3D_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE 0x00a00000
+#define NV30_3D_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY 0x00c00000
+#define NV30_3D_RC_FINAL0_B_MAPPING_SIGNED_NEGATE 0x00e00000
+#define NV30_3D_RC_FINAL0_A_INPUT__MASK 0x0f000000
+#define NV30_3D_RC_FINAL0_A_INPUT__SHIFT 24
+#define NV30_3D_RC_FINAL0_A_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL0_A_INPUT_CONSTANT_COLOR0 0x01000000
+#define NV30_3D_RC_FINAL0_A_INPUT_CONSTANT_COLOR1 0x02000000
+#define NV30_3D_RC_FINAL0_A_INPUT_FOG 0x03000000
+#define NV30_3D_RC_FINAL0_A_INPUT_PRIMARY_COLOR 0x04000000
+#define NV30_3D_RC_FINAL0_A_INPUT_SECONDARY_COLOR 0x05000000
+#define NV30_3D_RC_FINAL0_A_INPUT_TEXTURE0 0x08000000
+#define NV30_3D_RC_FINAL0_A_INPUT_TEXTURE1 0x09000000
+#define NV30_3D_RC_FINAL0_A_INPUT_TEXTURE2 0x0a000000
+#define NV30_3D_RC_FINAL0_A_INPUT_TEXTURE3 0x0b000000
+#define NV30_3D_RC_FINAL0_A_INPUT_SPARE0 0x0c000000
+#define NV30_3D_RC_FINAL0_A_INPUT_SPARE1 0x0d000000
+#define NV30_3D_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0e000000
+#define NV30_3D_RC_FINAL0_A_INPUT_E_TIMES_F 0x0f000000
+#define NV30_3D_RC_FINAL0_A_COMPONENT_USAGE__MASK 0x10000000
+#define NV30_3D_RC_FINAL0_A_COMPONENT_USAGE__SHIFT 28
+#define NV30_3D_RC_FINAL0_A_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL0_A_COMPONENT_USAGE_ALPHA 0x10000000
+#define NV30_3D_RC_FINAL0_A_MAPPING__MASK 0xe0000000
+#define NV30_3D_RC_FINAL0_A_MAPPING__SHIFT 29
+#define NV30_3D_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT 0x20000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_EXPAND_NORMAL 0x40000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_EXPAND_NEGATE 0x60000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL 0x80000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE 0xa0000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY 0xc0000000
+#define NV30_3D_RC_FINAL0_A_MAPPING_SIGNED_NEGATE 0xe0000000
+
+#define NV30_3D_RC_FINAL1 0x000008f8
+#define NV30_3D_RC_FINAL1_COLOR_SUM_CLAMP 0x00000080
+#define NV30_3D_RC_FINAL1_G_INPUT__MASK 0x00000f00
+#define NV30_3D_RC_FINAL1_G_INPUT__SHIFT 8
+#define NV30_3D_RC_FINAL1_G_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL1_G_INPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_FINAL1_G_INPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_FINAL1_G_INPUT_FOG 0x00000300
+#define NV30_3D_RC_FINAL1_G_INPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_FINAL1_G_INPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_FINAL1_G_INPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_FINAL1_G_INPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_FINAL1_G_INPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_FINAL1_G_INPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_FINAL1_G_INPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_FINAL1_G_INPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_FINAL1_G_INPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_FINAL1_G_COMPONENT_USAGE__MASK 0x00001000
+#define NV30_3D_RC_FINAL1_G_COMPONENT_USAGE__SHIFT 12
+#define NV30_3D_RC_FINAL1_G_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL1_G_COMPONENT_USAGE_ALPHA 0x00001000
+#define NV30_3D_RC_FINAL1_G_MAPPING__MASK 0x0000e000
+#define NV30_3D_RC_FINAL1_G_MAPPING__SHIFT 13
+#define NV30_3D_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT 0x00002000
+#define NV30_3D_RC_FINAL1_G_MAPPING_EXPAND_NORMAL 0x00004000
+#define NV30_3D_RC_FINAL1_G_MAPPING_EXPAND_NEGATE 0x00006000
+#define NV30_3D_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL 0x00008000
+#define NV30_3D_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE 0x0000a000
+#define NV30_3D_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY 0x0000c000
+#define NV30_3D_RC_FINAL1_G_MAPPING_SIGNED_NEGATE 0x0000e000
+#define NV30_3D_RC_FINAL1_F_INPUT__MASK 0x000f0000
+#define NV30_3D_RC_FINAL1_F_INPUT__SHIFT 16
+#define NV30_3D_RC_FINAL1_F_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL1_F_INPUT_CONSTANT_COLOR0 0x00010000
+#define NV30_3D_RC_FINAL1_F_INPUT_CONSTANT_COLOR1 0x00020000
+#define NV30_3D_RC_FINAL1_F_INPUT_FOG 0x00030000
+#define NV30_3D_RC_FINAL1_F_INPUT_PRIMARY_COLOR 0x00040000
+#define NV30_3D_RC_FINAL1_F_INPUT_SECONDARY_COLOR 0x00050000
+#define NV30_3D_RC_FINAL1_F_INPUT_TEXTURE0 0x00080000
+#define NV30_3D_RC_FINAL1_F_INPUT_TEXTURE1 0x00090000
+#define NV30_3D_RC_FINAL1_F_INPUT_TEXTURE2 0x000a0000
+#define NV30_3D_RC_FINAL1_F_INPUT_TEXTURE3 0x000b0000
+#define NV30_3D_RC_FINAL1_F_INPUT_SPARE0 0x000c0000
+#define NV30_3D_RC_FINAL1_F_INPUT_SPARE1 0x000d0000
+#define NV30_3D_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000e0000
+#define NV30_3D_RC_FINAL1_F_INPUT_E_TIMES_F 0x000f0000
+#define NV30_3D_RC_FINAL1_F_COMPONENT_USAGE__MASK 0x00100000
+#define NV30_3D_RC_FINAL1_F_COMPONENT_USAGE__SHIFT 20
+#define NV30_3D_RC_FINAL1_F_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL1_F_COMPONENT_USAGE_ALPHA 0x00100000
+#define NV30_3D_RC_FINAL1_F_MAPPING__MASK 0x00e00000
+#define NV30_3D_RC_FINAL1_F_MAPPING__SHIFT 21
+#define NV30_3D_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT 0x00200000
+#define NV30_3D_RC_FINAL1_F_MAPPING_EXPAND_NORMAL 0x00400000
+#define NV30_3D_RC_FINAL1_F_MAPPING_EXPAND_NEGATE 0x00600000
+#define NV30_3D_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL 0x00800000
+#define NV30_3D_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE 0x00a00000
+#define NV30_3D_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY 0x00c00000
+#define NV30_3D_RC_FINAL1_F_MAPPING_SIGNED_NEGATE 0x00e00000
+#define NV30_3D_RC_FINAL1_E_INPUT__MASK 0x0f000000
+#define NV30_3D_RC_FINAL1_E_INPUT__SHIFT 24
+#define NV30_3D_RC_FINAL1_E_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_FINAL1_E_INPUT_CONSTANT_COLOR0 0x01000000
+#define NV30_3D_RC_FINAL1_E_INPUT_CONSTANT_COLOR1 0x02000000
+#define NV30_3D_RC_FINAL1_E_INPUT_FOG 0x03000000
+#define NV30_3D_RC_FINAL1_E_INPUT_PRIMARY_COLOR 0x04000000
+#define NV30_3D_RC_FINAL1_E_INPUT_SECONDARY_COLOR 0x05000000
+#define NV30_3D_RC_FINAL1_E_INPUT_TEXTURE0 0x08000000
+#define NV30_3D_RC_FINAL1_E_INPUT_TEXTURE1 0x09000000
+#define NV30_3D_RC_FINAL1_E_INPUT_TEXTURE2 0x0a000000
+#define NV30_3D_RC_FINAL1_E_INPUT_TEXTURE3 0x0b000000
+#define NV30_3D_RC_FINAL1_E_INPUT_SPARE0 0x0c000000
+#define NV30_3D_RC_FINAL1_E_INPUT_SPARE1 0x0d000000
+#define NV30_3D_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0e000000
+#define NV30_3D_RC_FINAL1_E_INPUT_E_TIMES_F 0x0f000000
+#define NV30_3D_RC_FINAL1_E_COMPONENT_USAGE__MASK 0x10000000
+#define NV30_3D_RC_FINAL1_E_COMPONENT_USAGE__SHIFT 28
+#define NV30_3D_RC_FINAL1_E_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_FINAL1_E_COMPONENT_USAGE_ALPHA 0x10000000
+#define NV30_3D_RC_FINAL1_E_MAPPING__MASK 0xe0000000
+#define NV30_3D_RC_FINAL1_E_MAPPING__SHIFT 29
+#define NV30_3D_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT 0x20000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_EXPAND_NORMAL 0x40000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_EXPAND_NEGATE 0x60000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL 0x80000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE 0xa0000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY 0xc0000000
+#define NV30_3D_RC_FINAL1_E_MAPPING_SIGNED_NEGATE 0xe0000000
+
+#define NV30_3D_RC_ENABLE 0x000008fc
+#define NV30_3D_RC_ENABLE_NUM_COMBINERS__MASK 0x0000000f
+#define NV30_3D_RC_ENABLE_NUM_COMBINERS__SHIFT 0
+#define NV30_3D_RC_ENABLE_STAGE_CONSTANT_COLOR0 0x0000f000
+#define NV30_3D_RC_ENABLE_STAGE_CONSTANT_COLOR1 0x000f0000
+
+
+#define NV30_3D_RC_IN_ALPHA(i0) (0x00000900 + 0x20*(i0))
+#define NV30_3D_RC_IN_ALPHA_D_INPUT__MASK 0x0000000f
+#define NV30_3D_RC_IN_ALPHA_D_INPUT__SHIFT 0
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0 0x00000001
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1 0x00000002
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_FOG 0x00000003
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR 0x00000004
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR 0x00000005
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_TEXTURE0 0x00000008
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_TEXTURE1 0x00000009
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_TEXTURE2 0x0000000a
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_TEXTURE3 0x0000000b
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_SPARE0 0x0000000c
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_SPARE1 0x0000000d
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0000000e
+#define NV30_3D_RC_IN_ALPHA_D_INPUT_E_TIMES_F 0x0000000f
+#define NV30_3D_RC_IN_ALPHA_D_COMPONENT_USAGE__MASK 0x00000010
+#define NV30_3D_RC_IN_ALPHA_D_COMPONENT_USAGE__SHIFT 4
+#define NV30_3D_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE 0x00000000
+#define NV30_3D_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA 0x00000010
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING__MASK 0x000000e0
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING__SHIFT 5
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT 0x00000020
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL 0x00000040
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE 0x00000060
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL 0x00000080
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE 0x000000a0
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY 0x000000c0
+#define NV30_3D_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE 0x000000e0
+#define NV30_3D_RC_IN_ALPHA_C_INPUT__MASK 0x00000f00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT__SHIFT 8
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_FOG 0x00000300
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_IN_ALPHA_C_INPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_IN_ALPHA_C_COMPONENT_USAGE__MASK 0x00001000
+#define NV30_3D_RC_IN_ALPHA_C_COMPONENT_USAGE__SHIFT 12
+#define NV30_3D_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE 0x00000000
+#define NV30_3D_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA 0x00001000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING__MASK 0x0000e000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING__SHIFT 13
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT 0x00002000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL 0x00004000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE 0x00006000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL 0x00008000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE 0x0000a000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY 0x0000c000
+#define NV30_3D_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE 0x0000e000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT__MASK 0x000f0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT__SHIFT 16
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0 0x00010000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1 0x00020000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_FOG 0x00030000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR 0x00040000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR 0x00050000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_TEXTURE0 0x00080000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_TEXTURE1 0x00090000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_TEXTURE2 0x000a0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_TEXTURE3 0x000b0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_SPARE0 0x000c0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_SPARE1 0x000d0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000e0000
+#define NV30_3D_RC_IN_ALPHA_B_INPUT_E_TIMES_F 0x000f0000
+#define NV30_3D_RC_IN_ALPHA_B_COMPONENT_USAGE__MASK 0x00100000
+#define NV30_3D_RC_IN_ALPHA_B_COMPONENT_USAGE__SHIFT 20
+#define NV30_3D_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE 0x00000000
+#define NV30_3D_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA 0x00100000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING__MASK 0x00e00000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING__SHIFT 21
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT 0x00200000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL 0x00400000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE 0x00600000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL 0x00800000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE 0x00a00000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY 0x00c00000
+#define NV30_3D_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE 0x00e00000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT__MASK 0x0f000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT__SHIFT 24
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0 0x01000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1 0x02000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_FOG 0x03000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR 0x04000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR 0x05000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_TEXTURE0 0x08000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_TEXTURE1 0x09000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_TEXTURE2 0x0a000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_TEXTURE3 0x0b000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_SPARE0 0x0c000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_SPARE1 0x0d000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0e000000
+#define NV30_3D_RC_IN_ALPHA_A_INPUT_E_TIMES_F 0x0f000000
+#define NV30_3D_RC_IN_ALPHA_A_COMPONENT_USAGE__MASK 0x10000000
+#define NV30_3D_RC_IN_ALPHA_A_COMPONENT_USAGE__SHIFT 28
+#define NV30_3D_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE 0x00000000
+#define NV30_3D_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA 0x10000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING__MASK 0xe0000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING__SHIFT 29
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT 0x20000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL 0x40000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE 0x60000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL 0x80000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE 0xa0000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY 0xc0000000
+#define NV30_3D_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE 0xe0000000
+
+#define NV30_3D_RC_IN_RGB(i0) (0x00000904 + 0x20*(i0))
+#define NV30_3D_RC_IN_RGB_D_INPUT__MASK 0x0000000f
+#define NV30_3D_RC_IN_RGB_D_INPUT__SHIFT 0
+#define NV30_3D_RC_IN_RGB_D_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0 0x00000001
+#define NV30_3D_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1 0x00000002
+#define NV30_3D_RC_IN_RGB_D_INPUT_FOG 0x00000003
+#define NV30_3D_RC_IN_RGB_D_INPUT_PRIMARY_COLOR 0x00000004
+#define NV30_3D_RC_IN_RGB_D_INPUT_SECONDARY_COLOR 0x00000005
+#define NV30_3D_RC_IN_RGB_D_INPUT_TEXTURE0 0x00000008
+#define NV30_3D_RC_IN_RGB_D_INPUT_TEXTURE1 0x00000009
+#define NV30_3D_RC_IN_RGB_D_INPUT_TEXTURE2 0x0000000a
+#define NV30_3D_RC_IN_RGB_D_INPUT_TEXTURE3 0x0000000b
+#define NV30_3D_RC_IN_RGB_D_INPUT_SPARE0 0x0000000c
+#define NV30_3D_RC_IN_RGB_D_INPUT_SPARE1 0x0000000d
+#define NV30_3D_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0000000e
+#define NV30_3D_RC_IN_RGB_D_INPUT_E_TIMES_F 0x0000000f
+#define NV30_3D_RC_IN_RGB_D_COMPONENT_USAGE__MASK 0x00000010
+#define NV30_3D_RC_IN_RGB_D_COMPONENT_USAGE__SHIFT 4
+#define NV30_3D_RC_IN_RGB_D_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA 0x00000010
+#define NV30_3D_RC_IN_RGB_D_MAPPING__MASK 0x000000e0
+#define NV30_3D_RC_IN_RGB_D_MAPPING__SHIFT 5
+#define NV30_3D_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT 0x00000020
+#define NV30_3D_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL 0x00000040
+#define NV30_3D_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE 0x00000060
+#define NV30_3D_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL 0x00000080
+#define NV30_3D_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE 0x000000a0
+#define NV30_3D_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY 0x000000c0
+#define NV30_3D_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE 0x000000e0
+#define NV30_3D_RC_IN_RGB_C_INPUT__MASK 0x00000f00
+#define NV30_3D_RC_IN_RGB_C_INPUT__SHIFT 8
+#define NV30_3D_RC_IN_RGB_C_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_IN_RGB_C_INPUT_FOG 0x00000300
+#define NV30_3D_RC_IN_RGB_C_INPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_IN_RGB_C_INPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_IN_RGB_C_INPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_IN_RGB_C_INPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_IN_RGB_C_INPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_IN_RGB_C_INPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_IN_RGB_C_INPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_IN_RGB_C_INPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_IN_RGB_C_INPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_IN_RGB_C_COMPONENT_USAGE__MASK 0x00001000
+#define NV30_3D_RC_IN_RGB_C_COMPONENT_USAGE__SHIFT 12
+#define NV30_3D_RC_IN_RGB_C_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA 0x00001000
+#define NV30_3D_RC_IN_RGB_C_MAPPING__MASK 0x0000e000
+#define NV30_3D_RC_IN_RGB_C_MAPPING__SHIFT 13
+#define NV30_3D_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT 0x00002000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL 0x00004000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE 0x00006000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL 0x00008000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE 0x0000a000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY 0x0000c000
+#define NV30_3D_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE 0x0000e000
+#define NV30_3D_RC_IN_RGB_B_INPUT__MASK 0x000f0000
+#define NV30_3D_RC_IN_RGB_B_INPUT__SHIFT 16
+#define NV30_3D_RC_IN_RGB_B_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0 0x00010000
+#define NV30_3D_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1 0x00020000
+#define NV30_3D_RC_IN_RGB_B_INPUT_FOG 0x00030000
+#define NV30_3D_RC_IN_RGB_B_INPUT_PRIMARY_COLOR 0x00040000
+#define NV30_3D_RC_IN_RGB_B_INPUT_SECONDARY_COLOR 0x00050000
+#define NV30_3D_RC_IN_RGB_B_INPUT_TEXTURE0 0x00080000
+#define NV30_3D_RC_IN_RGB_B_INPUT_TEXTURE1 0x00090000
+#define NV30_3D_RC_IN_RGB_B_INPUT_TEXTURE2 0x000a0000
+#define NV30_3D_RC_IN_RGB_B_INPUT_TEXTURE3 0x000b0000
+#define NV30_3D_RC_IN_RGB_B_INPUT_SPARE0 0x000c0000
+#define NV30_3D_RC_IN_RGB_B_INPUT_SPARE1 0x000d0000
+#define NV30_3D_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000e0000
+#define NV30_3D_RC_IN_RGB_B_INPUT_E_TIMES_F 0x000f0000
+#define NV30_3D_RC_IN_RGB_B_COMPONENT_USAGE__MASK 0x00100000
+#define NV30_3D_RC_IN_RGB_B_COMPONENT_USAGE__SHIFT 20
+#define NV30_3D_RC_IN_RGB_B_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA 0x00100000
+#define NV30_3D_RC_IN_RGB_B_MAPPING__MASK 0x00e00000
+#define NV30_3D_RC_IN_RGB_B_MAPPING__SHIFT 21
+#define NV30_3D_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT 0x00200000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL 0x00400000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE 0x00600000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL 0x00800000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE 0x00a00000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY 0x00c00000
+#define NV30_3D_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE 0x00e00000
+#define NV30_3D_RC_IN_RGB_A_INPUT__MASK 0x0f000000
+#define NV30_3D_RC_IN_RGB_A_INPUT__SHIFT 24
+#define NV30_3D_RC_IN_RGB_A_INPUT_ZERO 0x00000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0 0x01000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1 0x02000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_FOG 0x03000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_PRIMARY_COLOR 0x04000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_SECONDARY_COLOR 0x05000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_TEXTURE0 0x08000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_TEXTURE1 0x09000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_TEXTURE2 0x0a000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_TEXTURE3 0x0b000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_SPARE0 0x0c000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_SPARE1 0x0d000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0e000000
+#define NV30_3D_RC_IN_RGB_A_INPUT_E_TIMES_F 0x0f000000
+#define NV30_3D_RC_IN_RGB_A_COMPONENT_USAGE__MASK 0x10000000
+#define NV30_3D_RC_IN_RGB_A_COMPONENT_USAGE__SHIFT 28
+#define NV30_3D_RC_IN_RGB_A_COMPONENT_USAGE_RGB 0x00000000
+#define NV30_3D_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA 0x10000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING__MASK 0xe0000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING__SHIFT 29
+#define NV30_3D_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY 0x00000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT 0x20000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL 0x40000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE 0x60000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL 0x80000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE 0xa0000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY 0xc0000000
+#define NV30_3D_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE 0xe0000000
+
+#define NV30_3D_RC_CONSTANT_COLOR0(i0) (0x00000908 + 0x20*(i0))
+#define NV30_3D_RC_CONSTANT_COLOR0_B__MASK 0x000000ff
+#define NV30_3D_RC_CONSTANT_COLOR0_B__SHIFT 0
+#define NV30_3D_RC_CONSTANT_COLOR0_G__MASK 0x0000ff00
+#define NV30_3D_RC_CONSTANT_COLOR0_G__SHIFT 8
+#define NV30_3D_RC_CONSTANT_COLOR0_R__MASK 0x00ff0000
+#define NV30_3D_RC_CONSTANT_COLOR0_R__SHIFT 16
+#define NV30_3D_RC_CONSTANT_COLOR0_A__MASK 0xff000000
+#define NV30_3D_RC_CONSTANT_COLOR0_A__SHIFT 24
+
+#define NV30_3D_RC_CONSTANT_COLOR1(i0) (0x0000090c + 0x20*(i0))
+#define NV30_3D_RC_CONSTANT_COLOR1_B__MASK 0x000000ff
+#define NV30_3D_RC_CONSTANT_COLOR1_B__SHIFT 0
+#define NV30_3D_RC_CONSTANT_COLOR1_G__MASK 0x0000ff00
+#define NV30_3D_RC_CONSTANT_COLOR1_G__SHIFT 8
+#define NV30_3D_RC_CONSTANT_COLOR1_R__MASK 0x00ff0000
+#define NV30_3D_RC_CONSTANT_COLOR1_R__SHIFT 16
+#define NV30_3D_RC_CONSTANT_COLOR1_A__MASK 0xff000000
+#define NV30_3D_RC_CONSTANT_COLOR1_A__SHIFT 24
+
+#define NV30_3D_RC_OUT_ALPHA(i0) (0x00000910 + 0x20*(i0))
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT__MASK 0x0000000f
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT__SHIFT 0
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0 0x00000001
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1 0x00000002
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_FOG 0x00000003
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR 0x00000004
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR 0x00000005
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0 0x00000008
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1 0x00000009
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE2 0x0000000a
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE3 0x0000000b
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_SPARE0 0x0000000c
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_SPARE1 0x0000000d
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0000000e
+#define NV30_3D_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F 0x0000000f
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT__MASK 0x000000f0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT__SHIFT 4
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0 0x00000010
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1 0x00000020
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_FOG 0x00000030
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR 0x00000040
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR 0x00000050
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0 0x00000080
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1 0x00000090
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE2 0x000000a0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE3 0x000000b0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_SPARE0 0x000000c0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_SPARE1 0x000000d0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000000e0
+#define NV30_3D_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F 0x000000f0
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT__MASK 0x00000f00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT__SHIFT 8
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_FOG 0x00000300
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_OUT_ALPHA_CD_DOT_PRODUCT 0x00001000
+#define NV30_3D_RC_OUT_ALPHA_AB_DOT_PRODUCT 0x00002000
+#define NV30_3D_RC_OUT_ALPHA_MUX_SUM 0x00004000
+#define NV30_3D_RC_OUT_ALPHA_BIAS__MASK 0x00008000
+#define NV30_3D_RC_OUT_ALPHA_BIAS__SHIFT 15
+#define NV30_3D_RC_OUT_ALPHA_BIAS_NONE 0x00000000
+#define NV30_3D_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF 0x00008000
+#define NV30_3D_RC_OUT_ALPHA_SCALE__MASK 0x00030000
+#define NV30_3D_RC_OUT_ALPHA_SCALE__SHIFT 16
+#define NV30_3D_RC_OUT_ALPHA_SCALE_NONE 0x00000000
+#define NV30_3D_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO 0x00010000
+#define NV30_3D_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR 0x00020000
+#define NV30_3D_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF 0x00030000
+
+#define NV30_3D_RC_OUT_RGB(i0) (0x00000914 + 0x20*(i0))
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT__MASK 0x0000000f
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT__SHIFT 0
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0 0x00000001
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1 0x00000002
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_FOG 0x00000003
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR 0x00000004
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR 0x00000005
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_TEXTURE0 0x00000008
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_TEXTURE1 0x00000009
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_TEXTURE2 0x0000000a
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_TEXTURE3 0x0000000b
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_SPARE0 0x0000000c
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_SPARE1 0x0000000d
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x0000000e
+#define NV30_3D_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F 0x0000000f
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT__MASK 0x000000f0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT__SHIFT 4
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0 0x00000010
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1 0x00000020
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_FOG 0x00000030
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR 0x00000040
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR 0x00000050
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_TEXTURE0 0x00000080
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_TEXTURE1 0x00000090
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_TEXTURE2 0x000000a0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_TEXTURE3 0x000000b0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_SPARE0 0x000000c0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_SPARE1 0x000000d0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x000000e0
+#define NV30_3D_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F 0x000000f0
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT__MASK 0x00000f00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT__SHIFT 8
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_ZERO 0x00000000
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0 0x00000100
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1 0x00000200
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_FOG 0x00000300
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR 0x00000400
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR 0x00000500
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0 0x00000800
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1 0x00000900
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_TEXTURE2 0x00000a00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_TEXTURE3 0x00000b00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_SPARE0 0x00000c00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_SPARE1 0x00000d00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR 0x00000e00
+#define NV30_3D_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F 0x00000f00
+#define NV30_3D_RC_OUT_RGB_CD_DOT_PRODUCT 0x00001000
+#define NV30_3D_RC_OUT_RGB_AB_DOT_PRODUCT 0x00002000
+#define NV30_3D_RC_OUT_RGB_MUX_SUM 0x00004000
+#define NV30_3D_RC_OUT_RGB_BIAS__MASK 0x00008000
+#define NV30_3D_RC_OUT_RGB_BIAS__SHIFT 15
+#define NV30_3D_RC_OUT_RGB_BIAS_NONE 0x00000000
+#define NV30_3D_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF 0x00008000
+#define NV30_3D_RC_OUT_RGB_SCALE__MASK 0x00030000
+#define NV30_3D_RC_OUT_RGB_SCALE__SHIFT 16
+#define NV30_3D_RC_OUT_RGB_SCALE_NONE 0x00000000
+#define NV30_3D_RC_OUT_RGB_SCALE_SCALE_BY_TWO 0x00010000
+#define NV30_3D_RC_OUT_RGB_SCALE_SCALE_BY_FOUR 0x00020000
+#define NV30_3D_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF 0x00030000
+
+#define NV30_3D_VIEWPORT_HORIZ 0x00000a00
+#define NV30_3D_VIEWPORT_HORIZ_X__MASK 0x0000ffff
+#define NV30_3D_VIEWPORT_HORIZ_X__SHIFT 0
+#define NV30_3D_VIEWPORT_HORIZ_W__MASK 0xffff0000
+#define NV30_3D_VIEWPORT_HORIZ_W__SHIFT 16
+
+#define NV30_3D_VIEWPORT_VERT 0x00000a04
+#define NV30_3D_VIEWPORT_VERT_Y__MASK 0x0000ffff
+#define NV30_3D_VIEWPORT_VERT_Y__SHIFT 0
+#define NV30_3D_VIEWPORT_VERT_H__MASK 0xffff0000
+#define NV30_3D_VIEWPORT_VERT_H__SHIFT 16
+
+#define NV30_3D_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION 0x00000a10
+
+
+#define NV30_3D_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x00000a10
+
+#define NV30_3D_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x00000a14
+
+#define NV30_3D_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x00000a18
+
+#define NV30_3D_VIEWPORT_TRANSLATE 0x00000a20
+
+
+#define NV30_3D_VIEWPORT_TRANSLATE_X 0x00000a20
+
+#define NV30_3D_VIEWPORT_TRANSLATE_Y 0x00000a24
+
+#define NV30_3D_VIEWPORT_TRANSLATE_Z 0x00000a28
+
+#define NV30_3D_VIEWPORT_TRANSLATE_W 0x00000a2c
+
+#define NV30_3D_VIEWPORT_SCALE 0x00000a30
+
+
+#define NV30_3D_VIEWPORT_SCALE_X 0x00000a30
+
+#define NV30_3D_VIEWPORT_SCALE_Y 0x00000a34
+
+#define NV30_3D_VIEWPORT_SCALE_Z 0x00000a38
+
+#define NV30_3D_VIEWPORT_SCALE_W 0x00000a3c
+
+#define NV30_3D_POLYGON_OFFSET_POINT_ENABLE 0x00000a60
+
+#define NV30_3D_POLYGON_OFFSET_LINE_ENABLE 0x00000a64
+
+#define NV30_3D_POLYGON_OFFSET_FILL_ENABLE 0x00000a68
+
+#define NV30_3D_DEPTH_FUNC 0x00000a6c
+#define NV30_3D_DEPTH_FUNC_NEVER 0x00000200
+#define NV30_3D_DEPTH_FUNC_LESS 0x00000201
+#define NV30_3D_DEPTH_FUNC_EQUAL 0x00000202
+#define NV30_3D_DEPTH_FUNC_LEQUAL 0x00000203
+#define NV30_3D_DEPTH_FUNC_GREATER 0x00000204
+#define NV30_3D_DEPTH_FUNC_NOTEQUAL 0x00000205
+#define NV30_3D_DEPTH_FUNC_GEQUAL 0x00000206
+#define NV30_3D_DEPTH_FUNC_ALWAYS 0x00000207
+
+#define NV30_3D_DEPTH_WRITE_ENABLE 0x00000a70
+
+#define NV30_3D_DEPTH_TEST_ENABLE 0x00000a74
+
+#define NV30_3D_POLYGON_OFFSET_FACTOR 0x00000a78
+
+#define NV30_3D_POLYGON_OFFSET_UNITS 0x00000a7c
+
+#define NV30_3D_VTX_ATTR_3I_XY(i0) (0x00000a80 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_3I_XY__ESIZE 0x00000008
+#define NV30_3D_VTX_ATTR_3I_XY__LEN 0x00000010
+#define NV30_3D_VTX_ATTR_3I_XY_X__MASK 0x0000ffff
+#define NV30_3D_VTX_ATTR_3I_XY_X__SHIFT 0
+#define NV30_3D_VTX_ATTR_3I_XY_Y__MASK 0xffff0000
+#define NV30_3D_VTX_ATTR_3I_XY_Y__SHIFT 16
+
+#define NV30_3D_VTX_ATTR_3I_Z(i0) (0x00000a84 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_3I_Z__ESIZE 0x00000008
+#define NV30_3D_VTX_ATTR_3I_Z__LEN 0x00000010
+#define NV30_3D_VTX_ATTR_3I_Z_Z__MASK 0x0000ffff
+#define NV30_3D_VTX_ATTR_3I_Z_Z__SHIFT 0
+
+#define NV30_3D_TEX_FILTER_OPTIMIZATION(i0) (0x00000b00 + 0x4*(i0))
+#define NV30_3D_TEX_FILTER_OPTIMIZATION__ESIZE 0x00000004
+#define NV30_3D_TEX_FILTER_OPTIMIZATION__LEN 0x00000004
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR__MASK 0x0000001f
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR__SHIFT 0
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR_OFF 0x00000000
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR_HIGH_QUALITY 0x00000004
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR_QUALITY 0x00000006
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR_PERFORMANCE 0x00000008
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_TRILINEAR_HIGH_PERFORMANCE 0x00000018
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE__MASK 0x000001c0
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE__SHIFT 6
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_OFF 0x00000000
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_HIGH_QUALITY 0x000000c0
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_QUALITY 0x000001c0
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_ANISO_SAMPLE_PERFORMANCE 0x00000140
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_UNKNOWN__MASK 0x00007c00
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_UNKNOWN__SHIFT 10
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_UNKNOWN_OFF 0x00000000
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_UNKNOWN_PARTIAL 0x00002c00
+#define NV40_3D_TEX_FILTER_OPTIMIZATION_UNKNOWN_FULL 0x00007c00
+
+#define NV40_3D_UNK0B40(i0) (0x00000b40 + 0x4*(i0))
+#define NV40_3D_UNK0B40__ESIZE 0x00000004
+#define NV40_3D_UNK0B40__LEN 0x00000008
+
+#define NV30_3D_VP_UPLOAD_INST(i0) (0x00000b80 + 0x4*(i0))
+#define NV30_3D_VP_UPLOAD_INST__ESIZE 0x00000004
+#define NV30_3D_VP_UPLOAD_INST__LEN 0x00000004
+
+
+#define NV30_3D_TEX_CLIP_PLANE(i0) (0x00000e00 + 0x10*(i0))
+
+
+#define NV30_3D_TEX_CLIP_PLANE_X(i0) (0x00000e00 + 0x10*(i0))
+
+#define NV30_3D_TEX_CLIP_PLANE_Y(i0) (0x00000e04 + 0x10*(i0))
+
+#define NV30_3D_TEX_CLIP_PLANE_Z(i0) (0x00000e08 + 0x10*(i0))
+
+#define NV30_3D_TEX_CLIP_PLANE_W(i0) (0x00000e0c + 0x10*(i0))
+
+#define NV30_3D_LIGHT 0x00001000
+
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT(i0) (0x00001000 + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(i0) (0x00001000 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(i0) (0x00001004 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(i0) (0x00001008 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE(i0) (0x0000100c + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(i0) (0x0000100c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(i0) (0x00001010 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(i0) (0x00001014 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR(i0) (0x00001018 + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(i0) (0x00001018 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(i0) (0x0000101c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(i0) (0x00001020 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_UNK24(i0) (0x00001024 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_HALF_VECTOR(i0) (0x00001028 + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_HALF_VECTOR_X(i0) (0x00001028 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_HALF_VECTOR_Y(i0) (0x0000102c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_HALF_VECTOR_Z(i0) (0x00001030 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_DIRECTION(i0) (0x00001034 + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_DIRECTION_X(i0) (0x00001034 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_DIRECTION_Y(i0) (0x00001038 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_DIRECTION_Z(i0) (0x0000103c + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_SPOT_CUTOFF_A(i0) (0x00001200 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_CUTOFF_B(i0) (0x00001204 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_CUTOFF_C(i0) (0x00001208 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_DIR(i0) (0x0000120c + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_SPOT_DIR_X(i0) (0x0000120c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_DIR_Y(i0) (0x00001210 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_DIR_Z(i0) (0x00001214 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_SPOT_CUTOFF_D(i0) (0x00001218 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_POSITION(i0) (0x0000121c + 0x40*(i0))
+
+
+#define NV30_3D_LIGHT_POSITION_X(i0) (0x0000121c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_POSITION_Y(i0) (0x00001220 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_POSITION_Z(i0) (0x00001224 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_ATTENUATION(i0) (0x00001228 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_ATTENUATION_CONSTANT(i0) (0x00001228 + 0x40*(i0))
+
+#define NV30_3D_LIGHT_ATTENUATION_LINEAR(i0) (0x0000122c + 0x40*(i0))
+
+#define NV30_3D_LIGHT_ATTENUATION_QUADRATIC(i0) (0x00001230 + 0x40*(i0))
+
+#define NV30_3D_FRONT_MATERIAL_SHININESS(i0) (0x00001400 + 0x4*(i0))
+#define NV30_3D_FRONT_MATERIAL_SHININESS__ESIZE 0x00000004
+#define NV30_3D_FRONT_MATERIAL_SHININESS__LEN 0x00000006
+
+#define NV30_3D_ENABLED_LIGHTS 0x00001420
+
+#define NV30_3D_VERTEX_TWO_SIDE_ENABLE 0x0000142c
+
+#define NV30_3D_FP_REG_CONTROL 0x00001450
+#define NV30_3D_FP_REG_CONTROL_UNK0__MASK 0x0000ffff
+#define NV30_3D_FP_REG_CONTROL_UNK0__SHIFT 0
+#define NV30_3D_FP_REG_CONTROL_UNK1__MASK 0xffff0000
+#define NV30_3D_FP_REG_CONTROL_UNK1__SHIFT 16
+
+#define NV30_3D_FLATSHADE_FIRST 0x00001454
+
+#define NV30_3D_EDGEFLAG 0x0000145c
+#define NV30_3D_EDGEFLAG_ENABLE 0x00000001
+
+#define NV30_3D_VP_CLIP_PLANES_ENABLE 0x00001478
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE0 0x00000002
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE1 0x00000020
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE2 0x00000200
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE3 0x00002000
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE4 0x00020000
+#define NV30_3D_VP_CLIP_PLANES_ENABLE_PLANE5 0x00200000
+
+#define NV30_3D_POLYGON_STIPPLE_ENABLE 0x0000147c
+
+#define NV30_3D_POLYGON_STIPPLE_PATTERN(i0) (0x00001480 + 0x4*(i0))
+#define NV30_3D_POLYGON_STIPPLE_PATTERN__ESIZE 0x00000004
+#define NV30_3D_POLYGON_STIPPLE_PATTERN__LEN 0x00000020
+
+#define NV30_3D_VTX_ATTR_3F(i0) (0x00001500 + 0x10*(i0))
+#define NV30_3D_VTX_ATTR_3F__ESIZE 0x00000010
+#define NV30_3D_VTX_ATTR_3F__LEN 0x00000010
+
+
+#define NV30_3D_VTX_ATTR_3F_X(i0) (0x00001500 + 0x10*(i0))
+
+#define NV30_3D_VTX_ATTR_3F_Y(i0) (0x00001504 + 0x10*(i0))
+
+#define NV30_3D_VTX_ATTR_3F_Z(i0) (0x00001508 + 0x10*(i0))
+
+
+#define NV30_3D_VP_CLIP_PLANE(i0, i1) (0x00001600 + 0x10*(i0) + 0x4*(i1))
+#define NV30_3D_VP_CLIP_PLANE__ESIZE 0x00000004
+#define NV30_3D_VP_CLIP_PLANE__LEN 0x00000004
+
+#define NV30_3D_VTXBUF(i0) (0x00001680 + 0x4*(i0))
+#define NV30_3D_VTXBUF__ESIZE 0x00000004
+#define NV30_3D_VTXBUF__LEN 0x00000010
+#define NV30_3D_VTXBUF_OFFSET__MASK 0x0fffffff
+#define NV30_3D_VTXBUF_OFFSET__SHIFT 0
+#define NV30_3D_VTXBUF_DMA1 0x80000000
+
+#define NV40_3D_VTX_CACHE_INVALIDATE 0x00001714
+
+#define NV30_3D_VTXFMT(i0) (0x00001740 + 0x4*(i0))
+#define NV30_3D_VTXFMT__ESIZE 0x00000004
+#define NV30_3D_VTXFMT__LEN 0x00000010
+#define NV30_3D_VTXFMT_TYPE__MASK 0x0000000f
+#define NV30_3D_VTXFMT_TYPE__SHIFT 0
+#define NV30_3D_VTXFMT_TYPE_B8G8R8A8_UNORM 0x00000000
+#define NV30_3D_VTXFMT_TYPE_V16_SNORM 0x00000001
+#define NV30_3D_VTXFMT_TYPE_V32_FLOAT 0x00000002
+#define NV30_3D_VTXFMT_TYPE_V16_FLOAT 0x00000003
+#define NV30_3D_VTXFMT_TYPE_U8_UNORM 0x00000004
+#define NV30_3D_VTXFMT_TYPE_V16_SSCALED 0x00000005
+#define NV30_3D_VTXFMT_TYPE_U8_USCALED 0x00000007
+#define NV30_3D_VTXFMT_SIZE__MASK 0x000000f0
+#define NV30_3D_VTXFMT_SIZE__SHIFT 4
+#define NV30_3D_VTXFMT_STRIDE__MASK 0x0000ff00
+#define NV30_3D_VTXFMT_STRIDE__SHIFT 8
+
+#define NV30_3D_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION 0x000017a0
+
+
+#define NV30_3D_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x000017a0
+
+#define NV30_3D_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x000017a4
+
+#define NV30_3D_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x000017a8
+
+#define NV30_3D_COLOR_MATERIAL_BACK 0x000017b0
+
+
+#define NV30_3D_COLOR_MATERIAL_BACK_R 0x000017b0
+
+#define NV30_3D_COLOR_MATERIAL_BACK_G 0x000017b4
+
+#define NV30_3D_COLOR_MATERIAL_BACK_B 0x000017b8
+
+#define NV30_3D_COLOR_MATERIAL_BACK_A 0x000017bc
+
+#define NV30_3D_QUERY_RESET 0x000017c8
+
+#define NV30_3D_QUERY_ENABLE 0x000017cc
+
+#define NV30_3D_QUERY_GET 0x00001800
+#define NV30_3D_QUERY_GET_OFFSET__MASK 0x00ffffff
+#define NV30_3D_QUERY_GET_OFFSET__SHIFT 0
+#define NV30_3D_QUERY_GET_UNK24__MASK 0xff000000
+#define NV30_3D_QUERY_GET_UNK24__SHIFT 24
+
+#define NV30_3D_VERTEX_BEGIN_END 0x00001808
+#define NV30_3D_VERTEX_BEGIN_END_STOP 0x00000000
+#define NV30_3D_VERTEX_BEGIN_END_POINTS 0x00000001
+#define NV30_3D_VERTEX_BEGIN_END_LINES 0x00000002
+#define NV30_3D_VERTEX_BEGIN_END_LINE_LOOP 0x00000003
+#define NV30_3D_VERTEX_BEGIN_END_LINE_STRIP 0x00000004
+#define NV30_3D_VERTEX_BEGIN_END_TRIANGLES 0x00000005
+#define NV30_3D_VERTEX_BEGIN_END_TRIANGLE_STRIP 0x00000006
+#define NV30_3D_VERTEX_BEGIN_END_TRIANGLE_FAN 0x00000007
+#define NV30_3D_VERTEX_BEGIN_END_QUADS 0x00000008
+#define NV30_3D_VERTEX_BEGIN_END_QUAD_STRIP 0x00000009
+#define NV30_3D_VERTEX_BEGIN_END_POLYGON 0x0000000a
+
+#define NV30_3D_VB_ELEMENT_U16 0x0000180c
+#define NV30_3D_VB_ELEMENT_U16_0__MASK 0x0000ffff
+#define NV30_3D_VB_ELEMENT_U16_0__SHIFT 0
+#define NV30_3D_VB_ELEMENT_U16_1__MASK 0xffff0000
+#define NV30_3D_VB_ELEMENT_U16_1__SHIFT 16
+
+#define NV30_3D_VB_ELEMENT_U32 0x00001810
+
+#define NV30_3D_VB_VERTEX_BATCH 0x00001814
+#define NV30_3D_VB_VERTEX_BATCH_OFFSET__MASK 0x00ffffff
+#define NV30_3D_VB_VERTEX_BATCH_OFFSET__SHIFT 0
+#define NV30_3D_VB_VERTEX_BATCH_COUNT__MASK 0xff000000
+#define NV30_3D_VB_VERTEX_BATCH_COUNT__SHIFT 24
+
+#define NV30_3D_VERTEX_DATA 0x00001818
+
+#define NV30_3D_IDXBUF_OFFSET 0x0000181c
+
+#define NV30_3D_IDXBUF_FORMAT 0x00001820
+#define NV30_3D_IDXBUF_FORMAT_DMA1 0x00000001
+#define NV30_3D_IDXBUF_FORMAT_TYPE__MASK 0x000000f0
+#define NV30_3D_IDXBUF_FORMAT_TYPE__SHIFT 4
+#define NV30_3D_IDXBUF_FORMAT_TYPE_U32 0x00000000
+#define NV30_3D_IDXBUF_FORMAT_TYPE_U16 0x00000010
+
+#define NV30_3D_VB_INDEX_BATCH 0x00001824
+#define NV30_3D_VB_INDEX_BATCH_START__MASK 0x00ffffff
+#define NV30_3D_VB_INDEX_BATCH_START__SHIFT 0
+#define NV30_3D_VB_INDEX_BATCH_COUNT__MASK 0xff000000
+#define NV30_3D_VB_INDEX_BATCH_COUNT__SHIFT 24
+
+#define NV30_3D_POLYGON_MODE_FRONT 0x00001828
+#define NV30_3D_POLYGON_MODE_FRONT_POINT 0x00001b00
+#define NV30_3D_POLYGON_MODE_FRONT_LINE 0x00001b01
+#define NV30_3D_POLYGON_MODE_FRONT_FILL 0x00001b02
+
+#define NV30_3D_POLYGON_MODE_BACK 0x0000182c
+#define NV30_3D_POLYGON_MODE_BACK_POINT 0x00001b00
+#define NV30_3D_POLYGON_MODE_BACK_LINE 0x00001b01
+#define NV30_3D_POLYGON_MODE_BACK_FILL 0x00001b02
+
+#define NV30_3D_CULL_FACE 0x00001830
+#define NV30_3D_CULL_FACE_FRONT 0x00000404
+#define NV30_3D_CULL_FACE_BACK 0x00000405
+#define NV30_3D_CULL_FACE_FRONT_AND_BACK 0x00000408
+
+#define NV30_3D_FRONT_FACE 0x00001834
+#define NV30_3D_FRONT_FACE_CW 0x00000900
+#define NV30_3D_FRONT_FACE_CCW 0x00000901
+
+#define NV30_3D_POLYGON_SMOOTH_ENABLE 0x00001838
+
+#define NV30_3D_CULL_FACE_ENABLE 0x0000183c
+
+#define NV30_3D_TEX_PALETTE_OFFSET(i0) (0x00001840 + 0x4*(i0))
+#define NV30_3D_TEX_PALETTE_OFFSET__ESIZE 0x00000004
+#define NV30_3D_TEX_PALETTE_OFFSET__LEN 0x00000008
+
+#define NV40_3D_TEX_SIZE1(i0) (0x00001840 + 0x4*(i0))
+#define NV40_3D_TEX_SIZE1__ESIZE 0x00000004
+#define NV40_3D_TEX_SIZE1__LEN 0x00000008
+#define NV40_3D_TEX_SIZE1_DEPTH__MASK 0xfff00000
+#define NV40_3D_TEX_SIZE1_DEPTH__SHIFT 20
+#define NV40_3D_TEX_SIZE1_PITCH__MASK 0x0000ffff
+#define NV40_3D_TEX_SIZE1_PITCH__SHIFT 0
+
+#define NV30_3D_VTX_ATTR_2F(i0) (0x00001880 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_2F__ESIZE 0x00000008
+#define NV30_3D_VTX_ATTR_2F__LEN 0x00000010
+
+
+#define NV30_3D_VTX_ATTR_2F_X(i0) (0x00001880 + 0x8*(i0))
+
+#define NV30_3D_VTX_ATTR_2F_Y(i0) (0x00001884 + 0x8*(i0))
+
+#define NV30_3D_VTX_ATTR_2I(i0) (0x00001900 + 0x4*(i0))
+#define NV30_3D_VTX_ATTR_2I__ESIZE 0x00000004
+#define NV30_3D_VTX_ATTR_2I__LEN 0x00000010
+#define NV30_3D_VTX_ATTR_2I_X__MASK 0x0000ffff
+#define NV30_3D_VTX_ATTR_2I_X__SHIFT 0
+#define NV30_3D_VTX_ATTR_2I_Y__MASK 0xffff0000
+#define NV30_3D_VTX_ATTR_2I_Y__SHIFT 16
+
+#define NV30_3D_VTX_ATTR_4UB(i0) (0x00001940 + 0x4*(i0))
+#define NV30_3D_VTX_ATTR_4UB__ESIZE 0x00000004
+#define NV30_3D_VTX_ATTR_4UB__LEN 0x00000010
+#define NV30_3D_VTX_ATTR_4UB_X__MASK 0x000000ff
+#define NV30_3D_VTX_ATTR_4UB_X__SHIFT 0
+#define NV30_3D_VTX_ATTR_4UB_Y__MASK 0x0000ff00
+#define NV30_3D_VTX_ATTR_4UB_Y__SHIFT 8
+#define NV30_3D_VTX_ATTR_4UB_Z__MASK 0x00ff0000
+#define NV30_3D_VTX_ATTR_4UB_Z__SHIFT 16
+#define NV30_3D_VTX_ATTR_4UB_W__MASK 0xff000000
+#define NV30_3D_VTX_ATTR_4UB_W__SHIFT 24
+
+#define NV30_3D_VTX_ATTR_4I(i0) (0x00001980 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_4I__ESIZE 0x00000008
+#define NV30_3D_VTX_ATTR_4I__LEN 0x00000010
+
+#define NV30_3D_VTX_ATTR_4I_XY(i0) (0x00001980 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_4I_XY_X__MASK 0x0000ffff
+#define NV30_3D_VTX_ATTR_4I_XY_X__SHIFT 0
+#define NV30_3D_VTX_ATTR_4I_XY_Y__MASK 0xffff0000
+#define NV30_3D_VTX_ATTR_4I_XY_Y__SHIFT 16
+
+#define NV30_3D_VTX_ATTR_4I_ZW(i0) (0x00001984 + 0x8*(i0))
+#define NV30_3D_VTX_ATTR_4I_ZW_Z__MASK 0x0000ffff
+#define NV30_3D_VTX_ATTR_4I_ZW_Z__SHIFT 0
+#define NV30_3D_VTX_ATTR_4I_ZW_W__MASK 0xffff0000
+#define NV30_3D_VTX_ATTR_4I_ZW_W__SHIFT 16
+
+#define NV30_3D_TEX_OFFSET(i0) (0x00001a00 + 0x20*(i0))
+#define NV30_3D_TEX_OFFSET__ESIZE 0x00000020
+#define NV30_3D_TEX_OFFSET__LEN 0x00000008
+
+#define NV30_3D_TEX_FORMAT(i0) (0x00001a04 + 0x20*(i0))
+#define NV30_3D_TEX_FORMAT__ESIZE 0x00000020
+#define NV30_3D_TEX_FORMAT__LEN 0x00000008
+#define NV30_3D_TEX_FORMAT_DMA0 0x00000001
+#define NV30_3D_TEX_FORMAT_DMA1 0x00000002
+#define NV30_3D_TEX_FORMAT_CUBIC 0x00000004
+#define NV30_3D_TEX_FORMAT_NO_BORDER 0x00000008
+#define NV30_3D_TEX_FORMAT_DIMS__MASK 0x000000f0
+#define NV30_3D_TEX_FORMAT_DIMS__SHIFT 4
+#define NV30_3D_TEX_FORMAT_DIMS_1D 0x00000010
+#define NV30_3D_TEX_FORMAT_DIMS_2D 0x00000020
+#define NV30_3D_TEX_FORMAT_DIMS_3D 0x00000030
+#define NV30_3D_TEX_FORMAT_FORMAT__MASK 0x0000ff00
+#define NV30_3D_TEX_FORMAT_FORMAT__SHIFT 8
+#define NV30_3D_TEX_FORMAT_FORMAT_L8 0x00000000
+#define NV30_3D_TEX_FORMAT_FORMAT_I8 0x00000100
+#define NV30_3D_TEX_FORMAT_FORMAT_A1R5G5B5 0x00000200
+#define NV30_3D_TEX_FORMAT_FORMAT_A4R4G4B4 0x00000400
+#define NV30_3D_TEX_FORMAT_FORMAT_R5G6B5 0x00000500
+#define NV30_3D_TEX_FORMAT_FORMAT_A8R8G8B8 0x00000600
+#define NV30_3D_TEX_FORMAT_FORMAT_X8R8G8B8 0x00000700
+#define NV30_3D_TEX_FORMAT_FORMAT_INDEX8 0x00000b00
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT1 0x00000c00
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT3 0x00000e00
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT5 0x00000f00
+#define NV30_3D_TEX_FORMAT_FORMAT_A1R5G5B5_RECT 0x00001000
+#define NV30_3D_TEX_FORMAT_FORMAT_R5G6B5_RECT 0x00001100
+#define NV30_3D_TEX_FORMAT_FORMAT_A8R8G8B8_RECT 0x00001200
+#define NV30_3D_TEX_FORMAT_FORMAT_L8_RECT 0x00001300
+#define NV30_3D_TEX_FORMAT_FORMAT_DSDT8_RECT 0x00001700
+#define NV30_3D_TEX_FORMAT_FORMAT_A8L8 0x00001a00
+#define NV30_3D_TEX_FORMAT_FORMAT_I8_RECT 0x00001b00
+#define NV30_3D_TEX_FORMAT_FORMAT_A4R4G4B4_RECT 0x00001d00
+#define NV30_3D_TEX_FORMAT_FORMAT_R8G8B8_RECT 0x00001e00
+#define NV30_3D_TEX_FORMAT_FORMAT_A8L8_RECT 0x00002000
+#define NV30_3D_TEX_FORMAT_FORMAT_Z24 0x00002a00
+#define NV30_3D_TEX_FORMAT_FORMAT_Z24_RECT 0x00002b00
+#define NV30_3D_TEX_FORMAT_FORMAT_Z16 0x00002c00
+#define NV30_3D_TEX_FORMAT_FORMAT_Z16_RECT 0x00002d00
+#define NV30_3D_TEX_FORMAT_FORMAT_DSDT8 0x00002800
+#define NV30_3D_TEX_FORMAT_FORMAT_HILO16 0x00003300
+#define NV30_3D_TEX_FORMAT_FORMAT_HILO16_RECT 0x00003600
+#define NV30_3D_TEX_FORMAT_FORMAT_HILO8 0x00004400
+#define NV30_3D_TEX_FORMAT_FORMAT_SIGNED_HILO8 0x00004500
+#define NV30_3D_TEX_FORMAT_FORMAT_HILO8_RECT 0x00004600
+#define NV30_3D_TEX_FORMAT_FORMAT_SIGNED_HILO8_RECT 0x00004700
+#define NV30_3D_TEX_FORMAT_FORMAT_A16 0x00003200
+#define NV30_3D_TEX_FORMAT_FORMAT_A16_RECT 0x00003500
+#define NV30_3D_TEX_FORMAT_FORMAT_UNK3F 0x00003f00
+#define NV30_3D_TEX_FORMAT_FORMAT_FLOAT_RGBA16_NV 0x00004a00
+#define NV30_3D_TEX_FORMAT_FORMAT_FLOAT_RGBA32_NV 0x00004b00
+#define NV30_3D_TEX_FORMAT_FORMAT_FLOAT_R32_NV 0x00004c00
+#define NV40_3D_TEX_FORMAT_FORMAT__MASK 0x00001f00
+#define NV40_3D_TEX_FORMAT_FORMAT__SHIFT 8
+#define NV40_3D_TEX_FORMAT_FORMAT_L8 0x00000100
+#define NV40_3D_TEX_FORMAT_FORMAT_A1R5G5B5 0x00000200
+#define NV40_3D_TEX_FORMAT_FORMAT_A4R4G4B4 0x00000300
+#define NV40_3D_TEX_FORMAT_FORMAT_R5G6B5 0x00000400
+#define NV40_3D_TEX_FORMAT_FORMAT_A8R8G8B8 0x00000500
+#define NV40_3D_TEX_FORMAT_FORMAT_DXT1 0x00000600
+#define NV40_3D_TEX_FORMAT_FORMAT_DXT3 0x00000700
+#define NV40_3D_TEX_FORMAT_FORMAT_DXT5 0x00000800
+#define NV40_3D_TEX_FORMAT_FORMAT_A8L8 0x00000b00
+#define NV40_3D_TEX_FORMAT_FORMAT_Z24 0x00001000
+#define NV40_3D_TEX_FORMAT_FORMAT_Z16 0x00001200
+#define NV40_3D_TEX_FORMAT_FORMAT_A16 0x00001400
+#define NV40_3D_TEX_FORMAT_FORMAT_A16L16 0x00001500
+#define NV40_3D_TEX_FORMAT_FORMAT_HILO8 0x00001800
+#define NV40_3D_TEX_FORMAT_FORMAT_RGBA16F 0x00001a00
+#define NV40_3D_TEX_FORMAT_FORMAT_RGBA32F 0x00001b00
+#define NV40_3D_TEX_FORMAT_LINEAR 0x00002000
+#define NV40_3D_TEX_FORMAT_RECT 0x00004000
+#define NV40_3D_TEX_FORMAT_MIPMAP_COUNT__MASK 0x000f0000
+#define NV40_3D_TEX_FORMAT_MIPMAP_COUNT__SHIFT 16
+#define NV30_3D_TEX_FORMAT_MIPMAP 0x00080000
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_U__MASK 0x00f00000
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_U__SHIFT 20
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_V__MASK 0x0f000000
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_V__SHIFT 24
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_W__MASK 0xf0000000
+#define NV30_3D_TEX_FORMAT_BASE_SIZE_W__SHIFT 28
+
+#define NV30_3D_TEX_WRAP(i0) (0x00001a08 + 0x20*(i0))
+#define NV30_3D_TEX_WRAP__ESIZE 0x00000020
+#define NV30_3D_TEX_WRAP__LEN 0x00000008
+#define NV30_3D_TEX_WRAP_S__MASK 0x000000ff
+#define NV30_3D_TEX_WRAP_S__SHIFT 0
+#define NV30_3D_TEX_WRAP_S_REPEAT 0x00000001
+#define NV30_3D_TEX_WRAP_S_MIRRORED_REPEAT 0x00000002
+#define NV30_3D_TEX_WRAP_S_CLAMP_TO_EDGE 0x00000003
+#define NV30_3D_TEX_WRAP_S_CLAMP_TO_BORDER 0x00000004
+#define NV30_3D_TEX_WRAP_S_CLAMP 0x00000005
+#define NV40_3D_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE 0x00000006
+#define NV40_3D_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER 0x00000007
+#define NV40_3D_TEX_WRAP_S_MIRROR_CLAMP 0x00000008
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION__MASK 0x00000070
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION__SHIFT 4
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF 0x00000000
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_QUALITY 0x00000020
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_PERFORMANCE 0x00000030
+#define NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_HIGH_PERFORMANCE 0x00000070
+#define NV30_3D_TEX_WRAP_T__MASK 0x00000f00
+#define NV30_3D_TEX_WRAP_T__SHIFT 8
+#define NV30_3D_TEX_WRAP_T_REPEAT 0x00000100
+#define NV30_3D_TEX_WRAP_T_MIRRORED_REPEAT 0x00000200
+#define NV30_3D_TEX_WRAP_T_CLAMP_TO_EDGE 0x00000300
+#define NV30_3D_TEX_WRAP_T_CLAMP_TO_BORDER 0x00000400
+#define NV30_3D_TEX_WRAP_T_CLAMP 0x00000500
+#define NV40_3D_TEX_WRAP_T_MIRROR_CLAMP_TO_EDGE 0x00000600
+#define NV40_3D_TEX_WRAP_T_MIRROR_CLAMP_TO_BORDER 0x00000700
+#define NV40_3D_TEX_WRAP_T_MIRROR_CLAMP 0x00000800
+#define NV30_3D_TEX_WRAP_EXPAND_NORMAL 0x0000f000
+#define NV30_3D_TEX_WRAP_R__MASK 0x000f0000
+#define NV30_3D_TEX_WRAP_R__SHIFT 16
+#define NV30_3D_TEX_WRAP_R_REPEAT 0x00010000
+#define NV30_3D_TEX_WRAP_R_MIRRORED_REPEAT 0x00020000
+#define NV30_3D_TEX_WRAP_R_CLAMP_TO_EDGE 0x00030000
+#define NV30_3D_TEX_WRAP_R_CLAMP_TO_BORDER 0x00040000
+#define NV30_3D_TEX_WRAP_R_CLAMP 0x00050000
+#define NV40_3D_TEX_WRAP_R_MIRROR_CLAMP_TO_EDGE 0x00060000
+#define NV40_3D_TEX_WRAP_R_MIRROR_CLAMP_TO_BORDER 0x00070000
+#define NV40_3D_TEX_WRAP_R_MIRROR_CLAMP 0x00080000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER__MASK 0x00f00000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER__SHIFT 20
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER_NONE 0x00000000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER_RED 0x00100000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER_GREEN 0x00200000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER_BLUE 0x00400000
+#define NV40_3D_TEX_WRAP_GAMMA_DECREASE_FILTER_ALL 0x00f00000
+#define NV30_3D_TEX_WRAP_RCOMP__MASK 0xf0000000
+#define NV30_3D_TEX_WRAP_RCOMP__SHIFT 28
+#define NV30_3D_TEX_WRAP_RCOMP_NEVER 0x00000000
+#define NV30_3D_TEX_WRAP_RCOMP_GREATER 0x10000000
+#define NV30_3D_TEX_WRAP_RCOMP_EQUAL 0x20000000
+#define NV30_3D_TEX_WRAP_RCOMP_GEQUAL 0x30000000
+#define NV30_3D_TEX_WRAP_RCOMP_LESS 0x40000000
+#define NV30_3D_TEX_WRAP_RCOMP_NOTEQUAL 0x50000000
+#define NV30_3D_TEX_WRAP_RCOMP_LEQUAL 0x60000000
+#define NV30_3D_TEX_WRAP_RCOMP_ALWAYS 0x70000000
+
+#define NV30_3D_TEX_ENABLE(i0) (0x00001a0c + 0x20*(i0))
+#define NV30_3D_TEX_ENABLE__ESIZE 0x00000020
+#define NV30_3D_TEX_ENABLE__LEN 0x00000008
+#define NV30_3D_TEX_ENABLE_ANISO__MASK 0x00000030
+#define NV30_3D_TEX_ENABLE_ANISO__SHIFT 4
+#define NV30_3D_TEX_ENABLE_ANISO_NONE 0x00000000
+#define NV30_3D_TEX_ENABLE_ANISO_2X 0x00000010
+#define NV30_3D_TEX_ENABLE_ANISO_4X 0x00000020
+#define NV30_3D_TEX_ENABLE_ANISO_8X 0x00000030
+#define NV40_3D_TEX_ENABLE_ANISO__MASK 0x000000f0
+#define NV40_3D_TEX_ENABLE_ANISO__SHIFT 4
+#define NV40_3D_TEX_ENABLE_ANISO_NONE 0x00000000
+#define NV40_3D_TEX_ENABLE_ANISO_2X 0x00000010
+#define NV40_3D_TEX_ENABLE_ANISO_4X 0x00000020
+#define NV40_3D_TEX_ENABLE_ANISO_6X 0x00000030
+#define NV40_3D_TEX_ENABLE_ANISO_8X 0x00000040
+#define NV40_3D_TEX_ENABLE_ANISO_10X 0x00000050
+#define NV40_3D_TEX_ENABLE_ANISO_12X 0x00000060
+#define NV40_3D_TEX_ENABLE_ANISO_16X 0x00000070
+#define NV30_3D_TEX_ENABLE_MIPMAP_MAX_LOD__MASK 0x0003c000
+#define NV30_3D_TEX_ENABLE_MIPMAP_MAX_LOD__SHIFT 14
+#define NV40_3D_TEX_ENABLE_MIPMAP_MAX_LOD__MASK 0x00038000
+#define NV40_3D_TEX_ENABLE_MIPMAP_MAX_LOD__SHIFT 15
+#define NV30_3D_TEX_ENABLE_MIPMAP_MIN_LOD__MASK 0x3c000000
+#define NV30_3D_TEX_ENABLE_MIPMAP_MIN_LOD__SHIFT 26
+#define NV40_3D_TEX_ENABLE_MIPMAP_MIN_LOD__MASK 0x38000000
+#define NV40_3D_TEX_ENABLE_MIPMAP_MIN_LOD__SHIFT 27
+#define NV30_3D_TEX_ENABLE_ENABLE 0x40000000
+#define NV40_3D_TEX_ENABLE_ENABLE 0x80000000
+
+#define NV30_3D_TEX_SWIZZLE(i0) (0x00001a10 + 0x20*(i0))
+#define NV30_3D_TEX_SWIZZLE__ESIZE 0x00000020
+#define NV30_3D_TEX_SWIZZLE__LEN 0x00000008
+#define NV30_3D_TEX_SWIZZLE_S1_W__MASK 0x00000003
+#define NV30_3D_TEX_SWIZZLE_S1_W__SHIFT 0
+#define NV30_3D_TEX_SWIZZLE_S1_W_W 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S1_W_Z 0x00000001
+#define NV30_3D_TEX_SWIZZLE_S1_W_Y 0x00000002
+#define NV30_3D_TEX_SWIZZLE_S1_W_X 0x00000003
+#define NV30_3D_TEX_SWIZZLE_S1_Z__MASK 0x0000000c
+#define NV30_3D_TEX_SWIZZLE_S1_Z__SHIFT 2
+#define NV30_3D_TEX_SWIZZLE_S1_Z_W 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S1_Z_Z 0x00000004
+#define NV30_3D_TEX_SWIZZLE_S1_Z_Y 0x00000008
+#define NV30_3D_TEX_SWIZZLE_S1_Z_X 0x0000000c
+#define NV30_3D_TEX_SWIZZLE_S1_Y__MASK 0x00000030
+#define NV30_3D_TEX_SWIZZLE_S1_Y__SHIFT 4
+#define NV30_3D_TEX_SWIZZLE_S1_Y_W 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S1_Y_Z 0x00000010
+#define NV30_3D_TEX_SWIZZLE_S1_Y_Y 0x00000020
+#define NV30_3D_TEX_SWIZZLE_S1_Y_X 0x00000030
+#define NV30_3D_TEX_SWIZZLE_S1_X__MASK 0x000000c0
+#define NV30_3D_TEX_SWIZZLE_S1_X__SHIFT 6
+#define NV30_3D_TEX_SWIZZLE_S1_X_W 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S1_X_Z 0x00000040
+#define NV30_3D_TEX_SWIZZLE_S1_X_Y 0x00000080
+#define NV30_3D_TEX_SWIZZLE_S1_X_X 0x000000c0
+#define NV30_3D_TEX_SWIZZLE_S0_W__MASK 0x00000300
+#define NV30_3D_TEX_SWIZZLE_S0_W__SHIFT 8
+#define NV30_3D_TEX_SWIZZLE_S0_W_ZERO 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S0_W_ONE 0x00000100
+#define NV30_3D_TEX_SWIZZLE_S0_W_S1 0x00000200
+#define NV30_3D_TEX_SWIZZLE_S0_Z__MASK 0x00000c00
+#define NV30_3D_TEX_SWIZZLE_S0_Z__SHIFT 10
+#define NV30_3D_TEX_SWIZZLE_S0_Z_ZERO 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S0_Z_ONE 0x00000400
+#define NV30_3D_TEX_SWIZZLE_S0_Z_S1 0x00000800
+#define NV30_3D_TEX_SWIZZLE_S0_Y__MASK 0x00003000
+#define NV30_3D_TEX_SWIZZLE_S0_Y__SHIFT 12
+#define NV30_3D_TEX_SWIZZLE_S0_Y_ZERO 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S0_Y_ONE 0x00001000
+#define NV30_3D_TEX_SWIZZLE_S0_Y_S1 0x00002000
+#define NV30_3D_TEX_SWIZZLE_S0_X__MASK 0x0000c000
+#define NV30_3D_TEX_SWIZZLE_S0_X__SHIFT 14
+#define NV30_3D_TEX_SWIZZLE_S0_X_ZERO 0x00000000
+#define NV30_3D_TEX_SWIZZLE_S0_X_ONE 0x00004000
+#define NV30_3D_TEX_SWIZZLE_S0_X_S1 0x00008000
+#define NV30_3D_TEX_SWIZZLE_RECT_PITCH__MASK 0xffff0000
+#define NV30_3D_TEX_SWIZZLE_RECT_PITCH__SHIFT 16
+
+#define NV30_3D_TEX_FILTER(i0) (0x00001a14 + 0x20*(i0))
+#define NV30_3D_TEX_FILTER__ESIZE 0x00000020
+#define NV30_3D_TEX_FILTER__LEN 0x00000008
+#define NV30_3D_TEX_FILTER_LOD_BIAS__MASK 0x00000f00
+#define NV30_3D_TEX_FILTER_LOD_BIAS__SHIFT 8
+#define NV30_3D_TEX_FILTER_MIN__MASK 0x000f0000
+#define NV30_3D_TEX_FILTER_MIN__SHIFT 16
+#define NV30_3D_TEX_FILTER_MIN_NEAREST 0x00010000
+#define NV30_3D_TEX_FILTER_MIN_LINEAR 0x00020000
+#define NV30_3D_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST 0x00030000
+#define NV30_3D_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST 0x00040000
+#define NV30_3D_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR 0x00050000
+#define NV30_3D_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR 0x00060000
+#define NV30_3D_TEX_FILTER_MAG__MASK 0x0f000000
+#define NV30_3D_TEX_FILTER_MAG__SHIFT 24
+#define NV30_3D_TEX_FILTER_MAG_NEAREST 0x01000000
+#define NV30_3D_TEX_FILTER_MAG_LINEAR 0x02000000
+#define NV30_3D_TEX_FILTER_SIGNED_BLUE 0x10000000
+#define NV30_3D_TEX_FILTER_SIGNED_GREEN 0x20000000
+#define NV30_3D_TEX_FILTER_SIGNED_RED 0x40000000
+#define NV30_3D_TEX_FILTER_SIGNED_ALPHA 0x80000000
+
+#define NV30_3D_TEX_NPOT_SIZE(i0) (0x00001a18 + 0x20*(i0))
+#define NV30_3D_TEX_NPOT_SIZE__ESIZE 0x00000020
+#define NV30_3D_TEX_NPOT_SIZE__LEN 0x00000008
+#define NV30_3D_TEX_NPOT_SIZE_H__MASK 0x0000ffff
+#define NV30_3D_TEX_NPOT_SIZE_H__SHIFT 0
+#define NV30_3D_TEX_NPOT_SIZE_W__MASK 0xffff0000
+#define NV30_3D_TEX_NPOT_SIZE_W__SHIFT 16
+
+#define NV30_3D_TEX_BORDER_COLOR(i0) (0x00001a1c + 0x20*(i0))
+#define NV30_3D_TEX_BORDER_COLOR__ESIZE 0x00000020
+#define NV30_3D_TEX_BORDER_COLOR__LEN 0x00000008
+#define NV30_3D_TEX_BORDER_COLOR_B__MASK 0x000000ff
+#define NV30_3D_TEX_BORDER_COLOR_B__SHIFT 0
+#define NV30_3D_TEX_BORDER_COLOR_G__MASK 0x0000ff00
+#define NV30_3D_TEX_BORDER_COLOR_G__SHIFT 8
+#define NV30_3D_TEX_BORDER_COLOR_R__MASK 0x00ff0000
+#define NV30_3D_TEX_BORDER_COLOR_R__SHIFT 16
+#define NV30_3D_TEX_BORDER_COLOR_A__MASK 0xff000000
+#define NV30_3D_TEX_BORDER_COLOR_A__SHIFT 24
+
+#define NV30_3D_VTX_ATTR_4F(i0) (0x00001c00 + 0x10*(i0))
+#define NV30_3D_VTX_ATTR_4F__ESIZE 0x00000010
+#define NV30_3D_VTX_ATTR_4F__LEN 0x00000010
+
+
+#define NV30_3D_VTX_ATTR_4F_X(i0) (0x00001c00 + 0x10*(i0))
+
+#define NV30_3D_VTX_ATTR_4F_Y(i0) (0x00001c04 + 0x10*(i0))
+
+#define NV30_3D_VTX_ATTR_4F_Z(i0) (0x00001c08 + 0x10*(i0))
+
+#define NV30_3D_VTX_ATTR_4F_W(i0) (0x00001c0c + 0x10*(i0))
+
+#define NV30_3D_FP_CONTROL 0x00001d60
+#define NV30_3D_FP_CONTROL_USED_REGS_MINUS1_DIV2__MASK 0x0000000f
+#define NV30_3D_FP_CONTROL_USED_REGS_MINUS1_DIV2__SHIFT 0
+#define NV30_3D_FP_CONTROL_USES_KIL 0x00000080
+#define NV40_3D_FP_CONTROL_KIL 0x00000080
+#define NV40_3D_FP_CONTROL_TEMP_COUNT__MASK 0xff000000
+#define NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT 24
+
+#define NV30_3D_FENCE_OFFSET 0x00001d6c
+
+#define NV30_3D_FENCE_VALUE 0x00001d70
+
+#define NV30_3D_DEPTH_CONTROL 0x00001d78
+#define NV30_3D_DEPTH_CONTROL_CLAMP 0x000000f0
+
+#define NV30_3D_MULTISAMPLE_CONTROL 0x00001d7c
+#define NV30_3D_MULTISAMPLE_CONTROL_ENABLE 0x00000001
+#define NV30_3D_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_COVERAGE 0x00000010
+#define NV30_3D_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_ONE 0x00000100
+#define NV30_3D_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE__MASK 0xffff0000
+#define NV30_3D_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE__SHIFT 16
+
+#define NV30_3D_COORD_CONVENTIONS 0x00001d88
+#define NV30_3D_COORD_CONVENTIONS_HEIGHT__MASK 0x00000fff
+#define NV30_3D_COORD_CONVENTIONS_HEIGHT__SHIFT 0
+#define NV30_3D_COORD_CONVENTIONS_ORIGIN__MASK 0x00001000
+#define NV30_3D_COORD_CONVENTIONS_ORIGIN__SHIFT 12
+#define NV30_3D_COORD_CONVENTIONS_ORIGIN_NORMAL 0x00000000
+#define NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED 0x00001000
+#define NV30_3D_COORD_CONVENTIONS_CENTER__MASK 0x00010000
+#define NV30_3D_COORD_CONVENTIONS_CENTER__SHIFT 16
+#define NV30_3D_COORD_CONVENTIONS_CENTER_HALF_INTEGER 0x00000000
+#define NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER 0x00010000
+
+#define NV30_3D_CLEAR_DEPTH_VALUE 0x00001d8c
+
+#define NV30_3D_CLEAR_COLOR_VALUE 0x00001d90
+#define NV30_3D_CLEAR_COLOR_VALUE_B__MASK 0x000000ff
+#define NV30_3D_CLEAR_COLOR_VALUE_B__SHIFT 0
+#define NV30_3D_CLEAR_COLOR_VALUE_G__MASK 0x0000ff00
+#define NV30_3D_CLEAR_COLOR_VALUE_G__SHIFT 8
+#define NV30_3D_CLEAR_COLOR_VALUE_R__MASK 0x00ff0000
+#define NV30_3D_CLEAR_COLOR_VALUE_R__SHIFT 16
+#define NV30_3D_CLEAR_COLOR_VALUE_A__MASK 0xff000000
+#define NV30_3D_CLEAR_COLOR_VALUE_A__SHIFT 24
+
+#define NV30_3D_CLEAR_BUFFERS 0x00001d94
+#define NV30_3D_CLEAR_BUFFERS_DEPTH 0x00000001
+#define NV30_3D_CLEAR_BUFFERS_STENCIL 0x00000002
+#define NV30_3D_CLEAR_BUFFERS_COLOR_R 0x00000010
+#define NV30_3D_CLEAR_BUFFERS_COLOR_G 0x00000020
+#define NV30_3D_CLEAR_BUFFERS_COLOR_B 0x00000040
+#define NV30_3D_CLEAR_BUFFERS_COLOR_A 0x00000080
+
+#define NV30_3D_PRIMITIVE_RESTART_ENABLE 0x00001dac
+
+#define NV30_3D_PRIMITIVE_RESTART_INDEX 0x00001db0
+
+#define NV30_3D_LINE_STIPPLE_ENABLE 0x00001db4
+
+#define NV30_3D_LINE_STIPPLE_PATTERN 0x00001db8
+#define NV30_3D_LINE_STIPPLE_PATTERN_FACTOR__MASK 0x0000ffff
+#define NV30_3D_LINE_STIPPLE_PATTERN_FACTOR__SHIFT 0
+#define NV30_3D_LINE_STIPPLE_PATTERN_PATTERN__MASK 0xffff0000
+#define NV30_3D_LINE_STIPPLE_PATTERN_PATTERN__SHIFT 16
+
+#define NV30_3D_BACK_MATERIAL_SHININESS(i0) (0x00001e20 + 0x4*(i0))
+#define NV30_3D_BACK_MATERIAL_SHININESS__ESIZE 0x00000004
+#define NV30_3D_BACK_MATERIAL_SHININESS__LEN 0x00000006
+
+#define NV30_3D_VTX_ATTR_1F(i0) (0x00001e40 + 0x4*(i0))
+#define NV30_3D_VTX_ATTR_1F__ESIZE 0x00000004
+#define NV30_3D_VTX_ATTR_1F__LEN 0x00000010
+
+#define NV30_3D_ENGINE 0x00001e94
+#define NV30_3D_ENGINE_FP 0x00000001
+#define NV30_3D_ENGINE_VP 0x00000002
+#define NV30_3D_ENGINE_FIXED 0x00000004
+
+#define NV30_3D_VP_UPLOAD_FROM_ID 0x00001e9c
+
+#define NV30_3D_VP_START_FROM_ID 0x00001ea0
+
+#define NV30_3D_POINT_PARAMETERS(i0) (0x00001ec0 + 0x4*(i0))
+#define NV30_3D_POINT_PARAMETERS__ESIZE 0x00000004
+#define NV30_3D_POINT_PARAMETERS__LEN 0x00000008
+
+#define NV30_3D_POINT_SIZE 0x00001ee0
+
+#define NV30_3D_POINT_PARAMETERS_ENABLE 0x00001ee4
+
+#define NV30_3D_POINT_SPRITE 0x00001ee8
+#define NV30_3D_POINT_SPRITE_ENABLE 0x00000001
+#define NV30_3D_POINT_SPRITE_R_MODE__MASK 0x00000006
+#define NV30_3D_POINT_SPRITE_R_MODE__SHIFT 1
+#define NV30_3D_POINT_SPRITE_R_MODE_ZERO 0x00000000
+#define NV30_3D_POINT_SPRITE_R_MODE_R 0x00000002
+#define NV30_3D_POINT_SPRITE_R_MODE_S 0x00000004
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_0 0x00000100
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_1 0x00000200
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_2 0x00000400
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_3 0x00000800
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_4 0x00001000
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_5 0x00002000
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_6 0x00004000
+#define NV30_3D_POINT_SPRITE_COORD_REPLACE_7 0x00008000
+
+#define NV30_3D_VP_UPLOAD_CONST_ID 0x00001efc
+
+#define NV30_3D_VP_UPLOAD_CONST(i0) (0x00001f00 + 0x10*(i0))
+#define NV30_3D_VP_UPLOAD_CONST__ESIZE 0x00000010
+#define NV30_3D_VP_UPLOAD_CONST__LEN 0x00000004
+
+
+#define NV30_3D_VP_UPLOAD_CONST_X(i0) (0x00001f00 + 0x10*(i0))
+
+#define NV30_3D_VP_UPLOAD_CONST_Y(i0) (0x00001f04 + 0x10*(i0))
+
+#define NV30_3D_VP_UPLOAD_CONST_Z(i0) (0x00001f08 + 0x10*(i0))
+
+#define NV30_3D_VP_UPLOAD_CONST_W(i0) (0x00001f0c + 0x10*(i0))
+
+#define NV30_3D_UNK1F80(i0) (0x00001f80 + 0x4*(i0))
+#define NV30_3D_UNK1F80__ESIZE 0x00000004
+#define NV30_3D_UNK1F80__LEN 0x00000010
+
+#define NV40_3D_TEX_CACHE_CTL 0x00001fd8
+
+#define NV40_3D_VP_ATTRIB_EN 0x00001ff0
+
+#define NV40_3D_VP_RESULT_EN 0x00001ff4
+
+
+#endif /* _HOME_SKEGGSB_GIT_ENVYTOOLS_RNNDB_NV30_40_3D_XML */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
new file mode 100644
index 00000000000..5317e892b25
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
+
+#include "nouveau_gldefs.h"
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+/* Pack a floating-point RGBA colour into a single 32-bit word whose bit
+ * layout matches the given pipe format (delegates to gallium's
+ * util_pack_color helper).
+ */
+static INLINE uint32_t
+pack_rgba(enum pipe_format format, const float *rgba)
+{
+ union util_color uc;
+ util_pack_color(rgba, format, &uc);
+ return uc.ui;
+}
+
+/* Pack a depth/stencil clear value into the 32-bit word the hardware
+ * expects: for everything except Z16 the layout is 24-bit depth in the
+ * high bits with 8-bit stencil in the low byte; for Z16 only the top
+ * 16 bits of the scaled depth are used.
+ */
+static INLINE uint32_t
+pack_zeta(enum pipe_format format, double depth, unsigned stencil)
+{
+ /* scale [0,1] depth to the full 32-bit unsigned range */
+ uint32_t zuint = (uint32_t)(depth * 4294967295.0);
+ if (format != PIPE_FORMAT_Z16_UNORM)
+ return (zuint & 0xffffff00) | (stencil & 0xff);
+ return zuint >> 16;
+}
+
+/* pipe_context::clear() — full-framebuffer clear of the currently-bound
+ * colour/depth/stencil buffers.  Packs the requested clear values, builds
+ * a CLEAR_BUFFERS mode mask, and emits the clear through the 3D engine.
+ */
+static void
+nv30_clear(struct pipe_context *pipe, unsigned buffers,
+ const union pipe_color_union *color, double depth, unsigned stencil)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nv30->framebuffer;
+ uint32_t colr = 0, zeta = 0, mode = 0;
+
+ /* framebuffer state must be valid on the hardware before clearing */
+ if (!nv30_state_validate(nv30, TRUE))
+ return;
+
+ if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
+ /* clear colour is packed in the format of the first colour buffer */
+ colr = pack_rgba(fb->cbufs[0]->format, color->f);
+ mode |= NV30_3D_CLEAR_BUFFERS_COLOR_R |
+ NV30_3D_CLEAR_BUFFERS_COLOR_G |
+ NV30_3D_CLEAR_BUFFERS_COLOR_B |
+ NV30_3D_CLEAR_BUFFERS_COLOR_A;
+ }
+
+ if (fb->zsbuf) {
+ zeta = pack_zeta(fb->zsbuf->format, depth, stencil);
+ if (buffers & PIPE_CLEAR_DEPTH)
+ mode |= NV30_3D_CLEAR_BUFFERS_DEPTH;
+ if (buffers & PIPE_CLEAR_STENCIL)
+ mode |= NV30_3D_CLEAR_BUFFERS_STENCIL;
+ }
+
+ /* The single 3-dword write below covers the consecutive methods
+ * CLEAR_DEPTH_VALUE, CLEAR_COLOR_VALUE and CLEAR_BUFFERS; writing
+ * CLEAR_BUFFERS triggers the actual clear.  On nv3x the sequence is
+ * deliberately emitted twice as a workaround (see XXX below).
+ */
+ /*XXX: wtf? fixes clears sometimes not clearing on nv3x... */
+ if (nv30->screen->eng3d->oclass < NV40_3D_CLASS) {
+ BEGIN_NV04(push, NV30_3D(CLEAR_DEPTH_VALUE), 3);
+ PUSH_DATA (push, zeta);
+ PUSH_DATA (push, colr);
+ PUSH_DATA (push, mode);
+ }
+
+ BEGIN_NV04(push, NV30_3D(CLEAR_DEPTH_VALUE), 3);
+ PUSH_DATA (push, zeta);
+ PUSH_DATA (push, colr);
+ PUSH_DATA (push, mode);
+
+ nv30_state_release(nv30);
+}
+
+/* pipe_context::clear_render_target() — clear a sub-rectangle of an
+ * arbitrary colour surface (not necessarily the bound framebuffer).
+ * Temporarily points RT0 at the target surface, scissors to (x,y,w,h),
+ * then fires a colour clear; framebuffer/scissor state is marked dirty
+ * so normal state validation restores it afterwards.
+ */
+static void
+nv30_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps,
+ const union pipe_color_union *color,
+ unsigned x, unsigned y, unsigned w, unsigned h)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_surface *sf = nv30_surface(ps);
+ struct nv30_miptree *mt = nv30_miptree(ps->texture);
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+ struct nouveau_object *eng3d = nv30->screen->eng3d;
+ struct nouveau_pushbuf_refn refn;
+ uint32_t rt_format;
+
+ /* RT_FORMAT also carries a zeta-format field; pick a depth format
+ * whose block size matches the colour format, presumably to keep the
+ * zeta field consistent even for a colour-only clear — TODO confirm.
+ */
+ rt_format = nv30_format(pipe->screen, ps->format)->hw;
+ if (util_format_get_blocksize(ps->format) == 4)
+ rt_format |= NV30_3D_RT_FORMAT_ZETA_Z24S8;
+ else
+ rt_format |= NV30_3D_RT_FORMAT_ZETA_Z16;
+
+ if (nv30_miptree(ps->texture)->swizzled) {
+ /* swizzled surfaces encode log2 width/height in RT_FORMAT */
+ rt_format |= NV30_3D_RT_FORMAT_TYPE_SWIZZLED;
+ rt_format |= util_logbase2(sf->width) << 16;
+ rt_format |= util_logbase2(sf->height) << 24;
+ } else {
+ rt_format |= NV30_3D_RT_FORMAT_TYPE_LINEAR;
+ }
+
+ /* reserve pushbuf space and reference the target BO up front; on
+ * failure the clear is silently dropped
+ */
+ refn.bo = mt->base.bo;
+ refn.flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+ if (nouveau_pushbuf_space(push, 16, 1, 0) ||
+ nouveau_pushbuf_refn (push, &refn, 1))
+ return;
+
+ BEGIN_NV04(push, NV30_3D(RT_ENABLE), 1);
+ PUSH_DATA (push, NV30_3D_RT_ENABLE_COLOR0);
+ BEGIN_NV04(push, NV30_3D(RT_HORIZ), 3);
+ PUSH_DATA (push, sf->width << 16);
+ PUSH_DATA (push, sf->height << 16);
+ PUSH_DATA (push, rt_format);
+ BEGIN_NV04(push, NV30_3D(COLOR0_PITCH), 2);
+ /* pre-nv40 packs the zeta pitch in the high half of COLOR0_PITCH */
+ if (eng3d->oclass < NV40_3D_CLASS)
+ PUSH_DATA (push, (sf->pitch << 16) | sf->pitch);
+ else
+ PUSH_DATA (push, sf->pitch);
+ PUSH_RELOC(push, mt->base.bo, sf->offset, NOUVEAU_BO_LOW, 0, 0);
+ BEGIN_NV04(push, NV30_3D(SCISSOR_HORIZ), 2);
+ PUSH_DATA (push, (w << 16) | x);
+ PUSH_DATA (push, (h << 16) | y);
+
+ /* writing CLEAR_BUFFERS (the second dword here) triggers the clear */
+ BEGIN_NV04(push, NV30_3D(CLEAR_COLOR_VALUE), 2);
+ PUSH_DATA (push, pack_rgba(ps->format, color->f));
+ PUSH_DATA (push, NV30_3D_CLEAR_BUFFERS_COLOR_R |
+ NV30_3D_CLEAR_BUFFERS_COLOR_G |
+ NV30_3D_CLEAR_BUFFERS_COLOR_B |
+ NV30_3D_CLEAR_BUFFERS_COLOR_A);
+
+ /* we clobbered RT/scissor state behind the state tracker's back */
+ nv30->dirty |= NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR;
+}
+
+/* pipe_context::clear_depth_stencil() — clear a sub-rectangle of an
+ * arbitrary depth/stencil surface.  Mirror image of
+ * nv30_clear_render_target(): binds the surface as the zeta buffer with
+ * colour rendering disabled, scissors, then fires a depth/stencil clear.
+ */
+static void
+nv30_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps,
+ unsigned buffers, double depth, unsigned stencil,
+ unsigned x, unsigned y, unsigned w, unsigned h)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_surface *sf = nv30_surface(ps);
+ struct nv30_miptree *mt = nv30_miptree(ps->texture);
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+ struct nouveau_object *eng3d = nv30->screen->eng3d;
+ struct nouveau_pushbuf_refn refn;
+ uint32_t rt_format, mode = 0;
+
+ /* RT_FORMAT also carries a colour-format field; pick a colour format
+ * with the same block size as the zeta format, presumably so the
+ * colour field stays consistent for a zeta-only clear — TODO confirm.
+ */
+ rt_format = nv30_format(pipe->screen, ps->format)->hw;
+ if (util_format_get_blocksize(ps->format) == 4)
+ rt_format |= NV30_3D_RT_FORMAT_COLOR_A8R8G8B8;
+ else
+ rt_format |= NV30_3D_RT_FORMAT_COLOR_R5G6B5;
+
+ if (nv30_miptree(ps->texture)->swizzled) {
+ /* swizzled surfaces encode log2 width/height in RT_FORMAT */
+ rt_format |= NV30_3D_RT_FORMAT_TYPE_SWIZZLED;
+ rt_format |= util_logbase2(sf->width) << 16;
+ rt_format |= util_logbase2(sf->height) << 24;
+ } else {
+ rt_format |= NV30_3D_RT_FORMAT_TYPE_LINEAR;
+ }
+
+ if (buffers & PIPE_CLEAR_DEPTH)
+ mode |= NV30_3D_CLEAR_BUFFERS_DEPTH;
+ if (buffers & PIPE_CLEAR_STENCIL)
+ mode |= NV30_3D_CLEAR_BUFFERS_STENCIL;
+
+ /* reserve pushbuf space and reference the target BO up front; on
+ * failure the clear is silently dropped
+ */
+ refn.bo = mt->base.bo;
+ refn.flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+ if (nouveau_pushbuf_space(push, 32, 1, 0) ||
+ nouveau_pushbuf_refn (push, &refn, 1))
+ return;
+
+ /* disable colour targets; only the zeta buffer is written */
+ BEGIN_NV04(push, NV30_3D(RT_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV30_3D(RT_HORIZ), 3);
+ PUSH_DATA (push, sf->width << 16);
+ PUSH_DATA (push, sf->height << 16);
+ PUSH_DATA (push, rt_format);
+ /* pre-nv40 has no separate ZETA_PITCH; the zeta pitch lives in the
+ * high half of COLOR0_PITCH
+ */
+ if (eng3d->oclass < NV40_3D_CLASS) {
+ BEGIN_NV04(push, NV30_3D(COLOR0_PITCH), 1);
+ PUSH_DATA (push, (sf->pitch << 16) | sf->pitch);
+ } else {
+ BEGIN_NV04(push, NV40_3D(ZETA_PITCH), 1);
+ PUSH_DATA (push, sf->pitch);
+ }
+ BEGIN_NV04(push, NV30_3D(ZETA_OFFSET), 1);
+ PUSH_RELOC(push, mt->base.bo, sf->offset, NOUVEAU_BO_LOW, 0, 0);
+ BEGIN_NV04(push, NV30_3D(SCISSOR_HORIZ), 2);
+ PUSH_DATA (push, (w << 16) | x);
+ PUSH_DATA (push, (h << 16) | y);
+
+ BEGIN_NV04(push, NV30_3D(CLEAR_DEPTH_VALUE), 1);
+ PUSH_DATA (push, pack_zeta(ps->format, depth, stencil));
+ /* writing CLEAR_BUFFERS triggers the clear */
+ BEGIN_NV04(push, NV30_3D(CLEAR_BUFFERS), 1);
+ PUSH_DATA (push, mode);
+
+ /* we clobbered RT/scissor state behind the state tracker's back */
+ nv30->dirty |= NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR;
+}
+
+/* Hook the clear entry points into the context's pipe_context vtable. */
+void
+nv30_clear_init(struct pipe_context *pipe)
+{
+ pipe->clear = nv30_clear;
+ pipe->clear_render_target = nv30_clear_render_target;
+ pipe->clear_depth_stencil = nv30_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
new file mode 100644
index 00000000000..2146d2726b4
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "draw/draw_context.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+
+#include "nouveau_fence.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_transfer.h"
+#include "nv30/nv30_state.h"
+
+/* Pushbuf kick callback: invoked when the pushbuf is flushed to the
+ * kernel.  Advances to a fresh fence and tags every buffer referenced by
+ * the flushed pushbuf with that fence so later CPU accesses can
+ * synchronise correctly.
+ */
+static void
+nv30_context_kick_notify(struct nouveau_pushbuf *push)
+{
+ struct nouveau_screen *screen;
+ struct nv30_context *nv30;
+
+ /* user_priv points at nv30->bufctx (set in nv30_context_create);
+ * recover the owning context from it
+ */
+ if (!push->user_priv)
+ return;
+ nv30 = container_of(push->user_priv, nv30, bufctx);
+ screen = &nv30->screen->base;
+
+ nouveau_fence_next(screen);
+ nouveau_fence_update(screen, TRUE);
+
+ if (push->bufctx) {
+ struct nouveau_bufref *bref;
+ /* fence every managed resource referenced by this submission */
+ LIST_FOR_EACH_ENTRY(bref, &push->bufctx->current, thead) {
+ struct nv04_resource *res = bref->priv;
+ if (res && res->mm) {
+ nouveau_fence_ref(screen->fence.current, &res->fence);
+
+ if (bref->flags & NOUVEAU_BO_RD)
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ if (bref->flags & NOUVEAU_BO_WR) {
+ /* writes also get a write fence and dirty the buffer */
+ nouveau_fence_ref(screen->fence.current, &res->fence_wr);
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING |
+ NOUVEAU_BUFFER_STATUS_DIRTY;
+ }
+ }
+ }
+ }
+}
+
+/* pipe_context::flush() — kick the pushbuf to the kernel and, if the
+ * caller asked for one, hand back a reference to the current fence so it
+ * can wait for completion.
+ */
+static void
+nv30_context_flush(struct pipe_context *pipe, struct pipe_fence_handle **fence,
+ unsigned flags)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+ if (fence)
+ nouveau_fence_ref(nv30->screen->base.fence.current,
+ (struct nouveau_fence **)fence);
+
+ PUSH_KICK(push);
+
+ nouveau_context_update_frame_stats(&nv30->base);
+}
+
+/* Called when a resource's backing storage is about to be replaced.
+ * Scans every binding point that might reference 'res' (render targets,
+ * zeta, vertex/index buffers, fragment/vertex textures), marks the
+ * matching state dirty and drops the stale bufctx references.  'ref' is
+ * a countdown of references to find; returns early once it hits zero,
+ * otherwise returns the remaining count.
+ */
+static int
+nv30_invalidate_resource_storage(struct nouveau_context *nv,
+ struct pipe_resource *res,
+ int ref)
+{
+ struct nv30_context *nv30 = nv30_context(&nv->pipe);
+ unsigned i;
+
+ if (res->bind & PIPE_BIND_RENDER_TARGET) {
+ for (i = 0; i < nv30->framebuffer.nr_cbufs; ++i) {
+ if (nv30->framebuffer.cbufs[i] &&
+ nv30->framebuffer.cbufs[i]->texture == res) {
+ nv30->dirty |= NV30_NEW_FRAMEBUFFER;
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FB);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (nv30->framebuffer.zsbuf &&
+ nv30->framebuffer.zsbuf->texture == res) {
+ nv30->dirty |= NV30_NEW_FRAMEBUFFER;
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FB);
+ if (!--ref)
+ return ref;
+ }
+ }
+
+ if (res->bind & PIPE_BIND_VERTEX_BUFFER) {
+ for (i = 0; i < nv30->num_vtxbufs; ++i) {
+ if (nv30->vtxbuf[i].buffer == res) {
+ nv30->dirty |= NV30_NEW_ARRAYS;
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXBUF);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ if (res->bind & PIPE_BIND_INDEX_BUFFER) {
+ /* note: no dirty flag here — the index buffer is re-referenced at
+ * draw time, only the bufctx entry needs dropping
+ */
+ if (nv30->idxbuf.buffer == res) {
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_IDXBUF);
+ if (!--ref)
+ return ref;
+ }
+ }
+
+ if (res->bind & PIPE_BIND_SAMPLER_VIEW) {
+ for (i = 0; i < nv30->fragprog.num_textures; ++i) {
+ if (nv30->fragprog.textures[i] &&
+ nv30->fragprog.textures[i]->texture == res) {
+ nv30->dirty |= NV30_NEW_FRAGTEX;
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FRAGTEX(i));
+ if (!--ref)
+ return ref;
+ }
+ }
+ for (i = 0; i < nv30->vertprog.num_textures; ++i) {
+ if (nv30->vertprog.textures[i] &&
+ nv30->vertprog.textures[i]->texture == res) {
+ nv30->dirty |= NV30_NEW_VERTTEX;
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VERTTEX(i));
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+
+ return ref;
+}
+
+/* pipe_context::destroy() — tear down the context.  Also used as the
+ * error path of nv30_context_create(), so every member is guarded
+ * against being NULL/partially initialised.
+ */
+static void
+nv30_context_destroy(struct pipe_context *pipe)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+
+ if (nv30->blitter)
+ util_blitter_destroy(nv30->blitter);
+
+ if (nv30->draw)
+ draw_destroy(nv30->draw);
+
+ nouveau_bufctx_del(&nv30->bufctx);
+
+ /* don't leave the screen pointing at a dead context */
+ if (nv30->screen->cur_ctx == nv30)
+ nv30->screen->cur_ctx = NULL;
+
+ nouveau_context_destroy(&nv30->base);
+}
+
+/* Convenience error-exit macro for nv30_context_create(): log the error,
+ * destroy the half-built context and bail out.  NOTE(review): not used by
+ * the visible code in this file — confirm whether it is still needed.
+ */
+#define FAIL_CONTEXT_INIT(str, err) \
+ do { \
+ NOUVEAU_ERR(str, err); \
+ nv30_context_destroy(pipe); \
+ return NULL; \
+ } while(0)
+
+/* Create a gallium context for an nv3x/nv4x screen: wires up the base
+ * nouveau context, bufctx, pushbuf callbacks and all the per-feature init
+ * routines.  Returns NULL on failure (partially-built state is torn down
+ * via nv30_context_destroy()).
+ */
+struct pipe_context *
+nv30_context_create(struct pipe_screen *pscreen, void *priv)
+{
+ struct nv30_screen *screen = nv30_screen(pscreen);
+ struct nv30_context *nv30 = CALLOC_STRUCT(nv30_context);
+ struct nouveau_pushbuf *push;
+ struct pipe_context *pipe;
+ int ret;
+
+ if (!nv30)
+ return NULL;
+
+ nv30->screen = screen;
+ nv30->base.screen = &screen->base;
+ nv30->base.copy_data = nv30_transfer_copy_data;
+
+ pipe = &nv30->base.pipe;
+ pipe->screen = pscreen;
+ pipe->priv = priv;
+ pipe->destroy = nv30_context_destroy;
+ pipe->flush = nv30_context_flush;
+
+ /*XXX: *cough* per-context client */
+ nv30->base.client = screen->base.client;
+
+ /*XXX: *cough* per-context pushbufs */
+ push = screen->base.pushbuf;
+ nv30->base.pushbuf = push;
+ /* kick_notify recovers the context from user_priv via container_of */
+ nv30->base.pushbuf->user_priv = &nv30->bufctx; /* hack at validate time */
+ nv30->base.pushbuf->rsvd_kick = 16; /* hack in screen before first space */
+ nv30->base.pushbuf->kick_notify = nv30_context_kick_notify;
+
+ nv30->base.invalidate_resource_storage = nv30_invalidate_resource_storage;
+
+ ret = nouveau_bufctx_new(nv30->base.client, 64, &nv30->bufctx);
+ if (ret) {
+ nv30_context_destroy(pipe);
+ return NULL;
+ }
+
+ /*XXX: make configurable with performance vs quality, these defaults
+ * match the binary driver's defaults
+ */
+ if (screen->eng3d->oclass < NV40_3D_CLASS)
+ nv30->config.filter = 0x00000004;
+ else
+ nv30->config.filter = 0x00002dc4;
+
+ nv30->config.aniso = NV40_3D_TEX_WRAP_ANISO_MIP_FILTER_OPTIMIZATION_OFF;
+
+ /* env var forces the software transform/lighting path */
+ if (debug_get_bool_option("NV30_SWTNL", FALSE))
+ nv30->draw_flags |= NV30_NEW_SWTNL;
+
+ /*XXX: nvfx... */
+ nv30->is_nv4x = (screen->eng3d->oclass >= NV40_3D_CLASS) ? ~0 : 0;
+ nv30->use_nv4x = (screen->eng3d->oclass >= NV40_3D_CLASS) ? ~0 : 0;
+ nv30->render_mode = HW;
+
+ nv30->sample_mask = 0xffff;
+ nv30_vbo_init(pipe);
+ nv30_query_init(pipe);
+ nv30_state_init(pipe);
+ nv30_resource_init(pipe);
+ nv30_clear_init(pipe);
+ nv30_fragprog_init(pipe);
+ nv30_vertprog_init(pipe);
+ nv30_texture_init(pipe);
+ nv30_fragtex_init(pipe);
+ nv40_verttex_init(pipe);
+ nv30_draw_init(pipe);
+
+ nv30->blitter = util_blitter_create(pipe);
+ if (!nv30->blitter) {
+ nv30_context_destroy(pipe);
+ return NULL;
+ }
+
+ nouveau_context_init_vdec(&nv30->base);
+
+ return pipe;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
new file mode 100644
index 00000000000..12ae0c71e4f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -0,0 +1,237 @@
+#ifndef __NV30_CONTEXT_H__
+#define __NV30_CONTEXT_H__
+
+#include "pipe/p_format.h"
+#include "util/u_blitter.h"
+
+#include "nv30/nv30_screen.h"
+#include "nv30/nv30_state.h"
+
+#include "nouveau_context.h"
+
+#define BUFCTX_FB 0
+#define BUFCTX_VTXTMP 1
+#define BUFCTX_VTXBUF 2
+#define BUFCTX_IDXBUF 3
+#define BUFCTX_VERTTEX(n) (4 + (n))
+#define BUFCTX_FRAGPROG 8
+#define BUFCTX_FRAGTEX(n) (9 + (n))
+
+#define NV30_NEW_BLEND (1 << 0)
+#define NV30_NEW_RASTERIZER (1 << 1)
+#define NV30_NEW_ZSA (1 << 2)
+#define NV30_NEW_VERTPROG (1 << 3)
+#define NV30_NEW_VERTCONST (1 << 4)
+#define NV30_NEW_FRAGPROG (1 << 5)
+#define NV30_NEW_FRAGCONST (1 << 6)
+#define NV30_NEW_BLEND_COLOUR (1 << 7)
+#define NV30_NEW_STENCIL_REF (1 << 8)
+#define NV30_NEW_CLIP (1 << 9)
+#define NV30_NEW_SAMPLE_MASK (1 << 10)
+#define NV30_NEW_FRAMEBUFFER (1 << 11)
+#define NV30_NEW_STIPPLE (1 << 12)
+#define NV30_NEW_SCISSOR (1 << 13)
+#define NV30_NEW_VIEWPORT (1 << 14)
+#define NV30_NEW_ARRAYS (1 << 15)
+#define NV30_NEW_VERTEX (1 << 16)
+#define NV30_NEW_CONSTBUF (1 << 17)
+#define NV30_NEW_FRAGTEX (1 << 18)
+#define NV30_NEW_VERTTEX (1 << 19)
+#define NV30_NEW_SWTNL (1 << 31)
+#define NV30_NEW_ALL 0x000fffff
+
+/* Per-context state for the nv30/nv40 gallium driver.  Wraps the shared
+ * nouveau_context and tracks CSO bindings, dirty bits, and the state of
+ * the software-TNL (draw module) fallback path.
+ */
+struct nv30_context {
+   struct nouveau_context base;
+   struct nv30_screen *screen;
+   struct blitter_context *blitter;
+
+   struct nouveau_bufctx *bufctx;
+
+   /* snapshot of derived state last programmed into the hardware */
+   struct {
+      unsigned rt_enable;
+      unsigned scissor_off;
+      unsigned num_vtxelts;
+      boolean prim_restart;
+      struct nv30_fragprog *fragprog;
+   } state;
+
+   uint32_t dirty;   /* NV30_NEW_* bits pending hardware validation */
+
+   /* software-TNL fallback (gallium draw module) */
+   struct draw_context *draw;
+   uint32_t draw_flags;   /* NV30_NEW_* bits that force the SWTNL path */
+   uint32_t draw_dirty;   /* NV30_NEW_* bits pending draw-module update */
+
+   /* bound CSOs */
+   struct nv30_blend_stateobj *blend;
+   struct nv30_rasterizer_stateobj *rast;
+   struct nv30_zsa_stateobj *zsa;
+   struct nv30_vertex_stateobj *vertex;
+
+   /* tunables set once at context creation (see nv30_context_create) */
+   struct {
+      unsigned filter;
+      unsigned aniso;
+   } config;
+
+   struct {
+      struct nv30_vertprog *program;
+
+      struct pipe_resource *constbuf;
+      unsigned constbuf_nr;
+
+      struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
+      unsigned num_textures;
+      struct nv30_sampler_state *samplers[PIPE_MAX_SAMPLERS];
+      unsigned num_samplers;
+      unsigned dirty_samplers;
+   } vertprog;
+
+   struct {
+      struct nv30_fragprog *program;
+
+      struct pipe_resource *constbuf;
+      unsigned constbuf_nr;
+
+      struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
+      unsigned num_textures;
+      struct nv30_sampler_state *samplers[PIPE_MAX_SAMPLERS];
+      unsigned num_samplers;
+      unsigned dirty_samplers;
+   } fragprog;
+
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_blend_color blend_colour;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_poly_stipple stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_viewport_state viewport;
+   struct pipe_clip_state clip;
+
+   unsigned sample_mask;
+
+   /* vertex/index buffer bindings */
+   struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+   unsigned num_vtxbufs;
+   struct pipe_index_buffer idxbuf;
+   uint32_t vbo_fifo;
+   uint32_t vbo_user;
+   unsigned vbo_min_index;
+   unsigned vbo_max_index;
+   boolean vbo_push_hint;
+
+   /* resources used by the blit paths */
+   struct nouveau_heap *blit_vp;
+   struct pipe_resource *blit_fp;
+
+   /*XXX: nvfx state, DO NOT USE EVER OUTSIDE "STOLEN" NVFX code */
+   unsigned is_nv4x;
+   unsigned use_nv4x;
+   bool hw_pointsprite_control;
+   enum {
+      HW,
+   } render_mode;
+
+   /* conditional rendering state */
+   struct pipe_query *render_cond_query;
+   unsigned render_cond_mode;
+   boolean render_cond_cond;
+};
+
+/* Downcast a pipe_context to the driver's nv30_context (valid because
+ * nv30_context embeds nouveau_context/pipe_context as its first member). */
+static INLINE struct nv30_context *
+nv30_context(struct pipe_context *pipe)
+{
+   return (struct nv30_context *)pipe;
+}
+
+struct pipe_context *
+nv30_context_create(struct pipe_screen *pscreen, void *priv);
+
+void
+nv30_vbo_init(struct pipe_context *pipe);
+
+void
+nv30_vbo_validate(struct nv30_context *nv30);
+
+void
+nv30_query_init(struct pipe_context *pipe);
+
+void
+nv30_state_init(struct pipe_context *pipe);
+
+void
+nv30_clear_init(struct pipe_context *pipe);
+
+void
+nv30_vertprog_init(struct pipe_context *pipe);
+
+void
+nv30_vertprog_validate(struct nv30_context *nv30);
+
+void
+nv30_fragprog_init(struct pipe_context *pipe);
+
+void
+nv30_fragprog_validate(struct nv30_context *nv30);
+
+void
+nv30_texture_init(struct pipe_context *pipe);
+
+void
+nv30_texture_validate(struct nv30_context *nv30);
+
+void
+nv30_fragtex_init(struct pipe_context *pipe);
+
+void
+nv30_fragtex_validate(struct nv30_context *nv30);
+
+void
+nv40_verttex_init(struct pipe_context *pipe);
+
+void
+nv40_verttex_validate(struct nv30_context *nv30);
+
+void
+nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info);
+
+void
+nv30_draw_init(struct pipe_context *pipe);
+
+void
+nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
+
+boolean
+nv30_state_validate(struct nv30_context *nv30, boolean hwtnl);
+
+void
+nv30_state_release(struct nv30_context *nv30);
+
+//XXX: needed to make it build, clean this up!
+void
+_nvfx_fragprog_translate(struct nv30_context *nvfx, struct nv30_fragprog *fp,
+ boolean emulate_sprite_flipping);
+
+boolean
+_nvfx_vertprog_translate(struct nv30_context *nv30, struct nv30_vertprog *vp);
+
+#ifdef NV30_3D_VERTEX_BEGIN_END
+#define NV30_PRIM_GL_CASE(n) \
+ case PIPE_PRIM_##n: return NV30_3D_VERTEX_BEGIN_END_##n
+
+/* Translate a PIPE_PRIM_* topology to its NV30_3D_VERTEX_BEGIN_END_*
+ * hardware value.  Unknown/unsupported topologies fall back to POINTS.
+ */
+static INLINE unsigned
+nv30_prim_gl(unsigned prim)
+{
+   switch (prim) {
+   NV30_PRIM_GL_CASE(POINTS);
+   NV30_PRIM_GL_CASE(LINES);
+   NV30_PRIM_GL_CASE(LINE_LOOP);
+   NV30_PRIM_GL_CASE(LINE_STRIP);
+   NV30_PRIM_GL_CASE(TRIANGLES);
+   NV30_PRIM_GL_CASE(TRIANGLE_STRIP);
+   NV30_PRIM_GL_CASE(TRIANGLE_FAN);
+   NV30_PRIM_GL_CASE(QUADS);
+   NV30_PRIM_GL_CASE(QUAD_STRIP);
+   NV30_PRIM_GL_CASE(POLYGON);
+   default:
+      /* unreachable break after return removed; every case returns */
+      return NV30_3D_VERTEX_BEGIN_END_POINTS;
+   }
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
new file mode 100644
index 00000000000..3575c3d29fa
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_private.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+/* vbuf_render backend handed to the gallium draw module for the
+ * software-TNL fallback.  Owns a linearly sub-allocated scratch vertex
+ * buffer and a tiny pass-through vertex program. */
+struct nv30_render {
+   struct vbuf_render base;
+   struct nv30_context *nv30;
+
+   /* scratch vertex buffer, allocated lazily, consumed front-to-back */
+   struct pipe_transfer *transfer;
+   struct pipe_resource *buffer;
+   unsigned offset;   /* next free byte in 'buffer' */
+   unsigned length;   /* size of the most recent reservation */
+
+   struct vertex_info vertex_info;
+
+   /* pass-through vertex program: one instruction per routed attribute */
+   struct nouveau_heap *vertprog;
+   uint32_t vtxprog[16][4];   /* raw vp instruction words */
+   uint32_t vtxfmt[16];       /* per-attrib VTXFMT register values */
+   uint32_t vtxptr[16];       /* per-attrib offset into scratch buffer */
+   uint32_t prim;             /* hardware primitive (nv30_prim_gl) */
+};
+
+/* Downcast a vbuf_render to the driver's nv30_render wrapper. */
+static INLINE struct nv30_render *
+nv30_render(struct vbuf_render *render)
+{
+   return (struct nv30_render *)render;
+}
+
+/* vbuf_render::get_vertex_info — expose the vertex layout computed by
+ * nv30_render_validate() to the draw module. */
+static const struct vertex_info *
+nv30_render_get_vertex_info(struct vbuf_render *render)
+{
+   return &nv30_render(render)->vertex_info;
+}
+
+/* vbuf_render::allocate_vertices — reserve vertex_size * nr_vertices
+ * bytes in the scratch buffer.  When the current buffer cannot fit the
+ * request, it is orphaned (any in-flight pushbuf references keep it
+ * alive) and a fresh one is allocated. */
+static boolean
+nv30_render_allocate_vertices(struct vbuf_render *render,
+                              ushort vertex_size, ushort nr_vertices)
+{
+   struct nv30_render *r = nv30_render(render);
+   struct nv30_context *nv30 = r->nv30;
+
+   r->length = vertex_size * nr_vertices;
+
+   if (r->offset + r->length >= render->max_vertex_buffer_bytes) {
+      pipe_resource_reference(&r->buffer, NULL);
+      r->buffer = pipe_buffer_create(&nv30->screen->base.base,
+                                     PIPE_BIND_VERTEX_BUFFER, 0,
+                                     render->max_vertex_buffer_bytes);
+      if (!r->buffer)
+         return FALSE;
+
+      r->offset = 0;
+   }
+
+   return TRUE;
+}
+
+/* vbuf_render::map_vertices — map the scratch buffer and return a
+ * pointer to the region reserved by allocate_vertices().  Unsynchronized
+ * is safe because the buffer is consumed linearly and never rewritten. */
+static void *
+nv30_render_map_vertices(struct vbuf_render *render)
+{
+   struct nv30_render *r = nv30_render(render);
+   char *map = pipe_buffer_map(&r->nv30->base.pipe, r->buffer,
+                               PIPE_TRANSFER_WRITE |
+                               PIPE_TRANSFER_UNSYNCHRONIZED, &r->transfer);
+   return map + r->offset;
+}
+
+/* vbuf_render::unmap_vertices — release the mapping made by
+ * nv30_render_map_vertices(); min/max indices are not needed here. */
+static void
+nv30_render_unmap_vertices(struct vbuf_render *render,
+                           ushort min_index, ushort max_index)
+{
+   struct nv30_render *r = nv30_render(render);
+   pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer);
+}
+
+/* vbuf_render::set_primitive — cache the hardware primitive value for
+ * the subsequent draw_elements/draw_arrays calls. */
+static void
+nv30_render_set_primitive(struct vbuf_render *render, unsigned prim)
+{
+   struct nv30_render *r = nv30_render(render);
+
+   r->prim = nv30_prim_gl(prim);
+}
+
+/* vbuf_render::draw_elements — emit an indexed draw of the vertices the
+ * draw module wrote into the scratch buffer.  Indices are pushed inline
+ * through VB_ELEMENT_U16/U32 methods (two 16-bit indices per word).
+ * NOTE(review): uses nv30->screen->base.pushbuf while draw_arrays uses
+ * nv30->base.pushbuf — same object today (see context creation), but
+ * worth unifying. */
+static void
+nv30_render_draw_elements(struct vbuf_render *render,
+                          const ushort *indices, uint count)
+{
+   struct nv30_render *r = nv30_render(render);
+   struct nv30_context *nv30 = r->nv30;
+   struct nouveau_pushbuf *push = nv30->screen->base.pushbuf;
+   unsigned i;
+
+   /* point the hardware vertex fetch at our scratch buffer */
+   BEGIN_NV04(push, NV30_3D(VTXBUF(0)), r->vertex_info.num_attribs);
+   for (i = 0; i < r->vertex_info.num_attribs; i++) {
+      PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
+                 nv04_resource(r->buffer), r->offset + r->vtxptr[i],
+                 NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+   }
+
+   if (!nv30_state_validate(nv30, FALSE))
+      return;
+
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, r->prim);
+
+   /* odd leading index goes out alone as a 32-bit element */
+   if (count & 1) {
+      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *indices++);
+   }
+
+   /* remaining indices packed two per word */
+   count >>= 1;
+   while (count) {
+      unsigned npush = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+      count -= npush;
+
+      BEGIN_NI04(push, NV30_3D(VB_ELEMENT_U16), npush);
+      while (npush--) {
+         PUSH_DATA(push, (indices[1] << 16) | indices[0]);
+         indices += 2;
+      }
+   }
+
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+   PUSH_RESET(push, BUFCTX_VTXTMP);
+}
+
+/* vbuf_render::draw_arrays — emit a non-indexed draw of the vertices in
+ * the scratch buffer.  VB_VERTEX_BATCH encodes (count-1)<<24 | start, so
+ * the range is split into full 256-vertex batches plus one partial. */
+static void
+nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
+{
+   struct nv30_render *r = nv30_render(render);
+   struct nv30_context *nv30 = r->nv30;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   unsigned fn = nr >> 8, pn = nr & 0xff;   /* full batches / partial count */
+   unsigned ps = fn + (pn ? 1 : 0);         /* total batch words to push */
+   unsigned i;
+
+   /* point the hardware vertex fetch at our scratch buffer */
+   BEGIN_NV04(push, NV30_3D(VTXBUF(0)), r->vertex_info.num_attribs);
+   for (i = 0; i < r->vertex_info.num_attribs; i++) {
+      PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
+                 nv04_resource(r->buffer), r->offset + r->vtxptr[i],
+                 NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+   }
+
+   if (!nv30_state_validate(nv30, FALSE))
+      return;
+
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, r->prim);
+
+   BEGIN_NI04(push, NV30_3D(VB_VERTEX_BATCH), ps);
+   while (fn--) {
+      /* 0xff000000 == (256 - 1) << 24: a full 256-vertex batch */
+      PUSH_DATA (push, 0xff000000 | start);
+      start += 256;
+   }
+
+   if (pn)
+      PUSH_DATA (push, ((pn - 1) << 24) | start);
+
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+   PUSH_RESET(push, BUFCTX_VTXTMP);
+}
+
+/* vbuf_render::release_vertices — advance the linear allocator past the
+ * vertices just drawn; the space is never reused within this buffer. */
+static void
+nv30_render_release_vertices(struct vbuf_render *render)
+{
+   struct nv30_render *r = nv30_render(render);
+   r->offset += r->length;
+}
+
+/* Routing table for vertex program outputs, indexed by TGSI semantic:
+ *   emit   - draw module emit mode for the attribute
+ *   interp - draw module interpolation mode
+ *   vp30   - nv30 vp output register base for this semantic
+ *   vp40   - nv40 vp output register base for this semantic
+ *   ow40   - nv40 VP_RESULT_EN bit(s), shifted by semantic index
+ */
+static const struct {
+   unsigned emit;
+   unsigned interp;
+   unsigned vp30;
+   unsigned vp40;
+   unsigned ow40;
+} vroute [] = {
+   [TGSI_SEMANTIC_POSITION] = { EMIT_4F, INTERP_PERSPECTIVE, 0, 0, 0x00000000 },
+   [TGSI_SEMANTIC_COLOR   ] = { EMIT_4F, INTERP_LINEAR     , 3, 1, 0x00000001 },
+   [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
+   [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
+   [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
+   [TGSI_SEMANTIC_GENERIC ] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }
+};
+
+/* Route one vertex program output (semantic 'sem', semantic index *idx)
+ * to hardware attribute slot 'attrib'.  On success, records the draw
+ * emit mode, builds the pass-through vp instruction for the slot, and
+ * rewrites *idx to the vp output-enable mask contribution.  Returns
+ * FALSE when the output is unused and should be omitted.
+ *
+ * NOTE(review): vroute[] is indexed directly by 'sem'; semantics larger
+ * than TGSI_SEMANTIC_GENERIC would read past the table — confirm callers
+ * only pass the semantics listed above.
+ */
+static boolean
+vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
+{
+   struct pipe_screen *pscreen = &r->nv30->screen->base.base;
+   struct nv30_fragprog *fp = r->nv30->fragprog.program;
+   struct vertex_info *vinfo = &r->vertex_info;
+   enum pipe_format format;
+   uint emit = EMIT_OMIT;
+   uint result = *idx;
+
+   if (sem == TGSI_SEMANTIC_GENERIC && result >= 8) {
+      /* high generics: only emit if the fragprog consumes this texcoord */
+      for (result = 0; result < 8; result++) {
+         if (fp->texcoord[result] == *idx) {
+            emit = vroute[sem].emit;
+            break;
+         }
+      }
+   } else {
+      emit = vroute[sem].emit;
+   }
+
+   if (emit == EMIT_OMIT)
+      return FALSE;
+
+   draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
+   format = draw_translate_vinfo_format(emit);
+
+   r->vtxfmt[attrib] = nv30_vtxfmt(pscreen, format)->hw;
+   r->vtxptr[attrib] = vinfo->size | NV30_3D_VTXBUF_DMA1;
+   vinfo->size += draw_translate_vinfo_size(emit);
+
+   /* hand-assembled MOV from input 'attrib' to the semantic's output reg */
+   if (nv30_screen(pscreen)->eng3d->oclass < NV40_3D_CLASS) {
+      r->vtxprog[attrib][0] = 0x001f38d8;
+      r->vtxprog[attrib][1] = 0x0080001b | (attrib << 9);
+      r->vtxprog[attrib][2] = 0x0836106c;
+      r->vtxprog[attrib][3] = 0x2000f800 | (result + vroute[sem].vp30) << 2;
+   } else {
+      r->vtxprog[attrib][0] = 0x401f9c6c;
+      r->vtxprog[attrib][1] = 0x0040000d | (attrib << 8);
+      r->vtxprog[attrib][2] = 0x8106c083;
+      r->vtxprog[attrib][3] = 0x6041ff80 | (result + vroute[sem].vp40) << 2;
+   }
+
+   *idx = vroute[sem].ow40 << result;
+   return TRUE;
+}
+
+/* Build and upload the pass-through vertex program and vertex layout for
+ * the SWTNL path: routes every vp output (plus replaced point coords)
+ * through vroute_add(), uploads the generated vp, and programs an
+ * identity viewport so positions pass through untransformed. */
+static boolean
+nv30_render_validate(struct nv30_context *nv30)
+{
+   struct nv30_render *r = nv30_render(nv30->draw->render);
+   struct nv30_rasterizer_stateobj *rast = nv30->rast;
+   struct pipe_screen *pscreen = &nv30->screen->base.base;
+   struct nouveau_pushbuf *push = nv30->screen->base.pushbuf;
+   struct nouveau_object *eng3d = nv30->screen->eng3d;
+   struct nv30_vertprog *vp = nv30->vertprog.program;
+   struct vertex_info *vinfo = &r->vertex_info;
+   unsigned vp_attribs = 0;
+   unsigned vp_results = 0;
+   unsigned attrib = 0;
+   unsigned pntc;
+   int i;
+
+   /* grab 16 slots of vp exec space, evicting other programs if needed */
+   if (!r->vertprog) {
+      struct nouveau_heap *heap = nv30_screen(pscreen)->vp_exec_heap;
+      if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog)) {
+         while (heap->next && heap->size < 16) {
+            struct nouveau_heap **evict = heap->next->priv;
+            nouveau_heap_free(evict);
+         }
+
+         if (nouveau_heap_alloc(heap, 16, &r->vertprog, &r->vertprog))
+            return FALSE;
+      }
+   }
+
+   vinfo->num_attribs = 0;
+   vinfo->size = 0;
+
+   /* setup routing for all necessary vp outputs */
+   for (i = 0; i < vp->info.num_outputs && attrib < 16; i++) {
+      uint semantic = vp->info.output_semantic_name[i];
+      uint index = vp->info.output_semantic_index[i];
+      if (vroute_add(r, attrib, semantic, &index)) {
+         vp_attribs |= (1 << attrib++);
+         vp_results |= index;
+      }
+   }
+
+   /* setup routing for replaced point coords not written by vp */
+   if (rast && rast->pipe.point_quad_rasterization)
+      pntc = rast->pipe.sprite_coord_enable & 0x000002ff;
+   else
+      pntc = 0;
+
+   while (pntc && attrib < 16) {
+      uint index = ffs(pntc) - 1; pntc &= ~(1 << index);
+      if (vroute_add(r, attrib, TGSI_SEMANTIC_GENERIC, &index)) {
+         vp_attribs |= (1 << attrib++);
+         vp_results |= index;
+      }
+   }
+
+   /* modify vertex format for correct stride, and stub out unused ones */
+   BEGIN_NV04(push, NV30_3D(VP_UPLOAD_FROM_ID), 1);
+   PUSH_DATA (push, r->vertprog->start);
+   /* mark the final instruction (END bit).
+    * NOTE(review): attrib == 0 would index vtxprog[-1] — presumably at
+    * least one output (POSITION) is always routed; confirm. */
+   r->vtxprog[attrib - 1][3] |= 1;
+   for (i = 0; i < attrib; i++) {
+      BEGIN_NV04(push, NV30_3D(VP_UPLOAD_INST(0)), 4);
+      PUSH_DATAp(push, r->vtxprog[i], 4);
+      r->vtxfmt[i] |= vinfo->size << 8;   /* final stride now known */
+   }
+   for (; i < 16; i++)
+      r->vtxfmt[i] = NV30_3D_VTXFMT_TYPE_V32_FLOAT;
+
+   /* identity viewport/depth range: draw module already transformed */
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_TRANSLATE_X), 8);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   BEGIN_NV04(push, NV30_3D(DEPTH_RANGE_NEAR), 2);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 1.0);
+
+   BEGIN_NV04(push, NV30_3D(VTXFMT(0)), 16);
+   PUSH_DATAp(push, r->vtxfmt, 16);
+
+   BEGIN_NV04(push, NV30_3D(VP_START_FROM_ID), 1);
+   PUSH_DATA (push, r->vertprog->start);
+   BEGIN_NV04(push, NV30_3D(ENGINE), 1);
+   PUSH_DATA (push, 0x00000103);
+   if (eng3d->oclass >= NV40_3D_CLASS) {
+      BEGIN_NV04(push, NV40_3D(VP_ATTRIB_EN), 2);
+      PUSH_DATA (push, vp_attribs);
+      PUSH_DATA (push, vp_results);
+   }
+
+   vinfo->size /= 4;   /* bytes -> dwords, as the draw module expects */
+   return TRUE;
+}
+
+/* Software-TNL draw entry point: pushes the draw through the gallium
+ * draw module, which feeds transformed vertices back via the
+ * nv30_render vbuf backend above.  Maps all bound vertex buffers (and
+ * the index buffer for indexed draws) for the draw module to read.
+ *
+ * Fix: the pipe_buffer_map() result for a non-user index buffer was
+ * discarded instead of being assigned to 'map', so indexed draws from a
+ * hardware-resident index buffer dereferenced a NULL pointer.
+ */
+void
+nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct draw_context *draw = nv30->draw;
+   struct pipe_transfer *transfer[PIPE_MAX_ATTRIBS] = {NULL};
+   struct pipe_transfer *transferi = NULL;
+   int i;
+
+   nv30_render_validate(nv30);
+
+   /* propagate dirty gallium state into the draw module */
+   if (nv30->draw_dirty & NV30_NEW_VIEWPORT)
+      draw_set_viewport_states(draw, 0, 1, &nv30->viewport);
+   if (nv30->draw_dirty & NV30_NEW_RASTERIZER)
+      draw_set_rasterizer_state(draw, &nv30->rast->pipe, NULL);
+   if (nv30->draw_dirty & NV30_NEW_CLIP)
+      draw_set_clip_state(draw, &nv30->clip);
+   if (nv30->draw_dirty & NV30_NEW_ARRAYS) {
+      draw_set_vertex_buffers(draw, 0, nv30->num_vtxbufs, nv30->vtxbuf);
+      draw_set_vertex_elements(draw, nv30->vertex->num_elements, nv30->vertex->pipe);
+   }
+   if (nv30->draw_dirty & NV30_NEW_FRAGPROG) {
+      struct nv30_fragprog *fp = nv30->fragprog.program;
+      if (!fp->draw)
+         fp->draw = draw_create_fragment_shader(draw, &fp->pipe);
+      draw_bind_fragment_shader(draw, fp->draw);
+   }
+   if (nv30->draw_dirty & NV30_NEW_VERTPROG) {
+      struct nv30_vertprog *vp = nv30->vertprog.program;
+      if (!vp->draw)
+         vp->draw = draw_create_vertex_shader(draw, &vp->pipe);
+      draw_bind_vertex_shader(draw, vp->draw);
+   }
+   if (nv30->draw_dirty & NV30_NEW_VERTCONST) {
+      if (nv30->vertprog.constbuf) {
+         void *map = nv04_resource(nv30->vertprog.constbuf)->data;
+         draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
+                                         map, nv30->vertprog.constbuf_nr);
+      }
+   }
+
+   /* map every bound vertex buffer for the draw module to read */
+   for (i = 0; i < nv30->num_vtxbufs; i++) {
+      const void *map = nv30->vtxbuf[i].user_buffer;
+      if (!map) {
+         if (!nv30->vtxbuf[i].buffer) {
+            continue;
+         }
+         map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
+                               PIPE_TRANSFER_UNSYNCHRONIZED |
+                               PIPE_TRANSFER_READ, &transfer[i]);
+      }
+      draw_set_mapped_vertex_buffer(draw, i, map, ~0);
+   }
+
+   if (info->indexed) {
+      const void *map = nv30->idxbuf.user_buffer;
+      if (!map)
+         map = pipe_buffer_map(pipe, nv30->idxbuf.buffer,
+                               PIPE_TRANSFER_UNSYNCHRONIZED |
+                               PIPE_TRANSFER_READ, &transferi);
+      draw_set_indexes(draw,
+                       (ubyte *) map + nv30->idxbuf.offset,
+                       nv30->idxbuf.index_size, ~0);
+   } else {
+      draw_set_indexes(draw, NULL, 0, 0);
+   }
+
+   draw_vbo(draw, info);
+   draw_flush(draw);
+
+   if (info->indexed && transferi)
+      pipe_buffer_unmap(pipe, transferi);
+   for (i = 0; i < nv30->num_vtxbufs; i++)
+      if (transfer[i])
+         pipe_buffer_unmap(pipe, transfer[i]);
+
+   nv30->draw_dirty = 0;
+   nv30_state_release(nv30);
+}
+
+/* vbuf_render::destroy — free the backend wrapper itself.
+ * NOTE(review): r->buffer is not unreferenced here — presumably released
+ * elsewhere at context teardown; confirm. */
+static void
+nv30_render_destroy(struct vbuf_render *render)
+{
+   FREE(render);
+}
+
+/* Allocate and wire up the nv30 vbuf_render backend.  'offset' starts
+ * at max_vertex_buffer_bytes so the first allocate_vertices() call
+ * creates the initial scratch buffer. */
+static struct vbuf_render *
+nv30_render_create(struct nv30_context *nv30)
+{
+   struct nv30_render *r = CALLOC_STRUCT(nv30_render);
+   if (!r)
+      return NULL;
+
+   r->nv30 = nv30;
+   r->offset = 1 * 1024 * 1024;   /* == max, forces initial allocation */
+
+   r->base.max_indices = 16 * 1024;
+   r->base.max_vertex_buffer_bytes = r->offset;
+
+   r->base.get_vertex_info = nv30_render_get_vertex_info;
+   r->base.allocate_vertices = nv30_render_allocate_vertices;
+   r->base.map_vertices = nv30_render_map_vertices;
+   r->base.unmap_vertices = nv30_render_unmap_vertices;
+   r->base.set_primitive = nv30_render_set_primitive;
+   r->base.draw_elements = nv30_render_draw_elements;
+   r->base.draw_arrays = nv30_render_draw_arrays;
+   r->base.release_vertices = nv30_render_release_vertices;
+   r->base.destroy = nv30_render_destroy;
+   return &r->base;
+}
+
+/* Create the draw module pipeline for the SWTNL fallback path.  The
+ * huge wide-line/point thresholds keep the draw module from expanding
+ * those primitives in software (the hardware handles them). */
+void
+nv30_draw_init(struct pipe_context *pipe)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct vbuf_render *render;
+   struct draw_context *draw;
+   struct draw_stage *stage;
+
+   draw = draw_create(pipe);
+   if (!draw)
+      return;
+
+   render = nv30_render_create(nv30);
+   if (!render) {
+      draw_destroy(draw);
+      return;
+   }
+
+   stage = draw_vbuf_stage(draw, render);
+   if (!stage) {
+      render->destroy(render);
+      draw_destroy(draw);
+      return;
+   }
+
+   draw_set_render(draw, render);
+   draw_set_rasterize_stage(draw, stage);
+   draw_wide_line_threshold(draw, 10000000.f);
+   draw_wide_point_threshold(draw, 10000000.f);
+   draw_wide_point_sprites(draw, TRUE);
+   nv30->draw = draw;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.c b/src/gallium/drivers/nouveau/nv30/nv30_format.c
new file mode 100644
index 00000000000..67e0d5e9c56
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_format.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+#define NV30_3D_RT_FORMAT_COLOR_X1R5G5B5 2
+
+#define NV30_3D_TEX_FORMAT_FORMAT_A16L16 NV30_3D_TEX_FORMAT_FORMAT_HILO16
+#define NV30_3D_TEX_FORMAT_FORMAT_A16L16_RECT NV30_3D_TEX_FORMAT_FORMAT_HILO16_RECT
+#define NV30_3D_TEX_FORMAT_FORMAT_RGBA16F 0x00004a00
+#define NV30_3D_TEX_FORMAT_FORMAT_RGBA16F_RECT NV30_3D_TEX_FORMAT_FORMAT_RGBA16F
+#define NV30_3D_TEX_FORMAT_FORMAT_RGBA32F 0x00004b00
+#define NV30_3D_TEX_FORMAT_FORMAT_RGBA32F_RECT NV30_3D_TEX_FORMAT_FORMAT_RGBA32F
+#define NV30_3D_TEX_FORMAT_FORMAT_R32F 0x00004c00
+#define NV30_3D_TEX_FORMAT_FORMAT_R32F_RECT NV30_3D_TEX_FORMAT_FORMAT_R32F
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT1_RECT NV30_3D_TEX_FORMAT_FORMAT_DXT1
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT3_RECT NV30_3D_TEX_FORMAT_FORMAT_DXT3
+#define NV30_3D_TEX_FORMAT_FORMAT_DXT5_RECT NV30_3D_TEX_FORMAT_FORMAT_DXT5
+#define NV30_3D_TEX_FORMAT_FORMAT_RG16F 0xdeadcafe
+#define NV30_3D_TEX_FORMAT_FORMAT_RG16F_RECT 0xdeadcafe
+
+#define NV40_3D_TEX_FORMAT_FORMAT_R32F 0x00001c00
+#define NV40_3D_TEX_FORMAT_FORMAT_RG16F 0x00001f00
+
+#define ____ 0
+#define S___ PIPE_BIND_SAMPLER_VIEW
+#define _R__ PIPE_BIND_RENDER_TARGET
+#define _B__ PIPE_BIND_RENDER_TARGET | PIPE_BIND_BLENDABLE
+#define _Z__ PIPE_BIND_DEPTH_STENCIL
+#define __V_ PIPE_BIND_VERTEX_BUFFER
+#define SR__ (S___ | _R__)
+#define SB__ (S___ | _B__)
+#define SZ__ (S___ | _Z__)
+#define S_V_ (S___ | __V_)
+#define SRV_ (SR__ | __V_)
+#define SBV_ (SB__ | __V_)
+
+/* Per-pipe-format capability table: which PIPE_BIND_* usages each format
+ * supports on nv30/nv40 (see the S/R/B/Z/V column macros above).
+ * Formats absent from the table are zero-initialized => unsupported. */
+#define _(a,b) [PIPE_FORMAT_##a] = {                                          \
+   .bindings = (b),                                                           \
+}
+const struct nv30_format_info
+nv30_format_info_table[PIPE_FORMAT_COUNT] = {
+   _(L8_UNORM            , S___),
+   _(L8_SNORM            , S___),
+   _(L8_SRGB             , S___),
+   _(I8_UNORM            , S___),
+   _(I8_SNORM            , S___),
+   _(A8_UNORM            , S___),
+   _(A8_SNORM            , S___),
+   _(R8_UNORM            , S_V_),
+   _(R8_SNORM            , S___),
+   _(B5G5R5X1_UNORM      , SB__),
+   _(B5G5R5A1_UNORM      , S___),
+   _(B4G4R4X4_UNORM      , S___),
+   _(B4G4R4A4_UNORM      , S___),
+   _(B5G6R5_UNORM        , SB__),
+   _(B8G8R8X8_UNORM      , SB__),
+   _(B8G8R8X8_SRGB       , S___),
+   _(B8G8R8A8_UNORM      , SB__),
+   _(B8G8R8A8_SRGB       , S___),
+   _(R8G8B8A8_UNORM      , __V_),
+   _(R8G8B8A8_SNORM      , S___),
+   _(DXT1_RGB            , S___),
+   _(DXT1_SRGB           , S___),
+   _(DXT1_RGBA           , S___),
+   _(DXT1_SRGBA          , S___),
+   _(DXT3_RGBA           , S___),
+   _(DXT3_SRGBA          , S___),
+   _(DXT5_RGBA           , S___),
+   _(DXT5_SRGBA          , S___),
+   _(L8A8_UNORM          , S___),
+   _(L8A8_SRGB           , S___),
+   _(R8G8_UNORM          , S_V_),
+   _(R8G8_SNORM          , S___),
+   _(R8G8B8_UNORM        , __V_),
+   _(Z16_UNORM           , SZ__),
+   _(X8Z24_UNORM         , SZ__),
+   _(S8_UINT_Z24_UNORM   , SZ__),
+   _(L16_UNORM           , S___),
+   _(L16_SNORM           , S___),
+   _(I16_UNORM           , S___),
+   _(I16_SNORM           , S___),
+   _(A16_UNORM           , S___),
+   _(A16_SNORM           , S___),
+   _(R16_UNORM           , S___),
+   _(R16_SNORM           , S_V_),
+   _(R16G16_SNORM        , __V_),
+   _(R16G16B16_SNORM     , __V_),
+   _(R16G16B16A16_SNORM  , __V_),
+   _(R8G8B8A8_USCALED    , __V_),
+   _(R16_FLOAT           , __V_),
+   _(R16G16_FLOAT        , __V_), //S_V_),
+   _(R16G16B16_FLOAT     , __V_),
+   _(R16G16B16A16_FLOAT  , __V_), //SBV_),
+   _(R16_SSCALED         , __V_),
+   _(R16G16_SSCALED      , __V_),
+   _(R16G16B16_SSCALED   , __V_),
+   _(R16G16B16A16_SSCALED, __V_),
+   _(R32_FLOAT           , __V_), //SRV_),
+   _(R32G32_FLOAT        , __V_),
+   _(R32G32B32_FLOAT     , __V_),
+   _(R32G32B32A32_FLOAT  , __V_), //SRV_),
+};
+#undef _
+#undef ____
+#undef ____
+
+/* Render-target format table: maps pipe formats to the hardware
+ * RT_FORMAT colour (R_) or zeta (Z_) encodings. */
+#define R_(a,b) [PIPE_FORMAT_##a] = {                                         \
+   .hw = NV30_3D_RT_FORMAT_COLOR_##b,                                         \
+}
+#define Z_(a,b) [PIPE_FORMAT_##a] = {                                         \
+   .hw = NV30_3D_RT_FORMAT_ZETA_##b,                                          \
+}
+const struct nv30_format
+nv30_format_table[PIPE_FORMAT_COUNT] = {
+   R_(B5G5R5X1_UNORM    , X1R5G5B5          ),
+   R_(B5G6R5_UNORM      , R5G6B5            ),
+   R_(B8G8R8X8_UNORM    , X8R8G8B8          ),
+   R_(B8G8R8A8_UNORM    , A8R8G8B8          ),
+   Z_(Z16_UNORM         , Z16               ),
+   Z_(X8Z24_UNORM       , Z24S8             ),
+   Z_(S8_UINT_Z24_UNORM , Z24S8             ),
+   R_(R16G16B16A16_FLOAT, A16B16G16R16_FLOAT),
+   R_(R32G32B32A32_FLOAT, A32B32G32R32_FLOAT),
+   R_(R32_FLOAT         , R32_FLOAT         ),
+};
+
+/* Vertex fetch format table: maps pipe formats to the hardware VTXFMT
+ * type plus component count (c). */
+#define _(a,b,c) [PIPE_FORMAT_##a] = {                                        \
+   .hw = NV30_3D_VTXFMT_TYPE_##b | ((c) << NV30_3D_VTXFMT_SIZE__SHIFT)        \
+}
+const struct nv30_vtxfmt
+nv30_vtxfmt_table[PIPE_FORMAT_COUNT] = {
+   _(R8_UNORM            , U8_UNORM   , 1),
+   _(R8G8_UNORM          , U8_UNORM   , 2),
+   _(R8G8B8_UNORM        , U8_UNORM   , 3),
+   _(R8G8B8A8_UNORM      , U8_UNORM   , 4),
+   _(R8G8B8A8_USCALED    , U8_USCALED , 4),
+   _(R16_SNORM           , V16_SNORM  , 1),
+   _(R16G16_SNORM        , V16_SNORM  , 2),
+   _(R16G16B16_SNORM     , V16_SNORM  , 3),
+   _(R16G16B16A16_SNORM  , V16_SNORM  , 4),
+   _(R16_SSCALED         , V16_SSCALED, 1),
+   _(R16G16_SSCALED      , V16_SSCALED, 2),
+   _(R16G16B16_SSCALED   , V16_SSCALED, 3),
+   _(R16G16B16A16_SSCALED, V16_SSCALED, 4),
+   _(R16_FLOAT           , V16_FLOAT  , 1),
+   _(R16G16_FLOAT        , V16_FLOAT  , 2),
+   _(R16G16B16_FLOAT     , V16_FLOAT  , 3),
+   _(R16G16B16A16_FLOAT  , V16_FLOAT  , 4),
+   _(R32_FLOAT           , V32_FLOAT  , 1),
+   _(R32G32_FLOAT        , V32_FLOAT  , 2),
+   _(R32G32B32_FLOAT     , V32_FLOAT  , 3),
+   _(R32G32B32A32_FLOAT  , V32_FLOAT  , 4),
+};
+#undef _
+
+#define SWZ_OUT_0 0
+#define SWZ_OUT_1 1
+#define SWZ_OUT_C 2
+
+#define SWZ_SRC_0 3
+#define SWZ_SRC_1 2
+#define SWZ_SRC_2 1
+#define SWZ_SRC_3 0
+#define SWZ_SRC_x 0
+
+#define NONE 0x00000000
+#define SRGB 0x00700000
+
+#define ____ 0x00000000
+#define SSSS 0xf0000000
+
+/* Texture format table.  Columns: hardware format (b), extra swizzle
+ * bits (c), output selects for RGBA (d-g: 0/1/component), source
+ * component selects (h-k), wrap-mode bits (l: sRGB), and per-component
+ * signed-filter bits (m). */
+#define _(a,b,c,d,e,f,g,h,i,j,k,l,m) [PIPE_FORMAT_##a] = {                    \
+   .nv30 = NV30_3D_TEX_FORMAT_FORMAT_##b,                                     \
+   .nv30_rect = NV30_3D_TEX_FORMAT_FORMAT_##b##_RECT,                         \
+   .nv40 = NV40_3D_TEX_FORMAT_FORMAT_##b,                                     \
+   .swz[0] = { SWZ_OUT_##d, SWZ_SRC_##h },                                    \
+   .swz[1] = { SWZ_OUT_##e, SWZ_SRC_##i },                                    \
+   .swz[2] = { SWZ_OUT_##f, SWZ_SRC_##j },                                    \
+   .swz[3] = { SWZ_OUT_##g, SWZ_SRC_##k },                                    \
+   .swz[4] = { SWZ_OUT_0, SWZ_SRC_x },                                        \
+   .swz[5] = { SWZ_OUT_1, SWZ_SRC_x },                                        \
+   .swizzle = (c) * 0x00010000,                                               \
+   .wrap = (l),                                                               \
+   .filter = (m),                                                             \
+}
+const struct nv30_texfmt
+nv30_texfmt_table[PIPE_FORMAT_COUNT] = {
+   _(L8_UNORM          , L8      , 0, C, C, C, 1, 0, 0, 0, x, NONE, ____),
+   _(L8_SNORM          , L8      , 0, C, C, C, 1, 0, 0, 0, x, NONE, SSSS),
+   _(L8_SRGB           , L8      , 0, C, C, C, 1, 0, 0, 0, x, SRGB, ____),
+   _(I8_UNORM          , L8      , 0, C, C, C, C, 0, 0, 0, 0, NONE, ____),
+   _(I8_SNORM          , L8      , 0, C, C, C, C, 0, 0, 0, 0, NONE, SSSS),
+   _(A8_UNORM          , L8      , 0, 0, 0, 0, C, x, x, x, 0, NONE, ____),
+   _(A8_SNORM          , L8      , 0, 0, 0, 0, C, x, x, x, 0, NONE, SSSS),
+   _(R8_UNORM          , L8      , 0, C, 0, 0, 1, 0, x, x, x, NONE, ____),
+   _(R8_SNORM          , L8      , 0, C, 0, 0, 1, 0, x, x, x, NONE, SSSS),
+   _(B5G5R5X1_UNORM    , A1R5G5B5, 0, C, C, C, 1, 2, 1, 0, x, NONE, ____),
+   _(B5G5R5A1_UNORM    , A1R5G5B5, 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(B4G4R4X4_UNORM    , A4R4G4B4, 0, C, C, C, 1, 2, 1, 0, x, NONE, ____),
+   _(B4G4R4A4_UNORM    , A4R4G4B4, 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(B5G6R5_UNORM      , R5G6B5  , 0, C, C, C, 1, 2, 1, 0, x, NONE, ____),
+   _(B8G8R8X8_UNORM    , A8R8G8B8, 0, C, C, C, 1, 2, 1, 0, x, NONE, ____),
+   _(B8G8R8X8_SRGB     , A8R8G8B8, 0, C, C, C, 1, 2, 1, 0, x, SRGB, ____),
+   _(B8G8R8A8_UNORM    , A8R8G8B8, 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(B8G8R8A8_SRGB     , A8R8G8B8, 0, C, C, C, C, 2, 1, 0, 3, SRGB, ____),
+   _(R8G8B8A8_SNORM    , A8R8G8B8, 0, C, C, C, C, 0, 1, 2, 3, NONE, SSSS),
+   _(DXT1_RGB          , DXT1    , 0, C, C, C, 1, 2, 1, 0, x, NONE, ____),
+   _(DXT1_SRGB         , DXT1    , 0, C, C, C, 1, 2, 1, 0, x, SRGB, ____),
+   _(DXT1_RGBA         , DXT1    , 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(DXT1_SRGBA        , DXT1    , 0, C, C, C, C, 2, 1, 0, 3, SRGB, ____),
+   _(DXT3_RGBA         , DXT3    , 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(DXT3_SRGBA        , DXT3    , 0, C, C, C, C, 2, 1, 0, 3, SRGB, ____),
+   _(DXT5_RGBA         , DXT5    , 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(DXT5_SRGBA        , DXT5    , 0, C, C, C, C, 2, 1, 0, 3, SRGB, ____),
+   _(L8A8_UNORM        , A8L8    , 0, C, C, C, C, 0, 0, 0, 3, NONE, ____),
+   _(L8A8_SRGB         , A8L8    , 0, C, C, C, C, 0, 0, 0, 3, SRGB, ____),
+   _(R8G8_UNORM        , A8L8    , 0, C, C, 0, 1, 0, 3, x, x, NONE, ____),
+   _(R8G8_SNORM        , A8L8    , 0, C, C, 0, 1, 0, 3, x, x, NONE, SSSS),
+   _(Z16_UNORM         , Z16     , 0, C, C, C, 1, 3, 3, 3, x, NONE, ____),
+   _(X8Z24_UNORM       , Z24     , 0, C, C, C, 1, 3, 3, 3, x, NONE, ____),
+   _(S8_UINT_Z24_UNORM , Z24     , 0, C, C, C, 1, 3, 3, 3, x, NONE, ____),
+   _(L16_UNORM         , A16     , 0, C, C, C, 1, 1, 1, 1, 1, NONE, ____),
+   _(L16_SNORM         , A16     , 0, C, C, C, 1, 1, 1, 1, 1, NONE, SSSS),
+   _(I16_UNORM         , A16     , 0, C, C, C, C, 1, 1, 1, 1, NONE, ____),
+   _(I16_SNORM         , A16     , 0, C, C, C, C, 1, 1, 1, 1, NONE, SSSS),
+   _(A16_UNORM         , A16     , 0, 0, 0, 0, C, 1, 1, 1, 1, NONE, ____),
+   _(A16_SNORM         , A16     , 0, 0, 0, 0, C, 1, 1, 1, 1, NONE, SSSS),
+   _(R16_UNORM         , A16     , 0, C, 0, 0, 1, 1, 1, 1, 1, NONE, ____),
+   _(R16_SNORM         , A16     , 0, C, 0, 0, 1, 1, 1, 1, 1, NONE, SSSS),
+   _(R16G16_FLOAT      , RG16F   , 0, C, C, 0, 1, 2, 1, 0, 3, NONE, ____),
+   _(R16G16B16A16_FLOAT, RGBA16F , 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+   _(R32_FLOAT         , R32F    , 0, C, 0, 0, 1, 2, 1, 0, 3, NONE, ____),
+   _(R32G32B32A32_FLOAT, RGBA32F , 0, C, C, C, C, 2, 1, 0, 3, NONE, ____),
+};
+#undef _
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_format.h b/src/gallium/drivers/nouveau/nv30/nv30_format.h
new file mode 100644
index 00000000000..8bf4a37299f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_format.h
@@ -0,0 +1,57 @@
+#ifndef __NV30_FORMAT_H__
+#define __NV30_FORMAT_H__
+
+/* Format capability mask: PIPE_BIND_* usages supported by a format. */
+struct nv30_format_info {
+   unsigned bindings;
+};
+
+/* Hardware render-target (RT_FORMAT) encoding for a format. */
+struct nv30_format {
+   unsigned hw;
+};
+
+/* Hardware vertex-fetch (VTXFMT) encoding for a format. */
+struct nv30_vtxfmt {
+   unsigned hw;
+};
+
+/* Hardware texture encoding for a format, for both nv30 and nv40
+ * classes, plus the component swizzle and sampler bits it needs. */
+struct nv30_texfmt {
+   unsigned nv30;        /* nv30 TEX_FORMAT value (2D/cube) */
+   unsigned nv30_rect;   /* nv30 TEX_FORMAT value (rectangle) */
+   unsigned nv40;        /* nv40 TEX_FORMAT value */
+   struct {
+      unsigned src;
+      unsigned cmp;
+   } swz[6];
+   unsigned swizzle;     /* extra swizzle register bits */
+   unsigned filter;      /* signed-filtering bits */
+   unsigned wrap;        /* wrap register bits (e.g. sRGB decode) */
+};
+
+/* Table lookup helpers; 'pscreen' is unused today but kept in the
+ * signature so lookups could become per-screen later. */
+extern const struct nv30_format_info nv30_format_info_table[];
+static INLINE const struct nv30_format_info *
+nv30_format_info(struct pipe_screen *pscreen, enum pipe_format format)
+{
+   return &nv30_format_info_table[format];
+}
+
+extern const struct nv30_format nv30_format_table[];
+static INLINE const struct nv30_format *
+nv30_format(struct pipe_screen *pscreen, enum pipe_format format)
+{
+   return &nv30_format_table[format];
+}
+
+extern const struct nv30_vtxfmt nv30_vtxfmt_table[];
+static INLINE const struct nv30_vtxfmt *
+nv30_vtxfmt(struct pipe_screen *pscreen, enum pipe_format format)
+{
+   return &nv30_vtxfmt_table[format];
+}
+
+extern const struct nv30_texfmt nv30_texfmt_table[];
+static INLINE const struct nv30_texfmt *
+nv30_texfmt(struct pipe_screen *pscreen, enum pipe_format format)
+{
+   return &nv30_texfmt_table[format];
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
new file mode 100644
index 00000000000..e8acdfeaf75
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nvfx_shader.h"
+
+static void
+nv30_fragprog_upload(struct nv30_context *nv30)
+{
+ struct nouveau_context *nv = &nv30->base;
+ struct nv30_fragprog *fp = nv30->fragprog.program;
+ struct pipe_context *pipe = &nv30->base.pipe;
+ struct pipe_transfer *transfer;
+ uint32_t *map;
+ int i; (void)i;
+
+ if (unlikely(!fp->buffer)) {
+ fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4);
+ }
+
+ map = pipe_buffer_map(pipe, fp->buffer, PIPE_TRANSFER_WRITE, &transfer);
+#ifndef PIPE_ARCH_BIG_ENDIAN
+ memcpy(map, fp->insn, fp->insn_len * 4);
+#else
+ for (i = 0; i < fp->insn_len; i++)
+ *map++ = (fp->insn[i] >> 16) | (fp->insn[i] << 16);
+#endif
+ pipe_buffer_unmap(pipe, transfer);
+
+ if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM)
+ nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM);
+}
+
+void
+nv30_fragprog_validate(struct nv30_context *nv30)
+{
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+ struct nouveau_object *eng3d = nv30->screen->eng3d;
+ struct nv30_fragprog *fp = nv30->fragprog.program;
+ boolean upload = FALSE;
+ int i;
+
+ if (!fp->translated) {
+ _nvfx_fragprog_translate(nv30, fp, FALSE);
+ if (!fp->translated)
+ return;
+
+ upload = TRUE;
+ }
+
+ /* update constants, also needs to be done on every fp switch as we
+ * have no idea whether the constbuf changed in the meantime
+ */
+ if (nv30->fragprog.constbuf) {
+ struct pipe_resource *constbuf = nv30->fragprog.constbuf;
+ uint32_t *cbuf = (uint32_t *)nv04_resource(constbuf)->data;
+
+ for (i = 0; i < fp->nr_consts; i++) {
+ unsigned off = fp->consts[i].offset;
+ unsigned idx = fp->consts[i].index * 4;
+
+ if (!memcmp(&fp->insn[off], &cbuf[idx], 4 * 4))
+ continue;
+ memcpy(&fp->insn[off], &cbuf[idx], 4 * 4);
+ upload = TRUE;
+ }
+ }
+
+ if (upload)
+ nv30_fragprog_upload(nv30);
+
+ /* FP_ACTIVE_PROGRAM needs to be done again even if only the consts
+ * were updated. TEX_CACHE_CTL magic is not enough to convince the
+ * GPU that it should re-read the fragprog from VRAM... sigh.
+ */
+ if (nv30->state.fragprog != fp || upload) {
+ struct nv04_resource *r = nv04_resource(fp->buffer);
+
+ if (!PUSH_SPACE(push, 8))
+ return;
+ PUSH_RESET(push, BUFCTX_FRAGPROG);
+
+ BEGIN_NV04(push, NV30_3D(FP_ACTIVE_PROGRAM), 1);
+ PUSH_RESRC(push, NV30_3D(FP_ACTIVE_PROGRAM), BUFCTX_FRAGPROG, r, 0,
+ NOUVEAU_BO_LOW | NOUVEAU_BO_RD | NOUVEAU_BO_OR,
+ NV30_3D_FP_ACTIVE_PROGRAM_DMA0,
+ NV30_3D_FP_ACTIVE_PROGRAM_DMA1);
+ BEGIN_NV04(push, NV30_3D(FP_CONTROL), 1);
+ PUSH_DATA (push, fp->fp_control);
+ if (eng3d->oclass < NV40_3D_CLASS) {
+ BEGIN_NV04(push, NV30_3D(FP_REG_CONTROL), 1);
+ PUSH_DATA (push, 0x00010004);
+ BEGIN_NV04(push, NV30_3D(TEX_UNITS_ENABLE), 1);
+ PUSH_DATA (push, fp->texcoords);
+ } else {
+ BEGIN_NV04(push, SUBC_3D(0x0b40), 1);
+ PUSH_DATA (push, 0x00000000);
+ }
+
+ nv30->state.fragprog = fp;
+ }
+}
+
+static void *
+nv30_fp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso)
+{
+ struct nv30_fragprog *fp = CALLOC_STRUCT(nv30_fragprog);
+ if (!fp)
+ return NULL;
+
+ fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+ tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+ return fp;
+}
+
+static void
+nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ struct nv30_fragprog *fp = hwcso;
+
+ pipe_resource_reference(&fp->buffer, NULL);
+
+ FREE((void *)fp->pipe.tokens);
+ FREE(fp->insn);
+ FREE(fp);
+}
+
+static void
+nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+
+ nv30->fragprog.program = hwcso;
+ nv30->dirty |= NV30_NEW_FRAGPROG;
+}
+
+void
+nv30_fragprog_init(struct pipe_context *pipe)
+{
+ pipe->create_fs_state = nv30_fp_state_create;
+ pipe->bind_fs_state = nv30_fp_state_bind;
+ pipe->delete_fs_state = nv30_fp_state_delete;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c b/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c
new file mode 100644
index 00000000000..60ffb03c89f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragtex.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_inlines.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+void
+nv30_fragtex_validate(struct nv30_context *nv30)
+{
+ struct pipe_screen *pscreen = &nv30->screen->base.base;
+ struct nouveau_object *eng3d = nv30->screen->eng3d;
+ struct nouveau_pushbuf *push = nv30->base.pushbuf;
+ unsigned dirty = nv30->fragprog.dirty_samplers;
+
+ while (dirty) {
+ unsigned unit = ffs(dirty) - 1;
+ struct nv30_sampler_view *sv = (void *)nv30->fragprog.textures[unit];
+ struct nv30_sampler_state *ss = nv30->fragprog.samplers[unit];
+
+ PUSH_RESET(push, BUFCTX_FRAGTEX(unit));
+
+ if (ss && sv) {
+ const struct nv30_texfmt *fmt = nv30_texfmt(pscreen, sv->pipe.format);
+ struct pipe_resource *pt = sv->pipe.texture;
+ struct nv30_miptree *mt = nv30_miptree(pt);
+ unsigned min_lod, max_lod;
+ u32 filter = sv->filt | (ss->filt & sv->filt_mask);
+ u32 format = sv->fmt | ss->fmt;
+ u32 enable = ss->en;
+
+ /* handle base_level when not using a mip filter, min/max level
+ * is unfortunately ignored by the hardware otherwise
+ */
+ if (ss->pipe.min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+ if (sv->base_lod)
+ filter += 0x00020000; /* N/L -> NMN/LMN */
+ max_lod = sv->base_lod;
+ min_lod = sv->base_lod;
+ } else {
+ max_lod = MIN2(ss->max_lod + sv->base_lod, sv->high_lod);
+ min_lod = MIN2(ss->min_lod + sv->base_lod, max_lod);
+ }
+
+ if (eng3d->oclass >= NV40_3D_CLASS) {
+ /* this is a tad stupid of the hardware, but there's no non-rcomp
+ * z16/z24 texture formats to be had, we have to suffer and lose
+ * some precision to handle this case.
+ */
+ if (ss->pipe.compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+ if (fmt->nv40 == NV40_3D_TEX_FORMAT_FORMAT_Z16)
+ format |= NV40_3D_TEX_FORMAT_FORMAT_A8L8;
+ else
+ if (fmt->nv40 == NV40_3D_TEX_FORMAT_FORMAT_Z24)
+ format |= NV40_3D_TEX_FORMAT_FORMAT_A16L16;
+ else
+ format |= fmt->nv40;
+ } else {
+ format |= fmt->nv40;
+ }
+
+ enable |= (min_lod << 19) | (max_lod << 7);
+ enable |= NV40_3D_TEX_ENABLE_ENABLE;
+
+ BEGIN_NV04(push, NV40_3D(TEX_SIZE1(unit)), 1);
+ PUSH_DATA (push, sv->npot_size1);
+ } else {
+ /* this is a tad stupid of the hardware, but there's no non-rcomp
+ * z16/z24 texture formats to be had, we have to suffer and lose
+ * some precision to handle this case.
+ */
+ if (ss->pipe.compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+ if (fmt->nv30 == NV30_3D_TEX_FORMAT_FORMAT_Z16) {
+ if (ss->pipe.normalized_coords)
+ format |= NV30_3D_TEX_FORMAT_FORMAT_A8L8;
+ else
+ format |= NV30_3D_TEX_FORMAT_FORMAT_A8L8_RECT;
+ } else
+ if (fmt->nv30 == NV30_3D_TEX_FORMAT_FORMAT_Z24) {
+ if (ss->pipe.normalized_coords)
+ format |= NV30_3D_TEX_FORMAT_FORMAT_HILO16;
+ else
+ format |= NV30_3D_TEX_FORMAT_FORMAT_HILO16_RECT;
+ } else {
+ if (ss->pipe.normalized_coords)
+ format |= fmt->nv30;
+ else
+ format |= fmt->nv30_rect;
+ }
+ } else {
+ if (ss->pipe.normalized_coords)
+ format |= fmt->nv30;
+ else
+ format |= fmt->nv30_rect;
+ }
+
+ enable |= NV30_3D_TEX_ENABLE_ENABLE;
+ enable |= (min_lod << 18) | (max_lod << 6);
+ }
+
+ BEGIN_NV04(push, NV30_3D(TEX_OFFSET(unit)), 8);
+ PUSH_MTHDl(push, NV30_3D(TEX_OFFSET(unit)), BUFCTX_FRAGTEX(unit),
+ mt->base.bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+ PUSH_MTHDs(push, NV30_3D(TEX_FORMAT(unit)), BUFCTX_FRAGTEX(unit),
+ mt->base.bo, format, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD,
+ NV30_3D_TEX_FORMAT_DMA0,
+ NV30_3D_TEX_FORMAT_DMA1);
+ PUSH_DATA (push, sv->wrap | (ss->wrap & sv->wrap_mask));
+ PUSH_DATA (push, enable);
+ PUSH_DATA (push, sv->swz);
+ PUSH_DATA (push, filter);
+ PUSH_DATA (push, sv->npot_size0);
+ PUSH_DATA (push, ss->bcol);
+ BEGIN_NV04(push, NV30_3D(TEX_FILTER_OPTIMIZATION(unit)), 1);
+ PUSH_DATA (push, nv30->config.filter);
+ } else {
+ BEGIN_NV04(push, NV30_3D(TEX_ENABLE(unit)), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ dirty &= ~(1 << unit);
+ }
+
+ nv30->fragprog.dirty_samplers = 0;
+}
+
+static void
+nv30_fragtex_sampler_states_bind(struct pipe_context *pipe,
+ unsigned nr, void **hwcso)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ unsigned i;
+
+ for (i = 0; i < nr; i++) {
+ nv30->fragprog.samplers[i] = hwcso[i];
+ nv30->fragprog.dirty_samplers |= (1 << i);
+ }
+
+ for (; i < nv30->fragprog.num_samplers; i++) {
+ nv30->fragprog.samplers[i] = NULL;
+ nv30->fragprog.dirty_samplers |= (1 << i);
+ }
+
+ nv30->fragprog.num_samplers = nr;
+ nv30->dirty |= NV30_NEW_FRAGTEX;
+}
+
+
+static void
+nv30_fragtex_set_sampler_views(struct pipe_context *pipe, unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ unsigned i;
+
+ for (i = 0; i < nr; i++) {
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FRAGTEX(i));
+ pipe_sampler_view_reference(&nv30->fragprog.textures[i], views[i]);
+ nv30->fragprog.dirty_samplers |= (1 << i);
+ }
+
+ for (; i < nv30->fragprog.num_textures; i++) {
+ nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FRAGTEX(i));
+ pipe_sampler_view_reference(&nv30->fragprog.textures[i], NULL);
+ nv30->fragprog.dirty_samplers |= (1 << i);
+ }
+
+ nv30->fragprog.num_textures = nr;
+ nv30->dirty |= NV30_NEW_FRAGTEX;
+}
+
+void
+nv30_fragtex_init(struct pipe_context *pipe)
+{
+ pipe->bind_fragment_sampler_states = nv30_fragtex_sampler_states_bind;
+ pipe->set_fragment_sampler_views = nv30_fragtex_set_sampler_views;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_miptree.c b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
new file mode 100644
index 00000000000..4c237f615cd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_miptree.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+
+#include "nv_m2mf.xml.h"
+#include "nv30/nv30_screen.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_resource.h"
+#include "nv30/nv30_transfer.h"
+
+static INLINE unsigned
+layer_offset(struct pipe_resource *pt, unsigned level, unsigned layer)
+{
+ struct nv30_miptree *mt = nv30_miptree(pt);
+ struct nv30_miptree_level *lvl = &mt->level[level];
+
+ if (pt->target == PIPE_TEXTURE_CUBE)
+ return (layer * mt->layer_size) + lvl->offset;
+
+ return lvl->offset + (layer * lvl->zslice_size);
+}
+
+static boolean
+nv30_miptree_get_handle(struct pipe_screen *pscreen,
+ struct pipe_resource *pt,
+ struct winsys_handle *handle)
+{
+ struct nv30_miptree *mt = nv30_miptree(pt);
+ unsigned stride;
+
+ if (!mt || !mt->base.bo)
+ return FALSE;
+
+ stride = mt->level[0].pitch;
+
+ return nouveau_screen_bo_get_handle(pscreen, mt->base.bo, stride, handle);
+}
+
+static void
+nv30_miptree_destroy(struct pipe_screen *pscreen, struct pipe_resource *pt)
+{
+ struct nv30_miptree *mt = nv30_miptree(pt);
+
+ nouveau_bo_ref(NULL, &mt->base.bo);
+ FREE(mt);
+}
+
+struct nv30_transfer {
+ struct pipe_transfer base;
+ struct nv30_rect img;
+ struct nv30_rect tmp;
+ unsigned nblocksx;
+ unsigned nblocksy;
+};
+
+static INLINE struct nv30_transfer *
+nv30_transfer(struct pipe_transfer *ptx)
+{
+ return (struct nv30_transfer *)ptx;
+}
+
+static INLINE void
+define_rect(struct pipe_resource *pt, unsigned level, unsigned z,
+ unsigned x, unsigned y, unsigned w, unsigned h,
+ struct nv30_rect *rect)
+{
+ struct nv30_miptree *mt = nv30_miptree(pt);
+ struct nv30_miptree_level *lvl = &mt->level[level];
+
+ rect->w = u_minify(pt->width0, level) << mt->ms_x;
+ rect->w = util_format_get_nblocksx(pt->format, rect->w);
+ rect->h = u_minify(pt->height0, level) << mt->ms_y;
+ rect->h = util_format_get_nblocksy(pt->format, rect->h);
+ rect->d = 1;
+ rect->z = 0;
+ if (mt->swizzled) {
+ if (pt->target == PIPE_TEXTURE_3D) {
+ rect->d = u_minify(pt->depth0, level);
+ rect->z = z; z = 0;
+ }
+ rect->pitch = 0;
+ } else {
+ rect->pitch = lvl->pitch;
+ }
+
+ rect->bo = mt->base.bo;
+ rect->domain = NOUVEAU_BO_VRAM;
+ rect->offset = layer_offset(pt, level, z);
+ rect->cpp = util_format_get_blocksize(pt->format);
+
+ rect->x0 = util_format_get_nblocksx(pt->format, x) << mt->ms_x;
+ rect->y0 = util_format_get_nblocksy(pt->format, y) << mt->ms_y;
+ rect->x1 = rect->x0 + (w << mt->ms_x);
+ rect->y1 = rect->y0 + (h << mt->ms_y);
+}
+
+void
+nv30_resource_copy_region(struct pipe_context *pipe,
+ struct pipe_resource *dstres, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *srcres, unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_rect src, dst;
+
+ if (dstres->target == PIPE_BUFFER && srcres->target == PIPE_BUFFER) {
+ nouveau_copy_buffer(&nv30->base,
+ nv04_resource(dstres), dstx,
+ nv04_resource(srcres), src_box->x, src_box->width);
+ return;
+ }
+
+ define_rect(srcres, src_level, src_box->z, src_box->x, src_box->y,
+ src_box->width, src_box->height, &src);
+ define_rect(dstres, dst_level, dstz, dstx, dsty,
+ src_box->width, src_box->height, &dst);
+
+ nv30_transfer_rect(nv30, NEAREST, &src, &dst);
+}
+
+void
+nv30_resource_resolve(struct pipe_context *pipe,
+ const struct pipe_resolve_info *info)
+{
+#if 0
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_rect src, dst;
+
+ define_rect(info->src.res, 0, 0, info->src.x0, info->src.y0,
+ info->src.x1 - info->src.x0, info->src.y1 - info->src.y0, &src);
+ define_rect(info->dst.res, info->dst.level, 0, info->dst.x0, info->dst.y0,
+ info->dst.x1 - info->dst.x0, info->dst.y1 - info->dst.y0, &dst);
+
+ nv30_transfer_rect(nv30, BILINEAR, &src, &dst);
+#endif
+}
+
+void
+nv30_blit(struct pipe_context *pipe,
+ const struct pipe_blit_info *blit_info)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct pipe_blit_info info = *blit_info;
+
+ if (info.src.resource->nr_samples > 1 &&
+ info.dst.resource->nr_samples <= 1 &&
+ !util_format_is_depth_or_stencil(info.src.resource->format) &&
+ !util_format_is_pure_integer(info.src.resource->format)) {
+ debug_printf("nv30: color resolve unimplemented\n");
+ return;
+ }
+
+ if (util_try_blit_via_copy_region(pipe, &info)) {
+ return; /* done */
+ }
+
+ if (info.mask & PIPE_MASK_S) {
+ debug_printf("nv30: cannot blit stencil, skipping\n");
+ info.mask &= ~PIPE_MASK_S;
+ }
+
+ if (!util_blitter_is_blit_supported(nv30->blitter, &info)) {
+ debug_printf("nv30: blit unsupported %s -> %s\n",
+ util_format_short_name(info.src.resource->format),
+ util_format_short_name(info.dst.resource->format));
+ return;
+ }
+
+ /* XXX turn off occlusion queries */
+
+ util_blitter_save_vertex_buffer_slot(nv30->blitter, nv30->vtxbuf);
+ util_blitter_save_vertex_elements(nv30->blitter, nv30->vertex);
+ util_blitter_save_vertex_shader(nv30->blitter, nv30->vertprog.program);
+ util_blitter_save_rasterizer(nv30->blitter, nv30->rast);
+ util_blitter_save_viewport(nv30->blitter, &nv30->viewport);
+ util_blitter_save_scissor(nv30->blitter, &nv30->scissor);
+ util_blitter_save_fragment_shader(nv30->blitter, nv30->fragprog.program);
+ util_blitter_save_blend(nv30->blitter, nv30->blend);
+ util_blitter_save_depth_stencil_alpha(nv30->blitter,
+ nv30->zsa);
+ util_blitter_save_stencil_ref(nv30->blitter, &nv30->stencil_ref);
+ util_blitter_save_sample_mask(nv30->blitter, nv30->sample_mask);
+ util_blitter_save_framebuffer(nv30->blitter, &nv30->framebuffer);
+ util_blitter_save_fragment_sampler_states(nv30->blitter,
+ nv30->fragprog.num_samplers,
+ (void**)nv30->fragprog.samplers);
+ util_blitter_save_fragment_sampler_views(nv30->blitter,
+ nv30->fragprog.num_textures, nv30->fragprog.textures);
+ util_blitter_save_render_condition(nv30->blitter, nv30->render_cond_query,
+ nv30->render_cond_cond, nv30->render_cond_mode);
+ util_blitter_blit(nv30->blitter, &info);
+}
+
+static void *
+nv30_miptree_transfer_map(struct pipe_context *pipe, struct pipe_resource *pt,
+ unsigned level, unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nouveau_device *dev = nv30->screen->base.device;
+ struct nv30_transfer *tx;
+ unsigned access = 0;
+ int ret;
+
+ tx = CALLOC_STRUCT(nv30_transfer);
+ if (!tx)
+ return NULL;
+ pipe_resource_reference(&tx->base.resource, pt);
+ tx->base.level = level;
+ tx->base.usage = usage;
+ tx->base.box = *box;
+ tx->base.stride = util_format_get_nblocksx(pt->format, box->width) *
+ util_format_get_blocksize(pt->format);
+ tx->base.layer_stride = util_format_get_nblocksy(pt->format, box->height) *
+ tx->base.stride;
+
+ tx->nblocksx = util_format_get_nblocksx(pt->format, box->width);
+ tx->nblocksy = util_format_get_nblocksy(pt->format, box->height);
+
+ define_rect(pt, level, box->z, box->x, box->y,
+ tx->nblocksx, tx->nblocksy, &tx->img);
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+ tx->base.layer_stride, NULL, &tx->tmp.bo);
+ if (ret) {
+ pipe_resource_reference(&tx->base.resource, NULL);
+ FREE(tx);
+ return NULL;
+ }
+
+ tx->tmp.domain = NOUVEAU_BO_GART;
+ tx->tmp.offset = 0;
+ tx->tmp.pitch = tx->base.stride;
+ tx->tmp.cpp = tx->img.cpp;
+ tx->tmp.w = tx->nblocksx;
+ tx->tmp.h = tx->nblocksy;
+ tx->tmp.d = 1;
+ tx->tmp.x0 = 0;
+ tx->tmp.y0 = 0;
+ tx->tmp.x1 = tx->tmp.w;
+ tx->tmp.y1 = tx->tmp.h;
+ tx->tmp.z = 0;
+
+ if (usage & PIPE_TRANSFER_READ)
+ nv30_transfer_rect(nv30, NEAREST, &tx->img, &tx->tmp);
+
+ if (tx->tmp.bo->map) {
+ *ptransfer = &tx->base;
+ return tx->tmp.bo->map;
+ }
+
+ if (usage & PIPE_TRANSFER_READ)
+ access |= NOUVEAU_BO_RD;
+ if (usage & PIPE_TRANSFER_WRITE)
+ access |= NOUVEAU_BO_WR;
+
+ ret = nouveau_bo_map(tx->tmp.bo, access, nv30->base.client);
+ if (ret) {
+ pipe_resource_reference(&tx->base.resource, NULL);
+ FREE(tx);
+ return NULL;
+ }
+
+ *ptransfer = &tx->base;
+ return tx->tmp.bo->map;
+}
+
+static void
+nv30_miptree_transfer_unmap(struct pipe_context *pipe,
+ struct pipe_transfer *ptx)
+{
+ struct nv30_context *nv30 = nv30_context(pipe);
+ struct nv30_transfer *tx = nv30_transfer(ptx);
+
+ if (ptx->usage & PIPE_TRANSFER_WRITE)
+ nv30_transfer_rect(nv30, NEAREST, &tx->tmp, &tx->img);
+
+ nouveau_bo_ref(NULL, &tx->tmp.bo);
+ pipe_resource_reference(&ptx->resource, NULL);
+ FREE(tx);
+}
+
+const struct u_resource_vtbl nv30_miptree_vtbl = {
+ nv30_miptree_get_handle,
+ nv30_miptree_destroy,
+ nv30_miptree_transfer_map,
+ u_default_transfer_flush_region,
+ nv30_miptree_transfer_unmap,
+ u_default_transfer_inline_write
+};
+
+struct pipe_resource *
+nv30_miptree_create(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmpl)
+{
+ struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+ struct nv30_miptree *mt = CALLOC_STRUCT(nv30_miptree);
+ struct pipe_resource *pt = &mt->base.base;
+ unsigned blocksz, size;
+ unsigned w, h, d, l;
+ int ret;
+
+ switch (tmpl->nr_samples) {
+ case 4:
+ mt->ms_mode = 0x00004000;
+ mt->ms_x = 1;
+ mt->ms_y = 1;
+ break;
+ case 2:
+ mt->ms_mode = 0x00003000;
+ mt->ms_x = 1;
+ mt->ms_y = 0;
+ break;
+ default:
+ mt->ms_mode = 0x00000000;
+ mt->ms_x = 0;
+ mt->ms_y = 0;
+ break;
+ }
+
+ mt->base.vtbl = &nv30_miptree_vtbl;
+ *pt = *tmpl;
+ pipe_reference_init(&pt->reference, 1);
+ pt->screen = pscreen;
+
+ w = pt->width0 << mt->ms_x;
+ h = pt->height0 << mt->ms_y;
+ d = (pt->target == PIPE_TEXTURE_3D) ? pt->depth0 : 1;
+ blocksz = util_format_get_blocksize(pt->format);
+
+ if ((pt->target == PIPE_TEXTURE_RECT) ||
+ !util_is_power_of_two(pt->width0) ||
+ !util_is_power_of_two(pt->height0) ||
+ !util_is_power_of_two(pt->depth0) ||
+ util_format_is_compressed(pt->format) ||
+ util_format_is_float(pt->format) || mt->ms_mode) {
+ mt->uniform_pitch = util_format_get_nblocksx(pt->format, w) * blocksz;
+ mt->uniform_pitch = align(mt->uniform_pitch, 64);
+ }
+
+ if (!mt->uniform_pitch)
+ mt->swizzled = TRUE;
+
+ size = 0;
+ for (l = 0; l <= pt->last_level; l++) {
+ struct nv30_miptree_level *lvl = &mt->level[l];
+ unsigned nbx = util_format_get_nblocksx(pt->format, w);
+      unsigned nby = util_format_get_nblocksy(pt->format, h);
+
+ lvl->offset = size;
+ lvl->pitch = mt->uniform_pitch;
+ if (!lvl->pitch)
+ lvl->pitch = nbx * blocksz;
+
+ lvl->zslice_size = lvl->pitch * nby;
+ size += lvl->zslice_size * d;
+
+ w = u_minify(w, 1);
+ h = u_minify(h, 1);
+ d = u_minify(d, 1);
+ }
+
+ mt->layer_size = size;
+ if (pt->target == PIPE_TEXTURE_CUBE) {
+ if (!mt->uniform_pitch)
+ mt->layer_size = align(mt->layer_size, 128);
+ size = mt->layer_size * 6;
+ }
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 256, size, NULL, &mt->base.bo);
+ if (ret) {
+ FREE(mt);
+ return NULL;
+ }
+
+ mt->base.domain = NOUVEAU_BO_VRAM;
+ return &mt->base.base;
+}
+
+struct pipe_resource *
+nv30_miptree_from_handle(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmpl,
+ struct winsys_handle *handle)
+{
+ struct nv30_miptree *mt;
+ unsigned stride;
+
+ /* only supports 2D, non-mipmapped textures for the moment */
+ if ((tmpl->target != PIPE_TEXTURE_2D &&
+ tmpl->target != PIPE_TEXTURE_RECT) ||
+ tmpl->last_level != 0 ||
+ tmpl->depth0 != 1 ||
+ tmpl->array_size > 1)
+ return NULL;
+
+ mt = CALLOC_STRUCT(nv30_miptree);
+ if (!mt)
+ return NULL;
+
+ mt->base.bo = nouveau_screen_bo_from_handle(pscreen, handle, &stride);
+ if (mt->base.bo == NULL) {
+ FREE(mt);
+ return NULL;
+ }
+
+ mt->base.base = *tmpl;
+ mt->base.vtbl = &nv30_miptree_vtbl;
+ pipe_reference_init(&mt->base.base.reference, 1);
+ mt->base.base.screen = pscreen;
+ mt->uniform_pitch = stride;
+ mt->level[0].pitch = mt->uniform_pitch;
+ mt->level[0].offset = 0;
+
+ /* no need to adjust bo reference count */
+ return &mt->base.base;
+}
+
+struct pipe_surface *
+nv30_miptree_surface_new(struct pipe_context *pipe,
+ struct pipe_resource *pt,
+ const struct pipe_surface *tmpl)
+{
+ struct nv30_miptree *mt = nv30_miptree(pt); /* guaranteed */
+ struct nv30_surface *ns;
+ struct pipe_surface *ps;
+ struct nv30_miptree_level *lvl = &mt->level[tmpl->u.tex.level];
+
+ ns = CALLOC_STRUCT(nv30_surface);
+ if (!ns)
+ return NULL;
+ ps = &ns->base;
+
+ pipe_reference_init(&ps->reference, 1);
+ pipe_resource_reference(&ps->texture, pt);
+ ps->context = pipe;
+ ps->format = tmpl->format;
+ ps->u.tex.level = tmpl->u.tex.level;
+ ps->u.tex.first_layer = tmpl->u.tex.first_layer;
+ ps->u.tex.last_layer = tmpl->u.tex.last_layer;
+
+ ns->width = u_minify(pt->width0, ps->u.tex.level);
+ ns->height = u_minify(pt->height0, ps->u.tex.level);
+ ns->depth = ps->u.tex.last_layer - ps->u.tex.first_layer + 1;
+ ns->offset = layer_offset(pt, ps->u.tex.level, ps->u.tex.first_layer);
+ if (mt->swizzled)
+ ns->pitch = 4096; /* random, just something the hw won't reject.. */
+ else
+ ns->pitch = lvl->pitch;
+
+ /* comment says there are going to be removed, but they're used by the st */
+ ps->width = ns->width;
+ ps->height = ns->height;
+ return ps;
+}
+
+void
+nv30_miptree_surface_del(struct pipe_context *pipe, struct pipe_surface *ps)
+{
+ struct nv30_surface *ns = nv30_surface(ps);
+
+ pipe_resource_reference(&ps->texture, NULL);
+ FREE(ns);
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_push.c b/src/gallium/drivers/nouveau/nv30/nv30_push.c
new file mode 100644
index 00000000000..e0734fa70d3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_push.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_resource.h"
+
+struct push_context {
+ struct nouveau_pushbuf *push;
+
+ const void *idxbuf;
+
+ float edgeflag;
+ int edgeflag_attr;
+
+ uint32_t vertex_words;
+ uint32_t packet_vertex_limit;
+
+ struct translate *translate;
+
+ boolean primitive_restart;
+ uint32_t prim;
+ uint32_t restart_index;
+};
+
+static INLINE unsigned
+prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
+{
+ unsigned i;
+ for (i = 0; i < push; ++i)
+ if (elts[i] == index)
+ break;
+ return i;
+}
+
+static INLINE unsigned
+prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
+{
+ unsigned i;
+ for (i = 0; i < push; ++i)
+ if (elts[i] == index)
+ break;
+ return i;
+}
+
+static INLINE unsigned
+prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
+{
+ unsigned i;
+ for (i = 0; i < push; ++i)
+ if (elts[i] == index)
+ break;
+ return i;
+}
+
+static void
+emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
+{
+ uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
+
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size, nr;
+
+ nr = push;
+ if (ctx->primitive_restart)
+ nr = prim_restart_search_i08(elts, push, ctx->restart_index);
+
+ size = ctx->vertex_words * nr;
+
+ BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
+
+ ctx->translate->run_elts8(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
+
+ ctx->push->cur += size;
+ count -= nr;
+ elts += nr;
+
+ if (nr != push) {
+ BEGIN_NV04(ctx->push, NV30_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (ctx->push, ctx->restart_index);
+ count--;
+ elts++;
+ }
+ }
+}
+
+static void
+emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
+{
+ uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
+
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size, nr;
+
+ nr = push;
+ if (ctx->primitive_restart)
+ nr = prim_restart_search_i16(elts, push, ctx->restart_index);
+
+ size = ctx->vertex_words * nr;
+
+ BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
+
+ ctx->translate->run_elts16(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
+
+ ctx->push->cur += size;
+ count -= nr;
+ elts += nr;
+
+ if (nr != push) {
+ BEGIN_NV04(ctx->push, NV30_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (ctx->push, ctx->restart_index);
+ count--;
+ elts++;
+ }
+ }
+}
+
+static void
+emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
+{
+ uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
+
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size, nr;
+
+ nr = push;
+ if (ctx->primitive_restart)
+ nr = prim_restart_search_i32(elts, push, ctx->restart_index);
+
+ size = ctx->vertex_words * nr;
+
+ BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
+
+ ctx->translate->run_elts(ctx->translate, elts, nr, 0, 0, ctx->push->cur);
+
+ ctx->push->cur += size;
+ count -= nr;
+ elts += nr;
+
+ if (nr != push) {
+ BEGIN_NV04(ctx->push, NV30_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (ctx->push, ctx->restart_index);
+ count--;
+ elts++;
+ }
+ }
+}
+
+static void
+emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
+{
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size = ctx->vertex_words * push;
+
+ BEGIN_NI04(ctx->push, NV30_3D(VERTEX_DATA), size);
+
+ ctx->translate->run(ctx->translate, start, push, 0, 0, ctx->push->cur);
+ ctx->push->cur += size;
+ count -= push;
+ start += push;
+ }
+}
+
+void
+nv30_push_vbo(struct nv30_context *nv30, const struct pipe_draw_info *info)
+{
+ struct push_context ctx;
+ unsigned i, index_size;
+ boolean apply_bias = info->indexed && info->index_bias;
+
+ ctx.push = nv30->base.pushbuf;
+ ctx.translate = nv30->vertex->translate;
+ ctx.packet_vertex_limit = nv30->vertex->vtx_per_packet_max;
+ ctx.vertex_words = nv30->vertex->vtx_size;
+
+ for (i = 0; i < nv30->num_vtxbufs; ++i) {
+ uint8_t *data;
+ struct pipe_vertex_buffer *vb = &nv30->vtxbuf[i];
+ struct nv04_resource *res = nv04_resource(vb->buffer);
+
+ if (!vb->buffer && !vb->user_buffer) {
+ continue;
+ }
+
+ data = nouveau_resource_map_offset(&nv30->base, res,
+ vb->buffer_offset, NOUVEAU_BO_RD);
+
+ if (apply_bias)
+ data += info->index_bias * vb->stride;
+
+ ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
+ }
+
+ if (info->indexed) {
+ if (nv30->idxbuf.buffer)
+ ctx.idxbuf = nouveau_resource_map_offset(&nv30->base,
+ nv04_resource(nv30->idxbuf.buffer), nv30->idxbuf.offset,
+ NOUVEAU_BO_RD);
+ else
+ ctx.idxbuf = nv30->idxbuf.user_buffer;
+ if (!ctx.idxbuf) {
+ nv30_state_release(nv30);
+ return;
+ }
+ index_size = nv30->idxbuf.index_size;
+ ctx.primitive_restart = info->primitive_restart;
+ ctx.restart_index = info->restart_index;
+ } else {
+ ctx.idxbuf = NULL;
+ index_size = 0;
+ ctx.primitive_restart = FALSE;
+ ctx.restart_index = 0;
+ }
+
+ if (nv30->screen->eng3d->oclass >= NV40_3D_CLASS) {
+ BEGIN_NV04(ctx.push, NV40_3D(PRIM_RESTART_ENABLE), 2);
+ PUSH_DATA (ctx.push, info->primitive_restart);
+ PUSH_DATA (ctx.push, info->restart_index);
+ nv30->state.prim_restart = info->primitive_restart;
+ }
+
+ ctx.prim = nv30_prim_gl(info->mode);
+
+ PUSH_RESET(ctx.push, BUFCTX_IDXBUF);
+ BEGIN_NV04(ctx.push, NV30_3D(VERTEX_BEGIN_END), 1);
+ PUSH_DATA (ctx.push, ctx.prim);
+ switch (index_size) {
+ case 0:
+ emit_vertices_seq(&ctx, info->start, info->count);
+ break;
+ case 1:
+ emit_vertices_i08(&ctx, info->start, info->count);
+ break;
+ case 2:
+ emit_vertices_i16(&ctx, info->start, info->count);
+ break;
+ case 4:
+ emit_vertices_i32(&ctx, info->start, info->count);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ BEGIN_NV04(ctx.push, NV30_3D(VERTEX_BEGIN_END), 1);
+ PUSH_DATA (ctx.push, NV30_3D_VERTEX_BEGIN_END_STOP);
+
+ if (info->indexed)
+ nouveau_resource_unmap(nv04_resource(nv30->idxbuf.buffer));
+
+ for (i = 0; i < nv30->num_vtxbufs; ++i) {
+ if (nv30->vtxbuf[i].buffer) {
+ nouveau_resource_unmap(nv04_resource(nv30->vtxbuf[i].buffer));
+ }
+ }
+
+ nv30_state_release(nv30);
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
new file mode 100644
index 00000000000..01b3817c45d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_screen.h"
+#include "nv30/nv30_context.h"
+
+/* Fetch the containing object of the first node on a list_head; used to
+ * evict the oldest query object when the hw query heap is exhausted.
+ */
+#define LIST_FIRST_ENTRY(__type, __item, __field) \
+   LIST_ENTRY(__type, (__item)->next, __field)
+
+/* One hardware query report slot inside the screen's notifier block. */
+struct nv30_query_object {
+   struct list_head list;   /* node on nv30_screen::queries, oldest first */
+   struct nouveau_heap *hw; /* allocation within the screen's query heap */
+};
+
+/* Resolve a query object to its CPU-visible notifier location inside the
+ * mapped notifier buffer.  Returns NULL when the object (or its hw slot)
+ * doesn't exist.
+ */
+static volatile void *
+nv30_ntfy(struct nv30_screen *screen, struct nv30_query_object *qo)
+{
+   struct nv04_notify *query = screen->query->data;
+   struct nouveau_bo *notify = screen->notify;
+
+   if (!qo || !qo->hw)
+      return NULL;
+
+   return (char *)notify->map + query->offset + qo->hw->start;
+}
+
+/* Release a query object's hw slot and free it, waiting first for any
+ * outstanding GPU report to land.  *po is cleared so callers can't reuse
+ * a stale pointer.
+ */
+static void
+nv30_query_object_del(struct nv30_screen *screen, struct nv30_query_object **po)
+{
+   struct nv30_query_object *qo = *po; *po = NULL;
+   if (qo) {
+      volatile uint32_t *ntfy = nv30_ntfy(screen, qo);
+      /* the GPU clears the top byte of dword 3 once the report is written.
+       * NOTE(review): this spins unboundedly -- assumes the query commands
+       * were already kicked to the hardware; confirm all call paths flush.
+       */
+      while (ntfy[3] & 0xff000000) {
+      }
+      nouveau_heap_free(&qo->hw);
+      LIST_DEL(&qo->list);
+      FREE(qo);
+   }
+}
+
+/* Allocate a query object backed by a 32-byte hw report slot.  Evicts the
+ * oldest outstanding query object (blocking on its completion) when the
+ * heap is full.  Returns NULL only on CPU allocation failure.
+ */
+static struct nv30_query_object *
+nv30_query_object_new(struct nv30_screen *screen)
+{
+   struct nv30_query_object *oq, *qo = CALLOC_STRUCT(nv30_query_object);
+   volatile uint32_t *ntfy;
+
+   if (!qo)
+      return NULL;
+
+   /* allocate a new hw query object, if no hw objects left we need to
+    * spin waiting for one to become free
+    */
+   while (nouveau_heap_alloc(screen->query_heap, 32, NULL, &qo->hw)) {
+      /* oldest object is at the head of the screen's list */
+      oq = LIST_FIRST_ENTRY(struct nv30_query_object, &screen->queries, list);
+      nv30_query_object_del(screen, &oq);
+   }
+
+   LIST_ADDTAIL(&qo->list, &screen->queries);
+
+   /* prime the notifier: top byte of dword 3 set marks it "in flight";
+    * the GPU clears it when writing the result
+    */
+   ntfy = nv30_ntfy(screen, qo);
+   ntfy[0] = 0x00000000;
+   ntfy[1] = 0x00000000;
+   ntfy[2] = 0x00000000;
+   ntfy[3] = 0x01000000;
+   return qo;
+}
+
+/* Driver-private query state behind a pipe_query handle. */
+struct nv30_query {
+   struct nv30_query_object *qo[2]; /* [0]=begin report, [1]=end report */
+   unsigned type;                   /* PIPE_QUERY_* / NV30_QUERY_ZCULL_* */
+   uint32_t report;                 /* hw report id for QUERY_GET */
+   uint32_t enable;                 /* 3d method toggling counting, or 0 */
+   uint64_t result;                 /* cached result from last readback */
+};
+
+/* pipe_query handles created by this driver are nv30_query underneath. */
+static INLINE struct nv30_query *
+nv30_query(struct pipe_query *pipe)
+{
+   struct nv30_query *q = (struct nv30_query *)pipe;
+   return q;
+}
+
+/* Create a query of the given type, selecting the hw report id and the
+ * (optional) counter-enable method it needs.  Unknown types yield NULL.
+ */
+static struct pipe_query *
+nv30_query_create(struct pipe_context *pipe, unsigned type)
+{
+   struct nv30_query *q = CALLOC_STRUCT(nv30_query);
+
+   if (!q)
+      return NULL;
+   q->type = type;
+
+   if (type == PIPE_QUERY_TIMESTAMP ||
+       type == PIPE_QUERY_TIME_ELAPSED) {
+      q->enable = 0x0000;
+      q->report = 1;
+   } else
+   if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
+      q->enable = NV30_3D_QUERY_ENABLE;
+      q->report = 1;
+   } else
+   if (type == NV30_QUERY_ZCULL_0 || type == NV30_QUERY_ZCULL_1 ||
+       type == NV30_QUERY_ZCULL_2 || type == NV30_QUERY_ZCULL_3) {
+      q->enable = 0x1804;
+      q->report = 2 + (type - NV30_QUERY_ZCULL_0);
+   } else {
+      FREE(q);
+      return NULL;
+   }
+
+   return (struct pipe_query *)q;
+}
+
+/* Destroy a query.  Releases any hardware query-object slots still held:
+ * without this, destroying a query between begin/end, or before its result
+ * was ever read, leaked both qo[] slots (and their heap space) forever.
+ */
+static void
+nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv30_query *q = nv30_query(pq);
+   struct nv30_screen *screen = nv30_screen(pipe->screen);
+
+   nv30_query_object_del(screen, &q->qo[0]);
+   nv30_query_object_del(screen, &q->qo[1]);
+   FREE(pq);
+}
+
+/* Begin a query: grab a start report for elapsed-time queries, reset the
+ * hw counter for counting queries, and turn on counting if needed.
+ */
+static void
+nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct nv30_query *q = nv30_query(pq);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   switch (q->type) {
+   case PIPE_QUERY_TIME_ELAPSED:
+      /* take a start timestamp; the end timestamp is taken in
+       * nv30_query_end() and the result is the difference
+       */
+      q->qo[0] = nv30_query_object_new(nv30->screen);
+      if (q->qo[0]) {
+         BEGIN_NV04(push, NV30_3D(QUERY_GET), 1);
+         PUSH_DATA (push, (q->report << 24) | q->qo[0]->hw->start);
+      }
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      /* timestamps need a single report, emitted at end_query only */
+      return;
+   default:
+      /* counting queries (occlusion/zcull): clear the hw counter */
+      BEGIN_NV04(push, NV30_3D(QUERY_RESET), 1);
+      PUSH_DATA (push, q->report);
+      break;
+   }
+
+   if (q->enable) {
+      BEGIN_NV04(push, SUBC_3D(q->enable), 1);
+      PUSH_DATA (push, 1);
+   }
+}
+
+/* End a query: request the end-of-query report into a fresh hw slot,
+ * disable counting if it was enabled, and kick the pushbuf so the GPU
+ * actually produces the report.
+ */
+static void
+nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct nv30_screen *screen = nv30->screen;
+   struct nv30_query *q = nv30_query(pq);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   q->qo[1] = nv30_query_object_new(screen);
+   if (q->qo[1]) {
+      BEGIN_NV04(push, NV30_3D(QUERY_GET), 1);
+      PUSH_DATA (push, (q->report << 24) | q->qo[1]->hw->start);
+   }
+
+   if (q->enable) {
+      BEGIN_NV04(push, SUBC_3D(q->enable), 1);
+      PUSH_DATA (push, 0);
+   }
+   /* flush so the report lands without waiting for an unrelated kick */
+   PUSH_KICK (push);
+}
+
+/* Read back a query result.  Returns FALSE if the report hasn't landed
+ * and the caller didn't ask to wait; otherwise caches the 64-bit result
+ * and releases the hw query slots.
+ */
+static boolean
+nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+                  boolean wait, union pipe_query_result *result)
+{
+   struct nv30_screen *screen = nv30_screen(pipe->screen);
+   struct nv30_query *q = nv30_query(pq);
+   volatile uint32_t *ntfy0 = nv30_ntfy(screen, q->qo[0]);
+   volatile uint32_t *ntfy1 = nv30_ntfy(screen, q->qo[1]);
+   uint64_t *res64 = &result->u64;
+
+   if (ntfy1) {
+      /* GPU clears the top byte of dword 3 when the report is written */
+      while (ntfy1[3] & 0xff000000) {
+         if (!wait)
+            return FALSE;
+      }
+
+      switch (q->type) {
+      case PIPE_QUERY_TIMESTAMP:
+         q->result = *(uint64_t *)&ntfy1[0];
+         break;
+      case PIPE_QUERY_TIME_ELAPSED:
+         /* NOTE(review): assumes qo[0] was allocated in begin_query;
+          * ntfy0 is NULL if that allocation failed -- confirm
+          */
+         q->result = *(uint64_t *)&ntfy1[0] - *(uint64_t *)&ntfy0[0];
+         break;
+      default:
+         q->result = ntfy1[2];
+         break;
+      }
+
+      /* result is cached; hw slots can be recycled now */
+      nv30_query_object_del(screen, &q->qo[0]);
+      nv30_query_object_del(screen, &q->qo[1]);
+   }
+
+   *res64 = q->result;
+   return TRUE;
+}
+
+/* Conditional rendering (nv40+ only): point the hardware at the end-of-
+ * query report so subsequent rendering is discarded/kept based on it.
+ * NOTE(review): methods 0x1e98/0x0110 are undocumented here -- semantics
+ * inferred from usage; confirm against hw docs.
+ */
+static void
+nv40_query_render_condition(struct pipe_context *pipe,
+                            struct pipe_query *pq,
+                            boolean condition, uint mode)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct nv30_query *q = nv30_query(pq);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   /* remember the condition so state validation can re-emit it */
+   nv30->render_cond_query = pq;
+   nv30->render_cond_mode = mode;
+   nv30->render_cond_cond = condition;
+
+   if (!pq) {
+      /* no query: render unconditionally */
+      BEGIN_NV04(push, SUBC_3D(0x1e98), 1);
+      PUSH_DATA (push, 0x01000000);
+      return;
+   }
+
+   if (mode == PIPE_RENDER_COND_WAIT ||
+       mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
+      BEGIN_NV04(push, SUBC_3D(0x0110), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   BEGIN_NV04(push, SUBC_3D(0x1e98), 1);
+   PUSH_DATA (push, 0x02000000 | q->qo[1]->hw->start);
+}
+
+/* Hook the query entry points into the context; conditional rendering is
+ * only wired up on nv40-class hardware.
+ */
+void
+nv30_query_init(struct pipe_context *pipe)
+{
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
+
+   pipe->create_query = nv30_query_create;
+   pipe->destroy_query = nv30_query_destroy;
+   pipe->begin_query = nv30_query_begin;
+   pipe->end_query = nv30_query_end;
+   pipe->get_query_result = nv30_query_result;
+   if (eng3d->oclass >= NV40_3D_CLASS)
+      pipe->render_condition = nv40_query_render_condition;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
new file mode 100644
index 00000000000..c99db1ce91b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+
+#include "nv30/nv30_screen.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_resource.h"
+#include "nv30/nv30_transfer.h"
+
+/* Resource creation: buffers go through the shared nouveau buffer code,
+ * everything else (textures, render targets) becomes a miptree.
+ */
+static struct pipe_resource *
+nv30_resource_create(struct pipe_screen *pscreen,
+                     const struct pipe_resource *tmpl)
+{
+   if (tmpl->target == PIPE_BUFFER)
+      return nouveau_buffer_create(pscreen, tmpl);
+   return nv30_miptree_create(pscreen, tmpl);
+}
+
+/* Import a shared resource from a winsys handle.  Only miptrees can be
+ * imported; buffers are rejected.
+ */
+static struct pipe_resource *
+nv30_resource_from_handle(struct pipe_screen *pscreen,
+                          const struct pipe_resource *tmpl,
+                          struct winsys_handle *handle)
+{
+   switch (tmpl->target) {
+   case PIPE_BUFFER:
+      return NULL;
+   default:
+      return nv30_miptree_from_handle(pscreen, tmpl, handle);
+   }
+}
+
+/* Install resource hooks on the screen; handle export/destroy go through
+ * the generic u_resource vtable dispatchers.
+ */
+void
+nv30_resource_screen_init(struct pipe_screen *pscreen)
+{
+   pscreen->resource_create = nv30_resource_create;
+   pscreen->resource_from_handle = nv30_resource_from_handle;
+   pscreen->resource_get_handle = u_resource_get_handle_vtbl;
+   pscreen->resource_destroy = u_resource_destroy_vtbl;
+}
+
+/* Install per-context resource hooks: transfers dispatch through the
+ * generic u_transfer vtable, surfaces/copies/blits are nv30-specific.
+ */
+void
+nv30_resource_init(struct pipe_context *pipe)
+{
+   pipe->transfer_map = u_transfer_map_vtbl;
+   pipe->transfer_flush_region = u_transfer_flush_region_vtbl;
+   pipe->transfer_unmap = u_transfer_unmap_vtbl;
+   pipe->transfer_inline_write = u_transfer_inline_write_vtbl;
+   pipe->create_surface = nv30_miptree_surface_new;
+   pipe->surface_destroy = nv30_miptree_surface_del;
+   pipe->resource_copy_region = nv30_resource_copy_region;
+   pipe->blit = nv30_blit;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.h b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
new file mode 100644
index 00000000000..aff41966b3c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.h
@@ -0,0 +1,75 @@
+#ifndef __NV30_RESOURCE_H__
+#define __NV30_RESOURCE_H__
+
+#include "nouveau_buffer.h"
+
+void nv30_resource_screen_init(struct pipe_screen *);
+void nv30_resource_init(struct pipe_context *);
+
+/* Driver-private surface: pipe_surface plus the hw layout parameters
+ * needed to bind it as a render target / blit target.
+ */
+struct nv30_surface {
+   struct pipe_surface base;
+   uint32_t offset;  /* byte offset into the backing bo */
+   uint32_t pitch;   /* bytes per row */
+   uint32_t width;
+   uint16_t height;
+   uint16_t depth;
+};
+
+/* pipe_surface objects created by this driver are nv30_surface. */
+static INLINE struct nv30_surface *
+nv30_surface(struct pipe_surface *ps)
+{
+   struct nv30_surface *ns = (struct nv30_surface *)ps;
+   return ns;
+}
+
+/* Per-mip-level layout within a miptree's backing storage. */
+struct nv30_miptree_level {
+   uint32_t offset;      /* byte offset of this level */
+   uint32_t pitch;       /* bytes per row at this level */
+   uint32_t zslice_size; /* bytes per depth slice (3D textures) */
+};
+
+/* Texture/render-target resource.  level[] is sized for the 13 mip levels
+ * advertised via PIPE_CAP_MAX_TEXTURE_2D_LEVELS.
+ */
+struct nv30_miptree {
+   struct nv04_resource base;
+   struct nv30_miptree_level level[13];
+   uint32_t uniform_pitch; /* non-zero for linear (unswizzled) layouts */
+   uint32_t layer_size;    /* bytes per array layer / cube face */
+   boolean swizzled;
+   unsigned ms_mode;       /* multisample mode */
+   unsigned ms_x:1;        /* log2 sample spread in x */
+   unsigned ms_y:1;        /* log2 sample spread in y */
+};
+
+/* Non-buffer pipe_resources created by this driver are nv30_miptree. */
+static INLINE struct nv30_miptree *
+nv30_miptree(struct pipe_resource *pt)
+{
+   struct nv30_miptree *mt = (struct nv30_miptree *)pt;
+   return mt;
+}
+
+struct pipe_resource *
+nv30_miptree_create(struct pipe_screen *, const struct pipe_resource *);
+
+struct pipe_resource *
+nv30_miptree_from_handle(struct pipe_screen *, const struct pipe_resource *,
+ struct winsys_handle *);
+
+struct pipe_surface *
+nv30_miptree_surface_new(struct pipe_context *, struct pipe_resource *,
+ const struct pipe_surface *);
+
+void
+nv30_miptree_surface_del(struct pipe_context *, struct pipe_surface *);
+
+void
+nv30_resource_copy_region(struct pipe_context *pipe,
+ struct pipe_resource *dst, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box);
+
+void
+nv30_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *);
+
+void
+nv30_blit(struct pipe_context *pipe,
+ const struct pipe_blit_info *blit_info);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
new file mode 100644
index 00000000000..50ddfec3b21
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -0,0 +1,588 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+
+#include "nv_object.xml.h"
+#include "nv_m2mf.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv01_2d.xml.h"
+
+#include "nouveau_fence.h"
+#include "nv30/nv30_screen.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_resource.h"
+#include "nv30/nv30_format.h"
+
+#define RANKINE_0397_CHIPSET 0x00000003
+#define RANKINE_0497_CHIPSET 0x000001e0
+#define RANKINE_0697_CHIPSET 0x00000010
+#define CURIE_4097_CHIPSET 0x00000baf
+#define CURIE_4497_CHIPSET 0x00005450
+#define CURIE_4497_CHIPSET6X 0x00000088
+
+/* Report integer/boolean capabilities.  Several caps differ between the
+ * nv30 (Rankine) and nv40 (Curie) 3d classes, keyed off eng3d->oclass.
+ */
+static int
+nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+   struct nouveau_object *eng3d = screen->eng3d;
+
+   switch (param) {
+   /* non-boolean capabilities */
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return (eng3d->oclass >= NV40_3D_CLASS) ? 4 : 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return 13;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 10;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 13;
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 16;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL:
+      return 120;
+   /* supported capabilities */
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_SCALED_RESOLVE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+   case PIPE_CAP_QUERY_TIMESTAMP:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+   case PIPE_CAP_TGSI_TEXCOORD:
+   case PIPE_CAP_USER_CONSTANT_BUFFERS:
+   case PIPE_CAP_USER_INDEX_BUFFERS:
+      return 1;
+   case PIPE_CAP_USER_VERTEX_BUFFERS:
+      return 0;
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+      return 16;
+   /* nv4x capabilities */
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_CONDITIONAL_RENDER:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_PRIMITIVE_RESTART:
+      return (eng3d->oclass >= NV40_3D_CLASS) ? 1 : 0;
+   /* unsupported */
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+   case PIPE_CAP_SM3:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: /* XXX: yes? */
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+   case PIPE_CAP_MIN_TEXEL_OFFSET:
+   case PIPE_CAP_MAX_TEXEL_OFFSET:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+   case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+   case PIPE_CAP_TEXTURE_BARRIER:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+   case PIPE_CAP_START_INSTANCE:
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+      return 0;
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return 1;
+   case PIPE_CAP_ENDIANNESS:
+      return PIPE_ENDIAN_LITTLE;
+   default:
+      debug_printf("unknown param %d\n", param);
+      return 0;
+   }
+}
+
+/* Report float capabilities; anisotropy is the only class-dependent one. */
+static float
+nv30_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+   struct nouveau_object *eng3d = screen->eng3d;
+
+   switch (param) {
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      return 10.0;
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return 64.0;
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return (eng3d->oclass >= NV40_3D_CLASS) ? 16.0 : 8.0;
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 15.0;
+   default:
+      debug_printf("unknown paramf %d\n", param);
+      return 0;
+   }
+}
+
+/* Report per-shader-stage limits.  Only vertex and fragment stages exist
+ * on this hardware; limits differ between nv30 and nv40 classes.
+ */
+static int
+nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
+                             enum pipe_shader_cap param)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+   struct nouveau_object *eng3d = screen->eng3d;
+
+   switch (shader) {
+   case PIPE_SHADER_VERTEX:
+      switch (param) {
+      case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 512 : 256;
+      case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+         /* nv30 vertex shaders can't sample textures at all */
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 512 : 0;
+      case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+         return 0;
+      case PIPE_SHADER_CAP_MAX_INPUTS:
+         return 16;
+      case PIPE_SHADER_CAP_MAX_CONSTS:
+         /* 6 constant slots are reserved for user clip planes */
+         return (eng3d->oclass >= NV40_3D_CLASS) ? (468 - 6): (256 - 6);
+      case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+         return 1;
+      case PIPE_SHADER_CAP_MAX_TEMPS:
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 32 : 13;
+      case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+         return 0;
+      case PIPE_SHADER_CAP_MAX_ADDRS:
+         return 2;
+      case PIPE_SHADER_CAP_MAX_PREDS:
+      case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+      case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      case PIPE_SHADER_CAP_SUBROUTINES:
+      case PIPE_SHADER_CAP_INTEGERS:
+         return 0;
+      default:
+         debug_printf("unknown vertex shader param %d\n", param);
+         return 0;
+      }
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      switch (param) {
+      case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+         return 4096;
+      case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+         return 0;
+      case PIPE_SHADER_CAP_MAX_INPUTS:
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 12 : 10;
+      case PIPE_SHADER_CAP_MAX_CONSTS:
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 224 : 32;
+      case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+         return 1;
+      case PIPE_SHADER_CAP_MAX_TEMPS:
+         return 32;
+      case PIPE_SHADER_CAP_MAX_ADDRS:
+         return (eng3d->oclass >= NV40_3D_CLASS) ? 1 : 0;
+      case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+         return 16;
+      case PIPE_SHADER_CAP_MAX_PREDS:
+      case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+      case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      case PIPE_SHADER_CAP_SUBROUTINES:
+         return 0;
+      default:
+         debug_printf("unknown fragment shader param %d\n", param);
+         return 0;
+      }
+      break;
+   default:
+      return 0;
+   }
+}
+
+/* Check whether a format/target/bindings combination is renderable or
+ * sampleable.  Sample counts 0, 1, 2 and 4 are accepted (mask 0x17).
+ */
+static boolean
+nv30_screen_is_format_supported(struct pipe_screen *pscreen,
+                                enum pipe_format format,
+                                enum pipe_texture_target target,
+                                unsigned sample_count,
+                                unsigned bindings)
+{
+   if (sample_count > 4)
+      return FALSE;
+   if (!(0x00000017 & (1 << sample_count)))
+      return FALSE;
+
+   if (!util_format_is_supported(format, bindings)) {
+      return FALSE;
+   }
+
+   /* transfers & shared are always supported */
+   bindings &= ~(PIPE_BIND_TRANSFER_READ |
+                 PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_SHARED);
+
+   /* remaining bindings must all be present in the format table entry */
+   return (nv30_format_info(pscreen, format)->bindings & bindings) == bindings;
+}
+
+/* Emit a fence: the 3d engine writes the new sequence number into the
+ * DMA_FENCE notifier at offset 0 when it reaches this point.
+ */
+static void
+nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+
+   *sequence = ++screen->base.fence.sequence;
+
+   BEGIN_NV04(push, NV30_3D(FENCE_OFFSET), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, *sequence);
+}
+
+/* Read back the last fence sequence the GPU has written. */
+static uint32_t
+nv30_screen_fence_update(struct pipe_screen *pscreen)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+   struct nv04_notify *fence = screen->fence->data;
+   return *(uint32_t *)((char *)screen->notify->map + fence->offset);
+}
+
+/* Tear down the screen: drain any emitted fence, release all channel
+ * objects, then the base screen.  Also used as the error-path cleanup
+ * by FAIL_SCREEN_INIT, so it must tolerate partially-initialized state.
+ */
+static void
+nv30_screen_destroy(struct pipe_screen *pscreen)
+{
+   struct nv30_screen *screen = nv30_screen(pscreen);
+
+   if (screen->base.fence.current &&
+       screen->base.fence.current->state >= NOUVEAU_FENCE_STATE_EMITTED) {
+      nouveau_fence_wait(screen->base.fence.current);
+      nouveau_fence_ref (NULL, &screen->base.fence.current);
+   }
+
+   nouveau_object_del(&screen->query);
+   nouveau_object_del(&screen->fence);
+   nouveau_object_del(&screen->ntfy);
+
+   nouveau_object_del(&screen->sifm);
+   nouveau_object_del(&screen->swzsurf);
+   nouveau_object_del(&screen->surf2d);
+   nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->eng3d);
+   nouveau_object_del(&screen->null);
+
+   nouveau_screen_fini(&screen->base);
+   FREE(screen);
+}
+
+/* Error-exit helper for nv30_screen_create(): log, destroy the partially
+ * built screen and bail.  do/while(0) keeps it a single statement.
+ */
+#define FAIL_SCREEN_INIT(str, err)                    \
+   do {                                               \
+      NOUVEAU_ERR(str, err);                          \
+      nv30_screen_destroy(pscreen);                   \
+      return NULL;                                    \
+   } while(0)
+
+/* Create an nv30/nv40 screen: pick the 3d class from the chipset id,
+ * allocate channel objects (fence/notifier/query/3d/m2mf/surface/sifm)
+ * and emit the one-time hardware setup.  Ordering of object allocation
+ * matters (see DMA_FENCE comment below).
+ */
+struct pipe_screen *
+nv30_screen_create(struct nouveau_device *dev)
+{
+   struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+   struct pipe_screen *pscreen;
+   struct nouveau_pushbuf *push;
+   struct nv04_fifo *fifo;
+   unsigned oclass = 0;
+   int ret, i;
+
+   if (!screen)
+      return NULL;
+
+   /* map the chipset id to a 3d engine class via the *_CHIPSET bitmasks */
+   switch (dev->chipset & 0xf0) {
+   case 0x30:
+      if (RANKINE_0397_CHIPSET & (1 << (dev->chipset & 0x0f)))
+         oclass = NV30_3D_CLASS;
+      else
+      if (RANKINE_0697_CHIPSET & (1 << (dev->chipset & 0x0f)))
+         oclass = NV34_3D_CLASS;
+      else
+      if (RANKINE_0497_CHIPSET & (1 << (dev->chipset & 0x0f)))
+         oclass = NV35_3D_CLASS;
+      break;
+   case 0x40:
+      if (CURIE_4097_CHIPSET & (1 << (dev->chipset & 0x0f)))
+         oclass = NV40_3D_CLASS;
+      else
+      if (CURIE_4497_CHIPSET & (1 << (dev->chipset & 0x0f)))
+         oclass = NV44_3D_CLASS;
+      break;
+   case 0x60:
+      if (CURIE_4497_CHIPSET6X & (1 << (dev->chipset & 0x0f)))
+         oclass = NV44_3D_CLASS;
+      break;
+   default:
+      break;
+   }
+
+   if (!oclass) {
+      NOUVEAU_ERR("unknown 3d class for 0x%02x\n", dev->chipset);
+      FREE(screen);
+      return NULL;
+   }
+
+   pscreen = &screen->base.base;
+   pscreen->destroy = nv30_screen_destroy;
+   pscreen->get_param = nv30_screen_get_param;
+   pscreen->get_paramf = nv30_screen_get_paramf;
+   pscreen->get_shader_param = nv30_screen_get_shader_param;
+   pscreen->context_create = nv30_context_create;
+   pscreen->is_format_supported = nv30_screen_is_format_supported;
+   nv30_resource_screen_init(pscreen);
+   nouveau_screen_init_vdec(&screen->base);
+
+   screen->base.fence.emit = nv30_screen_fence_emit;
+   screen->base.fence.update = nv30_screen_fence_update;
+
+   ret = nouveau_screen_init(&screen->base, dev);
+   if (ret)
+      FAIL_SCREEN_INIT("nv30_screen_init failed: %d\n", ret);
+
+   screen->base.vidmem_bindings |= PIPE_BIND_VERTEX_BUFFER;
+   screen->base.sysmem_bindings |= PIPE_BIND_VERTEX_BUFFER;
+   if (oclass == NV40_3D_CLASS) {
+      screen->base.vidmem_bindings |= PIPE_BIND_INDEX_BUFFER;
+      screen->base.sysmem_bindings |= PIPE_BIND_INDEX_BUFFER;
+   }
+
+   fifo = screen->base.channel->data;
+   push = screen->base.pushbuf;
+   push->rsvd_kick = 16;
+
+   ret = nouveau_object_new(screen->base.channel, 0x00000000, NV01_NULL_CLASS,
+                            NULL, 0, &screen->null);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating null object: %d\n", ret);
+
+   /* DMA_FENCE refuses to accept DMA objects with "adjust" filled in,
+    * this means that the address pointed at by the DMA object must
+    * be 4KiB aligned, which means this object needs to be the first
+    * one allocated on the channel.
+    */
+   ret = nouveau_object_new(screen->base.channel, 0xbeef1e00,
+                            NOUVEAU_NOTIFIER_CLASS, &(struct nv04_notify) {
+                            .length = 32 }, sizeof(struct nv04_notify),
+                            &screen->fence);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating fence notifier: %d\n", ret);
+
+   /* DMA_NOTIFY object, we don't actually use this but M2MF fails without */
+   ret = nouveau_object_new(screen->base.channel, 0xbeef0301,
+                            NOUVEAU_NOTIFIER_CLASS, &(struct nv04_notify) {
+                            .length = 32 }, sizeof(struct nv04_notify),
+                            &screen->ntfy);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating sync notifier: %d\n", ret);
+
+   /* DMA_QUERY, used to implement occlusion queries, we attempt to allocate
+    * the remainder of the "notifier block" assigned by the kernel for
+    * use as query objects
+    */
+   ret = nouveau_object_new(screen->base.channel, 0xbeef0351,
+                            NOUVEAU_NOTIFIER_CLASS, &(struct nv04_notify) {
+                            .length = 4096 - 128 }, sizeof(struct nv04_notify),
+                            &screen->query);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating query notifier: %d\n", ret);
+
+   ret = nouveau_heap_init(&screen->query_heap, 0, 4096 - 128);
+   if (ret)
+      FAIL_SCREEN_INIT("error creating query heap: %d\n", ret);
+
+   LIST_INITHEAD(&screen->queries);
+
+   /* Vertex program resources (code/data), currently 6 of the constant
+    * slots are reserved to implement user clipping planes
+    */
+   if (oclass < NV40_3D_CLASS) {
+      nouveau_heap_init(&screen->vp_exec_heap, 0, 256);
+      nouveau_heap_init(&screen->vp_data_heap, 6, 256 - 6);
+   } else {
+      nouveau_heap_init(&screen->vp_exec_heap, 0, 512);
+      nouveau_heap_init(&screen->vp_data_heap, 6, 468 - 6);
+   }
+
+   ret = nouveau_bo_wrap(screen->base.device, fifo->notify, &screen->notify);
+   if (ret == 0)
+      nouveau_bo_map(screen->notify, 0, screen->base.client);
+   if (ret)
+      FAIL_SCREEN_INIT("error mapping notifier memory: %d\n", ret);
+
+   ret = nouveau_object_new(screen->base.channel, 0xbeef3097, oclass,
+                            NULL, 0, &screen->eng3d);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating 3d object: %d\n", ret);
+
+   /* bind the 3d engine and its DMA objects to the subchannel */
+   BEGIN_NV04(push, NV01_SUBC(3D, OBJECT), 1);
+   PUSH_DATA (push, screen->eng3d->handle);
+   BEGIN_NV04(push, NV30_3D(DMA_NOTIFY), 13);
+   PUSH_DATA (push, screen->ntfy->handle);
+   PUSH_DATA (push, fifo->vram); /* TEXTURE0 */
+   PUSH_DATA (push, fifo->gart); /* TEXTURE1 */
+   PUSH_DATA (push, fifo->vram); /* COLOR1 */
+   PUSH_DATA (push, screen->null->handle);  /* UNK190 */
+   PUSH_DATA (push, fifo->vram); /* COLOR0 */
+   PUSH_DATA (push, fifo->vram); /* ZETA */
+   PUSH_DATA (push, fifo->vram); /* VTXBUF0 */
+   PUSH_DATA (push, fifo->gart); /* VTXBUF1 */
+   PUSH_DATA (push, screen->fence->handle);  /* FENCE */
+   PUSH_DATA (push, screen->query->handle);  /* QUERY - intr 0x80 if nullobj */
+   PUSH_DATA (push, screen->null->handle);  /* UNK1AC */
+   PUSH_DATA (push, screen->null->handle);  /* UNK1B0 */
+   if (screen->eng3d->oclass < NV40_3D_CLASS) {
+      /* nv30-class one-time setup (methods largely undocumented) */
+      BEGIN_NV04(push, SUBC_3D(0x03b0), 1);
+      PUSH_DATA (push, 0x00100000);
+      BEGIN_NV04(push, SUBC_3D(0x1d80), 1);
+      PUSH_DATA (push, 3);
+
+      BEGIN_NV04(push, SUBC_3D(0x1e98), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, SUBC_3D(0x17e0), 3);
+      PUSH_DATA (push, fui(0.0));
+      PUSH_DATA (push, fui(0.0));
+      PUSH_DATA (push, fui(1.0));
+      BEGIN_NV04(push, SUBC_3D(0x1f80), 16);
+      for (i = 0; i < 16; i++)
+         PUSH_DATA (push, (i == 8) ? 0x0000ffff : 0);
+
+      BEGIN_NV04(push, NV30_3D(RC_ENABLE), 1);
+      PUSH_DATA (push, 0);
+   } else {
+      /* nv40-class one-time setup */
+      BEGIN_NV04(push, NV40_3D(DMA_COLOR2), 2);
+      PUSH_DATA (push, fifo->vram);
+      PUSH_DATA (push, fifo->vram); /* COLOR3 */
+
+      BEGIN_NV04(push, SUBC_3D(0x1450), 1);
+      PUSH_DATA (push, 0x00000004);
+
+      BEGIN_NV04(push, SUBC_3D(0x1ea4), 3); /* ZCULL */
+      PUSH_DATA (push, 0x00000010);
+      PUSH_DATA (push, 0x01000100);
+      PUSH_DATA (push, 0xff800006);
+
+      /* vtxprog output routing */
+      BEGIN_NV04(push, SUBC_3D(0x1fc4), 1);
+      PUSH_DATA (push, 0x06144321);
+      BEGIN_NV04(push, SUBC_3D(0x1fc8), 2);
+      PUSH_DATA (push, 0xedcba987);
+      PUSH_DATA (push, 0x0000006f);
+      BEGIN_NV04(push, SUBC_3D(0x1fd0), 1);
+      PUSH_DATA (push, 0x00171615);
+      BEGIN_NV04(push, SUBC_3D(0x1fd4), 1);
+      PUSH_DATA (push, 0x001b1a19);
+
+      BEGIN_NV04(push, SUBC_3D(0x1ef8), 1);
+      PUSH_DATA (push, 0x0020ffff);
+      BEGIN_NV04(push, SUBC_3D(0x1d64), 1);
+      PUSH_DATA (push, 0x01d300d4);
+
+      BEGIN_NV04(push, NV40_3D(MIPMAP_ROUNDING), 1);
+      PUSH_DATA (push, NV40_3D_MIPMAP_ROUNDING_MODE_DOWN);
+   }
+
+   /* M2MF engine, used for buffer/texture transfers */
+   ret = nouveau_object_new(screen->base.channel, 0xbeef3901, NV03_M2MF_CLASS,
+                            NULL, 0, &screen->m2mf);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating m2mf object: %d\n", ret);
+
+   BEGIN_NV04(push, NV01_SUBC(M2MF, OBJECT), 1);
+   PUSH_DATA (push, screen->m2mf->handle);
+   BEGIN_NV04(push, NV03_M2MF(DMA_NOTIFY), 1);
+   PUSH_DATA (push, screen->ntfy->handle);
+
+   /* 2d surface setup object */
+   ret = nouveau_object_new(screen->base.channel, 0xbeef6201,
+                            NV10_SURFACE_2D_CLASS, NULL, 0, &screen->surf2d);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating surf2d object: %d\n", ret);
+
+   BEGIN_NV04(push, NV01_SUBC(SF2D, OBJECT), 1);
+   PUSH_DATA (push, screen->surf2d->handle);
+   BEGIN_NV04(push, NV04_SF2D(DMA_NOTIFY), 1);
+   PUSH_DATA (push, screen->ntfy->handle);
+
+   /* swizzled surface object, class depends on generation */
+   if (dev->chipset < 0x40)
+      oclass = NV30_SURFACE_SWZ_CLASS;
+   else
+      oclass = NV40_SURFACE_SWZ_CLASS;
+
+   ret = nouveau_object_new(screen->base.channel, 0xbeef5201, oclass,
+                            NULL, 0, &screen->swzsurf);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating swizzled surface object: %d\n", ret);
+
+   BEGIN_NV04(push, NV01_SUBC(SSWZ, OBJECT), 1);
+   PUSH_DATA (push, screen->swzsurf->handle);
+   BEGIN_NV04(push, NV04_SSWZ(DMA_NOTIFY), 1);
+   PUSH_DATA (push, screen->ntfy->handle);
+
+   /* scaled image from memory object, used for blits */
+   if (dev->chipset < 0x40)
+      oclass = NV30_SIFM_CLASS;
+   else
+      oclass = NV40_SIFM_CLASS;
+
+   ret = nouveau_object_new(screen->base.channel, 0xbeef7701, oclass,
+                            NULL, 0, &screen->sifm);
+   if (ret)
+      FAIL_SCREEN_INIT("error allocating scaled image object: %d\n", ret);
+
+   BEGIN_NV04(push, NV01_SUBC(SIFM, OBJECT), 1);
+   PUSH_DATA (push, screen->sifm->handle);
+   BEGIN_NV04(push, NV03_SIFM(DMA_NOTIFY), 1);
+   PUSH_DATA (push, screen->ntfy->handle);
+   BEGIN_NV04(push, NV05_SIFM(COLOR_CONVERSION), 1);
+   PUSH_DATA (push, NV05_SIFM_COLOR_CONVERSION_TRUNCATE);
+
+   nouveau_pushbuf_kick(push, push->channel);
+
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+   return pscreen;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.h b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
new file mode 100644
index 00000000000..c4c3aae52d9
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.h
@@ -0,0 +1,51 @@
+#ifndef __NV30_SCREEN_H__
+#define __NV30_SCREEN_H__
+
+#include <stdio.h>
+
+/* Debug/error printf helper: prefixes the message with the calling function
+ * name and line number.  Note the trailing ';' in the expansion. */
+#define NOUVEAU_ERR(fmt, args...) \
+   fprintf(stderr, "%s:%d - "fmt, __FUNCTION__, __LINE__, ##args);
+
+#include "util/u_double_list.h"
+
+#include "nouveau_screen.h"
+#include "nouveau_fence.h"
+#include "nouveau_heap.h"
+#include "nv30/nv30_winsys.h"
+#include "nv30/nv30_resource.h"
+
+struct nv30_context;
+
+/* Per-device screen state for the nv30/nv40 driver: the channel-global
+ * hardware objects created at screen init, plus heaps shared by contexts. */
+struct nv30_screen {
+   struct nouveau_screen base;
+
+   /* context whose state currently owns the hardware channel */
+   struct nv30_context *cur_ctx;
+
+   /* notifier buffer backing 'ntfy' */
+   struct nouveau_bo *notify;
+
+   struct nouveau_object *ntfy;
+   struct nouveau_object *fence;
+
+   /* query object plus heap/list used to allocate query slots */
+   struct nouveau_object *query;
+   struct nouveau_heap *query_heap;
+   struct list_head queries;
+
+   /* engine objects bound to fixed subchannels at screen creation */
+   struct nouveau_object *null;
+   struct nouveau_object *eng3d;
+   struct nouveau_object *m2mf;
+   struct nouveau_object *surf2d;
+   struct nouveau_object *swzsurf;
+   struct nouveau_object *sifm;
+
+   /*XXX: nvfx state */
+   /* heaps carving up vertex program instruction and constant space */
+   struct nouveau_heap *vp_exec_heap;
+   struct nouveau_heap *vp_data_heap;
+};
+
+/* Downcast a pipe_screen to the driver's nv30_screen (base is first member). */
+static INLINE struct nv30_screen *
+nv30_screen(struct pipe_screen *pscreen)
+{
+   return (struct nv30_screen *)pscreen;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c
new file mode 100644
index 00000000000..64be1b78f79
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+
+#include "nouveau_gldefs.h"
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_winsys.h"
+
+#define NV40_3D_MRT_BLEND_ENABLE 0x0000036c
+
+/* Translate a gallium blend CSO into a canned NV30/NV40 method stream,
+ * recorded into the stateobj's data[] for replay at validate time.
+ * Returns the new stateobj, or NULL on allocation failure.
+ */
+static void *
+nv30_blend_state_create(struct pipe_context *pipe,
+                        const struct pipe_blend_state *cso)
+{
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
+   struct nv30_blend_stateobj *so;
+   uint32_t blend[2], cmask[2];
+   int i;
+
+   so = CALLOC_STRUCT(nv30_blend_stateobj);
+   if (!so)
+      return NULL;
+   so->pipe = *cso;
+
+   if (cso->logicop_enable) {
+      SB_MTHD30(so, COLOR_LOGIC_OP_ENABLE, 2);
+      SB_DATA  (so, 1);
+      SB_DATA  (so, nvgl_logicop_func(cso->logicop_func));
+   } else {
+      SB_MTHD30(so, COLOR_LOGIC_OP_ENABLE, 1);
+      SB_DATA  (so, 0);
+   }
+
+   SB_MTHD30(so, DITHER_ENABLE, 1);
+   SB_DATA  (so, cso->dither);
+
+   /* blend[0]/cmask[0]: RT0 enable and ARGB write mask (hardware layout:
+    * A<<24 | R<<16 | G<<8 | B). blend[1]/cmask[1] pack RTs 1..3 for NV40. */
+   blend[0] = cso->rt[0].blend_enable;
+   cmask[0] = !!(cso->rt[0].colormask & PIPE_MASK_A) << 24 |
+              !!(cso->rt[0].colormask & PIPE_MASK_R) << 16 |
+              !!(cso->rt[0].colormask & PIPE_MASK_G) <<  8 |
+              !!(cso->rt[0].colormask & PIPE_MASK_B);
+   if (cso->independent_blend_enable) {
+      blend[1] = 0;
+      cmask[1] = 0;
+      for (i = 1; i < 4; i++) {
+         blend[1] |= cso->rt[i].blend_enable << i;
+         cmask[1] |= !!(cso->rt[i].colormask & PIPE_MASK_A) << (0 + (i * 4)) |
+                     !!(cso->rt[i].colormask & PIPE_MASK_R) << (1 + (i * 4)) |
+                     !!(cso->rt[i].colormask & PIPE_MASK_G) << (2 + (i * 4)) |
+                     !!(cso->rt[i].colormask & PIPE_MASK_B) << (3 + (i * 4));
+      }
+   } else {
+      /* replicate RT0 state to RTs 1..3 via bit-pattern multiplication */
+      blend[1]  = 0x0000000e * (blend[0] & 0x00000001);
+      cmask[1]  = 0x00001110 * !!(cmask[0] & 0x01000000);
+      cmask[1] |= 0x00002220 * !!(cmask[0] & 0x00010000);
+      cmask[1] |= 0x00004440 * !!(cmask[0] & 0x00000100);
+      cmask[1] |= 0x00008880 * !!(cmask[0] & 0x00000001);
+   }
+
+   if (eng3d->oclass >= NV40_3D_CLASS) {
+      SB_MTHD40(so, MRT_BLEND_ENABLE, 2);
+      SB_DATA  (so, blend[1]);
+      SB_DATA  (so, cmask[1]);
+   }
+
+   if (blend[0] || blend[1]) {
+      SB_MTHD30(so, BLEND_FUNC_ENABLE, 3);
+      SB_DATA  (so, blend[0]);
+      SB_DATA  (so, (nvgl_blend_func(cso->rt[0].alpha_src_factor) << 16) |
+                     nvgl_blend_func(cso->rt[0].rgb_src_factor));
+      SB_DATA  (so, (nvgl_blend_func(cso->rt[0].alpha_dst_factor) << 16) |
+                     nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+      if (eng3d->oclass < NV40_3D_CLASS) {
+         /* NV30 has a single blend equation for rgb+alpha */
+         SB_MTHD30(so, BLEND_EQUATION, 1);
+         SB_DATA  (so, nvgl_blend_eqn(cso->rt[0].rgb_func));
+      } else {
+         /* NV40 packs separate alpha (hi16) and rgb (lo16) equations */
+         SB_MTHD40(so, BLEND_EQUATION, 1);
+         SB_DATA  (so, (nvgl_blend_eqn(cso->rt[0].alpha_func) << 16) |
+                        nvgl_blend_eqn(cso->rt[0].rgb_func));
+      }
+   } else {
+      SB_MTHD30(so, BLEND_FUNC_ENABLE, 1);
+      SB_DATA  (so, blend[0]);
+   }
+
+   SB_MTHD30(so, COLOR_MASK, 1);
+   SB_DATA  (so, cmask[0]);
+   return so;
+}
+
+/* Make a blend stateobj current; its methods are replayed at validate time. */
+static void
+nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   ctx->blend = hwcso;
+   ctx->dirty |= NV30_NEW_BLEND;
+}
+
+/* Destroy a blend stateobj; safe because bind only stores the pointer. */
+static void
+nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* Translate a gallium rasterizer CSO into a canned NV30 method stream.
+ * Returns the new stateobj, or NULL on allocation failure.
+ */
+static void *
+nv30_rasterizer_state_create(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *cso)
+{
+   struct nv30_rasterizer_stateobj *so;
+
+   so = CALLOC_STRUCT(nv30_rasterizer_stateobj);
+   if (!so)
+      return NULL;
+   so->pipe = *cso;
+
+   SB_MTHD30(so, SHADE_MODEL, 1);
+   SB_DATA  (so, cso->flatshade ? NV30_3D_SHADE_MODEL_FLAT :
+                                  NV30_3D_SHADE_MODEL_SMOOTH);
+
+   /* POLYGON_MODE_FRONT..CULL_FACE_ENABLE are 6 consecutive methods */
+   SB_MTHD30(so, POLYGON_MODE_FRONT, 6);
+   SB_DATA  (so, nvgl_polygon_mode(cso->fill_front));
+   SB_DATA  (so, nvgl_polygon_mode(cso->fill_back));
+   if (cso->cull_face == PIPE_FACE_FRONT_AND_BACK)
+      SB_DATA  (so, NV30_3D_CULL_FACE_FRONT_AND_BACK);
+   else
+   if (cso->cull_face == PIPE_FACE_FRONT)
+      SB_DATA  (so, NV30_3D_CULL_FACE_FRONT);
+   else
+      SB_DATA  (so, NV30_3D_CULL_FACE_BACK);
+   SB_DATA  (so, cso->front_ccw ? NV30_3D_FRONT_FACE_CCW :
+                                  NV30_3D_FRONT_FACE_CW);
+   SB_DATA  (so, cso->poly_smooth);
+   SB_DATA  (so, cso->cull_face != PIPE_FACE_NONE);
+
+   SB_MTHD30(so, POLYGON_OFFSET_POINT_ENABLE, 3);
+   SB_DATA  (so, cso->offset_point);
+   SB_DATA  (so, cso->offset_line);
+   SB_DATA  (so, cso->offset_tri);
+   if (cso->offset_point || cso->offset_line || cso->offset_tri) {
+      SB_MTHD30(so, POLYGON_OFFSET_FACTOR, 2);
+      SB_DATA  (so, fui(cso->offset_scale));
+      /* hardware units differ from gallium's by a factor of 2 */
+      SB_DATA  (so, fui(cso->offset_units * 2.0));
+   }
+
+   SB_MTHD30(so, LINE_WIDTH, 2);
+   /* line width is in 1/8 pixel fixed point */
+   SB_DATA  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+   SB_DATA  (so, cso->line_smooth);
+   SB_MTHD30(so, LINE_STIPPLE_ENABLE, 2);
+   SB_DATA  (so, cso->line_stipple_enable);
+   SB_DATA  (so, (cso->line_stipple_pattern << 16) |
+                  cso->line_stipple_factor);
+
+   SB_MTHD30(so, VERTEX_TWO_SIDE_ENABLE, 1);
+   SB_DATA  (so, cso->light_twoside);
+   SB_MTHD30(so, POLYGON_STIPPLE_ENABLE, 1);
+   SB_DATA  (so, cso->poly_stipple_enable);
+   SB_MTHD30(so, POINT_SIZE, 1);
+   SB_DATA  (so, fui(cso->point_size));
+   SB_MTHD30(so, FLATSHADE_FIRST, 1);
+   SB_DATA  (so, cso->flatshade_first);
+
+   /* magic bits: 0x01 = clip to depth range, 0x10 = clamp depth */
+   SB_MTHD30(so, DEPTH_CONTROL, 1);
+   SB_DATA  (so, cso->depth_clip ? 0x00000001 : 0x00000010);
+   return so;
+}
+
+/* Make a rasterizer stateobj current; replayed at validate time. */
+static void
+nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   ctx->rast = hwcso;
+   ctx->dirty |= NV30_NEW_RASTERIZER;
+}
+
+/* Destroy a rasterizer stateobj. */
+static void
+nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* Translate a gallium depth/stencil/alpha CSO into a canned NV30 method
+ * stream.  Returns the new stateobj, or NULL on allocation failure.
+ */
+static void *
+nv30_zsa_state_create(struct pipe_context *pipe,
+                      const struct pipe_depth_stencil_alpha_state *cso)
+{
+   struct nv30_zsa_stateobj *so;
+
+   so = CALLOC_STRUCT(nv30_zsa_stateobj);
+   if (!so)
+      return NULL;
+   so->pipe = *cso;
+
+   SB_MTHD30(so, DEPTH_FUNC, 3);
+   SB_DATA  (so, nvgl_comparison_op(cso->depth.func));
+   SB_DATA  (so, cso->depth.writemask);
+   SB_DATA  (so, cso->depth.enabled);
+
+   /* front-face stencil (side 0) */
+   if (cso->stencil[0].enabled) {
+      SB_MTHD30(so, STENCIL_ENABLE(0), 3);
+      SB_DATA  (so, 1);
+      SB_DATA  (so, cso->stencil[0].writemask);
+      SB_DATA  (so, nvgl_comparison_op(cso->stencil[0].func));
+      SB_MTHD30(so, STENCIL_FUNC_MASK(0), 4);
+      SB_DATA  (so, cso->stencil[0].valuemask);
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+   } else {
+      /* disabled: still reset the write mask so Z-only clears work */
+      SB_MTHD30(so, STENCIL_ENABLE(0), 2);
+      SB_DATA  (so, 0);
+      SB_DATA  (so, 0x000000ff);
+   }
+
+   /* back-face stencil (side 1) */
+   if (cso->stencil[1].enabled) {
+      SB_MTHD30(so, STENCIL_ENABLE(1), 3);
+      SB_DATA  (so, 1);
+      SB_DATA  (so, cso->stencil[1].writemask);
+      SB_DATA  (so, nvgl_comparison_op(cso->stencil[1].func));
+      SB_MTHD30(so, STENCIL_FUNC_MASK(1), 4);
+      SB_DATA  (so, cso->stencil[1].valuemask);
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+      SB_DATA  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+   } else {
+      SB_MTHD30(so, STENCIL_ENABLE(1), 1);
+      SB_DATA  (so, 0);
+   }
+
+   SB_MTHD30(so, ALPHA_FUNC_ENABLE, 3);
+   SB_DATA  (so, cso->alpha.enabled ? 1 : 0);
+   SB_DATA  (so, nvgl_comparison_op(cso->alpha.func));
+   SB_DATA  (so, float_to_ubyte(cso->alpha.ref_value));
+
+   return so;
+}
+
+/* Make a depth/stencil/alpha stateobj current; replayed at validate time. */
+static void
+nv30_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   ctx->zsa = hwcso;
+   ctx->dirty |= NV30_NEW_ZSA;
+}
+
+/* Destroy a depth/stencil/alpha stateobj. */
+static void
+nv30_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* Record the blend colour; pushed to hardware in nv30_validate_blend_colour. */
+static void
+nv30_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *bcol)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   ctx->blend_colour = *bcol;
+   ctx->dirty |= NV30_NEW_BLEND_COLOUR;
+}
+
+/* Record stencil reference values; pushed in nv30_validate_stencil_ref. */
+static void
+nv30_set_stencil_ref(struct pipe_context *pipe,
+                     const struct pipe_stencil_ref *sr)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   ctx->stencil_ref = *sr;
+   ctx->dirty |= NV30_NEW_STENCIL_REF;
+}
+
+/* Record user clip planes; uploaded as VP constants in nv30_validate_clip. */
+static void
+nv30_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   memcpy(nv30->clip.ucp, clip->ucp, sizeof(clip->ucp));
+
+   nv30->dirty |= NV30_NEW_CLIP;
+}
+
+/* Record the multisample mask; applied in nv30_validate_multisample. */
+static void
+nv30_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->sample_mask = sample_mask;
+   nv30->dirty |= NV30_NEW_SAMPLE_MASK;
+}
+
+/* Bind a constant buffer for the vertex or fragment stage.  User buffers are
+ * wrapped in a transient pipe_resource; the stage's reference keeps it alive
+ * and our local reference is dropped at the end.
+ */
+static void
+nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+                         struct pipe_constant_buffer *cb)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct pipe_resource *buf = cb ? cb->buffer : NULL;
+   unsigned size;
+
+   if (cb && cb->user_buffer) {
+      buf = nouveau_user_buffer_create(pipe->screen, (void*)cb->user_buffer,
+                                       cb->buffer_size,
+                                       PIPE_BIND_CONSTANT_BUFFER);
+   }
+
+   /* constbuf_nr counts vec4 constants, not bytes */
+   size = 0;
+   if (buf)
+      size = buf->width0 / (4 * sizeof(float));
+
+   if (shader == PIPE_SHADER_VERTEX) {
+      pipe_resource_reference(&nv30->vertprog.constbuf, buf);
+      nv30->vertprog.constbuf_nr = size;
+      nv30->dirty |= NV30_NEW_VERTCONST;
+   } else
+   if (shader == PIPE_SHADER_FRAGMENT) {
+      pipe_resource_reference(&nv30->fragprog.constbuf, buf);
+      nv30->fragprog.constbuf_nr = size;
+      nv30->dirty |= NV30_NEW_FRAGCONST;
+   }
+
+   /* drop the creation reference on the wrapped user buffer */
+   if (cb && cb->user_buffer) {
+      pipe_resource_reference(&buf, NULL);
+   }
+}
+
+/* Record the framebuffer state and release the previous FB buffer refs held
+ * in the bufctx; re-validated in nv30_validate_fb. */
+static void
+nv30_set_framebuffer_state(struct pipe_context *pipe,
+                           const struct pipe_framebuffer_state *fb)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nouveau_bufctx_reset(nv30->bufctx, BUFCTX_FB);
+
+   nv30->framebuffer = *fb;
+   nv30->dirty |= NV30_NEW_FRAMEBUFFER;
+}
+
+/* Record the polygon stipple pattern; pushed in nv30_validate_stipple. */
+static void
+nv30_set_polygon_stipple(struct pipe_context *pipe,
+                         const struct pipe_poly_stipple *stipple)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->stipple = *stipple;
+   nv30->dirty |= NV30_NEW_STIPPLE;
+}
+
+/* Record the scissor rectangle (hardware supports a single scissor, so
+ * start_slot/num_viewports are ignored); applied in nv30_validate_scissor. */
+static void
+nv30_set_scissor_states(struct pipe_context *pipe,
+                        unsigned start_slot,
+                        unsigned num_viewports,
+                        const struct pipe_scissor_state *scissor)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->scissor = *scissor;
+   nv30->dirty |= NV30_NEW_SCISSOR;
+}
+
+/* Record the viewport transform (single viewport only; slot args ignored);
+ * applied in nv30_validate_viewport. */
+static void
+nv30_set_viewport_states(struct pipe_context *pipe,
+                         unsigned start_slot,
+                         unsigned num_viewports,
+                         const struct pipe_viewport_state *vpt)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->viewport = *vpt;
+   nv30->dirty |= NV30_NEW_VIEWPORT;
+}
+
+/* Bind a range of vertex buffers, releasing the bufctx refs of the previous
+ * set; the shared util helper handles reference counting and slot updates. */
+static void
+nv30_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned start_slot, unsigned count,
+                        const struct pipe_vertex_buffer *vb)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXBUF);
+
+   util_set_vertex_buffers_count(nv30->vtxbuf, &nv30->num_vtxbufs,
+                                 vb, start_slot, count);
+
+   nv30->dirty |= NV30_NEW_ARRAYS;
+}
+
+/* Bind (or unbind, when ib is NULL) the index buffer for indexed draws. */
+static void
+nv30_set_index_buffer(struct pipe_context *pipe,
+                      const struct pipe_index_buffer *ib)
+{
+   struct nv30_context *ctx = nv30_context(pipe);
+
+   if (!ib) {
+      pipe_resource_reference(&ctx->idxbuf.buffer, NULL);
+      ctx->idxbuf.user_buffer = NULL;
+      return;
+   }
+
+   pipe_resource_reference(&ctx->idxbuf.buffer, ib->buffer);
+   ctx->idxbuf.index_size = ib->index_size;
+   ctx->idxbuf.offset = ib->offset;
+   ctx->idxbuf.user_buffer = ib->user_buffer;
+}
+
+/* Hook the CSO and parameter-state entry points of this file into the
+ * context's pipe_context vtable.  Called once at context creation. */
+void
+nv30_state_init(struct pipe_context *pipe)
+{
+   pipe->create_blend_state = nv30_blend_state_create;
+   pipe->bind_blend_state = nv30_blend_state_bind;
+   pipe->delete_blend_state = nv30_blend_state_delete;
+
+   pipe->create_rasterizer_state = nv30_rasterizer_state_create;
+   pipe->bind_rasterizer_state = nv30_rasterizer_state_bind;
+   pipe->delete_rasterizer_state = nv30_rasterizer_state_delete;
+
+   pipe->create_depth_stencil_alpha_state = nv30_zsa_state_create;
+   pipe->bind_depth_stencil_alpha_state = nv30_zsa_state_bind;
+   pipe->delete_depth_stencil_alpha_state = nv30_zsa_state_delete;
+
+   pipe->set_blend_color = nv30_set_blend_color;
+   pipe->set_stencil_ref = nv30_set_stencil_ref;
+   pipe->set_clip_state = nv30_set_clip_state;
+   pipe->set_sample_mask = nv30_set_sample_mask;
+   pipe->set_constant_buffer = nv30_set_constant_buffer;
+   pipe->set_framebuffer_state = nv30_set_framebuffer_state;
+   pipe->set_polygon_stipple = nv30_set_polygon_stipple;
+   pipe->set_scissor_states = nv30_set_scissor_states;
+   pipe->set_viewport_states = nv30_set_viewport_states;
+
+   pipe->set_vertex_buffers = nv30_set_vertex_buffers;
+   pipe->set_index_buffer = nv30_set_index_buffer;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.h b/src/gallium/drivers/nouveau/nv30/nv30_state.h
new file mode 100644
index 00000000000..e27e16fae82
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.h
@@ -0,0 +1,144 @@
+#ifndef __NV30_STATE_H__
+#define __NV30_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_dynarray.h"
+
+/* Driver-private query types appended after the gallium PIPE_QUERY_* range. */
+#define NV30_QUERY_ZCULL_0 (PIPE_QUERY_TYPES + 0)
+#define NV30_QUERY_ZCULL_1 (PIPE_QUERY_TYPES + 1)
+#define NV30_QUERY_ZCULL_2 (PIPE_QUERY_TYPES + 2)
+#define NV30_QUERY_ZCULL_3 (PIPE_QUERY_TYPES + 3)
+
+/* State-builder helpers: append a raw word, or an FIFO method header
+ * (size<<18 | subchannel 7 | method) for the NV30/NV40 3D engine, to a
+ * stateobj's data[] array for later replay into the pushbuf. */
+#define SB_DATA(so, u) (so)->data[(so)->size++] = (u)
+#define SB_MTHD30(so, mthd, size) \
+   SB_DATA((so), ((size) << 18) | (7 << 13) | NV30_3D_##mthd)
+#define SB_MTHD40(so, mthd, size) \
+   SB_DATA((so), ((size) << 18) | (7 << 13) | NV40_3D_##mthd)
+
+/* Blend CSO: gallium copy plus pre-built method stream (see SB_* macros). */
+struct nv30_blend_stateobj {
+   struct pipe_blend_state pipe;
+   unsigned data[16];
+   unsigned size;
+};
+
+/* Rasterizer CSO: gallium copy plus pre-built method stream. */
+struct nv30_rasterizer_stateobj {
+   struct pipe_rasterizer_state pipe;
+   unsigned data[32];
+   unsigned size;
+};
+
+/* Depth/stencil/alpha CSO: gallium copy plus pre-built method stream. */
+struct nv30_zsa_stateobj {
+   struct pipe_depth_stencil_alpha_state pipe;
+   unsigned data[32];
+   unsigned size;
+};
+
+/* Sampler CSO: gallium copy plus pre-computed hardware register values. */
+struct nv30_sampler_state {
+   struct pipe_sampler_state pipe;
+   unsigned fmt;
+   unsigned wrap;
+   unsigned en;
+   unsigned filt;
+   unsigned bcol;
+   /* 4.8 */
+   /* LOD clamps in 4.8 fixed point */
+   unsigned min_lod;
+   unsigned max_lod;
+};
+
+/* Sampler view: gallium copy plus pre-computed texture format/swizzle words
+ * and the masks used to merge sampler-state bits at bind time. */
+struct nv30_sampler_view {
+   struct pipe_sampler_view pipe;
+   unsigned fmt;
+   unsigned swz;
+   unsigned filt;
+   unsigned filt_mask;
+   unsigned wrap;
+   unsigned wrap_mask;
+   unsigned npot_size0;
+   unsigned npot_size1;
+   /* 4.8 */
+   /* LOD range in 4.8 fixed point */
+   unsigned base_lod;
+   unsigned high_lod;
+};
+
+/* Patch record: instruction slot 'location' must be fixed up to 'target'. */
+struct nv30_shader_reloc {
+   unsigned location;
+   int target;
+};
+
+/* One 128-bit vertex program instruction. */
+struct nv30_vertprog_exec {
+   uint32_t data[4];
+};
+
+/* Vertex program constant slot; index -1 marks an immediate. */
+struct nv30_vertprog_data {
+   int index; /* immediates == -1 */
+   float value[4];
+};
+
+/* Compiled vertex program: TGSI source/info, translated instructions and
+ * constants with their relocation lists, and the heap allocations holding
+ * them in hardware exec/data space. */
+struct nv30_vertprog {
+   struct pipe_shader_state pipe;
+   struct tgsi_shader_info info;
+
+   /* draw-module fallback shader for the swtnl path */
+   struct draw_vertex_shader *draw;
+   boolean translated;
+   unsigned enabled_ucps;
+   uint16_t texcoord[10];
+
+   /* branch targets needing patching once exec location is known */
+   struct util_dynarray branch_relocs;
+   struct nv30_vertprog_exec *insns;
+   unsigned nr_insns;
+
+   struct util_dynarray const_relocs;
+   struct nv30_vertprog_data *consts;
+   unsigned nr_consts;
+
+   struct nouveau_heap *exec;
+   struct nouveau_heap *data;
+   /* input (ir) and output (or) attribute enable masks */
+   uint32_t ir;
+   uint32_t or;
+   void *nvfx;
+};
+
+/* Fragment program constant: byte offset of the slot within the program. */
+struct nv30_fragprog_data {
+   unsigned offset;
+   unsigned index;
+};
+
+/* Compiled fragment program: TGSI source/info, raw instruction words, the
+ * resource they are uploaded to, and pre-computed hardware control words
+ * merged into context state at validate time. */
+struct nv30_fragprog {
+   struct pipe_shader_state pipe;
+   struct tgsi_shader_info info;
+
+   /* draw-module fallback shader for the swtnl path */
+   struct draw_fragment_shader *draw;
+   boolean translated;
+
+   uint32_t *insn;
+   unsigned insn_len;
+
+   uint16_t texcoord[10];
+   struct nv30_fragprog_data *consts;
+   unsigned nr_consts;
+
+   /* GPU buffer holding the uploaded program */
+   struct pipe_resource *buffer;
+   uint32_t vp_or; /* appended to VP_RESULT_EN */
+   uint32_t fp_control;
+   uint32_t point_sprite_control;
+   uint32_t coord_conventions;
+   uint32_t texcoords;
+   uint32_t rt_enable;
+};
+
+/* Pre-computed VTXFMT hardware word for one vertex element. */
+struct nv30_vertex_element {
+   unsigned state;
+};
+
+/* Vertex-elements CSO: gallium copy, optional translate fallback for formats
+ * the hardware lacks, and per-element state words (flexible array member). */
+struct nv30_vertex_stateobj {
+   struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS];
+   struct translate *translate;
+   bool need_conversion;
+   unsigned num_elements;
+   unsigned vtx_size;
+   unsigned vtx_per_packet_max;
+   struct nv30_vertex_element element[];
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
new file mode 100644
index 00000000000..f22755983ba
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_half.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+/* Emit render target state: RT enables, combined colour/zeta format word,
+ * viewport clip, and the pitch/offset of every bound surface.  Buffer
+ * references are (re)added to BUFCTX_FB. */
+static void
+nv30_validate_fb(struct nv30_context *nv30)
+{
+   struct pipe_screen *pscreen = &nv30->screen->base.base;
+   struct pipe_framebuffer_state *fb = &nv30->framebuffer;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_object *eng3d = nv30->screen->eng3d;
+   uint32_t rt_format;
+   int h = fb->height;
+   int w = fb->width;
+   int x = 0;
+   int y = 0;
+
+   /* one enable bit per bound cbuf; MRT bit when more than one */
+   nv30->state.rt_enable = (NV30_3D_RT_ENABLE_COLOR0 << fb->nr_cbufs) - 1;
+   if (nv30->state.rt_enable > 1)
+      nv30->state.rt_enable |= NV30_3D_RT_ENABLE_MRT;
+
+   /* RT_FORMAT combines colour format, zeta format and layout type; when a
+    * colour or zeta surface is absent a compatible dummy format is chosen
+    * based on the blocksize of the surface that is present */
+   rt_format = 0;
+   if (fb->nr_cbufs > 0) {
+      struct nv30_miptree *mt = nv30_miptree(fb->cbufs[0]->texture);
+      rt_format |= nv30_format(pscreen, fb->cbufs[0]->format)->hw;
+      rt_format |= mt->ms_mode;
+      if (mt->swizzled)
+         rt_format |= NV30_3D_RT_FORMAT_TYPE_SWIZZLED;
+      else
+         rt_format |= NV30_3D_RT_FORMAT_TYPE_LINEAR;
+   } else {
+      if (fb->zsbuf && util_format_get_blocksize(fb->zsbuf->format) > 2)
+         rt_format |= NV30_3D_RT_FORMAT_COLOR_A8R8G8B8;
+      else
+         rt_format |= NV30_3D_RT_FORMAT_COLOR_R5G6B5;
+   }
+
+   if (fb->zsbuf) {
+      rt_format |= nv30_format(pscreen, fb->zsbuf->format)->hw;
+      if (nv30_miptree(fb->zsbuf->texture)->swizzled)
+         rt_format |= NV30_3D_RT_FORMAT_TYPE_SWIZZLED;
+      else
+         rt_format |= NV30_3D_RT_FORMAT_TYPE_LINEAR;
+   } else {
+      if (fb->nr_cbufs && util_format_get_blocksize(fb->cbufs[0]->format) > 2)
+         rt_format |= NV30_3D_RT_FORMAT_ZETA_Z24S8;
+      else
+         rt_format |= NV30_3D_RT_FORMAT_ZETA_Z16;
+   }
+
+   /* hardware rounds down render target offset to 64 bytes, but surfaces
+    * with a size of 2x2 pixel (16bpp) or 1x1 pixel (32bpp) have an
+    * unaligned start aaddress. For these two important square formats
+    * we can hack around this limitation by adjusting the viewport origin
+    */
+   if (nv30->state.rt_enable) {
+      int off = nv30_surface(fb->cbufs[0])->offset & 63;
+      if (off) {
+         x += off / (util_format_get_blocksize(fb->cbufs[0]->format) * 2);
+         w = 16;
+         h = 2;
+      }
+   }
+
+   /* swizzled surfaces encode log2 dimensions in the format word */
+   if (rt_format & NV30_3D_RT_FORMAT_TYPE_SWIZZLED) {
+      rt_format |= util_logbase2(w) << 16;
+      rt_format |= util_logbase2(h) << 24;
+   }
+
+   if (!PUSH_SPACE(push, 64))
+      return;
+   PUSH_RESET(push, BUFCTX_FB);
+
+   BEGIN_NV04(push, SUBC_3D(0x1da4), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(RT_HORIZ), 3);
+   PUSH_DATA (push, w << 16);
+   PUSH_DATA (push, h << 16);
+   PUSH_DATA (push, rt_format);
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_HORIZ), 2);
+   PUSH_DATA (push, w << 16);
+   PUSH_DATA (push, h << 16);
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_TX_ORIGIN), 4);
+   PUSH_DATA (push, (y << 16) | x);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, ((w - 1) << 16) | 0);
+   PUSH_DATA (push, ((h - 1) << 16) | 0);
+
+   if ((nv30->state.rt_enable & NV30_3D_RT_ENABLE_COLOR0) || fb->zsbuf) {
+      struct nv30_surface *rsf = nv30_surface(fb->cbufs[0]);
+      struct nv30_surface *zsf = nv30_surface(fb->zsbuf);
+      struct nouveau_bo *rbo, *zbo;
+
+      /* when only one of colour/zeta is bound, alias the other to it so the
+       * hardware always sees valid pitch/offset pairs */
+      if (!rsf) rsf = zsf;
+      else if (!zsf) zsf = rsf;
+      rbo = nv30_miptree(rsf->base.texture)->base.bo;
+      zbo = nv30_miptree(zsf->base.texture)->base.bo;
+
+      if (eng3d->oclass >= NV40_3D_CLASS) {
+         BEGIN_NV04(push, NV40_3D(ZETA_PITCH), 1);
+         PUSH_DATA (push, zsf->pitch);
+         /* count 3 covers COLOR0_PITCH, COLOR0_OFFSET, ZETA_OFFSET */
+         BEGIN_NV04(push, NV40_3D(COLOR0_PITCH), 3);
+         PUSH_DATA (push, rsf->pitch);
+      } else {
+         /* NV30 packs zeta and colour pitch into one word */
+         BEGIN_NV04(push, NV30_3D(COLOR0_PITCH), 3);
+         PUSH_DATA (push, (zsf->pitch << 16) | rsf->pitch);
+      }
+      PUSH_MTHDl(push, NV30_3D(COLOR0_OFFSET), BUFCTX_FB, rbo, rsf->offset & ~63,
+                       NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+      PUSH_MTHDl(push, NV30_3D(ZETA_OFFSET), BUFCTX_FB, zbo, zsf->offset & ~63,
+                       NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+   }
+
+   if (nv30->state.rt_enable & NV30_3D_RT_ENABLE_COLOR1) {
+      struct nv30_surface *sf = nv30_surface(fb->cbufs[1]);
+      struct nouveau_bo *bo = nv30_miptree(sf->base.texture)->base.bo;
+
+      BEGIN_NV04(push, NV30_3D(COLOR1_OFFSET), 2);
+      PUSH_MTHDl(push, NV30_3D(COLOR1_OFFSET), BUFCTX_FB, bo, sf->offset,
+                       NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+      PUSH_DATA (push, sf->pitch);
+   }
+
+   /* RTs 2 and 3 exist on NV40 only; the enable bits can't be set otherwise */
+   if (nv30->state.rt_enable & NV40_3D_RT_ENABLE_COLOR2) {
+      struct nv30_surface *sf = nv30_surface(fb->cbufs[2]);
+      struct nouveau_bo *bo = nv30_miptree(sf->base.texture)->base.bo;
+
+      BEGIN_NV04(push, NV40_3D(COLOR2_OFFSET), 1);
+      PUSH_MTHDl(push, NV40_3D(COLOR2_OFFSET), BUFCTX_FB, bo, sf->offset,
+                       NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+      BEGIN_NV04(push, NV40_3D(COLOR2_PITCH), 1);
+      PUSH_DATA (push, sf->pitch);
+   }
+
+   if (nv30->state.rt_enable & NV40_3D_RT_ENABLE_COLOR3) {
+      struct nv30_surface *sf = nv30_surface(fb->cbufs[3]);
+      struct nouveau_bo *bo = nv30_miptree(sf->base.texture)->base.bo;
+
+      BEGIN_NV04(push, NV40_3D(COLOR3_OFFSET), 1);
+      PUSH_MTHDl(push, NV40_3D(COLOR3_OFFSET), BUFCTX_FB, bo, sf->offset,
+                       NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+      BEGIN_NV04(push, NV40_3D(COLOR3_PITCH), 1);
+      PUSH_DATA (push, sf->pitch);
+   }
+}
+
+/* Emit the blend constant colour.  Half-float render targets take the colour
+ * as packed fp16 pairs in BLEND_COLOR (0x0310) + 0x037c; everything else
+ * takes a packed ARGB8 word in BLEND_COLOR.
+ *
+ * Bug fix: the float-format case previously ended in 'break', falling
+ * through to the unconditional ARGB8 write below, which clobbered the
+ * fp16 low word just written to BLEND_COLOR and made the float path dead.
+ * Return directly after emitting the fp16 pair instead.
+ */
+static void
+nv30_validate_blend_colour(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   float *rgba = nv30->blend_colour.color;
+
+   if (nv30->framebuffer.nr_cbufs) {
+      switch (nv30->framebuffer.cbufs[0]->format) {
+      case PIPE_FORMAT_R16G16B16A16_FLOAT:
+      case PIPE_FORMAT_R32G32B32A32_FLOAT:
+         BEGIN_NV04(push, NV30_3D(BLEND_COLOR), 1);
+         PUSH_DATA (push, (util_float_to_half(rgba[0]) << 0) |
+                          (util_float_to_half(rgba[1]) << 16));
+         BEGIN_NV04(push, SUBC_3D(0x037c), 1);
+         PUSH_DATA (push, (util_float_to_half(rgba[2]) << 0) |
+                          (util_float_to_half(rgba[3]) << 16));
+         return;
+      default:
+         break;
+      }
+   }
+
+   BEGIN_NV04(push, NV30_3D(BLEND_COLOR), 1);
+   PUSH_DATA (push, (float_to_ubyte(rgba[3]) << 24) |
+                    (float_to_ubyte(rgba[0]) << 16) |
+                    (float_to_ubyte(rgba[1]) << 8) |
+                    (float_to_ubyte(rgba[2]) << 0));
+}
+
+/* Emit front (0) and back (1) stencil reference values. */
+static void
+nv30_validate_stencil_ref(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   BEGIN_NV04(push, NV30_3D(STENCIL_FUNC_REF(0)), 1);
+   PUSH_DATA (push, nv30->stencil_ref.ref_value[0]);
+   BEGIN_NV04(push, NV30_3D(STENCIL_FUNC_REF(1)), 1);
+   PUSH_DATA (push, nv30->stencil_ref.ref_value[1]);
+}
+
+/* Upload the 32x32-bit polygon stipple pattern. */
+static void
+nv30_validate_stipple(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   BEGIN_NV04(push, NV30_3D(POLYGON_STIPPLE_PATTERN(0)), 32);
+   PUSH_DATAp(push, nv30->stipple.stipple, 32);
+}
+
+/* Emit the scissor rectangle, or a huge "disabled" rect when the rasterizer
+ * has scissoring off.  state.scissor_off caches the last-emitted enable so a
+ * rasterizer-only change with an unchanged enable bit can be skipped. */
+static void
+nv30_validate_scissor(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct pipe_scissor_state *s = &nv30->scissor;
+
+   /* early-out: no new scissor rect and emitted enable state still matches */
+   if (!(nv30->dirty & NV30_NEW_SCISSOR) &&
+       nv30->rast->pipe.scissor != nv30->state.scissor_off)
+      return;
+   nv30->state.scissor_off = !nv30->rast->pipe.scissor;
+
+   BEGIN_NV04(push, NV30_3D(SCISSOR_HORIZ), 2);
+   if (nv30->rast->pipe.scissor) {
+      /* hardware takes (size << 16) | start */
+      PUSH_DATA (push, ((s->maxx - s->minx) << 16) | s->minx);
+      PUSH_DATA (push, ((s->maxy - s->miny) << 16) | s->miny);
+   } else {
+      PUSH_DATA (push, 0x10000000);
+      PUSH_DATA (push, 0x10000000);
+   }
+}
+
+/* Emit the viewport translate/scale and derive the depth range from the Z
+ * components of the transform. */
+static void
+nv30_validate_viewport(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct pipe_viewport_state *vp = &nv30->viewport;
+
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_TRANSLATE_X), 8);
+   PUSH_DATAf(push, vp->translate[0]);
+   PUSH_DATAf(push, vp->translate[1]);
+   PUSH_DATAf(push, vp->translate[2]);
+   PUSH_DATAf(push, vp->translate[3]);
+   PUSH_DATAf(push, vp->scale[0]);
+   PUSH_DATAf(push, vp->scale[1]);
+   PUSH_DATAf(push, vp->scale[2]);
+   PUSH_DATAf(push, vp->scale[3]);
+   BEGIN_NV04(push, NV30_3D(DEPTH_RANGE_NEAR), 2);
+   PUSH_DATAf(push, vp->translate[2] - fabsf(vp->scale[2]));
+   PUSH_DATAf(push, vp->translate[2] + fabsf(vp->scale[2]));
+}
+
+/* Upload user clip planes enabled by the rasterizer as vertex program
+ * constants (only when the planes themselves changed) and emit the combined
+ * plane-enable word. */
+static void
+nv30_validate_clip(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   unsigned i;
+   uint32_t clpd_enable = 0;
+
+   for (i = 0; i < 6; i++) {
+      if (nv30->rast->pipe.clip_plane_enable & (1 << i)) {
+         if (nv30->dirty & NV30_NEW_CLIP) {
+            BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
+            PUSH_DATA (push, i);
+            PUSH_DATAp(push, nv30->clip.ucp[i], 4);
+         }
+
+         /* 4 bits per plane in the enable word */
+         clpd_enable |= 1 << (1 + 4*i);
+      }
+   }
+
+   BEGIN_NV04(push, NV30_3D(VP_CLIP_PLANES_ENABLE), 1);
+   PUSH_DATA (push, clpd_enable);
+}
+
+/* Replay the bound blend stateobj's pre-built method stream. */
+static void
+nv30_validate_blend(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   PUSH_SPACE(push, nv30->blend->size);
+   PUSH_DATAp(push, nv30->blend->data, nv30->blend->size);
+}
+
+/* Replay the bound depth/stencil/alpha stateobj's method stream. */
+static void
+nv30_validate_zsa(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   PUSH_SPACE(push, nv30->zsa->size);
+   PUSH_DATAp(push, nv30->zsa->data, nv30->zsa->size);
+}
+
+/* Replay the bound rasterizer stateobj's method stream. */
+static void
+nv30_validate_rasterizer(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   PUSH_SPACE(push, nv30->rast->size);
+   PUSH_DATAp(push, nv30->rast->data, nv30->rast->size);
+}
+
+/* Combine sample mask, alpha-to-one/coverage and MS enable into the single
+ * MULTISAMPLE_CONTROL word (magic bits per hardware layout). */
+static void
+nv30_validate_multisample(struct nv30_context *nv30)
+{
+   struct pipe_rasterizer_state *rasterizer = &nv30->rast->pipe;
+   struct pipe_blend_state *blend = &nv30->blend->pipe;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   uint32_t ctrl = nv30->sample_mask << 16;
+
+   if (blend->alpha_to_one)
+      ctrl |= 0x00000100;
+   if (blend->alpha_to_coverage)
+      ctrl |= 0x00000010;
+   if (rasterizer->multisample)
+      ctrl |= 0x00000001;
+
+   BEGIN_NV04(push, NV30_3D(MULTISAMPLE_CONTROL), 1);
+   PUSH_DATA (push, ctrl);
+}
+
+/* Mask off render targets the fragment program doesn't write, and emit the
+ * FP coordinate conventions combined with the framebuffer height (needed for
+ * origin flipping). */
+static void
+nv30_validate_fragment(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nv30_fragprog *fp = nv30->fragprog.program;
+
+   BEGIN_NV04(push, NV30_3D(RT_ENABLE), 1);
+   PUSH_DATA (push, nv30->state.rt_enable & ~fp->rt_enable);
+   BEGIN_NV04(push, NV30_3D(COORD_CONVENTIONS), 1);
+   PUSH_DATA (push, fp->coord_conventions | nv30->framebuffer.height);
+}
+
+/* Emit POINT_SPRITE control from rasterizer sprite-coord state and the
+ * fragment program's texcoord usage.  Lower-left sprite coords can't be done
+ * by the hardware, so that case flags the rasterizer for the swtnl path. */
+static void
+nv30_validate_point_coord(struct nv30_context *nv30)
+{
+   struct pipe_rasterizer_state *rasterizer = &nv30->rast->pipe;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nv30_fragprog *fp = nv30->fragprog.program;
+   uint32_t hw = 0x00000000;
+
+   if (rasterizer) {
+      hw |= (nv30->rast->pipe.sprite_coord_enable & 0xff) << 8;
+      if (fp)
+         hw |= fp->point_sprite_control;
+
+      if (rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) {
+         /* unsupported origin: fall back via swtnl when sprites are in use */
+         if (hw)
+            nv30->draw_flags |= NV30_NEW_RASTERIZER;
+      } else
+      if (rasterizer->point_quad_rasterization) {
+         hw |= NV30_3D_POINT_SPRITE_ENABLE;
+      }
+   }
+
+   BEGIN_NV04(push, NV30_3D(POINT_SPRITE), 1);
+   PUSH_DATA (push, hw);
+}
+
+/* Validation table entry: run func when any bit of mask is dirty. */
+struct state_validate {
+   void (*func)(struct nv30_context *);
+   uint32_t mask;
+};
+
+/* Validation order for the hardware TnL path; terminated by a NULL entry. */
+static struct state_validate hwtnl_validate_list[] = {
+    { nv30_validate_fb,            NV30_NEW_FRAMEBUFFER },
+    { nv30_validate_blend,         NV30_NEW_BLEND },
+    { nv30_validate_zsa,           NV30_NEW_ZSA },
+    { nv30_validate_rasterizer,    NV30_NEW_RASTERIZER },
+    { nv30_validate_multisample,   NV30_NEW_SAMPLE_MASK | NV30_NEW_BLEND |
+                                   NV30_NEW_RASTERIZER },
+    { nv30_validate_blend_colour,  NV30_NEW_BLEND_COLOUR |
+                                   NV30_NEW_FRAMEBUFFER },
+    { nv30_validate_stencil_ref,   NV30_NEW_STENCIL_REF },
+    { nv30_validate_stipple,       NV30_NEW_STIPPLE },
+    { nv30_validate_scissor,       NV30_NEW_SCISSOR | NV30_NEW_RASTERIZER },
+    { nv30_validate_viewport,      NV30_NEW_VIEWPORT },
+    { nv30_validate_clip,          NV30_NEW_CLIP },
+    { nv30_fragprog_validate,      NV30_NEW_FRAGPROG | NV30_NEW_FRAGCONST },
+    { nv30_vertprog_validate,      NV30_NEW_VERTPROG | NV30_NEW_VERTCONST |
+                                   NV30_NEW_FRAGPROG | NV30_NEW_RASTERIZER },
+    { nv30_validate_fragment,      NV30_NEW_FRAMEBUFFER | NV30_NEW_FRAGPROG },
+    { nv30_validate_point_coord,   NV30_NEW_RASTERIZER | NV30_NEW_FRAGPROG },
+    { nv30_fragtex_validate,       NV30_NEW_FRAGTEX },
+    { nv40_verttex_validate,       NV30_NEW_VERTTEX },
+    { nv30_vbo_validate,           NV30_NEW_VERTEX | NV30_NEW_ARRAYS },
+    {}
+};
+
+/* State bits consumed by the draw-module fallback rather than the hardware
+ * TnL path; when leaving swtnl these must all be revalidated. */
+#define NV30_SWTNL_MASK (NV30_NEW_VIEWPORT | \
+                         NV30_NEW_CLIP | \
+                         NV30_NEW_VERTPROG | \
+                         NV30_NEW_VERTCONST | \
+                         NV30_NEW_VERTTEX | \
+                         NV30_NEW_VERTEX | \
+                         NV30_NEW_ARRAYS)
+
+/* Validation order for the software TnL (draw module) fallback; vertex-side
+ * state is handled by draw, so only fragment/output state is validated. */
+static struct state_validate swtnl_validate_list[] = {
+    { nv30_validate_fb,            NV30_NEW_FRAMEBUFFER },
+    { nv30_validate_blend,         NV30_NEW_BLEND },
+    { nv30_validate_zsa,           NV30_NEW_ZSA },
+    { nv30_validate_rasterizer,    NV30_NEW_RASTERIZER },
+    { nv30_validate_multisample,   NV30_NEW_SAMPLE_MASK | NV30_NEW_BLEND |
+                                   NV30_NEW_RASTERIZER },
+    { nv30_validate_blend_colour,  NV30_NEW_BLEND_COLOUR |
+                                   NV30_NEW_FRAMEBUFFER },
+    { nv30_validate_stencil_ref,   NV30_NEW_STENCIL_REF },
+    { nv30_validate_stipple,       NV30_NEW_STIPPLE },
+    { nv30_validate_scissor,       NV30_NEW_SCISSOR | NV30_NEW_RASTERIZER },
+    { nv30_fragprog_validate,      NV30_NEW_FRAGPROG | NV30_NEW_FRAGCONST },
+    { nv30_validate_fragment,      NV30_NEW_FRAMEBUFFER | NV30_NEW_FRAGPROG },
+    { nv30_fragtex_validate,       NV30_NEW_FRAGTEX },
+    {}
+};
+
+/* Take over the hardware channel from another context: inherit its emitted
+ * state, mark everything dirty, then clear dirty bits for state this context
+ * hasn't been given yet (so validation never dereferences NULL CSOs). */
+static void
+nv30_state_context_switch(struct nv30_context *nv30)
+{
+   struct nv30_context *prev = nv30->screen->cur_ctx;
+
+   if (prev)
+      nv30->state = prev->state;
+   nv30->dirty = NV30_NEW_ALL;
+
+   if (!nv30->vertex)
+      nv30->dirty &= ~(NV30_NEW_VERTEX | NV30_NEW_ARRAYS);
+
+   if (!nv30->vertprog.program)
+      nv30->dirty &= ~NV30_NEW_VERTPROG;
+   if (!nv30->fragprog.program)
+      nv30->dirty &= ~NV30_NEW_FRAGPROG;
+
+   if (!nv30->blend)
+      nv30->dirty &= ~NV30_NEW_BLEND;
+   if (!nv30->rast)
+      nv30->dirty &= ~NV30_NEW_RASTERIZER;
+   if (!nv30->zsa)
+      nv30->dirty &= ~NV30_NEW_ZSA;
+
+   nv30->screen->cur_ctx = nv30;
+   nv30->base.pushbuf->user_priv = &nv30->bufctx;
+}
+
+/* Flush dirty state to the hardware before a draw.  Picks the hwtnl or swtnl
+ * validation list, runs the entries whose mask intersects nv30->dirty,
+ * validates buffer residency, then fences the touched resources.
+ * Returns FALSE when pushbuf validation fails. */
+boolean
+nv30_state_validate(struct nv30_context *nv30, boolean hwtnl)
+{
+   struct nouveau_screen *screen = &nv30->screen->base;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_bufctx *bctx = nv30->bufctx;
+   struct nouveau_bufref *bref;
+   struct state_validate *validate;
+
+   if (nv30->screen->cur_ctx != nv30)
+      nv30_state_context_switch(nv30);
+
+   if (hwtnl) {
+      nv30->draw_dirty |= nv30->dirty;
+      /* if the reasons for the swtnl fallback have all gone away, force a
+       * revalidate of the vertex-side state the fallback was handling */
+      if (nv30->draw_flags) {
+         nv30->draw_flags &= ~nv30->dirty;
+         if (!nv30->draw_flags)
+            nv30->dirty |= NV30_SWTNL_MASK;
+      }
+   }
+
+   if (!nv30->draw_flags)
+      validate = hwtnl_validate_list;
+   else
+      validate = swtnl_validate_list;
+
+   if (nv30->dirty) {
+      while (validate->func) {
+         if (nv30->dirty & validate->mask)
+            validate->func(nv30);
+         validate++;
+      }
+
+      nv30->dirty = 0;
+   }
+
+   nouveau_pushbuf_bufctx(push, bctx);
+   if (nouveau_pushbuf_validate(push)) {
+      nouveau_pushbuf_bufctx(push, NULL);
+      return FALSE;
+   }
+
+   /*XXX*/
+   /* flush vertex and (on nv40) texture caches */
+   BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1);
+   PUSH_DATA (push, 0);
+   if (nv30->screen->eng3d->oclass >= NV40_3D_CLASS) {
+      BEGIN_NV04(push, NV40_3D(TEX_CACHE_CTL), 1);
+      PUSH_DATA (push, 2);
+      BEGIN_NV04(push, NV40_3D(TEX_CACHE_CTL), 1);
+      PUSH_DATA (push, 1);
+      BEGIN_NV04(push, NV30_3D(R1718), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV30_3D(R1718), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV30_3D(R1718), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   /* attach the current fence to every referenced managed resource so CPU
+    * access can synchronise against this batch */
+   LIST_FOR_EACH_ENTRY(bref, &bctx->current, thead) {
+      struct nv04_resource *res = bref->priv;
+      if (res && res->mm) {
+         nouveau_fence_ref(screen->fence.current, &res->fence);
+
+         if (bref->flags & NOUVEAU_BO_RD)
+            res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+         if (bref->flags & NOUVEAU_BO_WR) {
+            nouveau_fence_ref(screen->fence.current, &res->fence_wr);
+            res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+/* Detach the bufctx from the pushbuf after drawing completes. */
+void
+nv30_state_release(struct nv30_context *nv30)
+{
+   nouveau_pushbuf_bufctx(nv30->base.pushbuf, NULL);
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_texture.c b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
new file mode 100644
index 00000000000..d3cffcfb261
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_texture.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+#define NV30_3D_TEX_WRAP_S_MIRROR_REPEAT NV30_3D_TEX_WRAP_S_MIRRORED_REPEAT
+#define NV30_WRAP(n) \
+ case PIPE_TEX_WRAP_##n: ret = NV30_3D_TEX_WRAP_S_##n; break
+#define NV40_WRAP(n) \
+ case PIPE_TEX_WRAP_##n: ret = NV40_3D_TEX_WRAP_S_##n; break
+
+/* Translate a PIPE_TEX_WRAP_* value into the hardware wrap encoding.
+ * The NV30_WRAP/NV40_WRAP macros above expand to the matching
+ * NV{30,40}_3D_TEX_WRAP_S_* value; unknown modes fall back to REPEAT.
+ * The result is shifted back down from the S field so the caller can
+ * re-shift it into the S, T or R position as needed. */
+static INLINE unsigned
+wrap_mode(unsigned pipe)
+{
+   unsigned ret = NV30_3D_TEX_WRAP_S_REPEAT;
+
+   switch (pipe) {
+   NV30_WRAP(REPEAT);
+   NV30_WRAP(MIRROR_REPEAT);
+   NV30_WRAP(CLAMP_TO_EDGE);
+   NV30_WRAP(CLAMP_TO_BORDER);
+   NV30_WRAP(CLAMP);
+   NV40_WRAP(MIRROR_CLAMP_TO_EDGE);
+   NV40_WRAP(MIRROR_CLAMP_TO_BORDER);
+   NV40_WRAP(MIRROR_CLAMP);
+   default:
+      break;
+   }
+
+   /* un-shift so the caller can place the value in any of S/T/R */
+   return ret >> NV30_3D_TEX_WRAP_S__SHIFT;
+}
+
+/* Build the hardware TEX_FILTER word (min + mag fields) from the gallium
+ * sampler state's mag/min/mip filter selections.  Anything that is not
+ * explicitly LINEAR is treated as NEAREST. */
+static INLINE unsigned
+filter_mode(const struct pipe_sampler_state *cso)
+{
+   unsigned filter;
+
+   switch (cso->mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      filter = NV30_3D_TEX_FILTER_MAG_LINEAR;
+      break;
+   default:
+      filter = NV30_3D_TEX_FILTER_MAG_NEAREST;
+      break;
+   }
+
+   /* the MIN field encodes the image filter and mip filter combined */
+   switch (cso->min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      switch (cso->min_mip_filter) {
+      case PIPE_TEX_MIPFILTER_NEAREST:
+         filter |= NV30_3D_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST;
+         break;
+      case PIPE_TEX_MIPFILTER_LINEAR:
+         filter |= NV30_3D_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR;
+         break;
+      default:
+         filter |= NV30_3D_TEX_FILTER_MIN_LINEAR;
+         break;
+      }
+      break;
+   default:
+      switch (cso->min_mip_filter) {
+      case PIPE_TEX_MIPFILTER_NEAREST:
+         filter |= NV30_3D_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST;
+         break;
+      case PIPE_TEX_MIPFILTER_LINEAR:
+         filter |= NV30_3D_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR;
+         break;
+      default:
+         filter |= NV30_3D_TEX_FILTER_MIN_NEAREST;
+         break;
+      }
+      break;
+   }
+
+   return filter;
+}
+
+/* Map the sampler's shadow-compare function onto the RCOMP field of the
+ * TEX_WRAP word.  Returns 0 (compare disabled) when shadow comparison is
+ * not requested or the compare func is unrecognized. */
+static INLINE unsigned
+compare_mode(const struct pipe_sampler_state *cso)
+{
+   if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE)
+      return 0;
+
+   switch (cso->compare_func) {
+   case PIPE_FUNC_NEVER   : return NV30_3D_TEX_WRAP_RCOMP_NEVER;
+   case PIPE_FUNC_GREATER : return NV30_3D_TEX_WRAP_RCOMP_GREATER;
+   case PIPE_FUNC_EQUAL   : return NV30_3D_TEX_WRAP_RCOMP_EQUAL;
+   case PIPE_FUNC_GEQUAL  : return NV30_3D_TEX_WRAP_RCOMP_GEQUAL;
+   case PIPE_FUNC_LESS    : return NV30_3D_TEX_WRAP_RCOMP_LESS;
+   case PIPE_FUNC_NOTEQUAL: return NV30_3D_TEX_WRAP_RCOMP_NOTEQUAL;
+   case PIPE_FUNC_LEQUAL  : return NV30_3D_TEX_WRAP_RCOMP_LEQUAL;
+   case PIPE_FUNC_ALWAYS  : return NV30_3D_TEX_WRAP_RCOMP_ALWAYS;
+   default:
+      return 0;
+   }
+}
+
+/* pipe_context::create_sampler_state implementation: pre-bake the gallium
+ * sampler state into the hardware wrap/filter/enable/border-colour words
+ * stored in nv30_sampler_state.  Returns NULL on allocation failure. */
+static void *
+nv30_sampler_state_create(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *cso)
+{
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
+   struct nv30_sampler_state *so;
+   /* LODs are programmed in 4.8 fixed point, so the max is 15 + 255/256 */
+   const float max_lod = 15.0 + (255.0 / 256.0);
+
+   so = MALLOC_STRUCT(nv30_sampler_state);
+   if (!so)
+      return NULL;
+
+   so->pipe = *cso;
+   so->fmt = 0;
+   so->wrap = (wrap_mode(cso->wrap_s) << NV30_3D_TEX_WRAP_S__SHIFT) |
+              (wrap_mode(cso->wrap_t) << NV30_3D_TEX_WRAP_T__SHIFT) |
+              (wrap_mode(cso->wrap_r) << NV30_3D_TEX_WRAP_R__SHIFT);
+   so->en = 0;
+   so->wrap |= compare_mode(cso);
+   so->filt = filter_mode(cso) | 0x00002000;
+   /* border colour packed as ARGB8888 */
+   so->bcol = (float_to_ubyte(cso->border_color.f[3]) << 24) |
+              (float_to_ubyte(cso->border_color.f[0]) << 16) |
+              (float_to_ubyte(cso->border_color.f[1]) <<  8) |
+              (float_to_ubyte(cso->border_color.f[2]) <<  0);
+
+   if (eng3d->oclass >= NV40_3D_CLASS) {
+      unsigned aniso = cso->max_anisotropy;
+
+      /* unnormalized coordinates are handled via the RECT texture format */
+      if (!cso->normalized_coords)
+         so->fmt |= NV40_3D_TEX_FORMAT_RECT;
+
+      if (aniso > 1) {
+         /* round the requested anisotropy down to the nearest hw level */
+         if      (aniso >= 16) so->en |= NV40_3D_TEX_ENABLE_ANISO_16X;
+         else if (aniso >= 12) so->en |= NV40_3D_TEX_ENABLE_ANISO_12X;
+         else if (aniso >= 10) so->en |= NV40_3D_TEX_ENABLE_ANISO_10X;
+         else if (aniso >=  8) so->en |= NV40_3D_TEX_ENABLE_ANISO_8X;
+         else if (aniso >=  6) so->en |= NV40_3D_TEX_ENABLE_ANISO_6X;
+         else if (aniso >=  4) so->en |= NV40_3D_TEX_ENABLE_ANISO_4X;
+         else                  so->en |= NV40_3D_TEX_ENABLE_ANISO_2X;
+
+         so->wrap |= nv30_context(pipe)->config.aniso;
+      }
+   } else {
+      so->en |= NV30_3D_TEX_ENABLE_ENABLE;
+
+      if      (cso->max_anisotropy >= 8) so->en |= NV30_3D_TEX_ENABLE_ANISO_8X;
+      else if (cso->max_anisotropy >= 4) so->en |= NV30_3D_TEX_ENABLE_ANISO_4X;
+      else if (cso->max_anisotropy >= 2) so->en |= NV30_3D_TEX_ENABLE_ANISO_2X;
+   }
+
+   /* lod bias and min/max lod are 4.8 fixed point */
+   so->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff;
+   so->max_lod = (int)(CLAMP(cso->max_lod, 0.0, max_lod) * 256.0);
+   so->min_lod = (int)(CLAMP(cso->min_lod, 0.0, max_lod) * 256.0);
+   return so;
+}
+
+/* pipe_context::delete_sampler_state: the baked state owns no references,
+ * so it can simply be freed. */
+static void
+nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* Compose one component of the texture swizzle word: the source selector
+ * comes from the view's requested swizzle (swz), while the component
+ * selector comes from the format table — using the requested channel for
+ * R/G/B/A, or the identity channel (cmp) for ZERO/ONE selectors. */
+static INLINE uint32_t
+swizzle(const struct nv30_texfmt *fmt, unsigned cmp, unsigned swz)
+{
+   uint32_t data = fmt->swz[swz].src << 8;
+   if (swz <= PIPE_SWIZZLE_ALPHA)
+      data |= fmt->swz[swz].cmp;
+   else
+      data |= fmt->swz[cmp].cmp;
+   return data;
+}
+
+/* pipe_context::create_sampler_view: bake the view descriptor (format,
+ * dimensions, swizzle, mip range and per-format filter/wrap overrides)
+ * into the hardware words stored in nv30_sampler_view.  Holds a reference
+ * on the underlying resource; returns NULL on allocation failure. */
+static struct pipe_sampler_view *
+nv30_sampler_view_create(struct pipe_context *pipe, struct pipe_resource *pt,
+                         const struct pipe_sampler_view *tmpl)
+{
+   const struct nv30_texfmt *fmt = nv30_texfmt(pipe->screen, tmpl->format);
+   struct nouveau_object *eng3d = nv30_context(pipe)->screen->eng3d;
+   struct nv30_miptree *mt = nv30_miptree(pt);
+   struct nv30_sampler_view *so;
+
+   so = MALLOC_STRUCT(nv30_sampler_view);
+   if (!so)
+      return NULL;
+   so->pipe = *tmpl;
+   so->pipe.reference.count = 1;
+   so->pipe.texture = NULL;
+   so->pipe.context = pipe;
+   pipe_resource_reference(&so->pipe.texture, pt);
+
+   so->fmt = NV30_3D_TEX_FORMAT_NO_BORDER;
+   switch (pt->target) {
+   case PIPE_TEXTURE_1D:
+      so->fmt |= NV30_3D_TEX_FORMAT_DIMS_1D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      so->fmt |= NV30_3D_TEX_FORMAT_CUBIC;
+      /* fallthrough: cube faces are programmed with 2D dimensions */
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+      so->fmt |= NV30_3D_TEX_FORMAT_DIMS_2D;
+      break;
+   case PIPE_TEXTURE_3D:
+      so->fmt |= NV30_3D_TEX_FORMAT_DIMS_3D;
+      break;
+   default:
+      assert(0);
+      so->fmt |= NV30_3D_TEX_FORMAT_DIMS_1D;
+      break;
+   }
+
+   so->filt = fmt->filter;
+   so->wrap = fmt->wrap;
+   so->swz  = fmt->swizzle;
+   so->swz |= swizzle(fmt, 3, tmpl->swizzle_a);
+   so->swz |= swizzle(fmt, 0, tmpl->swizzle_r) << 2;
+   so->swz |= swizzle(fmt, 1, tmpl->swizzle_g) << 4;
+   so->swz |= swizzle(fmt, 2, tmpl->swizzle_b) << 6;
+
+   /* apparently, we need to ignore the t coordinate for 1D textures to
+    * fix piglit tex1d-2dborder
+    */
+   so->wrap_mask = ~0;
+   if (pt->target == PIPE_TEXTURE_1D) {
+      so->wrap_mask &= ~NV30_3D_TEX_WRAP_T__MASK;
+      so->wrap      |=  NV30_3D_TEX_WRAP_T_REPEAT;
+   }
+
+   /* yet more hardware suckage, can't filter 32-bit float formats */
+   switch (tmpl->format) {
+   case PIPE_FORMAT_R32_FLOAT:
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      so->filt_mask = ~(NV30_3D_TEX_FILTER_MIN__MASK |
+                        NV30_3D_TEX_FILTER_MAG__MASK);
+      so->filt     |= NV30_3D_TEX_FILTER_MIN_NEAREST |
+                      NV30_3D_TEX_FILTER_MAG_NEAREST;
+      break;
+   default:
+      so->filt_mask = ~0;
+      break;
+   }
+
+   so->npot_size0 = (pt->width0 << 16) | pt->height0;
+   if (eng3d->oclass >= NV40_3D_CLASS) {
+      so->npot_size1 = (pt->depth0 << 20) | mt->uniform_pitch;
+      if (!mt->swizzled)
+         so->fmt |= NV40_3D_TEX_FORMAT_LINEAR;
+      so->fmt |= 0x00008000;
+      so->fmt |= (pt->last_level + 1) << NV40_3D_TEX_FORMAT_MIPMAP_COUNT__SHIFT;
+   } else {
+      /* nv30 encodes pitch in the swizzle word and log2 sizes in fmt */
+      so->swz |= mt->uniform_pitch << NV30_3D_TEX_SWIZZLE_RECT_PITCH__SHIFT;
+      if (pt->last_level)
+         so->fmt |= NV30_3D_TEX_FORMAT_MIPMAP;
+      so->fmt |= util_logbase2(pt->width0)  << 20;
+      so->fmt |= util_logbase2(pt->height0) << 24;
+      so->fmt |= util_logbase2(pt->depth0)  << 28;
+      so->fmt |= 0x00010000;
+   }
+
+   /* LOD bounds in 4.8 fixed point, clamped to the resource's mip chain */
+   so->base_lod = so->pipe.u.tex.first_level << 8;
+   so->high_lod = MIN2(pt->last_level, so->pipe.u.tex.last_level) << 8;
+   return &so->pipe;
+}
+
+/* pipe_context::sampler_view_destroy: drop the resource reference taken
+ * at view creation, then free the view. */
+static void
+nv30_sampler_view_destroy(struct pipe_context *pipe,
+                          struct pipe_sampler_view *view)
+{
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+/* Hook the sampler-state and sampler-view entry points into the context. */
+void
+nv30_texture_init(struct pipe_context *pipe)
+{
+   pipe->create_sampler_state = nv30_sampler_state_create;
+   pipe->delete_sampler_state = nv30_sampler_state_delete;
+   pipe->create_sampler_view = nv30_sampler_view_create;
+   pipe->sampler_view_destroy = nv30_sampler_view_destroy;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
new file mode 100644
index 00000000000..99bc0994ac2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -0,0 +1,754 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#define XFER_ARGS \
+ struct nv30_context *nv30, enum nv30_transfer_filter filter, \
+ struct nv30_rect *src, struct nv30_rect *dst
+
+#include "util/u_math.h"
+
+#include "nv_object.xml.h"
+#include "nv_m2mf.xml.h"
+#include "nv30/nv01_2d.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_transfer.h"
+
+/* Various helper functions to transfer different types of data in a number
+ * of different ways.
+ */
+
+/* TRUE if the source and destination rectangles differ in size, i.e. the
+ * transfer would need scaling. */
+static INLINE boolean
+nv30_transfer_scaled(struct nv30_rect *src, struct nv30_rect *dst)
+{
+   if (src->x1 - src->x0 != dst->x1 - dst->x0)
+      return TRUE;
+   if (src->y1 - src->y0 != dst->y1 - dst->y0)
+      return TRUE;
+   return FALSE;
+}
+
+/* Check whether the 3D-engine blit path can handle this transfer.
+ * Requires an NV40-class 3D engine, 64-byte aligned destination
+ * offset/pitch, a 2D destination of at least 2x2, and <= 4 bytes/pixel
+ * on both sides (1cpp destinations must additionally be linear). */
+static INLINE boolean
+nv30_transfer_blit(XFER_ARGS)
+{
+   if (nv30->screen->eng3d->oclass < NV40_3D_CLASS)
+      return FALSE;
+   if (dst->offset & 63 || dst->pitch & 63 || dst->d > 1)
+      return FALSE;
+   if (dst->w < 2 || dst->h < 2)
+      return FALSE;
+   if (dst->cpp > 4 || (dst->cpp == 1 && !dst->pitch))
+      return FALSE;
+   if (src->cpp > 4)
+      return FALSE;
+   return TRUE;
+}
+
+/* Return (allocating and uploading on first use) the tiny pass-through
+ * vertex program used by the blit path: it copies a[0] to o[hpos] and
+ * a[8] to o[tex0].  If the exec heap is full, other programs are evicted
+ * to make room.  Returns NULL if space still cannot be found. */
+static INLINE struct nouveau_heap *
+nv30_transfer_rect_vertprog(struct nv30_context *nv30)
+{
+   struct nouveau_heap *heap = nv30->screen->vp_exec_heap;
+   struct nouveau_heap *vp;
+
+   vp = nv30->blit_vp;
+   if (!vp) {
+      if (nouveau_heap_alloc(heap, 2, &nv30->blit_vp, &nv30->blit_vp)) {
+         /* evict resident programs until two slots are free */
+         while (heap->next && heap->size < 2) {
+            struct nouveau_heap **evict = heap->next->priv;
+            nouveau_heap_free(evict);
+         }
+
+         if (nouveau_heap_alloc(heap, 2, &nv30->blit_vp, &nv30->blit_vp))
+            return NULL;
+      }
+
+      vp = nv30->blit_vp;
+      if (vp) {
+         struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+         /* NOTE(review): both uploads target VP_UPLOAD_INST(0) —
+          * presumably the upload pointer auto-increments after each
+          * instruction; confirm against the hardware docs. */
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_FROM_ID), 1);
+         PUSH_DATA (push, vp->start);
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_INST(0)), 4);
+         PUSH_DATA (push, 0x401f9c6c); /* mov o[hpos], a[0]; */
+         PUSH_DATA (push, 0x0040000d);
+         PUSH_DATA (push, 0x8106c083);
+         PUSH_DATA (push, 0x6041ff80);
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_INST(0)), 4);
+         PUSH_DATA (push, 0x401f9c6c); /* mov o[tex0], a[8]; end; */
+         PUSH_DATA (push, 0x0040080d);
+         PUSH_DATA (push, 0x8106c083);
+         PUSH_DATA (push, 0x6041ff9d);
+      }
+   }
+
+   return vp;
+}
+
+
+/* Return (creating on first use) the fragment program used by the blit
+ * path: a single "texr r0, i[tex0], texture[0]" followed by "end".  The
+ * program is written into a pipe buffer and migrated to VRAM.  May return
+ * NULL if buffer creation fails. */
+static INLINE struct nv04_resource *
+nv30_transfer_rect_fragprog(struct nv30_context *nv30)
+{
+   struct nv04_resource *fp = nv04_resource(nv30->blit_fp);
+   struct pipe_context *pipe = &nv30->base.pipe;
+
+   if (!fp) {
+      nv30->blit_fp = pipe_buffer_create(pipe->screen, 0, 0, 12 * 4);
+      if (nv30->blit_fp) {
+         struct pipe_transfer *transfer;
+         u32 *map = pipe_buffer_map(pipe, nv30->blit_fp,
+                                    PIPE_TRANSFER_WRITE, &transfer);
+         if (map) {
+            map[0] = 0x17009e00; /* texr r0, i[tex0], texture[0]; end; */
+            map[1] = 0x1c9dc801;
+            map[2] = 0x0001c800;
+            map[3] = 0x3fe1c800;
+            map[4] = 0x01401e81; /* end; */
+            map[5] = 0x1c9dc800;
+            map[6] = 0x0001c800;
+            map[7] = 0x0001c800;
+            pipe_buffer_unmap(pipe, transfer);
+         }
+
+         fp = nv04_resource(nv30->blit_fp);
+         nouveau_buffer_migrate(&nv30->base, fp, NOUVEAU_BO_VRAM);
+      }
+   }
+
+   return fp;
+}
+
+/* Execute a transfer using the 3D engine: the source becomes texture
+ * unit 0, the destination becomes render target 0, and a single textured
+ * quad is drawn over the destination rectangle.  Supports scaling (with
+ * NEAREST or BILINEAR filtering) and swizzled destinations.  All context
+ * state touched here is flagged dirty so normal rendering revalidates it. */
+static void
+nv30_transfer_rect_blit(XFER_ARGS)
+{
+   struct nv04_resource *fp = nv30_transfer_rect_fragprog(nv30);
+   struct nouveau_heap *vp = nv30_transfer_rect_vertprog(nv30);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_pushbuf_refn refs[] = {
+      { fp->bo, fp->domain | NOUVEAU_BO_RD },
+      { src->bo, src->domain | NOUVEAU_BO_RD },
+      { dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR },
+   };
+   u32 texfmt, texswz;
+   u32 format, stride;
+
+   if (nouveau_pushbuf_space(push, 512, 8, 0) ||
+       nouveau_pushbuf_refn (push, refs, sizeof(refs) / sizeof(refs[0])))
+      return;
+
+   /* various switches depending on cpp of the transfer */
+   switch (dst->cpp) {
+   case 4:
+      format = NV30_3D_RT_FORMAT_COLOR_A8R8G8B8 |
+               NV30_3D_RT_FORMAT_ZETA_Z24S8;
+      texfmt = NV40_3D_TEX_FORMAT_FORMAT_A8R8G8B8;
+      texswz = 0x0000aae4;
+      break;
+   case 2:
+      format = NV30_3D_RT_FORMAT_COLOR_R5G6B5 |
+               NV30_3D_RT_FORMAT_ZETA_Z16;
+      texfmt = NV40_3D_TEX_FORMAT_FORMAT_R5G6B5;
+      texswz = 0x0000a9e4;
+      break;
+   case 1:
+      format = NV30_3D_RT_FORMAT_COLOR_B8 |
+               NV30_3D_RT_FORMAT_ZETA_Z16;
+      texfmt = NV40_3D_TEX_FORMAT_FORMAT_L8;
+      texswz = 0x0000aaff;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   /* render target: pitch of 0 means a swizzled surface (log2 sizes) */
+   if (!dst->pitch) {
+      format |= NV30_3D_RT_FORMAT_TYPE_SWIZZLED;
+      format |= util_logbase2(dst->w) << 16;
+      format |= util_logbase2(dst->h) << 24;
+      stride  = 64;
+   } else {
+      format |= NV30_3D_RT_FORMAT_TYPE_LINEAR;
+      stride  = dst->pitch;
+   }
+
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_HORIZ), 2);
+   PUSH_DATA (push, dst->w << 16);
+   PUSH_DATA (push, dst->h << 16);
+   BEGIN_NV04(push, NV30_3D(RT_HORIZ), 5);
+   PUSH_DATA (push, dst->w << 16);
+   PUSH_DATA (push, dst->h << 16);
+   PUSH_DATA (push, format);
+   PUSH_DATA (push, stride);
+   PUSH_RELOC(push, dst->bo, dst->offset, NOUVEAU_BO_LOW, 0, 0);
+   BEGIN_NV04(push, NV30_3D(RT_ENABLE), 1);
+   PUSH_DATA (push, NV30_3D_RT_ENABLE_COLOR0);
+
+   nv30->dirty |= NV30_NEW_FRAMEBUFFER;
+
+   /* viewport state: identity transform */
+   BEGIN_NV04(push, NV30_3D(VIEWPORT_TRANSLATE_X), 8);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   PUSH_DATAf(push, 1.0);
+   BEGIN_NV04(push, NV30_3D(DEPTH_RANGE_NEAR), 2);
+   PUSH_DATAf(push, 0.0);
+   PUSH_DATAf(push, 1.0);
+
+   nv30->dirty |= NV30_NEW_VIEWPORT;
+
+   /* blend state: everything off, all channels writable */
+   BEGIN_NV04(push, NV30_3D(COLOR_LOGIC_OP_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(DITHER_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(BLEND_FUNC_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(COLOR_MASK), 1);
+   PUSH_DATA (push, 0x01010101);
+
+   nv30->dirty |= NV30_NEW_BLEND;
+
+   /* depth-stencil-alpha state: all tests and writes disabled */
+   BEGIN_NV04(push, NV30_3D(DEPTH_WRITE_ENABLE), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(STENCIL_ENABLE(0)), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(STENCIL_ENABLE(1)), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(ALPHA_FUNC_ENABLE), 1);
+   PUSH_DATA (push, 0);
+
+   nv30->dirty |= NV30_NEW_ZSA;
+
+   /* rasterizer state: flat-filled, unculled, unstippled quads */
+   BEGIN_NV04(push, NV30_3D(SHADE_MODEL), 1);
+   PUSH_DATA (push, NV30_3D_SHADE_MODEL_FLAT);
+   BEGIN_NV04(push, NV30_3D(CULL_FACE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(POLYGON_MODE_FRONT), 2);
+   PUSH_DATA (push, NV30_3D_POLYGON_MODE_FRONT_FILL);
+   PUSH_DATA (push, NV30_3D_POLYGON_MODE_BACK_FILL);
+   BEGIN_NV04(push, NV30_3D(POLYGON_OFFSET_FILL_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV30_3D(POLYGON_STIPPLE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+
+   nv30->state.scissor_off = 0;
+   nv30->dirty |= NV30_NEW_RASTERIZER;
+
+   /* vertex program */
+   BEGIN_NV04(push, NV30_3D(VP_START_FROM_ID), 1);
+   PUSH_DATA (push, vp->start);
+   BEGIN_NV04(push, NV40_3D(VP_ATTRIB_EN), 2);
+   PUSH_DATA (push, 0x00000101); /* attrib: 0, 8 */
+   PUSH_DATA (push, 0x00004000); /* result: hpos, tex0 */
+   BEGIN_NV04(push, NV30_3D(ENGINE), 1);
+   PUSH_DATA (push, 0x00000103);
+   BEGIN_NV04(push, NV30_3D(VP_CLIP_PLANES_ENABLE), 1);
+   PUSH_DATA (push, 0x00000000);
+
+   nv30->dirty |= NV30_NEW_VERTPROG;
+   nv30->dirty |= NV30_NEW_CLIP;
+
+   /* fragment program */
+   BEGIN_NV04(push, NV30_3D(FP_ACTIVE_PROGRAM), 1);
+   PUSH_RELOC(push, fp->bo, fp->offset, fp->domain |
+                    NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+                    NV30_3D_FP_ACTIVE_PROGRAM_DMA0,
+                    NV30_3D_FP_ACTIVE_PROGRAM_DMA1);
+   BEGIN_NV04(push, NV30_3D(FP_CONTROL), 1);
+   PUSH_DATA (push, 0x02000000);
+
+   nv30->state.fragprog = NULL;
+   nv30->dirty |= NV30_NEW_FRAGPROG;
+
+   /* texture: bind the source as an unmipped RECT texture on unit 0 */
+   texfmt |= 1 << NV40_3D_TEX_FORMAT_MIPMAP_COUNT__SHIFT;
+   texfmt |= NV30_3D_TEX_FORMAT_NO_BORDER;
+   texfmt |= NV40_3D_TEX_FORMAT_RECT;
+   texfmt |= 0x00008000;
+   if (src->d < 2)
+      texfmt |= NV30_3D_TEX_FORMAT_DIMS_2D;
+   else
+      texfmt |= NV30_3D_TEX_FORMAT_DIMS_3D;
+   if (src->pitch)
+      texfmt |= NV40_3D_TEX_FORMAT_LINEAR;
+
+   BEGIN_NV04(push, NV30_3D(TEX_OFFSET(0)), 8);
+   PUSH_RELOC(push, src->bo, src->offset, NOUVEAU_BO_LOW, 0, 0);
+   PUSH_RELOC(push, src->bo, texfmt, NOUVEAU_BO_OR,
+                    NV30_3D_TEX_FORMAT_DMA0, NV30_3D_TEX_FORMAT_DMA1);
+   PUSH_DATA (push, NV30_3D_TEX_WRAP_S_CLAMP_TO_EDGE |
+                    NV30_3D_TEX_WRAP_T_CLAMP_TO_EDGE |
+                    NV30_3D_TEX_WRAP_R_CLAMP_TO_EDGE);
+   PUSH_DATA (push, NV40_3D_TEX_ENABLE_ENABLE);
+   PUSH_DATA (push, texswz);
+   switch (filter) {
+   case BILINEAR:
+      PUSH_DATA (push, NV30_3D_TEX_FILTER_MIN_LINEAR |
+                       NV30_3D_TEX_FILTER_MAG_LINEAR | 0x00002000);
+      break;
+   default:
+      PUSH_DATA (push, NV30_3D_TEX_FILTER_MIN_NEAREST |
+                       NV30_3D_TEX_FILTER_MAG_NEAREST | 0x00002000);
+      break;
+   }
+   PUSH_DATA (push, (src->w << 16) | src->h);
+   PUSH_DATA (push, 0x00000000);
+   BEGIN_NV04(push, NV40_3D(TEX_SIZE1(0)), 1);
+   PUSH_DATA (push, 0x00100000 | src->pitch);
+   BEGIN_NV04(push, SUBC_3D(0x0b40), 1);
+   PUSH_DATA (push, src->d < 2 ? 0x00000001 : 0x00000000);
+   BEGIN_NV04(push, NV40_3D(TEX_CACHE_CTL), 1);
+   PUSH_DATA (push, 1);
+
+   nv30->fragprog.dirty_samplers |= 1;
+   nv30->dirty |= NV30_NEW_FRAGTEX;
+
+   /* blit!  one quad, texcoords in attrib 8, positions in attrib 0 */
+   BEGIN_NV04(push, NV30_3D(SCISSOR_HORIZ), 2);
+   PUSH_DATA (push, (dst->x1 - dst->x0) << 16 | dst->x0);
+   PUSH_DATA (push, (dst->y1 - dst->y0) << 16 | dst->y0);
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_QUADS);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_3F(8)), 3);
+   PUSH_DATAf(push, src->x0);
+   PUSH_DATAf(push, src->y0);
+   PUSH_DATAf(push, src->z);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_2I(0)), 1);
+   PUSH_DATA (push, (dst->y0 << 16) | dst->x0);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_3F(8)), 3);
+   PUSH_DATAf(push, src->x1);
+   PUSH_DATAf(push, src->y0);
+   PUSH_DATAf(push, src->z);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_2I(0)), 1);
+   PUSH_DATA (push, (dst->y0 << 16) | dst->x1);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_3F(8)), 3);
+   PUSH_DATAf(push, src->x1);
+   PUSH_DATAf(push, src->y1);
+   PUSH_DATAf(push, src->z);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_2I(0)), 1);
+   PUSH_DATA (push, (dst->y1 << 16) | dst->x1);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_3F(8)), 3);
+   PUSH_DATAf(push, src->x0);
+   PUSH_DATAf(push, src->y1);
+   PUSH_DATAf(push, src->z);
+   BEGIN_NV04(push, NV30_3D(VTX_ATTR_2I(0)), 1);
+   PUSH_DATA (push, (dst->y1 << 16) | dst->x0);
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+}
+
+/* Check whether the scaled-image-from-memory (SIFM) 2D engine path can
+ * handle this transfer: linear source no larger than 1024x1024, 2D only,
+ * 64-byte aligned destination; swizzled destinations are limited to
+ * 2048x2048, linear destinations must be in VRAM with 64-byte pitch. */
+static boolean
+nv30_transfer_sifm(XFER_ARGS)
+{
+   if (!src->pitch || (src->w | src->h) > 1024 || src->w < 2 || src->h < 2)
+      return FALSE;
+
+   if (src->d > 1 || dst->d > 1)
+      return FALSE;
+
+   if (dst->offset & 63)
+      return FALSE;
+
+   if (!dst->pitch) {
+      if ((dst->w | dst->h) > 2048 || dst->w < 2 || dst->h < 2)
+         return FALSE;
+   } else {
+      if (dst->domain != NOUVEAU_BO_VRAM)
+         return FALSE;
+      if (dst->pitch & 63)
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Execute a transfer with the NV03 scaled-image-from-memory (SIFM) 2D
+ * object, writing through either the linear (SF2D) or swizzled (SSWZ)
+ * surface object depending on the destination layout.  Scale factors are
+ * programmed in 12.20 fixed point; source coords use 12.4. */
+static void
+nv30_transfer_rect_sifm(XFER_ARGS)
+
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_pushbuf_refn refs[] = {
+      { src->bo, src->domain | NOUVEAU_BO_RD },
+      { dst->bo, dst->domain | NOUVEAU_BO_WR },
+   };
+   struct nv04_fifo *fifo = push->channel->data;
+   unsigned si_fmt, si_arg;
+   unsigned ss_fmt;
+
+   /* destination surface format */
+   switch (dst->cpp) {
+   case 4: ss_fmt = NV04_SURFACE_SWZ_FORMAT_COLOR_A8R8G8B8; break;
+   case 2: ss_fmt = NV04_SURFACE_SWZ_FORMAT_COLOR_R5G6B5; break;
+   default:
+      ss_fmt = NV04_SURFACE_SWZ_FORMAT_COLOR_Y8;
+      break;
+   }
+
+   /* source image format */
+   switch (src->cpp) {
+   case 4: si_fmt = NV03_SIFM_COLOR_FORMAT_A8R8G8B8; break;
+   case 2: si_fmt = NV03_SIFM_COLOR_FORMAT_R5G6B5; break;
+   default:
+      si_fmt = NV03_SIFM_COLOR_FORMAT_AY8;
+      break;
+   }
+
+   if (filter == NEAREST) {
+      si_arg  = NV03_SIFM_FORMAT_ORIGIN_CENTER;
+      si_arg |= NV03_SIFM_FORMAT_FILTER_POINT_SAMPLE;
+   } else {
+      si_arg  = NV03_SIFM_FORMAT_ORIGIN_CORNER;
+      si_arg |= NV03_SIFM_FORMAT_FILTER_BILINEAR;
+   }
+
+   if (nouveau_pushbuf_space(push, 32, 6, 0) ||
+       nouveau_pushbuf_refn (push, refs, 2))
+      return;
+
+   if (dst->pitch) {
+      /* linear destination: go through the SF2D surface object */
+      BEGIN_NV04(push, NV04_SF2D(DMA_IMAGE_SOURCE), 2);
+      PUSH_RELOC(push, dst->bo, 0, NOUVEAU_BO_OR, fifo->vram, fifo->gart);
+      PUSH_RELOC(push, dst->bo, 0, NOUVEAU_BO_OR, fifo->vram, fifo->gart);
+      BEGIN_NV04(push, NV04_SF2D(FORMAT), 4);
+      PUSH_DATA (push, ss_fmt);
+      PUSH_DATA (push, dst->pitch << 16 | dst->pitch);
+      PUSH_RELOC(push, dst->bo, dst->offset, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_RELOC(push, dst->bo, dst->offset, NOUVEAU_BO_LOW, 0, 0);
+      BEGIN_NV04(push, NV05_SIFM(SURFACE), 1);
+      PUSH_DATA (push, nv30->screen->surf2d->handle);
+   } else {
+      /* swizzled destination: go through the SSWZ surface object */
+      BEGIN_NV04(push, NV04_SSWZ(DMA_IMAGE), 1);
+      PUSH_RELOC(push, dst->bo, 0, NOUVEAU_BO_OR, fifo->vram, fifo->gart);
+      BEGIN_NV04(push, NV04_SSWZ(FORMAT), 2);
+      PUSH_DATA (push, ss_fmt | (util_logbase2(dst->w) << 16) |
+                                (util_logbase2(dst->h) << 24));
+      PUSH_RELOC(push, dst->bo, dst->offset, NOUVEAU_BO_LOW, 0, 0);
+      BEGIN_NV04(push, NV05_SIFM(SURFACE), 1);
+      PUSH_DATA (push, nv30->screen->swzsurf->handle);
+   }
+
+   BEGIN_NV04(push, NV03_SIFM(DMA_IMAGE), 1);
+   PUSH_RELOC(push, src->bo, 0, NOUVEAU_BO_OR, fifo->vram, fifo->gart);
+   BEGIN_NV04(push, NV03_SIFM(COLOR_FORMAT), 8);
+   PUSH_DATA (push, si_fmt);
+   PUSH_DATA (push, NV03_SIFM_OPERATION_SRCCOPY);
+   PUSH_DATA (push, (           dst->y0  << 16) |            dst->x0);
+   PUSH_DATA (push, ((dst->y1 - dst->y0) << 16) | (dst->x1 - dst->x0));
+   PUSH_DATA (push, (           dst->y0  << 16) |            dst->x0);
+   PUSH_DATA (push, ((dst->y1 - dst->y0) << 16) | (dst->x1 - dst->x0));
+   PUSH_DATA (push, ((src->x1 - src->x0) << 20) / (dst->x1 - dst->x0));
+   PUSH_DATA (push, ((src->y1 - src->y0) << 20) / (dst->y1 - dst->y0));
+   BEGIN_NV04(push, NV03_SIFM(SIZE), 4);
+   PUSH_DATA (push, align(src->h, 2) << 16 | align(src->w, 2));
+   PUSH_DATA (push, src->pitch | si_arg);
+   PUSH_RELOC(push, src->bo, src->offset, NOUVEAU_BO_LOW, 0, 0);
+   PUSH_DATA (push, (src->y0 << 20) | src->x0 << 4);
+}
+
+/* The NOP+OFFSET_OUT stuff after each M2MF transfer *is* actually required
+ * to prevent some odd things from happening, easily reproducible by
+ * attempting to do conditional rendering that has a M2MF transfer done
+ * some time before it. 0x1e98 will fail with a DMA_W_PROTECTION (assuming
+ * that name is still accurate on nv4x) error.
+ */
+
+/* The M2MF copy engine handles any unscaled transfer between two linear
+ * (pitched) surfaces. */
+static boolean
+nv30_transfer_m2mf(XFER_ARGS)
+{
+   if (!src->pitch || !dst->pitch)
+      return FALSE;
+   if (nv30_transfer_scaled(src, dst))
+      return FALSE;
+   return TRUE;
+}
+
+/* Execute a transfer with the M2MF copy engine, splitting it into chunks
+ * of at most 2047 lines (the engine's per-submit line limit).  See the
+ * comment above nv30_transfer_m2mf for why the trailing NOP+OFFSET_OUT
+ * writes are required. */
+static void
+nv30_transfer_rect_m2mf(XFER_ARGS)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_pushbuf_refn refs[] = {
+      { src->bo, src->domain | NOUVEAU_BO_RD },
+      { dst->bo, dst->domain | NOUVEAU_BO_WR },
+   };
+   struct nv04_fifo *fifo = push->channel->data;
+   unsigned src_offset = src->offset;
+   unsigned dst_offset = dst->offset;
+   unsigned w = dst->x1 - dst->x0;
+   unsigned h = dst->y1 - dst->y0;
+
+   /* step to the first pixel of each rectangle */
+   src_offset += (src->y0 * src->pitch) + (src->x0 * src->cpp);
+   dst_offset += (dst->y0 * dst->pitch) + (dst->x0 * dst->cpp);
+
+   BEGIN_NV04(push, NV03_M2MF(DMA_BUFFER_IN), 2);
+   PUSH_DATA (push, (src->domain == NOUVEAU_BO_VRAM) ? fifo->vram : fifo->gart);
+   PUSH_DATA (push, (dst->domain == NOUVEAU_BO_VRAM) ? fifo->vram : fifo->gart);
+
+   while (h) {
+      unsigned lines = (h > 2047) ? 2047 : h;
+
+      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+          nouveau_pushbuf_refn (push, refs, 2))
+         return;
+
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_IN), 8);
+      PUSH_RELOC(push, src->bo, src_offset, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_RELOC(push, dst->bo, dst_offset, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_DATA (push, src->pitch);
+      PUSH_DATA (push, dst->pitch);
+      PUSH_DATA (push, w * src->cpp);
+      PUSH_DATA (push, lines);
+      PUSH_DATA (push, NV03_M2MF_FORMAT_INPUT_INC_1 |
+                       NV03_M2MF_FORMAT_OUTPUT_INC_1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV04_GRAPH(M2MF, NOP), 1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_OUT), 1);
+      PUSH_DATA (push, 0x00000000);
+
+      h -= lines;
+      src_offset += src->pitch * lines;
+      dst_offset += dst->pitch * lines;
+   }
+}
+
+/* CPU fallback: handles any layout, but cannot scale. */
+static boolean
+nv30_transfer_cpu(XFER_ARGS)
+{
+   if (nv30_transfer_scaled(src, dst))
+      return FALSE;
+   return TRUE;
+}
+
+/* Address of pixel (x,y) in a linear (pitched) surface; z is unused. */
+static char *
+linear_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
+{
+   return base + (y * rect->pitch) + (x * rect->cpp);
+}
+
+/* Spread the bits of v so each occupies every other bit position (bit i
+ * moves to bit 2i), then shift by s.  Interleaving the spread x (s=0) and
+ * y (s=1) yields the Morton/Z-order index used by swizzled surfaces. */
+static INLINE unsigned
+swizzle2d(unsigned v, unsigned s)
+{
+   v = (v | (v << 8)) & 0x00ff00ff;
+   v = (v | (v << 4)) & 0x0f0f0f0f;
+   v = (v | (v << 2)) & 0x33333333;
+   v = (v | (v << 1)) & 0x55555555;
+   return v << s;
+}
+
+/* Address of pixel (x,y) in a 2D swizzled surface.  Non-square surfaces
+ * are treated as a row of square tiles of side 2^k (k = log2 of the
+ * smaller dimension); within a tile the offset is the Morton interleave
+ * of the local coordinates.  z is unused. */
+static char *
+swizzle2d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
+{
+   unsigned k = util_logbase2(MIN2(rect->w, rect->h));
+   unsigned km = (1 << k) - 1;
+   unsigned nx = rect->w >> k;
+   unsigned tx = x >> k;
+   unsigned ty = y >> k;
+   unsigned m;
+
+   m  = swizzle2d(x & km, 0);
+   m |= swizzle2d(y & km, 1);
+   m += ((ty * nx) + tx) << k << k;
+
+   return base + (m * rect->cpp);
+}
+
+/* Address of texel (x,y,z) in a 3D swizzled surface: interleave bits of
+ * x, y and z low-to-high, dropping an axis from the rotation once its
+ * extent is exhausted (so non-cubic sizes still pack densely). */
+static char *
+swizzle3d_ptr(struct nv30_rect *rect, char *base, int x, int y, int z)
+{
+   unsigned w = rect->w >> 1;
+   unsigned h = rect->h >> 1;
+   unsigned d = rect->d >> 1;
+   unsigned i = 0, o;
+   unsigned v = 0;
+
+   do {
+      o = i;
+      if (w) {
+         v |= (x & 1) << i++;
+         x >>= 1;
+         w >>= 1;
+      }
+      if (h) {
+         v |= (y & 1) << i++;
+         y >>= 1;
+         h >>= 1;
+      }
+      if (d) {
+         v |= (z & 1) << i++;
+         z >>= 1;
+         d >>= 1;
+      }
+   } while(o != i); /* stop when no axis contributed a bit this round */
+
+   return base + (v * rect->cpp);
+}
+
+/* Pixel-address calculator: maps (rect, base, x, y, z) to a pointer. */
+typedef char *(*get_ptr_t)(struct nv30_rect *, char *, int, int, int);
+
+/* Pick the address calculator matching the rect's memory layout:
+ * linear when pitched, otherwise 2D or 3D swizzled. */
+static INLINE get_ptr_t
+get_ptr(struct nv30_rect *rect)
+{
+   if (rect->pitch)
+      return linear_ptr;
+
+   if (rect->d <= 1)
+      return swizzle2d_ptr;
+
+   return swizzle3d_ptr;
+}
+
+/* CPU fallback: map both BOs and copy pixel-by-pixel, using the layout-
+ * appropriate address calculators.  Handles swizzled layouts the copy
+ * engines can't; no scaling (guaranteed by nv30_transfer_cpu).
+ * NOTE(review): nouveau_bo_map return values are not checked — a failed
+ * map would fault below. */
+static void
+nv30_transfer_rect_cpu(XFER_ARGS)
+{
+   get_ptr_t sp = get_ptr(src);
+   get_ptr_t dp = get_ptr(dst);
+   char *srcmap, *dstmap;
+   int x, y;
+
+   nouveau_bo_map(src->bo, NOUVEAU_BO_RD, nv30->base.client);
+   nouveau_bo_map(dst->bo, NOUVEAU_BO_WR, nv30->base.client);
+   srcmap = src->bo->map + src->offset;
+   dstmap = dst->bo->map + dst->offset;
+
+   for (y = 0; y < (dst->y1 - dst->y0); y++) {
+      for (x = 0; x < (dst->x1 - dst->x0); x++) {
+         memcpy(dp(dst, dstmap, dst->x0 + x, dst->y0 + y, dst->z),
+                sp(src, srcmap, src->x0 + x, src->y0 + y, src->z), dst->cpp);
+      }
+   }
+}
+
+/* Copy (and possibly scale) a rectangle between two surfaces, trying each
+ * transfer method in order of preference — m2mf, sifm, 3D blit, then the
+ * CPU fallback — and executing the first whose "possible" check passes.
+ * The CPU path accepts everything unscaled, so the assert is only
+ * reachable for a scaled transfer no engine path can do. */
+void
+nv30_transfer_rect(struct nv30_context *nv30, enum nv30_transfer_filter filter,
+                   struct nv30_rect *src, struct nv30_rect *dst)
+{
+   static const struct {
+      char *name;
+      boolean (*possible)(XFER_ARGS);
+      void (*execute)(XFER_ARGS);
+   } *method, methods[] = {
+      { "m2mf", nv30_transfer_m2mf, nv30_transfer_rect_m2mf },
+      { "sifm", nv30_transfer_sifm, nv30_transfer_rect_sifm },
+      { "blit", nv30_transfer_blit, nv30_transfer_rect_blit },
+      { "rect", nv30_transfer_cpu, nv30_transfer_rect_cpu },
+      {}
+   };
+
+   method = methods - 1;
+   while ((++method)->possible) {
+      if (method->possible(nv30, filter, src, dst)) {
+         method->execute(nv30, filter, src, dst);
+         return;
+      }
+   }
+
+   assert(0);
+}
+
+/* Stub: uploading CPU data to a BO is not implemented yet; prints a
+ * warning and does nothing. */
+void
+nv30_transfer_push_data(struct nouveau_context *nv,
+                        struct nouveau_bo *bo, unsigned offset, unsigned domain,
+                        unsigned size, void *data)
+{
+   /* use ifc, or scratch + copy_data? */
+   fprintf(stderr, "nv30: push_data not implemented\n");
+}
+
+/* Copy a linear range of bytes between two BOs with the M2MF engine.
+ * The bulk is moved as 4096-byte "lines" (up to 2047 per submit, the
+ * engine's limit); the sub-page remainder goes as one final line.  The
+ * NOP+OFFSET_OUT writes after each submit are the workaround described
+ * above nv30_transfer_m2mf. */
+void
+nv30_transfer_copy_data(struct nouveau_context *nv,
+                        struct nouveau_bo *dst, unsigned d_off, unsigned d_dom,
+                        struct nouveau_bo *src, unsigned s_off, unsigned s_dom,
+                        unsigned size)
+{
+   struct nv04_fifo *fifo = nv->screen->channel->data;
+   struct nouveau_pushbuf_refn refs[] = {
+      { src, s_dom | NOUVEAU_BO_RD },
+      { dst, d_dom | NOUVEAU_BO_WR },
+   };
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   unsigned pages, lines;
+
+   /* split into whole 4KiB pages plus a remainder */
+   pages = size >> 12;
+   size -= (pages << 12);
+
+   BEGIN_NV04(push, NV03_M2MF(DMA_BUFFER_IN), 2);
+   PUSH_DATA (push, (s_dom == NOUVEAU_BO_VRAM) ? fifo->vram : fifo->gart);
+   PUSH_DATA (push, (d_dom == NOUVEAU_BO_VRAM) ? fifo->vram : fifo->gart);
+
+   while (pages) {
+      lines  = (pages > 2047) ? 2047 : pages;
+      pages -= lines;
+
+      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+          nouveau_pushbuf_refn (push, refs, 2))
+         return;
+
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_IN), 8);
+      PUSH_RELOC(push, src, s_off, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_RELOC(push, dst, d_off, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_DATA (push, 4096); /* pitch in */
+      PUSH_DATA (push, 4096); /* pitch out */
+      PUSH_DATA (push, 4096); /* line length */
+      PUSH_DATA (push, lines);
+      PUSH_DATA (push, NV03_M2MF_FORMAT_INPUT_INC_1 |
+                       NV03_M2MF_FORMAT_OUTPUT_INC_1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV04_GRAPH(M2MF, NOP), 1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_OUT), 1);
+      PUSH_DATA (push, 0x00000000);
+
+      s_off += (lines << 12);
+      d_off += (lines << 12);
+   }
+
+   if (size) {
+      /* trailing partial page as a single line */
+      if (nouveau_pushbuf_space(push, 13, 2, 0) ||
+          nouveau_pushbuf_refn (push, refs, 2))
+         return;
+
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_IN), 8);
+      PUSH_RELOC(push, src, s_off, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_RELOC(push, dst, d_off, NOUVEAU_BO_LOW, 0, 0);
+      PUSH_DATA (push, size);
+      PUSH_DATA (push, size);
+      PUSH_DATA (push, size);
+      PUSH_DATA (push, 1);
+      PUSH_DATA (push, NV03_M2MF_FORMAT_INPUT_INC_1 |
+                       NV03_M2MF_FORMAT_OUTPUT_INC_1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV04_GRAPH(M2MF, NOP), 1);
+      PUSH_DATA (push, 0x00000000);
+      BEGIN_NV04(push, NV03_M2MF(OFFSET_OUT), 1);
+      PUSH_DATA (push, 0x00000000);
+   }
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.h b/src/gallium/drivers/nouveau/nv30/nv30_transfer.h
new file mode 100644
index 00000000000..3fa6cd0c029
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.h
@@ -0,0 +1,40 @@
+#ifndef __NV30_TRANSFER_H__
+#define __NV30_TRANSFER_H__
+
+/* Description of one side (source or destination) of a transfer: a buffer
+ * object plus the addressing/layout information needed to access a
+ * rectangular region of it.
+ */
+struct nv30_rect {
+   struct nouveau_bo *bo;
+   unsigned offset;  /* byte offset of the image within bo */
+   unsigned domain;  /* memory domain, e.g. NOUVEAU_BO_VRAM / NOUVEAU_BO_GART */
+   unsigned pitch;   /* bytes per row */
+   unsigned cpp;     /* bytes per pixel */
+   unsigned w;       /* image width, in pixels */
+   unsigned h;       /* image height, in pixels */
+   unsigned d;       /* image depth */
+   unsigned z;       /* selected layer/slice */
+   unsigned x0;      /* transfer region bounds, in pixels */
+   unsigned x1;
+   unsigned y0;
+   unsigned y1;
+};
+
+/* Filter used by nv30_transfer_rect() when source and destination
+ * rectangles differ in size.
+ */
+enum nv30_transfer_filter {
+   NEAREST = 0,
+   BILINEAR
+};
+
+/* Copy (and possibly scale/convert) a rectangle from src to dst. */
+void
+nv30_transfer_rect(struct nv30_context *, enum nv30_transfer_filter filter,
+                   struct nv30_rect *, struct nv30_rect *);
+
+/* Upload size bytes of CPU data into a buffer object at offset. */
+void
+nv30_transfer_push_data(struct nouveau_context *,
+                        struct nouveau_bo *, unsigned offset, unsigned domain,
+                        unsigned size, void *data);
+
+/* GPU-side copy of size bytes between two buffer objects. */
+void
+nv30_transfer_copy_data(struct nouveau_context *,
+                        struct nouveau_bo *, unsigned dstoff, unsigned dstdom,
+                        struct nouveau_bo *, unsigned srcoff, unsigned srcdom,
+                        unsigned size);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
new file mode 100644
index 00000000000..d9b3c3ed04a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "translate/translate.h"
+
+#include "nouveau_fence.h"
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_format.h"
+
+/* Emit the value of a constant (stride == 0) vertex attribute directly into
+ * the command stream through the VTX_ATTR_nF methods.  The single element is
+ * read back from the vertex buffer on the CPU and unpacked to floats.
+ */
+static void
+nv30_emit_vtxattr(struct nv30_context *nv30, struct pipe_vertex_buffer *vb,
+                  struct pipe_vertex_element *ve, unsigned attr)
+{
+   const unsigned nc = util_format_get_nr_components(ve->src_format);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nv04_resource *res = nv04_resource(vb->buffer);
+   const struct util_format_description *desc =
+      util_format_description(ve->src_format);
+   const void *data;
+   float v[4];
+
+   data = nouveau_resource_map_offset(&nv30->base, res, vb->buffer_offset +
+                                      ve->src_offset, NOUVEAU_BO_RD);
+
+   /* unpack exactly one element (1x1 block) into v[] */
+   desc->unpack_rgba_float(v, 0, data, 0, 1, 1);
+
+   /* choose the VTX_ATTR method matching the component count */
+   switch (nc) {
+   case 4:
+      BEGIN_NV04(push, NV30_3D(VTX_ATTR_4F(attr)), 4);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      PUSH_DATAf(push, v[2]);
+      PUSH_DATAf(push, v[3]);
+      break;
+   case 3:
+      BEGIN_NV04(push, NV30_3D(VTX_ATTR_3F(attr)), 3);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      PUSH_DATAf(push, v[2]);
+      break;
+   case 2:
+      BEGIN_NV04(push, NV30_3D(VTX_ATTR_2F(attr)), 2);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      break;
+   case 1:
+      BEGIN_NV04(push, NV30_3D(VTX_ATTR_1F(attr)), 1);
+      PUSH_DATAf(push, v[0]);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+/* Compute the byte range [*base, *base + *size) of vertex buffer vbi that
+ * the current draw (vbo_min_index..vbo_max_index, inclusive) can touch.
+ * Only meaningful when the index bounds are known (vbo_max_index != ~0).
+ */
+static INLINE void
+nv30_vbuf_range(struct nv30_context *nv30, int vbi,
+                uint32_t *base, uint32_t *size)
+{
+   assert(nv30->vbo_max_index != ~0);
+   *base = nv30->vbo_min_index * nv30->vtxbuf[vbi].stride;
+   *size = (nv30->vbo_max_index -
+            nv30->vbo_min_index + 1) * nv30->vtxbuf[vbi].stride;
+}
+
+/* Scan the bound vertex buffers before validation and decide how each will
+ * be sourced for this draw: pushed inline through the FIFO (vbo_fifo),
+ * uploaded from user memory into temporary GPU storage (vbo_user bitmask),
+ * or migrated to GART so the hardware can fetch it directly.
+ */
+static void
+nv30_prevalidate_vbufs(struct nv30_context *nv30)
+{
+   struct pipe_vertex_buffer *vb;
+   struct nv04_resource *buf;
+   int i;
+   uint32_t base, size;
+
+   nv30->vbo_fifo = nv30->vbo_user = 0;
+
+   for (i = 0; i < nv30->num_vtxbufs; i++) {
+      vb = &nv30->vtxbuf[i];
+      if (!vb->stride || !vb->buffer) /* NOTE: user_buffer not implemented */
+         continue;
+      buf = nv04_resource(vb->buffer);
+
+      /* NOTE: user buffers with temporary storage count as mapped by GPU */
+      if (!nouveau_resource_mapped_by_gpu(vb->buffer)) {
+         if (nv30->vbo_push_hint) {
+            /* few vertices expected: push everything inline instead */
+            nv30->vbo_fifo = ~0;
+            continue;
+         } else {
+            if (buf->status & NOUVEAU_BUFFER_STATUS_USER_MEMORY) {
+               nv30->vbo_user |= 1 << i;
+               /* NOTE(review): comparing stride against buffer_offset looks
+                * odd — presumably guards the upload range math; confirm.
+                */
+               assert(vb->stride > vb->buffer_offset);
+               nv30_vbuf_range(nv30, i, &base, &size);
+               nouveau_user_buffer_upload(&nv30->base, buf, base, size);
+            } else {
+               nouveau_buffer_migrate(&nv30->base, buf, NOUVEAU_BO_GART);
+            }
+            nv30->base.vbo_dirty = TRUE;
+         }
+      }
+   }
+}
+
+/* Re-upload the ranges of user vertex buffers referenced by the current
+ * draw and re-point the hardware VTXBUF methods at the temporary storage.
+ * Constant (stride == 0) attributes are emitted inline instead.  Each
+ * buffer is uploaded at most once even if several elements reference it.
+ */
+static void
+nv30_update_user_vbufs(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   uint32_t base, offset, size;
+   int i;
+   uint32_t written = 0;
+
+   for (i = 0; i < nv30->vertex->num_elements; i++) {
+      struct pipe_vertex_element *ve = &nv30->vertex->pipe[i];
+      const int b = ve->vertex_buffer_index;
+      struct pipe_vertex_buffer *vb = &nv30->vtxbuf[b];
+      struct nv04_resource *buf = nv04_resource(vb->buffer);
+
+      if (!(nv30->vbo_user & (1 << b)))
+         continue;
+
+      if (!vb->stride) {
+         nv30_emit_vtxattr(nv30, vb, ve, i);
+         continue;
+      }
+      nv30_vbuf_range(nv30, b, &base, &size);
+
+      /* upload each user buffer only once per draw */
+      if (!(written & (1 << b))) {
+         written |= 1 << b;
+         nouveau_user_buffer_upload(&nv30->base, buf, base, size);
+      }
+
+      offset = vb->buffer_offset + ve->src_offset;
+
+      BEGIN_NV04(push, NV30_3D(VTXBUF(i)), 1);
+      PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP, buf, offset,
+                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD,
+                       0, NV30_3D_VTXBUF_DMA1);
+   }
+   nv30->base.vbo_dirty = TRUE;
+}
+
+/* Drop the temporary GPU storage backing user vertex buffers and clear the
+ * corresponding bufctx references after a draw has completed emission.
+ */
+static INLINE void
+nv30_release_user_vbufs(struct nv30_context *nv30)
+{
+   uint32_t vbo_user = nv30->vbo_user;
+
+   /* iterate set bits of the user-buffer mask */
+   while (vbo_user) {
+      int i = ffs(vbo_user) - 1;
+      vbo_user &= ~(1 << i);
+
+      nouveau_buffer_release_gpu_storage(nv04_resource(nv30->vtxbuf[i].buffer));
+   }
+
+   nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXTMP);
+}
+
+/* Validate vertex state: emit a VTXFMT word for every element (also
+ * clearing any leftover slots from the previously bound state), then
+ * program the VTXBUF source addresses.  Does nothing when no vertex
+ * elements are bound or the draw is routed through the draw module.
+ */
+void
+nv30_vbo_validate(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nv30_vertex_stateobj *vertex = nv30->vertex;
+   struct pipe_vertex_element *ve;
+   struct pipe_vertex_buffer *vb;
+   unsigned i, redefine;
+
+   nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VTXBUF);
+   if (!nv30->vertex || nv30->draw_flags)
+      return;
+
+   if (unlikely(vertex->need_conversion)) {
+      /* unsupported formats: everything goes through the push path */
+      nv30->vbo_fifo = ~0;
+      nv30->vbo_user = 0;
+   } else {
+      nv30_prevalidate_vbufs(nv30);
+   }
+
+   if (!PUSH_SPACE(push, 128))
+      return;
+
+   /* rewrite as many VTXFMT slots as either the new or old state used */
+   redefine = MAX2(vertex->num_elements, nv30->state.num_vtxelts);
+   BEGIN_NV04(push, NV30_3D(VTXFMT(0)), redefine);
+
+   for (i = 0; i < vertex->num_elements; i++) {
+      ve = &vertex->pipe[i];
+      vb = &nv30->vtxbuf[ve->vertex_buffer_index];
+
+      if (likely(vb->stride) || nv30->vbo_fifo)
+         PUSH_DATA (push, (vb->stride << 8) | vertex->element[i].state);
+      else
+         PUSH_DATA (push, NV30_3D_VTXFMT_TYPE_V32_FLOAT);
+   }
+
+   /* reset slots used by the previous state but not by this one */
+   for (; i < nv30->state.num_vtxelts; i++) {
+      PUSH_DATA (push, NV30_3D_VTXFMT_TYPE_V32_FLOAT);
+   }
+
+   for (i = 0; i < vertex->num_elements; i++) {
+      struct nv04_resource *res;
+      unsigned offset;
+      boolean user;
+
+      ve = &vertex->pipe[i];
+      vb = &nv30->vtxbuf[ve->vertex_buffer_index];
+      user = (nv30->vbo_user & (1 << ve->vertex_buffer_index));
+
+      res = nv04_resource(vb->buffer);
+
+      /* pushed or constant attributes have no buffer address to program */
+      if (nv30->vbo_fifo || unlikely(vb->stride == 0)) {
+         if (!nv30->vbo_fifo)
+            nv30_emit_vtxattr(nv30, vb, ve, i);
+         continue;
+      }
+
+      offset = ve->src_offset + vb->buffer_offset;
+
+      BEGIN_NV04(push, NV30_3D(VTXBUF(i)), 1);
+      PUSH_RESRC(push, NV30_3D(VTXBUF(i)), user ? BUFCTX_VTXTMP : BUFCTX_VTXBUF,
+                       res, offset, NOUVEAU_BO_LOW | NOUVEAU_BO_RD,
+                       0, NV30_3D_VTXBUF_DMA1);
+   }
+
+   nv30->state.num_vtxelts = vertex->num_elements;
+}
+
+/* CSO create hook for vertex-elements state.  Builds the hardware VTXFMT
+ * word for each element and, in parallel, a translate key so formats the
+ * hardware can't fetch natively (need_conversion) can be converted to
+ * float at draw time by the translate module.
+ */
+static void *
+nv30_vertex_state_create(struct pipe_context *pipe, unsigned num_elements,
+                         const struct pipe_vertex_element *elements)
+{
+   struct nv30_vertex_stateobj *so;
+   struct translate_key transkey;
+   unsigned i;
+
+   assert(num_elements);
+
+   so = MALLOC(sizeof(*so) + sizeof(*so->element) * num_elements);
+   if (!so)
+      return NULL;
+   memcpy(so->pipe, elements, sizeof(*elements) * num_elements);
+   so->num_elements = num_elements;
+   so->need_conversion = FALSE;
+
+   transkey.nr_elements = 0;
+   transkey.output_stride = 0;
+
+   for (i = 0; i < num_elements; i++) {
+      const struct pipe_vertex_element *ve = &elements[i];
+      const unsigned vbi = ve->vertex_buffer_index;
+      enum pipe_format fmt = ve->src_format;
+
+      so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw;
+      if (!so->element[i].state) {
+         /* no native fetch support: substitute a float format with the
+          * same component count and convert at draw time
+          */
+         switch (util_format_get_nr_components(fmt)) {
+         case 1: fmt = PIPE_FORMAT_R32_FLOAT; break;
+         case 2: fmt = PIPE_FORMAT_R32G32_FLOAT; break;
+         case 3: fmt = PIPE_FORMAT_R32G32B32_FLOAT; break;
+         case 4: fmt = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+         default:
+            assert(0);
+            FREE(so);
+            return NULL;
+         }
+         so->element[i].state = nv30_vtxfmt(pipe->screen, fmt)->hw;
+         so->need_conversion = TRUE;
+      }
+
+      /* NOTE(review): always-true — a translate element is built for every
+       * input, native or not
+       */
+      if (1) {
+         unsigned j = transkey.nr_elements++;
+
+         transkey.element[j].type = TRANSLATE_ELEMENT_NORMAL;
+         transkey.element[j].input_format = ve->src_format;
+         transkey.element[j].input_buffer = vbi;
+         transkey.element[j].input_offset = ve->src_offset;
+         transkey.element[j].instance_divisor = ve->instance_divisor;
+
+         transkey.element[j].output_format = fmt;
+         transkey.element[j].output_offset = transkey.output_stride;
+         /* keep each output element dword-aligned */
+         transkey.output_stride += (util_format_get_stride(fmt, 1) + 3) & ~3;
+      }
+   }
+
+   so->translate = translate_create(&transkey);
+   so->vtx_size = transkey.output_stride / 4;
+   so->vtx_per_packet_max = NV04_PFIFO_MAX_PACKET_LEN / MAX2(so->vtx_size, 1);
+   return so;
+}
+
+/* CSO delete hook: release the translate fallback (if one was created)
+ * and free the state object itself.
+ */
+static void
+nv30_vertex_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_vertex_stateobj *cso = hwcso;
+   struct translate *xlat = cso->translate;
+
+   if (xlat)
+      xlat->release(xlat);
+   FREE(cso);
+}
+
+/* CSO bind hook: make the given vertex-elements state current and flag
+ * vertex state for revalidation on the next draw.
+ */
+static void
+nv30_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->dirty |= NV30_NEW_VERTEX;
+   nv30->vertex = hwcso;
+}
+
+/* Emit a non-indexed draw.  Vertices are referenced with VB_VERTEX_BATCH
+ * dwords, each encoding ((count - 1) << 24) | start for up to 256
+ * consecutive vertices; at most 2047 such batches go in one method packet.
+ */
+static void
+nv30_draw_arrays(struct nv30_context *nv30,
+                 unsigned mode, unsigned start, unsigned count,
+                 unsigned instance_count)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   unsigned prim;
+
+   prim = nv30_prim_gl(mode);
+
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, prim);
+   while (count) {
+      const unsigned mpush = 2047 * 256;
+      unsigned npush  = (count > mpush) ? mpush : count;
+      unsigned wpush  = ((npush + 255) & ~255) >> 8;
+
+      count -= npush;
+
+      BEGIN_NI04(push, NV30_3D(VB_VERTEX_BATCH), wpush);
+      while (npush >= 256) {
+         /* full batch: 256 vertices starting at 'start' */
+         PUSH_DATA (push, 0xff000000 | start);
+         start += 256;
+         npush -= 256;
+      }
+
+      if (npush)
+         PUSH_DATA (push, ((npush - 1) << 24) | start);
+   }
+   BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+   PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+}
+
+/* Emit 8-bit indices inline into the pushbuf.  When the count is odd, one
+ * leading index goes out via VB_ELEMENT_U32; the remainder is packed two
+ * indices per dword through VB_ELEMENT_U16.
+ *
+ * Fix: dropped the stray blank line before the closing brace so the
+ * function matches its u16/u32_short siblings.
+ */
+static void
+nv30_draw_elements_inline_u08(struct nouveau_pushbuf *push, const uint8_t *map,
+                              unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 1) {
+      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *map++);
+   }
+
+   count >>= 1;
+   while (count) {
+      /* number of packed dwords in this packet (two indices each) */
+      unsigned npush = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+      count -= npush;
+
+      BEGIN_NI04(push, NV30_3D(VB_ELEMENT_U16), npush);
+      while (npush--) {
+         PUSH_DATA (push, (map[1] << 16) | map[0]);
+         map += 2;
+      }
+   }
+}
+
+/* Emit 16-bit indices inline into the pushbuf.  When the count is odd, one
+ * leading index goes out via VB_ELEMENT_U32; the remainder is packed two
+ * indices per dword through VB_ELEMENT_U16.
+ */
+static void
+nv30_draw_elements_inline_u16(struct nouveau_pushbuf *push, const uint16_t *map,
+                              unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 1) {
+      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *map++);
+   }
+
+   count >>= 1;
+   while (count) {
+      /* number of packed dwords in this packet (two indices each) */
+      unsigned npush = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+      count -= npush;
+
+      BEGIN_NI04(push, NV30_3D(VB_ELEMENT_U16), npush);
+      while (npush--) {
+         PUSH_DATA (push, (map[1] << 16) | map[0]);
+         map += 2;
+      }
+   }
+}
+
+/* Emit 32-bit indices inline, one index per dword, in maximally-sized
+ * VB_ELEMENT_U32 packets.
+ */
+static void
+nv30_draw_elements_inline_u32(struct nouveau_pushbuf *push, const uint32_t *map,
+                              unsigned start, unsigned count)
+{
+   const uint32_t *elts = map + start;
+   unsigned remain = count;
+
+   while (remain) {
+      const unsigned nr = MIN2(remain, NV04_PFIFO_MAX_PACKET_LEN);
+
+      BEGIN_NI04(push, NV30_3D(VB_ELEMENT_U32), nr);
+      PUSH_DATAp(push, elts, nr);
+
+      elts   += nr;
+      remain -= nr;
+   }
+}
+
+/* Emit 32-bit indices whose values are known to fit in 16 bits ("shorten"
+ * path): when the count is odd, one leading index goes out via
+ * VB_ELEMENT_U32, then pairs of indices are packed into single dwords
+ * through VB_ELEMENT_U16.
+ *
+ * Fix: removed a stray double semicolon after the MIN2() initializer.
+ */
+static void
+nv30_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
+                                    const uint32_t *map,
+                                    unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 1) {
+      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *map++);
+   }
+
+   count >>= 1;
+   while (count) {
+      /* number of packed dwords in this packet (two indices each) */
+      unsigned npush = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+      count -= npush;
+
+      BEGIN_NI04(push, NV30_3D(VB_ELEMENT_U16), npush);
+      while (npush--) {
+         PUSH_DATA (push, (map[1] << 16) | map[0]);
+         map += 2;
+      }
+   }
+}
+
+/* Emit an indexed draw.  On NV40-class hardware with a GPU-resident index
+ * buffer and >8-bit indices, the IDXBUF methods are programmed and indices
+ * are referenced in batches of up to 256 via VB_INDEX_BATCH.  Otherwise the
+ * index data is mapped on the CPU and copied inline into the pushbuf,
+ * optionally shortened to 16-bit packing when all values fit.
+ */
+static void
+nv30_draw_elements(struct nv30_context *nv30, boolean shorten,
+                   unsigned mode, unsigned start, unsigned count,
+                   unsigned instance_count, int32_t index_bias)
+{
+   const unsigned index_size = nv30->idxbuf.index_size;
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_object *eng3d = nv30->screen->eng3d;
+   unsigned prim = nv30_prim_gl(mode);
+
+#if 0 /*XXX*/
+   if (index_bias != nv30->state.index_bias) {
+      BEGIN_NV04(push, NV30_3D(VB_ELEMENT_BASE), 1);
+      PUSH_DATA (push, index_bias);
+      nv30->state.index_bias = index_bias;
+   }
+#endif
+
+   if (eng3d->oclass == NV40_3D_CLASS && index_size > 1 &&
+       nv30->idxbuf.buffer) {
+      /* hardware index buffer path (NV40+ only, 16/32-bit indices) */
+      struct nv04_resource *res = nv04_resource(nv30->idxbuf.buffer);
+      unsigned offset = nv30->idxbuf.offset;
+
+      assert(nouveau_resource_mapped_by_gpu(&res->base));
+
+      BEGIN_NV04(push, NV30_3D(IDXBUF_OFFSET), 2);
+      PUSH_RESRC(push, NV30_3D(IDXBUF_OFFSET), BUFCTX_IDXBUF, res, offset,
+                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+      PUSH_MTHD (push, NV30_3D(IDXBUF_FORMAT), BUFCTX_IDXBUF, res->bo,
+                       (index_size == 2) ? 0x00000010 : 0x00000000,
+                       res->domain | NOUVEAU_BO_RD,
+                       0, NV30_3D_IDXBUF_FORMAT_DMA1);
+      BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+      PUSH_DATA (push, prim);
+      while (count) {
+         const unsigned mpush = 2047 * 256;
+         unsigned npush  = (count > mpush) ? mpush : count;
+         unsigned wpush  = ((npush + 255) & ~255) >> 8;
+
+         count -= npush;
+
+         BEGIN_NI04(push, NV30_3D(VB_INDEX_BATCH), wpush);
+         while (npush >= 256) {
+            /* full batch: 256 indices starting at 'start' */
+            PUSH_DATA (push, 0xff000000 | start);
+            start += 256;
+            npush -= 256;
+         }
+
+         if (npush)
+            PUSH_DATA (push, ((npush - 1) << 24) | start);
+      }
+      BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+      PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+      PUSH_RESET(push, BUFCTX_IDXBUF);
+   } else {
+      /* inline path: map the index data and copy it into the pushbuf */
+      const void *data;
+      if (nv30->idxbuf.buffer)
+         data = nouveau_resource_map_offset(&nv30->base,
+                                            nv04_resource(nv30->idxbuf.buffer),
+                                            nv30->idxbuf.offset, NOUVEAU_BO_RD);
+      else
+         data = nv30->idxbuf.user_buffer;
+      if (!data)
+         return;
+
+      BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+      PUSH_DATA (push, prim);
+      switch (index_size) {
+      case 1:
+         nv30_draw_elements_inline_u08(push, data, start, count);
+         break;
+      case 2:
+         nv30_draw_elements_inline_u16(push, data, start, count);
+         break;
+      case 4:
+         if (shorten)
+            nv30_draw_elements_inline_u32_short(push, data, start, count);
+         else
+            nv30_draw_elements_inline_u32(push, data, start, count);
+         break;
+      default:
+         assert(0);
+         return;
+      }
+      BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
+      PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
+   }
+}
+
+/* pipe_context::draw_vbo entry point.  Chooses between the draw-module
+ * fallback, the inline push path and the normal hardware path, handles
+ * primitive restart state, then dispatches to the arrays/elements
+ * emitters and releases temporary user-buffer storage.
+ */
+static void
+nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+
+   /* For picking only a few vertices from a large user buffer, push is better,
+    * if index count is larger and we expect repeated vertices, suggest upload.
+    */
+   nv30->vbo_push_hint = /* the 64 is heuristic */
+      !(info->indexed &&
+        ((info->max_index - info->min_index + 64) < info->count));
+
+   nv30->vbo_min_index = info->min_index;
+   nv30->vbo_max_index = info->max_index;
+
+   /* switching between push and upload modes requires revalidation */
+   if (nv30->vbo_push_hint != !!nv30->vbo_fifo)
+      nv30->dirty |= NV30_NEW_ARRAYS;
+
+   push->user_priv = &nv30->bufctx;
+   if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS)))
+      nv30_update_user_vbufs(nv30);
+
+   nv30_state_validate(nv30, TRUE);
+   if (nv30->draw_flags) {
+      /* software (draw module) fallback */
+      nv30_render_vbo(pipe, info);
+      return;
+   } else
+   if (nv30->vbo_fifo) {
+      /* inline push path */
+      nv30_push_vbo(nv30, info);
+      return;
+   }
+
+   if (nv30->base.vbo_dirty) {
+      BEGIN_NV04(push, NV30_3D(VTX_CACHE_INVALIDATE_1710), 1);
+      PUSH_DATA (push, 0);
+      nv30->base.vbo_dirty = FALSE;
+   }
+
+   if (!info->indexed) {
+      nv30_draw_arrays(nv30,
+                       info->mode, info->start, info->count,
+                       info->instance_count);
+   } else {
+      /* 16-bit index packing is possible only if restart index fits too */
+      boolean shorten = info->max_index <= 65535;
+
+      if (info->primitive_restart != nv30->state.prim_restart) {
+         if (info->primitive_restart) {
+            BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 2);
+            PUSH_DATA (push, 1);
+            PUSH_DATA (push, info->restart_index);
+
+            if (info->restart_index > 65535)
+               shorten = FALSE;
+         } else {
+            BEGIN_NV04(push, NV40_3D(PRIM_RESTART_ENABLE), 1);
+            PUSH_DATA (push, 0);
+         }
+         nv30->state.prim_restart = info->primitive_restart;
+      } else
+      if (info->primitive_restart) {
+         BEGIN_NV04(push, NV40_3D(PRIM_RESTART_INDEX), 1);
+         PUSH_DATA (push, info->restart_index);
+
+         if (info->restart_index > 65535)
+            shorten = FALSE;
+      }
+
+      nv30_draw_elements(nv30, shorten,
+                         info->mode, info->start, info->count,
+                         info->instance_count, info->index_bias);
+   }
+
+   nv30_state_release(nv30);
+   nv30_release_user_vbufs(nv30);
+}
+
+/* Wire up the vertex-buffer related pipe_context entry points. */
+void
+nv30_vbo_init(struct pipe_context *pipe)
+{
+   pipe->draw_vbo = nv30_draw_vbo;
+   pipe->create_vertex_elements_state = nv30_vertex_state_create;
+   pipe->bind_vertex_elements_state = nv30_vertex_state_bind;
+   pipe->delete_vertex_elements_state = nv30_vertex_state_delete;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
new file mode 100644
index 00000000000..7bf05ddfe0a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_dynarray.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "nv_object.xml.h"
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_state.h"
+
+/* Throw away the translated form of a vertex program: microcode, exec/data
+ * heap allocations, constant table and relocation arrays.  The TGSI source
+ * is kept, so the program can be retranslated on next validation.
+ */
+static void
+nv30_vertprog_destroy(struct nv30_vertprog *vp)
+{
+   util_dynarray_fini(&vp->branch_relocs);
+   nouveau_heap_free(&vp->exec);
+   FREE(vp->insns);
+   vp->insns = NULL;
+   vp->nr_insns = 0;
+
+   util_dynarray_fini(&vp->const_relocs);
+   nouveau_heap_free(&vp->data);
+   FREE(vp->consts);
+   vp->consts = NULL;
+   vp->nr_consts = 0;
+
+   vp->translated = FALSE;
+}
+
+/* Validate the current vertex program: (re)translate from TGSI if needed,
+ * allocate space in the exec/data heaps (evicting other programs when
+ * full), patch branch and constant relocations into the microcode for the
+ * chosen locations, upload constants and code, and finally point the
+ * hardware at the program.  On failure the draw-module fallback is
+ * requested via draw_flags.
+ */
+void
+nv30_vertprog_validate(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   struct nouveau_object *eng3d = nv30->screen->eng3d;
+   struct nv30_vertprog *vp = nv30->vertprog.program;
+   struct nv30_fragprog *fp = nv30->fragprog.program;
+   boolean upload_code = FALSE;
+   boolean upload_data = FALSE;
+   unsigned i;
+
+   if (nv30->dirty & NV30_NEW_FRAGPROG) {
+      /* texcoord usage affects vp translation; retranslate on mismatch */
+      if (memcmp(vp->texcoord, fp->texcoord, sizeof(vp->texcoord))) {
+         if (vp->translated)
+            nv30_vertprog_destroy(vp);
+         memcpy(vp->texcoord, fp->texcoord, sizeof(vp->texcoord));
+      }
+   }
+
+   /* enabled user clip planes are baked into the program */
+   if (nv30->rast && nv30->rast->pipe.clip_plane_enable != vp->enabled_ucps) {
+      vp->enabled_ucps = nv30->rast->pipe.clip_plane_enable;
+      if (vp->translated)
+         nv30_vertprog_destroy(vp);
+   }
+
+   if (!vp->translated) {
+      vp->translated = _nvfx_vertprog_translate(nv30, vp);
+      if (!vp->translated) {
+         nv30->draw_flags |= NV30_NEW_VERTPROG;
+         return;
+      }
+      nv30->dirty |= NV30_NEW_VERTPROG;
+   }
+
+   if (!vp->exec) {
+      /* allocate instruction space, evicting other programs if necessary */
+      struct nouveau_heap *heap = nv30->screen->vp_exec_heap;
+      struct nv30_shader_reloc *reloc = vp->branch_relocs.data;
+      unsigned nr_reloc = vp->branch_relocs.size / sizeof(*reloc);
+      uint32_t *inst, target;
+
+      if (nouveau_heap_alloc(heap, vp->nr_insns, &vp->exec, &vp->exec)) {
+         while (heap->next && heap->size < vp->nr_insns) {
+            struct nouveau_heap **evict = heap->next->priv;
+            nouveau_heap_free(evict);
+         }
+
+         if (nouveau_heap_alloc(heap, vp->nr_insns, &vp->exec, &vp->exec)) {
+            nv30->draw_flags |= NV30_NEW_VERTPROG;
+            return;
+         }
+      }
+
+      /* patch branch targets with the final instruction addresses; the
+       * target bitfield layout differs between nv30 and nv40 classes
+       */
+      if (eng3d->oclass < NV40_3D_CLASS) {
+         while (nr_reloc--) {
+            inst     = vp->insns[reloc->location].data;
+            target   = vp->exec->start + reloc->target;
+
+            inst[2] &= ~0x000007fc;
+            inst[2] |= target << 2;
+            reloc++;
+         }
+      } else {
+         while (nr_reloc--) {
+            inst     = vp->insns[reloc->location].data;
+            target   = vp->exec->start + reloc->target;
+
+            inst[2] &= ~0x0000003f;
+            inst[2] |= target >> 3;
+            inst[3] &= ~0xe0000000;
+            inst[3] |= target << 29;
+            reloc++;
+         }
+      }
+
+      upload_code = TRUE;
+   }
+
+   if (vp->nr_consts && !vp->data) {
+      /* allocate constant slots, evicting other programs if necessary */
+      struct nouveau_heap *heap = nv30->screen->vp_data_heap;
+      struct nv30_shader_reloc *reloc = vp->const_relocs.data;
+      unsigned nr_reloc = vp->const_relocs.size / sizeof(*reloc);
+      uint32_t *inst, target;
+
+      if (nouveau_heap_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+         while (heap->next && heap->size < vp->nr_consts) {
+            struct nv30_vertprog *evp = heap->next->priv;
+            nouveau_heap_free(&evp->data);
+         }
+
+         if (nouveau_heap_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+            nv30->draw_flags |= NV30_NEW_VERTPROG;
+            return;
+         }
+      }
+
+      /* patch constant-slot references into the microcode */
+      if (eng3d->oclass < NV40_3D_CLASS) {
+         while (nr_reloc--) {
+            inst     = vp->insns[reloc->location].data;
+            target   = vp->data->start + reloc->target;
+
+            inst[1] &= ~0x0007fc000;
+            inst[1] |= (target & 0x1ff) << 14;
+            reloc++;
+         }
+      } else {
+         while (nr_reloc--) {
+            inst     = vp->insns[reloc->location].data;
+            target   = vp->data->start + reloc->target;
+
+            inst[1] &= ~0x0001ff000;
+            inst[1] |= (target & 0x1ff) << 12;
+            reloc++;
+         }
+      }
+
+      upload_code = TRUE;
+      upload_data = TRUE;
+   }
+
+   if (vp->nr_consts) {
+      struct nv04_resource *res = nv04_resource(nv30->vertprog.constbuf);
+
+      for (i = 0; i < vp->nr_consts; i++) {
+         struct nv30_vertprog_data *data = &vp->consts[i];
+
+         if (data->index < 0) {
+            /* immediate constant: only (re)upload after a fresh allocation */
+            if (!upload_data)
+               continue;
+         } else {
+            /* user constant: skip upload if the cached value is current */
+            float *constbuf = (float *)res->data;
+            if (!upload_data &&
+                !memcmp(data->value, &constbuf[data->index * 4], 16))
+               continue;
+            memcpy(data->value, &constbuf[data->index * 4], 16);
+         }
+
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
+         PUSH_DATA (push, vp->data->start + i);
+         PUSH_DATAp(push, data->value, 4);
+      }
+   }
+
+   if (upload_code) {
+      BEGIN_NV04(push, NV30_3D(VP_UPLOAD_FROM_ID), 1);
+      PUSH_DATA (push, vp->exec->start);
+      for (i = 0; i < vp->nr_insns; i++) {
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_INST(0)), 4);
+         PUSH_DATAp(push, vp->insns[i].data, 4);
+      }
+   }
+
+   if (nv30->dirty & (NV30_NEW_VERTPROG | NV30_NEW_FRAGPROG)) {
+      BEGIN_NV04(push, NV30_3D(VP_START_FROM_ID), 1);
+      PUSH_DATA (push, vp->exec->start);
+      if (eng3d->oclass < NV40_3D_CLASS) {
+         BEGIN_NV04(push, NV30_3D(ENGINE), 1);
+         PUSH_DATA (push, 0x00000013); /* vp instead of ff, somehow */
+      } else {
+         BEGIN_NV04(push, NV40_3D(VP_ATTRIB_EN), 2);
+         PUSH_DATA (push, vp->ir);
+         PUSH_DATA (push, vp->or | fp->vp_or);
+         BEGIN_NV04(push, NV30_3D(ENGINE), 1);
+         PUSH_DATA (push, 0x00000011);
+      }
+   }
+}
+
+/* CSO create hook for vertex shaders: allocate the program object and
+ * stash a private copy of the TGSI tokens plus their scan info; actual
+ * translation is deferred until validation time.
+ */
+static void *
+nv30_vp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   struct nv30_vertprog *vp = CALLOC_STRUCT(nv30_vertprog);
+
+   if (vp) {
+      vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+      tgsi_scan_shader(vp->pipe.tokens, &vp->info);
+   }
+   return vp;
+}
+
+/* CSO delete hook for vertex shaders: release any hardware resources of
+ * the translated program, then the token copy made at create time.
+ */
+static void
+nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_vertprog *vp = hwcso;
+
+   if (vp->translated)
+      nv30_vertprog_destroy(vp);
+   FREE((void *)vp->pipe.tokens);
+   FREE(vp);
+}
+
+/* CSO bind hook: select the given vertex program and flag it for
+ * revalidation on the next draw.
+ */
+static void
+nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+
+   nv30->dirty |= NV30_NEW_VERTPROG;
+   nv30->vertprog.program = hwcso;
+}
+
+/* Wire up the vertex-program pipe_context entry points. */
+void
+nv30_vertprog_init(struct pipe_context *pipe)
+{
+   pipe->create_vs_state = nv30_vp_state_create;
+   pipe->bind_vs_state = nv30_vp_state_bind;
+   pipe->delete_vs_state = nv30_vp_state_delete;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.h b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.h
new file mode 100644
index 00000000000..5556e0c77bd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.h
@@ -0,0 +1,176 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ * ABS - MOV + NV40_VP_INST0_DEST_ABS
+ * POW - EX2 + MUL + LG2
+ * SUB - ADD, second source negated
+ * SWZ - MOV
+ * XPD -
+ *
+ * Register access
+ * - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ * - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ * According to the value returned for
+ * MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ * there are only two address registers available. The destination in the
+ * ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ * When using vanilla ARB_v_p, the proprietary driver will squish both the
+ * available ADDRESS regs into the first hardware reg in the X and Y
+ * components.
+ *
+ * To use an address reg as an index into consts, the CONST_SRC is set to
+ * (const_base + offset) and INDEX_CONST is set.
+ *
+ * To access the second address reg use ADDR_REG_SELECT_1. A particular
+ * component of the address regs is selected with ADDR_SWZ.
+ *
+ * Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details): conditional
+ * execution of an instruction is enabled by setting COND_TEST_ENABLE, and
+ * selecting the condition which will allow the test to pass with
+ * COND_{FL,LT,...}. It is possible to swizzle the values in the condition
+ * register, which allows for testing against an individual component.
+ *
+ * Branching:
+ *
+ * The BRA/CAL instructions seem to follow a slightly different opcode
+ * layout. The destination instruction ID (IADDR) overlaps a source field.
+ * Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ * command, and is incremented automatically on each UPLOAD_INST FIFO
+ * command.
+ *
+ * Conditional branching is achieved by using the condition tests described
+ * above. There doesn't appear to be dedicated looping instructions, but
+ * this can be done using a temp reg + conditional branching.
+ *
+ * Subroutines may be uploaded before the main program itself, but the first
+ * executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+/* guess that this is the same as nv40 */
+#define NV30_VP_INST_INDEX_INPUT (1 << 27)
+
+#define NV30_VP_INST_ADDR_REG_SELECT_1 (1 << 24)
+#define NV30_VP_INST_SRC2_ABS (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT 16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE (1<<14)
+#define NV30_VP_INST_COND_SHIFT 11
+#define NV30_VP_INST_COND_MASK (0x07 << 11)
+#define NV30_VP_INST_COND_SWZ_X_SHIFT 9
+#define NV30_VP_INST_COND_SWZ_X_MASK (0x03 << 9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT 7
+#define NV30_VP_INST_COND_SWZ_Y_MASK (0x03 << 7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT 5
+#define NV30_VP_INST_COND_SWZ_Z_MASK (0x03 << 5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT 3
+#define NV30_VP_INST_COND_SWZ_W_MASK (0x03 << 3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT 3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK (0xFF << 3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT 1
+#define NV30_VP_INST_ADDR_SWZ_MASK (0x03 << 1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT 0
+#define NV30_VP_INST_SCA_OPCODEH_MASK (0x01 << 0)
+
+/* DWORD 1 */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT 28
+#define NV30_VP_INST_SCA_OPCODEL_MASK (0x0F << 28)
+#define NV30_VP_INST_VEC_OPCODE_SHIFT 23
+#define NV30_VP_INST_VEC_OPCODE_MASK (0x1F << 23)
+#define NV30_VP_INST_CONST_SRC_SHIFT 14
+#define NV30_VP_INST_CONST_SRC_MASK (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT 9 /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK (0x0F << 9) /*NV20*/
+#define NV30_VP_INST_SRC0H_SHIFT 0 /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK (0x1FF << 0) /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions. See Branching: label above
+ *
+ * DWORD 2
+ */
+#define NV30_VP_INST_SRC0L_SHIFT 26 /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK (0x3F <<26) /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT 11 /*NV20*/
+#define NV30_VP_INST_SRC1_MASK (0x7FFF<<11) /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT 0 /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK (0x7FF << 0) /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+#define NV30_VP_INST_IADDR_SHIFT 2
+#define NV30_VP_INST_IADDR_MASK (0x1FF << 2) /* NV30_VP_SRC2_LOW_MASK << 28 */
+
+/* DWORD 3 */
+#define NV30_VP_INST_SRC2L_SHIFT 28 /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK (0x0F <<28) /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT 24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT 20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT 16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT 12 /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK (0x0F << 12) /*NV20*/
+#define NV30_VP_INST_DEST_SHIFT 2
+#define NV30_VP_INST_DEST_MASK (0x1F << 2)
+# define NV30_VP_INST_DEST_POS 0
+# define NV30_VP_INST_DEST_BFC0 1
+# define NV30_VP_INST_DEST_BFC1 2
+# define NV30_VP_INST_DEST_COL0 3
+# define NV30_VP_INST_DEST_COL1 4
+# define NV30_VP_INST_DEST_FOGC 5
+# define NV30_VP_INST_DEST_PSZ 6
+# define NV30_VP_INST_DEST_TC(n) (8+(n))
+# define NV30_VP_INST_DEST_CLP(n) (17 + (n))
+
+/* guess that this is the same as nv40 */
+#define NV30_VP_INST_INDEX_CONST (1 << 1)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV30_VP_SRC0_HIGH_SHIFT 6
+#define NV30_VP_SRC0_HIGH_MASK 0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK 0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT 4
+#define NV30_VP_SRC2_HIGH_MASK 0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK 0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+#define NV30_VP_SRC_NEGATE (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT 12
+#define NV30_VP_SRC_REG_SWZ_X_MASK (0x03 <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT 10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK (0x03 <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT 8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK (0x03 << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT 6
+#define NV30_VP_SRC_REG_SWZ_W_MASK (0x03 << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT 6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK (0xFF << 6)
+#define NV30_VP_SRC_TEMP_SRC_SHIFT 2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK (0x0F << 0)
+#define NV30_VP_SRC_REG_TYPE_SHIFT 0
+#define NV30_VP_SRC_REG_TYPE_MASK (0x03 << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP 1
+#define NV30_VP_SRC_REG_TYPE_INPUT 2
+#define NV30_VP_SRC_REG_TYPE_CONST 3 /* guess */
+
+#include "nv30/nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_winsys.h b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
new file mode 100644
index 00000000000..5cee5df60ce
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv30_winsys.h
@@ -0,0 +1,158 @@
+#ifndef __NV30_WINSYS_H__
+#define __NV30_WINSYS_H__
+
+#include <string.h>
+#include "nouveau_winsys.h"
+#include "nouveau_buffer.h"
+
+/*XXX: rnn */
+#define NV40_3D_VTXTEX_OFFSET(i) (0x0900 + ((i) * 0x20)) // 401e80
+#define NV40_3D_VTXTEX_FORMAT(i) (0x0904 + ((i) * 0x20)) // 401e90
+#define NV40_3D_VTXTEX_WRAP(i) (0x0908 + ((i) * 0x20)) // 401ea0
+#define NV40_3D_VTXTEX_ENABLE(i) (0x090c + ((i) * 0x20)) // 401eb0
+#define NV40_3D_VTXTEX_SWZ(i) (0x0910 + ((i) * 0x20)) // 401ec0
+#define NV40_3D_VTXTEX_FILTER(i) (0x0914 + ((i) * 0x20)) // 401ed0
+#define NV40_3D_VTXTEX_SIZE(i) (0x0918 + ((i) * 0x20)) // 401ee0
+#define NV40_3D_VTXTEX_BCOL(i) (0x091c + ((i) * 0x20)) // 401ef0
+#define NV30_3D_VTX_CACHE_INVALIDATE_1710 0x1710
+#define NV30_3D_R1718 0x1718
+#define NV40_3D_PRIM_RESTART_ENABLE 0x1dac
+#define NV40_3D_PRIM_RESTART_INDEX 0x1db0
+
+/* Add a relocation for 'bo' covering the pushbuf dword just written.
+ * 'vor'/'tor' are the alternative values to patch in depending on
+ * whether the bo ends up in VRAM or GART ("tiled"/system) memory. */
+static INLINE void
+PUSH_RELOC(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t offset,
+   uint32_t flags, uint32_t vor, uint32_t tor)
+{
+   nouveau_pushbuf_reloc(push, bo, offset, flags, vor, tor);
+}
+
+/* Fetch the bufctx associated with this pushbuf.  user_priv is assumed
+ * to point at a struct nouveau_bufctx * owned by the context --
+ * NOTE(review): set up outside this header; confirm against nv30_context. */
+static INLINE struct nouveau_bufctx *
+bufctx(struct nouveau_pushbuf *push)
+{
+   struct nouveau_bufctx **pctx = push->user_priv;
+   return *pctx;
+}
+
+/* Drop all buffer references previously recorded under binding slot 'bin'. */
+static INLINE void
+PUSH_RESET(struct nouveau_pushbuf *push, int bin)
+{
+   nouveau_bufctx_reset(bufctx(push), bin);
+}
+
+/* Record a plain reference to 'bo' (no method dword to patch) in slot 'bin'. */
+static INLINE void
+PUSH_REFN(struct nouveau_pushbuf *push, int bin,
+          struct nouveau_bo *bo, uint32_t access)
+{
+   nouveau_bufctx_refn(bufctx(push), bin, bo, access);
+}
+
+/* Emit the low 32 bits of a buffer address as method data and record a
+ * bufctx reloc so the kernel can patch it when the bo moves.  The
+ * (1 << 18) | (subc << 13) | mthd header matches the single-dword
+ * method encoding used by BEGIN_NV04() below (size = 1). */
+static INLINE void
+PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
+           struct nouveau_bo *bo, uint32_t offset, uint32_t access)
+{
+   nouveau_bufctx_mthd(bufctx(push), bin, (1 << 18) | (subc << 13) | mthd,
+                       bo, offset, access | NOUVEAU_BO_LOW, 0, 0)->priv = NULL;
+   PUSH_DATA(push, bo->offset + offset);
+}
+
+/* Emit a placement-dependent method dword: 'vor' if the bo is in VRAM,
+ * 'tor' otherwise, with a reloc recorded so later moves re-patch it. */
+static INLINE void
+PUSH_MTHDo(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
+           struct nouveau_bo *bo, uint32_t access, uint32_t vor, uint32_t tor)
+{
+   nouveau_bufctx_mthd(bufctx(push), bin, (1 << 18) | (subc << 13) | mthd,
+                       bo, 0, access | NOUVEAU_BO_OR, vor, tor)->priv = NULL;
+   if (bo->flags & NOUVEAU_BO_VRAM)
+      PUSH_DATA(push, vor);
+   else
+      PUSH_DATA(push, tor);
+}
+
+/* Like PUSH_MTHDo, but ORs the placement value into caller data. */
+static INLINE void
+PUSH_MTHDs(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
+           struct nouveau_bo *bo, uint32_t data, uint32_t access,
+           uint32_t vor, uint32_t tor)
+{
+   nouveau_bufctx_mthd(bufctx(push), bin, (1 << 18) | (subc << 13) | mthd,
+                       bo, data, access | NOUVEAU_BO_OR, vor, tor)->priv = NULL;
+   if (bo->flags & NOUVEAU_BO_VRAM)
+      PUSH_DATA(push, data | vor);
+   else
+      PUSH_DATA(push, data | tor);
+}
+
+/* General form: optionally adds the bo address (NOUVEAU_BO_LOW) and the
+ * placement value, and returns the bufref so callers can attach their
+ * own 'priv' (see PUSH_RESRC). */
+static INLINE struct nouveau_bufref *
+PUSH_MTHD(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
+          struct nouveau_bo *bo, uint32_t data, uint32_t access,
+          uint32_t vor, uint32_t tor)
+{
+   struct nouveau_bufref *bref =
+      nouveau_bufctx_mthd(bufctx(push), bin, (1 << 18) | (subc << 13) | mthd,
+                          bo, data, access | NOUVEAU_BO_OR, vor, tor);
+   if (access & NOUVEAU_BO_LOW)
+      data += bo->offset;
+   if (bo->flags & NOUVEAU_BO_VRAM)
+      data |= vor;
+   else
+      data |= tor;
+   PUSH_DATA(push, data);
+   bref->priv = NULL;
+   return bref;
+}
+
+/* PUSH_MTHD for a pipe resource: offsets into the resource's bo and
+ * tags the bufref with the resource for later bookkeeping. */
+static INLINE void
+PUSH_RESRC(struct nouveau_pushbuf *push, int subc, int mthd, int bin,
+           struct nv04_resource *r, uint32_t data, uint32_t access,
+           uint32_t vor, uint32_t tor)
+{
+   PUSH_MTHD(push, subc, mthd, bin, r->bo, r->offset + data,
+             r->domain | access, vor, tor)->priv = r;
+}
+
+/* Begin an incrementing method burst: header then 'size' data dwords. */
+static INLINE void
+BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
+{
+   PUSH_SPACE(push, size + 1);
+   PUSH_DATA (push, 0x00000000 | (size << 18) | (subc << 13) | mthd);
+}
+
+/* Begin a non-incrementing burst (all dwords go to the same method);
+ * 0x40000000 presumably selects the NI encoding -- confirm vs. rnndb. */
+static INLINE void
+BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
+{
+   PUSH_SPACE(push, size + 1);
+   PUSH_DATA (push, 0x40000000 | (size << 18) | (subc << 13) | mthd);
+}
+
+/* subchannel assignment
+ *
+ * 0: <1.0.0 - used by kernel for m2mf
+ * 1.0.0 - used by kernel for nvsw
+ *
+ * 1: <1.0.0 - used by kernel for nvsw
+ * 1.0.0 - free for userspace
+ *
+ * 2-7: free for userspace on all kernel versions
+ */
+
+#define SUBC_M2MF(mthd) 2, (mthd)
+#define NV03_M2MF(mthd) SUBC_M2MF(NV03_M2MF_##mthd)
+
+#define SUBC_SF2D(mthd) 3, (mthd)
+#define NV04_SF2D(mthd) SUBC_SF2D(NV04_SURFACE_2D_##mthd)
+
+#define SUBC_SSWZ(mthd) 4, (mthd)
+#define NV04_SSWZ(mthd) SUBC_SSWZ(NV04_SURFACE_SWZ_##mthd)
+
+#define SUBC_SIFM(mthd) 5, (mthd)
+#define NV03_SIFM(mthd) SUBC_SIFM(NV03_SIFM_##mthd)
+#define NV05_SIFM(mthd) SUBC_SIFM(NV05_SIFM_##mthd)
+
+#define SUBC_3D(mthd) 7, (mthd)
+#define NV30_3D(mthd) SUBC_3D(NV30_3D_##mthd)
+#define NV40_3D(mthd) SUBC_3D(NV40_3D_##mthd)
+
+#define NV01_SUBC(subc, mthd) SUBC_##subc((NV01_SUBCHAN_##mthd))
+#define NV11_SUBC(subc, mthd) SUBC_##subc((NV11_SUBCHAN_##mthd))
+
+#define NV04_GRAPH(subc, mthd) SUBC_##subc((NV04_GRAPH_##mthd))
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv40_vertprog.h b/src/gallium/drivers/nouveau/nv30/nv40_vertprog.h
new file mode 100644
index 00000000000..b369ced886e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv40_vertprog.h
@@ -0,0 +1,178 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30. Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ * In some cases it is possible to put two instructions into one opcode
+ * slot. The rules for when this is OK is not entirely clear to me yet.
+ *
+ * There are separate writemasks and dest temp register fields for each
+ * grouping of instructions. There is however only one field with the
+ * ID of a result register. Writing to temp/result regs is selected by
+ * setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ * The source/dest temp register fields have been extended by 1 bit, to
+ * give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ * NV40 can use an address register to index into vertex attribute regs.
+ * This is done by putting the offset value into INPUT_SRC and setting
+ * the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ * There is a second condition code register on NV40, it's use is enabled
+ * by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ * TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT (1 << 27)
+#define NV40_VP_INST_SATURATE (1 << 26)
+#define NV40_VP_INST_COND_REG_SELECT_1 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS (1 << 23)
+#define NV40_VP_INST_SRC1_ABS (1 << 22)
+#define NV40_VP_INST_SRC0_ABS (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT 15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK (0x3F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE (1 << 13)
+#define NV40_VP_INST_COND_SHIFT 10
+#define NV40_VP_INST_COND_MASK (0x7 << 10)
+#define NV40_VP_INST_COND_SWZ_X_SHIFT 8
+#define NV40_VP_INST_COND_SWZ_X_MASK (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT 6
+#define NV40_VP_INST_COND_SWZ_Y_MASK (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT 4
+#define NV40_VP_INST_COND_SWZ_Z_MASK (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT 2
+#define NV40_VP_INST_COND_SWZ_W_MASK (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT 2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT 0
+#define NV40_VP_INST_ADDR_SWZ_MASK (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+ NV40_VP_INST_INDEX_INPUT | \
+ NV40_VP_INST_COND_REG_SELECT_1 | \
+ NV40_VP_INST_ADDR_REG_SELECT_1 | \
+ NV40_VP_INST_SRC2_ABS | \
+ NV40_VP_INST_SRC1_ABS | \
+ NV40_VP_INST_SRC0_ABS | \
+ NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+ NV40_VP_INST_COND_TEST_ENABLE | \
+ NV40_VP_INST_COND_MASK | \
+ NV40_VP_INST_COND_SWZ_ALL_MASK | \
+ NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT 22
+#define NV40_VP_INST_VEC_OPCODE_MASK (0x1F << 22)
+#define NV40_VP_INST_SCA_OPCODE_SHIFT 27
+#define NV40_VP_INST_SCA_OPCODE_MASK (0x1F << 27)
+#define NV40_VP_INST_CONST_SRC_SHIFT 12
+#define NV40_VP_INST_CONST_SRC_MASK (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT 8
+#define NV40_VP_INST_INPUT_SRC_MASK (0x0F << 8)
+#define NV40_VP_INST_SRC0H_SHIFT 0
+#define NV40_VP_INST_SRC0H_MASK (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+ NV40_VP_INST_VEC_OPCODE_MASK | \
+ NV40_VP_INST_SCA_OPCODE_MASK | \
+ NV40_VP_INST_CONST_SRC_MASK | \
+ NV40_VP_INST_INPUT_SRC_MASK | \
+ NV40_VP_INST_SRC0H_MASK \
+ )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT 23
+#define NV40_VP_INST_SRC0L_MASK (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT 6
+#define NV40_VP_INST_SRC1_MASK (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT 0
+#define NV40_VP_INST_SRC2H_MASK (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT 0
+#define NV40_VP_INST_IADDRH_MASK (0x3F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT 29
+#define NV40_VP_INST_IADDRL_MASK (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT 21
+#define NV40_VP_INST_SRC2L_MASK (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT 17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK (0xF << 17)
+# define NV40_VP_INST_SCA_WRITEMASK_X (1 << 20)
+# define NV40_VP_INST_SCA_WRITEMASK_Y (1 << 19)
+# define NV40_VP_INST_SCA_WRITEMASK_Z (1 << 18)
+# define NV40_VP_INST_SCA_WRITEMASK_W (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT 13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK (0xF << 13)
+# define NV40_VP_INST_VEC_WRITEMASK_X (1 << 16)
+# define NV40_VP_INST_VEC_WRITEMASK_Y (1 << 15)
+# define NV40_VP_INST_VEC_WRITEMASK_Z (1 << 14)
+# define NV40_VP_INST_VEC_WRITEMASK_W (1 << 13)
+#define NV40_VP_INST_SCA_RESULT (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT 7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT 2
+#define NV40_VP_INST_DEST_MASK (31 << 2)
+# define NV40_VP_INST_DEST_POS 0
+# define NV40_VP_INST_DEST_COL0 1
+# define NV40_VP_INST_DEST_COL1 2
+# define NV40_VP_INST_DEST_BFC0 3
+# define NV40_VP_INST_DEST_BFC1 4
+# define NV40_VP_INST_DEST_FOGC 5
+# define NV40_VP_INST_DEST_PSZ 6
+# define NV40_VP_INST_DEST_TC0 7
+# define NV40_VP_INST_DEST_TC(n) (7+n)
+# define NV40_VP_INST_DEST_TEMP 0x1F
+#define NV40_VP_INST_INDEX_CONST (1 << 1)
+#define NV40_VP_INST3_KNOWN ( \
+ NV40_VP_INST_SRC2L_MASK |\
+ NV40_VP_INST_SCA_WRITEMASK_MASK |\
+ NV40_VP_INST_VEC_WRITEMASK_MASK |\
+ NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+ NV40_VP_INST_DEST_MASK |\
+ NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT 9
+#define NV40_VP_SRC0_HIGH_MASK 0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK 0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT 11
+#define NV40_VP_SRC2_HIGH_MASK 0x0001F800
+#define NV40_VP_SRC2_LOW_MASK 0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT 14
+#define NV40_VP_SRC_SWZ_X_MASK (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT 12
+#define NV40_VP_SRC_SWZ_Y_MASK (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT 10
+#define NV40_VP_SRC_SWZ_Z_MASK (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT 8
+#define NV40_VP_SRC_SWZ_W_MASK (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT 8
+#define NV40_VP_SRC_SWZ_ALL_MASK (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT 2
+#define NV40_VP_SRC_TEMP_SRC_MASK (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT 0
+#define NV40_VP_SRC_REG_TYPE_MASK (3 << 0)
+# define NV40_VP_SRC_REG_TYPE_UNK0 0
+# define NV40_VP_SRC_REG_TYPE_TEMP 1
+# define NV40_VP_SRC_REG_TYPE_INPUT 2
+# define NV40_VP_SRC_REG_TYPE_CONST 3
+
+#include "nv30/nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv40_verttex.c b/src/gallium/drivers/nouveau/nv30/nv40_verttex.c
new file mode 100644
index 00000000000..9a7163c448f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nv40_verttex.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2012 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Ben Skeggs
+ *
+ */
+
+#include "util/u_inlines.h"
+#include "nv30/nv30_context.h"
+
+/* Re-emit vertex texture state for every sampler marked dirty.
+ * Currently only the disable path is implemented: a unit with both a
+ * sampler state and a view bound gets NO state emitted (empty branch).
+ *
+ * NOTE(review): 'dirty' is never cleared inside the loop, so this spins
+ * forever as soon as any bit is set -- a 'dirty &= ~(1 << unit)' (or
+ * equivalent) is missing.
+ * NOTE(review): reads fragprog.textures/samplers, not vertprog.* --
+ * looks like a copy-paste from the fragment path; verify intent. */
+void
+nv40_verttex_validate(struct nv30_context *nv30)
+{
+   struct nouveau_pushbuf *push = nv30->base.pushbuf;
+   unsigned dirty = nv30->vertprog.dirty_samplers;
+
+   while (dirty) {
+      unsigned unit = ffs(dirty) - 1;
+      struct nv30_sampler_view *sv = (void *)nv30->fragprog.textures[unit];
+      struct nv30_sampler_state *ss = nv30->fragprog.samplers[unit];
+
+      if (ss && sv) {
+      } else {
+         BEGIN_NV04(push, NV40_3D(VTXTEX_ENABLE(unit)), 1);
+         PUSH_DATA (push, 0);
+      }
+   }
+
+   nv30->vertprog.dirty_samplers = 0;
+}
+
+/* Bind 'nr' vertex sampler CSOs; any previously-bound slots beyond 'nr'
+ * are cleared.  Every touched slot is marked dirty for validation. */
+static void
+nv40_verttex_sampler_states_bind(struct pipe_context *pipe,
+                                 unsigned nr, void **hwcso)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   unsigned i;
+
+   for (i = 0; i < nr; i++) {
+      nv30->vertprog.samplers[i] = hwcso[i];
+      nv30->vertprog.dirty_samplers |= (1 << i);
+   }
+
+   /* unbind stale trailing slots from a previous, larger bind */
+   for (; i < nv30->vertprog.num_samplers; i++) {
+      nv30->vertprog.samplers[i] = NULL;
+      nv30->vertprog.dirty_samplers |= (1 << i);
+   }
+
+   nv30->vertprog.num_samplers = nr;
+   nv30->dirty |= NV30_NEW_VERTTEX;
+}
+
+
+/* Bind 'nr' vertex sampler views (reference-counted); trailing slots
+ * from a previous bind are released.  Each slot's bufctx bin is reset
+ * so old texture relocs are dropped. */
+static void
+nv40_verttex_set_sampler_views(struct pipe_context *pipe, unsigned nr,
+                               struct pipe_sampler_view **views)
+{
+   struct nv30_context *nv30 = nv30_context(pipe);
+   unsigned i;
+
+   for (i = 0; i < nr; i++) {
+      nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VERTTEX(i));
+      pipe_sampler_view_reference(&nv30->vertprog.textures[i], views[i]);
+      nv30->vertprog.dirty_samplers |= (1 << i);
+   }
+
+   for (; i < nv30->vertprog.num_textures; i++) {
+      nouveau_bufctx_reset(nv30->bufctx, BUFCTX_VERTTEX(i));
+      pipe_sampler_view_reference(&nv30->vertprog.textures[i], NULL);
+      nv30->vertprog.dirty_samplers |= (1 << i);
+   }
+
+   nv30->vertprog.num_textures = nr;
+   nv30->dirty |= NV30_NEW_VERTTEX;
+}
+
+/* Hook the NV40 vertex-texture entry points into the pipe context. */
+void
+nv40_verttex_init(struct pipe_context *pipe)
+{
+   pipe->bind_vertex_sampler_states = nv40_verttex_sampler_states_bind;
+   pipe->set_vertex_sampler_views = nv40_verttex_set_sampler_views;
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
new file mode 100644
index 00000000000..4751ec80de5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -0,0 +1,1251 @@
+#include <float.h>
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_linkage.h"
+#include "util/u_inlines.h"
+#include "util/u_debug.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nvfx_shader.h"
+
+/* Per-compile state for translating a TGSI fragment program into
+ * NV30/NV40 fragment-program instructions. */
+struct nvfx_fpc {
+   struct nv30_fragprog *fp;           /* program being built (owns insn buffer) */
+
+   unsigned max_temps;                 /* hw temp register limit */
+   unsigned long long r_temps;         /* bitmask of allocated temps */
+   unsigned long long r_temps_discard; /* temps to free after current TGSI insn */
+   struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* output -> hw reg */
+   struct nvfx_reg r_input[PIPE_MAX_SHADER_INPUTS];   /* input -> hw reg */
+   struct nvfx_reg *r_temp;            /* TGSI temporary -> hw reg */
+
+   int num_regs;                       /* highest temp index written + 1 */
+
+   unsigned inst_offset;               /* dword offset of insn being emitted */
+   unsigned have_const;                /* current insn already has a const slot */
+
+   struct util_dynarray imm_data;      /* packed vec4 immediates, 16 bytes each */
+
+   struct nvfx_reg* r_imm;             /* TGSI immediate -> reg into imm_data */
+   unsigned nr_imm;                    /* number of entries in r_imm */
+
+   struct util_dynarray if_stack;      /* offsets of open IFs awaiting patch-up */
+   //struct util_dynarray loop_stack;
+   struct util_dynarray label_relocs;  /* branch targets fixed up after codegen */
+};
+
+/* Allocate the lowest free hw temp and record it in r_temps_discard so
+ * release_temps() frees it after the current TGSI instruction.
+ * NOTE(review): __builtin_ctzll(0) is undefined; this is safe only while
+ * max_temps < 64 so ~r_temps can never be 0 -- confirm for all chips. */
+static INLINE struct nvfx_reg
+temp(struct nvfx_fpc *fpc)
+{
+   int idx = __builtin_ctzll(~fpc->r_temps);
+
+   if (idx >= fpc->max_temps) {
+      NOUVEAU_ERR("out of temps!!\n");
+      assert(0);
+      return nvfx_reg(NVFXSR_TEMP, 0);
+   }
+
+   fpc->r_temps |= (1ULL << idx);
+   fpc->r_temps_discard |= (1ULL << idx);
+   return nvfx_reg(NVFXSR_TEMP, idx);
+}
+
+/* Free every temp allocated since the last call (end of TGSI insn). */
+static INLINE void
+release_temps(struct nvfx_fpc *fpc)
+{
+   fpc->r_temps &= ~fpc->r_temps_discard;
+   fpc->r_temps_discard = 0ULL;
+}
+
+/* Append a vec4 immediate to imm_data and return a register naming it.
+ * Index is in vec4 units: each immediate occupies 16 bytes (size >> 4). */
+static inline struct nvfx_reg
+nvfx_fp_imm(struct nvfx_fpc *fpc, float a, float b, float c, float d)
+{
+   float v[4] = {a, b, c, d};
+   int idx = fpc->imm_data.size >> 4;
+
+   memcpy(util_dynarray_grow(&fpc->imm_data, sizeof(float) * 4), v, 4 * sizeof(float));
+   return nvfx_reg(NVFXSR_IMM, idx);
+}
+
+/* Grow the program's instruction buffer by 'size' dwords.
+ * NOTE(review): realloc result is unchecked (OOM crashes on next store),
+ * and callers must re-fetch any pointer into fp->insn afterwards since
+ * realloc may move the buffer. */
+static void
+grow_insns(struct nvfx_fpc *fpc, int size)
+{
+   struct nv30_fragprog *fp = fpc->fp;
+
+   fp->insn_len += size;
+   fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
+}
+
+/* Encode source operand 'pos' (0..2) of the instruction at inst_offset.
+ * Immediates and constants get an extra 4-dword slot appended directly
+ * after the instruction (at inst_offset + 4); only one such slot exists
+ * per instruction, tracked by fpc->have_const. */
+static void
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
+{
+   struct nv30_fragprog *fp = fpc->fp;
+   uint32_t *hw = &fp->insn[fpc->inst_offset];
+   uint32_t sr = 0;
+
+   switch (src.reg.type) {
+   case NVFXSR_INPUT:
+      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+      hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
+      break;
+   case NVFXSR_OUTPUT:
+      /* outputs live in the half-register file; then treat like a temp */
+      sr |= NVFX_FP_REG_SRC_HALF;
+      /* fall-through */
+   case NVFXSR_TEMP:
+      sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+      sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
+      break;
+   case NVFXSR_IMM:
+      if (!fpc->have_const) {
+         grow_insns(fpc, 4);
+         /* re-fetch: realloc in grow_insns may have moved fp->insn */
+         hw = &fp->insn[fpc->inst_offset];
+         fpc->have_const = 1;
+      }
+
+      /* copy the vec4 immediate into the const slot */
+      memcpy(&fp->insn[fpc->inst_offset + 4],
+             (float*)fpc->imm_data.data + src.reg.index * 4,
+             sizeof(uint32_t) * 4);
+
+      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
+      break;
+   case NVFXSR_CONST:
+      if (!fpc->have_const) {
+         grow_insns(fpc, 4);
+         /* re-fetch: realloc in grow_insns may have moved fp->insn */
+         hw = &fp->insn[fpc->inst_offset];
+         fpc->have_const = 1;
+      }
+
+      /* record where this user constant must be uploaded at bind time;
+       * the slot itself is zeroed for now */
+      {
+         struct nv30_fragprog_data *fpd;
+
+         fp->consts = realloc(fp->consts, ++fp->nr_consts *
+                              sizeof(*fpd));
+         fpd = &fp->consts[fp->nr_consts - 1];
+         fpd->offset = fpc->inst_offset + 4;
+         fpd->index = src.reg.index;
+         memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+      }
+
+      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
+      break;
+   case NVFXSR_NONE:
+      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+      break;
+   default:
+      assert(0);
+   }
+
+   if (src.negate)
+      sr |= NVFX_FP_REG_NEGATE;
+
+   if (src.abs)
+      hw[1] |= (1 << (29 + pos));  /* per-source abs flags in dword 1 */
+
+   sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
+          (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
+          (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
+          (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
+
+   hw[pos + 1] |= sr;  /* sources occupy dwords 1..3 */
+}
+
+/* Encode the destination of the instruction at inst_offset and keep
+ * num_regs (temp high-water mark) up to date. */
+static void
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
+{
+   struct nv30_fragprog *fp = fpc->fp;
+   uint32_t *hw = &fp->insn[fpc->inst_offset];
+
+   switch (dst.type) {
+   case NVFXSR_OUTPUT:
+      if (dst.index == 1)
+         /* output 1: sets fp_control bits 0x0000000e -- presumably the
+          * depth-output enable; confirm against NV30_3D_FP_CONTROL */
+         fp->fp_control |= 0x0000000e;
+      else {
+         /* other outputs are addressed as half-register pairs */
+         hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
+         dst.index <<= 1;
+      }
+      /* fall-through */
+   case NVFXSR_TEMP:
+      if (fpc->num_regs < (dst.index + 1))
+         fpc->num_regs = dst.index + 1;
+      break;
+   case NVFXSR_NONE:
+      hw[0] |= (1 << 30);  /* no register write */
+      break;
+   default:
+      assert(0);
+   }
+
+   hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
+}
+
+/* Append one 4-dword fragment-program instruction built from 'insn':
+ * opcode/writemask/saturate/condition-code fields, then destination and
+ * the three sources via emit_dst()/emit_src(). */
+static void
+nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
+{
+   struct nv30_fragprog *fp = fpc->fp;
+   uint32_t *hw;
+
+   fpc->inst_offset = fp->insn_len;
+   fpc->have_const = 0;
+   grow_insns(fpc, 4);
+   hw = &fp->insn[fpc->inst_offset];
+   memset(hw, 0, sizeof(uint32_t) * 4);
+
+   /* KIL requires a flag in the program header as well */
+   if (insn.op == NVFX_FP_OP_OPCODE_KIL)
+      fp->fp_control |= NV30_3D_FP_CONTROL_USES_KIL;
+   hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
+   hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
+   hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+
+   if (insn.sat)
+      hw[0] |= NVFX_FP_OP_OUT_SAT;
+
+   if (insn.cc_update)
+      hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
+   hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
+   hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+             (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+             (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+             (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+   /* unit >= 0 only for texture instructions */
+   if(insn.unit >= 0)
+   {
+      hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+   }
+
+   emit_dst(fpc, insn.dst);
+   emit_src(fpc, 0, insn.src[0]);
+   emit_src(fpc, 1, insn.src[1]);
+   emit_src(fpc, 2, insn.src[2]);
+}
+
+/* Build an ALU nvfx_insn: s=saturate, o=opcode suffix, d=dest reg,
+ * m=writemask, s0..s2=sources.  Texture unit is -1 (none). */
+#define arith(s,o,d,m,s0,s1,s2) \
+   nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
+   (d), (m), (s0), (s1), (s2))
+
+/* Build a texture nvfx_insn for unit 'u'.  NOTE(review): the s1/s2
+ * parameters are accepted but ignored (always 'none'), and the macro
+ * expands the identifier 'none' from the call site -- every caller must
+ * have a local 'none' source in scope. */
+#define tex(s,o,u,d,m,s0,s1,s2) \
+   nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
+   (d), (m), (s0), none, none)
+
+/* IF src.x != 0, as TGSI specifies.  First emits a cc-updating MOV of
+ * 'src', then the IF branch itself; the else/endif offsets in hw[2]/hw[3]
+ * are left 0 and patched when the matching ELSE/ENDIF is reached (the
+ * insn offset is pushed on if_stack for that). */
+static void
+nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
+{
+   const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+   struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
+   uint32_t *hw;
+   insn.cc_update = 1;
+   nvfx_fp_emit(fpc, insn);
+
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+           NV40_FP_OP_OUT_NONE |
+           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+   /* Use .xxxx swizzle so that we check only src[0].x*/
+   hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+           (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+           (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+           (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
+           (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
+   hw[3] = 0; /* | endif_offset */
+   util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
+}
+
+/* Emit an unconditional subroutine CALL to label 'target'.  The call
+ * offset is unknown yet; a label_relocs entry records where to patch it. */
+static void
+nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
+{
+   struct nvfx_relocation reloc;
+   uint32_t *hw;
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
+   /* identity swizzle + COND_TR: condition always passes */
+   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
+   hw[3] = 0;
+   reloc.target = target;
+   reloc.location = fpc->inst_offset + 2;
+   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+/* Emit an unconditional subroutine RET. */
+static void
+nv40_fp_ret(struct nvfx_fpc *fpc)
+{
+   uint32_t *hw;
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
+   /* identity swizzle + COND_TR: condition always passes */
+   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+   hw[3] = 0;
+}
+
+/* Emit a REP (repeat 'count' times) whose loop-end offset is patched via
+ * label_relocs once 'target' (the ENDREP label) is placed.  The count is
+ * replicated into all three COUNT fields of dword 2. */
+static void
+nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
+{
+   struct nvfx_relocation reloc;
+   uint32_t *hw;
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
+           NV40_FP_OP_OUT_NONE |
+           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+   /* identity swizzle + COND_TR: condition always passes */
+   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
+           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
+           (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
+           (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
+           (count << NV40_FP_OP_REP_COUNT3_SHIFT);
+   hw[3] = 0; /* | end_offset */
+   reloc.target = target;
+   reloc.location = fpc->inst_offset + 3;
+   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+   //util_dynarray_append(&fpc->loop_stack, unsigned, target);
+}
+
+/* warning: this only works forward, and probably only if not inside any IF.
+ * Encoded as an IF with a never-true condition (COND_FL): execution always
+ * takes the "else" path, and both the else and endif offsets are relocated
+ * to 'target', producing an unconditional forward jump.
+ * NOTE(review): uses COND_SWZ_X_SHIFT where the other helpers shift the
+ * identity swizzle by COND_SWZ_ALL_SHIFT -- verify the intended field. */
+static void
+nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
+{
+   struct nvfx_relocation reloc;
+   uint32_t *hw;
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
+           NV40_FP_OP_OUT_NONE |
+           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
+   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+           (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
+   hw[3] = 0; /* | endif_offset */
+   reloc.target = target;
+   reloc.location = fpc->inst_offset + 2;
+   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+   reloc.target = target;
+   reloc.location = fpc->inst_offset + 3;
+   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
+}
+
+/* Emit an unconditional loop BRK.
+ * NOTE(review): same COND_SWZ_X_SHIFT vs COND_SWZ_ALL_SHIFT question as
+ * nv40_fp_bra above -- verify the intended field. */
+static void
+nv40_fp_brk(struct nvfx_fpc *fpc)
+{
+   uint32_t *hw;
+   fpc->inst_offset = fpc->fp->insn_len;
+   grow_insns(fpc, 4);
+   hw = &fpc->fp->insn[fpc->inst_offset];
+   hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
+           NV40_FP_OP_OUT_NONE;
+   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
+   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
+   hw[3] = 0;
+}
+
+/* Translate a TGSI source operand into an nvfx_src using the register
+ * maps built up in fpc (r_input/r_imm/r_temp/r_result).  Swizzle,
+ * negate and abs modifiers are carried over; indirect addressing is
+ * not supported here and is zeroed. */
+static INLINE struct nvfx_src
+tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+   struct nvfx_src src;
+
+   switch (fsrc->Register.File) {
+   case TGSI_FILE_INPUT:
+      src.reg = fpc->r_input[fsrc->Register.Index];
+      break;
+   case TGSI_FILE_CONSTANT:
+      src.reg = nvfx_reg(NVFXSR_CONST, fsrc->Register.Index);
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      assert(fsrc->Register.Index < fpc->nr_imm);
+      src.reg = fpc->r_imm[fsrc->Register.Index];
+      break;
+   case TGSI_FILE_TEMPORARY:
+      src.reg = fpc->r_temp[fsrc->Register.Index];
+      break;
+   /* NV40 fragprog result regs are just temps, so this is simple */
+   case TGSI_FILE_OUTPUT:
+      src.reg = fpc->r_result[fsrc->Register.Index];
+      break;
+   default:
+      NOUVEAU_ERR("bad src file\n");
+      src.reg.index = 0;
+      src.reg.type = 0;
+      break;
+   }
+
+   src.abs = fsrc->Register.Absolute;
+   src.negate = fsrc->Register.Negate;
+   src.swz[0] = fsrc->Register.SwizzleX;
+   src.swz[1] = fsrc->Register.SwizzleY;
+   src.swz[2] = fsrc->Register.SwizzleZ;
+   src.swz[3] = fsrc->Register.SwizzleW;
+   src.indirect = 0;
+   src.indirect_reg = 0;
+   src.indirect_swz = 0;
+   return src;
+}
+
+/* Translate a TGSI destination into an nvfx_reg; NULL and unknown files
+ * map to a no-write register (NVFXSR_NONE). */
+static INLINE struct nvfx_reg
+tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+   switch (fdst->Register.File) {
+   case TGSI_FILE_OUTPUT:
+      return fpc->r_result[fdst->Register.Index];
+   case TGSI_FILE_TEMPORARY:
+      return fpc->r_temp[fdst->Register.Index];
+   case TGSI_FILE_NULL:
+      return nvfx_reg(NVFXSR_NONE, 0);
+   default:
+      NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
+      return nvfx_reg(NVFXSR_NONE, 0);
+   }
+}
+
+/* Convert a TGSI writemask to the NVFX_FP_MASK_* bit layout. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+   int mask = 0;
+
+   if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_FP_MASK_X;
+   if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_FP_MASK_Y;
+   if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_FP_MASK_Z;
+   if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_FP_MASK_W;
+   return mask;
+}
+
+static boolean
+nvfx_fragprog_parse_instruction(struct nv30_context* nvfx, struct nvfx_fpc *fpc,
+ const struct tgsi_full_instruction *finst)
+{
+ const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+ struct nvfx_insn insn;
+ struct nvfx_src src[3], tmp;
+ struct nvfx_reg dst;
+ int mask, sat, unit = 0;
+ int ai = -1, ci = -1, ii = -1;
+ int i;
+
+ if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+ return TRUE;
+
+ for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+ const struct tgsi_full_src_register *fsrc;
+
+ fsrc = &finst->Src[i];
+ if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+ src[i] = tgsi_src(fpc, fsrc);
+ }
+ }
+
+ for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+ const struct tgsi_full_src_register *fsrc;
+
+ fsrc = &finst->Src[i];
+
+ switch (fsrc->Register.File) {
+ case TGSI_FILE_INPUT:
+ if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG && (0
+ || fsrc->Register.SwizzleX == PIPE_SWIZZLE_ALPHA
+ || fsrc->Register.SwizzleY == PIPE_SWIZZLE_ALPHA
+ || fsrc->Register.SwizzleZ == PIPE_SWIZZLE_ALPHA
+ || fsrc->Register.SwizzleW == PIPE_SWIZZLE_ALPHA
+ )) {
+ /* hardware puts 0 in fogcoord.w, but GL/Gallium want 1 there */
+ struct nvfx_src addend = nvfx_src(nvfx_fp_imm(fpc, 0, 0, 0, 1));
+ addend.swz[0] = fsrc->Register.SwizzleX;
+ addend.swz[1] = fsrc->Register.SwizzleY;
+ addend.swz[2] = fsrc->Register.SwizzleZ;
+ addend.swz[3] = fsrc->Register.SwizzleW;
+ src[i] = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, ADD, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), addend, none));
+ } else if (ai == -1 || ai == fsrc->Register.Index) {
+ ai = fsrc->Register.Index;
+ src[i] = tgsi_src(fpc, fsrc);
+ } else {
+ src[i] = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
+ }
+ break;
+ case TGSI_FILE_CONSTANT:
+ if ((ci == -1 && ii == -1) ||
+ ci == fsrc->Register.Index) {
+ ci = fsrc->Register.Index;
+ src[i] = tgsi_src(fpc, fsrc);
+ } else {
+ src[i] = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
+ }
+ break;
+ case TGSI_FILE_IMMEDIATE:
+ if ((ci == -1 && ii == -1) ||
+ ii == fsrc->Register.Index) {
+ ii = fsrc->Register.Index;
+ src[i] = tgsi_src(fpc, fsrc);
+ } else {
+ src[i] = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
+ }
+ break;
+ case TGSI_FILE_TEMPORARY:
+ /* handled above */
+ break;
+ case TGSI_FILE_SAMPLER:
+ unit = fsrc->Register.Index;
+ break;
+ case TGSI_FILE_OUTPUT:
+ break;
+ default:
+ NOUVEAU_ERR("bad src file\n");
+ return FALSE;
+ }
+ }
+
+ dst = tgsi_dst(fpc, &finst->Dst[0]);
+ mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+ sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+ switch (finst->Instruction.Opcode) {
+ case TGSI_OPCODE_ABS:
+ nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none));
+ break;
+ case TGSI_OPCODE_ADD:
+ nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_CEIL:
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
+ nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
+ break;
+ case TGSI_OPCODE_CMP:
+ insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+ insn.cc_update = 1;
+ nvfx_fp_emit(fpc, insn);
+
+ insn = arith(sat, MOV, dst, mask, src[2], none, none);
+ insn.cc_test = NVFX_COND_GE;
+ nvfx_fp_emit(fpc, insn);
+
+ insn = arith(sat, MOV, dst, mask, src[1], none, none);
+ insn.cc_test = NVFX_COND_LT;
+ nvfx_fp_emit(fpc, insn);
+ break;
+ case TGSI_OPCODE_COS:
+ nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_DDX:
+ if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+ nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+ nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+ nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
+ } else {
+ nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
+ }
+ break;
+ case TGSI_OPCODE_DDY:
+ if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
+ nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
+ nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
+ nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
+ } else {
+ nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
+ }
+ break;
+ case TGSI_OPCODE_DP2:
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
+ nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
+ break;
+ case TGSI_OPCODE_DP3:
+ nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_DP4:
+ nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_DPH:
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none));
+ nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none));
+ break;
+ case TGSI_OPCODE_DST:
+ nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_EX2:
+ nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_FLR:
+ nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_FRC:
+ nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_KILL:
+ nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
+ break;
+ case TGSI_OPCODE_KILL_IF:
+ insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
+ insn.cc_update = 1;
+ nvfx_fp_emit(fpc, insn);
+
+ insn = arith(0, KIL, none.reg, 0, none, none, none);
+ insn.cc_test = NVFX_COND_LT;
+ nvfx_fp_emit(fpc, insn);
+ break;
+ case TGSI_OPCODE_LG2:
+ nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_LIT:
+ if(!nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], none, none));
+ else {
+ /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by
+ * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement
+ *
+ * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead
+ */
+ struct nvfx_src maxs = nvfx_src(nvfx_fp_imm(fpc, 0, FLT_MIN, 0, 0));
+ tmp = nvfx_src(temp(fpc));
+ if (ci>= 0 || ii >= 0) {
+ nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none));
+ maxs = tmp;
+ }
+ nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none));
+ nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none));
+ nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none));
+ nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none));
+ }
+ break;
+ case TGSI_OPCODE_LRP:
+ if(!nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
+ else {
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+ nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
+ }
+ break;
+ case TGSI_OPCODE_MAD:
+ nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
+ break;
+ case TGSI_OPCODE_MAX:
+ nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_MIN:
+ nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_MOV:
+ nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_MUL:
+ nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_NOP:
+ break;
+ case TGSI_OPCODE_POW:
+ if(!nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
+ else {
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+ nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+ nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
+ }
+ break;
+ case TGSI_OPCODE_RCP:
+ nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_RFL:
+ if(!nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none));
+ else {
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[0], none));
+ nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_Y, src[0], src[1], none));
+ insn = arith(0, DIV, tmp.reg, NVFX_FP_MASK_Z, swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+ insn.scale = NVFX_FP_OP_DST_SCALE_2X;
+ nvfx_fp_emit(fpc, insn);
+ nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])));
+ }
+ break;
+ case TGSI_OPCODE_RSQ:
+ if(!nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
+ else {
+ tmp = nvfx_src(temp(fpc));
+ insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
+ insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
+ nvfx_fp_emit(fpc, insn);
+ nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
+ }
+ break;
+ case TGSI_OPCODE_SCS:
+ /* avoid overwriting the source */
+ if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
+ {
+ if (mask & NVFX_FP_MASK_X)
+ nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+ if (mask & NVFX_FP_MASK_Y)
+ nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
+ }
+ else
+ {
+ if (mask & NVFX_FP_MASK_Y)
+ nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none));
+ if (mask & NVFX_FP_MASK_X)
+ nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
+ }
+ break;
+ case TGSI_OPCODE_SEQ:
+ nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SFL:
+ nvfx_fp_emit(fpc, arith(sat, SFL, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SGE:
+ nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SGT:
+ nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SIN:
+ nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_SLE:
+ nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SLT:
+ nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SNE:
+ nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SSG:
+ {
+ struct nvfx_src minones = swz(nvfx_src(nvfx_fp_imm(fpc, -1, -1, -1, -1)), X, X, X, X);
+
+ insn = arith(sat, MOV, dst, mask, src[0], none, none);
+ insn.cc_update = 1;
+ nvfx_fp_emit(fpc, insn);
+
+ insn = arith(0, STR, dst, mask, none, none, none);
+ insn.cc_test = NVFX_COND_GT;
+ nvfx_fp_emit(fpc, insn);
+
+ if(!sat) {
+ insn = arith(0, MOV, dst, mask, minones, none, none);
+ insn.cc_test = NVFX_COND_LT;
+ nvfx_fp_emit(fpc, insn);
+ }
+ break;
+ }
+ case TGSI_OPCODE_STR:
+ nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none));
+ break;
+ case TGSI_OPCODE_SUB:
+ nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none));
+ break;
+ case TGSI_OPCODE_TEX:
+ nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_TRUNC:
+ tmp = nvfx_src(temp(fpc));
+ insn = arith(0, MOV, none.reg, mask, src[0], none, none);
+ insn.cc_update = 1;
+ nvfx_fp_emit(fpc, insn);
+
+ nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
+ nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
+
+ insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
+ insn.cc_test = NVFX_COND_LT;
+ nvfx_fp_emit(fpc, insn);
+ break;
+ case TGSI_OPCODE_TXB:
+ nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_TXL:
+ if(nvfx->is_nv4x)
+ nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
+ else /* unsupported on nv30, use TEX and hope they like it */
+ nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_TXP:
+ nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
+ break;
+ case TGSI_OPCODE_XPD:
+ tmp = nvfx_src(temp(fpc));
+ nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+ nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
+ break;
+
+ case TGSI_OPCODE_IF:
+ // MOVRC0 R31 (TR0.xyzw), R<src>:
+ // IF (NE.xxxx) ELSE <else> END <end>
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_if(fpc, src[0]);
+ break;
+
+ case TGSI_OPCODE_ELSE:
+ {
+ uint32_t *hw;
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+ hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+ break;
+ }
+
+ case TGSI_OPCODE_ENDIF:
+ {
+ uint32_t *hw;
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ assert(util_dynarray_contains(&fpc->if_stack, unsigned));
+ hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
+ if(!hw[2])
+ hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
+ hw[3] = fpc->fp->insn_len;
+ break;
+ }
+
+ case TGSI_OPCODE_BRA:
+ /* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */
+ /* no state tracker uses this, so don't implement this for now */
+ assert(0);
+ nv40_fp_bra(fpc, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_BGNSUB:
+ case TGSI_OPCODE_ENDSUB:
+ /* nothing to do here */
+ break;
+
+ case TGSI_OPCODE_CAL:
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_cal(fpc, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_RET:
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_ret(fpc);
+ break;
+
+ case TGSI_OPCODE_BGNLOOP:
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
+ nv40_fp_rep(fpc, 255, finst->Label.Label);
+ break;
+
+ case TGSI_OPCODE_ENDLOOP:
+ break;
+
+ case TGSI_OPCODE_BRK:
+ if(!nvfx->use_nv4x)
+ goto nv3x_cflow;
+ nv40_fp_brk(fpc);
+ break;
+
+ case TGSI_OPCODE_CONT:
+ {
+ static int warned = 0;
+ if(!warned) {
+ NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
+ warned = 1;
+ }
+ break;
+ }
+
+ default:
+ NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+ return FALSE;
+ }
+
+out:
+ release_temps(fpc);
+ return TRUE;
+nv3x_cflow:
+ {
+ static int warned = 0;
+ if(!warned) {
+ NOUVEAU_ERR(
+ "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
+ "If rendering is incorrect, try to disable GLSL support in the application.\n");
+ warned = 1;
+ }
+ }
+ goto out;
+}
+
+/* Map a TGSI fragment-shader input declaration onto the hardware input
+ * source slot (position, colors, fog factor, facing, texcoords).
+ * GENERIC and PCOORD inputs are deliberately deferred here: they are
+ * packed into whatever TC slots remain free by a later pass
+ * (nvfx_fragprog_assign_generic).
+ * Returns FALSE only for an unrecognized input semantic.
+ */
+static boolean
+nvfx_fragprog_parse_decl_input(struct nv30_context *nvfx, struct nvfx_fpc *fpc,
+ const struct tgsi_full_declaration *fdec)
+{
+ unsigned idx = fdec->Range.First;
+ unsigned hw;
+
+ switch (fdec->Semantic.Name) {
+ case TGSI_SEMANTIC_POSITION:
+ hw = NVFX_FP_OP_INPUT_SRC_POSITION;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ hw = NVFX_FP_OP_INPUT_SRC_COL0 + fdec->Semantic.Index;
+ break;
+ case TGSI_SEMANTIC_FOG:
+ hw = NVFX_FP_OP_INPUT_SRC_FOGC;
+ break;
+ case TGSI_SEMANTIC_FACE:
+ /* front/back facing input exists on nv40-class hardware only */
+ hw = NV40_FP_OP_INPUT_SRC_FACING;
+ break;
+ case TGSI_SEMANTIC_TEXCOORD:
+ assert(fdec->Semantic.Index < 8);
+ /* record which TC slot this coord occupies and advertise it to the
+ * vertex-program side via the vp_or bits */
+ fpc->fp->texcoord[fdec->Semantic.Index] = fdec->Semantic.Index;
+ fpc->fp->texcoords |= (1 << fdec->Semantic.Index);
+ fpc->fp->vp_or |= (0x00004000 << fdec->Semantic.Index);
+ hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
+ break;
+ case TGSI_SEMANTIC_GENERIC:
+ case TGSI_SEMANTIC_PCOORD:
+ /* will be assigned to remaining TC slots later */
+ return TRUE;
+ default:
+ assert(0);
+ return FALSE;
+ }
+
+ fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
+ return TRUE;
+}
+
+/* Second-pass input assignment: place GENERIC/PCOORD inputs into the
+ * first free texcoord slot.  A slot is free while texcoord[hw] is the
+ * 0xffff sentinel (set by memset in _nvfx_fragprog_translate); PCOORD
+ * slots are marked with 0xfffe and enable point-sprite replacement for
+ * that coord.  nv4x exposes 10 TC slots, nv3x only 8; slots 8-9 use a
+ * different vp_or bit range.  Returns FALSE when no slot is left.
+ */
+static boolean
+nvfx_fragprog_assign_generic(struct nv30_context *nvfx, struct nvfx_fpc *fpc,
+ const struct tgsi_full_declaration *fdec)
+{
+ unsigned num_texcoords = nvfx->use_nv4x ? 10 : 8;
+ unsigned idx = fdec->Range.First;
+ unsigned hw;
+
+ switch (fdec->Semantic.Name) {
+ case TGSI_SEMANTIC_GENERIC:
+ case TGSI_SEMANTIC_PCOORD:
+ for (hw = 0; hw < num_texcoords; hw++) {
+ if (fpc->fp->texcoord[hw] == 0xffff) {
+ if (hw <= 7) {
+ fpc->fp->texcoords |= (0x1 << hw);
+ fpc->fp->vp_or |= (0x00004000 << hw);
+ } else {
+ /* TC8/TC9 (nv4x only) live in a separate vp_or bit field */
+ fpc->fp->vp_or |= (0x00001000 << (hw - 8));
+ }
+ if (fdec->Semantic.Name == TGSI_SEMANTIC_PCOORD) {
+ /* 0xfffe marks a point-coord replaced slot */
+ fpc->fp->texcoord[hw] = 0xfffe;
+ fpc->fp->point_sprite_control |= (0x00000100 << hw);
+ } else {
+ /* generic inputs are remembered offset by 8 past the
+ * dedicated TEXCOORD indices */
+ fpc->fp->texcoord[hw] = fdec->Semantic.Index + 8;
+ }
+ hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
+ fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
+ return TRUE;
+ }
+ }
+ /* ran out of texcoord slots */
+ return FALSE;
+ default:
+ /* other semantics were handled by nvfx_fragprog_parse_decl_input */
+ return TRUE;
+ }
+}
+
+/* Map a TGSI fragment-shader output declaration onto a hardware register.
+ * On this hardware outputs alias the temp register file: color0 is R0,
+ * depth is R1, and the extra MRT colors map to R2..R4 (nv4x allows four
+ * render targets, nv3x fewer).  The chosen register is also reserved in
+ * r_temps so the temp allocator never hands it out.
+ * Returns FALSE for an unsupported semantic or color index.
+ */
+static boolean
+nvfx_fragprog_parse_decl_output(struct nv30_context* nvfx, struct nvfx_fpc *fpc,
+ const struct tgsi_full_declaration *fdec)
+{
+ unsigned idx = fdec->Range.First;
+ unsigned hw;
+
+ switch (fdec->Semantic.Name) {
+ case TGSI_SEMANTIC_POSITION:
+ /* fragment depth output lives in R1 */
+ hw = 1;
+ break;
+ case TGSI_SEMANTIC_COLOR:
+ hw = ~0;
+ switch (fdec->Semantic.Index) {
+ case 0: hw = 0; break;
+ case 1: hw = 2; break;
+ case 2: hw = 3; break;
+ case 3: hw = 4; break;
+ }
+ /* ~0 from an out-of-range index also fails this bound check */
+ if(hw > ((nvfx->use_nv4x) ? 4 : 2)) {
+ NOUVEAU_ERR("bad rcol index\n");
+ return FALSE;
+ }
+ break;
+ default:
+ NOUVEAU_ERR("bad output semantic\n");
+ return FALSE;
+ }
+
+ fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
+ /* reserve the aliased temp so regular temps cannot clobber it */
+ fpc->r_temps |= (1ULL << hw);
+ return TRUE;
+}
+
+/* Pre-pass over the TGSI token stream before instruction translation.
+ * Pass 1: resolve input/output declarations, record the highest declared
+ * temporary, and upload float immediates into the program's constant
+ * area.  Pass 2: assign GENERIC/PCOORD inputs to the leftover texcoord
+ * slots (must run after all dedicated TEXCOORD inputs are known).
+ * Finally pre-allocates one hardware temp per declared TGSI temporary.
+ * Returns FALSE (after freeing r_temp) on any declaration error.
+ */
+static boolean
+nvfx_fragprog_prepare(struct nv30_context* nvfx, struct nvfx_fpc *fpc)
+{
+ struct tgsi_parse_context p;
+ int high_temp = -1, i;
+
+ /* NOTE(review): CALLOC result is not checked here — presumably OOM is
+ * considered fatal elsewhere; confirm against the callers. */
+ fpc->r_imm = CALLOC(fpc->fp->info.immediate_count, sizeof(struct nvfx_reg));
+
+ tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+ while (!tgsi_parse_end_of_tokens(&p)) {
+ const union tgsi_full_token *tok = &p.FullToken;
+
+ tgsi_parse_token(&p);
+ switch(tok->Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ {
+ const struct tgsi_full_declaration *fdec;
+ fdec = &p.FullToken.FullDeclaration;
+ switch (fdec->Declaration.File) {
+ case TGSI_FILE_INPUT:
+ if (!nvfx_fragprog_parse_decl_input(nvfx, fpc, fdec))
+ goto out_err;
+ break;
+ case TGSI_FILE_OUTPUT:
+ if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
+ goto out_err;
+ break;
+ case TGSI_FILE_TEMPORARY:
+ /* track the largest temp index ever declared */
+ if (fdec->Range.Last > high_temp) {
+ high_temp =
+ fdec->Range.Last;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ break;
+ case TGSI_TOKEN_TYPE_IMMEDIATE:
+ {
+ struct tgsi_full_immediate *imm;
+
+ imm = &p.FullToken.FullImmediate;
+ assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+ assert(fpc->nr_imm < fpc->fp->info.immediate_count);
+
+ fpc->r_imm[fpc->nr_imm++] = nvfx_fp_imm(fpc, imm->u[0].Float, imm->u[1].Float, imm->u[2].Float, imm->u[3].Float);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ tgsi_parse_free(&p);
+
+ /* second walk: now that every dedicated TEXCOORD slot is claimed,
+ * pack GENERIC/PCOORD inputs into the remaining TC slots */
+ tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+ while (!tgsi_parse_end_of_tokens(&p)) {
+ const struct tgsi_full_declaration *fdec;
+ tgsi_parse_token(&p);
+ switch(p.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_DECLARATION:
+ fdec = &p.FullToken.FullDeclaration;
+ switch (fdec->Declaration.File) {
+ case TGSI_FILE_INPUT:
+ if (!nvfx_fragprog_assign_generic(nvfx, fpc, fdec))
+ goto out_err;
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ tgsi_parse_free(&p);
+
+ /* high_temp started at -1, so ++high_temp is the temp count and is
+ * nonzero exactly when any temporaries were declared */
+ if (++high_temp) {
+ fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
+ for (i = 0; i < high_temp; i++)
+ fpc->r_temp[i] = temp(fpc);
+ /* declared temps are long-lived: drop them from the per-instruction
+ * discard set */
+ fpc->r_temps_discard = 0ULL;
+ }
+
+ return TRUE;
+
+out_err:
+ FREE(fpc->r_temp);
+ fpc->r_temp = NULL;
+
+ tgsi_parse_free(&p);
+ return FALSE;
+}
+
+/* NVFX_DUMP_FP=1 in the environment dumps TGSI + generated code */
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE)
+
+/* Translate a TGSI fragment program into NV30/NV40 hardware code.
+ * Drives nvfx_fragprog_prepare() for declarations, then translates each
+ * instruction, resolves branch-label relocations, sets the temp-count
+ * field of fp_control, and appends a terminating NOP+END quad.  On any
+ * failure fp->translated stays FALSE and the offending program is
+ * dumped; fpc scratch state is freed on every path.
+ */
+void
+_nvfx_fragprog_translate(struct nv30_context *nvfx, struct nv30_fragprog *fp,
+ boolean emulate_sprite_flipping)
+{
+ struct tgsi_parse_context parse;
+ struct nvfx_fpc *fpc = NULL;
+ struct util_dynarray insns;
+
+ fp->translated = FALSE;
+ fp->point_sprite_control = 0;
+ fp->vp_or = 0;
+
+ fpc = CALLOC_STRUCT(nvfx_fpc);
+ if (!fpc)
+ goto out_err;
+
+ fpc->max_temps = nvfx->use_nv4x ? 48 : 32;
+ fpc->fp = fp;
+ fpc->num_regs = 2;
+ /* 0xffff in every texcoord[] entry marks the slot as unassigned */
+ memset(fp->texcoord, 0xff, sizeof(fp->texcoord));
+
+ for (unsigned i = 0; i < fp->info.num_properties; ++i) {
+ switch (fp->info.properties[i].name) {
+ case TGSI_PROPERTY_FS_COORD_ORIGIN:
+ if (fp->info.properties[i].data[0])
+ fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED;
+ break;
+ case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
+ if (fp->info.properties[i].data[0])
+ fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER;
+ break;
+ case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+ if (fp->info.properties[i].data[0])
+ fp->rt_enable |= NV30_3D_RT_ENABLE_MRT;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!nvfx_fragprog_prepare(nvfx, fpc))
+ goto out_err;
+
+ tgsi_parse_init(&parse, fp->pipe.tokens);
+ util_dynarray_init(&insns);
+
+ while (!tgsi_parse_end_of_tokens(&parse)) {
+ tgsi_parse_token(&parse);
+
+ switch (parse.FullToken.Token.Type) {
+ case TGSI_TOKEN_TYPE_INSTRUCTION:
+ {
+ const struct tgsi_full_instruction *finst;
+
+ /* remember the hw offset of each TGSI instruction so label
+ * relocations can be resolved below */
+ util_dynarray_append(&insns, unsigned, fp->insn_len);
+ finst = &parse.FullToken.FullInstruction;
+ if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst))
+ goto out_err;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ /* one extra entry so labels pointing past the last instruction work */
+ util_dynarray_append(&insns, unsigned, fp->insn_len);
+
+ /* patch branch targets: translate TGSI instruction indices into the
+ * hardware instruction offsets recorded above */
+ for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
+ {
+ struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
+ fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
+ }
+ util_dynarray_fini(&insns);
+
+ /* the temp-count field is encoded differently pre-nv40 */
+ if(!nvfx->is_nv4x)
+ fp->fp_control |= (fpc->num_regs-1)/2;
+ else
+ fp->fp_control |= fpc->num_regs << NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT;
+
+ /* Terminate final instruction */
+ if(fp->insn)
+ fp->insn[fpc->inst_offset] |= 0x00000001;
+
+ /* Append NOP + END instruction for branches to the end of the program */
+ fpc->inst_offset = fp->insn_len;
+ grow_insns(fpc, 4);
+ fp->insn[fpc->inst_offset + 0] = 0x00000001;
+ fp->insn[fpc->inst_offset + 1] = 0x00000000;
+ fp->insn[fpc->inst_offset + 2] = 0x00000000;
+ fp->insn[fpc->inst_offset + 3] = 0x00000000;
+
+ if(debug_get_option_nvfx_dump_fp())
+ {
+ debug_printf("\n");
+ tgsi_dump(fp->pipe.tokens, 0);
+
+ debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x");
+ for (unsigned i = 0; i < fp->insn_len; i += 4)
+ debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
+ debug_printf("\n");
+ }
+
+ fp->translated = TRUE;
+
+out:
+ /* common cleanup for both success and failure paths */
+ tgsi_parse_free(&parse);
+ if(fpc)
+ {
+ FREE(fpc->r_temp);
+ util_dynarray_fini(&fpc->if_stack);
+ util_dynarray_fini(&fpc->label_relocs);
+ util_dynarray_fini(&fpc->imm_data);
+ //util_dynarray_fini(&fpc->loop_stack);
+ FREE(fpc);
+ }
+
+ return;
+
+out_err:
+ _debug_printf("Error: failed to compile this fragment program:\n");
+ tgsi_dump(fp->pipe.tokens, 0);
+ goto out;
+}
+
+/* Copy fragment-program instruction words to their destination buffer.
+ * On big-endian hosts each 32-bit word has its 16-bit halves swapped —
+ * presumably the byte layout the fragment-program engine expects there;
+ * confirm against the hardware docs.  len is assumed to be a multiple
+ * of 4 (instructions are whole 32-bit words). */
+static inline void
+nvfx_fp_memcpy(void* dst, const void* src, size_t len)
+{
+#ifndef PIPE_ARCH_BIG_ENDIAN
+ memcpy(dst, src, len);
+#else
+ size_t i;
+ for(i = 0; i < len; i += 4) {
+ uint32_t v = *(uint32_t*)((char*)src + i);
+ *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
+ }
+#endif
+}
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_shader.h b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
new file mode 100644
index 00000000000..987e1b043dd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_shader.h
@@ -0,0 +1,525 @@
+#ifndef __NVFX_SHADER_H__
+#define __NVFX_SHADER_H__
+
+#include <stdint.h>
+
+#include "pipe/p_compiler.h"
+
+#define NVFX_SWZ_IDENTITY ((3 << 6) | (2 << 4) | (1 << 2) | (0 << 0))
+
+/* this will resolve to either the NV30 or the NV40 version
+ * depending on the current hardware */
+/* unusual, but very fast and compact method */
+#define NVFX_VP(c) ((NV30_VP_##c) + (nv30->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))
+
+#define NVFX_VP_INST_SLOT_VEC 0
+#define NVFX_VP_INST_SLOT_SCA 1
+
+#define NVFX_VP_INST_IN_POS 0 /* These seem to match the bindings specified in */
+#define NVFX_VP_INST_IN_WEIGHT 1 /* the ARB_v_p spec (2.14.3.1) */
+#define NVFX_VP_INST_IN_NORMAL 2
+#define NVFX_VP_INST_IN_COL0 3 /* Should probably confirm them all though */
+#define NVFX_VP_INST_IN_COL1 4
+#define NVFX_VP_INST_IN_FOGC 5
+#define NVFX_VP_INST_IN_TC0 8
+#define NVFX_VP_INST_IN_TC(n) (8+n)
+
+#define NVFX_VP_INST_SCA_OP_NOP 0x00
+#define NVFX_VP_INST_SCA_OP_MOV 0x01
+#define NVFX_VP_INST_SCA_OP_RCP 0x02
+#define NVFX_VP_INST_SCA_OP_RCC 0x03
+#define NVFX_VP_INST_SCA_OP_RSQ 0x04
+#define NVFX_VP_INST_SCA_OP_EXP 0x05
+#define NVFX_VP_INST_SCA_OP_LOG 0x06
+#define NVFX_VP_INST_SCA_OP_LIT 0x07
+#define NVFX_VP_INST_SCA_OP_BRA 0x09
+#define NVFX_VP_INST_SCA_OP_CAL 0x0B
+#define NVFX_VP_INST_SCA_OP_RET 0x0C
+#define NVFX_VP_INST_SCA_OP_LG2 0x0D
+#define NVFX_VP_INST_SCA_OP_EX2 0x0E
+#define NVFX_VP_INST_SCA_OP_SIN 0x0F
+#define NVFX_VP_INST_SCA_OP_COS 0x10
+
+#define NV40_VP_INST_SCA_OP_PUSHA 0x13
+#define NV40_VP_INST_SCA_OP_POPA 0x14
+
+#define NVFX_VP_INST_VEC_OP_NOP 0x00
+#define NVFX_VP_INST_VEC_OP_MOV 0x01
+#define NVFX_VP_INST_VEC_OP_MUL 0x02
+#define NVFX_VP_INST_VEC_OP_ADD 0x03
+#define NVFX_VP_INST_VEC_OP_MAD 0x04
+#define NVFX_VP_INST_VEC_OP_DP3 0x05
+#define NVFX_VP_INST_VEC_OP_DPH 0x06
+#define NVFX_VP_INST_VEC_OP_DP4 0x07
+#define NVFX_VP_INST_VEC_OP_DST 0x08
+#define NVFX_VP_INST_VEC_OP_MIN 0x09
+#define NVFX_VP_INST_VEC_OP_MAX 0x0A
+#define NVFX_VP_INST_VEC_OP_SLT 0x0B
+#define NVFX_VP_INST_VEC_OP_SGE 0x0C
+#define NVFX_VP_INST_VEC_OP_ARL 0x0D
+#define NVFX_VP_INST_VEC_OP_FRC 0x0E
+#define NVFX_VP_INST_VEC_OP_FLR 0x0F
+#define NVFX_VP_INST_VEC_OP_SEQ 0x10
+#define NVFX_VP_INST_VEC_OP_SFL 0x11
+#define NVFX_VP_INST_VEC_OP_SGT 0x12
+#define NVFX_VP_INST_VEC_OP_SLE 0x13
+#define NVFX_VP_INST_VEC_OP_SNE 0x14
+#define NVFX_VP_INST_VEC_OP_STR 0x15
+#define NVFX_VP_INST_VEC_OP_SSG 0x16
+#define NVFX_VP_INST_VEC_OP_ARR 0x17
+#define NVFX_VP_INST_VEC_OP_ARA 0x18
+
+#define NV40_VP_INST_VEC_OP_TXL 0x19
+
+/* DWORD 3 */
+#define NVFX_VP_INST_LAST (1 << 0)
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ * 0: OPDEST
+ * 0: program end
+ * 1-6: destination register
+ * 7: destination register is fp16?? (use for outputs)
+ * 8: set condition code
+ * 9: writemask x
+ * 10: writemask y
+ * 11: writemask z
+ * 12: writemask w
+ * 13-16: source attribute register number (e.g. COL0)
+ * 17-20: texture unit number
+ * 21: expand value on texture operation (x -> 2x - 1)
+ * 22-23: precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = s0.8 fixed (nv40-only))
+ * 24-29: opcode
+ * 30: no destination
+ * 31: saturate
+ * 1 - SRC0
+ * 0-17: see common source fields
+ * 18: execute if condition code less
+ * 19: execute if condition code equal
+ * 20: execute if condition code greater
+ * 21-22: condition code swizzle x source component
+ * 23-24: condition code swizzle y source component
+ * 25-26: condition code swizzle z source component
+ * 27-28: condition code swizzle w source component
+ * 29: source 0 absolute
+ * 30: always 0 in renouveau tests
+ * 31: always 0 in renouveau tests
+ * 2 - SRC1
+ * 0-17: see common source fields
+ * 18: source 1 absolute
+ * 19-20: input precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = ???
+ * 21-27: always 0 in renouveau tests
+ * 28-30: scale (0 = 1x, 1 = 2x, 2 = 4x, 3 = 8x, 4 = ???, 5, = 1/2, 6 = 1/4, 7 = 1/8)
+ * 31: opcode is branch
+ * 3 - SRC2
+ * 0-17: see common source fields
+ * 18: source 2 absolute
+ * 19-29: address register displacement
+ * 30: use index register
+ * 31: disable perspective-correct interpolation?
+ *
+ * Common fields of 0, 1, 2 - SRC
+ * 0-1: source register type (0 = temp, 1 = input, 2 = immediate, 3 = ???)
+ * 2-7: source temp register index
+ * 8: source register is fp16??
+ * 9-10: source swizzle x source component
+ * 11-12: source swizzle y source component
+ * 13-14: source swizzle z source component
+ * 15-16: source swizzle w source component
+ * 17: negate
+
+ * There appears to be no special difference between result regs and temp regs.
+ * result.color == R0.xyzw
+ * result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ *
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords. As such instructions such as:
+ *
+ * ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ * TODO
+ *
+ * Non-native instructions:
+ * LIT
+ * LRP - MAD+MAD
+ * SUB - ADD, negate second source
+ * RSQ - LG2 + EX2
+ * POW - LG2 + MUL + EX2
+ * SCS - COS + SIN
+ * XPD
+ *
+ * NV40 Looping
+ * Loops appear to be fairly expensive on NV40 at least, the proprietary
+ * driver goes to a lot of effort to avoid using the native looping
+ * instructions. If the total number of *executed* instructions between
+ * REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ * The maximum loop count is 255.
+ *
+ */
+
+//== Opcode / Destination selection ==
+#define NVFX_FP_OP_PROGRAM_END (1 << 0)
+#define NVFX_FP_OP_OUT_REG_SHIFT 1
+#define NV30_FP_OP_OUT_REG_MASK (31 << 1) /* uncertain */
+#define NV40_FP_OP_OUT_REG_MASK (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NVFX_FP_OP_OUT_REG_HALF (1 << 7)
+#define NVFX_FP_OP_COND_WRITE_ENABLE (1 << 8)
+#define NVFX_FP_OP_OUTMASK_SHIFT 9
+#define NVFX_FP_OP_OUTMASK_MASK (0xF << 9)
+# define NVFX_FP_OP_OUT_X (1<<9)
+# define NVFX_FP_OP_OUT_Y (1<<10)
+# define NVFX_FP_OP_OUT_Z (1<<11)
+# define NVFX_FP_OP_OUT_W (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NVFX_FP_OP_INPUT_SRC_SHIFT 13
+#define NVFX_FP_OP_INPUT_SRC_MASK (15 << 13)
+# define NVFX_FP_OP_INPUT_SRC_POSITION 0x0
+# define NVFX_FP_OP_INPUT_SRC_COL0 0x1
+# define NVFX_FP_OP_INPUT_SRC_COL1 0x2
+# define NVFX_FP_OP_INPUT_SRC_FOGC 0x3
+# define NVFX_FP_OP_INPUT_SRC_TC0 0x4
+# define NVFX_FP_OP_INPUT_SRC_TC(n) (0x4 + n)
+# define NV40_FP_OP_INPUT_SRC_FACING 0xE
+#define NVFX_FP_OP_TEX_UNIT_SHIFT 17
+#define NVFX_FP_OP_TEX_UNIT_MASK (0xF << 17) /* guess */
+#define NVFX_FP_OP_PRECISION_SHIFT 22
+#define NVFX_FP_OP_PRECISION_MASK (3 << 22)
+# define NVFX_FP_PRECISION_FP32 0
+# define NVFX_FP_PRECISION_FP16 1
+# define NVFX_FP_PRECISION_FX12 2
+#define NVFX_FP_OP_OPCODE_SHIFT 24
+#define NVFX_FP_OP_OPCODE_MASK (0x3F << 24)
+/* NV30/NV40 fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_NOP 0x00
+#define NVFX_FP_OP_OPCODE_MOV 0x01
+#define NVFX_FP_OP_OPCODE_MUL 0x02
+#define NVFX_FP_OP_OPCODE_ADD 0x03
+#define NVFX_FP_OP_OPCODE_MAD 0x04
+#define NVFX_FP_OP_OPCODE_DP3 0x05
+#define NVFX_FP_OP_OPCODE_DP4 0x06
+#define NVFX_FP_OP_OPCODE_DST 0x07
+#define NVFX_FP_OP_OPCODE_MIN 0x08
+#define NVFX_FP_OP_OPCODE_MAX 0x09
+#define NVFX_FP_OP_OPCODE_SLT 0x0A
+#define NVFX_FP_OP_OPCODE_SGE 0x0B
+#define NVFX_FP_OP_OPCODE_SLE 0x0C
+#define NVFX_FP_OP_OPCODE_SGT 0x0D
+#define NVFX_FP_OP_OPCODE_SNE 0x0E
+#define NVFX_FP_OP_OPCODE_SEQ 0x0F
+#define NVFX_FP_OP_OPCODE_FRC 0x10
+#define NVFX_FP_OP_OPCODE_FLR 0x11
+#define NVFX_FP_OP_OPCODE_KIL 0x12
+#define NVFX_FP_OP_OPCODE_PK4B 0x13
+#define NVFX_FP_OP_OPCODE_UP4B 0x14
+#define NVFX_FP_OP_OPCODE_DDX 0x15 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_DDY 0x16 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_TEX 0x17
+#define NVFX_FP_OP_OPCODE_TXP 0x18
+#define NVFX_FP_OP_OPCODE_TXD 0x19
+#define NVFX_FP_OP_OPCODE_RCP 0x1A
+#define NVFX_FP_OP_OPCODE_EX2 0x1C
+#define NVFX_FP_OP_OPCODE_LG2 0x1D
+#define NVFX_FP_OP_OPCODE_STR 0x20
+#define NVFX_FP_OP_OPCODE_SFL 0x21
+#define NVFX_FP_OP_OPCODE_COS 0x22
+#define NVFX_FP_OP_OPCODE_SIN 0x23
+#define NVFX_FP_OP_OPCODE_PK2H 0x24
+#define NVFX_FP_OP_OPCODE_UP2H 0x25
+#define NVFX_FP_OP_OPCODE_PK4UB 0x27
+#define NVFX_FP_OP_OPCODE_UP4UB 0x28
+#define NVFX_FP_OP_OPCODE_PK2US 0x29
+#define NVFX_FP_OP_OPCODE_UP2US 0x2A
+#define NVFX_FP_OP_OPCODE_DP2A 0x2E
+#define NVFX_FP_OP_OPCODE_TXB 0x31
+#define NVFX_FP_OP_OPCODE_DIV 0x3A
+
+/* NV30 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
+#define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
+#define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
+#define NVFX_FP_OP_OPCODE_POW_NV30 0x26
+#define NVFX_FP_OP_OPCODE_RFL_NV30 0x36
+
+/* NV40 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F
+#define NVFX_FP_OP_OPCODE_LITEX2_NV40 0x3C
+
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#define NV40_FP_OP_BRA_OPCODE_BRK 0x0
+#define NV40_FP_OP_BRA_OPCODE_CAL 0x1
+#define NV40_FP_OP_BRA_OPCODE_IF 0x2
+#define NV40_FP_OP_BRA_OPCODE_LOOP 0x3
+#define NV40_FP_OP_BRA_OPCODE_REP 0x4
+#define NV40_FP_OP_BRA_OPCODE_RET 0x5
+
+#define NV40_FP_OP_OUT_NONE (1 << 30)
+#define NVFX_FP_OP_OUT_SAT (1 << 31)
+
+/* high order bits of SRC0 */
+#define NVFX_FP_OP_SRC0_ABS (1 << 29)
+#define NVFX_FP_OP_COND_SWZ_W_SHIFT 27
+#define NVFX_FP_OP_COND_SWZ_W_MASK (3 << 27)
+#define NVFX_FP_OP_COND_SWZ_Z_SHIFT 25
+#define NVFX_FP_OP_COND_SWZ_Z_MASK (3 << 25)
+#define NVFX_FP_OP_COND_SWZ_Y_SHIFT 23
+#define NVFX_FP_OP_COND_SWZ_Y_MASK (3 << 23)
+#define NVFX_FP_OP_COND_SWZ_X_SHIFT 21
+#define NVFX_FP_OP_COND_SWZ_X_MASK (3 << 21)
+#define NVFX_FP_OP_COND_SWZ_ALL_SHIFT 21
+#define NVFX_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21)
+#define NVFX_FP_OP_COND_SHIFT 18
+#define NVFX_FP_OP_COND_MASK (0x07 << 18)
+# define NVFX_FP_OP_COND_FL 0
+# define NVFX_FP_OP_COND_LT 1
+# define NVFX_FP_OP_COND_EQ 2
+# define NVFX_FP_OP_COND_LE 3
+# define NVFX_FP_OP_COND_GT 4
+# define NVFX_FP_OP_COND_NE 5
+# define NVFX_FP_OP_COND_GE 6
+# define NVFX_FP_OP_COND_TR 7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH (1<<31)
+#define NVFX_FP_OP_DST_SCALE_SHIFT 28
+#define NVFX_FP_OP_DST_SCALE_MASK (3 << 28)
+#define NVFX_FP_OP_DST_SCALE_1X 0
+#define NVFX_FP_OP_DST_SCALE_2X 1
+#define NVFX_FP_OP_DST_SCALE_4X 2
+#define NVFX_FP_OP_DST_SCALE_8X 3
+#define NVFX_FP_OP_DST_SCALE_INV_2X 5
+#define NVFX_FP_OP_DST_SCALE_INV_4X 6
+#define NVFX_FP_OP_DST_SCALE_INV_8X 7
+#define NVFX_FP_OP_SRC1_ABS (1 << 18)
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT 19
+#define NV40_FP_OP_LOOP_INCR_MASK (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT 10
+#define NV40_FP_OP_LOOP_INDEX_MASK (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT 2
+#define NV40_FP_OP_LOOP_COUNT_MASK (0xFF << 2)
+
+/* SRC1 IF: absolute offset in dwords */
+#define NV40_FP_OP_ELSE_OFFSET_SHIFT 0
+#define NV40_FP_OP_ELSE_OFFSET_MASK (0x7FFFFFFF << 0)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_SUB_OFFSET_SHIFT 0
+#define NV40_FP_OP_SUB_OFFSET_MASK (0x7FFFFFFF << 0)
+
+/* SRC1 REP
+ * It is unclear why there are 3 count values here, but they have
+ * always been filled with the same value in tests so far.
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT 2
+#define NV40_FP_OP_REP_COUNT1_MASK (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT 10
+#define NV40_FP_OP_REP_COUNT2_MASK (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT 19
+#define NV40_FP_OP_REP_COUNT3_MASK (0xFF << 19)
+
+/* SRC2 REP/IF: absolute offset in dwords */
+#define NV40_FP_OP_END_OFFSET_SHIFT 0
+#define NV40_FP_OP_END_OFFSET_MASK (0x7FFFFFFF << 0)
+
+/* high order bits of SRC2 */
+#define NVFX_FP_OP_INDEX_INPUT (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT 19
+#define NV40_FP_OP_ADDR_INDEX_MASK (0xF << 19)
+
+//== Register selection ==
+#define NVFX_FP_REG_TYPE_SHIFT 0
+#define NVFX_FP_REG_TYPE_MASK (3 << 0)
+# define NVFX_FP_REG_TYPE_TEMP 0
+# define NVFX_FP_REG_TYPE_INPUT 1
+# define NVFX_FP_REG_TYPE_CONST 2
+#define NVFX_FP_REG_SRC_SHIFT 2
+#define NV30_FP_REG_SRC_MASK (31 << 2)
+#define NV40_FP_REG_SRC_MASK (63 << 2)
+#define NVFX_FP_REG_SRC_HALF (1 << 8)
+#define NVFX_FP_REG_SWZ_ALL_SHIFT 9
+#define NVFX_FP_REG_SWZ_ALL_MASK (255 << 9)
+#define NVFX_FP_REG_SWZ_X_SHIFT 9
+#define NVFX_FP_REG_SWZ_X_MASK (3 << 9)
+#define NVFX_FP_REG_SWZ_Y_SHIFT 11
+#define NVFX_FP_REG_SWZ_Y_MASK (3 << 11)
+#define NVFX_FP_REG_SWZ_Z_SHIFT 13
+#define NVFX_FP_REG_SWZ_Z_MASK (3 << 13)
+#define NVFX_FP_REG_SWZ_W_SHIFT 15
+#define NVFX_FP_REG_SWZ_W_MASK (3 << 15)
+# define NVFX_FP_SWIZZLE_X 0
+# define NVFX_FP_SWIZZLE_Y 1
+# define NVFX_FP_SWIZZLE_Z 2
+# define NVFX_FP_SWIZZLE_W 3
+#define NVFX_FP_REG_NEGATE (1 << 17)
+
+#define NVFXSR_NONE 0
+#define NVFXSR_OUTPUT 1
+#define NVFXSR_INPUT 2
+#define NVFXSR_TEMP 3
+#define NVFXSR_CONST 5
+#define NVFXSR_IMM 6
+
+#define NVFX_COND_FL 0
+#define NVFX_COND_LT 1
+#define NVFX_COND_EQ 2
+#define NVFX_COND_LE 3
+#define NVFX_COND_GT 4
+#define NVFX_COND_NE 5
+#define NVFX_COND_GE 6
+#define NVFX_COND_TR 7
+
+/* Yes, these are ordered differently: VP and FP write masks use opposite bit orders. */
+
+#define NVFX_VP_MASK_X 8
+#define NVFX_VP_MASK_Y 4
+#define NVFX_VP_MASK_Z 2
+#define NVFX_VP_MASK_W 1
+#define NVFX_VP_MASK_ALL 0xf
+
+#define NVFX_FP_MASK_X 1
+#define NVFX_FP_MASK_Y 2
+#define NVFX_FP_MASK_Z 4
+#define NVFX_FP_MASK_W 8
+#define NVFX_FP_MASK_ALL 0xf
+
+#define NVFX_SWZ_X 0
+#define NVFX_SWZ_Y 1
+#define NVFX_SWZ_Z 2
+#define NVFX_SWZ_W 3
+
+#define swz(s,x,y,z,w) nvfx_src_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
+#define neg(s) nvfx_src_neg((s))
+#define abs(s) nvfx_src_abs((s))
+
+/* Reference to a shader register: an NVFXSR_* register file in 'type'
+ * plus an index within that file.
+ */
+struct nvfx_reg {
+   int8_t type;
+   int32_t index;
+};
+
+/* Source operand: a register plus the per-source modifiers supported by
+ * the hardware (indirect addressing, negate, absolute value, swizzle).
+ */
+struct nvfx_src {
+   struct nvfx_reg reg;
+
+   uint8_t indirect : 1;     /* use address-register indirection */
+   uint8_t indirect_reg : 1; /* selects address register 1 when set */
+   uint8_t indirect_swz : 2; /* component of the address register to use */
+   uint8_t negate : 1;
+   uint8_t abs : 1;
+   uint8_t swz[4];           /* per-component swizzle (NVFX_SWZ_*) */
+};
+
+/* One IR instruction, as built by the TGSI translators and consumed by
+ * the hardware encoders.
+ */
+struct nvfx_insn
+{
+   uint8_t op;
+   char scale;        /* result scale; presumably NVFX_FP_OP_DST_SCALE_* — TODO confirm */
+   int8_t unit;
+   uint8_t mask;      /* destination write mask */
+   uint8_t cc_swz[4]; /* swizzle applied to the condition-code register */
+
+   uint8_t sat : 1;          /* saturate the result */
+   uint8_t cc_update : 1;    /* update the condition-code register */
+   uint8_t cc_update_reg : 1;
+   uint8_t cc_test : 3;      /* NVFX_COND_* predicate for this insn */
+   uint8_t cc_test_reg : 1;
+
+   struct nvfx_reg dst;
+   struct nvfx_src src[3];
+};
+
+/* Build an nvfx_insn with the given fields and default condition-code
+ * state: no cc update, always-true test, identity cc swizzle.
+ */
+static INLINE struct nvfx_insn
+nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2)
+{
+   struct nvfx_insn insn = {
+      .op = op,
+      .scale = 0,
+      .unit = unit,
+      .sat = sat,
+      .mask = mask,
+      .cc_update = 0,
+      .cc_update_reg = 0,
+      .cc_test = NVFX_COND_TR,
+      .cc_test_reg = 0,
+      .cc_swz = { 0, 1, 2, 3 },
+      .dst = dst,
+      .src = {s0, s1, s2}
+   };
+   return insn;
+}
+
+/* Construct a register reference for the given file and index. */
+static INLINE struct nvfx_reg
+nvfx_reg(int type, int index)
+{
+   struct nvfx_reg temp = {
+      .type = type,
+      .index = index,
+   };
+   return temp;
+}
+
+/* Wrap a register as an unmodified source operand: identity swizzle,
+ * no negate/abs, no indirection.
+ */
+static INLINE struct nvfx_src
+nvfx_src(struct nvfx_reg reg)
+{
+   struct nvfx_src temp = {
+      .reg = reg,
+      .abs = 0,
+      .negate = 0,
+      .swz = { 0, 1, 2, 3 },
+      .indirect = 0,
+   };
+   return temp;
+}
+
+/* Compose a swizzle onto 'src': result component i selects component
+ * (x,y,z,w)[i] of the source's existing swizzle, so repeated swizzles
+ * combine correctly.
+ */
+static INLINE struct nvfx_src
+nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w)
+{
+   struct nvfx_src dst = src;
+
+   dst.swz[NVFX_SWZ_X] = src.swz[x];
+   dst.swz[NVFX_SWZ_Y] = src.swz[y];
+   dst.swz[NVFX_SWZ_Z] = src.swz[z];
+   dst.swz[NVFX_SWZ_W] = src.swz[w];
+   return dst;
+}
+
+/* Toggle the negate modifier (so double negation cancels out). */
+static INLINE struct nvfx_src
+nvfx_src_neg(struct nvfx_src src)
+{
+   src.negate = !src.negate;
+   return src;
+}
+
+/* Return 'src' with the absolute-value modifier set. */
+static INLINE struct nvfx_src
+nvfx_src_abs(struct nvfx_src src)
+{
+   src.abs = 1;
+   return src;
+}
+
+/* A code location whose encoded target (branch destination or constant
+ * slot) must be patched once the final layout is known.
+ */
+struct nvfx_relocation {
+   unsigned location;
+   unsigned target;
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
new file mode 100644
index 00000000000..3ae51ef9e82
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -0,0 +1,1133 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_linkage.h"
+#include "util/u_debug.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "draw/draw_context.h"
+
+#include "nv30/nv30-40_3d.xml.h"
+#include "nv30/nv30_context.h"
+#include "nv30/nv30_resource.h"
+
+/* TODO (at least...):
+ * 1. Indexed consts + ARL
+ * 2. NV_vp11, NV_vp2, NV_vp3 features
+ * - extra arith opcodes
+ * - branching
+ * - texture sampling
+ * - indexed attribs
+ * - indexed results
+ * 3. bugs
+ */
+
+#include "nv30/nv30_vertprog.h"
+#include "nv30/nv40_vertprog.h"
+
+/* Branch targets for the innermost TGSI loop: where BRK and CONT jump. */
+struct nvfx_loop_entry {
+   unsigned brk_target;
+   unsigned cont_target;
+};
+
+/* Translation context used while turning a TGSI vertex shader into an
+ * nv30/nv40 vertex program.
+ */
+struct nvfx_vpc {
+   struct nv30_context* nv30;
+   struct pipe_shader_state pipe;
+   struct nv30_vertprog *vp;
+   struct tgsi_shader_info* info;
+
+   struct nv30_vertprog_exec *vpi; /* instruction currently being emitted */
+
+   unsigned r_temps;         /* bitmask of allocated hw temp registers */
+   unsigned r_temps_discard; /* temps to release after the current TGSI insn */
+   struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* per-output hw regs */
+   struct nvfx_reg *r_address; /* TGSI ADDRESS file -> hw regs */
+   struct nvfx_reg *r_temp;    /* TGSI TEMPORARY file -> hw temps */
+   struct nvfx_reg *r_const;   /* TGSI CONSTANT file -> const slots */
+   struct nvfx_reg r_0_1;      /* constant (0, 1, 0, 0) for saturate emulation */
+
+   struct nvfx_reg *imm; /* TGSI IMMEDIATE file -> const slots */
+   unsigned nr_imm;
+
+   int hpos_idx; /* output index of TGSI_SEMANTIC_POSITION */
+   int cvtx_idx; /* output index of TGSI_SEMANTIC_CLIPVERTEX */
+
+   struct util_dynarray label_relocs; /* branches awaiting target patching */
+   struct util_dynarray loop_stack;   /* stack of nvfx_loop_entry */
+};
+
+/* Allocate a free hardware temp register.  It is marked for release by
+ * release_temps() unless the caller clears r_temps_discard first.
+ */
+static struct nvfx_reg
+temp(struct nvfx_vpc *vpc)
+{
+   int idx = ffs(~vpc->r_temps) - 1;
+
+   if (idx < 0) {
+      NOUVEAU_ERR("out of temps!!\n");
+      assert(0);
+      return nvfx_reg(NVFXSR_TEMP, 0);
+   }
+
+   vpc->r_temps |= (1 << idx);
+   vpc->r_temps_discard |= (1 << idx);
+   return nvfx_reg(NVFXSR_TEMP, idx);
+}
+
+/* Free all per-instruction scratch temps allocated since the last call. */
+static inline void
+release_temps(struct nvfx_vpc *vpc)
+{
+   vpc->r_temps &= ~vpc->r_temps_discard;
+   vpc->r_temps_discard = 0;
+}
+
+/* Allocate (or find) a vertex program constant slot.  'pipe' >= 0 binds
+ * the slot to that index of the constant buffer, reusing an existing
+ * slot when present; pipe == -1 creates an anonymous immediate holding
+ * the given xyzw value.
+ */
+static struct nvfx_reg
+constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+   struct nv30_vertprog *vp = vpc->vp;
+   struct nv30_vertprog_data *vpd;
+   int idx;
+
+   if (pipe >= 0) {
+      for (idx = 0; idx < vp->nr_consts; idx++) {
+         if (vp->consts[idx].index == pipe)
+            return nvfx_reg(NVFXSR_CONST, idx);
+      }
+   }
+
+   /* NOTE(review): realloc result is not checked for failure */
+   idx = vp->nr_consts++;
+   vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);
+   vpd = &vp->consts[idx];
+
+   vpd->index = pipe;
+   vpd->value[0] = x;
+   vpd->value[1] = y;
+   vpd->value[2] = z;
+   vpd->value[3] = w;
+   return nvfx_reg(NVFXSR_CONST, idx);
+}
+
+/* Build an nvfx_insn for execution slot 't' (VEC or SCA) with opcode 'o';
+ * the slot is packed into bit 7 of the op field (see nvfx_vp_emit).
+ */
+#define arith(s,t,o,d,m,s0,s1,s2) \
+   nvfx_insn((s), (NVFX_VP_INST_SLOT_##t << 7) | NVFX_VP_INST_##t##_OP_##o, -1, (d), (m), (s0), (s1), (s2))
+
+/* Encode source operand 'src' into source slot 'pos' (0-2) of the
+ * 4-dword hardware instruction 'hw'.  The packed source word 'sr'
+ * straddles dword boundaries for slots 0 and 2.
+ */
+static void
+emit_src(struct nv30_context *nv30, struct nvfx_vpc *vpc, uint32_t *hw,
+         int pos, struct nvfx_src src)
+{
+   struct nv30_vertprog *vp = vpc->vp;
+   uint32_t sr = 0;
+   struct nvfx_relocation reloc;
+
+   switch (src.reg.type) {
+   case NVFXSR_TEMP:
+      sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
+      sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+      break;
+   case NVFXSR_INPUT:
+      sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+             NVFX_VP(SRC_REG_TYPE_SHIFT));
+      vp->ir |= (1 << src.reg.index);
+      hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT));
+      break;
+   case NVFXSR_CONST:
+      sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
+             NVFX_VP(SRC_REG_TYPE_SHIFT));
+      /* small indices go through the const relocation table so the
+       * final slot can be fixed up later; others are encoded directly
+       */
+      if (src.reg.index < 256 && src.reg.index >= -256) {
+         reloc.location = vp->nr_insns - 1;
+         reloc.target = src.reg.index;
+         util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc);
+      } else {
+         hw[1] |= (src.reg.index << NVFX_VP(INST_CONST_SRC_SHIFT)) &
+               NVFX_VP(INST_CONST_SRC_MASK);
+      }
+      break;
+   case NVFXSR_NONE:
+      sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+             NVFX_VP(SRC_REG_TYPE_SHIFT));
+      break;
+   default:
+      assert(0);
+   }
+
+   if (src.negate)
+      sr |= NVFX_VP(SRC_NEGATE);
+
+   if (src.abs)
+      hw[0] |= (1 << (21 + pos));
+
+   sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
+          (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
+          (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
+          (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));
+
+   if(src.indirect) {
+      if(src.reg.type == NVFXSR_CONST)
+         hw[3] |= NVFX_VP(INST_INDEX_CONST);
+      else if(src.reg.type == NVFXSR_INPUT)
+         hw[0] |= NVFX_VP(INST_INDEX_INPUT);
+      else
+         assert(0);
+
+      if(src.indirect_reg)
+         hw[0] |= NVFX_VP(INST_ADDR_REG_SELECT_1);
+      hw[0] |= src.indirect_swz << NVFX_VP(INST_ADDR_SWZ_SHIFT);
+   }
+
+   switch (pos) {
+   case 0:
+      hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
+                NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
+      hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
+            NVFX_VP(INST_SRC0L_SHIFT);
+      break;
+   case 1:
+      hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
+      break;
+   case 2:
+      hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
+                NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
+      hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
+            NVFX_VP(INST_SRC2L_SHIFT);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+/* Encode the destination register into 'hw' for execution slot 'slot'
+ * (0 = vector unit, 1 = scalar unit).  For hardware outputs on nv4x,
+ * clip-plane outputs are remapped onto FOGC/PSZ components and bits in
+ * vp->or are set (NOTE(review): vp->or appears to be a mask of written
+ * hw outputs — confirm).
+ */
+static void
+emit_dst(struct nv30_context *nv30, struct nvfx_vpc *vpc, uint32_t *hw,
+         int slot, struct nvfx_reg dst)
+{
+   struct nv30_vertprog *vp = vpc->vp;
+
+   switch (dst.type) {
+   case NVFXSR_NONE:
+      /* no destination: point the temp id at the all-ones "null" temp */
+      if(!nv30->is_nv4x)
+         hw[0] |= NV30_VP_INST_DEST_TEMP_ID_MASK;
+      else {
+         hw[3] |= NV40_VP_INST_DEST_MASK;
+         if (slot == 0)
+            hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+         else
+            hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+      }
+      break;
+   case NVFXSR_TEMP:
+      if(!nv30->is_nv4x)
+         hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+      else {
+         hw[3] |= NV40_VP_INST_DEST_MASK;
+         if (slot == 0)
+            hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+         else
+            hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+      }
+      break;
+   case NVFXSR_OUTPUT:
+      /* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
+      if(nv30->is_nv4x) {
+         switch (dst.index) {
+         case NV30_VP_INST_DEST_CLP(0):
+            dst.index = NVFX_VP(INST_DEST_FOGC);
+            vp->or |= (1 << 6);
+            break;
+         case NV30_VP_INST_DEST_CLP(1):
+            dst.index = NVFX_VP(INST_DEST_FOGC);
+            vp->or |= (1 << 7);
+            break;
+         case NV30_VP_INST_DEST_CLP(2):
+            dst.index = NVFX_VP(INST_DEST_FOGC);
+            vp->or |= (1 << 8);
+            break;
+         case NV30_VP_INST_DEST_CLP(3):
+            dst.index = NVFX_VP(INST_DEST_PSZ);
+            vp->or |= (1 << 9);
+            break;
+         case NV30_VP_INST_DEST_CLP(4):
+            dst.index = NVFX_VP(INST_DEST_PSZ);
+            vp->or |= (1 << 10);
+            break;
+         case NV30_VP_INST_DEST_CLP(5):
+            dst.index = NVFX_VP(INST_DEST_PSZ);
+            vp->or |= (1 << 11);
+            break;
+         case NV40_VP_INST_DEST_COL0: vp->or |= (1 << 0); break;
+         case NV40_VP_INST_DEST_COL1: vp->or |= (1 << 1); break;
+         case NV40_VP_INST_DEST_BFC0: vp->or |= (1 << 2); break;
+         case NV40_VP_INST_DEST_BFC1: vp->or |= (1 << 3); break;
+         case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+         case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break;
+         }
+      }
+
+      if(!nv30->is_nv4x) {
+         hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+         hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK;
+
+         /*XXX: no way this is entirely correct, someone needs to
+          * figure out what exactly it is.
+          */
+         hw[3] |= 0x800;
+      } else {
+         hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+         if (slot == 0) {
+            hw[0] |= NV40_VP_INST_VEC_RESULT;
+            hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK;
+         } else {
+            hw[3] |= NV40_VP_INST_SCA_RESULT;
+            hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+/* Append 'insn' to the vertex program, encoding condition-code state,
+ * opcode (into the vector or scalar slot), write mask, destination and
+ * the three sources into the 4-dword hardware instruction format.
+ */
+static void
+nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn)
+{
+   struct nv30_context *nv30 = vpc->nv30;
+   struct nv30_vertprog *vp = vpc->vp;
+   /* insn.op packs the execution slot in bit 7, opcode in bits 0-6
+    * (see the arith() macro)
+    */
+   unsigned slot = insn.op >> 7;
+   unsigned op = insn.op & 0x7f;
+   uint32_t *hw;
+
+   /* NOTE(review): realloc result is not checked for failure */
+   vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));
+   vpc->vpi = &vp->insns[vp->nr_insns - 1];
+   memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+
+   hw = vpc->vpi->data;
+
+   if (insn.cc_test != NVFX_COND_TR)
+      hw[0] |= NVFX_VP(INST_COND_TEST_ENABLE);
+   hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT));
+   hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+             (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+             (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+             (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT)));
+   if(insn.cc_update)
+      hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE);
+
+   if(insn.sat) {
+      /* only nv4x has a hardware saturate bit; nv30 callers must
+       * emulate saturation (see nvfx_vertprog_parse_instruction)
+       */
+      assert(nv30->use_nv4x);
+      if(nv30->use_nv4x)
+         hw[0] |= NV40_VP_INST_SATURATE;
+   }
+
+   if(!nv30->is_nv4x) {
+      if(slot == 0)
+         hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+      else {
+         hw[0] |= ((op >> 4) << NV30_VP_INST_SCA_OPCODEH_SHIFT);
+         hw[1] |= ((op & 0xf) << NV30_VP_INST_SCA_OPCODEL_SHIFT);
+      }
+//      hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
+//      hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
+
+      if (insn.dst.type == NVFXSR_OUTPUT) {
+         if (slot)
+            hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+         else
+            hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+      } else {
+         if (slot)
+            hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+         else
+            hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+      }
+   } else {
+      if (slot == 0) {
+         hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+         hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+         hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+      } else {
+         hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+         hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK ;
+         hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+      }
+   }
+
+   emit_dst(nv30, vpc, hw, slot, insn.dst);
+   emit_src(nv30, vpc, hw, 0, insn.src[0]);
+   emit_src(nv30, vpc, hw, 1, insn.src[1]);
+   emit_src(nv30, vpc, hw, 2, insn.src[2]);
+
+//   if(insn.src[0].indirect || op == NVFX_VP_INST_VEC_OP_ARL)
+//      hw[3] |= NV40_VP_INST_SCA_RESULT;
+}
+
+/* Translate a TGSI source register into an nvfx_src.  On unsupported
+ * files/indirection, reg.type is set to -1 as an error marker that the
+ * caller checks (see nvfx_vertprog_parse_instruction).
+ */
+static inline struct nvfx_src
+tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+   struct nvfx_src src;
+
+   switch (fsrc->Register.File) {
+   case TGSI_FILE_INPUT:
+      src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index);
+      break;
+   case TGSI_FILE_CONSTANT:
+      if(fsrc->Register.Indirect) {
+         /* indirect access: use const file slot 0 as the base */
+         src.reg = vpc->r_const[0];
+         src.reg.index = fsrc->Register.Index;
+      } else {
+         src.reg = vpc->r_const[fsrc->Register.Index];
+      }
+      break;
+   case TGSI_FILE_IMMEDIATE:
+      src.reg = vpc->imm[fsrc->Register.Index];
+      break;
+   case TGSI_FILE_TEMPORARY:
+      src.reg = vpc->r_temp[fsrc->Register.Index];
+      break;
+   default:
+      NOUVEAU_ERR("bad src file\n");
+      src.reg.index = 0;
+      src.reg.type = -1;
+      break;
+   }
+
+   src.abs = fsrc->Register.Absolute;
+   src.negate = fsrc->Register.Negate;
+   src.swz[0] = fsrc->Register.SwizzleX;
+   src.swz[1] = fsrc->Register.SwizzleY;
+   src.swz[2] = fsrc->Register.SwizzleZ;
+   src.swz[3] = fsrc->Register.SwizzleW;
+   src.indirect = 0;
+   src.indirect_reg = 0;
+   src.indirect_swz = 0;
+
+   if(fsrc->Register.Indirect) {
+      /* only ADDRESS-indexed constants and inputs can be indirect */
+      if(fsrc->Indirect.File == TGSI_FILE_ADDRESS &&
+         (fsrc->Register.File == TGSI_FILE_CONSTANT ||
+          fsrc->Register.File == TGSI_FILE_INPUT)) {
+         src.indirect = 1;
+         src.indirect_reg = fsrc->Indirect.Index;
+         src.indirect_swz = fsrc->Indirect.Swizzle;
+      } else {
+         src.reg.index = 0;
+         src.reg.type = -1;
+      }
+   }
+
+   return src;
+}
+
+/* Translate a TGSI destination register into the pre-allocated hardware
+ * register for that file/index.
+ */
+static INLINE struct nvfx_reg
+tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+   struct nvfx_reg dst;
+
+   switch (fdst->Register.File) {
+   case TGSI_FILE_NULL:
+      dst = nvfx_reg(NVFXSR_NONE, 0);
+      break;
+   case TGSI_FILE_OUTPUT:
+      dst = vpc->r_result[fdst->Register.Index];
+      break;
+   case TGSI_FILE_TEMPORARY:
+      dst = vpc->r_temp[fdst->Register.Index];
+      break;
+   case TGSI_FILE_ADDRESS:
+      dst = vpc->r_address[fdst->Register.Index];
+      break;
+   default:
+      NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File);
+      dst.index = 0;
+      dst.type = 0;
+      break;
+   }
+
+   return dst;
+}
+
+/* Convert a TGSI write mask to NVFX vertex-program mask bits (the two
+ * encodings use opposite bit orders).
+ */
+static inline int
+tgsi_mask(uint tgsi)
+{
+   int mask = 0;
+
+   if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_VP_MASK_X;
+   if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_VP_MASK_Y;
+   if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_VP_MASK_Z;
+   if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_VP_MASK_W;
+   return mask;
+}
+
+/* Translate one TGSI instruction at position 'idx' into one or more
+ * hardware instructions.  Returns FALSE for unsupported opcodes or
+ * operands.
+ */
+static boolean
+nvfx_vertprog_parse_instruction(struct nv30_context *nv30, struct nvfx_vpc *vpc,
+                                unsigned idx, const struct tgsi_full_instruction *finst)
+{
+   struct nvfx_src src[3], tmp;
+   struct nvfx_reg dst;
+   struct nvfx_reg final_dst;
+   struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+   struct nvfx_insn insn;
+   struct nvfx_relocation reloc;
+   struct nvfx_loop_entry loop;
+   boolean sat = FALSE;
+   int mask;
+   int ai = -1, ci = -1, ii = -1;
+   int i;
+   unsigned sub_depth = 0;
+
+   /* temporaries translate directly; handle them first */
+   for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+      const struct tgsi_full_src_register *fsrc;
+
+      fsrc = &finst->Src[i];
+      if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+         src[i] = tgsi_src(vpc, fsrc);
+      }
+   }
+
+   /* only one distinct input (ai) and one distinct const/immediate
+    * (ci/ii) can be read per hw instruction; copy any extras through
+    * temp registers first
+    */
+   for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+      const struct tgsi_full_src_register *fsrc;
+
+      fsrc = &finst->Src[i];
+
+      switch (fsrc->Register.File) {
+      case TGSI_FILE_INPUT:
+         if (ai == -1 || ai == fsrc->Register.Index) {
+            ai = fsrc->Register.Index;
+            src[i] = tgsi_src(vpc, fsrc);
+         } else {
+            src[i] = nvfx_src(temp(vpc));
+            nvfx_vp_emit(vpc, arith(0, VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL,
+                         tgsi_src(vpc, fsrc), none, none));
+         }
+         break;
+      case TGSI_FILE_CONSTANT:
+         if ((ci == -1 && ii == -1) ||
+             ci == fsrc->Register.Index) {
+            ci = fsrc->Register.Index;
+            src[i] = tgsi_src(vpc, fsrc);
+         } else {
+            src[i] = nvfx_src(temp(vpc));
+            nvfx_vp_emit(vpc, arith(0, VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL,
+                         tgsi_src(vpc, fsrc), none, none));
+         }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         if ((ci == -1 && ii == -1) ||
+             ii == fsrc->Register.Index) {
+            ii = fsrc->Register.Index;
+            src[i] = tgsi_src(vpc, fsrc);
+         } else {
+            src[i] = nvfx_src(temp(vpc));
+            nvfx_vp_emit(vpc, arith(0, VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL,
+                         tgsi_src(vpc, fsrc), none, none));
+         }
+         break;
+      case TGSI_FILE_TEMPORARY:
+         /* handled above */
+         break;
+      default:
+         NOUVEAU_ERR("bad src file\n");
+         return FALSE;
+      }
+   }
+
+   /* tgsi_src flags unsupported operands with reg.type == -1 */
+   for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+      if(src[i].reg.type < 0)
+         return FALSE;
+   }
+
+   if(finst->Dst[0].Register.File == TGSI_FILE_ADDRESS &&
+      finst->Instruction.Opcode != TGSI_OPCODE_ARL)
+      return FALSE;
+
+   final_dst = dst = tgsi_dst(vpc, &finst->Dst[0]);
+   mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+   if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) {
+      assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL);
+      if (nv30->use_nv4x)
+         sat = TRUE;
+      else
+      /* nv30 has no saturate bit: write to a temp, then clamp into
+       * final_dst after the switch below
+       */
+      if(dst.type != NVFXSR_TEMP)
+         dst = temp(vpc);
+   }
+
+   switch (finst->Instruction.Opcode) {
+   case TGSI_OPCODE_ABS:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, abs(src[0]), none, none));
+      break;
+   case TGSI_OPCODE_ADD:
+      nvfx_vp_emit(vpc, arith(sat, VEC, ADD, dst, mask, src[0], none, src[1]));
+      break;
+   case TGSI_OPCODE_ARL:
+      nvfx_vp_emit(vpc, arith(0, VEC, ARL, dst, mask, src[0], none, none));
+      break;
+   case TGSI_OPCODE_CEIL:
+      /* ceil(x) = -floor(-x) */
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, FLR, tmp.reg, mask, neg(src[0]), none, none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, neg(tmp), none, none));
+      break;
+   case TGSI_OPCODE_CMP:
+      /* set cc from src0, then select src1/src2 per component */
+      insn = arith(0, VEC, MOV, none.reg, mask, src[0], none, none);
+      insn.cc_update = 1;
+      nvfx_vp_emit(vpc, insn);
+
+      insn = arith(sat, VEC, MOV, dst, mask, src[2], none, none);
+      insn.cc_test = NVFX_COND_GE;
+      nvfx_vp_emit(vpc, insn);
+
+      insn = arith(sat, VEC, MOV, dst, mask, src[1], none, none);
+      insn.cc_test = NVFX_COND_LT;
+      nvfx_vp_emit(vpc, insn);
+      break;
+   case TGSI_OPCODE_COS:
+      nvfx_vp_emit(vpc, arith(sat, SCA, COS, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_DP2:
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, ADD, dst, mask, swz(tmp, X, X, X, X), none, swz(tmp, Y, Y, Y, Y)));
+      break;
+   case TGSI_OPCODE_DP3:
+      nvfx_vp_emit(vpc, arith(sat, VEC, DP3, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_DP4:
+      nvfx_vp_emit(vpc, arith(sat, VEC, DP4, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_DPH:
+      nvfx_vp_emit(vpc, arith(sat, VEC, DPH, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_DST:
+      nvfx_vp_emit(vpc, arith(sat, VEC, DST, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_EX2:
+      nvfx_vp_emit(vpc, arith(sat, SCA, EX2, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_EXP:
+      nvfx_vp_emit(vpc, arith(sat, SCA, EXP, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_FLR:
+      nvfx_vp_emit(vpc, arith(sat, VEC, FLR, dst, mask, src[0], none, none));
+      break;
+   case TGSI_OPCODE_FRC:
+      nvfx_vp_emit(vpc, arith(sat, VEC, FRC, dst, mask, src[0], none, none));
+      break;
+   case TGSI_OPCODE_LG2:
+      nvfx_vp_emit(vpc, arith(sat, SCA, LG2, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_LIT:
+      nvfx_vp_emit(vpc, arith(sat, SCA, LIT, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_LOG:
+      nvfx_vp_emit(vpc, arith(sat, SCA, LOG, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_LRP:
+      /* lrp(a,b,c) = a*b + (1-a)*c = a*b + (c - a*c) */
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MAD, dst, mask, src[0], src[1], tmp));
+      break;
+   case TGSI_OPCODE_MAD:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MAD, dst, mask, src[0], src[1], src[2]));
+      break;
+   case TGSI_OPCODE_MAX:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MAX, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_MIN:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MIN, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_MOV:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, src[0], none, none));
+      break;
+   case TGSI_OPCODE_MUL:
+      nvfx_vp_emit(vpc, arith(sat, VEC, MUL, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_NOP:
+      break;
+   case TGSI_OPCODE_POW:
+      /* pow(x,y) = 2^(y * log2(x)), scalar unit only handles .x */
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X)));
+      nvfx_vp_emit(vpc, arith(0, VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
+      nvfx_vp_emit(vpc, arith(sat, SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X)));
+      break;
+   case TGSI_OPCODE_RCP:
+      nvfx_vp_emit(vpc, arith(sat, SCA, RCP, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_RSQ:
+      nvfx_vp_emit(vpc, arith(sat, SCA, RSQ, dst, mask, none, none, abs(src[0])));
+      break;
+   case TGSI_OPCODE_SEQ:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SEQ, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SFL:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SFL, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SGE:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SGE, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SGT:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SGT, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SIN:
+      nvfx_vp_emit(vpc, arith(sat, SCA, SIN, dst, mask, none, none, src[0]));
+      break;
+   case TGSI_OPCODE_SLE:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SLE, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SLT:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SLT, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SNE:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SNE, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SSG:
+      nvfx_vp_emit(vpc, arith(sat, VEC, SSG, dst, mask, src[0], none, none));
+      break;
+   case TGSI_OPCODE_STR:
+      nvfx_vp_emit(vpc, arith(sat, VEC, STR, dst, mask, src[0], src[1], none));
+      break;
+   case TGSI_OPCODE_SUB:
+      nvfx_vp_emit(vpc, arith(sat, VEC, ADD, dst, mask, src[0], none, neg(src[1])));
+      break;
+   case TGSI_OPCODE_TRUNC:
+      /* trunc(x) = sign(x) * floor(|x|), using cc to restore the sign */
+      tmp = nvfx_src(temp(vpc));
+      insn = arith(0, VEC, MOV, none.reg, mask, src[0], none, none);
+      insn.cc_update = 1;
+      nvfx_vp_emit(vpc, insn);
+
+      nvfx_vp_emit(vpc, arith(0, VEC, FLR, tmp.reg, mask, abs(src[0]), none, none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, tmp, none, none));
+
+      insn = arith(sat, VEC, MOV, dst, mask, neg(tmp), none, none);
+      insn.cc_test = NVFX_COND_LT;
+      nvfx_vp_emit(vpc, insn);
+      break;
+   case TGSI_OPCODE_XPD:
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp)));
+      break;
+   case TGSI_OPCODE_IF:
+      /* set cc from the condition, then branch over the THEN block if
+       * it is zero; the target is patched later via label_relocs
+       */
+      insn = arith(0, VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none);
+      insn.cc_update = 1;
+      nvfx_vp_emit(vpc, insn);
+
+      reloc.location = vpc->vp->nr_insns;
+      reloc.target = finst->Label.Label + 1;
+      util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+      insn = arith(0, SCA, BRA, none.reg, 0, none, none, none);
+      insn.cc_test = NVFX_COND_EQ;
+      insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0;
+      nvfx_vp_emit(vpc, insn);
+      break;
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_BRA:
+   case TGSI_OPCODE_CAL:
+      reloc.location = vpc->vp->nr_insns;
+      reloc.target = finst->Label.Label;
+      util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+      if(finst->Instruction.Opcode == TGSI_OPCODE_CAL)
+         insn = arith(0, SCA, CAL, none.reg, 0, none, none, none);
+      else
+         insn = arith(0, SCA, BRA, none.reg, 0, none, none, none);
+      nvfx_vp_emit(vpc, insn);
+      break;
+   case TGSI_OPCODE_RET:
+      /* a top-level RET with user clip planes enabled must instead jump
+       * to the epilogue after the last TGSI instruction
+       */
+      if(sub_depth || !vpc->vp->enabled_ucps) {
+         tmp = none;
+         tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0;
+         nvfx_vp_emit(vpc, arith(0, SCA, RET, none.reg, 0, none, none, tmp));
+      } else {
+         reloc.location = vpc->vp->nr_insns;
+         reloc.target = vpc->info->num_instructions;
+         util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+         nvfx_vp_emit(vpc, arith(0, SCA, BRA, none.reg, 0, none, none, none));
+      }
+      break;
+   case TGSI_OPCODE_BGNSUB:
+      ++sub_depth;
+      break;
+   case TGSI_OPCODE_ENDSUB:
+      --sub_depth;
+      break;
+   case TGSI_OPCODE_ENDIF:
+      /* nothing to do here */
+      break;
+   case TGSI_OPCODE_BGNLOOP:
+      loop.cont_target = idx;
+      loop.brk_target = finst->Label.Label + 1;
+      util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop);
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry);
+
+      reloc.location = vpc->vp->nr_insns;
+      reloc.target = loop.cont_target;
+      util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+      nvfx_vp_emit(vpc, arith(0, SCA, BRA, none.reg, 0, none, none, none));
+      break;
+   case TGSI_OPCODE_CONT:
+      loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+      reloc.location = vpc->vp->nr_insns;
+      reloc.target = loop.cont_target;
+      util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+      nvfx_vp_emit(vpc, arith(0, SCA, BRA, none.reg, 0, none, none, none));
+      break;
+   case TGSI_OPCODE_BRK:
+      loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry);
+
+      reloc.location = vpc->vp->nr_insns;
+      reloc.target = loop.brk_target;
+      util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+
+      nvfx_vp_emit(vpc, arith(0, SCA, BRA, none.reg, 0, none, none, none));
+      break;
+   case TGSI_OPCODE_END:
+      assert(!sub_depth);
+      if(vpc->vp->enabled_ucps) {
+         /* jump to the clip-plane epilogue instead of ending here */
+         if(idx != (vpc->info->num_instructions - 1)) {
+            reloc.location = vpc->vp->nr_insns;
+            reloc.target = vpc->info->num_instructions;
+            util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc);
+            nvfx_vp_emit(vpc, arith(0, SCA, BRA, none.reg, 0, none, none, none));
+         }
+      } else {
+         if(vpc->vp->nr_insns)
+            vpc->vp->insns[vpc->vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+         nvfx_vp_emit(vpc, arith(0, VEC, NOP, none.reg, 0, none, none, none));
+         vpc->vp->insns[vpc->vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+      }
+      break;
+   default:
+      NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+      return FALSE;
+   }
+
+   /* nv30 saturate emulation: clamp the temp result into final_dst
+    * against the (0, 1) constant allocated on demand in r_0_1
+    */
+   if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE && !nv30->use_nv4x) {
+      if (!vpc->r_0_1.type)
+         vpc->r_0_1 = constant(vpc, -1, 0, 1, 0, 0);
+      nvfx_vp_emit(vpc, arith(0, VEC, MAX, dst, mask, nvfx_src(dst), swz(nvfx_src(vpc->r_0_1), X, X, X, X), none));
+      nvfx_vp_emit(vpc, arith(0, VEC, MIN, final_dst, mask, nvfx_src(dst), swz(nvfx_src(vpc->r_0_1), Y, Y, Y, Y), none));
+   }
+
+   release_temps(vpc);
+   return TRUE;
+}
+
+/* Record the hardware output slot for a TGSI output declaration in
+ * vpc->r_result.  CLIPVERTEX gets a persistent temp register instead of
+ * a hw output; TEXCOORD/GENERIC outputs with no linked texcoord slot
+ * are silently dropped.
+ */
+static boolean
+nvfx_vertprog_parse_decl_output(struct nv30_context *nv30, struct nvfx_vpc *vpc,
+                                const struct tgsi_full_declaration *fdec)
+{
+   unsigned num_texcoords = nv30->is_nv4x ? 10 : 8;
+   unsigned idx = fdec->Range.First;
+   unsigned semantic_index = fdec->Semantic.Index;
+   int hw = 0, i;
+
+   switch (fdec->Semantic.Name) {
+   case TGSI_SEMANTIC_POSITION:
+      hw = NVFX_VP(INST_DEST_POS);
+      vpc->hpos_idx = idx;
+      break;
+   case TGSI_SEMANTIC_CLIPVERTEX:
+      /* keep the temp allocated for the whole program by clearing the
+       * per-instruction discard mask
+       */
+      vpc->r_result[idx] = temp(vpc);
+      vpc->r_temps_discard = 0;
+      vpc->cvtx_idx = idx;
+      return TRUE;
+   case TGSI_SEMANTIC_COLOR:
+      if (fdec->Semantic.Index == 0) {
+         hw = NVFX_VP(INST_DEST_COL0);
+      } else
+      if (fdec->Semantic.Index == 1) {
+         hw = NVFX_VP(INST_DEST_COL1);
+      } else {
+         NOUVEAU_ERR("bad colour semantic index\n");
+         return FALSE;
+      }
+      break;
+   case TGSI_SEMANTIC_BCOLOR:
+      if (fdec->Semantic.Index == 0) {
+         hw = NVFX_VP(INST_DEST_BFC0);
+      } else
+      if (fdec->Semantic.Index == 1) {
+         hw = NVFX_VP(INST_DEST_BFC1);
+      } else {
+         NOUVEAU_ERR("bad bcolour semantic index\n");
+         return FALSE;
+      }
+      break;
+   case TGSI_SEMANTIC_FOG:
+      hw = NVFX_VP(INST_DEST_FOGC);
+      break;
+   case TGSI_SEMANTIC_PSIZE:
+      hw = NVFX_VP(INST_DEST_PSZ);
+      break;
+   case TGSI_SEMANTIC_GENERIC:
+      /* this is really an identifier for VP/FP linkage */
+      semantic_index += 8;
+      /* fall through */
+   case TGSI_SEMANTIC_TEXCOORD:
+      for (i = 0; i < num_texcoords; i++) {
+         if (vpc->vp->texcoord[i] == semantic_index) {
+            hw = NVFX_VP(INST_DEST_TC(i));
+            break;
+         }
+      }
+
+      if (i == num_texcoords) {
+         vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
+         return TRUE;
+      }
+      break;
+   case TGSI_SEMANTIC_EDGEFLAG:
+      /* not really an error just a fallback */
+      NOUVEAU_ERR("cannot handle edgeflag output\n");
+      return FALSE;
+   default:
+      NOUVEAU_ERR("bad output semantic\n");
+      return FALSE;
+   }
+
+   vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
+   return TRUE;
+}
+
+/* First pass over the TGSI tokens: count immediates, find the highest
+ * used temp/address/constant indices, and register output declarations;
+ * then pre-allocate the corresponding register arrays in 'vpc'.
+ */
+static boolean
+nvfx_vertprog_prepare(struct nv30_context *nv30, struct nvfx_vpc *vpc)
+{
+   struct tgsi_parse_context p;
+   int high_const = -1, high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+   tgsi_parse_init(&p, vpc->pipe.tokens);
+   while (!tgsi_parse_end_of_tokens(&p)) {
+      const union tgsi_full_token *tok = &p.FullToken;
+
+      tgsi_parse_token(&p);
+      switch(tok->Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         nr_imm++;
+         break;
+      case TGSI_TOKEN_TYPE_DECLARATION:
+      {
+         const struct tgsi_full_declaration *fdec;
+
+         fdec = &p.FullToken.FullDeclaration;
+         switch (fdec->Declaration.File) {
+         case TGSI_FILE_TEMPORARY:
+            if (fdec->Range.Last > high_temp) {
+               high_temp =
+                  fdec->Range.Last;
+            }
+            break;
+         case TGSI_FILE_ADDRESS:
+            if (fdec->Range.Last > high_addr) {
+               high_addr =
+                  fdec->Range.Last;
+            }
+            break;
+         case TGSI_FILE_CONSTANT:
+            if (fdec->Range.Last > high_const) {
+               high_const =
+                  fdec->Range.Last;
+            }
+            break;
+         case TGSI_FILE_OUTPUT:
+            if (!nvfx_vertprog_parse_decl_output(nv30, vpc, fdec))
+               return FALSE;
+            break;
+         default:
+            break;
+         }
+      }
+      break;
+      default:
+         break;
+      }
+   }
+   tgsi_parse_free(&p);
+
+   if (nr_imm) {
+      vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg));
+      assert(vpc->imm);
+   }
+
+   /* the high_* values are highest-seen indices (-1 if unused); the
+    * pre-increments below turn them into element counts
+    */
+   if (++high_temp) {
+      vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
+      for (i = 0; i < high_temp; i++)
+         vpc->r_temp[i] = temp(vpc);
+   }
+
+   if (++high_addr) {
+      vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg));
+      for (i = 0; i < high_addr; i++)
+         vpc->r_address[i] = nvfx_reg(NVFXSR_TEMP, i);
+   }
+
+   if(++high_const) {
+      vpc->r_const = CALLOC(high_const, sizeof(struct nvfx_reg));
+      for (i = 0; i < high_const; i++)
+         vpc->r_const[i] = constant(vpc, i, 0, 0, 0, 0);
+   }
+
+   /* keep the registers allocated above across instructions */
+   vpc->r_temps_discard = 0;
+   return TRUE;
+}
+
+/* Set NVFX_DUMP_VP=1 in the environment to dump the TGSI source and the
+ * generated hardware vertex program; the option is read once, lazily.
+ */
+DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE)
+
+/* Translate the TGSI vertex shader in vp->pipe into nv3x/nv4x hardware
+ * instructions.  Sets vp->translated and returns it; on any failure the
+ * temporary translation context and scratch arrays are released.
+ */
+boolean
+_nvfx_vertprog_translate(struct nv30_context *nv30, struct nv30_vertprog *vp)
+{
+   struct tgsi_parse_context parse;
+   struct nvfx_vpc *vpc = NULL;
+   struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
+   struct util_dynarray insns;
+   int i, ucps;
+
+   vp->translated = FALSE;
+   vp->nr_insns = 0;
+   vp->nr_consts = 0;
+
+   vpc = CALLOC_STRUCT(nvfx_vpc);
+   if (!vpc)
+      return FALSE;
+   vpc->nv30 = nv30;
+   vpc->vp = vp;
+   vpc->pipe = vp->pipe;
+   vpc->info = &vp->info;
+   vpc->cvtx_idx = -1;
+
+   if (!nvfx_vertprog_prepare(nv30, vpc)) {
+      FREE(vpc);
+      return FALSE;
+   }
+
+   /* Redirect post-transform vertex position to a temp if user clip
+    * planes are enabled. We need to append code to the vtxprog
+    * to handle clip planes later.
+    */
+   if (vp->enabled_ucps && vpc->cvtx_idx < 0) {
+      vpc->r_result[vpc->hpos_idx] = temp(vpc);
+      vpc->r_temps_discard = 0;
+      vpc->cvtx_idx = vpc->hpos_idx;
+   }
+
+   /* "insns" maps TGSI instruction indices to hardware instruction
+    * positions for branch-target resolution.  It is freed at "out" so
+    * early error exits do not leak it.
+    */
+   util_dynarray_init(&insns);
+
+   tgsi_parse_init(&parse, vp->pipe.tokens);
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+      {
+         const struct tgsi_full_immediate *imm;
+
+         imm = &parse.FullToken.FullImmediate;
+         assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+         assert(imm->Immediate.NrTokens == 4 + 1);
+         vpc->imm[vpc->nr_imm++] =
+            constant(vpc, -1,
+                     imm->u[0].Float,
+                     imm->u[1].Float,
+                     imm->u[2].Float,
+                     imm->u[3].Float);
+      }
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+      {
+         const struct tgsi_full_instruction *finst;
+         unsigned idx = insns.size >> 2;
+         util_dynarray_append(&insns, unsigned, vp->nr_insns);
+         finst = &parse.FullToken.FullInstruction;
+         if (!nvfx_vertprog_parse_instruction(nv30, vpc, idx, finst))
+            goto out;
+      }
+         break;
+      default:
+         break;
+      }
+   }
+
+   /* Append the final instruction count so relocations targeting "end of
+    * program" resolve correctly.
+    */
+   util_dynarray_append(&insns, unsigned, vp->nr_insns);
+
+   /* Convert label relocations (TGSI instruction indices) into hardware
+    * instruction positions.
+    */
+   for(unsigned r = 0; r < vpc->label_relocs.size; r += sizeof(struct nvfx_relocation))
+   {
+      struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + r);
+      struct nvfx_relocation hw_reloc;
+
+      hw_reloc.location = label_reloc->location;
+      hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target];
+
+      util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc);
+   }
+   util_dynarray_trim(&vp->branch_relocs);
+
+   /* XXX: what if we add a RET before?! make sure we jump here...*/
+
+   /* Write out HPOS if it was redirected to a temp earlier */
+   if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
+      struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT,
+                                      NVFX_VP(INST_DEST_POS));
+      struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]);
+
+      nvfx_vp_emit(vpc, arith(0, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none));
+   }
+
+   /* Insert code to handle user clip planes */
+   ucps = vp->enabled_ucps;
+   while (ucps) {
+      int ci = ffs(ucps) - 1; ucps &= ~(1 << ci);
+      struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(ci));
+      struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, 512 + ci));
+      struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->cvtx_idx]);
+      unsigned mask;
+
+      if(nv30->is_nv4x)
+      {
+         /* nv4x packs two clip distances per output register component */
+         switch (ci) {
+         case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+         case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+         case 2: case 5: mask = NVFX_VP_MASK_W; break;
+         default:
+            NOUVEAU_ERR("invalid clip dist #%d\n", ci);
+            goto out;
+         }
+      }
+      else
+         mask = NVFX_VP_MASK_X;
+
+      nvfx_vp_emit(vpc, arith(0, VEC, DP4, cdst, mask, htmp, ceqn, none));
+   }
+
+   /* Mark the last instruction so the hardware knows where to stop */
+   if (vpc->vp->nr_insns)
+      vpc->vp->insns[vpc->vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+
+   if(debug_get_option_nvfx_dump_vp())
+   {
+      debug_printf("\n");
+      tgsi_dump(vpc->pipe.tokens, 0);
+
+      debug_printf("\n%s vertex program:\n", nv30->is_nv4x ? "nv4x" : "nv3x");
+      for (i = 0; i < vp->nr_insns; i++)
+         debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]);
+      debug_printf("\n");
+   }
+
+   vp->translated = TRUE;
+
+out:
+   tgsi_parse_free(&parse);
+   /* Previously "insns" was only freed on the success path and leaked
+    * whenever instruction parsing or clip-plane setup failed; free it on
+    * every exit instead (it is always initialized before any goto out).
+    */
+   util_dynarray_fini(&insns);
+   if(vpc) {
+      util_dynarray_fini(&vpc->label_relocs);
+      util_dynarray_fini(&vpc->loop_stack);
+      FREE(vpc->r_temp);
+      FREE(vpc->r_address);
+      FREE(vpc->r_const);
+      FREE(vpc->imm);
+      FREE(vpc);
+   }
+
+   return vp->translated;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
new file mode 100644
index 00000000000..dfbef2c6a30
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
@@ -0,0 +1,416 @@
+#ifndef RNNDB_NV50_2D_XML
+#define RNNDB_NV50_2D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/nv50_2d.xml ( 11113 bytes, from 2011-07-09 13:43:58)
+- ./rnndb/copyright.xml ( 6452 bytes, from 2011-07-09 13:43:58)
+- ./rnndb/nv_object.xml ( 12912 bytes, from 2012-07-12 09:41:09)
+- ./rnndb/nvchipsets.xml ( 3736 bytes, from 2012-07-12 09:41:09)
+- ./rnndb/nv_defs.xml ( 4437 bytes, from 2011-07-09 13:43:58)
+- ./rnndb/nv50_defs.xml ( 5468 bytes, from 2011-07-09 13:43:58)
+
+Copyright (C) 2006-2011 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+
+#define NV50_2D_DMA_NOTIFY 0x00000180
+
+#define NV50_2D_DMA_DST 0x00000184
+
+#define NV50_2D_DMA_SRC 0x00000188
+
+#define NV50_2D_DMA_COND 0x0000018c
+
+#define NV50_2D_DST_FORMAT 0x00000200
+
+#define NV50_2D_DST_LINEAR 0x00000204
+
+#define NV50_2D_DST_TILE_MODE 0x00000208
+
+#define NV50_2D_DST_DEPTH 0x0000020c
+
+#define NV50_2D_DST_LAYER 0x00000210
+
+#define NV50_2D_DST_PITCH 0x00000214
+
+#define NV50_2D_DST_WIDTH 0x00000218
+
+#define NV50_2D_DST_HEIGHT 0x0000021c
+
+#define NV50_2D_DST_ADDRESS_HIGH 0x00000220
+
+#define NV50_2D_DST_ADDRESS_LOW 0x00000224
+
+#define NV50_2D_UNK228 0x00000228
+
+#define NVC0_2D_UNK228 0x00000228
+
+#define NV50_2D_SRC_FORMAT 0x00000230
+
+#define NV50_2D_SRC_LINEAR 0x00000234
+
+#define NV50_2D_SRC_TILE_MODE 0x00000238
+
+#define NV50_2D_SRC_DEPTH 0x0000023c
+
+#define NV50_2D_SRC_LAYER 0x00000240
+
+#define NVC0_2D_UNK0240 0x00000240
+
+#define NV50_2D_SRC_PITCH 0x00000244
+#define NV50_2D_SRC_PITCH__MAX 0x00040000
+
+#define NV50_2D_SRC_WIDTH 0x00000248
+#define NV50_2D_SRC_WIDTH__MAX 0x00010000
+
+#define NV50_2D_SRC_HEIGHT 0x0000024c
+#define NV50_2D_SRC_HEIGHT__MAX 0x00010000
+
+#define NV50_2D_SRC_ADDRESS_HIGH 0x00000250
+
+#define NV50_2D_SRC_ADDRESS_LOW 0x00000254
+
+#define NV50_2D_UNK258 0x00000258
+
+#define NV50_2D_UNK260 0x00000260
+
+#define NV50_2D_COND_ADDRESS_HIGH 0x00000264
+
+#define NV50_2D_COND_ADDRESS_LOW 0x00000268
+
+#define NV50_2D_COND_MODE 0x0000026c
+#define NV50_2D_COND_MODE_NEVER 0x00000000
+#define NV50_2D_COND_MODE_ALWAYS 0x00000001
+#define NV50_2D_COND_MODE_RES_NON_ZERO 0x00000002
+#define NV50_2D_COND_MODE_EQUAL 0x00000003
+#define NV50_2D_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NV50_2D_CLIP_X 0x00000280
+
+#define NV50_2D_CLIP_Y 0x00000284
+
+#define NV50_2D_CLIP_W 0x00000288
+
+#define NV50_2D_CLIP_H 0x0000028c
+
+#define NV50_2D_CLIP_ENABLE 0x00000290
+
+#define NV50_2D_COLOR_KEY_FORMAT 0x00000294
+#define NV50_2D_COLOR_KEY_FORMAT_16BPP 0x00000000
+#define NV50_2D_COLOR_KEY_FORMAT_15BPP 0x00000001
+#define NV50_2D_COLOR_KEY_FORMAT_24BPP 0x00000002
+#define NV50_2D_COLOR_KEY_FORMAT_30BPP 0x00000003
+#define NV50_2D_COLOR_KEY_FORMAT_8BPP 0x00000004
+#define NV50_2D_COLOR_KEY_FORMAT_16BPP2 0x00000005
+#define NV50_2D_COLOR_KEY_FORMAT_32BPP 0x00000006
+
+#define NV50_2D_COLOR_KEY 0x00000298
+
+#define NV50_2D_COLOR_KEY_ENABLE 0x0000029c
+
+#define NV50_2D_ROP 0x000002a0
+
+#define NV50_2D_BETA1 0x000002a4
+#define NV50_2D_BETA1_BETA1__MASK 0x7f800000
+#define NV50_2D_BETA1_BETA1__SHIFT 23
+
+#define NV50_2D_BETA4 0x000002a8
+#define NV50_2D_BETA4_B__MASK 0x000000ff
+#define NV50_2D_BETA4_B__SHIFT 0
+#define NV50_2D_BETA4_G__MASK 0x0000ff00
+#define NV50_2D_BETA4_G__SHIFT 8
+#define NV50_2D_BETA4_R__MASK 0x00ff0000
+#define NV50_2D_BETA4_R__SHIFT 16
+#define NV50_2D_BETA4_A__MASK 0xff000000
+#define NV50_2D_BETA4_A__SHIFT 24
+
+#define NV50_2D_OPERATION 0x000002ac
+#define NV50_2D_OPERATION_SRCCOPY_AND 0x00000000
+#define NV50_2D_OPERATION_ROP_AND 0x00000001
+#define NV50_2D_OPERATION_BLEND 0x00000002
+#define NV50_2D_OPERATION_SRCCOPY 0x00000003
+#define NV50_2D_OPERATION_ROP 0x00000004
+#define NV50_2D_OPERATION_SRCCOPY_PREMULT 0x00000005
+#define NV50_2D_OPERATION_BLEND_PREMULT 0x00000006
+
+#define NV50_2D_PATTERN_OFFSET 0x000002b0
+#define NV50_2D_PATTERN_OFFSET_X__MASK 0x0000003f
+#define NV50_2D_PATTERN_OFFSET_X__SHIFT 0
+#define NV50_2D_PATTERN_OFFSET_Y__MASK 0x00003f00
+#define NV50_2D_PATTERN_OFFSET_Y__SHIFT 8
+
+#define NV50_2D_PATTERN_SELECT 0x000002b4
+#define NV50_2D_PATTERN_SELECT_MONO_8X8 0x00000000
+#define NV50_2D_PATTERN_SELECT_MONO_64X1 0x00000001
+#define NV50_2D_PATTERN_SELECT_MONO_1X64 0x00000002
+#define NV50_2D_PATTERN_SELECT_COLOR 0x00000003
+
+#define NVC0_2D_UNK2DC 0x000002dc
+
+#define NVC0_2D_UNK2E0 0x000002e0
+
+#define NV50_2D_PATTERN_COLOR_FORMAT 0x000002e8
+#define NV50_2D_PATTERN_COLOR_FORMAT_16BPP 0x00000000
+#define NV50_2D_PATTERN_COLOR_FORMAT_15BPP 0x00000001
+#define NV50_2D_PATTERN_COLOR_FORMAT_32BPP 0x00000002
+#define NV50_2D_PATTERN_COLOR_FORMAT_8BPP 0x00000003
+#define NV50_2D_PATTERN_COLOR_FORMAT_UNK4 0x00000004
+#define NV50_2D_PATTERN_COLOR_FORMAT_UNK5 0x00000005
+#define NV50_2D_PATTERN_COLOR_FORMAT_UNK6 0x00000006
+
+#define NV50_2D_PATTERN_MONO_FORMAT 0x000002ec
+#define NV50_2D_PATTERN_MONO_FORMAT_CGA6 0x00000000
+#define NV50_2D_PATTERN_MONO_FORMAT_LE 0x00000001
+
+#define NV50_2D_PATTERN_COLOR(i0) (0x000002f0 + 0x4*(i0))
+#define NV50_2D_PATTERN_COLOR__ESIZE 0x00000004
+#define NV50_2D_PATTERN_COLOR__LEN 0x00000002
+
+#define NV50_2D_PATTERN_BITMAP(i0) (0x000002f8 + 0x4*(i0))
+#define NV50_2D_PATTERN_BITMAP__ESIZE 0x00000004
+#define NV50_2D_PATTERN_BITMAP__LEN 0x00000002
+
+#define NV50_2D_PATTERN_X8R8G8B8(i0) (0x00000300 + 0x4*(i0))
+#define NV50_2D_PATTERN_X8R8G8B8__ESIZE 0x00000004
+#define NV50_2D_PATTERN_X8R8G8B8__LEN 0x00000040
+#define NV50_2D_PATTERN_X8R8G8B8_B__MASK 0x000000ff
+#define NV50_2D_PATTERN_X8R8G8B8_B__SHIFT 0
+#define NV50_2D_PATTERN_X8R8G8B8_G__MASK 0x0000ff00
+#define NV50_2D_PATTERN_X8R8G8B8_G__SHIFT 8
+#define NV50_2D_PATTERN_X8R8G8B8_R__MASK 0x00ff0000
+#define NV50_2D_PATTERN_X8R8G8B8_R__SHIFT 16
+
+#define NV50_2D_PATTERN_R5G6B5(i0) (0x00000400 + 0x4*(i0))
+#define NV50_2D_PATTERN_R5G6B5__ESIZE 0x00000004
+#define NV50_2D_PATTERN_R5G6B5__LEN 0x00000020
+#define NV50_2D_PATTERN_R5G6B5_B0__MASK 0x0000001f
+#define NV50_2D_PATTERN_R5G6B5_B0__SHIFT 0
+#define NV50_2D_PATTERN_R5G6B5_G0__MASK 0x000007e0
+#define NV50_2D_PATTERN_R5G6B5_G0__SHIFT 5
+#define NV50_2D_PATTERN_R5G6B5_R0__MASK 0x0000f800
+#define NV50_2D_PATTERN_R5G6B5_R0__SHIFT 11
+#define NV50_2D_PATTERN_R5G6B5_B1__MASK 0x001f0000
+#define NV50_2D_PATTERN_R5G6B5_B1__SHIFT 16
+#define NV50_2D_PATTERN_R5G6B5_G1__MASK 0x07e00000
+#define NV50_2D_PATTERN_R5G6B5_G1__SHIFT 21
+#define NV50_2D_PATTERN_R5G6B5_R1__MASK 0xf8000000
+#define NV50_2D_PATTERN_R5G6B5_R1__SHIFT 27
+
+#define NV50_2D_PATTERN_X1R5G5B5(i0) (0x00000480 + 0x4*(i0))
+#define NV50_2D_PATTERN_X1R5G5B5__ESIZE 0x00000004
+#define NV50_2D_PATTERN_X1R5G5B5__LEN 0x00000020
+#define NV50_2D_PATTERN_X1R5G5B5_B0__MASK 0x0000001f
+#define NV50_2D_PATTERN_X1R5G5B5_B0__SHIFT 0
+#define NV50_2D_PATTERN_X1R5G5B5_G0__MASK 0x000003e0
+#define NV50_2D_PATTERN_X1R5G5B5_G0__SHIFT 5
+#define NV50_2D_PATTERN_X1R5G5B5_R0__MASK 0x00007c00
+#define NV50_2D_PATTERN_X1R5G5B5_R0__SHIFT 10
+#define NV50_2D_PATTERN_X1R5G5B5_B1__MASK 0x001f0000
+#define NV50_2D_PATTERN_X1R5G5B5_B1__SHIFT 16
+#define NV50_2D_PATTERN_X1R5G5B5_G1__MASK 0x03e00000
+#define NV50_2D_PATTERN_X1R5G5B5_G1__SHIFT 21
+#define NV50_2D_PATTERN_X1R5G5B5_R1__MASK 0x7c000000
+#define NV50_2D_PATTERN_X1R5G5B5_R1__SHIFT 26
+
+#define NV50_2D_PATTERN_Y8(i0) (0x00000500 + 0x4*(i0))
+#define NV50_2D_PATTERN_Y8__ESIZE 0x00000004
+#define NV50_2D_PATTERN_Y8__LEN 0x00000010
+#define NV50_2D_PATTERN_Y8_Y0__MASK 0x000000ff
+#define NV50_2D_PATTERN_Y8_Y0__SHIFT 0
+#define NV50_2D_PATTERN_Y8_Y1__MASK 0x0000ff00
+#define NV50_2D_PATTERN_Y8_Y1__SHIFT 8
+#define NV50_2D_PATTERN_Y8_Y2__MASK 0x00ff0000
+#define NV50_2D_PATTERN_Y8_Y2__SHIFT 16
+#define NV50_2D_PATTERN_Y8_Y3__MASK 0xff000000
+#define NV50_2D_PATTERN_Y8_Y3__SHIFT 24
+
+#define NVC0_2D_DRAW_COLOR_LONG(i0) (0x00000540 + 0x4*(i0))
+#define NVC0_2D_DRAW_COLOR_LONG__ESIZE 0x00000004
+#define NVC0_2D_DRAW_COLOR_LONG__LEN 0x00000004
+
+#define NV50_2D_DRAW_SHAPE 0x00000580
+#define NV50_2D_DRAW_SHAPE_POINTS 0x00000000
+#define NV50_2D_DRAW_SHAPE_LINES 0x00000001
+#define NV50_2D_DRAW_SHAPE_LINE_STRIP 0x00000002
+#define NV50_2D_DRAW_SHAPE_TRIANGLES 0x00000003
+#define NV50_2D_DRAW_SHAPE_RECTANGLES 0x00000004
+
+#define NV50_2D_DRAW_COLOR_FORMAT 0x00000584
+
+#define NV50_2D_DRAW_COLOR 0x00000588
+
+#define NV50_2D_UNK58C 0x0000058c
+#define NV50_2D_UNK58C_0 0x00000001
+#define NV50_2D_UNK58C_1 0x00000010
+#define NV50_2D_UNK58C_2 0x00000100
+#define NV50_2D_UNK58C_3 0x00001000
+
+#define NV50_2D_DRAW_POINT16 0x000005e0
+#define NV50_2D_DRAW_POINT16_X__MASK 0x0000ffff
+#define NV50_2D_DRAW_POINT16_X__SHIFT 0
+#define NV50_2D_DRAW_POINT16_Y__MASK 0xffff0000
+#define NV50_2D_DRAW_POINT16_Y__SHIFT 16
+
+#define NV50_2D_DRAW_POINT32_X(i0) (0x00000600 + 0x8*(i0))
+#define NV50_2D_DRAW_POINT32_X__ESIZE 0x00000008
+#define NV50_2D_DRAW_POINT32_X__LEN 0x00000040
+
+#define NV50_2D_DRAW_POINT32_Y(i0) (0x00000604 + 0x8*(i0))
+#define NV50_2D_DRAW_POINT32_Y__ESIZE 0x00000008
+#define NV50_2D_DRAW_POINT32_Y__LEN 0x00000040
+
+#define NV50_2D_SIFC_BITMAP_ENABLE 0x00000800
+
+#define NV50_2D_SIFC_FORMAT 0x00000804
+
+#define NV50_2D_SIFC_BITMAP_FORMAT 0x00000808
+#define NV50_2D_SIFC_BITMAP_FORMAT_I1 0x00000000
+#define NV50_2D_SIFC_BITMAP_FORMAT_I4 0x00000001
+#define NV50_2D_SIFC_BITMAP_FORMAT_I8 0x00000002
+
+#define NV50_2D_SIFC_BITMAP_LSB_FIRST 0x0000080c
+
+#define NV50_2D_SIFC_BITMAP_LINE_PACK_MODE 0x00000810
+#define NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_PACKED 0x00000000
+#define NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_BYTE 0x00000001
+#define NV50_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_WORD 0x00000002
+
+#define NV50_2D_SIFC_BITMAP_COLOR_BIT0 0x00000814
+
+#define NV50_2D_SIFC_BITMAP_COLOR_BIT1 0x00000818
+
+#define NV50_2D_SIFC_BITMAP_WRITE_BIT0_ENABLE 0x0000081c
+
+#define NV50_2D_SIFC_WIDTH 0x00000838
+
+#define NV50_2D_SIFC_HEIGHT 0x0000083c
+
+#define NV50_2D_SIFC_DX_DU_FRACT 0x00000840
+
+#define NV50_2D_SIFC_DX_DU_INT 0x00000844
+
+#define NV50_2D_SIFC_DY_DV_FRACT 0x00000848
+
+#define NV50_2D_SIFC_DY_DV_INT 0x0000084c
+
+#define NV50_2D_SIFC_DST_X_FRACT 0x00000850
+
+#define NV50_2D_SIFC_DST_X_INT 0x00000854
+
+#define NV50_2D_SIFC_DST_Y_FRACT 0x00000858
+
+#define NV50_2D_SIFC_DST_Y_INT 0x0000085c
+
+#define NV50_2D_SIFC_DATA 0x00000860
+
+#define NV50_2D_UNK0870 0x00000870
+
+#define NV50_2D_UNK0880 0x00000880
+
+#define NV50_2D_UNK0884 0x00000884
+
+#define NV50_2D_UNK0888 0x00000888
+
+#define NV50_2D_BLIT_CONTROL 0x0000088c
+#define NV50_2D_BLIT_CONTROL_ORIGIN__MASK 0x00000001
+#define NV50_2D_BLIT_CONTROL_ORIGIN__SHIFT 0
+#define NV50_2D_BLIT_CONTROL_ORIGIN_CENTER 0x00000000
+#define NV50_2D_BLIT_CONTROL_ORIGIN_CORNER 0x00000001
+#define NV50_2D_BLIT_CONTROL_FILTER__MASK 0x00000010
+#define NV50_2D_BLIT_CONTROL_FILTER__SHIFT 4
+#define NV50_2D_BLIT_CONTROL_FILTER_POINT_SAMPLE 0x00000000
+#define NV50_2D_BLIT_CONTROL_FILTER_BILINEAR 0x00000010
+
+#define NV50_2D_BLIT_DST_X 0x000008b0
+
+#define NV50_2D_BLIT_DST_Y 0x000008b4
+
+#define NV50_2D_BLIT_DST_W 0x000008b8
+
+#define NV50_2D_BLIT_DST_H 0x000008bc
+
+#define NV50_2D_BLIT_DU_DX_FRACT 0x000008c0
+
+#define NV50_2D_BLIT_DU_DX_INT 0x000008c4
+
+#define NV50_2D_BLIT_DV_DY_FRACT 0x000008c8
+
+#define NV50_2D_BLIT_DV_DY_INT 0x000008cc
+
+#define NV50_2D_BLIT_SRC_X_FRACT 0x000008d0
+
+#define NV50_2D_BLIT_SRC_X_INT 0x000008d4
+
+#define NV50_2D_BLIT_SRC_Y_FRACT 0x000008d8
+
+#define NV50_2D_BLIT_SRC_Y_INT 0x000008dc
+
+#define NVC0_2D_FIRMWARE(i0) (0x000008e0 + 0x4*(i0))
+#define NVC0_2D_FIRMWARE__ESIZE 0x00000004
+#define NVC0_2D_FIRMWARE__LEN 0x00000020
+
+
+#endif /* RNNDB_NV50_2D_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_3d.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_3d.xml.h
new file mode 100644
index 00000000000..9dff8b2dd13
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_3d.xml.h
@@ -0,0 +1,2110 @@
+#ifndef RNNDB_NV50_3D_XML
+#define RNNDB_NV50_3D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/nv50_3d.xml ( 65226 bytes, from 2012-01-28 13:46:30)
+- ./rnndb/copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv_defs.xml ( 4437 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv50_defs.xml ( 5468 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nvchipsets.xml ( 3617 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv_3ddefs.xml ( 16394 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv_object.xml ( 12672 bytes, from 2011-08-11 18:25:12)
+
+Copyright (C) 2006-2012 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV50_3D_DMA_NOTIFY 0x00000180
+
+#define NV50_3D_DMA_ZETA 0x00000184
+
+#define NV50_3D_DMA_QUERY 0x00000188
+
+#define NV50_3D_DMA_VTXBUF 0x0000018c
+
+#define NV50_3D_DMA_LOCAL 0x00000190
+
+#define NV50_3D_DMA_STACK 0x00000194
+
+#define NV50_3D_DMA_CODE_CB 0x00000198
+
+#define NV50_3D_DMA_TSC 0x0000019c
+
+#define NV50_3D_DMA_TIC 0x000001a0
+
+#define NV50_3D_DMA_TEXTURE 0x000001a4
+
+#define NV50_3D_DMA_STRMOUT 0x000001a8
+
+#define NV50_3D_DMA_CLIPID 0x000001ac
+
+#define NV50_3D_DMA_COLOR(i0) (0x000001c0 + 0x4*(i0))
+#define NV50_3D_DMA_COLOR__ESIZE 0x00000004
+#define NV50_3D_DMA_COLOR__LEN 0x00000008
+
+#define NV50_3D_RT(i0) (0x00000200 + 0x20*(i0))
+#define NV50_3D_RT__ESIZE 0x00000020
+#define NV50_3D_RT__LEN 0x00000008
+
+#define NV50_3D_RT_ADDRESS_HIGH(i0) (0x00000200 + 0x20*(i0))
+
+#define NV50_3D_RT_ADDRESS_LOW(i0) (0x00000204 + 0x20*(i0))
+
+#define NV50_3D_RT_FORMAT(i0) (0x00000208 + 0x20*(i0))
+
+#define NV50_3D_RT_TILE_MODE(i0) (0x0000020c + 0x20*(i0))
+#define NV50_3D_RT_TILE_MODE_X__MASK 0x0000000f
+#define NV50_3D_RT_TILE_MODE_X__SHIFT 0
+#define NV50_3D_RT_TILE_MODE_Y__MASK 0x000000f0
+#define NV50_3D_RT_TILE_MODE_Y__SHIFT 4
+#define NV50_3D_RT_TILE_MODE_Z__MASK 0x00000f00
+#define NV50_3D_RT_TILE_MODE_Z__SHIFT 8
+
+#define NV50_3D_RT_LAYER_STRIDE(i0) (0x00000210 + 0x20*(i0))
+#define NV50_3D_RT_LAYER_STRIDE__SHR 2
+
+#define NV50_3D_RT_UNK14(i0) (0x00000214 + 0x20*(i0))
+
+#define NV50_3D_VTX_ATTR_1F(i0) (0x00000300 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_1F__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_1F__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_2H(i0) (0x00000340 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_2H__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_2H__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_2H_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_2H_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_2H_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_2H_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_2F_X(i0) (0x00000380 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_2F_X__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_2F_X__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_2F_Y(i0) (0x00000384 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_2F_Y__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_2F_Y__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_3F_X(i0) (0x00000400 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_3F_X__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_3F_X__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_3F_Y(i0) (0x00000404 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_3F_Y__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_3F_Y__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_3F_Z(i0) (0x00000408 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_3F_Z__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_3F_Z__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_4F_X(i0) (0x00000500 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_4F_X__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_4F_X__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_4F_Y(i0) (0x00000504 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_4F_Y__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_4F_Y__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_4F_Z(i0) (0x00000508 + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_4F_Z__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_4F_Z__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_4F_W(i0) (0x0000050c + 0x10*(i0))
+#define NV50_3D_VTX_ATTR_4F_W__ESIZE 0x00000010
+#define NV50_3D_VTX_ATTR_4F_W__LEN 0x00000010
+
+#define NV50_3D_VTX_ATTR_4H_0(i0) (0x00000600 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4H_0__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4H_0__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4H_0_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4H_0_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4H_0_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4H_0_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4H_1(i0) (0x00000604 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4H_1__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4H_1__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4H_1_Z__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4H_1_Z__SHIFT 0
+#define NV50_3D_VTX_ATTR_4H_1_W__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4H_1_W__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_2I(i0) (0x00000680 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_2I__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_2I__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_2I_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_2I_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_2I_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_2I_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_2NI(i0) (0x000006c0 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_2NI__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_2NI__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_2NI_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_2NI_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_2NI_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_2NI_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4I_0(i0) (0x00000700 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4I_0__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4I_0__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4I_0_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4I_0_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4I_0_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4I_0_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4I_1(i0) (0x00000704 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4I_1__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4I_1__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4I_1_Z__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4I_1_Z__SHIFT 0
+#define NV50_3D_VTX_ATTR_4I_1_W__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4I_1_W__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4NI_0(i0) (0x00000780 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4NI_0__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4NI_0__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4NI_0_X__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4NI_0_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4NI_0_Y__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4NI_0_Y__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4NI_1(i0) (0x00000784 + 0x8*(i0))
+#define NV50_3D_VTX_ATTR_4NI_1__ESIZE 0x00000008
+#define NV50_3D_VTX_ATTR_4NI_1__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4NI_1_Z__MASK 0x0000ffff
+#define NV50_3D_VTX_ATTR_4NI_1_Z__SHIFT 0
+#define NV50_3D_VTX_ATTR_4NI_1_W__MASK 0xffff0000
+#define NV50_3D_VTX_ATTR_4NI_1_W__SHIFT 16
+
+#define NV50_3D_VTX_ATTR_4UB(i0) (0x00000800 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_4UB__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_4UB__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4UB_X__MASK 0x000000ff
+#define NV50_3D_VTX_ATTR_4UB_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4UB_Y__MASK 0x0000ff00
+#define NV50_3D_VTX_ATTR_4UB_Y__SHIFT 8
+#define NV50_3D_VTX_ATTR_4UB_Z__MASK 0x00ff0000
+#define NV50_3D_VTX_ATTR_4UB_Z__SHIFT 16
+#define NV50_3D_VTX_ATTR_4UB_W__MASK 0xff000000
+#define NV50_3D_VTX_ATTR_4UB_W__SHIFT 24
+
+#define NV50_3D_VTX_ATTR_4B(i0) (0x00000840 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_4B__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_4B__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4B_X__MASK 0x000000ff
+#define NV50_3D_VTX_ATTR_4B_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4B_Y__MASK 0x0000ff00
+#define NV50_3D_VTX_ATTR_4B_Y__SHIFT 8
+#define NV50_3D_VTX_ATTR_4B_Z__MASK 0x00ff0000
+#define NV50_3D_VTX_ATTR_4B_Z__SHIFT 16
+#define NV50_3D_VTX_ATTR_4B_W__MASK 0xff000000
+#define NV50_3D_VTX_ATTR_4B_W__SHIFT 24
+
+#define NV50_3D_VTX_ATTR_4NUB(i0) (0x00000880 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_4NUB__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_4NUB__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4NUB_X__MASK 0x000000ff
+#define NV50_3D_VTX_ATTR_4NUB_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4NUB_Y__MASK 0x0000ff00
+#define NV50_3D_VTX_ATTR_4NUB_Y__SHIFT 8
+#define NV50_3D_VTX_ATTR_4NUB_Z__MASK 0x00ff0000
+#define NV50_3D_VTX_ATTR_4NUB_Z__SHIFT 16
+#define NV50_3D_VTX_ATTR_4NUB_W__MASK 0xff000000
+#define NV50_3D_VTX_ATTR_4NUB_W__SHIFT 24
+
+#define NV50_3D_VTX_ATTR_4NB(i0) (0x000008c0 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_4NB__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_4NB__LEN 0x00000010
+#define NV50_3D_VTX_ATTR_4NB_X__MASK 0x000000ff
+#define NV50_3D_VTX_ATTR_4NB_X__SHIFT 0
+#define NV50_3D_VTX_ATTR_4NB_Y__MASK 0x0000ff00
+#define NV50_3D_VTX_ATTR_4NB_Y__SHIFT 8
+#define NV50_3D_VTX_ATTR_4NB_Z__MASK 0x00ff0000
+#define NV50_3D_VTX_ATTR_4NB_Z__SHIFT 16
+#define NV50_3D_VTX_ATTR_4NB_W__MASK 0xff000000
+#define NV50_3D_VTX_ATTR_4NB_W__SHIFT 24
+
+#define NV50_3D_VERTEX_ARRAY_FETCH(i0) (0x00000900 + 0x10*(i0))
+#define NV50_3D_VERTEX_ARRAY_FETCH__ESIZE 0x00000010
+#define NV50_3D_VERTEX_ARRAY_FETCH__LEN 0x00000010
+#define NV50_3D_VERTEX_ARRAY_FETCH_STRIDE__MASK 0x00000fff
+#define NV50_3D_VERTEX_ARRAY_FETCH_STRIDE__SHIFT 0
+#define NV50_3D_VERTEX_ARRAY_FETCH_ENABLE 0x20000000
+
+#define NV50_3D_VERTEX_ARRAY_START_HIGH(i0) (0x00000904 + 0x10*(i0))
+#define NV50_3D_VERTEX_ARRAY_START_HIGH__ESIZE 0x00000010
+#define NV50_3D_VERTEX_ARRAY_START_HIGH__LEN 0x00000010
+
+#define NV50_3D_VERTEX_ARRAY_START_LOW(i0) (0x00000908 + 0x10*(i0))
+#define NV50_3D_VERTEX_ARRAY_START_LOW__ESIZE 0x00000010
+#define NV50_3D_VERTEX_ARRAY_START_LOW__LEN 0x00000010
+
+#define NV50_3D_VERTEX_ARRAY_DIVISOR(i0) (0x0000090c + 0x10*(i0))
+#define NV50_3D_VERTEX_ARRAY_DIVISOR__ESIZE 0x00000010
+#define NV50_3D_VERTEX_ARRAY_DIVISOR__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_SCALE_X(i0) (0x00000a00 + 0x20*(i0))
+#define NV50_3D_VIEWPORT_SCALE_X__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_SCALE_X__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_SCALE_Y(i0) (0x00000a04 + 0x20*(i0))
+#define NV50_3D_VIEWPORT_SCALE_Y__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_SCALE_Y__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_SCALE_Z(i0) (0x00000a08 + 0x20*(i0))
+#define NV50_3D_VIEWPORT_SCALE_Z__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_SCALE_Z__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_TRANSLATE_X(i0) (0x00000a0c + 0x20*(i0))
+#define NV50_3D_VIEWPORT_TRANSLATE_X__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_TRANSLATE_X__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_TRANSLATE_Y(i0) (0x00000a10 + 0x20*(i0))
+#define NV50_3D_VIEWPORT_TRANSLATE_Y__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_TRANSLATE_Y__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_TRANSLATE_Z(i0) (0x00000a14 + 0x20*(i0))
+#define NV50_3D_VIEWPORT_TRANSLATE_Z__ESIZE 0x00000020
+#define NV50_3D_VIEWPORT_TRANSLATE_Z__LEN 0x00000010
+
+#define NV50_3D_VIEWPORT_HORIZ(i0) (0x00000c00 + 0x10*(i0))
+#define NV50_3D_VIEWPORT_HORIZ__ESIZE 0x00000010
+#define NV50_3D_VIEWPORT_HORIZ__LEN 0x00000010
+#define NV50_3D_VIEWPORT_HORIZ_X__MASK 0x0000ffff
+#define NV50_3D_VIEWPORT_HORIZ_X__SHIFT 0
+#define NV50_3D_VIEWPORT_HORIZ_W__MASK 0xffff0000
+#define NV50_3D_VIEWPORT_HORIZ_W__SHIFT 16
+
+#define NV50_3D_VIEWPORT_VERT(i0) (0x00000c04 + 0x10*(i0))
+#define NV50_3D_VIEWPORT_VERT__ESIZE 0x00000010
+#define NV50_3D_VIEWPORT_VERT__LEN 0x00000010
+#define NV50_3D_VIEWPORT_VERT_Y__MASK 0x0000ffff
+#define NV50_3D_VIEWPORT_VERT_Y__SHIFT 0
+#define NV50_3D_VIEWPORT_VERT_H__MASK 0xffff0000
+#define NV50_3D_VIEWPORT_VERT_H__SHIFT 16
+
+#define NV50_3D_DEPTH_RANGE_NEAR(i0) (0x00000c08 + 0x10*(i0))
+#define NV50_3D_DEPTH_RANGE_NEAR__ESIZE 0x00000010
+#define NV50_3D_DEPTH_RANGE_NEAR__LEN 0x00000010
+
+#define NV50_3D_DEPTH_RANGE_FAR(i0) (0x00000c0c + 0x10*(i0))
+#define NV50_3D_DEPTH_RANGE_FAR__ESIZE 0x00000010
+#define NV50_3D_DEPTH_RANGE_FAR__LEN 0x00000010
+
+#define NV50_3D_CLIP_RECT_HORIZ(i0) (0x00000d00 + 0x8*(i0))
+#define NV50_3D_CLIP_RECT_HORIZ__ESIZE 0x00000008
+#define NV50_3D_CLIP_RECT_HORIZ__LEN 0x00000008
+#define NV50_3D_CLIP_RECT_HORIZ_MIN__MASK 0x0000ffff
+#define NV50_3D_CLIP_RECT_HORIZ_MIN__SHIFT 0
+#define NV50_3D_CLIP_RECT_HORIZ_MAX__MASK 0xffff0000
+#define NV50_3D_CLIP_RECT_HORIZ_MAX__SHIFT 16
+
+#define NV50_3D_CLIP_RECT_VERT(i0) (0x00000d04 + 0x8*(i0))
+#define NV50_3D_CLIP_RECT_VERT__ESIZE 0x00000008
+#define NV50_3D_CLIP_RECT_VERT__LEN 0x00000008
+#define NV50_3D_CLIP_RECT_VERT_MIN__MASK 0x0000ffff
+#define NV50_3D_CLIP_RECT_VERT_MIN__SHIFT 0
+#define NV50_3D_CLIP_RECT_VERT_MAX__MASK 0xffff0000
+#define NV50_3D_CLIP_RECT_VERT_MAX__SHIFT 16
+
+#define NV50_3D_CLIPID_REGION_HORIZ(i0) (0x00000d40 + 0x8*(i0))
+#define NV50_3D_CLIPID_REGION_HORIZ__ESIZE 0x00000008
+#define NV50_3D_CLIPID_REGION_HORIZ__LEN 0x00000004
+#define NV50_3D_CLIPID_REGION_HORIZ_X__MASK 0x0000ffff
+#define NV50_3D_CLIPID_REGION_HORIZ_X__SHIFT 0
+#define NV50_3D_CLIPID_REGION_HORIZ_W__MASK 0xffff0000
+#define NV50_3D_CLIPID_REGION_HORIZ_W__SHIFT 16
+
+#define NV50_3D_CLIPID_REGION_VERT(i0) (0x00000d44 + 0x8*(i0))
+#define NV50_3D_CLIPID_REGION_VERT__ESIZE 0x00000008
+#define NV50_3D_CLIPID_REGION_VERT__LEN 0x00000004
+#define NV50_3D_CLIPID_REGION_VERT_Y__MASK 0x0000ffff
+#define NV50_3D_CLIPID_REGION_VERT_Y__SHIFT 0
+#define NV50_3D_CLIPID_REGION_VERT_H__MASK 0xffff0000
+#define NV50_3D_CLIPID_REGION_VERT_H__SHIFT 16
+
+#define NV50_3D_UNK0D60 0x00000d60
+
+#define NV50_3D_UNK0D64 0x00000d64
+
+#define NV50_3D_COUNTER_ENABLE 0x00000d68
+#define NV50_3D_COUNTER_ENABLE_VFETCH_VERTICES 0x00000001
+#define NV50_3D_COUNTER_ENABLE_VFETCH_PRIMITIVES 0x00000002
+#define NV50_3D_COUNTER_ENABLE_VP_LAUNCHES 0x00000004
+#define NV50_3D_COUNTER_ENABLE_GP_LAUNCHES 0x00000008
+#define NV50_3D_COUNTER_ENABLE_GP_PRIMITIVES_OUT 0x00000010
+#define NV50_3D_COUNTER_ENABLE_TRANSFORM_FEEDBACK 0x00000020
+#define NV50_3D_COUNTER_ENABLE_GENERATED_PRIMITIVES 0x00000040
+#define NV50_3D_COUNTER_ENABLE_RAST_PRIMITIVES_PRECLIP 0x00000080
+#define NV50_3D_COUNTER_ENABLE_RAST_PRIMITIVES_POSTCLIP 0x00000100
+#define NV50_3D_COUNTER_ENABLE_FP_PIXELS 0x00000200
+#define NV84_3D_COUNTER_ENABLE_UNK0A 0x00000400
+
+#define NV50_3D_UNK0D6C(i0) (0x00000d6c + 0x4*(i0))
+#define NV50_3D_UNK0D6C__ESIZE 0x00000004
+#define NV50_3D_UNK0D6C__LEN 0x00000002
+#define NV50_3D_UNK0D6C_X__MASK 0x0000ffff
+#define NV50_3D_UNK0D6C_X__SHIFT 0
+#define NV50_3D_UNK0D6C_Y__MASK 0xffff0000
+#define NV50_3D_UNK0D6C_Y__SHIFT 16
+
+#define NV50_3D_VERTEX_BUFFER_FIRST 0x00000d74
+
+#define NV50_3D_VERTEX_BUFFER_COUNT 0x00000d78
+
+#define NV50_3D_UNK0D7C 0x00000d7c
+
+#define NV50_3D_CLEAR_COLOR(i0) (0x00000d80 + 0x4*(i0))
+#define NV50_3D_CLEAR_COLOR__ESIZE 0x00000004
+#define NV50_3D_CLEAR_COLOR__LEN 0x00000004
+
+#define NV50_3D_CLEAR_DEPTH 0x00000d90
+
+#define NV50_3D_STACK_ADDRESS_HIGH 0x00000d94
+
+#define NV50_3D_STACK_ADDRESS_LOW 0x00000d98
+
+#define NV50_3D_STACK_SIZE_LOG 0x00000d9c
+
+#define NV50_3D_CLEAR_STENCIL 0x00000da0
+
+#define NV50_3D_STRMOUT_PARAMS_LATCH 0x00000da4
+
+#define NV50_3D_STRMOUT_PRIMITIVE_LIMIT 0x00000da8
+
+#define NV50_3D_POLYGON_MODE_FRONT 0x00000dac
+#define NV50_3D_POLYGON_MODE_FRONT_POINT 0x00001b00
+#define NV50_3D_POLYGON_MODE_FRONT_LINE 0x00001b01
+#define NV50_3D_POLYGON_MODE_FRONT_FILL 0x00001b02
+
+#define NV50_3D_POLYGON_MODE_BACK 0x00000db0
+#define NV50_3D_POLYGON_MODE_BACK_POINT 0x00001b00
+#define NV50_3D_POLYGON_MODE_BACK_LINE 0x00001b01
+#define NV50_3D_POLYGON_MODE_BACK_FILL 0x00001b02
+
+#define NV50_3D_POLYGON_SMOOTH_ENABLE 0x00000db4
+
+#define NV50_3D_UNK0DB8 0x00000db8
+
+#define NV50_3D_ZCULL_UNK0DBC 0x00000dbc
+#define NV50_3D_ZCULL_UNK0DBC_UNK0 0x00000001
+#define NV50_3D_ZCULL_UNK0DBC_UNK16__MASK 0x00030000
+#define NV50_3D_ZCULL_UNK0DBC_UNK16__SHIFT 16
+
+#define NV50_3D_POLYGON_OFFSET_POINT_ENABLE 0x00000dc0
+
+#define NV50_3D_POLYGON_OFFSET_LINE_ENABLE 0x00000dc4
+
+#define NV50_3D_POLYGON_OFFSET_FILL_ENABLE 0x00000dc8
+
+#define NV50_3D_UNK0DCC 0x00000dcc
+
+#define NV50_3D_VTX_ATTR_MASK_UNK0DD0(i0) (0x00000dd0 + 0x4*(i0))
+#define NV50_3D_VTX_ATTR_MASK_UNK0DD0__ESIZE 0x00000004
+#define NV50_3D_VTX_ATTR_MASK_UNK0DD0__LEN 0x00000002
+
+#define NV50_3D_ZCULL_UNK0DD8 0x00000dd8
+#define NV50_3D_ZCULL_UNK0DD8_UNK0__MASK 0x00000007
+#define NV50_3D_ZCULL_UNK0DD8_UNK0__SHIFT 0
+#define NVA3_3D_ZCULL_UNK0DD8_UNK9 0x00000200
+#define NV50_3D_ZCULL_UNK0DD8_UNK16__MASK 0xffff0000
+#define NV50_3D_ZCULL_UNK0DD8_UNK16__SHIFT 16
+
+#define NV50_3D_UNK0DDC 0x00000ddc
+
+#define NV50_3D_UNK0DE0 0x00000de0
+
+#define NV50_3D_WATCHDOG_TIMER 0x00000de4
+
+#define NV50_3D_UNK0DE8 0x00000de8
+
+#define NV50_3D_UNK0DEC 0x00000dec
+
+#define NV50_3D_UNK0DF0 0x00000df0
+#define NV50_3D_UNK0DF0_UNK0 0x00000001
+#define NV50_3D_UNK0DF0_UNK1__MASK 0x00000ff0
+#define NV50_3D_UNK0DF0_UNK1__SHIFT 4
+
+#define NV50_3D_UNK0DF4 0x00000df4
+
+#define NV50_3D_WINDOW_OFFSET_X 0x00000df8
+
+#define NV50_3D_WINDOW_OFFSET_Y 0x00000dfc
+
+#define NV50_3D_SCISSOR_ENABLE(i0) (0x00000e00 + 0x10*(i0))
+#define NV50_3D_SCISSOR_ENABLE__ESIZE 0x00000010
+#define NV50_3D_SCISSOR_ENABLE__LEN 0x00000010
+
+#define NV50_3D_SCISSOR_HORIZ(i0) (0x00000e04 + 0x10*(i0))
+#define NV50_3D_SCISSOR_HORIZ__ESIZE 0x00000010
+#define NV50_3D_SCISSOR_HORIZ__LEN 0x00000010
+#define NV50_3D_SCISSOR_HORIZ_MIN__MASK 0x0000ffff
+#define NV50_3D_SCISSOR_HORIZ_MIN__SHIFT 0
+#define NV50_3D_SCISSOR_HORIZ_MAX__MASK 0xffff0000
+#define NV50_3D_SCISSOR_HORIZ_MAX__SHIFT 16
+
+#define NV50_3D_SCISSOR_VERT(i0) (0x00000e08 + 0x10*(i0))
+#define NV50_3D_SCISSOR_VERT__ESIZE 0x00000010
+#define NV50_3D_SCISSOR_VERT__LEN 0x00000010
+#define NV50_3D_SCISSOR_VERT_MIN__MASK 0x0000ffff
+#define NV50_3D_SCISSOR_VERT_MIN__SHIFT 0
+#define NV50_3D_SCISSOR_VERT_MAX__MASK 0xffff0000
+#define NV50_3D_SCISSOR_VERT_MAX__SHIFT 16
+
+#define NV50_3D_CB_ADDR 0x00000f00
+#define NV50_3D_CB_ADDR_ID__MASK 0x003fff00
+#define NV50_3D_CB_ADDR_ID__SHIFT 8
+#define NV50_3D_CB_ADDR_BUFFER__MASK 0x0000007f
+#define NV50_3D_CB_ADDR_BUFFER__SHIFT 0
+
+#define NV50_3D_CB_DATA(i0) (0x00000f04 + 0x4*(i0))
+#define NV50_3D_CB_DATA__ESIZE 0x00000004
+#define NV50_3D_CB_DATA__LEN 0x00000010
+
+#define NV50_3D_LOCAL_WARPS_LOG_ALLOC 0x00000f44
+
+#define NV50_3D_LOCAL_WARPS_NO_CLAMP 0x00000f48
+
+#define NV50_3D_STACK_WARPS_LOG_ALLOC 0x00000f4c
+
+#define NV50_3D_STACK_WARPS_NO_CLAMP 0x00000f50
+
+#define NV50_3D_STENCIL_BACK_FUNC_REF 0x00000f54
+
+#define NV50_3D_STENCIL_BACK_MASK 0x00000f58
+
+#define NV50_3D_STENCIL_BACK_FUNC_MASK 0x00000f5c
+
+#define NV50_3D_UNK0F60(i0) (0x00000f60 + 0x4*(i0))
+#define NV50_3D_UNK0F60__ESIZE 0x00000004
+#define NV50_3D_UNK0F60__LEN 0x00000004
+
+#define NV50_3D_GP_ADDRESS_HIGH 0x00000f70
+
+#define NV50_3D_GP_ADDRESS_LOW 0x00000f74
+
+#define NV50_3D_UNK0F78 0x00000f78
+
+#define NV50_3D_VP_ADDRESS_HIGH 0x00000f7c
+
+#define NV50_3D_VP_ADDRESS_LOW 0x00000f80
+
+#define NV50_3D_VERTEX_RUNOUT_ADDRESS_HIGH 0x00000f84
+
+#define NV50_3D_VERTEX_RUNOUT_ADDRESS_LOW 0x00000f88
+
+#define NV50_3D_UNK0F8C 0x00000f8c
+
+#define NV50_3D_COLOR_MASK_COMMON 0x00000f90
+
+#define NV50_3D_UNK0F94 0x00000f94
+
+#define NV50_3D_UNK0F98 0x00000f98
+
+#define NV50_3D_DEPTH_BOUNDS(i0) (0x00000f9c + 0x4*(i0))
+#define NV50_3D_DEPTH_BOUNDS__ESIZE 0x00000004
+#define NV50_3D_DEPTH_BOUNDS__LEN 0x00000002
+
+#define NV50_3D_FP_ADDRESS_HIGH 0x00000fa4
+
+#define NV50_3D_FP_ADDRESS_LOW 0x00000fa8
+
+#define NV50_3D_UNK0FAC 0x00000fac
+#define NV50_3D_UNK0FAC_UNK0 0x00000001
+#define NVA0_3D_UNK0FAC_UNK2 0x00000002
+#define NV50_3D_UNK0FAC_UNK1__MASK 0x000ffff0
+#define NV50_3D_UNK0FAC_UNK1__SHIFT 4
+
+#define NV50_3D_UNK0FB0 0x00000fb0
+
+#define NV50_3D_UNK0FB4 0x00000fb4
+
+#define NV50_3D_UNK0FB8 0x00000fb8
+
+#define NV50_3D_MSAA_MASK(i0) (0x00000fbc + 0x4*(i0))
+#define NV50_3D_MSAA_MASK__ESIZE 0x00000004
+#define NV50_3D_MSAA_MASK__LEN 0x00000004
+
+#define NV50_3D_CLIPID_ADDRESS_HIGH 0x00000fcc
+
+#define NV50_3D_CLIPID_ADDRESS_LOW 0x00000fd0
+
+#define NV50_3D_SEMANTIC_VIEWPORT 0x00000fd4
+#define NV50_3D_SEMANTIC_VIEWPORT_VIEWPORT_ID__MASK 0x000000ff
+#define NV50_3D_SEMANTIC_VIEWPORT_VIEWPORT_ID__SHIFT 0
+
+#define NV50_3D_UNK0FD8 0x00000fd8
+#define NV50_3D_UNK0FD8_UNK0 0x00000001
+#define NV50_3D_UNK0FD8_UNK1 0x00000010
+
+#define NV50_3D_UNK0FDC 0x00000fdc
+
+#define NV50_3D_ZETA_ADDRESS_HIGH 0x00000fe0
+
+#define NV50_3D_ZETA_ADDRESS_LOW 0x00000fe4
+
+#define NV50_3D_ZETA_FORMAT 0x00000fe8
+
+#define NV50_3D_ZETA_TILE_MODE 0x00000fec
+
+#define NV50_3D_ZETA_LAYER_STRIDE 0x00000ff0
+#define NV50_3D_ZETA_LAYER_STRIDE__SHR 2
+
+#define NV50_3D_SCREEN_SCISSOR_HORIZ 0x00000ff4
+#define NV50_3D_SCREEN_SCISSOR_HORIZ_W__MASK 0xffff0000
+#define NV50_3D_SCREEN_SCISSOR_HORIZ_W__SHIFT 16
+#define NV50_3D_SCREEN_SCISSOR_HORIZ_X__MASK 0x0000ffff
+#define NV50_3D_SCREEN_SCISSOR_HORIZ_X__SHIFT 0
+
+#define NV50_3D_SCREEN_SCISSOR_VERT 0x00000ff8
+#define NV50_3D_SCREEN_SCISSOR_VERT_H__MASK 0xffff0000
+#define NV50_3D_SCREEN_SCISSOR_VERT_H__SHIFT 16
+#define NV50_3D_SCREEN_SCISSOR_VERT_Y__MASK 0x0000ffff
+#define NV50_3D_SCREEN_SCISSOR_VERT_Y__SHIFT 0
+
+#define NV50_3D_UNK0FFC 0x00000ffc
+
+#define NV50_3D_VERTEX_ARRAY_PER_INSTANCE(i0) (0x00001000 + 0x4*(i0))
+#define NV50_3D_VERTEX_ARRAY_PER_INSTANCE__ESIZE 0x00000004
+#define NV50_3D_VERTEX_ARRAY_PER_INSTANCE__LEN 0x00000010
+
+#define NV50_3D_UNK1040(i0) (0x00001040 + 0x4*(i0))
+#define NV50_3D_UNK1040__ESIZE 0x00000004
+#define NV50_3D_UNK1040__LEN 0x00000010
+
+#define NV50_3D_VERTEX_ARRAY_LIMIT_HIGH(i0) (0x00001080 + 0x8*(i0))
+#define NV50_3D_VERTEX_ARRAY_LIMIT_HIGH__ESIZE 0x00000008
+#define NV50_3D_VERTEX_ARRAY_LIMIT_HIGH__LEN 0x00000010
+
+#define NV50_3D_VERTEX_ARRAY_LIMIT_LOW(i0) (0x00001084 + 0x8*(i0))
+#define NV50_3D_VERTEX_ARRAY_LIMIT_LOW__ESIZE 0x00000008
+#define NV50_3D_VERTEX_ARRAY_LIMIT_LOW__LEN 0x00000010
+
+#define NV50_3D_UNK1100 0x00001100
+
+#define NV84_3D_UNK1104 0x00001104
+#define NV84_3D_UNK1104_0__MASK 0x0000ffff
+#define NV84_3D_UNK1104_0__SHIFT 0
+#define NV84_3D_UNK1104_0__MAX 0x00002000
+#define NV84_3D_UNK1104_0__ALIGN 0x00000040
+#define NV84_3D_UNK1104_1__MASK 0xffff0000
+#define NV84_3D_UNK1104_1__SHIFT 16
+#define NV84_3D_UNK1104_1__MAX 0x00002000
+#define NV84_3D_UNK1104_1__ALIGN 0x00000040
+
+#define NV84_3D_UNK1108 0x00001108
+#define NV84_3D_UNK1108_0 0x00000001
+#define NV84_3D_UNK1108_1 0x00000010
+
+#define NV84_3D_UNK110C 0x0000110c
+
+#define NV84_3D_UNK1110 0x00001110
+
+#define NV84_3D_WRCACHE_FLUSH 0x00001114
+
+#define NV84_3D_VERTEX_ID_BASE 0x00001118
+
+#define NV84_3D_PRIMITIVE_ID 0x0000111c
+
+#define NVA3_3D_VTX_ATTR_MASK_UNK0DD0_ALT(i0) (0x00001120 + 0x4*(i0))
+#define NVA3_3D_VTX_ATTR_MASK_UNK0DD0_ALT__ESIZE 0x00000004
+#define NVA3_3D_VTX_ATTR_MASK_UNK0DD0_ALT__LEN 0x00000004
+
+#define NVA3_3D_VP_ATTR_EN_ALT(i0) (0x00001130 + 0x4*(i0))
+#define NVA3_3D_VP_ATTR_EN_ALT__ESIZE 0x00000004
+#define NVA3_3D_VP_ATTR_EN_ALT__LEN 0x00000004
+#define NVA3_3D_VP_ATTR_EN_ALT_7__MASK 0xf0000000
+#define NVA3_3D_VP_ATTR_EN_ALT_7__SHIFT 28
+#define NVA3_3D_VP_ATTR_EN_ALT_7_X 0x10000000
+#define NVA3_3D_VP_ATTR_EN_ALT_7_Y 0x20000000
+#define NVA3_3D_VP_ATTR_EN_ALT_7_Z 0x40000000
+#define NVA3_3D_VP_ATTR_EN_ALT_7_W 0x80000000
+#define NVA3_3D_VP_ATTR_EN_ALT_6__MASK 0x0f000000
+#define NVA3_3D_VP_ATTR_EN_ALT_6__SHIFT 24
+#define NVA3_3D_VP_ATTR_EN_ALT_6_X 0x01000000
+#define NVA3_3D_VP_ATTR_EN_ALT_6_Y 0x02000000
+#define NVA3_3D_VP_ATTR_EN_ALT_6_Z 0x04000000
+#define NVA3_3D_VP_ATTR_EN_ALT_6_W 0x08000000
+#define NVA3_3D_VP_ATTR_EN_ALT_5__MASK 0x00f00000
+#define NVA3_3D_VP_ATTR_EN_ALT_5__SHIFT 20
+#define NVA3_3D_VP_ATTR_EN_ALT_5_X 0x00100000
+#define NVA3_3D_VP_ATTR_EN_ALT_5_Y 0x00200000
+#define NVA3_3D_VP_ATTR_EN_ALT_5_Z 0x00400000
+#define NVA3_3D_VP_ATTR_EN_ALT_5_W 0x00800000
+#define NVA3_3D_VP_ATTR_EN_ALT_4__MASK 0x000f0000
+#define NVA3_3D_VP_ATTR_EN_ALT_4__SHIFT 16
+#define NVA3_3D_VP_ATTR_EN_ALT_4_X 0x00010000
+#define NVA3_3D_VP_ATTR_EN_ALT_4_Y 0x00020000
+#define NVA3_3D_VP_ATTR_EN_ALT_4_Z 0x00040000
+#define NVA3_3D_VP_ATTR_EN_ALT_4_W 0x00080000
+#define NVA3_3D_VP_ATTR_EN_ALT_3__MASK 0x0000f000
+#define NVA3_3D_VP_ATTR_EN_ALT_3__SHIFT 12
+#define NVA3_3D_VP_ATTR_EN_ALT_3_X 0x00001000
+#define NVA3_3D_VP_ATTR_EN_ALT_3_Y 0x00002000
+#define NVA3_3D_VP_ATTR_EN_ALT_3_Z 0x00004000
+#define NVA3_3D_VP_ATTR_EN_ALT_3_W 0x00008000
+#define NVA3_3D_VP_ATTR_EN_ALT_2__MASK 0x00000f00
+#define NVA3_3D_VP_ATTR_EN_ALT_2__SHIFT 8
+#define NVA3_3D_VP_ATTR_EN_ALT_2_X 0x00000100
+#define NVA3_3D_VP_ATTR_EN_ALT_2_Y 0x00000200
+#define NVA3_3D_VP_ATTR_EN_ALT_2_Z 0x00000400
+#define NVA3_3D_VP_ATTR_EN_ALT_2_W 0x00000800
+#define NVA3_3D_VP_ATTR_EN_ALT_1__MASK 0x000000f0
+#define NVA3_3D_VP_ATTR_EN_ALT_1__SHIFT 4
+#define NVA3_3D_VP_ATTR_EN_ALT_1_X 0x00000010
+#define NVA3_3D_VP_ATTR_EN_ALT_1_Y 0x00000020
+#define NVA3_3D_VP_ATTR_EN_ALT_1_Z 0x00000040
+#define NVA3_3D_VP_ATTR_EN_ALT_1_W 0x00000080
+#define NVA3_3D_VP_ATTR_EN_ALT_0__MASK 0x0000000f
+#define NVA3_3D_VP_ATTR_EN_ALT_0__SHIFT 0
+#define NVA3_3D_VP_ATTR_EN_ALT_0_X 0x00000001
+#define NVA3_3D_VP_ATTR_EN_ALT_0_Y 0x00000002
+#define NVA3_3D_VP_ATTR_EN_ALT_0_Z 0x00000004
+#define NVA3_3D_VP_ATTR_EN_ALT_0_W 0x00000008
+
+#define NVA3_3D_UNK1140 0x00001140
+
+#define NVA0_3D_UNK1144 0x00001144
+
+#define NVA0_3D_VTX_ATTR_DEFINE 0x0000114c
+#define NVA0_3D_VTX_ATTR_DEFINE_ATTR__MASK 0x000000ff
+#define NVA0_3D_VTX_ATTR_DEFINE_ATTR__SHIFT 0
+#define NVA0_3D_VTX_ATTR_DEFINE_COMP__MASK 0x00000700
+#define NVA0_3D_VTX_ATTR_DEFINE_COMP__SHIFT 8
+#define NVA0_3D_VTX_ATTR_DEFINE_COMP__MIN 0x00000001
+#define NVA0_3D_VTX_ATTR_DEFINE_COMP__MAX 0x00000004
+#define NVA0_3D_VTX_ATTR_DEFINE_SIZE__MASK 0x00007000
+#define NVA0_3D_VTX_ATTR_DEFINE_SIZE__SHIFT 12
+#define NVA0_3D_VTX_ATTR_DEFINE_SIZE_8 0x00001000
+#define NVA0_3D_VTX_ATTR_DEFINE_SIZE_16 0x00002000
+#define NVA0_3D_VTX_ATTR_DEFINE_SIZE_32 0x00004000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE__MASK 0x00070000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE__SHIFT 16
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_SNORM 0x00010000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_UNORM 0x00020000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_SINT 0x00030000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_UINT 0x00040000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_USCALED 0x00050000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_SSCALED 0x00060000
+#define NVA0_3D_VTX_ATTR_DEFINE_TYPE_FLOAT 0x00070000
+
+#define NVA0_3D_VTX_ATTR_DATA(i0) (0x00001150 + 0x4*(i0))
+#define NVA0_3D_VTX_ATTR_DATA__ESIZE 0x00000004
+#define NVA0_3D_VTX_ATTR_DATA__LEN 0x00000004
+
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT(i0) (0x00001160 + 0x4*(i0))
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT__ESIZE 0x00000004
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT__LEN 0x00000020
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_BUFFER__MASK 0x0000001f
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_BUFFER__SHIFT 0
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_CONST 0x00000040
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_OFFSET__MASK 0x001fff80
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_OFFSET__SHIFT 7
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT__MASK 0x07e00000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT__SHIFT 21
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_32_32_32_32 0x00200000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_32_32_32 0x00400000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_16_16_16_16 0x00600000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_32_32 0x00800000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_16_16_16 0x00a00000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_8_8_8_8 0x01400000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_16_16 0x01e00000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_32 0x02400000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_8_8_8 0x02600000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_8_8 0x03000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_16 0x03600000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_8 0x03a00000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_FORMAT_10_10_10_2 0x06000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE__MASK 0x38000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE__SHIFT 27
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_SNORM 0x08000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_UNORM 0x10000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_SINT 0x18000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_UINT 0x20000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_USCALED 0x28000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_SSCALED 0x30000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_TYPE_FLOAT 0x38000000
+#define NVA3_3D_VERTEX_ARRAY_ATTRIB_ALT_BGRA 0x80000000
+
+#define NV50_3D_RT_CONTROL 0x0000121c
+#define NV50_3D_RT_CONTROL_COUNT__MASK 0x0000000f
+#define NV50_3D_RT_CONTROL_COUNT__SHIFT 0
+#define NV50_3D_RT_CONTROL_MAP0__MASK 0x00000070
+#define NV50_3D_RT_CONTROL_MAP0__SHIFT 4
+#define NV50_3D_RT_CONTROL_MAP1__MASK 0x00000380
+#define NV50_3D_RT_CONTROL_MAP1__SHIFT 7
+#define NV50_3D_RT_CONTROL_MAP2__MASK 0x00001c00
+#define NV50_3D_RT_CONTROL_MAP2__SHIFT 10
+#define NV50_3D_RT_CONTROL_MAP3__MASK 0x0000e000
+#define NV50_3D_RT_CONTROL_MAP3__SHIFT 13
+#define NV50_3D_RT_CONTROL_MAP4__MASK 0x00070000
+#define NV50_3D_RT_CONTROL_MAP4__SHIFT 16
+#define NV50_3D_RT_CONTROL_MAP5__MASK 0x00380000
+#define NV50_3D_RT_CONTROL_MAP5__SHIFT 19
+#define NV50_3D_RT_CONTROL_MAP6__MASK 0x01c00000
+#define NV50_3D_RT_CONTROL_MAP6__SHIFT 22
+#define NV50_3D_RT_CONTROL_MAP7__MASK 0x0e000000
+#define NV50_3D_RT_CONTROL_MAP7__SHIFT 25
+
+#define NV50_3D_UNK1220 0x00001220
+
+#define NV50_3D_RT_ARRAY_MODE 0x00001224
+#define NV50_3D_RT_ARRAY_MODE_LAYERS__MASK 0x0000ffff
+#define NV50_3D_RT_ARRAY_MODE_LAYERS__SHIFT 0
+#define NV50_3D_RT_ARRAY_MODE_MODE__MASK 0x00010000
+#define NV50_3D_RT_ARRAY_MODE_MODE__SHIFT 16
+#define NV50_3D_RT_ARRAY_MODE_MODE_2D_ARRAY 0x00000000
+#define NV50_3D_RT_ARRAY_MODE_MODE_3D 0x00010000
+
+#define NV50_3D_ZETA_HORIZ 0x00001228
+
+#define NV50_3D_ZETA_VERT 0x0000122c
+
+#define NV50_3D_ZETA_ARRAY_MODE 0x00001230
+#define NV50_3D_ZETA_ARRAY_MODE_LAYERS__MASK 0x0000ffff
+#define NV50_3D_ZETA_ARRAY_MODE_LAYERS__SHIFT 0
+#define NV50_3D_ZETA_ARRAY_MODE_UNK 0x00010000
+
+#define NV50_3D_LINKED_TSC 0x00001234
+
+#define NV50_3D_UNK1238 0x00001238
+
+#define NVA0_3D_DRAW_TFB_BYTES 0x0000123c
+
+#define NV50_3D_RT_HORIZ(i0) (0x00001240 + 0x8*(i0))
+#define NV50_3D_RT_HORIZ__ESIZE 0x00000008
+#define NV50_3D_RT_HORIZ__LEN 0x00000008
+#define NV50_3D_RT_HORIZ_WIDTH__MASK 0x0fffffff
+#define NV50_3D_RT_HORIZ_WIDTH__SHIFT 0
+#define NV50_3D_RT_HORIZ_LINEAR 0x80000000
+
+#define NV50_3D_RT_VERT(i0) (0x00001244 + 0x8*(i0))
+#define NV50_3D_RT_VERT__ESIZE 0x00000008
+#define NV50_3D_RT_VERT__LEN 0x00000008
+
+#define NV50_3D_CB_DEF_ADDRESS_HIGH 0x00001280
+
+#define NV50_3D_CB_DEF_ADDRESS_LOW 0x00001284
+
+#define NV50_3D_CB_DEF_SET 0x00001288
+#define NV50_3D_CB_DEF_SET_SIZE__MASK 0x0000ffff
+#define NV50_3D_CB_DEF_SET_SIZE__SHIFT 0
+#define NV50_3D_CB_DEF_SET_BUFFER__MASK 0x007f0000
+#define NV50_3D_CB_DEF_SET_BUFFER__SHIFT 16
+
+#define NV50_3D_UNK128C 0x0000128c
+#define NV50_3D_UNK128C_0__MASK 0x00000003
+#define NV50_3D_UNK128C_0__SHIFT 0
+#define NV50_3D_UNK128C_1__MASK 0x00000030
+#define NV50_3D_UNK128C_1__SHIFT 4
+#define NV50_3D_UNK128C_2__MASK 0x00000300
+#define NV50_3D_UNK128C_2__SHIFT 8
+#define NV50_3D_UNK128C_3__MASK 0x00003000
+#define NV50_3D_UNK128C_3__SHIFT 12
+
+#define NV50_3D_CALL_LIMIT_LOG 0x00001290
+#define NV50_3D_CALL_LIMIT_LOG_VP__MASK 0x0000000f
+#define NV50_3D_CALL_LIMIT_LOG_VP__SHIFT 0
+#define NV50_3D_CALL_LIMIT_LOG_GP__MASK 0x000000f0
+#define NV50_3D_CALL_LIMIT_LOG_GP__SHIFT 4
+#define NV50_3D_CALL_LIMIT_LOG_FP__MASK 0x00000f00
+#define NV50_3D_CALL_LIMIT_LOG_FP__SHIFT 8
+
+#define NV50_3D_STRMOUT_BUFFERS_CTRL 0x00001294
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED 0x00000001
+#define NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE__MASK 0x00000002
+#define NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE__SHIFT 1
+#define NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_PRIMITIVES 0x00000000
+#define NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET 0x00000002
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__MASK 0x000000f0
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT 4
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MASK 0x000fff00
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT 8
+#define NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX 0x00000800
+
+#define NV50_3D_FP_RESULT_COUNT 0x00001298
+
+#define NV50_3D_VTX_UNK129C 0x0000129c
+
+#define NV50_3D_UNK12A0 0x000012a0
+
+#define NV50_3D_UNK12A8 0x000012a8
+#define NV50_3D_UNK12A8_UNK1 0x00000001
+#define NV50_3D_UNK12A8_UNK2__MASK 0x000ffff0
+#define NV50_3D_UNK12A8_UNK2__SHIFT 4
+
+#define NV50_3D_UNK12AC 0x000012ac
+
+#define NV50_3D_UNK12B0 0x000012b0
+#define NV50_3D_UNK12B0_UNK0__MASK 0x000000ff
+#define NV50_3D_UNK12B0_UNK0__SHIFT 0
+#define NV50_3D_UNK12B0_UNK1__MASK 0x0000ff00
+#define NV50_3D_UNK12B0_UNK1__SHIFT 8
+#define NV50_3D_UNK12B0_UNK2__MASK 0x00ff0000
+#define NV50_3D_UNK12B0_UNK2__SHIFT 16
+#define NV50_3D_UNK12B0_UNK3__MASK 0xff000000
+#define NV50_3D_UNK12B0_UNK3__SHIFT 24
+#define NV50_3D_UNK12B0_UNK3__MAX 0x00000080
+
+#define NV50_3D_UNK12B4 0x000012b4
+
+#define NV50_3D_UNK12B8 0x000012b8
+
+#define NV50_3D_DEPTH_TEST_ENABLE 0x000012cc
+
+#define NV50_3D_D3D_FILL_MODE 0x000012d0
+#define NV50_3D_D3D_FILL_MODE_POINT 0x00000001
+#define NV50_3D_D3D_FILL_MODE_WIREFRAME 0x00000002
+#define NV50_3D_D3D_FILL_MODE_SOLID 0x00000003
+
+#define NV50_3D_SHADE_MODEL 0x000012d4
+#define NV50_3D_SHADE_MODEL_FLAT 0x00001d00
+#define NV50_3D_SHADE_MODEL_SMOOTH 0x00001d01
+
+#define NV50_3D_LOCAL_ADDRESS_HIGH 0x000012d8
+
+#define NV50_3D_LOCAL_ADDRESS_LOW 0x000012dc
+
+#define NV50_3D_LOCAL_SIZE_LOG 0x000012e0
+
+#define NV50_3D_BLEND_INDEPENDENT 0x000012e4
+
+#define NV50_3D_DEPTH_WRITE_ENABLE 0x000012e8
+
+#define NV50_3D_ALPHA_TEST_ENABLE 0x000012ec
+
+#define NV50_3D_PM_SET(i0) (0x000012f0 + 0x4*(i0))
+#define NV50_3D_PM_SET__ESIZE 0x00000004
+#define NV50_3D_PM_SET__LEN 0x00000004
+
+#define NV50_3D_VB_ELEMENT_U8_SETUP 0x00001300
+#define NV50_3D_VB_ELEMENT_U8_SETUP_OFFSET__MASK 0xc0000000
+#define NV50_3D_VB_ELEMENT_U8_SETUP_OFFSET__SHIFT 30
+#define NV50_3D_VB_ELEMENT_U8_SETUP_COUNT__MASK 0x3fffffff
+#define NV50_3D_VB_ELEMENT_U8_SETUP_COUNT__SHIFT 0
+
+#define NV50_3D_VB_ELEMENT_U8 0x00001304
+#define NV50_3D_VB_ELEMENT_U8_I0__MASK 0x000000ff
+#define NV50_3D_VB_ELEMENT_U8_I0__SHIFT 0
+#define NV50_3D_VB_ELEMENT_U8_I1__MASK 0x0000ff00
+#define NV50_3D_VB_ELEMENT_U8_I1__SHIFT 8
+#define NV50_3D_VB_ELEMENT_U8_I2__MASK 0x00ff0000
+#define NV50_3D_VB_ELEMENT_U8_I2__SHIFT 16
+#define NV50_3D_VB_ELEMENT_U8_I3__MASK 0xff000000
+#define NV50_3D_VB_ELEMENT_U8_I3__SHIFT 24
+
+#define NV50_3D_D3D_CULL_MODE 0x00001308
+#define NV50_3D_D3D_CULL_MODE_NONE 0x00000001
+#define NV50_3D_D3D_CULL_MODE_FRONT 0x00000002
+#define NV50_3D_D3D_CULL_MODE_BACK 0x00000003
+
+#define NV50_3D_DEPTH_TEST_FUNC 0x0000130c
+#define NV50_3D_DEPTH_TEST_FUNC_NEVER 0x00000200
+#define NV50_3D_DEPTH_TEST_FUNC_LESS 0x00000201
+#define NV50_3D_DEPTH_TEST_FUNC_EQUAL 0x00000202
+#define NV50_3D_DEPTH_TEST_FUNC_LEQUAL 0x00000203
+#define NV50_3D_DEPTH_TEST_FUNC_GREATER 0x00000204
+#define NV50_3D_DEPTH_TEST_FUNC_NOTEQUAL 0x00000205
+#define NV50_3D_DEPTH_TEST_FUNC_GEQUAL 0x00000206
+#define NV50_3D_DEPTH_TEST_FUNC_ALWAYS 0x00000207
+
+#define NV50_3D_ALPHA_TEST_REF 0x00001310
+
+#define NV50_3D_ALPHA_TEST_FUNC 0x00001314
+#define NV50_3D_ALPHA_TEST_FUNC_NEVER 0x00000200
+#define NV50_3D_ALPHA_TEST_FUNC_LESS 0x00000201
+#define NV50_3D_ALPHA_TEST_FUNC_EQUAL 0x00000202
+#define NV50_3D_ALPHA_TEST_FUNC_LEQUAL 0x00000203
+#define NV50_3D_ALPHA_TEST_FUNC_GREATER 0x00000204
+#define NV50_3D_ALPHA_TEST_FUNC_NOTEQUAL 0x00000205
+#define NV50_3D_ALPHA_TEST_FUNC_GEQUAL 0x00000206
+#define NV50_3D_ALPHA_TEST_FUNC_ALWAYS 0x00000207
+
+#define NVA0_3D_DRAW_TFB_STRIDE 0x00001318
+#define NVA0_3D_DRAW_TFB_STRIDE__MIN 0x00000001
+#define NVA0_3D_DRAW_TFB_STRIDE__MAX 0x00000fff
+
+#define NV50_3D_BLEND_COLOR(i0) (0x0000131c + 0x4*(i0))
+#define NV50_3D_BLEND_COLOR__ESIZE 0x00000004
+#define NV50_3D_BLEND_COLOR__LEN 0x00000004
+
+#define NV50_3D_UNK132C 0x0000132c
+
+#define NV50_3D_TSC_FLUSH 0x00001330
+#define NV50_3D_TSC_FLUSH_SPECIFIC 0x00000001
+#define NV50_3D_TSC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NV50_3D_TSC_FLUSH_ENTRY__SHIFT 4
+
+#define NV50_3D_TIC_FLUSH 0x00001334
+#define NV50_3D_TIC_FLUSH_SPECIFIC 0x00000001
+#define NV50_3D_TIC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NV50_3D_TIC_FLUSH_ENTRY__SHIFT 4
+
+#define NV50_3D_TEX_CACHE_CTL 0x00001338
+#define NV50_3D_TEX_CACHE_CTL_UNK1__MASK 0x00000030
+#define NV50_3D_TEX_CACHE_CTL_UNK1__SHIFT 4
+
+#define NV50_3D_BLEND_SEPARATE_ALPHA 0x0000133c
+
+#define NV50_3D_BLEND_EQUATION_RGB 0x00001340
+#define NV50_3D_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006
+#define NV50_3D_BLEND_EQUATION_RGB_MIN 0x00008007
+#define NV50_3D_BLEND_EQUATION_RGB_MAX 0x00008008
+#define NV50_3D_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a
+#define NV50_3D_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NV50_3D_BLEND_FUNC_SRC_RGB 0x00001344
+
+#define NV50_3D_BLEND_FUNC_DST_RGB 0x00001348
+
+#define NV50_3D_BLEND_EQUATION_ALPHA 0x0000134c
+#define NV50_3D_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006
+#define NV50_3D_BLEND_EQUATION_ALPHA_MIN 0x00008007
+#define NV50_3D_BLEND_EQUATION_ALPHA_MAX 0x00008008
+#define NV50_3D_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a
+#define NV50_3D_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NV50_3D_BLEND_FUNC_SRC_ALPHA 0x00001350
+
+#define NV50_3D_UNK1354 0x00001354
+
+#define NV50_3D_BLEND_FUNC_DST_ALPHA 0x00001358
+
+#define NV50_3D_BLEND_ENABLE_COMMON 0x0000135c
+
+#define NV50_3D_BLEND_ENABLE(i0) (0x00001360 + 0x4*(i0))
+#define NV50_3D_BLEND_ENABLE__ESIZE 0x00000004
+#define NV50_3D_BLEND_ENABLE__LEN 0x00000008
+
+#define NV50_3D_STENCIL_ENABLE 0x00001380
+
+#define NV50_3D_STENCIL_FRONT_OP_FAIL 0x00001384
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL 0x00001388
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS 0x0000138c
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC 0x00001390
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206
+#define NV50_3D_STENCIL_FRONT_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NV50_3D_STENCIL_FRONT_FUNC_REF 0x00001394
+
+#define NV50_3D_STENCIL_FRONT_MASK 0x00001398
+
+#define NV50_3D_STENCIL_FRONT_FUNC_MASK 0x0000139c
+
+#define NV50_3D_UNK13A0 0x000013a0
+
+#define NVA0_3D_DRAW_TFB_BASE 0x000013a4
+
+#define NV50_3D_FRAG_COLOR_CLAMP_EN 0x000013a8
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_0 0x00000001
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_1 0x00000010
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_2 0x00000100
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_3 0x00001000
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_4 0x00010000
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_5 0x00100000
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_6 0x01000000
+#define NV50_3D_FRAG_COLOR_CLAMP_EN_7 0x10000000
+
+#define NV50_3D_SCREEN_Y_CONTROL 0x000013ac
+#define NV50_3D_SCREEN_Y_CONTROL_Y_NEGATE 0x00000001
+#define NV50_3D_SCREEN_Y_CONTROL_TRIANGLE_RAST_FLIP 0x00000010
+
+#define NV50_3D_LINE_WIDTH 0x000013b0
+
+#define NV50_3D_TEX_LIMITS(i0) (0x000013b4 + 0x4*(i0))
+#define NV50_3D_TEX_LIMITS__ESIZE 0x00000004
+#define NV50_3D_TEX_LIMITS__LEN 0x00000003
+#define NV50_3D_TEX_LIMITS_SAMPLERS_LOG2__MASK 0x0000000f
+#define NV50_3D_TEX_LIMITS_SAMPLERS_LOG2__SHIFT 0
+#define NV50_3D_TEX_LIMITS_SAMPLERS_LOG2__MIN 0x00000000
+#define NV50_3D_TEX_LIMITS_SAMPLERS_LOG2__MAX 0x00000004
+#define NV50_3D_TEX_LIMITS_TEXTURES_LOG2__MASK 0x000000f0
+#define NV50_3D_TEX_LIMITS_TEXTURES_LOG2__SHIFT 4
+#define NV50_3D_TEX_LIMITS_TEXTURES_LOG2__MIN 0x00000000
+#define NV50_3D_TEX_LIMITS_TEXTURES_LOG2__MAX 0x00000007
+
+#define NV50_3D_POINT_COORD_REPLACE_MAP(i0) (0x000013c0 + 0x4*(i0))
+#define NV50_3D_POINT_COORD_REPLACE_MAP__ESIZE 0x00000004
+#define NV50_3D_POINT_COORD_REPLACE_MAP__LEN 0x00000010
+
+#define NV50_3D_UNK1400_LANES 0x00001400
+
+#define NV50_3D_UNK1404 0x00001404
+
+#define NV50_3D_UNK1408 0x00001408
+
+#define NV50_3D_VP_START_ID 0x0000140c
+
+#define NV50_3D_GP_START_ID 0x00001410
+
+#define NV50_3D_FP_START_ID 0x00001414
+
+#define NVA3_3D_UNK1418 0x00001418
+
+#define NV50_3D_UNK141C 0x0000141c
+
+#define NV50_3D_GP_VERTEX_OUTPUT_COUNT 0x00001420
+#define NV50_3D_GP_VERTEX_OUTPUT_COUNT__MIN 0x00000001
+#define NV50_3D_GP_VERTEX_OUTPUT_COUNT__MAX 0x00000400
+
+#define NV50_3D_VERTEX_ARRAY_FLUSH 0x0000142c
+
+#define NV50_3D_UNK1430 0x00001430
+#define NV50_3D_UNK1430_UNK0 0x00000010
+#define NV50_3D_UNK1430_UNK1 0x00000100
+
+#define NV50_3D_VB_ELEMENT_BASE 0x00001434
+
+#define NV50_3D_VB_INSTANCE_BASE 0x00001438
+
+#define NV50_3D_CLEAR_FLAGS 0x0000143c
+#define NV50_3D_CLEAR_FLAGS_STENCIL_MASK 0x00000001
+#define NV50_3D_CLEAR_FLAGS_CLEAR_RECT__MASK 0x00000010
+#define NV50_3D_CLEAR_FLAGS_CLEAR_RECT__SHIFT 4
+#define NV50_3D_CLEAR_FLAGS_CLEAR_RECT_SCISSOR 0x00000000
+#define NV50_3D_CLEAR_FLAGS_CLEAR_RECT_VIEWPORT 0x00000010
+
+#define NV50_3D_CODE_CB_FLUSH 0x00001440
+
+#define NV50_3D_BIND_TSC(i0) (0x00001444 + 0x8*(i0))
+#define NV50_3D_BIND_TSC__ESIZE 0x00000008
+#define NV50_3D_BIND_TSC__LEN 0x00000003
+#define NV50_3D_BIND_TSC_VALID 0x00000001
+#define NV50_3D_BIND_TSC_SAMPLER__MASK 0x000000f0
+#define NV50_3D_BIND_TSC_SAMPLER__SHIFT 4
+#define NV50_3D_BIND_TSC_TSC__MASK 0x001ff000
+#define NV50_3D_BIND_TSC_TSC__SHIFT 12
+
+#define NV50_3D_BIND_TIC(i0) (0x00001448 + 0x8*(i0))
+#define NV50_3D_BIND_TIC__ESIZE 0x00000008
+#define NV50_3D_BIND_TIC__LEN 0x00000003
+#define NV50_3D_BIND_TIC_VALID 0x00000001
+#define NV50_3D_BIND_TIC_TEXTURE__MASK 0x000001fe
+#define NV50_3D_BIND_TIC_TEXTURE__SHIFT 1
+#define NV50_3D_BIND_TIC_TIC__MASK 0x7ffffe00
+#define NV50_3D_BIND_TIC_TIC__SHIFT 9
+
+#define NV50_3D_BIND_TSC2(i0) (0x00001468 + 0x8*(i0))
+#define NV50_3D_BIND_TSC2__ESIZE 0x00000008
+#define NV50_3D_BIND_TSC2__LEN 0x00000003
+#define NV50_3D_BIND_TSC2_VALID 0x00000001
+#define NV50_3D_BIND_TSC2_SAMPLER__MASK 0x00000010
+#define NV50_3D_BIND_TSC2_SAMPLER__SHIFT 4
+#define NV50_3D_BIND_TSC2_TSC__MASK 0x001ff000
+#define NV50_3D_BIND_TSC2_TSC__SHIFT 12
+
+#define NV50_3D_BIND_TIC2(i0) (0x0000146c + 0x8*(i0))
+#define NV50_3D_BIND_TIC2__ESIZE 0x00000008
+#define NV50_3D_BIND_TIC2__LEN 0x00000003
+#define NV50_3D_BIND_TIC2_VALID 0x00000001
+#define NV50_3D_BIND_TIC2_TEXTURE__MASK 0x00000002
+#define NV50_3D_BIND_TIC2_TEXTURE__SHIFT 1
+#define NV50_3D_BIND_TIC2_TIC__MASK 0x7ffffe00
+#define NV50_3D_BIND_TIC2_TIC__SHIFT 9
+
+#define NV50_3D_STRMOUT_MAP(i0) (0x00001480 + 0x4*(i0))
+#define NV50_3D_STRMOUT_MAP__ESIZE 0x00000004
+#define NV50_3D_STRMOUT_MAP__LEN 0x00000020
+
+#define NV50_3D_CLIPID_HEIGHT 0x00001504
+#define NV50_3D_CLIPID_HEIGHT__MAX 0x00002000
+
+#define NV50_3D_CLIPID_FILL_RECT_HORIZ 0x00001508
+#define NV50_3D_CLIPID_FILL_RECT_HORIZ_LOW__MASK 0x0000ffff
+#define NV50_3D_CLIPID_FILL_RECT_HORIZ_LOW__SHIFT 0
+#define NV50_3D_CLIPID_FILL_RECT_HORIZ_HIGH__MASK 0xffff0000
+#define NV50_3D_CLIPID_FILL_RECT_HORIZ_HIGH__SHIFT 16
+
+#define NV50_3D_CLIPID_FILL_RECT_VERT 0x0000150c
+#define NV50_3D_CLIPID_FILL_RECT_VERT_LOW__MASK 0x0000ffff
+#define NV50_3D_CLIPID_FILL_RECT_VERT_LOW__SHIFT 0
+#define NV50_3D_CLIPID_FILL_RECT_VERT_HIGH__MASK 0xffff0000
+#define NV50_3D_CLIPID_FILL_RECT_VERT_HIGH__SHIFT 16
+
+#define NV50_3D_CLIP_DISTANCE_ENABLE 0x00001510
+#define NV50_3D_CLIP_DISTANCE_ENABLE_0 0x00000001
+#define NV50_3D_CLIP_DISTANCE_ENABLE_1 0x00000002
+#define NV50_3D_CLIP_DISTANCE_ENABLE_2 0x00000004
+#define NV50_3D_CLIP_DISTANCE_ENABLE_3 0x00000008
+#define NV50_3D_CLIP_DISTANCE_ENABLE_4 0x00000010
+#define NV50_3D_CLIP_DISTANCE_ENABLE_5 0x00000020
+#define NV50_3D_CLIP_DISTANCE_ENABLE_6 0x00000040
+#define NV50_3D_CLIP_DISTANCE_ENABLE_7 0x00000080
+
+#define NV50_3D_SAMPLECNT_ENABLE 0x00001514
+
+#define NV50_3D_POINT_SIZE 0x00001518
+
+#define NV50_3D_ZCULL_STATCTRS_ENABLE 0x0000151c
+
+#define NV50_3D_POINT_SPRITE_ENABLE 0x00001520
+
+#define NVA0_3D_UNK152C 0x0000152c
+#define NVA0_3D_UNK152C_UNK0 0x00000001
+#define NVA0_3D_UNK152C_UNK1 0x00000010
+#define NVA0_3D_UNK152C_UNK2 0x00000100
+#define NVA0_3D_UNK152C_UNK3__MASK 0x000ff000
+#define NVA0_3D_UNK152C_UNK3__SHIFT 12
+#define NVA0_3D_UNK152C_UNK3__MAX 0x00000028
+
+#define NV50_3D_COUNTER_RESET 0x00001530
+#define NV50_3D_COUNTER_RESET_SAMPLECNT 0x00000001
+#define NV50_3D_COUNTER_RESET_ZCULL_STATS 0x00000002
+#define NVA0_3D_COUNTER_RESET_STRMOUT_VERTICES 0x00000008
+#define NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK 0x00000010
+#define NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES 0x00000011
+#define NV50_3D_COUNTER_RESET_VFETCH_VERTICES 0x00000012
+#define NV50_3D_COUNTER_RESET_VFETCH_PRIMITIVES 0x00000013
+#define NV50_3D_COUNTER_RESET_VP_LAUNCHES 0x00000015
+#define NV50_3D_COUNTER_RESET_GP_LAUNCHES 0x0000001a
+#define NV50_3D_COUNTER_RESET_GP_PRIMITIVES_OUT 0x0000001b
+#define NV50_3D_COUNTER_RESET_RAST_PRIMITIVES_PRECLIP 0x0000001c
+#define NV50_3D_COUNTER_RESET_RAST_PRIMITIVES_POSTCLIP 0x0000001d
+#define NV50_3D_COUNTER_RESET_FP_PIXELS 0x0000001e
+
+#define NV50_3D_MULTISAMPLE_ENABLE 0x00001534
+
+#define NV50_3D_ZETA_ENABLE 0x00001538
+
+#define NV50_3D_MULTISAMPLE_CTRL 0x0000153c
+#define NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE 0x00000001
+#define NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE 0x00000010
+
+#define NV50_3D_NOPERSPECTIVE_BITMAP(i0) (0x00001540 + 0x4*(i0))
+#define NV50_3D_NOPERSPECTIVE_BITMAP__ESIZE 0x00000004
+#define NV50_3D_NOPERSPECTIVE_BITMAP__LEN 0x00000004
+
+#define NV50_3D_COND_ADDRESS_HIGH 0x00001550
+
+#define NV50_3D_COND_ADDRESS_LOW 0x00001554
+
+#define NV50_3D_COND_MODE 0x00001558
+#define NV50_3D_COND_MODE_NEVER 0x00000000
+#define NV50_3D_COND_MODE_ALWAYS 0x00000001
+#define NV50_3D_COND_MODE_RES_NON_ZERO 0x00000002
+#define NV50_3D_COND_MODE_EQUAL 0x00000003
+#define NV50_3D_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NV50_3D_TSC_ADDRESS_HIGH 0x0000155c
+
+#define NV50_3D_TSC_ADDRESS_LOW 0x00001560
+#define NV50_3D_TSC_ADDRESS_LOW__ALIGN 0x00000020
+
+#define NV50_3D_TSC_LIMIT 0x00001564
+#define NV50_3D_TSC_LIMIT__MAX 0x00001fff
+
+#define NV50_3D_UNK1568 0x00001568
+
+#define NV50_3D_POLYGON_OFFSET_FACTOR 0x0000156c
+
+#define NV50_3D_LINE_SMOOTH_ENABLE 0x00001570
+
+#define NV50_3D_TIC_ADDRESS_HIGH 0x00001574
+
+#define NV50_3D_TIC_ADDRESS_LOW 0x00001578
+
+#define NV50_3D_TIC_LIMIT 0x0000157c
+
+#define NV50_3D_PM_CONTROL(i0) (0x00001580 + 0x4*(i0))
+#define NV50_3D_PM_CONTROL__ESIZE 0x00000004
+#define NV50_3D_PM_CONTROL__LEN 0x00000004
+#define NV50_3D_PM_CONTROL_UNK0 0x00000001
+#define NV50_3D_PM_CONTROL_UNK1__MASK 0x00000070
+#define NV50_3D_PM_CONTROL_UNK1__SHIFT 4
+#define NV50_3D_PM_CONTROL_UNK2__MASK 0x00ffff00
+#define NV50_3D_PM_CONTROL_UNK2__SHIFT 8
+#define NV50_3D_PM_CONTROL_UNK3__MASK 0xff000000
+#define NV50_3D_PM_CONTROL_UNK3__SHIFT 24
+
+#define NV50_3D_ZCULL_REGION 0x00001590
+
+#define NV50_3D_STENCIL_TWO_SIDE_ENABLE 0x00001594
+
+#define NV50_3D_STENCIL_BACK_OP_FAIL 0x00001598
+#define NV50_3D_STENCIL_BACK_OP_FAIL_ZERO 0x00000000
+#define NV50_3D_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a
+#define NV50_3D_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00
+#define NV50_3D_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_BACK_OP_FAIL_INCR 0x00001e02
+#define NV50_3D_STENCIL_BACK_OP_FAIL_DECR 0x00001e03
+#define NV50_3D_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL 0x0000159c
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_BACK_OP_ZPASS 0x000015a0
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507
+#define NV50_3D_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508
+
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC 0x000015a4
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206
+#define NV50_3D_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NV50_3D_UNK15A8 0x000015a8
+#define NV50_3D_UNK15A8_UNK1__MASK 0x00000007
+#define NV50_3D_UNK15A8_UNK1__SHIFT 0
+#define NV50_3D_UNK15A8_UNK2__MASK 0x00000070
+#define NV50_3D_UNK15A8_UNK2__SHIFT 4
+
+#define NV50_3D_UNK15AC 0x000015ac
+
+#define NV50_3D_UNK15B0 0x000015b0
+#define NV50_3D_UNK15B0_0 0x00000001
+#define NV50_3D_UNK15B0_1 0x00000010
+#define NV50_3D_UNK15B0_2 0x00000100
+
+#define NV50_3D_CSAA_ENABLE 0x000015b4
+
+#define NV50_3D_FRAMEBUFFER_SRGB 0x000015b8
+
+#define NV50_3D_POLYGON_OFFSET_UNITS 0x000015bc
+
+#define NVA3_3D_UNK15C4 0x000015c4
+
+#define NVA3_3D_UNK15C8 0x000015c8
+
+#define NV50_3D_LAYER 0x000015cc
+#define NV50_3D_LAYER_IDX__MASK 0x0000ffff
+#define NV50_3D_LAYER_IDX__SHIFT 0
+#define NV50_3D_LAYER_USE_GP 0x00010000
+
+#define NV50_3D_MULTISAMPLE_MODE 0x000015d0
+#define NV50_3D_MULTISAMPLE_MODE_MS1 0x00000000
+#define NV50_3D_MULTISAMPLE_MODE_MS2 0x00000001
+#define NV50_3D_MULTISAMPLE_MODE_MS4 0x00000002
+#define NV50_3D_MULTISAMPLE_MODE_MS8 0x00000003
+#define NV50_3D_MULTISAMPLE_MODE_MS8_ALT 0x00000004
+#define NV50_3D_MULTISAMPLE_MODE_MS2_ALT 0x00000005
+#define NV50_3D_MULTISAMPLE_MODE_UNK6 0x00000006
+#define NV50_3D_MULTISAMPLE_MODE_MS4_CS4 0x00000008
+#define NV50_3D_MULTISAMPLE_MODE_MS4_CS12 0x00000009
+#define NV50_3D_MULTISAMPLE_MODE_MS8_CS8 0x0000000a
+#define NV50_3D_MULTISAMPLE_MODE_MS8_CS24 0x0000000b
+
+#define NV50_3D_VERTEX_BEGIN_D3D 0x000015d4
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE__MASK 0x0fffffff
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE__SHIFT 0
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_POINTS 0x00000001
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINES 0x00000002
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINE_STRIP 0x00000003
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLES 0x00000004
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLE_STRIP 0x00000005
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINES_ADJACENCY 0x0000000a
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINE_STRIP_ADJACENCY 0x0000000b
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLES_ADJACENCY 0x0000000c
+#define NV50_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLE_STRIP_ADJACENCY 0x0000000d
+#define NV50_3D_VERTEX_BEGIN_D3D_INSTANCE_NEXT 0x10000000
+#define NV84_3D_VERTEX_BEGIN_D3D_PRIMITIVE_ID_CONT 0x20000000
+#define NVA0_3D_VERTEX_BEGIN_D3D_INSTANCE_CONT 0x40000000
+
+#define NV50_3D_VERTEX_END_D3D 0x000015d8
+#define NV50_3D_VERTEX_END_D3D_UNK0 0x00000001
+#define NVA0_3D_VERTEX_END_D3D_UNK1 0x00000002
+
+#define NV50_3D_VERTEX_BEGIN_GL 0x000015dc
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE__MASK 0x0fffffff
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE__SHIFT 0
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS 0x00000000
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINES 0x00000001
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_LOOP 0x00000002
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_STRIP 0x00000003
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES 0x00000004
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_STRIP 0x00000005
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_FAN 0x00000006
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_QUADS 0x00000007
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_QUAD_STRIP 0x00000008
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_POLYGON 0x00000009
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINES_ADJACENCY 0x0000000a
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_STRIP_ADJACENCY 0x0000000b
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES_ADJACENCY 0x0000000c
+#define NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_STRIP_ADJACENCY 0x0000000d
+#define NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT 0x10000000
+#define NV84_3D_VERTEX_BEGIN_GL_PRIMITIVE_ID_CONT 0x20000000
+#define NVA0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT 0x40000000
+
+#define NV50_3D_VERTEX_END_GL 0x000015e0
+#define NV50_3D_VERTEX_END_GL_UNK0 0x00000001
+#define NVA0_3D_VERTEX_END_GL_UNK1 0x00000002
+
+#define NV50_3D_EDGEFLAG 0x000015e4
+
+#define NV50_3D_VB_ELEMENT_U32 0x000015e8
+
+#define NV50_3D_VB_ELEMENT_U16_SETUP 0x000015ec
+#define NV50_3D_VB_ELEMENT_U16_SETUP_OFFSET__MASK 0xc0000000
+#define NV50_3D_VB_ELEMENT_U16_SETUP_OFFSET__SHIFT 30
+#define NV50_3D_VB_ELEMENT_U16_SETUP_COUNT__MASK 0x3fffffff
+#define NV50_3D_VB_ELEMENT_U16_SETUP_COUNT__SHIFT 0
+
+#define NV50_3D_VB_ELEMENT_U16 0x000015f0
+#define NV50_3D_VB_ELEMENT_U16_I0__MASK 0x0000ffff
+#define NV50_3D_VB_ELEMENT_U16_I0__SHIFT 0
+#define NV50_3D_VB_ELEMENT_U16_I1__MASK 0xffff0000
+#define NV50_3D_VB_ELEMENT_U16_I1__SHIFT 16
+
+#define NV50_3D_VERTEX_BASE_HIGH 0x000015f4
+
+#define NV50_3D_VERTEX_BASE_LOW 0x000015f8
+
+#define NV50_3D_VERTEX_DATA 0x00001640
+
+#define NV50_3D_PRIM_RESTART_ENABLE 0x00001644
+
+#define NV50_3D_PRIM_RESTART_INDEX 0x00001648
+
+#define NV50_3D_VP_GP_BUILTIN_ATTR_EN 0x0000164c
+#define NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID 0x00000001
+#define NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID 0x00000010
+#define NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID 0x00000100
+#define NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12 0x00001000
+
+#define NV50_3D_VP_ATTR_EN(i0) (0x00001650 + 0x4*(i0))
+#define NV50_3D_VP_ATTR_EN__ESIZE 0x00000004
+#define NV50_3D_VP_ATTR_EN__LEN 0x00000002
+#define NV50_3D_VP_ATTR_EN_7__MASK 0xf0000000
+#define NV50_3D_VP_ATTR_EN_7__SHIFT 28
+#define NV50_3D_VP_ATTR_EN_7_X 0x10000000
+#define NV50_3D_VP_ATTR_EN_7_Y 0x20000000
+#define NV50_3D_VP_ATTR_EN_7_Z 0x40000000
+#define NV50_3D_VP_ATTR_EN_7_W 0x80000000
+#define NV50_3D_VP_ATTR_EN_6__MASK 0x0f000000
+#define NV50_3D_VP_ATTR_EN_6__SHIFT 24
+#define NV50_3D_VP_ATTR_EN_6_X 0x01000000
+#define NV50_3D_VP_ATTR_EN_6_Y 0x02000000
+#define NV50_3D_VP_ATTR_EN_6_Z 0x04000000
+#define NV50_3D_VP_ATTR_EN_6_W 0x08000000
+#define NV50_3D_VP_ATTR_EN_5__MASK 0x00f00000
+#define NV50_3D_VP_ATTR_EN_5__SHIFT 20
+#define NV50_3D_VP_ATTR_EN_5_X 0x00100000
+#define NV50_3D_VP_ATTR_EN_5_Y 0x00200000
+#define NV50_3D_VP_ATTR_EN_5_Z 0x00400000
+#define NV50_3D_VP_ATTR_EN_5_W 0x00800000
+#define NV50_3D_VP_ATTR_EN_4__MASK 0x000f0000
+#define NV50_3D_VP_ATTR_EN_4__SHIFT 16
+#define NV50_3D_VP_ATTR_EN_4_X 0x00010000
+#define NV50_3D_VP_ATTR_EN_4_Y 0x00020000
+#define NV50_3D_VP_ATTR_EN_4_Z 0x00040000
+#define NV50_3D_VP_ATTR_EN_4_W 0x00080000
+#define NV50_3D_VP_ATTR_EN_3__MASK 0x0000f000
+#define NV50_3D_VP_ATTR_EN_3__SHIFT 12
+#define NV50_3D_VP_ATTR_EN_3_X 0x00001000
+#define NV50_3D_VP_ATTR_EN_3_Y 0x00002000
+#define NV50_3D_VP_ATTR_EN_3_Z 0x00004000
+#define NV50_3D_VP_ATTR_EN_3_W 0x00008000
+#define NV50_3D_VP_ATTR_EN_2__MASK 0x00000f00
+#define NV50_3D_VP_ATTR_EN_2__SHIFT 8
+#define NV50_3D_VP_ATTR_EN_2_X 0x00000100
+#define NV50_3D_VP_ATTR_EN_2_Y 0x00000200
+#define NV50_3D_VP_ATTR_EN_2_Z 0x00000400
+#define NV50_3D_VP_ATTR_EN_2_W 0x00000800
+#define NV50_3D_VP_ATTR_EN_1__MASK 0x000000f0
+#define NV50_3D_VP_ATTR_EN_1__SHIFT 4
+#define NV50_3D_VP_ATTR_EN_1_X 0x00000010
+#define NV50_3D_VP_ATTR_EN_1_Y 0x00000020
+#define NV50_3D_VP_ATTR_EN_1_Z 0x00000040
+#define NV50_3D_VP_ATTR_EN_1_W 0x00000080
+#define NV50_3D_VP_ATTR_EN_0__MASK 0x0000000f
+#define NV50_3D_VP_ATTR_EN_0__SHIFT 0
+#define NV50_3D_VP_ATTR_EN_0_X 0x00000001
+#define NV50_3D_VP_ATTR_EN_0_Y 0x00000002
+#define NV50_3D_VP_ATTR_EN_0_Z 0x00000004
+#define NV50_3D_VP_ATTR_EN_0_W 0x00000008
+
+#define NV50_3D_POINT_SMOOTH_ENABLE 0x00001658
+
+#define NV50_3D_POINT_RASTER_RULES 0x0000165c
+#define NV50_3D_POINT_RASTER_RULES_OGL 0x00000000
+#define NV50_3D_POINT_RASTER_RULES_D3D 0x00000001
+
+#define NV50_3D_POINT_SPRITE_CTRL 0x00001660
+#define NV50_3D_POINT_SPRITE_CTRL_COORD_ORIGIN__MASK 0x00000010
+#define NV50_3D_POINT_SPRITE_CTRL_COORD_ORIGIN__SHIFT 4
+#define NV50_3D_POINT_SPRITE_CTRL_COORD_ORIGIN_LOWER_LEFT 0x00000000
+#define NV50_3D_POINT_SPRITE_CTRL_COORD_ORIGIN_UPPER_LEFT 0x00000010
+
+#define NVA0_3D_TEX_MISC 0x00001664
+#define NVA0_3D_TEX_MISC_UNK1 0x00000002
+#define NVA0_3D_TEX_MISC_SEAMLESS_CUBE_MAP 0x00000004
+
+#define NV50_3D_LINE_SMOOTH_BLUR 0x00001668
+#define NV50_3D_LINE_SMOOTH_BLUR_LOW 0x00000000
+#define NV50_3D_LINE_SMOOTH_BLUR_MEDIUM 0x00000001
+#define NV50_3D_LINE_SMOOTH_BLUR_HIGH 0x00000002
+
+#define NV50_3D_LINE_STIPPLE_ENABLE 0x0000166c
+
+#define NV50_3D_COVERAGE_LUT(i0) (0x00001670 + 0x4*(i0))
+#define NV50_3D_COVERAGE_LUT__ESIZE 0x00000004
+#define NV50_3D_COVERAGE_LUT__LEN 0x00000004
+#define NV50_3D_COVERAGE_LUT_0__MASK 0x000000ff
+#define NV50_3D_COVERAGE_LUT_0__SHIFT 0
+#define NV50_3D_COVERAGE_LUT_1__MASK 0x0000ff00
+#define NV50_3D_COVERAGE_LUT_1__SHIFT 8
+#define NV50_3D_COVERAGE_LUT_2__MASK 0x00ff0000
+#define NV50_3D_COVERAGE_LUT_2__SHIFT 16
+#define NV50_3D_COVERAGE_LUT_3__MASK 0xff000000
+#define NV50_3D_COVERAGE_LUT_3__SHIFT 24
+
+#define NV50_3D_LINE_STIPPLE 0x00001680
+#define NV50_3D_LINE_STIPPLE_FACTOR_M1__MASK 0x000000ff
+#define NV50_3D_LINE_STIPPLE_FACTOR_M1__SHIFT 0
+#define NV50_3D_LINE_STIPPLE_PATTERN__MASK 0x00ffff00
+#define NV50_3D_LINE_STIPPLE_PATTERN__SHIFT 8
+
+#define NV50_3D_PROVOKING_VERTEX_LAST 0x00001684
+
+#define NV50_3D_VERTEX_TWO_SIDE_ENABLE 0x00001688
+
+#define NV50_3D_POLYGON_STIPPLE_ENABLE 0x0000168c
+
+#define NV50_3D_UNK1690 0x00001690
+#define NV50_3D_UNK1690_ALWAYS_DERIV 0x00000001
+#define NV50_3D_UNK1690_UNK16 0x00010000
+
+#define NV50_3D_SET_PROGRAM_CB 0x00001694
+#define NV50_3D_SET_PROGRAM_CB_PROGRAM__MASK 0x000000f0
+#define NV50_3D_SET_PROGRAM_CB_PROGRAM__SHIFT 4
+#define NV50_3D_SET_PROGRAM_CB_PROGRAM_VERTEX 0x00000000
+#define NV50_3D_SET_PROGRAM_CB_PROGRAM_GEOMETRY 0x00000020
+#define NV50_3D_SET_PROGRAM_CB_PROGRAM_FRAGMENT 0x00000030
+#define NV50_3D_SET_PROGRAM_CB_INDEX__MASK 0x00000f00
+#define NV50_3D_SET_PROGRAM_CB_INDEX__SHIFT 8
+#define NV50_3D_SET_PROGRAM_CB_BUFFER__MASK 0x0007f000
+#define NV50_3D_SET_PROGRAM_CB_BUFFER__SHIFT 12
+#define NV50_3D_SET_PROGRAM_CB_VALID 0x00000001
+
+#define NV50_3D_UNK1698 0x00001698
+#define NV50_3D_UNK1698_0 0x00000001
+#define NV50_3D_UNK1698_1 0x00000010
+#define NV50_3D_UNK1698_2 0x00000100
+
+#define NVA3_3D_SAMPLE_SHADING 0x0000169c
+#define NVA3_3D_SAMPLE_SHADING_MIN_SAMPLES__MASK 0x0000000f
+#define NVA3_3D_SAMPLE_SHADING_MIN_SAMPLES__SHIFT 0
+#define NVA3_3D_SAMPLE_SHADING_ENABLE 0x00000010
+
+#define NVA3_3D_UNK16A0 0x000016a0
+
+#define NV50_3D_VP_RESULT_MAP_SIZE 0x000016ac
+
+#define NV50_3D_VP_REG_ALLOC_TEMP 0x000016b0
+
+#define NVA0_3D_UNK16B4 0x000016b4
+#define NVA0_3D_UNK16B4_UNK0 0x00000001
+#define NVA3_3D_UNK16B4_UNK1 0x00000002
+
+#define NV50_3D_VP_REG_ALLOC_RESULT 0x000016b8
+
+#define NV50_3D_VP_RESULT_MAP(i0) (0x000016bc + 0x4*(i0))
+#define NV50_3D_VP_RESULT_MAP__ESIZE 0x00000004
+#define NV50_3D_VP_RESULT_MAP__LEN 0x00000011
+#define NV50_3D_VP_RESULT_MAP_0__MASK 0x000000ff
+#define NV50_3D_VP_RESULT_MAP_0__SHIFT 0
+#define NV50_3D_VP_RESULT_MAP_1__MASK 0x0000ff00
+#define NV50_3D_VP_RESULT_MAP_1__SHIFT 8
+#define NV50_3D_VP_RESULT_MAP_2__MASK 0x00ff0000
+#define NV50_3D_VP_RESULT_MAP_2__SHIFT 16
+#define NV50_3D_VP_RESULT_MAP_3__MASK 0xff000000
+#define NV50_3D_VP_RESULT_MAP_3__SHIFT 24
+
+#define NV50_3D_POLYGON_STIPPLE_PATTERN(i0) (0x00001700 + 0x4*(i0))
+#define NV50_3D_POLYGON_STIPPLE_PATTERN__ESIZE 0x00000004
+#define NV50_3D_POLYGON_STIPPLE_PATTERN__LEN 0x00000020
+
+#define NVA0_3D_STRMOUT_OFFSET(i0) (0x00001780 + 0x4*(i0))
+#define NVA0_3D_STRMOUT_OFFSET__ESIZE 0x00000004
+#define NVA0_3D_STRMOUT_OFFSET__LEN 0x00000004
+
+#define NV50_3D_GP_ENABLE 0x00001798
+
+#define NV50_3D_GP_REG_ALLOC_TEMP 0x000017a0
+
+#define NV50_3D_GP_REG_ALLOC_RESULT 0x000017a8
+
+#define NV50_3D_GP_RESULT_MAP_SIZE 0x000017ac
+
+#define NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE 0x000017b0
+#define NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS 0x00000001
+#define NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP 0x00000002
+#define NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP 0x00000003
+
+#define NV50_3D_RASTERIZE_ENABLE 0x000017b4
+
+#define NV50_3D_STRMOUT_ENABLE 0x000017b8
+
+#define NV50_3D_GP_RESULT_MAP(i0) (0x000017fc + 0x4*(i0))
+#define NV50_3D_GP_RESULT_MAP__ESIZE 0x00000004
+#define NV50_3D_GP_RESULT_MAP__LEN 0x00000021
+#define NV50_3D_GP_RESULT_MAP_0__MASK 0x000000ff
+#define NV50_3D_GP_RESULT_MAP_0__SHIFT 0
+#define NV50_3D_GP_RESULT_MAP_1__MASK 0x0000ff00
+#define NV50_3D_GP_RESULT_MAP_1__SHIFT 8
+#define NV50_3D_GP_RESULT_MAP_2__MASK 0x00ff0000
+#define NV50_3D_GP_RESULT_MAP_2__SHIFT 16
+#define NV50_3D_GP_RESULT_MAP_3__MASK 0xff000000
+#define NV50_3D_GP_RESULT_MAP_3__SHIFT 24
+
+#define NV50_3D_POLYGON_OFFSET_CLAMP 0x0000187c
+
+#define NVA3_3D_VERTEX_ARRAY_PER_INSTANCE_ALT(i0) (0x00001880 + 0x4*(i0))
+#define NVA3_3D_VERTEX_ARRAY_PER_INSTANCE_ALT__ESIZE 0x00000004
+#define NVA3_3D_VERTEX_ARRAY_PER_INSTANCE_ALT__LEN 0x00000020
+
+#define NV50_3D_GP_VIEWPORT_ID_ENABLE 0x00001900
+
+#define NV50_3D_SEMANTIC_COLOR 0x00001904
+#define NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK 0x000000ff
+#define NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT 0
+#define NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK 0x0000ff00
+#define NV50_3D_SEMANTIC_COLOR_BFC0_ID__SHIFT 8
+#define NV50_3D_SEMANTIC_COLOR_COLR_NR__MASK 0x00ff0000
+#define NV50_3D_SEMANTIC_COLOR_COLR_NR__SHIFT 16
+#define NV50_3D_SEMANTIC_COLOR_CLMP_EN 0x01000000
+
+#define NV50_3D_SEMANTIC_CLIP 0x00001908
+#define NV50_3D_SEMANTIC_CLIP_CLIP_START__MASK 0x000000ff
+#define NV50_3D_SEMANTIC_CLIP_CLIP_START__SHIFT 0
+#define NV50_3D_SEMANTIC_CLIP_CLIP_NUM__MASK 0x00000f00
+#define NV50_3D_SEMANTIC_CLIP_CLIP_NUM__SHIFT 8
+
+#define NV50_3D_SEMANTIC_LAYER 0x0000190c
+#define NV50_3D_SEMANTIC_LAYER_LAYER_ID__MASK 0x000000ff
+#define NV50_3D_SEMANTIC_LAYER_LAYER_ID__SHIFT 0
+
+#define NV50_3D_SEMANTIC_PTSZ 0x00001910
+#define NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK 0x00000001
+#define NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__SHIFT 0
+#define NV50_3D_SEMANTIC_PTSZ_PTSZ_ID__MASK 0x00000ff0
+#define NV50_3D_SEMANTIC_PTSZ_PTSZ_ID__SHIFT 4
+
+#define NV50_3D_SEMANTIC_PRIM_ID 0x00001914
+#define NV50_3D_SEMANTIC_PRIM_ID_PRIM_ID__MASK 0x000000ff
+#define NV50_3D_SEMANTIC_PRIM_ID_PRIM_ID__SHIFT 0
+
+#define NV50_3D_CULL_FACE_ENABLE 0x00001918
+
+#define NV50_3D_FRONT_FACE 0x0000191c
+#define NV50_3D_FRONT_FACE_CW 0x00000900
+#define NV50_3D_FRONT_FACE_CCW 0x00000901
+
+#define NV50_3D_CULL_FACE 0x00001920
+#define NV50_3D_CULL_FACE_FRONT 0x00000404
+#define NV50_3D_CULL_FACE_BACK 0x00000405
+#define NV50_3D_CULL_FACE_FRONT_AND_BACK 0x00000408
+
+#define NV50_3D_LINE_LAST_PIXEL 0x00001924
+
+#define NVA3_3D_FP_MULTISAMPLE 0x00001928
+#define NVA3_3D_FP_MULTISAMPLE_EXPORT_SAMPLE_MASK 0x00000001
+#define NVA3_3D_FP_MULTISAMPLE_FORCE_PER_SAMPLE 0x00000002
+
+#define NV50_3D_VIEWPORT_TRANSFORM_EN 0x0000192c
+
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL 0x0000193c
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK0 0x00000001
+#define NVA0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1 0x00000002
+#define NVA0_3D_VIEW_VOLUME_CLIP_CTRL_UNK2 0x00000004
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_NEAR 0x00000008
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_FAR 0x00000010
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK7 0x00000080
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK10 0x00000400
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK11 0x00000800
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12__MASK 0x00003000
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12__SHIFT 12
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK0 0x00000000
+#define NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK1 0x00001000
+#define NV84_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK2 0x00002000
+
+#define NV50_3D_CLIP_DISTANCE_MODE 0x00001940
+#define NV50_3D_CLIP_DISTANCE_MODE_0__MASK 0x00000001
+#define NV50_3D_CLIP_DISTANCE_MODE_0__SHIFT 0
+#define NV50_3D_CLIP_DISTANCE_MODE_0_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_0_CULL 0x00000001
+#define NV50_3D_CLIP_DISTANCE_MODE_1__MASK 0x00000010
+#define NV50_3D_CLIP_DISTANCE_MODE_1__SHIFT 4
+#define NV50_3D_CLIP_DISTANCE_MODE_1_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_1_CULL 0x00000010
+#define NV50_3D_CLIP_DISTANCE_MODE_2__MASK 0x00000100
+#define NV50_3D_CLIP_DISTANCE_MODE_2__SHIFT 8
+#define NV50_3D_CLIP_DISTANCE_MODE_2_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_2_CULL 0x00000100
+#define NV50_3D_CLIP_DISTANCE_MODE_3__MASK 0x00001000
+#define NV50_3D_CLIP_DISTANCE_MODE_3__SHIFT 12
+#define NV50_3D_CLIP_DISTANCE_MODE_3_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_3_CULL 0x00001000
+#define NV50_3D_CLIP_DISTANCE_MODE_4__MASK 0x00010000
+#define NV50_3D_CLIP_DISTANCE_MODE_4__SHIFT 16
+#define NV50_3D_CLIP_DISTANCE_MODE_4_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_4_CULL 0x00010000
+#define NV50_3D_CLIP_DISTANCE_MODE_5__MASK 0x00100000
+#define NV50_3D_CLIP_DISTANCE_MODE_5__SHIFT 20
+#define NV50_3D_CLIP_DISTANCE_MODE_5_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_5_CULL 0x00100000
+#define NV50_3D_CLIP_DISTANCE_MODE_6__MASK 0x01000000
+#define NV50_3D_CLIP_DISTANCE_MODE_6__SHIFT 24
+#define NV50_3D_CLIP_DISTANCE_MODE_6_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_6_CULL 0x01000000
+#define NV50_3D_CLIP_DISTANCE_MODE_7__MASK 0x10000000
+#define NV50_3D_CLIP_DISTANCE_MODE_7__SHIFT 28
+#define NV50_3D_CLIP_DISTANCE_MODE_7_CLIP 0x00000000
+#define NV50_3D_CLIP_DISTANCE_MODE_7_CULL 0x10000000
+
+#define NVA3_3D_UNK1944 0x00001944
+
+#define NV50_3D_CLIP_RECTS_EN 0x0000194c
+
+#define NV50_3D_CLIP_RECTS_MODE 0x00001950
+#define NV50_3D_CLIP_RECTS_MODE_INSIDE_ANY 0x00000000
+#define NV50_3D_CLIP_RECTS_MODE_OUTSIDE_ALL 0x00000001
+#define NV50_3D_CLIP_RECTS_MODE_NEVER 0x00000002
+
+#define NV50_3D_ZCULL_VALIDATE 0x00001954
+#define NV50_3D_ZCULL_VALIDATE_CLEAR_UNK0 0x00000001
+#define NV50_3D_ZCULL_VALIDATE_CLEAR_UNK1 0x00000010
+
+#define NV50_3D_ZCULL_INVALIDATE 0x00001958
+
+#define NVA3_3D_UNK1960 0x00001960
+#define NVA3_3D_UNK1960_0 0x00000001
+#define NVA3_3D_UNK1960_1 0x00000010
+
+#define NV50_3D_UNK1968 0x00001968
+#define NV50_3D_UNK1968_0 0x00000001
+#define NV50_3D_UNK1968_1 0x00000010
+
+#define NV50_3D_FP_CTRL_UNK196C 0x0000196c
+#define NV50_3D_FP_CTRL_UNK196C_0 0x00000001
+#define NV50_3D_FP_CTRL_UNK196C_1 0x00000010
+
+#define NV50_3D_UNK1978 0x00001978
+
+#define NV50_3D_CLIPID_ENABLE 0x0000197c
+
+#define NV50_3D_CLIPID_WIDTH 0x00001980
+#define NV50_3D_CLIPID_WIDTH__MAX 0x00002000
+#define NV50_3D_CLIPID_WIDTH__ALIGN 0x00000040
+
+#define NV50_3D_CLIPID_ID 0x00001984
+
+#define NV50_3D_FP_INTERPOLANT_CTRL 0x00001988
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK__MASK 0xff000000
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK__SHIFT 24
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK_X 0x01000000
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK_Y 0x02000000
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK_Z 0x04000000
+#define NV50_3D_FP_INTERPOLANT_CTRL_UMASK_W 0x08000000
+#define NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__MASK 0x00ff0000
+#define NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT 16
+#define NV50_3D_FP_INTERPOLANT_CTRL_OFFSET__MASK 0x0000ff00
+#define NV50_3D_FP_INTERPOLANT_CTRL_OFFSET__SHIFT 8
+#define NV50_3D_FP_INTERPOLANT_CTRL_COUNT__MASK 0x000000ff
+#define NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT 0
+
+#define NV50_3D_FP_REG_ALLOC_TEMP 0x0000198c
+
+#define NV50_3D_REG_MODE 0x000019a0
+#define NV50_3D_REG_MODE_PACKED 0x00000001
+#define NV50_3D_REG_MODE_STRIPED 0x00000002
+
+#define NV50_3D_FP_CONTROL 0x000019a8
+#define NV50_3D_FP_CONTROL_MULTIPLE_RESULTS 0x00000001
+#define NV50_3D_FP_CONTROL_EXPORTS_Z 0x00000100
+#define NV50_3D_FP_CONTROL_USES_KIL 0x00100000
+
+#define NV50_3D_DEPTH_BOUNDS_EN 0x000019bc
+
+#define NV50_3D_UNK19C0 0x000019c0
+
+#define NV50_3D_LOGIC_OP_ENABLE 0x000019c4
+
+#define NV50_3D_LOGIC_OP 0x000019c8
+#define NV50_3D_LOGIC_OP_CLEAR 0x00001500
+#define NV50_3D_LOGIC_OP_AND 0x00001501
+#define NV50_3D_LOGIC_OP_AND_REVERSE 0x00001502
+#define NV50_3D_LOGIC_OP_COPY 0x00001503
+#define NV50_3D_LOGIC_OP_AND_INVERTED 0x00001504
+#define NV50_3D_LOGIC_OP_NOOP 0x00001505
+#define NV50_3D_LOGIC_OP_XOR 0x00001506
+#define NV50_3D_LOGIC_OP_OR 0x00001507
+#define NV50_3D_LOGIC_OP_NOR 0x00001508
+#define NV50_3D_LOGIC_OP_EQUIV 0x00001509
+#define NV50_3D_LOGIC_OP_INVERT 0x0000150a
+#define NV50_3D_LOGIC_OP_OR_REVERSE 0x0000150b
+#define NV50_3D_LOGIC_OP_COPY_INVERTED 0x0000150c
+#define NV50_3D_LOGIC_OP_OR_INVERTED 0x0000150d
+#define NV50_3D_LOGIC_OP_NAND 0x0000150e
+#define NV50_3D_LOGIC_OP_SET 0x0000150f
+
+#define NV50_3D_ZETA_COMP_ENABLE 0x000019cc
+
+#define NV50_3D_CLEAR_BUFFERS 0x000019d0
+#define NV50_3D_CLEAR_BUFFERS_Z 0x00000001
+#define NV50_3D_CLEAR_BUFFERS_S 0x00000002
+#define NV50_3D_CLEAR_BUFFERS_R 0x00000004
+#define NV50_3D_CLEAR_BUFFERS_G 0x00000008
+#define NV50_3D_CLEAR_BUFFERS_B 0x00000010
+#define NV50_3D_CLEAR_BUFFERS_A 0x00000020
+#define NV50_3D_CLEAR_BUFFERS_RT__MASK 0x000003c0
+#define NV50_3D_CLEAR_BUFFERS_RT__SHIFT 6
+#define NV50_3D_CLEAR_BUFFERS_LAYER__MASK 0x001ffc00
+#define NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT 10
+
+#define NV50_3D_CLIPID_FILL 0x000019d4
+
+#define NV50_3D_UNK19D8(i0) (0x000019d8 + 0x4*(i0))
+#define NV50_3D_UNK19D8__ESIZE 0x00000004
+#define NV50_3D_UNK19D8__LEN 0x00000002
+
+#define NV50_3D_RT_COMP_ENABLE(i0) (0x000019e0 + 0x4*(i0))
+#define NV50_3D_RT_COMP_ENABLE__ESIZE 0x00000004
+#define NV50_3D_RT_COMP_ENABLE__LEN 0x00000008
+
+#define NV50_3D_COLOR_MASK(i0) (0x00001a00 + 0x4*(i0))
+#define NV50_3D_COLOR_MASK__ESIZE 0x00000004
+#define NV50_3D_COLOR_MASK__LEN 0x00000008
+#define NV50_3D_COLOR_MASK_R 0x0000000f
+#define NV50_3D_COLOR_MASK_G 0x000000f0
+#define NV50_3D_COLOR_MASK_B 0x00000f00
+#define NV50_3D_COLOR_MASK_A 0x0000f000
+
+#define NV50_3D_UNK1A20 0x00001a20
+
+#define NV50_3D_DELAY 0x00001a24
+
+#define NV50_3D_UNK1A28 0x00001a28
+#define NV50_3D_UNK1A28_0__MASK 0x000000ff
+#define NV50_3D_UNK1A28_0__SHIFT 0
+#define NV50_3D_UNK1A28_1 0x00000100
+
+#define NV50_3D_UNK1A2C 0x00001a2c
+
+#define NV50_3D_UNK1A30 0x00001a30
+
+#define NV50_3D_UNK1A34 0x00001a34
+
+#define NV50_3D_UNK1A38 0x00001a38
+
+#define NV50_3D_UNK1A3C 0x00001a3c
+
+#define NV50_3D_UNK1A40(i0) (0x00001a40 + 0x4*(i0))
+#define NV50_3D_UNK1A40__ESIZE 0x00000004
+#define NV50_3D_UNK1A40__LEN 0x00000010
+#define NV50_3D_UNK1A40_0__MASK 0x00000007
+#define NV50_3D_UNK1A40_0__SHIFT 0
+#define NV50_3D_UNK1A40_1__MASK 0x00000070
+#define NV50_3D_UNK1A40_1__SHIFT 4
+#define NV50_3D_UNK1A40_2__MASK 0x00000700
+#define NV50_3D_UNK1A40_2__SHIFT 8
+#define NV50_3D_UNK1A40_3__MASK 0x00007000
+#define NV50_3D_UNK1A40_3__SHIFT 12
+#define NV50_3D_UNK1A40_4__MASK 0x00070000
+#define NV50_3D_UNK1A40_4__SHIFT 16
+#define NV50_3D_UNK1A40_5__MASK 0x00700000
+#define NV50_3D_UNK1A40_5__SHIFT 20
+#define NV50_3D_UNK1A40_6__MASK 0x07000000
+#define NV50_3D_UNK1A40_6__SHIFT 24
+#define NV50_3D_UNK1A40_7__MASK 0x70000000
+#define NV50_3D_UNK1A40_7__SHIFT 28
+
+#define NV50_3D_STRMOUT_ADDRESS_HIGH(i0) (0x00001a80 + 0x10*(i0))
+#define NV50_3D_STRMOUT_ADDRESS_HIGH__ESIZE 0x00000010
+#define NV50_3D_STRMOUT_ADDRESS_HIGH__LEN 0x00000004
+
+#define NV50_3D_STRMOUT_ADDRESS_LOW(i0) (0x00001a84 + 0x10*(i0))
+#define NV50_3D_STRMOUT_ADDRESS_LOW__ESIZE 0x00000010
+#define NV50_3D_STRMOUT_ADDRESS_LOW__LEN 0x00000004
+
+#define NV50_3D_STRMOUT_NUM_ATTRIBS(i0) (0x00001a88 + 0x10*(i0))
+#define NV50_3D_STRMOUT_NUM_ATTRIBS__ESIZE 0x00000010
+#define NV50_3D_STRMOUT_NUM_ATTRIBS__LEN 0x00000004
+#define NV50_3D_STRMOUT_NUM_ATTRIBS__MAX 0x00000040
+
+#define NVA0_3D_STRMOUT_OFFSET_LIMIT(i0) (0x00001a8c + 0x10*(i0))
+#define NVA0_3D_STRMOUT_OFFSET_LIMIT__ESIZE 0x00000010
+#define NVA0_3D_STRMOUT_OFFSET_LIMIT__LEN 0x00000004
+
+#define NV50_3D_VERTEX_ARRAY_ATTRIB(i0) (0x00001ac0 + 0x4*(i0))
+#define NV50_3D_VERTEX_ARRAY_ATTRIB__ESIZE 0x00000004
+#define NV50_3D_VERTEX_ARRAY_ATTRIB__LEN 0x00000010
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_BUFFER__MASK 0x0000000f
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_BUFFER__SHIFT 0
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_CONST 0x00000010
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_OFFSET__MASK 0x0007ffe0
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_OFFSET__SHIFT 5
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT__MASK 0x01f80000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT__SHIFT 19
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 0x00080000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32 0x00100000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16 0x00180000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_32_32 0x00200000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16 0x00280000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8 0x00500000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_16_16 0x00780000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_32 0x00900000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8 0x00980000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_8_8 0x00c00000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_16 0x00d80000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_8 0x00e80000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_10_10_10_2 0x01800000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE__MASK 0x7e000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE__SHIFT 25
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT 0x7e000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_UNORM 0x24000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_SNORM 0x12000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_USCALED 0x5a000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_SSCALED 0x6c000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_UINT 0x48000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_SINT 0x36000000
+#define NV50_3D_VERTEX_ARRAY_ATTRIB_BGRA 0x80000000
+
+#define NV50_3D_QUERY_ADDRESS_HIGH 0x00001b00
+
+#define NV50_3D_QUERY_ADDRESS_LOW 0x00001b04
+
+#define NV50_3D_QUERY_SEQUENCE 0x00001b08
+
+#define NV50_3D_QUERY_GET 0x00001b0c
+#define NV50_3D_QUERY_GET_MODE__MASK 0x00000003
+#define NV50_3D_QUERY_GET_MODE__SHIFT 0
+#define NV50_3D_QUERY_GET_MODE_WRITE_UNK0 0x00000000
+#define NV50_3D_QUERY_GET_MODE_SYNC 0x00000001
+#define NV50_3D_QUERY_GET_MODE_WRITE_UNK2 0x00000002
+#define NV50_3D_QUERY_GET_UNK4 0x00000010
+#define NVA0_3D_QUERY_GET_INDEX__MASK 0x000000e0
+#define NVA0_3D_QUERY_GET_INDEX__SHIFT 5
+#define NV50_3D_QUERY_GET_UNK8 0x00000100
+#define NV50_3D_QUERY_GET_UNIT__MASK 0x0000f000
+#define NV50_3D_QUERY_GET_UNIT__SHIFT 12
+#define NV50_3D_QUERY_GET_UNIT_UNK00 0x00000000
+#define NV50_3D_QUERY_GET_UNIT_VFETCH 0x00001000
+#define NV50_3D_QUERY_GET_UNIT_VP 0x00002000
+#define NV50_3D_QUERY_GET_UNIT_RAST 0x00004000
+#define NV50_3D_QUERY_GET_UNIT_STRMOUT 0x00005000
+#define NV50_3D_QUERY_GET_UNIT_GP 0x00006000
+#define NV50_3D_QUERY_GET_UNIT_ZCULL 0x00007000
+#define NV50_3D_QUERY_GET_UNIT_TPROP 0x0000a000
+#define NV50_3D_QUERY_GET_UNIT_UNK0C 0x0000c000
+#define NV50_3D_QUERY_GET_UNIT_CROP 0x0000f000
+#define NV50_3D_QUERY_GET_SYNC_COND__MASK 0x00010000
+#define NV50_3D_QUERY_GET_SYNC_COND__SHIFT 16
+#define NV50_3D_QUERY_GET_SYNC_COND_NEQUAL 0x00000000
+#define NV50_3D_QUERY_GET_SYNC_COND_GREATER 0x00010000
+#define NV50_3D_QUERY_GET_INTR 0x00100000
+#define NV50_3D_QUERY_GET_TYPE__MASK 0x00800000
+#define NV50_3D_QUERY_GET_TYPE__SHIFT 23
+#define NV50_3D_QUERY_GET_TYPE_QUERY 0x00000000
+#define NV50_3D_QUERY_GET_TYPE_COUNTER 0x00800000
+#define NV50_3D_QUERY_GET_QUERY_SELECT__MASK 0x0f000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT__SHIFT 24
+#define NV50_3D_QUERY_GET_QUERY_SELECT_ZERO 0x00000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_SAMPLECNT 0x01000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_STRMOUT_NO_OVERFLOW 0x02000000
+#define NVA0_3D_QUERY_GET_QUERY_SELECT_STRMOUT_DROPPED_PRIMITIVES 0x03000000
+#define NVA0_3D_QUERY_GET_QUERY_SELECT_STRMOUT_VERTICES 0x04000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_ZCULL_STAT_UNK0 0x05000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_ZCULL_STAT_UNK1 0x06000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_ZCULL_STAT_UNK2 0x07000000
+#define NV50_3D_QUERY_GET_QUERY_SELECT_ZCULL_STAT_UNK3 0x08000000
+#define NVA0_3D_QUERY_GET_QUERY_SELECT_RT_UNK14 0x0c000000
+#define NVA0_3D_QUERY_GET_QUERY_SELECT_STRMOUT_OFFSET 0x0d000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT__MASK 0x0f000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT__SHIFT 24
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_VFETCH_VERTICES 0x00000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_VFETCH_PRIMITIVES 0x01000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_VP_LAUNCHES 0x02000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_GP_LAUNCHES 0x03000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_GP_PRIMITIVES_OUT 0x04000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_TRANSFORM_FEEDBACK 0x05000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_GENERATED_PRIMITIVES 0x06000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_RAST_PRIMITIVES_PRECLIP 0x07000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_RAST_PRIMITIVES_POSTCLIP 0x08000000
+#define NV50_3D_QUERY_GET_COUNTER_SELECT_FP_PIXELS 0x09000000
+#define NV84_3D_QUERY_GET_COUNTER_SELECT_UNK0A 0x0a000000
+#define NVA0_3D_QUERY_GET_COUNTER_SELECT_UNK0C 0x0c000000
+#define NV50_3D_QUERY_GET_SHORT 0x10000000
+
+#define NVA3_3D_VP_RESULT_MAP_ALT(i0) (0x00001b3c + 0x4*(i0))
+#define NVA3_3D_VP_RESULT_MAP_ALT__ESIZE 0x00000004
+#define NVA3_3D_VP_RESULT_MAP_ALT__LEN 0x00000020
+#define NVA3_3D_VP_RESULT_MAP_ALT_0__MASK 0x000000ff
+#define NVA3_3D_VP_RESULT_MAP_ALT_0__SHIFT 0
+#define NVA3_3D_VP_RESULT_MAP_ALT_1__MASK 0x0000ff00
+#define NVA3_3D_VP_RESULT_MAP_ALT_1__SHIFT 8
+#define NVA3_3D_VP_RESULT_MAP_ALT_2__MASK 0x00ff0000
+#define NVA3_3D_VP_RESULT_MAP_ALT_2__SHIFT 16
+#define NVA3_3D_VP_RESULT_MAP_ALT_3__MASK 0xff000000
+#define NVA3_3D_VP_RESULT_MAP_ALT_3__SHIFT 24
+
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT(i0) (0x00001c00 + 0x10*(i0))
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT__ESIZE 0x00000010
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT__LEN 0x00000020
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT_STRIDE__MASK 0x00000fff
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT_STRIDE__SHIFT 0
+#define NVA3_3D_VERTEX_ARRAY_FETCH_ALT_ENABLE 0x20000000
+
+#define NVA3_3D_VERTEX_ARRAY_START_HIGH_ALT(i0) (0x00001c04 + 0x10*(i0))
+#define NVA3_3D_VERTEX_ARRAY_START_HIGH_ALT__ESIZE 0x00000010
+#define NVA3_3D_VERTEX_ARRAY_START_HIGH_ALT__LEN 0x00000020
+
+#define NVA3_3D_VERTEX_ARRAY_START_LOW_ALT(i0) (0x00001c08 + 0x10*(i0))
+#define NVA3_3D_VERTEX_ARRAY_START_LOW_ALT__ESIZE 0x00000010
+#define NVA3_3D_VERTEX_ARRAY_START_LOW_ALT__LEN 0x00000020
+
+#define NVA3_3D_VERTEX_ARRAY_DIVISOR_ALT(i0) (0x00001c0c + 0x10*(i0))
+#define NVA3_3D_VERTEX_ARRAY_DIVISOR_ALT__ESIZE 0x00000010
+#define NVA3_3D_VERTEX_ARRAY_DIVISOR_ALT__LEN 0x00000020
+
+#define NVA3_3D_IBLEND(i0) (0x00001e00 + 0x20*(i0))
+#define NVA3_3D_IBLEND__ESIZE 0x00000020
+#define NVA3_3D_IBLEND__LEN 0x00000008
+
+#define NVA3_3D_IBLEND_SEPARATE_ALPHA(i0) (0x00001e00 + 0x20*(i0))
+
+#define NVA3_3D_IBLEND_EQUATION_RGB(i0) (0x00001e04 + 0x20*(i0))
+#define NVA3_3D_IBLEND_EQUATION_RGB_FUNC_ADD 0x00008006
+#define NVA3_3D_IBLEND_EQUATION_RGB_MIN 0x00008007
+#define NVA3_3D_IBLEND_EQUATION_RGB_MAX 0x00008008
+#define NVA3_3D_IBLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a
+#define NVA3_3D_IBLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVA3_3D_IBLEND_FUNC_SRC_RGB(i0) (0x00001e08 + 0x20*(i0))
+
+#define NVA3_3D_IBLEND_FUNC_DST_RGB(i0) (0x00001e0c + 0x20*(i0))
+
+#define NVA3_3D_IBLEND_EQUATION_ALPHA(i0) (0x00001e10 + 0x20*(i0))
+#define NVA3_3D_IBLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006
+#define NVA3_3D_IBLEND_EQUATION_ALPHA_MIN 0x00008007
+#define NVA3_3D_IBLEND_EQUATION_ALPHA_MAX 0x00008008
+#define NVA3_3D_IBLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a
+#define NVA3_3D_IBLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVA3_3D_IBLEND_FUNC_SRC_ALPHA(i0) (0x00001e14 + 0x20*(i0))
+
+#define NVA3_3D_IBLEND_FUNC_DST_ALPHA(i0) (0x00001e18 + 0x20*(i0))
+
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_HIGH_ALT(i0) (0x00001f00 + 0x8*(i0))
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_HIGH_ALT__ESIZE 0x00000008
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_HIGH_ALT__LEN 0x00000020
+
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_LOW_ALT(i0) (0x00001f04 + 0x8*(i0))
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_LOW_ALT__ESIZE 0x00000008
+#define NVA3_3D_VERTEX_ARRAY_LIMIT_LOW_ALT__LEN 0x00000020
+
+
+#endif /* RNNDB_NV50_3D_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_3ddefs.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_3ddefs.xml.h
new file mode 100644
index 00000000000..f26ac45da40
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_3ddefs.xml.h
@@ -0,0 +1,98 @@
+#ifndef NV_3DDEFS_XML
+#define NV_3DDEFS_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nv50_3d.xml ( 26312 bytes, from 2010-10-08 10:10:01)
+- copyright.xml ( 6498 bytes, from 2010-10-03 13:18:37)
+- nv_defs.xml ( 4437 bytes, from 2010-07-06 07:43:58)
+- nv_3ddefs.xml ( 16397 bytes, from 2010-10-08 13:30:38)
+- nv_object.xml ( 11249 bytes, from 2010-10-07 15:31:28)
+- nvchipsets.xml ( 2824 bytes, from 2010-07-07 13:41:20)
+- nv50_defs.xml ( 4482 bytes, from 2010-10-03 13:18:37)
+
+Copyright (C) 2006-2010 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro, curro_, currojerez)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#define NV50_3D_BLEND_FACTOR_ZERO 0x00004000
+#define NV50_3D_BLEND_FACTOR_ONE 0x00004001
+#define NV50_3D_BLEND_FACTOR_SRC_COLOR 0x00004300
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC_COLOR 0x00004301
+#define NV50_3D_BLEND_FACTOR_SRC_ALPHA 0x00004302
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA 0x00004303
+#define NV50_3D_BLEND_FACTOR_DST_ALPHA 0x00004304
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_DST_ALPHA 0x00004305
+#define NV50_3D_BLEND_FACTOR_DST_COLOR 0x00004306
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_DST_COLOR 0x00004307
+#define NV50_3D_BLEND_FACTOR_SRC_ALPHA_SATURATE 0x00004308
+#define NV50_3D_BLEND_FACTOR_CONSTANT_COLOR 0x0000c001
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR 0x0000c002
+#define NV50_3D_BLEND_FACTOR_CONSTANT_ALPHA 0x0000c003
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA 0x0000c004
+#define NV50_3D_BLEND_FACTOR_SRC1_COLOR 0x0000c900
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR 0x0000c901
+#define NV50_3D_BLEND_FACTOR_SRC1_ALPHA 0x0000c902
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA 0x0000c903
+
+#endif /* NV_3DDEFS_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
new file mode 100644
index 00000000000..bdd6a63d1f1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
@@ -0,0 +1,223 @@
+
+#ifndef __NV50_BLIT_H__
+#define __NV50_BLIT_H__
+
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+void *
+nv50_blitter_make_fp(struct pipe_context *,
+ unsigned mode,
+ enum pipe_texture_target);
+
+unsigned
+nv50_blit_select_mode(const struct pipe_blit_info *);
+
+/* Converted to a pipe->blit. */
+void
+nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *);
+
+#define NV50_BLIT_MODE_PASS 0 /* pass through TEX $t0/$s0 output */
+#define NV50_BLIT_MODE_Z24S8 1 /* encode ZS values for RGBA unorm8 */
+#define NV50_BLIT_MODE_S8Z24 2
+#define NV50_BLIT_MODE_X24S8 3
+#define NV50_BLIT_MODE_S8X24 4
+#define NV50_BLIT_MODE_Z24X8 5
+#define NV50_BLIT_MODE_X8Z24 6
+#define NV50_BLIT_MODE_ZS 7 /* put $t0/$s0 into R, $t1/$s1 into G */
+#define NV50_BLIT_MODE_XS 8 /* put $t1/$s1 into G */
+#define NV50_BLIT_MODES 9
+
+/* CUBE and RECT textures are reinterpreted as 2D(_ARRAY) */
+#define NV50_BLIT_TEXTURE_BUFFER 0
+#define NV50_BLIT_TEXTURE_1D 1
+#define NV50_BLIT_TEXTURE_2D 2
+#define NV50_BLIT_TEXTURE_3D 3
+#define NV50_BLIT_TEXTURE_1D_ARRAY 4
+#define NV50_BLIT_TEXTURE_2D_ARRAY 5
+#define NV50_BLIT_MAX_TEXTURE_TYPES 6
+
+static INLINE unsigned
+nv50_blit_texture_type(enum pipe_texture_target target)
+{
+ switch (target) {
+ case PIPE_TEXTURE_1D: return NV50_BLIT_TEXTURE_1D;
+ case PIPE_TEXTURE_2D: return NV50_BLIT_TEXTURE_2D;
+ case PIPE_TEXTURE_3D: return NV50_BLIT_TEXTURE_3D;
+ case PIPE_TEXTURE_1D_ARRAY: return NV50_BLIT_TEXTURE_1D_ARRAY;
+ case PIPE_TEXTURE_2D_ARRAY: return NV50_BLIT_TEXTURE_2D_ARRAY;
+ default:
+ assert(target == PIPE_BUFFER);
+ return NV50_BLIT_TEXTURE_BUFFER;
+ }
+}
+
+static INLINE unsigned
+nv50_blit_get_tgsi_texture_target(enum pipe_texture_target target)
+{
+ switch (target) {
+ case PIPE_TEXTURE_1D: return TGSI_TEXTURE_1D;
+ case PIPE_TEXTURE_2D: return TGSI_TEXTURE_2D;
+ case PIPE_TEXTURE_3D: return TGSI_TEXTURE_3D;
+ case PIPE_TEXTURE_1D_ARRAY: return TGSI_TEXTURE_1D_ARRAY;
+ case PIPE_TEXTURE_2D_ARRAY: return TGSI_TEXTURE_2D_ARRAY;
+ default:
+ assert(target == PIPE_BUFFER);
+ return TGSI_TEXTURE_BUFFER;
+ }
+}
+
+static INLINE enum pipe_texture_target
+nv50_blit_reinterpret_pipe_texture_target(enum pipe_texture_target target)
+{
+ switch (target) {
+ case PIPE_TEXTURE_CUBE:
+ case PIPE_TEXTURE_CUBE_ARRAY:
+ return PIPE_TEXTURE_2D_ARRAY;
+ case PIPE_TEXTURE_RECT:
+ return PIPE_TEXTURE_2D;
+ default:
+ return target;
+ }
+}
+
+static INLINE unsigned
+nv50_blit_get_filter(const struct pipe_blit_info *info)
+{
+ if (info->dst.resource->nr_samples < info->src.resource->nr_samples)
+ return util_format_is_depth_or_stencil(info->src.format) ? 0 : 1;
+
+ if (info->filter != PIPE_TEX_FILTER_LINEAR)
+ return 0;
+
+ if ((info->dst.box.width == info->src.box.width ||
+ info->dst.box.width == -info->src.box.width) &&
+ (info->dst.box.height == info->src.box.height ||
+ info->dst.box.height == -info->src.box.height))
+ return 0;
+
+ return 1;
+}
+
+/* Since shaders cannot export stencil, we cannot copy stencil values when
+ * rendering to ZETA, so we attach the ZS surface to a colour render target.
+ */
+static INLINE enum pipe_format
+nv50_blit_zeta_to_colour_format(enum pipe_format format)
+{
+ switch (format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ return PIPE_FORMAT_R16_UNORM;
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ case PIPE_FORMAT_Z24X8_UNORM:
+ return PIPE_FORMAT_R8G8B8A8_UNORM;
+ case PIPE_FORMAT_Z32_FLOAT:
+ return PIPE_FORMAT_R32_FLOAT;
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ return PIPE_FORMAT_R32G32_FLOAT;
+ default:
+ assert(0);
+ return PIPE_FORMAT_NONE;
+ }
+}
+
+
+static INLINE uint16_t
+nv50_blit_derive_color_mask(const struct pipe_blit_info *info)
+{
+ const unsigned mask = info->mask;
+
+ uint16_t color_mask = 0;
+
+ switch (info->dst.format) {
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ if (mask & PIPE_MASK_S)
+ color_mask |= 0x1000;
+ /* fall through */
+ case PIPE_FORMAT_Z24X8_UNORM:
+ if (mask & PIPE_MASK_Z)
+ color_mask |= 0x0111;
+ break;
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ if (mask & PIPE_MASK_Z)
+ color_mask |= 0x1110;
+ if (mask & PIPE_MASK_S)
+ color_mask |= 0x0001;
+ break;
+ default:
+ if (mask & (PIPE_MASK_R | PIPE_MASK_Z)) color_mask |= 0x0001;
+ if (mask & (PIPE_MASK_G | PIPE_MASK_S)) color_mask |= 0x0010;
+ if (mask & PIPE_MASK_B) color_mask |= 0x0100;
+ if (mask & PIPE_MASK_A) color_mask |= 0x1000;
+ break;
+ }
+
+ return color_mask;
+}
+
+static INLINE uint32_t
+nv50_blit_eng2d_get_mask(const struct pipe_blit_info *info)
+{
+ uint32_t mask = 0;
+
+ switch (info->dst.format) {
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ if (info->mask & PIPE_MASK_Z) mask |= 0x00ffffff;
+ if (info->mask & PIPE_MASK_S) mask |= 0xff000000;
+ break;
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ if (info->mask & PIPE_MASK_Z) mask |= 0xffffff00;
+ if (info->mask & PIPE_MASK_S) mask |= 0x000000ff;
+ break;
+ case PIPE_FORMAT_X8Z24_UNORM:
+ if (info->mask & PIPE_MASK_Z) mask = 0x00ffffff;
+ break;
+ default:
+ mask = 0xffffffff;
+ break;
+ }
+ return mask;
+}
+
+#if NOUVEAU_DRIVER == 0xc0
+# define nv50_format_table nvc0_format_table
+#endif
+
+/* return TRUE for formats that can be converted among each other by NVC0_2D */
+static INLINE boolean
+nv50_2d_dst_format_faithful(enum pipe_format format)
+{
+ const uint64_t mask =
+ NV50_ENG2D_SUPPORTED_FORMATS &
+ ~NV50_ENG2D_NOCONVERT_FORMATS;
+ uint8_t id = nv50_format_table[format].rt;
+ return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
+}
+static INLINE boolean
+nv50_2d_src_format_faithful(enum pipe_format format)
+{
+ const uint64_t mask =
+ NV50_ENG2D_SUPPORTED_FORMATS &
+ ~(NV50_ENG2D_LUMINANCE_FORMATS | NV50_ENG2D_INTENSITY_FORMATS);
+ uint8_t id = nv50_format_table[format].rt;
+ return (id >= 0xc0) && (mask & (1ULL << (id - 0xc0)));
+}
+
+static INLINE boolean
+nv50_2d_format_supported(enum pipe_format format)
+{
+ uint8_t id = nv50_format_table[format].rt;
+ return (id >= 0xc0) &&
+ (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0)));
+}
+
+static INLINE boolean
+nv50_2d_dst_format_ops_supported(enum pipe_format format)
+{
+ uint8_t id = nv50_format_table[format].rt;
+ return (id >= 0xc0) &&
+ (NV50_ENG2D_OPERATION_FORMATS & (1ULL << (id - 0xc0)));
+}
+
+#endif /* __NV50_BLIT_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
new file mode 100644
index 00000000000..b6bdf79b389
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_framebuffer.h"
+
+#ifdef NV50_WITH_DRAW_MODULE
+#include "draw/draw_context.h"
+#endif
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_screen.h"
+#include "nv50/nv50_resource.h"
+
+static void
+nv50_flush(struct pipe_context *pipe,
+ struct pipe_fence_handle **fence,
+ unsigned flags)
+{
+ struct nouveau_screen *screen = nouveau_screen(pipe->screen);
+
+ if (fence)
+ nouveau_fence_ref(screen->fence.current, (struct nouveau_fence **)fence);
+
+ PUSH_KICK(screen->pushbuf);
+
+ nouveau_context_update_frame_stats(nouveau_context(pipe));
+}
+
+static void
+nv50_texture_barrier(struct pipe_context *pipe)
+{
+ struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+
+ BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
+ PUSH_DATA (push, 0x20);
+}
+
+void
+nv50_default_kick_notify(struct nouveau_pushbuf *push)
+{
+ struct nv50_screen *screen = push->user_priv;
+
+ if (screen) {
+ nouveau_fence_next(&screen->base);
+ nouveau_fence_update(&screen->base, TRUE);
+ if (screen->cur_ctx)
+ screen->cur_ctx->state.flushed = TRUE;
+ }
+}
+
+static void
+nv50_context_unreference_resources(struct nv50_context *nv50)
+{
+ unsigned s, i;
+
+ nouveau_bufctx_del(&nv50->bufctx_3d);
+ nouveau_bufctx_del(&nv50->bufctx);
+
+ util_unreference_framebuffer_state(&nv50->framebuffer);
+
+ for (i = 0; i < nv50->num_vtxbufs; ++i)
+ pipe_resource_reference(&nv50->vtxbuf[i].buffer, NULL);
+
+ pipe_resource_reference(&nv50->idxbuf.buffer, NULL);
+
+ for (s = 0; s < 3; ++s) {
+ for (i = 0; i < nv50->num_textures[s]; ++i)
+ pipe_sampler_view_reference(&nv50->textures[s][i], NULL);
+
+ for (i = 0; i < NV50_MAX_PIPE_CONSTBUFS; ++i)
+ if (!nv50->constbuf[s][i].user)
+ pipe_resource_reference(&nv50->constbuf[s][i].u.buf, NULL);
+ }
+}
+
+static void
+nv50_destroy(struct pipe_context *pipe)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+
+ if (nv50_context_screen(nv50)->cur_ctx == nv50) {
+ nv50->base.pushbuf->kick_notify = NULL;
+ nv50_context_screen(nv50)->cur_ctx = NULL;
+ nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL);
+ }
+ /* need to flush before destroying the bufctx */
+ nouveau_pushbuf_kick(nv50->base.pushbuf, nv50->base.pushbuf->channel);
+
+ nv50_context_unreference_resources(nv50);
+
+#ifdef NV50_WITH_DRAW_MODULE
+ draw_destroy(nv50->draw);
+#endif
+
+ nouveau_context_destroy(&nv50->base);
+}
+
+static int
+nv50_invalidate_resource_storage(struct nouveau_context *ctx,
+ struct pipe_resource *res,
+ int ref)
+{
+ struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+ unsigned s, i;
+
+ if (res->bind & PIPE_BIND_RENDER_TARGET) {
+ for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
+ if (nv50->framebuffer.cbufs[i] &&
+ nv50->framebuffer.cbufs[i]->texture == res) {
+ nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (nv50->framebuffer.zsbuf &&
+ nv50->framebuffer.zsbuf->texture == res) {
+ nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+ if (!--ref)
+ return ref;
+ }
+ }
+
+ if (res->bind & PIPE_BIND_VERTEX_BUFFER) {
+ for (i = 0; i < nv50->num_vtxbufs; ++i) {
+ if (nv50->vtxbuf[i].buffer == res) {
+ nv50->dirty |= NV50_NEW_ARRAYS;
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ if (res->bind & PIPE_BIND_INDEX_BUFFER) {
+ if (nv50->idxbuf.buffer == res)
+ if (!--ref)
+ return ref;
+ }
+
+ if (res->bind & PIPE_BIND_SAMPLER_VIEW) {
+ for (s = 0; s < 5; ++s) {
+ for (i = 0; i < nv50->num_textures[s]; ++i) {
+ if (nv50->textures[s][i] &&
+ nv50->textures[s][i]->texture == res) {
+ nv50->dirty |= NV50_NEW_TEXTURES;
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ }
+
+ if (res->bind & PIPE_BIND_CONSTANT_BUFFER) {
+ for (s = 0; s < 5; ++s) {
+ for (i = 0; i < nv50->num_vtxbufs; ++i) {
+ if (!nv50->constbuf[s][i].user &&
+ nv50->constbuf[s][i].u.buf == res) {
+ nv50->dirty |= NV50_NEW_CONSTBUF;
+ nv50->constbuf_dirty[s] |= 1 << i;
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+ if (!--ref)
+ return ref;
+ }
+ }
+ }
+ }
+
+ return ref;
+}
+
+struct pipe_context *
+nv50_create(struct pipe_screen *pscreen, void *priv)
+{
+ struct nv50_screen *screen = nv50_screen(pscreen);
+ struct nv50_context *nv50;
+ struct pipe_context *pipe;
+ int ret;
+ uint32_t flags;
+
+ nv50 = CALLOC_STRUCT(nv50_context);
+ if (!nv50)
+ return NULL;
+ pipe = &nv50->base.pipe;
+
+ if (!nv50_blitctx_create(nv50))
+ goto out_err;
+
+ nv50->base.pushbuf = screen->base.pushbuf;
+ nv50->base.client = screen->base.client;
+
+ ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_COUNT,
+ &nv50->bufctx_3d);
+ if (!ret)
+ ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+ if (ret)
+ goto out_err;
+
+ nv50->base.screen = &screen->base;
+ nv50->base.copy_data = nv50_m2mf_copy_linear;
+ nv50->base.push_data = nv50_sifc_linear_u8;
+ nv50->base.push_cb = nv50_cb_push;
+
+ nv50->screen = screen;
+ pipe->screen = pscreen;
+ pipe->priv = priv;
+
+ pipe->destroy = nv50_destroy;
+
+ pipe->draw_vbo = nv50_draw_vbo;
+ pipe->clear = nv50_clear;
+
+ pipe->flush = nv50_flush;
+ pipe->texture_barrier = nv50_texture_barrier;
+
+ if (!screen->cur_ctx) {
+ screen->cur_ctx = nv50;
+ nouveau_pushbuf_bufctx(screen->base.pushbuf, nv50->bufctx);
+ }
+ nv50->base.pushbuf->kick_notify = nv50_default_kick_notify;
+
+ nv50_init_query_functions(nv50);
+ nv50_init_surface_functions(nv50);
+ nv50_init_state_functions(nv50);
+ nv50_init_resource_functions(pipe);
+
+ nv50->base.invalidate_resource_storage = nv50_invalidate_resource_storage;
+
+#ifdef NV50_WITH_DRAW_MODULE
+ /* no software fallbacks implemented */
+ nv50->draw = draw_create(pipe);
+ assert(nv50->draw);
+ draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50));
+#endif
+
+ if (screen->base.device->chipset < 0x84 ||
+ debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+ /* PMPEG */
+ nouveau_context_init_vdec(&nv50->base);
+ } else if (screen->base.device->chipset < 0x98 ||
+ screen->base.device->chipset == 0xa0) {
+ /* VP2 */
+ pipe->create_video_codec = nv84_create_decoder;
+ pipe->create_video_buffer = nv84_video_buffer_create;
+ } else {
+ /* VP3/4 */
+ pipe->create_video_codec = nv98_create_decoder;
+ pipe->create_video_buffer = nv98_video_buffer_create;
+ }
+
+ flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+
+ BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code);
+ BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
+ BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
+ BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+
+ flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
+
+ BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
+ BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
+
+ nv50->base.scratch.bo_size = 2 << 20;
+
+ return pipe;
+
+out_err:
+ if (nv50) {
+ if (nv50->bufctx_3d)
+ nouveau_bufctx_del(&nv50->bufctx_3d);
+ if (nv50->bufctx)
+ nouveau_bufctx_del(&nv50->bufctx);
+ if (nv50->blit)
+ FREE(nv50->blit);
+ FREE(nv50);
+ }
+ return NULL;
+}
+
+void
+nv50_bufctx_fence(struct nouveau_bufctx *bufctx, boolean on_flush)
+{
+ struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
+ struct nouveau_list *it;
+
+ for (it = list->next; it != list; it = it->next) {
+ struct nouveau_bufref *ref = (struct nouveau_bufref *)it;
+ struct nv04_resource *res = ref->priv;
+ if (res)
+ nv50_resource_validate(res, (unsigned)ref->priv_data);
+ }
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
new file mode 100644
index 00000000000..ee6eb0ef715
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -0,0 +1,322 @@
+#ifndef __NV50_CONTEXT_H__
+#define __NV50_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+#include "util/u_dynarray.h"
+
+#ifdef NV50_WITH_DRAW_MODULE
+#include "draw/draw_vertex.h"
+#endif
+
+#include "nv50/nv50_debug.h"
+#include "nv50/nv50_winsys.h"
+#include "nv50/nv50_stateobj.h"
+#include "nv50/nv50_screen.h"
+#include "nv50/nv50_program.h"
+#include "nv50/nv50_resource.h"
+#include "nv50/nv50_transfer.h"
+
+#include "nouveau_context.h"
+#include "nv_object.xml.h"
+#include "nv_m2mf.xml.h"
+#include "nv50/nv50_3ddefs.xml.h"
+#include "nv50/nv50_3d.xml.h"
+#include "nv50/nv50_2d.xml.h"
+
+/* Bits of nv50_context::dirty -- state categories that need
+ * (re)validation before the next draw.  Bits 4-5 and 22-30 are
+ * currently unassigned. */
+#define NV50_NEW_BLEND        (1 << 0)
+#define NV50_NEW_RASTERIZER   (1 << 1)
+#define NV50_NEW_ZSA          (1 << 2)
+#define NV50_NEW_VERTPROG     (1 << 3)
+#define NV50_NEW_GMTYPROG     (1 << 6)
+#define NV50_NEW_FRAGPROG     (1 << 7)
+#define NV50_NEW_BLEND_COLOUR (1 << 8)
+#define NV50_NEW_STENCIL_REF  (1 << 9)
+#define NV50_NEW_CLIP         (1 << 10)
+#define NV50_NEW_SAMPLE_MASK  (1 << 11)
+#define NV50_NEW_FRAMEBUFFER  (1 << 12)
+#define NV50_NEW_STIPPLE      (1 << 13)
+#define NV50_NEW_SCISSOR      (1 << 14)
+#define NV50_NEW_VIEWPORT     (1 << 15)
+#define NV50_NEW_ARRAYS       (1 << 16)
+#define NV50_NEW_VERTEX       (1 << 17)
+#define NV50_NEW_CONSTBUF     (1 << 18)
+#define NV50_NEW_TEXTURES     (1 << 19)
+#define NV50_NEW_SAMPLERS     (1 << 20)
+#define NV50_NEW_STRMOUT      (1 << 21)
+#define NV50_NEW_CONTEXT      (1 << 31)
+
+/* Binding slots used with bufctx_3d (NV50_BIND_COUNT slots total).
+ * NV50_BIND_CB(s, i): constant buffer i of shader stage s
+ * (3 stages x 16 buffers -> slots 5..52, hence SO starts at 53). */
+#define NV50_BIND_FB          0
+#define NV50_BIND_VERTEX      1
+#define NV50_BIND_VERTEX_TMP  2
+#define NV50_BIND_INDEX       3
+#define NV50_BIND_TEXTURES    4
+#define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
+#define NV50_BIND_SO         53
+#define NV50_BIND_SCREEN     54
+#define NV50_BIND_TLS        55
+#define NV50_BIND_COUNT      56
+/* Slots for the secondary bufctx (presumably 2D and M2MF share slot 0
+ * because they are never bound at the same time -- verify at callers). */
+#define NV50_BIND_2D          0
+#define NV50_BIND_M2MF        0
+#define NV50_BIND_FENCE       1
+
+#define NV50_CB_TMP 123
+/* fixed constant buffer binding points - low indices for user's constbufs */
+#define NV50_CB_PVP 124
+#define NV50_CB_PGP 126
+#define NV50_CB_PFP 125
+#define NV50_CB_AUX 127
+
+
+struct nv50_blitctx;
+
+boolean nv50_blitctx_create(struct nv50_context *);
+
+/* Per-context driver state for the nv50 gallium driver.  All [3]-sized
+ * arrays are indexed by nv50_context_shader_stage():
+ * 0 = vertex, 1 = fragment, 2 = geometry. */
+struct nv50_context {
+   struct nouveau_context base; /* must be first: nv50_context() downcasts */
+
+   struct nv50_screen *screen;
+
+   struct nouveau_bufctx *bufctx_3d; /* buffers referenced by 3D state */
+   struct nouveau_bufctx *bufctx;    /* misc buffers (fence, 2D/M2MF) */
+
+   uint32_t dirty; /* mask of NV50_NEW_* bits pending validation */
+
+   /* shadow of state already programmed into the hardware/pushbuf */
+   struct {
+      uint32_t instance_elts; /* bitmask of per-instance elements */
+      uint32_t instance_base;
+      uint32_t interpolant_ctrl;
+      uint32_t semantic_color;
+      uint32_t semantic_psize;
+      int32_t index_bias;
+      boolean uniform_buffer_bound[3];
+      boolean prim_restart;
+      boolean point_sprite;
+      boolean rt_serialize;
+      boolean flushed;
+      boolean rasterizer_discard;
+      uint8_t tls_required;
+      boolean new_tls_space;
+      uint8_t num_vtxbufs;
+      uint8_t num_vtxelts;
+      uint8_t num_textures[3];
+      uint8_t num_samplers[3];
+      uint8_t prim_size;
+      uint16_t scissor;
+   } state;
+
+   /* bound CSO state objects */
+   struct nv50_blend_stateobj *blend;
+   struct nv50_rasterizer_stateobj *rast;
+   struct nv50_zsa_stateobj *zsa;
+   struct nv50_vertex_stateobj *vertex;
+
+   struct nv50_program *vertprog;
+   struct nv50_program *gmtyprog;
+   struct nv50_program *fragprog;
+
+   struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
+   uint16_t constbuf_dirty[3];
+   uint16_t constbuf_valid[3];
+
+   /* vertex fetch state */
+   struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+   unsigned num_vtxbufs;
+   struct pipe_index_buffer idxbuf;
+   uint32_t vbo_fifo; /* bitmask of vertex elements to be pushed to FIFO */
+   uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
+   uint32_t vbo_constant; /* bitmask of user buffers with stride 0 */
+   uint32_t vb_elt_first; /* from pipe_draw_info, for vertex upload */
+   uint32_t vb_elt_limit; /* max - min element (count - 1) */
+   uint32_t instance_off; /* base vertex for instanced arrays */
+   uint32_t instance_max; /* max instance for current draw call */
+
+   struct pipe_sampler_view *textures[3][PIPE_MAX_SAMPLERS];
+   unsigned num_textures[3];
+   struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
+   unsigned num_samplers[3];
+
+   /* stream output (transform feedback) */
+   uint8_t num_so_targets;
+   uint8_t so_targets_dirty;
+   struct pipe_stream_output_target *so_target[4];
+
+   /* plain copies of the gallium state last set on this context */
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_blend_color blend_colour;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_poly_stipple stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_viewport_state viewport;
+   struct pipe_clip_state clip;
+
+   unsigned sample_mask;
+
+   boolean vbo_push_hint; /* hint that pushing vertices may beat DMA */
+
+   /* conditional rendering */
+   struct pipe_query *cond_query;
+   boolean cond_cond;
+   uint cond_mode;
+
+   struct nv50_blitctx *blit;
+
+#ifdef NV50_WITH_DRAW_MODULE
+   struct draw_context *draw; /* software fallback path */
+#endif
+};
+
+/* Downcast a pipe_context to the driver context.  Valid because the
+ * nouveau_context base (which wraps the pipe_context) is the first
+ * member of struct nv50_context. */
+static INLINE struct nv50_context *
+nv50_context(struct pipe_context *pipe)
+{
+   struct nv50_context *nv50 = (struct nv50_context *)pipe;
+   return nv50;
+}
+
+/* Fetch the nv50 screen the context was created on, going through the
+ * generic nouveau screen's embedded pipe_screen. */
+static INLINE struct nv50_screen *
+nv50_context_screen(struct nv50_context *nv50)
+{
+   struct pipe_screen *pscreen = &nv50->base.screen->base;
+   return nv50_screen(pscreen);
+}
+
+/* Map a PIPE_SHADER_* enum to the compact index used by the per-stage
+ * arrays in struct nv50_context (0 = VP, 1 = FP, 2 = GP, 3 = CP).
+ * Any other value asserts in debug builds and falls back to 0. */
+static INLINE unsigned
+nv50_context_shader_stage(unsigned pipe)
+{
+   if (pipe == PIPE_SHADER_VERTEX)
+      return 0;
+   if (pipe == PIPE_SHADER_FRAGMENT)
+      return 1;
+   if (pipe == PIPE_SHADER_GEOMETRY)
+      return 2;
+   if (pipe == PIPE_SHADER_COMPUTE)
+      return 3;
+   assert(!"invalid/unhandled shader type");
+   return 0;
+}
+
+/* nv50_context.c */
+struct pipe_context *nv50_create(struct pipe_screen *, void *);
+
+void nv50_bufctx_fence(struct nouveau_bufctx *, boolean on_flush);
+
+void nv50_default_kick_notify(struct nouveau_pushbuf *);
+
+/* nv50_draw.c */
+extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
+
+/* nv50_query.c */
+void nv50_init_query_functions(struct nv50_context *);
+void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
+ struct pipe_query *, unsigned result_offset);
+void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
+/* Save the current write offset of a stream-output target (nva0+ path;
+ * defined in nv50_query.c).  Exact meaning of @serialize -- presumably
+ * "serialize before reading the counter back" -- see the definition.
+ * Note: fixed the parameter-name typo "seralize" -> "serialize";
+ * prototype parameter names have no ABI impact. */
+void nva0_so_target_save_offset(struct pipe_context *,
+                                struct pipe_stream_output_target *,
+                                unsigned index, boolean serialize);
+
+#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
+
+/* nv50_shader_state.c */
+void nv50_vertprog_validate(struct nv50_context *);
+void nv50_gmtyprog_validate(struct nv50_context *);
+void nv50_fragprog_validate(struct nv50_context *);
+void nv50_fp_linkage_validate(struct nv50_context *);
+void nv50_gp_linkage_validate(struct nv50_context *);
+void nv50_constbufs_validate(struct nv50_context *);
+void nv50_validate_derived_rs(struct nv50_context *);
+void nv50_stream_output_validate(struct nv50_context *);
+
+/* nv50_state.c */
+extern void nv50_init_state_functions(struct nv50_context *);
+
+/* nv50_state_validate.c */
+/* @words: check for space before emitting relocs */
+extern boolean nv50_state_validate(struct nv50_context *, uint32_t state_mask,
+ unsigned space_words);
+
+/* nv50_surface.c */
+extern void nv50_clear(struct pipe_context *, unsigned buffers,
+ const union pipe_color_union *color,
+ double depth, unsigned stencil);
+extern void nv50_init_surface_functions(struct nv50_context *);
+
+/* nv50_tex.c */
+void nv50_validate_textures(struct nv50_context *);
+void nv50_validate_samplers(struct nv50_context *);
+
+struct pipe_sampler_view *
+nv50_create_texture_view(struct pipe_context *,
+ struct pipe_resource *,
+ const struct pipe_sampler_view *,
+ uint32_t flags,
+ enum pipe_texture_target);
+struct pipe_sampler_view *
+nv50_create_sampler_view(struct pipe_context *,
+ struct pipe_resource *,
+ const struct pipe_sampler_view *);
+
+/* nv50_transfer.c */
+void
+nv50_m2mf_transfer_rect(struct nv50_context *,
+ const struct nv50_m2mf_rect *dst,
+ const struct nv50_m2mf_rect *src,
+ uint32_t nblocksx, uint32_t nblocksy);
+void
+nv50_sifc_linear_u8(struct nouveau_context *pipe,
+ struct nouveau_bo *dst, unsigned offset, unsigned domain,
+ unsigned size, const void *data);
+void
+nv50_m2mf_copy_linear(struct nouveau_context *pipe,
+ struct nouveau_bo *dst, unsigned dstoff, unsigned dstdom,
+ struct nouveau_bo *src, unsigned srcoff, unsigned srcdom,
+ unsigned size);
+void
+nv50_cb_push(struct nouveau_context *nv,
+ struct nouveau_bo *bo, unsigned domain,
+ unsigned base, unsigned size,
+ unsigned offset, unsigned words, const uint32_t *data);
+
+/* nv50_vbo.c */
+void nv50_draw_vbo(struct pipe_context *, const struct pipe_draw_info *);
+
+void *
+nv50_vertex_state_create(struct pipe_context *pipe,
+ unsigned num_elements,
+ const struct pipe_vertex_element *elements);
+void
+nv50_vertex_state_delete(struct pipe_context *pipe, void *hwcso);
+
+void nv50_vertex_arrays_validate(struct nv50_context *nv50);
+
+/* nv50_push.c */
+void nv50_push_vbo(struct nv50_context *, const struct pipe_draw_info *);
+
+/* nv84_video.c */
+struct pipe_video_codec *
+nv84_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ);
+
+struct pipe_video_buffer *
+nv84_video_buffer_create(struct pipe_context *pipe,
+ const struct pipe_video_buffer *template);
+
+int
+nv84_screen_get_video_param(struct pipe_screen *pscreen,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint,
+ enum pipe_video_cap param);
+
+boolean
+nv84_screen_video_supported(struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_video_profile profile,
+ enum pipe_video_entrypoint entrypoint);
+
+/* nv98_video.c */
+struct pipe_video_codec *
+nv98_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ);
+
+struct pipe_video_buffer *
+nv98_video_buffer_create(struct pipe_context *pipe,
+ const struct pipe_video_buffer *template);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_debug.h b/src/gallium/drivers/nouveau/nv50/nv50_debug.h
new file mode 100644
index 00000000000..f3dee621519
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_debug.h
@@ -0,0 +1,25 @@
+
+#ifndef __NV50_DEBUG_H__
+#define __NV50_DEBUG_H__
+
+#include <stdio.h>
+
+#include "util/u_debug.h"
+
+#define NV50_DEBUG_MISC 0x0001
+#define NV50_DEBUG_SHADER 0x0100
+#define NV50_DEBUG_PROG_IR 0x0200
+#define NV50_DEBUG_PROG_RA 0x0400
+#define NV50_DEBUG_PROG_CFLOW 0x0800
+#define NV50_DEBUG_PROG_ALL 0x1f00
+
+#define NV50_DEBUG 0
+
+#define NOUVEAU_ERR(fmt, args...) \
+ fprintf(stderr, "%s:%d - "fmt, __FUNCTION__, __LINE__, ##args)
+
+#define NV50_DBGMSG(ch, args...) \
+ if ((NV50_DEBUG) & (NV50_DEBUG_##ch)) \
+ debug_printf(args)
+
+#endif /* __NV50_DEBUG_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_defs.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_defs.xml.h
new file mode 100644
index 00000000000..2e42843fa56
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_defs.xml.h
@@ -0,0 +1,200 @@
+#ifndef NV50_DEFS_XML
+#define NV50_DEFS_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/nv50_defs.xml ( 7783 bytes, from 2013-02-14 13:56:25)
+- ./rnndb/copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nvchipsets.xml ( 3704 bytes, from 2012-08-18 12:48:55)
+
+Copyright (C) 2006-2013 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#define NV50_VSTATUS_IDLE 0x00000000
+#define NV50_VSTATUS_BUSY 0x00000001
+#define NV50_VSTATUS_UNK2 0x00000002
+#define NV50_VSTATUS_WAITING 0x00000003
+#define NV50_VSTATUS_BLOCKED 0x00000005
+#define NV50_VSTATUS_FAULTED 0x00000006
+#define NV50_VSTATUS_PAUSED 0x00000007
+#define NV50_SURFACE_FORMAT_BITMAP 0x0000001c
+#define NV50_SURFACE_FORMAT_UNK1D 0x0000001d
+#define NV50_SURFACE_FORMAT_RGBA32_FLOAT 0x000000c0
+#define NV50_SURFACE_FORMAT_RGBA32_SINT 0x000000c1
+#define NV50_SURFACE_FORMAT_RGBA32_UINT 0x000000c2
+#define NV50_SURFACE_FORMAT_RGBX32_FLOAT 0x000000c3
+#define NV50_SURFACE_FORMAT_RGBX32_SINT 0x000000c4
+#define NV50_SURFACE_FORMAT_RGBX32_UINT 0x000000c5
+#define NV50_SURFACE_FORMAT_RGBA16_UNORM 0x000000c6
+#define NV50_SURFACE_FORMAT_RGBA16_SNORM 0x000000c7
+#define NV50_SURFACE_FORMAT_RGBA16_SINT 0x000000c8
+#define NV50_SURFACE_FORMAT_RGBA16_UINT 0x000000c9
+#define NV50_SURFACE_FORMAT_RGBA16_FLOAT 0x000000ca
+#define NV50_SURFACE_FORMAT_RG32_FLOAT 0x000000cb
+#define NV50_SURFACE_FORMAT_RG32_SINT 0x000000cc
+#define NV50_SURFACE_FORMAT_RG32_UINT 0x000000cd
+#define NV50_SURFACE_FORMAT_RGBX16_FLOAT 0x000000ce
+#define NV50_SURFACE_FORMAT_BGRA8_UNORM 0x000000cf
+#define NV50_SURFACE_FORMAT_BGRA8_SRGB 0x000000d0
+#define NV50_SURFACE_FORMAT_RGB10_A2_UNORM 0x000000d1
+#define NV50_SURFACE_FORMAT_RGB10_A2_UINT 0x000000d2
+#define NV50_SURFACE_FORMAT_RGBA8_UNORM 0x000000d5
+#define NV50_SURFACE_FORMAT_RGBA8_SRGB 0x000000d6
+#define NV50_SURFACE_FORMAT_RGBA8_SNORM 0x000000d7
+#define NV50_SURFACE_FORMAT_RGBA8_SINT 0x000000d8
+#define NV50_SURFACE_FORMAT_RGBA8_UINT 0x000000d9
+#define NV50_SURFACE_FORMAT_RG16_UNORM 0x000000da
+#define NV50_SURFACE_FORMAT_RG16_SNORM 0x000000db
+#define NV50_SURFACE_FORMAT_RG16_SINT 0x000000dc
+#define NV50_SURFACE_FORMAT_RG16_UINT 0x000000dd
+#define NV50_SURFACE_FORMAT_RG16_FLOAT 0x000000de
+#define NV50_SURFACE_FORMAT_BGR10_A2_UNORM 0x000000df
+#define NV50_SURFACE_FORMAT_R11G11B10_FLOAT 0x000000e0
+#define NV50_SURFACE_FORMAT_R32_SINT 0x000000e3
+#define NV50_SURFACE_FORMAT_R32_UINT 0x000000e4
+#define NV50_SURFACE_FORMAT_R32_FLOAT 0x000000e5
+#define NV50_SURFACE_FORMAT_BGRX8_UNORM 0x000000e6
+#define NV50_SURFACE_FORMAT_BGRX8_SRGB 0x000000e7
+#define NV50_SURFACE_FORMAT_B5G6R5_UNORM 0x000000e8
+#define NV50_SURFACE_FORMAT_BGR5_A1_UNORM 0x000000e9
+#define NV50_SURFACE_FORMAT_RG8_UNORM 0x000000ea
+#define NV50_SURFACE_FORMAT_RG8_SNORM 0x000000eb
+#define NV50_SURFACE_FORMAT_RG8_SINT 0x000000ec
+#define NV50_SURFACE_FORMAT_RG8_UINT 0x000000ed
+#define NV50_SURFACE_FORMAT_R16_UNORM 0x000000ee
+#define NV50_SURFACE_FORMAT_R16_SNORM 0x000000ef
+#define NV50_SURFACE_FORMAT_R16_SINT 0x000000f0
+#define NV50_SURFACE_FORMAT_R16_UINT 0x000000f1
+#define NV50_SURFACE_FORMAT_R16_FLOAT 0x000000f2
+#define NV50_SURFACE_FORMAT_R8_UNORM 0x000000f3
+#define NV50_SURFACE_FORMAT_R8_SNORM 0x000000f4
+#define NV50_SURFACE_FORMAT_R8_SINT 0x000000f5
+#define NV50_SURFACE_FORMAT_R8_UINT 0x000000f6
+#define NV50_SURFACE_FORMAT_A8_UNORM 0x000000f7
+#define NV50_SURFACE_FORMAT_BGR5_X1_UNORM 0x000000f8
+#define NV50_SURFACE_FORMAT_RGBX8_UNORM 0x000000f9
+#define NV50_SURFACE_FORMAT_RGBX8_SRGB 0x000000fa
+#define NV50_SURFACE_FORMAT_BGR5_X1_UNORM_UNKFB 0x000000fb
+#define NV50_SURFACE_FORMAT_BGR5_X1_UNORM_UNKFC 0x000000fc
+#define NV50_SURFACE_FORMAT_BGRX8_UNORM_UNKFD 0x000000fd
+#define NV50_SURFACE_FORMAT_BGRX8_UNORM_UNKFE 0x000000fe
+#define NV50_SURFACE_FORMAT_Y32_UINT_UNKFF 0x000000ff
+#define NV50_ZETA_FORMAT_Z32_FLOAT 0x0000000a
+#define NV50_ZETA_FORMAT_Z16_UNORM 0x00000013
+#define NV50_ZETA_FORMAT_S8_Z24_UNORM 0x00000014
+#define NV50_ZETA_FORMAT_Z24_X8_UNORM 0x00000015
+#define NV50_ZETA_FORMAT_Z24_S8_UNORM 0x00000016
+#define NV50_ZETA_FORMAT_Z24_C8_UNORM 0x00000018
+#define NV50_ZETA_FORMAT_Z32_S8_X24_FLOAT 0x00000019
+#define NV50_ZETA_FORMAT_Z24_X8_S8_C8_X16_UNORM 0x0000001d
+#define NV50_ZETA_FORMAT_Z32_X8_C8_X16_FLOAT 0x0000001e
+#define NV50_ZETA_FORMAT_Z32_S8_C8_X16_FLOAT 0x0000001f
+#define NVE4_IMAGE_FORMAT_RGBA32_FLOAT 0x00000002
+#define NVE4_IMAGE_FORMAT_RGBA32_SINT 0x00000003
+#define NVE4_IMAGE_FORMAT_RGBA32_UINT 0x00000004
+#define NVE4_IMAGE_FORMAT_RGBA16_UNORM 0x00000008
+#define NVE4_IMAGE_FORMAT_RGBA16_SNORM 0x00000009
+#define NVE4_IMAGE_FORMAT_RGBA16_SINT 0x0000000a
+#define NVE4_IMAGE_FORMAT_RGBA16_UINT 0x0000000b
+#define NVE4_IMAGE_FORMAT_RGBA16_FLOAT 0x0000000c
+#define NVE4_IMAGE_FORMAT_RG32_FLOAT 0x0000000d
+#define NVE4_IMAGE_FORMAT_RG32_SINT 0x0000000e
+#define NVE4_IMAGE_FORMAT_RG32_UINT 0x0000000f
+#define NVE4_IMAGE_FORMAT_RGB10_A2_UNORM 0x00000013
+#define NVE4_IMAGE_FORMAT_RGB10_A2_UINT 0x00000015
+#define NVE4_IMAGE_FORMAT_RGBA8_UNORM 0x00000018
+#define NVE4_IMAGE_FORMAT_RGBA8_SNORM 0x0000001a
+#define NVE4_IMAGE_FORMAT_RGBA8_SINT 0x0000001b
+#define NVE4_IMAGE_FORMAT_RGBA8_UINT 0x0000001c
+#define NVE4_IMAGE_FORMAT_RG16_UNORM 0x0000001d
+#define NVE4_IMAGE_FORMAT_RG16_SNORM 0x0000001e
+#define NVE4_IMAGE_FORMAT_RG16_SINT 0x0000001f
+#define NVE4_IMAGE_FORMAT_RG16_UINT 0x00000020
+#define NVE4_IMAGE_FORMAT_RG16_FLOAT 0x00000021
+#define NVE4_IMAGE_FORMAT_R11G11B10_FLOAT 0x00000024
+#define NVE4_IMAGE_FORMAT_R32_SINT 0x00000027
+#define NVE4_IMAGE_FORMAT_R32_UINT 0x00000028
+#define NVE4_IMAGE_FORMAT_R32_FLOAT 0x00000029
+#define NVE4_IMAGE_FORMAT_RG8_UNORM 0x0000002e
+#define NVE4_IMAGE_FORMAT_RG8_SNORM 0x0000002f
+#define NVE4_IMAGE_FORMAT_RG8_SINT 0x00000030
+#define NVE4_IMAGE_FORMAT_RG8_UINT 0x00000031
+#define NVE4_IMAGE_FORMAT_R16_UNORM 0x00000032
+#define NVE4_IMAGE_FORMAT_R16_SNORM 0x00000033
+#define NVE4_IMAGE_FORMAT_R16_SINT 0x00000034
+#define NVE4_IMAGE_FORMAT_R16_UINT 0x00000035
+#define NVE4_IMAGE_FORMAT_R16_FLOAT 0x00000036
+#define NVE4_IMAGE_FORMAT_R8_UNORM 0x00000037
+#define NVE4_IMAGE_FORMAT_R8_SNORM 0x00000038
+#define NVE4_IMAGE_FORMAT_R8_SINT 0x00000039
+#define NVE4_IMAGE_FORMAT_R8_UINT 0x0000003a
+#define NV50_QUERY__SIZE 0x00000010
+#define NV50_QUERY_COUNTER 0x00000000
+
+#define NV50_QUERY_RES 0x00000004
+
+#define NV50_QUERY_TIME 0x00000008
+
+
+#endif /* NV50_DEFS_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_draw.c b/src/gallium/drivers/nouveau/nv50/nv50_draw.c
new file mode 100644
index 00000000000..fa68cd8ee6a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_draw.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "draw/draw_pipe.h"
+
+#include "nv50/nv50_context.h"
+
+/* Wrapper around the draw module's draw_stage that keeps a pointer back
+ * to the owning nv50 context. */
+struct nv50_render_stage {
+   struct draw_stage stage; /* must be first: nv50_render_stage() downcasts */
+   struct nv50_context *nv50;
+};
+
+/* Downcast a draw_stage to its enclosing nv50_render_stage; valid
+ * because the draw_stage is the struct's first member. */
+static INLINE struct nv50_render_stage *
+nv50_render_stage(struct draw_stage *stage)
+{
+   struct nv50_render_stage *rs = (struct nv50_render_stage *)stage;
+   return rs;
+}
+
+/* draw_stage point callback -- software point rendering is not
+ * implemented here; just log that it was reached. */
+static void
+nv50_render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* draw_stage line callback -- software line rendering is not
+ * implemented here; just log that it was reached. */
+static void
+nv50_render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* draw_stage triangle callback -- software triangle rendering is not
+ * implemented here; just log that it was reached. */
+static void
+nv50_render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* draw_stage flush callback -- intentionally empty: this stage buffers
+ * nothing that would need flushing. */
+static void
+nv50_render_flush(struct draw_stage *stage, unsigned flags)
+{
+}
+
+/* draw_stage reset_stipple_counter callback -- not implemented; just
+ * log that it was reached. */
+static void
+nv50_render_reset_stipple_counter(struct draw_stage *stage)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* draw_stage destroy callback: frees the nv50_render_stage wrapper
+ * allocated by nv50_draw_render_stage(). */
+static void
+nv50_render_destroy(struct draw_stage *stage)
+{
+   FREE(stage);
+}
+
+/* Allocate and wire up the draw-module rasterize stage that hands
+ * primitives back to the nv50 driver (the point/line/tri hooks are
+ * currently logging stubs).  Returns NULL on allocation failure. */
+struct draw_stage *
+nv50_draw_render_stage(struct nv50_context *nv50)
+{
+   struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage);
+
+   /* CALLOC_STRUCT can return NULL on OOM; the previous code
+    * dereferenced rs unconditionally */
+   if (!rs)
+      return NULL;
+
+   rs->nv50 = nv50;
+   rs->stage.draw = nv50->draw;
+   rs->stage.destroy = nv50_render_destroy;
+   rs->stage.point = nv50_render_point;
+   rs->stage.line = nv50_render_line;
+   rs->stage.tri = nv50_render_tri;
+   rs->stage.flush = nv50_render_flush;
+   rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter;
+
+   return &rs->stage;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
new file mode 100644
index 00000000000..0a7e812ba13
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#if NOUVEAU_DRIVER == 0xc0
+# include "nvc0/nvc0_screen.h"
+# include "nvc0/nvc0_3d.xml.h"
+#else
+# include "nv50/nv50_screen.h"
+# include "nv50/nv50_3d.xml.h"
+#endif
+#include "nv50/nv50_texture.xml.h"
+#include "nv50/nv50_defs.xml.h"
+
+#include "pipe/p_defines.h"
+
/* Abbreviated usage masks:
 * T: texturing
 * R: render target
 * B: render target, blendable
 * C: render target (color), blendable only on nvc0
 * D: scanout/display target, blendable
 * Z: depth/stencil
 * V: vertex fetch
 * I: image / surface, implies T
 */
#define U_V PIPE_BIND_VERTEX_BUFFER
#define U_T PIPE_BIND_SAMPLER_VIEW
#define U_I PIPE_BIND_SHADER_RESOURCE | PIPE_BIND_COMPUTE_RESOURCE
#define U_TR PIPE_BIND_RENDER_TARGET | U_T
#define U_IR U_TR | U_I
#define U_TB PIPE_BIND_BLENDABLE | U_TR
#define U_IB PIPE_BIND_BLENDABLE | U_IR
#define U_TD PIPE_BIND_SCANOUT | PIPE_BIND_DISPLAY_TARGET | U_TB
#define U_TZ PIPE_BIND_DEPTH_STENCIL | U_T
#define U_TV U_V | U_T
#define U_TRV U_V | U_TR
#define U_IRV U_V | U_IR
#define U_TBV U_V | U_TB
#define U_IBV U_V | U_IB
#define U_TDV U_V | U_TD
/* Generation-dependent masks: "C" formats are blendable only on nvc0, and
 * the lower-case "t" in tV marks texturing that only nvc0 supports. */
#if NOUVEAU_DRIVER == 0xc0
# define U_TC U_TB
# define U_IC U_IB
# define U_TCV U_TBV
# define U_ICV U_IBV
# define U_tV U_TV
#else
# define U_TC U_TR
# define U_IC U_IR
# define U_TCV U_TRV
# define U_ICV U_IRV
# define U_tV U_V
#endif

/* Placeholder codes for formats with no RT / zeta hardware encoding. */
#define NV50_SURFACE_FORMAT_NONE 0
#define NV50_ZETA_FORMAT_NONE 0

/* for vertex buffers: 3-component fetches reuse the 4-component TIC codes */
#define NV50_TIC_0_FMT_8_8_8 NV50_TIC_0_FMT_8_8_8_8
#define NV50_TIC_0_FMT_16_16_16 NV50_TIC_0_FMT_16_16_16_16
#define NV50_TIC_0_FMT_32_32_32 NVC0_TIC_0_FMT_32_32_32

/* The vertex attrib format encodings differ between nv50 and nvc0. */
#if NOUVEAU_DRIVER == 0xc0
# define NVXX_3D_VAF_SIZE(s) NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_##s
# define NVXX_3D_VAF_TYPE(t) NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_##t
#else
# define NVXX_3D_VAF_SIZE(s) NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_##s
# define NVXX_3D_VAF_TYPE(t) NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_##t
#endif

/* Table entry builders.  TBLENT_A_ fills all three words: surface (RT)
 * format, TIC texture word, and the vertex-fetch encoding ("br" in bit 31
 * presumably selects BGRA component reversal on fetch — TODO confirm
 * against the 3D class headers).  TBLENT_B_ is for formats never fetched
 * as vertex data and leaves the vertex word 0. */
#define TBLENT_A_(pf, sf, r, g, b, a, t0, t1, t2, t3, sz, u, br) \
   [PIPE_FORMAT_##pf] = { \
      sf, \
      (NV50_TIC_MAP_##r << NV50_TIC_0_MAPR__SHIFT) | \
      (NV50_TIC_MAP_##g << NV50_TIC_0_MAPG__SHIFT) | \
      (NV50_TIC_MAP_##b << NV50_TIC_0_MAPB__SHIFT) | \
      (NV50_TIC_MAP_##a << NV50_TIC_0_MAPA__SHIFT) | \
      (NV50_TIC_TYPE_##t0 << NV50_TIC_0_TYPE0__SHIFT) | \
      (NV50_TIC_TYPE_##t1 << NV50_TIC_0_TYPE1__SHIFT) | \
      (NV50_TIC_TYPE_##t2 << NV50_TIC_0_TYPE2__SHIFT) | \
      (NV50_TIC_TYPE_##t3 << NV50_TIC_0_TYPE3__SHIFT) | \
      NV50_TIC_0_FMT_##sz, \
      NVXX_3D_VAF_SIZE(sz) | \
      NVXX_3D_VAF_TYPE(t0) | (br << 31), \
      U_##u \
   }

#define TBLENT_B_(pf, sf, r, g, b, a, t0, t1, t2, t3, sz, u) \
   [PIPE_FORMAT_##pf] = { \
      sf, \
      (NV50_TIC_MAP_##r << NV50_TIC_0_MAPR__SHIFT) | \
      (NV50_TIC_MAP_##g << NV50_TIC_0_MAPG__SHIFT) | \
      (NV50_TIC_MAP_##b << NV50_TIC_0_MAPB__SHIFT) | \
      (NV50_TIC_MAP_##a << NV50_TIC_0_MAPA__SHIFT) | \
      (NV50_TIC_TYPE_##t0 << NV50_TIC_0_TYPE0__SHIFT) | \
      (NV50_TIC_TYPE_##t1 << NV50_TIC_0_TYPE1__SHIFT) | \
      (NV50_TIC_TYPE_##t2 << NV50_TIC_0_TYPE2__SHIFT) | \
      (NV50_TIC_TYPE_##t3 << NV50_TIC_0_TYPE3__SHIFT) | \
      NV50_TIC_0_FMT_##sz, 0, U_##u \
   }

/* Four-component color entries, with (A) / without (B) vertex fetch word. */
#define C4A(p, n, r, g, b, a, t, s, u, br) \
   TBLENT_A_(p, NV50_SURFACE_FORMAT_##n, r, g, b, a, t, t, t, t, s, u, br)
#define C4B(p, n, r, g, b, a, t, s, u) \
   TBLENT_B_(p, NV50_SURFACE_FORMAT_##n, r, g, b, a, t, t, t, t, s, u)

/* Depth/stencil entries.  ZXB and ZSB currently expand identically (the
 * typed channel first, the rest sampled as UINT); SZB is for the
 * stencil-first layouts. */
#define ZXB(p, n, r, g, b, a, t, s, u) \
   TBLENT_B_(p, NV50_ZETA_FORMAT_##n, \
             r, g, b, ONE_FLOAT, t, UINT, UINT, UINT, s, u)
#define ZSB(p, n, r, g, b, a, t, s, u) \
   TBLENT_B_(p, NV50_ZETA_FORMAT_##n, \
             r, g, b, ONE_FLOAT, t, UINT, UINT, UINT, s, u)
#define SZB(p, n, r, g, b, a, t, s, u) \
   TBLENT_B_(p, NV50_ZETA_FORMAT_##n, \
             r, g, b, ONE_FLOAT, UINT, t, UINT, UINT, s, u)

/* Three-component entries: alpha forced to ONE (float or int variant). */
#define F3A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, g, b, ONE_FLOAT, t, s, u, 0)
#define I3A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, g, b, ONE_INT, t, s, u, 0)
#define F3B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, g, b, ONE_FLOAT, t, s, u)
#define I3B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, g, b, ONE_INT, t, s, u)

/* Two-component entries: blue forced to ZERO, alpha to ONE. */
#define F2A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, g, ZERO, ONE_FLOAT, t, s, u, 0)
#define I2A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, g, ZERO, ONE_INT, t, s, u, 0)
#define F2B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, g, ZERO, ONE_FLOAT, t, s, u)
#define I2B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, g, ZERO, ONE_INT, t, s, u)

/* One-component entries: green/blue forced to ZERO, alpha to ONE. */
#define F1A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, ZERO, ZERO, ONE_FLOAT, t, s, u, 0)
#define I1A(p, n, r, g, b, a, t, s, u) \
   C4A(p, n, r, ZERO, ZERO, ONE_INT, t, s, u, 0)
#define F1B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, ZERO, ZERO, ONE_FLOAT, t, s, u)
#define I1B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, r, ZERO, ZERO, ONE_INT, t, s, u)

/* Alpha-only entries: RGB forced to ZERO. */
#define A1B(p, n, r, g, b, a, t, s, u) \
   C4B(p, n, ZERO, ZERO, ZERO, a, t, s, u)
+
+#if NOUVEAU_DRIVER == 0xc0
+const struct nvc0_format nvc0_format_table[PIPE_FORMAT_COUNT] =
+#else
+const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
+#endif
+{
+ C4A(B8G8R8A8_UNORM, BGRA8_UNORM, C2, C1, C0, C3, UNORM, 8_8_8_8, TDV, 1),
+ F3A(B8G8R8X8_UNORM, BGRX8_UNORM, C2, C1, C0, xx, UNORM, 8_8_8_8, TD),
+ C4A(B8G8R8A8_SRGB, BGRA8_SRGB, C2, C1, C0, C3, UNORM, 8_8_8_8, TD, 1),
+ F3A(B8G8R8X8_SRGB, BGRX8_SRGB, C2, C1, C0, xx, UNORM, 8_8_8_8, TD),
+ C4A(R8G8B8A8_UNORM, RGBA8_UNORM, C0, C1, C2, C3, UNORM, 8_8_8_8, IBV, 0),
+ F3A(R8G8B8X8_UNORM, RGBX8_UNORM, C0, C1, C2, xx, UNORM, 8_8_8_8, TB),
+ C4A(R8G8B8A8_SRGB, RGBA8_SRGB, C0, C1, C2, C3, UNORM, 8_8_8_8, TB, 0),
+ F3B(R8G8B8X8_SRGB, RGBX8_SRGB, C0, C1, C2, xx, UNORM, 8_8_8_8, TB),
+
+ ZXB(Z16_UNORM, Z16_UNORM, C0, C0, C0, xx, UNORM, Z16, TZ),
+ ZXB(Z32_FLOAT, Z32_FLOAT, C0, C0, C0, xx, FLOAT, Z32, TZ),
+ ZXB(Z24X8_UNORM, Z24_X8_UNORM, C0, C0, C0, xx, UNORM, Z24_X8, TZ),
+ ZSB(Z24_UNORM_S8_UINT, Z24_S8_UNORM, C0, C0, C0, xx, UNORM, Z24_S8, TZ),
+ ZSB(X24S8_UINT, NONE, C1, C1, C1, xx, UNORM, Z24_S8, T),
+ SZB(S8_UINT_Z24_UNORM, S8_Z24_UNORM, C1, C1, C1, xx, UNORM, S8_Z24, TZ),
+ SZB(S8X24_UINT, NONE, C0, C0, C0, xx, UNORM, S8_Z24, T),
+ ZSB(Z32_FLOAT_S8X24_UINT, Z32_S8_X24_FLOAT, C0, C0, C0, xx, FLOAT,
+ Z32_S8_X24, TZ),
+ ZSB(X32_S8X24_UINT, NONE, C1, C1, C1, xx, FLOAT, Z32_S8_X24, T),
+
+ F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, T),
+ C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TB),
+ F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TB),
+ C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T),
+ F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T),
+ F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T),
+
+ C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2,
+ IBV, 0),
+ C4A(B10G10R10A2_UNORM, BGR10_A2_UNORM, C2, C1, C0, C3, UNORM, 10_10_10_2,
+ TBV, 1),
+ C4A(R10G10B10A2_SNORM, NONE, C0, C1, C2, C3, SNORM, 10_10_10_2, TV, 0),
+ C4A(B10G10R10A2_SNORM, NONE, C2, C1, C0, C3, SNORM, 10_10_10_2, TV, 1),
+
+ F3B(R11G11B10_FLOAT, R11G11B10_FLOAT, C0, C1, C2, xx, FLOAT, 11_11_10, IB),
+
+ F3B(L8_UNORM, R8_UNORM, C0, C0, C0, xx, UNORM, 8, TB),
+ F3B(L8_SRGB, R8_UNORM, C0, C0, C0, xx, UNORM, 8, TB),
+ F3B(L8_SNORM, R8_SNORM, C0, C0, C0, xx, SNORM, 8, TC),
+ I3B(L8_SINT, R8_SINT, C0, C0, C0, xx, SINT, 8, TR),
+ I3B(L8_UINT, R8_UINT, C0, C0, C0, xx, UINT, 8, TR),
+ F3B(L16_UNORM, R16_UNORM, C0, C0, C0, xx, UNORM, 16, TC),
+ F3B(L16_SNORM, R16_SNORM, C0, C0, C0, xx, SNORM, 16, TC),
+ F3B(L16_FLOAT, R16_FLOAT, C0, C0, C0, xx, FLOAT, 16, TB),
+ I3B(L16_SINT, R16_SINT, C0, C0, C0, xx, SINT, 16, TR),
+ I3B(L16_UINT, R16_UINT, C0, C0, C0, xx, UINT, 16, TR),
+ F3B(L32_FLOAT, R32_FLOAT, C0, C0, C0, xx, FLOAT, 32, TB),
+ I3B(L32_SINT, R32_SINT, C0, C0, C0, xx, SINT, 32, TR),
+ I3B(L32_UINT, R32_UINT, C0, C0, C0, xx, UINT, 32, TR),
+
+ C4B(I8_UNORM, R8_UNORM, C0, C0, C0, C0, UNORM, 8, TR),
+ C4B(I8_SNORM, R8_SNORM, C0, C0, C0, C0, SNORM, 8, TR),
+ C4B(I8_SINT, R8_SINT, C0, C0, C0, C0, SINT, 8, TR),
+ C4B(I8_UINT, R8_UINT, C0, C0, C0, C0, UINT, 8, TR),
+ C4B(I16_UNORM, R16_UNORM, C0, C0, C0, C0, UNORM, 16, TR),
+ C4B(I16_SNORM, R16_SNORM, C0, C0, C0, C0, SNORM, 16, TR),
+ C4B(I16_FLOAT, R16_FLOAT, C0, C0, C0, C0, FLOAT, 16, TR),
+ C4B(I16_SINT, R16_SINT, C0, C0, C0, C0, SINT, 16, TR),
+ C4B(I16_UINT, R16_UINT, C0, C0, C0, C0, UINT, 16, TR),
+ C4B(I32_FLOAT, R32_FLOAT, C0, C0, C0, C0, FLOAT, 32, TR),
+ C4B(I32_SINT, R32_SINT, C0, C0, C0, C0, SINT, 32, TR),
+ C4B(I32_UINT, R32_UINT, C0, C0, C0, C0, UINT, 32, TR),
+
+ A1B(A8_UNORM, A8_UNORM, xx, xx, xx, C0, UNORM, 8, TB),
+ A1B(A8_SNORM, R8_SNORM, xx, xx, xx, C0, SNORM, 8, T),
+ A1B(A8_SINT, R8_SINT, xx, xx, xx, C0, SINT, 8, T),
+ A1B(A8_UINT, R8_UINT, xx, xx, xx, C0, UINT, 8, T),
+ A1B(A16_UNORM, R16_UNORM, xx, xx, xx, C0, UNORM, 16, T),
+ A1B(A16_SNORM, R16_SNORM, xx, xx, xx, C0, SNORM, 16, T),
+ A1B(A16_FLOAT, R16_FLOAT, xx, xx, xx, C0, FLOAT, 16, T),
+ A1B(A16_SINT, R16_SINT, xx, xx, xx, C0, SINT, 16, T),
+ A1B(A16_UINT, R16_UINT, xx, xx, xx, C0, UINT, 16, T),
+ A1B(A32_FLOAT, R32_FLOAT, xx, xx, xx, C0, FLOAT, 32, T),
+ A1B(A32_SINT, R32_SINT, xx, xx, xx, C0, SINT, 32, T),
+ A1B(A32_UINT, R32_UINT, xx, xx, xx, C0, UINT, 32, T),
+
+ C4B(L4A4_UNORM, NONE, C0, C0, C0, C1, UNORM, 4_4, T),
+ C4B(L8A8_UNORM, RG8_UNORM, C0, C0, C0, C1, UNORM, 8_8, T),
+ C4B(L8A8_SNORM, RG8_SNORM, C0, C0, C0, C1, SNORM, 8_8, T),
+ C4B(L8A8_SRGB, RG8_UNORM, C0, C0, C0, C1, UNORM, 8_8, T),
+ C4B(L8A8_SINT, RG8_SINT, C0, C0, C0, C1, SINT, 8_8, T),
+ C4B(L8A8_UINT, RG8_UINT, C0, C0, C0, C1, UINT, 8_8, T),
+ C4B(L16A16_UNORM, RG16_UNORM, C0, C0, C0, C1, UNORM, 16_16, T),
+ C4B(L16A16_SNORM, RG16_SNORM, C0, C0, C0, C1, SNORM, 16_16, T),
+ C4B(L16A16_FLOAT, RG16_FLOAT, C0, C0, C0, C1, FLOAT, 16_16, T),
+ C4B(L16A16_SINT, RG16_SINT, C0, C0, C0, C1, SINT, 16_16, T),
+ C4B(L16A16_UINT, RG16_UINT, C0, C0, C0, C1, UINT, 16_16, T),
+ C4B(L32A32_FLOAT, RG32_FLOAT, C0, C0, C0, C1, FLOAT, 32_32, T),
+ C4B(L32A32_SINT, RG32_SINT, C0, C0, C0, C1, SINT, 32_32, T),
+ C4B(L32A32_UINT, RG32_UINT, C0, C0, C0, C1, UINT, 32_32, T),
+
+ F3B(DXT1_RGB, NONE, C0, C1, C2, xx, UNORM, DXT1, T),
+ F3B(DXT1_SRGB, NONE, C0, C1, C2, xx, UNORM, DXT1, T),
+ C4B(DXT1_RGBA, NONE, C0, C1, C2, C3, UNORM, DXT1, T),
+ C4B(DXT1_SRGBA, NONE, C0, C1, C2, C3, UNORM, DXT1, T),
+ C4B(DXT3_RGBA, NONE, C0, C1, C2, C3, UNORM, DXT3, T),
+ C4B(DXT3_SRGBA, NONE, C0, C1, C2, C3, UNORM, DXT3, T),
+ C4B(DXT5_RGBA, NONE, C0, C1, C2, C3, UNORM, DXT5, T),
+ C4B(DXT5_SRGBA, NONE, C0, C1, C2, C3, UNORM, DXT5, T),
+
+ F1B(RGTC1_UNORM, NONE, C0, xx, xx, xx, UNORM, RGTC1, T),
+ F1B(RGTC1_SNORM, NONE, C0, xx, xx, xx, SNORM, RGTC1, T),
+ F2B(RGTC2_UNORM, NONE, C0, C1, xx, xx, UNORM, RGTC2, T),
+ F2B(RGTC2_SNORM, NONE, C0, C1, xx, xx, SNORM, RGTC2, T),
+ F3B(LATC1_UNORM, NONE, C0, C0, C0, xx, UNORM, RGTC1, T),
+ F3B(LATC1_SNORM, NONE, C0, C0, C0, xx, SNORM, RGTC1, T),
+ C4B(LATC2_UNORM, NONE, C0, C0, C0, C1, UNORM, RGTC2, T),
+ C4B(LATC2_SNORM, NONE, C0, C0, C0, C1, SNORM, RGTC2, T),
+
+ C4A(R32G32B32A32_FLOAT, RGBA32_FLOAT, C0, C1, C2, C3, FLOAT, 32_32_32_32,
+ IBV, 0),
+ C4A(R32G32B32A32_UNORM, NONE, C0, C1, C2, C3, UNORM, 32_32_32_32, TV, 0),
+ C4A(R32G32B32A32_SNORM, NONE, C0, C1, C2, C3, SNORM, 32_32_32_32, TV, 0),
+ C4A(R32G32B32A32_SINT, RGBA32_SINT, C0, C1, C2, C3, SINT, 32_32_32_32,
+ IRV, 0),
+ C4A(R32G32B32A32_UINT, RGBA32_UINT, C0, C1, C2, C3, UINT, 32_32_32_32,
+ IRV, 0),
+ F3B(R32G32B32X32_FLOAT, RGBX32_FLOAT, C0, C1, C2, xx, FLOAT, 32_32_32_32, TB),
+ I3B(R32G32B32X32_SINT, RGBX32_SINT, C0, C1, C2, xx, SINT, 32_32_32_32, TR),
+ I3B(R32G32B32X32_UINT, RGBX32_UINT, C0, C1, C2, xx, UINT, 32_32_32_32, TR),
+
+ F2A(R32G32_FLOAT, RG32_FLOAT, C0, C1, xx, xx, FLOAT, 32_32, IBV),
+ F2A(R32G32_UNORM, NONE, C0, C1, xx, xx, UNORM, 32_32, TV),
+ F2A(R32G32_SNORM, NONE, C0, C1, xx, xx, SNORM, 32_32, TV),
+ I2A(R32G32_SINT, RG32_SINT, C0, C1, xx, xx, SINT, 32_32, IRV),
+ I2A(R32G32_UINT, RG32_UINT, C0, C1, xx, xx, UINT, 32_32, IRV),
+
+ F1A(R32_FLOAT, R32_FLOAT, C0, xx, xx, xx, FLOAT, 32, IBV),
+ F1A(R32_UNORM, NONE, C0, xx, xx, xx, UNORM, 32, TV),
+ F1A(R32_SNORM, NONE, C0, xx, xx, xx, SNORM, 32, TV),
+ I1A(R32_SINT, R32_SINT, C0, xx, xx, xx, SINT, 32, IRV),
+ I1A(R32_UINT, R32_UINT, C0, xx, xx, xx, UINT, 32, IRV),
+
+ C4A(R16G16B16A16_FLOAT, RGBA16_FLOAT, C0, C1, C2, C3, FLOAT, 16_16_16_16,
+ IBV, 0),
+ C4A(R16G16B16A16_UNORM, RGBA16_UNORM, C0, C1, C2, C3, UNORM, 16_16_16_16,
+ ICV, 0),
+ C4A(R16G16B16A16_SNORM, RGBA16_SNORM, C0, C1, C2, C3, SNORM, 16_16_16_16,
+ ICV, 0),
+ C4A(R16G16B16A16_SINT, RGBA16_SINT, C0, C1, C2, C3, SINT, 16_16_16_16,
+ IRV, 0),
+ C4A(R16G16B16A16_UINT, RGBA16_UINT, C0, C1, C2, C3, UINT, 16_16_16_16,
+ IRV, 0),
+ F3B(R16G16B16X16_FLOAT, RGBX16_FLOAT, C0, C1, C2, xx, FLOAT, 16_16_16_16, TB),
+ F3B(R16G16B16X16_UNORM, RGBA16_UNORM, C0, C1, C2, xx, UNORM, 16_16_16_16, T),
+ F3B(R16G16B16X16_SNORM, RGBA16_SNORM, C0, C1, C2, xx, SNORM, 16_16_16_16, T),
+ I3B(R16G16B16X16_SINT, RGBA16_SINT, C0, C1, C2, xx, SINT, 16_16_16_16, T),
+ I3B(R16G16B16X16_UINT, RGBA16_UINT, C0, C1, C2, xx, UINT, 16_16_16_16, T),
+
+ F2A(R16G16_FLOAT, RG16_FLOAT, C0, C1, xx, xx, FLOAT, 16_16, IBV),
+ F2A(R16G16_UNORM, RG16_UNORM, C0, C1, xx, xx, UNORM, 16_16, ICV),
+ F2A(R16G16_SNORM, RG16_SNORM, C0, C1, xx, xx, SNORM, 16_16, ICV),
+ I2A(R16G16_SINT, RG16_SINT, C0, C1, xx, xx, SINT, 16_16, IRV),
+ I2A(R16G16_UINT, RG16_UINT, C0, C1, xx, xx, UINT, 16_16, IRV),
+
+ F1A(R16_FLOAT, R16_FLOAT, C0, xx, xx, xx, FLOAT, 16, IBV),
+ F1A(R16_UNORM, R16_UNORM, C0, xx, xx, xx, UNORM, 16, ICV),
+ F1A(R16_SNORM, R16_SNORM, C0, xx, xx, xx, SNORM, 16, ICV),
+ I1A(R16_SINT, R16_SINT, C0, xx, xx, xx, SINT, 16, IRV),
+ I1A(R16_UINT, R16_UINT, C0, xx, xx, xx, UINT, 16, IRV),
+
+ C4A(R8G8B8A8_SNORM, RGBA8_SNORM, C0, C1, C2, C3, SNORM, 8_8_8_8, ICV, 0),
+ C4A(R8G8B8A8_SINT, RGBA8_SINT, C0, C1, C2, C3, SINT, 8_8_8_8, IRV, 0),
+ C4A(R8G8B8A8_UINT, RGBA8_UINT, C0, C1, C2, C3, UINT, 8_8_8_8, IRV, 0),
+ F3B(R8G8B8X8_SNORM, RGBA8_SNORM, C0, C1, C2, xx, SNORM, 8_8_8_8, T),
+ I3B(R8G8B8X8_SINT, RGBA8_SINT, C0, C1, C2, xx, SINT, 8_8_8_8, T),
+ I3B(R8G8B8X8_UINT, RGBA8_UINT, C0, C1, C2, xx, UINT, 8_8_8_8, T),
+
+ F2A(R8G8_UNORM, RG8_UNORM, C0, C1, xx, xx, UNORM, 8_8, IBV),
+ F2A(R8G8_SNORM, RG8_SNORM, C0, C1, xx, xx, SNORM, 8_8, ICV),
+ I2A(R8G8_SINT, RG8_SINT, C0, C1, xx, xx, SINT, 8_8, IRV),
+ I2A(R8G8_UINT, RG8_UINT, C0, C1, xx, xx, UINT, 8_8, IRV),
+
+ F1A(R8_UNORM, R8_UNORM, C0, xx, xx, xx, UNORM, 8, IBV),
+ F1A(R8_SNORM, R8_SNORM, C0, xx, xx, xx, SNORM, 8, ICV),
+ I1A(R8_SINT, R8_SINT, C0, xx, xx, xx, SINT, 8, IRV),
+ I1A(R8_UINT, R8_UINT, C0, xx, xx, xx, UINT, 8, IRV),
+
+ F3B(R8G8_B8G8_UNORM, NONE, C0, C1, C2, xx, UNORM, U8_YA8_V8_YB8, T),
+ F3B(G8R8_B8R8_UNORM, NONE, C1, C0, C2, xx, UNORM, U8_YA8_V8_YB8, T),
+ F3B(G8R8_G8B8_UNORM, NONE, C0, C1, C2, xx, UNORM, YA8_U8_YB8_V8, T),
+ F3B(R8G8_R8B8_UNORM, NONE, C1, C0, C2, xx, UNORM, YA8_U8_YB8_V8, T),
+
+ F1B(R1_UNORM, BITMAP, C0, xx, xx, xx, UNORM, BITMAP, T),
+
+ C4B(R4A4_UNORM, NONE, C0, ZERO, ZERO, C1, UNORM, 4_4, T),
+ C4B(R8A8_UNORM, NONE, C0, ZERO, ZERO, C1, UNORM, 8_8, T),
+ C4B(A4R4_UNORM, NONE, C1, ZERO, ZERO, C0, UNORM, 4_4, T),
+ C4B(A8R8_UNORM, NONE, C1, ZERO, ZERO, C0, UNORM, 8_8, T),
+
+ TBLENT_B_(R8SG8SB8UX8U_NORM, 0,
+ C0, C1, C2, ONE_FLOAT, SNORM, SNORM, UNORM, UNORM, 8_8_8_8, T),
+ TBLENT_B_(R5SG5SB6U_NORM, 0,
+ C0, C1, C2, ONE_FLOAT, SNORM, SNORM, UNORM, UNORM, 5_5_6, T),
+
+ /* vertex-only formats: */
+
+ C4A(R32G32B32A32_SSCALED, NONE, C0, C1, C2, C3, SSCALED, 32_32_32_32, V, 0),
+ C4A(R32G32B32A32_USCALED, NONE, C0, C1, C2, C3, USCALED, 32_32_32_32, V, 0),
+ F3A(R32G32B32_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 32_32_32, tV),
+ F3A(R32G32B32_UNORM, NONE, C0, C1, C2, xx, UNORM, 32_32_32, V),
+ F3A(R32G32B32_SNORM, NONE, C0, C1, C2, xx, SNORM, 32_32_32, V),
+ I3A(R32G32B32_SINT, NONE, C0, C1, C2, xx, SINT, 32_32_32, tV),
+ I3A(R32G32B32_UINT, NONE, C0, C1, C2, xx, UINT, 32_32_32, tV),
+ F3A(R32G32B32_SSCALED, NONE, C0, C1, C2, xx, SSCALED, 32_32_32, V),
+ F3A(R32G32B32_USCALED, NONE, C0, C1, C2, xx, USCALED, 32_32_32, V),
+ F2A(R32G32_SSCALED, NONE, C0, C1, xx, xx, SSCALED, 32_32, V),
+ F2A(R32G32_USCALED, NONE, C0, C1, xx, xx, USCALED, 32_32, V),
+ F1A(R32_SSCALED, NONE, C0, xx, xx, xx, SSCALED, 32, V),
+ F1A(R32_USCALED, NONE, C0, xx, xx, xx, USCALED, 32, V),
+
+ C4A(R16G16B16A16_SSCALED, NONE, C0, C1, C2, C3, SSCALED, 16_16_16_16, V, 0),
+ C4A(R16G16B16A16_USCALED, NONE, C0, C1, C2, C3, USCALED, 16_16_16_16, V, 0),
+ F3A(R16G16B16_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 16_16_16, V),
+ F3A(R16G16B16_UNORM, NONE, C0, C1, C2, xx, UNORM, 16_16_16, V),
+ F3A(R16G16B16_SNORM, NONE, C0, C1, C2, xx, SNORM, 16_16_16, V),
+ I3A(R16G16B16_SINT, NONE, C0, C1, C2, xx, SINT, 16_16_16, V),
+ I3A(R16G16B16_UINT, NONE, C0, C1, C2, xx, UINT, 16_16_16, V),
+ F3A(R16G16B16_SSCALED, NONE, C0, C1, C2, xx, SSCALED, 16_16_16, V),
+ F3A(R16G16B16_USCALED, NONE, C0, C1, C2, xx, USCALED, 16_16_16, V),
+ F2A(R16G16_SSCALED, NONE, C0, C1, xx, xx, SSCALED, 16_16, V),
+ F2A(R16G16_USCALED, NONE, C0, C1, xx, xx, USCALED, 16_16, V),
+ F1A(R16_SSCALED, NONE, C0, xx, xx, xx, SSCALED, 16, V),
+ F1A(R16_USCALED, NONE, C0, xx, xx, xx, USCALED, 16, V),
+
+ C4A(R8G8B8A8_SSCALED, NONE, C0, C1, C2, C3, SSCALED, 8_8_8_8, V, 0),
+ C4A(R8G8B8A8_USCALED, NONE, C0, C1, C2, C3, USCALED, 8_8_8_8, V, 0),
+ F3A(R8G8B8_UNORM, NONE, C0, C1, C2, xx, UNORM, 8_8_8, V),
+ F3A(R8G8B8_SNORM, NONE, C0, C1, C2, xx, SNORM, 8_8_8, V),
+ I2A(R8G8B8_SINT, NONE, C0, C1, C2, xx, SINT, 8_8_8, V),
+ I2A(R8G8B8_UINT, NONE, C0, C1, C2, xx, UINT, 8_8_8, V),
+ F3A(R8G8B8_SSCALED, NONE, C0, C1, C2, xx, SSCALED, 8_8_8, V),
+ F3A(R8G8B8_USCALED, NONE, C0, C1, C2, xx, USCALED, 8_8_8, V),
+ F2A(R8G8_SSCALED, NONE, C0, C1, xx, xx, SSCALED, 8_8, V),
+ F2A(R8G8_USCALED, NONE, C0, C1, xx, xx, USCALED, 8_8, V),
+ F1A(R8_SSCALED, NONE, C0, xx, xx, xx, SSCALED, 8, V),
+ F1A(R8_USCALED, NONE, C0, xx, xx, xx, USCALED, 8, V),
+
+ /* FIXED types: not supported natively, converted on VBO push */
+
+ C4B(R32G32B32A32_FIXED, NONE, C0, C1, C2, C3, FLOAT, 32_32_32_32, V),
+ F3B(R32G32B32_FIXED, NONE, C0, C1, C2, xx, FLOAT, 32_32_32, V),
+ F2B(R32G32_FIXED, NONE, C0, C1, xx, xx, FLOAT, 32_32, V),
+ F1B(R32_FIXED, NONE, C0, xx, xx, xx, FLOAT, 32, V),
+
+ C4B(R64G64B64A64_FLOAT, NONE, C0, C1, C2, C3, FLOAT, 32_32_32_32, V),
+ F3B(R64G64B64_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 32_32_32, V),
+ F2B(R64G64_FLOAT, NONE, C0, C1, xx, xx, FLOAT, 32_32, V),
+ F1B(R64_FLOAT, NONE, C0, xx, xx, xx, FLOAT, 32, V),
+};
+
#if 0
/* NOTE(review): compiled-out reference table — a direct PIPE_FORMAT -> RT /
 * zeta code map; the live mapping is built via the format table above.
 * Kept for reference only; confirm it is still wanted before removing. */
const uint8_t nv50_rt_format_map[PIPE_FORMAT_COUNT] =
{
   [PIPE_FORMAT_Z16_UNORM] = NV50_ZETA_FORMAT_Z16_UNORM,
   [PIPE_FORMAT_Z24X8_UNORM] = NV50_ZETA_FORMAT_Z24_X8_UNORM,
   [PIPE_FORMAT_Z24_UNORM_S8_UINT] = NV50_ZETA_FORMAT_Z24_S8_UNORM,
   [PIPE_FORMAT_S8_UINT_Z24_UNORM] = NV50_ZETA_FORMAT_S8_Z24_UNORM,
   [PIPE_FORMAT_Z32_FLOAT] = NV50_ZETA_FORMAT_Z32_FLOAT,
   [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = NV50_ZETA_FORMAT_Z32_S8_X24_FLOAT,

   [PIPE_FORMAT_R1_UNORM] = NV50_SURFACE_FORMAT_BITMAP,

   [PIPE_FORMAT_R32G32B32A32_FLOAT] = NV50_SURFACE_FORMAT_RGBA32_FLOAT,
   [PIPE_FORMAT_R32G32B32X32_FLOAT] = NV50_SURFACE_FORMAT_RGBX32_FLOAT,
   [PIPE_FORMAT_R32G32B32A32_SINT] = NV50_SURFACE_FORMAT_RGBA32_SINT,
   [PIPE_FORMAT_R32G32B32X32_SINT] = NV50_SURFACE_FORMAT_RGBX32_SINT,
   [PIPE_FORMAT_R32G32B32A32_UINT] = NV50_SURFACE_FORMAT_RGBA32_UINT,
   [PIPE_FORMAT_R32G32B32X32_UINT] = NV50_SURFACE_FORMAT_RGBX32_UINT,

   [PIPE_FORMAT_R16G16B16A16_FLOAT] = NV50_SURFACE_FORMAT_RGBA16_FLOAT,
   [PIPE_FORMAT_R16G16B16X16_FLOAT] = NV50_SURFACE_FORMAT_RGBX16_FLOAT,
   [PIPE_FORMAT_R16G16B16A16_UNORM] = NV50_SURFACE_FORMAT_RGBA16_UNORM,
   [PIPE_FORMAT_R16G16B16A16_SNORM] = NV50_SURFACE_FORMAT_RGBA16_SNORM,
   [PIPE_FORMAT_R16G16B16A16_SINT] = NV50_SURFACE_FORMAT_RGBA16_SINT,
   [PIPE_FORMAT_R16G16B16A16_UINT] = NV50_SURFACE_FORMAT_RGBA16_UINT,

   [PIPE_FORMAT_B8G8R8A8_UNORM] = NV50_SURFACE_FORMAT_BGRA8_UNORM,
   [PIPE_FORMAT_R8G8B8A8_UNORM] = NV50_SURFACE_FORMAT_RGBA8_UNORM,
   [PIPE_FORMAT_B8G8R8X8_UNORM] = NV50_SURFACE_FORMAT_BGRX8_UNORM,
   [PIPE_FORMAT_R8G8B8X8_UNORM] = NV50_SURFACE_FORMAT_RGBX8_UNORM,
   [PIPE_FORMAT_B8G8R8A8_SRGB] = NV50_SURFACE_FORMAT_BGRA8_SRGB,
   [PIPE_FORMAT_R8G8B8A8_SRGB] = NV50_SURFACE_FORMAT_RGBA8_SRGB,
   [PIPE_FORMAT_B8G8R8X8_SRGB] = NV50_SURFACE_FORMAT_BGRX8_SRGB,
   [PIPE_FORMAT_R8G8B8X8_SRGB] = NV50_SURFACE_FORMAT_RGBX8_SRGB,
   [PIPE_FORMAT_R8G8B8A8_SNORM] = NV50_SURFACE_FORMAT_RGBA8_SNORM,
   [PIPE_FORMAT_R8G8B8A8_SINT] = NV50_SURFACE_FORMAT_RGBA8_SINT,
   [PIPE_FORMAT_R8G8B8A8_UINT] = NV50_SURFACE_FORMAT_RGBA8_UINT,

   [PIPE_FORMAT_R11G11B10_FLOAT] = NV50_SURFACE_FORMAT_R11G11B10_FLOAT,

   [PIPE_FORMAT_B10G10R10A2_UNORM] = NV50_SURFACE_FORMAT_BGR10_A2_UNORM,
   [PIPE_FORMAT_R10G10B10A2_UNORM] = NV50_SURFACE_FORMAT_RGB10_A2_UNORM,
   [PIPE_FORMAT_R10G10B10A2_UINT] = NV50_SURFACE_FORMAT_RGB10_A2_UINT,

   [PIPE_FORMAT_B5G6R5_UNORM] = NV50_SURFACE_FORMAT_B5G6R5_UNORM,

   [PIPE_FORMAT_B5G5R5A1_UNORM] = NV50_SURFACE_FORMAT_BGR5_A1_UNORM,
   [PIPE_FORMAT_B5G5R5X1_UNORM] = NV50_SURFACE_FORMAT_BGR5_X1_UNORM,

   [PIPE_FORMAT_R32G32_FLOAT] = NV50_SURFACE_FORMAT_RG32_FLOAT,
   [PIPE_FORMAT_R32G32_SINT] = NV50_SURFACE_FORMAT_RG32_SINT,
   [PIPE_FORMAT_R32G32_UINT] = NV50_SURFACE_FORMAT_RG32_UINT,

   [PIPE_FORMAT_R16G16_FLOAT] = NV50_SURFACE_FORMAT_RG16_FLOAT,
   [PIPE_FORMAT_R16G16_UNORM] = NV50_SURFACE_FORMAT_RG16_UNORM,
   [PIPE_FORMAT_R16G16_SNORM] = NV50_SURFACE_FORMAT_RG16_SNORM,
   [PIPE_FORMAT_R16G16_SINT] = NV50_SURFACE_FORMAT_RG16_SINT,
   [PIPE_FORMAT_R16G16_UINT] = NV50_SURFACE_FORMAT_RG16_UINT,

   [PIPE_FORMAT_R8G8_UNORM] = NV50_SURFACE_FORMAT_RG8_UNORM,
   [PIPE_FORMAT_R8G8_SNORM] = NV50_SURFACE_FORMAT_RG8_SNORM,
   [PIPE_FORMAT_R8G8_SINT] = NV50_SURFACE_FORMAT_RG8_SINT,
   [PIPE_FORMAT_R8G8_UINT] = NV50_SURFACE_FORMAT_RG8_UINT,

   [PIPE_FORMAT_R32_FLOAT] = NV50_SURFACE_FORMAT_R32_FLOAT,
   [PIPE_FORMAT_R32_SINT] = NV50_SURFACE_FORMAT_R32_SINT,
   [PIPE_FORMAT_R32_UINT] = NV50_SURFACE_FORMAT_R32_UINT,

   [PIPE_FORMAT_R16_FLOAT] = NV50_SURFACE_FORMAT_R16_FLOAT,
   [PIPE_FORMAT_R16_UNORM] = NV50_SURFACE_FORMAT_R16_UNORM,
   [PIPE_FORMAT_R16_SNORM] = NV50_SURFACE_FORMAT_R16_SNORM,
   [PIPE_FORMAT_R16_SINT] = NV50_SURFACE_FORMAT_R16_SINT,
   [PIPE_FORMAT_R16_UINT] = NV50_SURFACE_FORMAT_R16_UINT,

   [PIPE_FORMAT_R8_UNORM] = NV50_SURFACE_FORMAT_R8_UNORM,
   [PIPE_FORMAT_R8_SNORM] = NV50_SURFACE_FORMAT_R8_SNORM,
   [PIPE_FORMAT_R8_SINT] = NV50_SURFACE_FORMAT_R8_SINT,
   [PIPE_FORMAT_R8_UINT] = NV50_SURFACE_FORMAT_R8_UINT,

   [PIPE_FORMAT_A8_UNORM] = NV50_SURFACE_FORMAT_A8_UNORM
};
#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
new file mode 100644
index 00000000000..513d8f96aac
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_resource.h"
+
/* Pick a tile_mode value for a surface of nx x ny x nz blocks.
 *
 * One nibble of the mode selects the tile height (8..128 rows), a higher
 * nibble the tile depth (1..32 layers).  nx is currently unused.
 */
uint32_t
nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz)
{
   uint32_t mode;

   if (ny > 64)
      mode = 0x040; /* height 128 tiles */
   else if (ny > 32)
      mode = 0x030; /* height 64 tiles */
   else if (ny > 16)
      mode = 0x020; /* height 32 tiles */
   else if (ny > 8)
      mode = 0x010; /* height 16 tiles */
   else
      mode = 0x000;

   if (nz == 1)
      return mode;

   /* 3D tiling limits the tile height. */
   if (mode > 0x020)
      mode = 0x020;

   if (nz > 16 && mode < 0x020)
      return mode | 0x500; /* depth 32 tiles */
   if (nz > 8)
      return mode | 0x400; /* depth 16 tiles */
   if (nz > 4)
      return mode | 0x300; /* depth 8 tiles */
   if (nz > 2)
      return mode | 0x200; /* depth 4 tiles */
   return mode | 0x100; /* depth 2 tiles */
}
+
/* nv50 variant: choose tile dimensions with the block-row count doubled
 * (presumably to bias toward taller tiles on this generation — TODO
 * confirm against the tiling docs). */
static uint32_t
nv50_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz)
{
   const unsigned padded_ny = ny * 2;
   return nv50_tex_choose_tile_dims_helper(nx, padded_ny, nz);
}
+
/* Choose the VRAM storage (memory) type for a miptree.
 *
 * Returns 0 for plain linear/untiled storage.  Non-zero values are memtype
 * codes: depth/stencil formats use fixed per-format codes, color formats
 * select a generic code from the texel size in bits and the multisample
 * level.  NOTE(review): the magic values mirror the hardware/kernel memtype
 * table — confirm against the nouveau kernel headers before changing.
 */
static uint32_t
nv50_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
{
   /* Combined log2 per-axis sample counts (0 = single-sampled). */
   const unsigned ms = mt->ms_x + mt->ms_y;

   uint32_t tile_flags;

   /* Explicitly linear resources and cursors must stay untiled. */
   if (unlikely(mt->base.base.flags & NOUVEAU_RESOURCE_FLAG_LINEAR))
      return 0;
   if (unlikely(mt->base.base.bind & PIPE_BIND_CURSOR))
      return 0;

   switch (mt->base.base.format) {
   case PIPE_FORMAT_Z16_UNORM:
      tile_flags = 0x6c + ms;
      break;
   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
      tile_flags = 0x18 + ms;
      break;
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      tile_flags = 0x128 + ms;
      break;
   case PIPE_FORMAT_Z32_FLOAT:
      tile_flags = 0x40 + ms;
      break;
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      tile_flags = 0x60 + ms;
      break;
   default:
      /* Color formats: select by texel size in bits. */
      switch (util_format_get_blocksizebits(mt->base.base.format)) {
      case 128:
         assert(ms < 3); /* no 8x multisampling for 128-bit formats */
         tile_flags = 0x74;
         break;
      case 64:
         switch (ms) {
         case 2: tile_flags = 0xfc; break;
         case 3: tile_flags = 0xfd; break;
         default:
            tile_flags = 0x70;
            break;
         }
         break;
      case 32:
         if (mt->base.base.bind & PIPE_BIND_SCANOUT) {
            /* Scanout surfaces get their own memtype (single-sampled only). */
            assert(ms == 0);
            tile_flags = 0x7a;
         } else {
            switch (ms) {
            case 2: tile_flags = 0xf8; break;
            case 3: tile_flags = 0xf9; break;
            default:
               tile_flags = 0x70;
               break;
            }
         }
         break;
      case 16:
      case 8:
         tile_flags = 0x70;
         break;
      default:
         return 0;
      }
      if (mt->base.base.bind & PIPE_BIND_CURSOR)
         tile_flags = 0;
   }

   /* The 0x180 bits are only kept when compression was requested. */
   if (!compressed)
      tile_flags &= ~0x180;

   return tile_flags;
}
+
/* pipe_screen::resource_destroy for miptrees: drop the GPU buffer and
 * fence references, update the memory statistics and free the CPU-side
 * struct. */
void
nv50_miptree_destroy(struct pipe_screen *pscreen, struct pipe_resource *pt)
{
   struct nv50_miptree *mt = nv50_miptree(pt);

   /* Release our reference on the backing buffer object. */
   nouveau_bo_ref(NULL, &mt->base.bo);

   /* Drop the read and write fence references. */
   nouveau_fence_ref(NULL, &mt->base.fence);
   nouveau_fence_ref(NULL, &mt->base.fence_wr);

   /* Statistics: one texture object and its bytes go away. */
   NOUVEAU_DRV_STAT(nouveau_screen(pscreen), tex_obj_current_count, -1);
   NOUVEAU_DRV_STAT(nouveau_screen(pscreen), tex_obj_current_bytes,
                    -(uint64_t)mt->total_size);

   FREE(mt);
}
+
+boolean
+nv50_miptree_get_handle(struct pipe_screen *pscreen,
+ struct pipe_resource *pt,
+ struct winsys_handle *whandle)
+{
+ struct nv50_miptree *mt = nv50_miptree(pt);
+ unsigned stride;
+
+ if (!mt || !mt->base.bo)
+ return FALSE;
+
+ stride = mt->level[0].pitch;
+
+ return nouveau_screen_bo_get_handle(pscreen,
+ mt->base.bo,
+ stride,
+ whandle);
+}
+
/* Resource vtbl shared by all nv50 miptrees: wires the generic u_resource
 * entry points to the miptree implementations in this file, with default
 * helpers for the operations that need no special handling. */
const struct u_resource_vtbl nv50_miptree_vtbl =
{
   nv50_miptree_get_handle, /* get_handle */
   nv50_miptree_destroy, /* resource_destroy */
   nv50_miptree_transfer_map, /* transfer_map */
   u_default_transfer_flush_region, /* transfer_flush_region */
   nv50_miptree_transfer_unmap, /* transfer_unmap */
   u_default_transfer_inline_write /* transfer_inline_write */
};
+
/* Derive the hardware multisample mode and the per-axis log2 sample
 * footprint (ms_x/ms_y) from nr_samples.  Assumes mt was zero-initialized,
 * so fields not set here stay 0.  Returns FALSE for unsupported sample
 * counts. */
static INLINE boolean
nv50_miptree_init_ms_mode(struct nv50_miptree *mt)
{
   switch (mt->base.base.nr_samples) {
   case 8:
      mt->ms_mode = NV50_3D_MULTISAMPLE_MODE_MS8;
      mt->ms_x = 2; /* 4x2 sample footprint */
      mt->ms_y = 1;
      break;
   case 4:
      mt->ms_mode = NV50_3D_MULTISAMPLE_MODE_MS4;
      mt->ms_x = 1; /* 2x2 sample footprint */
      mt->ms_y = 1;
      break;
   case 2:
      mt->ms_mode = NV50_3D_MULTISAMPLE_MODE_MS2;
      mt->ms_x = 1; /* 2x1 sample footprint (ms_y stays 0) */
      break;
   case 1:
   case 0:
      mt->ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
      break;
   default:
      NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
      return FALSE;
   }
   return TRUE;
}
+
/* Try to lay the resource out pitch-linear.  Only single-level, 2D,
 * single-sample, non-depth/stencil resources qualify; returns FALSE when a
 * tiled layout must be used instead. */
boolean
nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align)
{
   struct pipe_resource *pt = &mt->base.base;
   const unsigned blocksize = util_format_get_blocksize(pt->format);
   unsigned h = pt->height0;

   /* Depth/stencil is never laid out linearly here. */
   if (util_format_is_depth_or_stencil(pt->format))
      return FALSE;

   /* No mipmaps, 3D slices or array layers in linear layouts. */
   if ((pt->last_level > 0) || (pt->depth0 > 1) || (pt->array_size > 1))
      return FALSE;
   /* No multisampling either. */
   if (mt->ms_x | mt->ms_y)
      return FALSE;

   mt->level[0].pitch = align(pt->width0 * blocksize, pitch_align);

   /* Account for very generous prefetch (allocate size as if tiled). */
   h = MAX2(h, 8);
   h = util_next_power_of_two(h);

   mt->total_size = mt->level[0].pitch * h;

   return TRUE;
}
+
/* Layout for video surfaces: a single level with a fixed 0x20 tile mode,
 * 64-byte-aligned pitch and the height rounded up to 16 rows. */
static void
nv50_miptree_init_layout_video(struct nv50_miptree *mt)
{
   const struct pipe_resource *pt = &mt->base.base;
   const unsigned blocksize = util_format_get_blocksize(pt->format);

   /* Video surfaces are unmipmapped, single-sampled and uncompressed. */
   assert(pt->last_level == 0);
   assert(mt->ms_x == 0 && mt->ms_y == 0);
   assert(!util_format_is_compressed(pt->format));

   mt->layout_3d = pt->target == PIPE_TEXTURE_3D;

   mt->level[0].tile_mode = 0x20;
   mt->level[0].pitch = align(pt->width0 * blocksize, 64);
   mt->total_size = align(pt->height0, 16) * mt->level[0].pitch * (mt->layout_3d ? pt->depth0 : 1);

   if (pt->array_size > 1) {
      /* Each array layer starts on a tile boundary. */
      mt->layer_stride = align(mt->total_size, NV50_TILE_SIZE(0x20));
      mt->total_size = mt->layer_stride * pt->array_size;
   }
}
+
/* Standard tiled layout: place each mip level at an offset aligned to its
 * own tile dimensions; for array textures each layer repeats the whole mip
 * chain at layer_stride intervals. */
static void
nv50_miptree_init_layout_tiled(struct nv50_miptree *mt)
{
   struct pipe_resource *pt = &mt->base.base;
   unsigned w, h, d, l;
   const unsigned blocksize = util_format_get_blocksize(pt->format);

   mt->layout_3d = pt->target == PIPE_TEXTURE_3D;

   /* Dimensions include the multisample footprint. */
   w = pt->width0 << mt->ms_x;
   h = pt->height0 << mt->ms_y;

   /* For 3D textures, a mipmap is spanned by all the layers, for array
    * textures and cube maps, each layer contains its own mipmaps.
    */
   d = mt->layout_3d ? pt->depth0 : 1;

   for (l = 0; l <= pt->last_level; ++l) {
      struct nv50_miptree_level *lvl = &mt->level[l];
      unsigned tsx, tsy, tsz;
      unsigned nbx = util_format_get_nblocksx(pt->format, w);
      unsigned nby = util_format_get_nblocksy(pt->format, h);

      lvl->offset = mt->total_size;

      /* Smaller levels may use smaller tile dimensions. */
      lvl->tile_mode = nv50_tex_choose_tile_dims(nbx, nby, d);

      tsx = NV50_TILE_SIZE_X(lvl->tile_mode); /* x is tile row pitch in bytes */
      tsy = NV50_TILE_SIZE_Y(lvl->tile_mode);
      tsz = NV50_TILE_SIZE_Z(lvl->tile_mode);

      lvl->pitch = align(nbx * blocksize, tsx);

      /* Level size rounded up to whole tiles in y and z. */
      mt->total_size += lvl->pitch * align(nby, tsy) * align(d, tsz);

      w = u_minify(w, 1);
      h = u_minify(h, 1);
      d = u_minify(d, 1);
   }

   if (pt->array_size > 1) {
      /* Layers are tile-aligned copies of the whole mip chain. */
      mt->layer_stride = align(mt->total_size,
                               NV50_TILE_SIZE(mt->level[0].tile_mode));
      mt->total_size = mt->layer_stride * pt->array_size;
   }
}
+
+/* Create a miptree resource: choose storage type and layout (video /
+ * tiled / linear), then allocate the backing VRAM BO.
+ * Returns NULL on allocation or layout failure.
+ */
+struct pipe_resource *
+nv50_miptree_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *templ)
+{
+   struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
+   struct pipe_resource *pt = &mt->base.base;
+   int ret;
+   union nouveau_bo_config bo_config;
+   uint32_t bo_flags;
+
+   if (!mt)
+      return NULL;
+
+   mt->base.vtbl = &nv50_miptree_vtbl;
+   *pt = *templ;
+   pipe_reference_init(&pt->reference, 1);
+   pt->screen = pscreen;
+
+   if (pt->bind & PIPE_BIND_LINEAR)
+      pt->flags |= NOUVEAU_RESOURCE_FLAG_LINEAR;
+
+   /* Memtype 0 means linear (non-tiled) storage. */
+   bo_config.nv50.memtype = nv50_mt_choose_storage_type(mt, TRUE);
+
+   if (!nv50_miptree_init_ms_mode(mt)) {
+      FREE(mt);
+      return NULL;
+   }
+
+   if (unlikely(pt->flags & NV50_RESOURCE_FLAG_VIDEO)) {
+      nv50_miptree_init_layout_video(mt);
+      if (pt->flags & NV50_RESOURCE_FLAG_NOALLOC) {
+         /* BO allocation done by client */
+         return pt;
+      }
+   } else
+   if (bo_config.nv50.memtype != 0) {
+      nv50_miptree_init_layout_tiled(mt);
+   } else
+   if (!nv50_miptree_init_layout_linear(mt, 64)) {
+      FREE(mt);
+      return NULL;
+   }
+   bo_config.nv50.tile_mode = mt->level[0].tile_mode;
+
+   /* Scanout/cursor buffers must be physically contiguous. */
+   bo_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP;
+   if (mt->base.base.bind & (PIPE_BIND_CURSOR | PIPE_BIND_DISPLAY_TARGET))
+      bo_flags |= NOUVEAU_BO_CONTIG;
+
+   ret = nouveau_bo_new(dev, bo_flags, 4096, mt->total_size, &bo_config,
+                        &mt->base.bo);
+   if (ret) {
+      FREE(mt);
+      return NULL;
+   }
+   mt->base.domain = NOUVEAU_BO_VRAM;
+   mt->base.address = mt->base.bo->offset;
+
+   return pt;
+}
+
+/* Wrap an externally allocated BO (e.g. a shared/scanout buffer from the
+ * winsys) in a miptree. Only 2D, non-mipmapped, single-layer textures are
+ * accepted; the BO's stride and tile mode come from the handle/BO config.
+ */
+struct pipe_resource *
+nv50_miptree_from_handle(struct pipe_screen *pscreen,
+                         const struct pipe_resource *templ,
+                         struct winsys_handle *whandle)
+{
+   struct nv50_miptree *mt;
+   unsigned stride;
+
+   /* only supports 2D, non-mipmapped textures for the moment */
+   if ((templ->target != PIPE_TEXTURE_2D &&
+        templ->target != PIPE_TEXTURE_RECT) ||
+       templ->last_level != 0 ||
+       templ->depth0 != 1 ||
+       templ->array_size > 1)
+      return NULL;
+
+   mt = CALLOC_STRUCT(nv50_miptree);
+   if (!mt)
+      return NULL;
+
+   mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride);
+   if (mt->base.bo == NULL) {
+      FREE(mt);
+      return NULL;
+   }
+   mt->base.domain = NOUVEAU_BO_VRAM;
+   mt->base.address = mt->base.bo->offset;
+
+   mt->base.base = *templ;
+   mt->base.vtbl = &nv50_miptree_vtbl;
+   pipe_reference_init(&mt->base.base.reference, 1);
+   mt->base.base.screen = pscreen;
+   mt->level[0].pitch = stride;
+   mt->level[0].offset = 0;
+   mt->level[0].tile_mode = mt->base.bo->config.nv50.tile_mode;
+
+   /* no need to adjust bo reference count */
+   return &mt->base.base;
+}
+
+
+/* Offset of zslice @z from start of level @l. */
+INLINE unsigned
+nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
+{
+   const struct pipe_resource *pt = &mt->base.base;
+
+   /* log2 tile depth (z) and tile height (y) for this level's tile mode */
+   unsigned tds = NV50_TILE_SHIFT_Z(mt->level[l].tile_mode);
+   unsigned ths = NV50_TILE_SHIFT_Y(mt->level[l].tile_mode);
+
+   unsigned nby = util_format_get_nblocksy(pt->format,
+                                           u_minify(pt->height0, l));
+
+   /* to next 2D tile slice within a 3D tile */
+   unsigned stride_2d = NV50_TILE_SIZE_2D(mt->level[l].tile_mode);
+
+   /* to slice in the next (in z direction) 3D tile */
+   unsigned stride_3d = (align(nby, (1 << ths)) * mt->level[l].pitch) << tds;
+
+   /* position within the current 3D tile + number of whole 3D tiles before */
+   return (z & ((1 << tds) - 1)) * stride_2d + (z >> tds) * stride_3d;
+}
+
+/* Surface functions.
+ */
+
+/* Create a surface view of one miptree level / layer range.
+ * Adds a reference to the miptree; caller owns the returned surface.
+ * Returns NULL on allocation failure.
+ */
+struct nv50_surface *
+nv50_surface_from_miptree(struct nv50_miptree *mt,
+                          const struct pipe_surface *templ)
+{
+   struct pipe_surface *ps;
+   struct nv50_surface *ns = CALLOC_STRUCT(nv50_surface);
+   if (!ns)
+      return NULL;
+   ps = &ns->base;
+
+   pipe_reference_init(&ps->reference, 1);
+   pipe_resource_reference(&ps->texture, &mt->base.base);
+
+   ps->format = templ->format;
+   ps->writable = templ->writable;
+   ps->u.tex.level = templ->u.tex.level;
+   ps->u.tex.first_layer = templ->u.tex.first_layer;
+   ps->u.tex.last_layer = templ->u.tex.last_layer;
+
+   ns->width = u_minify(mt->base.base.width0, ps->u.tex.level);
+   ns->height = u_minify(mt->base.base.height0, ps->u.tex.level);
+   ns->depth = ps->u.tex.last_layer - ps->u.tex.first_layer + 1;
+   ns->offset = mt->level[templ->u.tex.level].offset;
+
+   /* comment says there are going to be removed, but they're used by the st */
+   ps->width = ns->width;
+   ps->height = ns->height;
+
+   /* internal dimensions include the multisample scaling */
+   ns->width <<= mt->ms_x;
+   ns->height <<= mt->ms_y;
+
+   return ns;
+}
+
+/* pipe_context::create_surface for miptrees: builds the surface view and
+ * adjusts the byte offset for the requested first layer (z-slice offset
+ * for 3D layouts, layer stride otherwise).
+ */
+struct pipe_surface *
+nv50_miptree_surface_new(struct pipe_context *pipe,
+                         struct pipe_resource *pt,
+                         const struct pipe_surface *templ)
+{
+   struct nv50_miptree *mt = nv50_miptree(pt);
+   struct nv50_surface *ns = nv50_surface_from_miptree(mt, templ);
+   if (!ns)
+      return NULL;
+   ns->base.context = pipe;
+
+   if (ns->base.u.tex.first_layer) {
+      const unsigned l = ns->base.u.tex.level;
+      const unsigned z = ns->base.u.tex.first_layer;
+
+      if (mt->layout_3d) {
+         ns->offset += nv50_mt_zslice_offset(mt, l, z);
+
+         /* TODO: switch to depth 1 tiles; but actually this shouldn't happen */
+         if (ns->depth > 1 &&
+             (z & (NV50_TILE_SIZE_Z(mt->level[l].tile_mode) - 1)))
+            NOUVEAU_ERR("Creating unsupported 3D surface !\n");
+      } else {
+         ns->offset += mt->layer_stride * z;
+      }
+   }
+
+   return &ns->base;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
new file mode 100644
index 00000000000..73df71c61e2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv50_program.h"
+#include "nv50/nv50_context.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+/* Number of set bits in the low nibble of @val; bits above bit 3 are
+ * ignored.
+ */
+static INLINE unsigned
+bitcount4(const uint32_t val)
+{
+   uint32_t nibble = val & 0xf;
+   unsigned bits = 0;
+
+   while (nibble) {
+      nibble &= nibble - 1; /* clear the lowest set bit */
+      ++bits;
+   }
+   return bits;
+}
+
+/* Assign hardware input/output slots for a vertex (or geometry) program:
+ * inputs are packed component-by-component into attribute slots, system
+ * values are appended, and outputs are packed into result slots while
+ * recording the hw positions of PSIZE/CLIPDIST/EDGEFLAG/BCOLOR.
+ * Always returns 0.
+ */
+static int
+nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
+{
+   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
+   unsigned i, n, c;
+
+   /* inputs: one slot per enabled component */
+   n = 0;
+   for (i = 0; i < info->numInputs; ++i) {
+      prog->in[i].id = i;
+      prog->in[i].sn = info->in[i].sn;
+      prog->in[i].si = info->in[i].si;
+      prog->in[i].hw = n;
+      prog->in[i].mask = info->in[i].mask;
+
+      /* attrs[] is a bitfield of 4 bits per attribute (VP_ATTR_EN) */
+      prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
+
+      for (c = 0; c < 4; ++c)
+         if (info->in[i].mask & (1 << c))
+            info->in[i].slot[c] = n++;
+   }
+   prog->in_nr = info->numInputs;
+
+   for (i = 0; i < info->numSysVals; ++i) {
+      switch (info->sv[i].sn) {
+      case TGSI_SEMANTIC_INSTANCEID:
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
+         continue;
+      case TGSI_SEMANTIC_VERTEXID:
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
+         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12;
+         continue;
+      default:
+         break;
+      }
+   }
+
+   /*
+    * Corner case: VP has no inputs, but we will still need to submit data to
+    * draw it. HW will shout at us and won't draw anything if we don't enable
+    * any input, so let's just pretend it's the first one.
+    */
+   if (prog->vp.attrs[0] == 0 &&
+       prog->vp.attrs[1] == 0 &&
+       prog->vp.attrs[2] == 0)
+      prog->vp.attrs[0] |= 0xf;
+
+   /* VertexID before InstanceID */
+   if (info->io.vertexId < info->numSysVals)
+      info->sv[info->io.vertexId].slot[0] = n++;
+   if (info->io.instanceId < info->numSysVals)
+      info->sv[info->io.instanceId].slot[0] = n++;
+
+   /* outputs: one result slot per enabled component */
+   n = 0;
+   for (i = 0; i < info->numOutputs; ++i) {
+      switch (info->out[i].sn) {
+      case TGSI_SEMANTIC_PSIZE:
+         prog->vp.psiz = i;
+         break;
+      case TGSI_SEMANTIC_CLIPDIST:
+         prog->vp.clpd[info->out[i].si] = n;
+         break;
+      case TGSI_SEMANTIC_EDGEFLAG:
+         prog->vp.edgeflag = i;
+         break;
+      case TGSI_SEMANTIC_BCOLOR:
+         prog->vp.bfc[info->out[i].si] = i;
+         break;
+      default:
+         break;
+      }
+      prog->out[i].id = i;
+      prog->out[i].sn = info->out[i].sn;
+      prog->out[i].si = info->out[i].si;
+      prog->out[i].hw = n;
+      prog->out[i].mask = info->out[i].mask;
+
+      for (c = 0; c < 4; ++c)
+         if (info->out[i].mask & (1 << c))
+            info->out[i].slot[c] = n++;
+   }
+   prog->out_nr = info->numOutputs;
+   prog->max_out = n;
+
+   /* convert psiz from an output index into the hw slot, if present */
+   if (prog->vp.psiz < info->numOutputs)
+      prog->vp.psiz = prog->out[prog->vp.psiz].hw;
+
+   return 0;
+}
+
+/* Assign hardware interpolant/result slots for a fragment program.
+ * Position and face inputs are handled specially (they do not use the
+ * RESULT_MAP); remaining inputs are packed with non-flat (interpolated)
+ * inputs first, then flat inputs. Outputs map to color results except
+ * for fragdepth/samplemask, which get trailing slots. Always returns 0.
+ */
+static int
+nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
+{
+   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
+   unsigned i, n, m, c;
+   unsigned nvary;
+   unsigned nflat;
+   unsigned nintp = 0;
+
+   /* count recorded non-flat inputs */
+   for (m = 0, i = 0; i < info->numInputs; ++i) {
+      switch (info->in[i].sn) {
+      case TGSI_SEMANTIC_POSITION:
+      case TGSI_SEMANTIC_FACE:
+         continue;
+      default:
+         m += info->in[i].flat ? 0 : 1;
+         break;
+      }
+   }
+   /* careful: id may be != i in info->in[prog->in[i].id] */
+
+   /* Fill prog->in[] so that non-flat inputs are first and
+    * kick out special inputs that don't use the RESULT_MAP.
+    */
+   for (n = 0, i = 0; i < info->numInputs; ++i) {
+      if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
+         /* position mask lives in the top byte of fp.interp */
+         prog->fp.interp |= info->in[i].mask << 24;
+         for (c = 0; c < 4; ++c)
+            if (info->in[i].mask & (1 << c))
+               info->in[i].slot[c] = nintp++;
+      } else
+      if (info->in[i].sn == TGSI_SEMANTIC_FACE) {
+         info->in[i].slot[0] = 255;
+      } else {
+         /* flat inputs go after the m non-flat ones */
+         unsigned j = info->in[i].flat ? m++ : n++;
+
+         if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
+            prog->vp.bfc[info->in[i].si] = j;
+
+         prog->in[j].id = i;
+         prog->in[j].mask = info->in[i].mask;
+         prog->in[j].sn = info->in[i].sn;
+         prog->in[j].si = info->in[i].si;
+         prog->in[j].linear = info->in[i].linear;
+
+         prog->in_nr++;
+      }
+   }
+   /* ensure position w (bit 3) is always counted */
+   if (!(prog->fp.interp & (8 << 24))) {
+      ++nintp;
+      prog->fp.interp |= 8 << 24;
+   }
+
+   /* assign interpolant slots in prog->in[] order */
+   for (i = 0; i < prog->in_nr; ++i) {
+      int j = prog->in[i].id;
+
+      prog->in[i].hw = nintp;
+      for (c = 0; c < 4; ++c)
+         if (prog->in[i].mask & (1 << c))
+            info->in[j].slot[c] = nintp++;
+   }
+   /* (n == m) if m never increased, i.e. no flat inputs */
+   nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
+   nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
+   nvary = nintp - nflat;
+
+   prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
+   prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
+
+   /* put front/back colors right after HPOS */
+   prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
+   for (i = 0; i < 2; ++i)
+      if (prog->vp.bfc[i] < 0xff)
+         prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
+
+   /* FP outputs */
+
+   if (info->prop.fp.numColourResults > 1)
+      prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
+
+   for (i = 0; i < info->numOutputs; ++i) {
+      prog->out[i].id = i;
+      prog->out[i].sn = info->out[i].sn;
+      prog->out[i].si = info->out[i].si;
+      prog->out[i].mask = info->out[i].mask;
+
+      /* fragdepth/samplemask get their slots after the color results */
+      if (i == info->io.fragDepth || i == info->io.sampleMask)
+         continue;
+      prog->out[i].hw = info->out[i].si * 4;
+
+      for (c = 0; c < 4; ++c)
+         info->out[i].slot[c] = prog->out[i].hw + c;
+
+      prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
+   }
+
+   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
+      info->out[info->io.sampleMask].slot[0] = prog->max_out++;
+
+   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
+      info->out[info->io.fragDepth].slot[2] = prog->max_out++;
+
+   if (!prog->max_out)
+      prog->max_out = 4;
+
+   return 0;
+}
+
+/* Callback handed to the code generator: dispatch slot assignment by
+ * shader stage. Geometry programs share the vertex program path.
+ * Returns -1 for unhandled program types.
+ */
+static int
+nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
+{
+   if (info->type == PIPE_SHADER_FRAGMENT)
+      return nv50_fragprog_assign_slots(info);
+   if (info->type == PIPE_SHADER_VERTEX ||
+       info->type == PIPE_SHADER_GEOMETRY)
+      return nv50_vertprog_assign_slots(info);
+   return -1;
+}
+
+/* Build the hardware stream-output (transform feedback) state: per-buffer
+ * strides/attrib counts, the control word, and the map from packed output
+ * positions to shader output slots. Returns NULL on allocation failure;
+ * the caller owns the returned allocation.
+ */
+static struct nv50_stream_output_state *
+nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
+                                  const struct pipe_stream_output_info *pso)
+{
+   struct nv50_stream_output_state *so;
+   unsigned b, i, c;
+   unsigned base[4];
+
+   so = MALLOC_STRUCT(nv50_stream_output_state);
+   if (!so)
+      return NULL;
+   memset(so->map, 0xff, sizeof(so->map));
+
+   for (b = 0; b < 4; ++b)
+      so->num_attribs[b] = 0;
+   /* num_attribs[b] = highest component written into buffer b */
+   for (i = 0; i < pso->num_outputs; ++i) {
+      unsigned end = pso->output[i].dst_offset + pso->output[i].num_components;
+      b = pso->output[i].output_buffer;
+      assert(b < 4);
+      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
+   }
+
+   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
+
+   so->stride[0] = pso->stride[0] * 4;
+   base[0] = 0;
+   for (b = 1; b < 4; ++b) {
+      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
+      so->stride[b] = so->num_attribs[b] * 4;
+      /* any use of buffers 1..3 switches to separate mode (replaces the
+       * interleaved default)
+       */
+      if (so->num_attribs[b])
+         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
+      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
+   }
+   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
+      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
+      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
+   }
+
+   so->map_size = base[3] + so->num_attribs[3];
+
+   /* map[packed position] = shader output slot (from slot assignment) */
+   for (i = 0; i < pso->num_outputs; ++i) {
+      const unsigned s = pso->output[i].start_component;
+      const unsigned p = pso->output[i].dst_offset;
+      const unsigned r = pso->output[i].register_index;
+      b = pso->output[i].output_buffer;
+
+      for (c = 0; c < pso->output[i].num_components; ++c)
+         so->map[base[b] + p + c] = info->out[r].slot[s + c];
+   }
+
+   return so;
+}
+
+/* Translate the TGSI tokens of @prog into nv50 machine code via the
+ * nv50_ir code generator. On success fills in code, relocations, GPR/TLS
+ * requirements, FP control flags and stream-output state.
+ * Returns TRUE on success, FALSE on allocation or translation failure.
+ */
+boolean
+nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
+{
+   struct nv50_ir_prog_info *info;
+   int ret;
+   /* "unassigned" marker for output slots; differs per stage */
+   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
+
+   info = CALLOC_STRUCT(nv50_ir_prog_info);
+   if (!info)
+      return FALSE;
+
+   info->type = prog->type;
+   info->target = chipset;
+   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+   info->bin.source = (void *)prog->pipe.tokens;
+
+   /* user clip planes come from constant buffer slot 15 */
+   info->io.ucpCBSlot = 15;
+   info->io.ucpBase = 0;
+   info->io.genUserClip = prog->vp.clpd_nr;
+
+   info->assignSlots = nv50_program_assign_varying_slots;
+
+   /* initialize slot records to "not present" */
+   prog->vp.bfc[0] = 0xff;
+   prog->vp.bfc[1] = 0xff;
+   prog->vp.edgeflag = 0xff;
+   prog->vp.clpd[0] = map_undef;
+   prog->vp.clpd[1] = map_undef;
+   prog->vp.psiz = map_undef;
+   prog->gp.primid = 0x80;
+
+   info->driverPriv = prog;
+
+#ifdef DEBUG
+   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
+   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
+#else
+   info->optLevel = 3;
+#endif
+
+   ret = nv50_ir_generate_code(info);
+   if (ret) {
+      NOUVEAU_ERR("shader translation failed: %i\n", ret);
+      goto out;
+   }
+   /* symbol table is not needed by this driver */
+   FREE(info->bin.syms);
+
+   prog->code = info->bin.code;
+   prog->code_size = info->bin.codeSize;
+   prog->fixups = info->bin.relocData;
+   /* maxGPR is in 32-bit units; REG_ALLOC_TEMP counts register pairs */
+   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
+   prog->tls_space = info->bin.tlsSpace;
+
+   if (prog->type == PIPE_SHADER_FRAGMENT) {
+      if (info->prop.fp.writesDepth) {
+         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
+         prog->fp.flags[1] = 0x11;
+      }
+      if (info->prop.fp.usesDiscard)
+         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
+   }
+
+   if (prog->pipe.stream_output.num_outputs)
+      prog->so = nv50_program_create_strmout_state(info,
+                                                   &prog->pipe.stream_output);
+
+out:
+   FREE(info);
+   return !ret;
+}
+
+/* Upload the program's machine code into the per-stage region of the
+ * screen's code BO, allocating space from the matching stage heap and
+ * applying relocations first.
+ * Returns FALSE if the shader cannot fit into code space or TLS
+ * reallocation fails.
+ */
+boolean
+nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
+{
+   struct nouveau_heap *heap;
+   int ret;
+   uint32_t size = align(prog->code_size, 0x40);
+
+   /* The heap must match the stage: the SIFC upload below addresses the
+    * code BO at (prog->type << NV50_CODE_BO_SIZE_LOG2) + code_base, so
+    * allocating from another stage's heap would corrupt that region.
+    * (Bug fix: GEOMETRY/FRAGMENT previously selected each other's heap.)
+    */
+   switch (prog->type) {
+   case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break;
+   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
+   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
+   default:
+      assert(!"invalid program type");
+      return FALSE;
+   }
+
+   ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
+   if (ret) {
+      /* Out of space: evict everything to compactify the code segment, hoping
+       * the working set is much smaller and drifts slowly. Improve me !
+       */
+      while (heap->next) {
+         struct nv50_program *evict = heap->next->priv;
+         if (evict)
+            nouveau_heap_free(&evict->mem);
+      }
+      debug_printf("WARNING: out of code space, evicting all shaders.\n");
+      ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
+      if (ret) {
+         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
+         return FALSE;
+      }
+   }
+   prog->code_base = prog->mem->start;
+
+   /* grow per-thread local storage if this program needs more */
+   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
+   if (ret < 0)
+      return FALSE;
+   if (ret > 0)
+      nv50->state.new_tls_space = TRUE;
+
+   if (prog->fixups)
+      nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
+
+   /* copy the code into VRAM and flush the instruction cache */
+   nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
+                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+                       NOUVEAU_BO_VRAM, prog->code_size, prog->code);
+
+   BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
+   PUSH_DATA (nv50->base.pushbuf, 0);
+
+   return TRUE;
+}
+
+/* Free everything owned by the program (code heap allocation, generated
+ * code, relocations, stream-output state) but preserve the pipe shader
+ * state and stage type so the program can be re-translated.
+ */
+void
+nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
+{
+   const struct pipe_shader_state pipe = p->pipe;
+   const ubyte type = p->type;
+
+   if (p->mem)
+      nouveau_heap_free(&p->mem);
+
+   FREE(p->code);
+
+   FREE(p->fixups);
+
+   FREE(p->so);
+
+   /* wipe everything, then restore the fields that must survive */
+   memset(p, 0, sizeof(*p));
+
+   p->pipe = pipe;
+   p->type = type;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
new file mode 100644
index 00000000000..13b9516a3e4
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2010 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_PROG_H__
+#define __NV50_PROG_H__
+
+struct nv50_context;
+
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+
+/* Per-varying record linking a TGSI input/output to its hardware slot. */
+struct nv50_varying {
+   uint8_t id; /* tgsi index */
+   uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
+
+   unsigned mask : 4;   /* enabled components (x..w) */
+   unsigned linear : 1; /* linear (non-perspective) interpolation */
+   unsigned pad : 3;
+
+   ubyte sn; /* semantic name */
+   ubyte si; /* semantic index */
+};
+
+/* Precomputed hardware stream-output (transform feedback) configuration,
+ * built by nv50_program_create_strmout_state().
+ */
+struct nv50_stream_output_state
+{
+   uint32_t ctrl;          /* STRMOUT_BUFFERS_CTRL word */
+   uint16_t stride[4];     /* per-buffer stride in bytes */
+   uint8_t num_attribs[4]; /* components written per buffer */
+   uint8_t map_size;       /* number of valid entries in map[] */
+   uint8_t map[128];       /* packed position -> shader output slot */
+};
+
+/* Driver representation of a shader: the gallium shader state plus the
+ * translated machine code and the per-stage hardware setup derived from
+ * slot assignment.
+ */
+struct nv50_program {
+   struct pipe_shader_state pipe;
+
+   ubyte type;          /* PIPE_SHADER_* stage */
+   boolean translated;
+
+   uint32_t *code;      /* generated machine code (heap allocated) */
+   unsigned code_size;
+   unsigned code_base;  /* offset inside the stage's code heap */
+   uint32_t *immd;
+   unsigned immd_size;
+   unsigned parm_size; /* size limit of uniform buffer */
+   uint32_t tls_space; /* required local memory per thread */
+
+   ubyte max_gpr; /* REG_ALLOC_TEMP */
+   ubyte max_out; /* REG_ALLOC_RESULT or FP_RESULT_COUNT */
+
+   ubyte in_nr;
+   ubyte out_nr;
+   struct nv50_varying in[16];
+   struct nv50_varying out[16];
+
+   /* vertex/geometry program state */
+   struct {
+      uint32_t attrs[3]; /* VP_ATTR_EN_0,1 and VP_GP_BUILTIN_ATTR_EN */
+      ubyte psiz; /* output slot of point size */
+      ubyte bfc[2]; /* indices into varying for FFC (FP) or BFC (VP) */
+      ubyte edgeflag;
+      ubyte clpd[2]; /* output slot of clip distance[i]'s 1st component */
+      ubyte clpd_nr;
+   } vp;
+
+   /* fragment program state */
+   struct {
+      uint32_t flags[2]; /* 0x19a8, 196c */
+      uint32_t interp; /* 0x1988 */
+      uint32_t colors; /* 0x1904 */
+   } fp;
+
+   /* geometry program state */
+   struct {
+      ubyte primid; /* primitive id output register */
+      uint8_t vert_count;
+      uint8_t prim_type; /* point, line strip or tri strip */
+   } gp;
+
+   void *fixups; /* relocation records */
+
+   struct nouveau_heap *mem; /* allocation inside the code heap */
+
+   struct nv50_stream_output_state *so;
+};
+
+/* Entry points implemented in nv50_program.c. */
+boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
+boolean nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
+void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
+
+#endif /* __NV50_PROG_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
new file mode 100644
index 00000000000..3e9a4096cf0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -0,0 +1,309 @@
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_resource.h"
+
+#include "nv50/nv50_3d.xml.h"
+
+/* State shared by the emit_vertices_* helpers while pushing a draw's
+ * vertex data inline through the pushbuf.
+ */
+struct push_context {
+   struct nouveau_pushbuf *push;
+
+   const void *idxbuf;   /* CPU-mapped index buffer, or NULL for arrays */
+
+   float edgeflag;
+   int edgeflag_attr;
+
+   uint32_t vertex_words;        /* 32-bit words per translated vertex */
+   uint32_t packet_vertex_limit; /* max vertices per VERTEX_DATA packet */
+
+   struct translate *translate;  /* vertex format conversion */
+
+   boolean primitive_restart;
+   uint32_t prim;                /* VERTEX_BEGIN_GL value for this draw */
+   uint32_t restart_index;
+   uint32_t instance_id;
+};
+
+/* Return the position of the first occurrence of the restart @index in
+ * @elts, or @push if it is absent from the first @push elements.
+ */
+static INLINE unsigned
+prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
+{
+   unsigned pos = 0;
+   while (pos < push && elts[pos] != index)
+      ++pos;
+   return pos;
+}
+
+/* 16-bit index variant of prim_restart_search_i08(). */
+static INLINE unsigned
+prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
+{
+   unsigned pos = 0;
+   while (pos < push && elts[pos] != index)
+      ++pos;
+   return pos;
+}
+
+/* 32-bit index variant of prim_restart_search_i08(). */
+static INLINE unsigned
+prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
+{
+   unsigned pos = 0;
+   while (pos < push && elts[pos] != index)
+      ++pos;
+   return pos;
+}
+
+/* Push @count vertices selected by 8-bit indices starting at @start,
+ * splitting the inline VERTEX_DATA stream at primitive restart indices
+ * (which are forwarded to the hw via VB_ELEMENT_U32).
+ */
+static void
+emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
+{
+   uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
+
+   while (count) {
+      unsigned push = MIN2(count, ctx->packet_vertex_limit);
+      unsigned size, nr;
+
+      /* stop the packet early at a restart index, if any */
+      nr = push;
+      if (ctx->primitive_restart)
+         nr = prim_restart_search_i08(elts, push, ctx->restart_index);
+
+      size = ctx->vertex_words * nr;
+
+      BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
+
+      /* translate converts the indexed vertices directly into the pushbuf */
+      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
+                                ctx->push->cur);
+
+      ctx->push->cur += size;
+      count -= nr;
+      elts += nr;
+
+      if (nr != push) {
+         /* hit a restart index: skip it and emit it for the hardware */
+         count--;
+         elts++;
+         BEGIN_NV04(ctx->push, NV50_3D(VB_ELEMENT_U32), 1);
+         PUSH_DATA (ctx->push, ctx->restart_index);
+      }
+   }
+}
+
+/* 16-bit index variant of emit_vertices_i08(). */
+static void
+emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
+{
+   uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
+
+   while (count) {
+      unsigned push = MIN2(count, ctx->packet_vertex_limit);
+      unsigned size, nr;
+
+      /* stop the packet early at a restart index, if any */
+      nr = push;
+      if (ctx->primitive_restart)
+         nr = prim_restart_search_i16(elts, push, ctx->restart_index);
+
+      size = ctx->vertex_words * nr;
+
+      BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
+
+      ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
+                                 ctx->push->cur);
+
+      ctx->push->cur += size;
+      count -= nr;
+      elts += nr;
+
+      if (nr != push) {
+         /* hit a restart index: skip it and emit it for the hardware */
+         count--;
+         elts++;
+         BEGIN_NV04(ctx->push, NV50_3D(VB_ELEMENT_U32), 1);
+         PUSH_DATA (ctx->push, ctx->restart_index);
+      }
+   }
+}
+
+/* 32-bit index variant of emit_vertices_i08(). */
+static void
+emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
+{
+   uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
+
+   while (count) {
+      unsigned push = MIN2(count, ctx->packet_vertex_limit);
+      unsigned size, nr;
+
+      /* stop the packet early at a restart index, if any */
+      nr = push;
+      if (ctx->primitive_restart)
+         nr = prim_restart_search_i32(elts, push, ctx->restart_index);
+
+      size = ctx->vertex_words * nr;
+
+      BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
+
+      ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
+                               ctx->push->cur);
+
+      ctx->push->cur += size;
+      count -= nr;
+      elts += nr;
+
+      if (nr != push) {
+         /* hit a restart index: skip it and emit it for the hardware */
+         count--;
+         elts++;
+         BEGIN_NV04(ctx->push, NV50_3D(VB_ELEMENT_U32), 1);
+         PUSH_DATA (ctx->push, ctx->restart_index);
+      }
+   }
+}
+
+/* Push @count sequential (non-indexed) vertices starting at @start.
+ * No primitive restart handling is needed for array draws.
+ */
+static void
+emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
+{
+   while (count) {
+      unsigned push = MIN2(count, ctx->packet_vertex_limit);
+      unsigned size = ctx->vertex_words * push;
+
+      BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
+
+      ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
+                          ctx->push->cur);
+      ctx->push->cur += size;
+      count -= push;
+      start += push;
+   }
+}
+
+
+#define NV50_PRIM_GL_CASE(n) \
+   case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
+
+/* Translate a PIPE_PRIM_* topology into the VERTEX_BEGIN_GL hw value.
+ * Unknown topologies fall back to POINTS.
+ */
+static INLINE unsigned
+nv50_prim_gl(unsigned prim)
+{
+   switch (prim) {
+   NV50_PRIM_GL_CASE(POINTS);
+   NV50_PRIM_GL_CASE(LINES);
+   NV50_PRIM_GL_CASE(LINE_LOOP);
+   NV50_PRIM_GL_CASE(LINE_STRIP);
+   NV50_PRIM_GL_CASE(TRIANGLES);
+   NV50_PRIM_GL_CASE(TRIANGLE_STRIP);
+   NV50_PRIM_GL_CASE(TRIANGLE_FAN);
+   NV50_PRIM_GL_CASE(QUADS);
+   NV50_PRIM_GL_CASE(QUAD_STRIP);
+   NV50_PRIM_GL_CASE(POLYGON);
+   NV50_PRIM_GL_CASE(LINES_ADJACENCY);
+   NV50_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
+   NV50_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
+   NV50_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
+   /*
+   NV50_PRIM_GL_CASE(PATCHES); */
+   default:
+      return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
+      break;
+   }
+}
+
+/* Draw by pushing translated vertex data inline through the pushbuf
+ * instead of using hardware vertex fetch. Maps the vertex (and index)
+ * buffers, configures the translate object, then emits one
+ * VERTEX_BEGIN/END pair per instance.
+ */
+void
+nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
+{
+   struct push_context ctx;
+   unsigned i, index_size;
+   unsigned inst_count = info->instance_count;
+   unsigned vert_count = info->count;
+   boolean apply_bias = info->indexed && info->index_bias;
+
+   ctx.push = nv50->base.pushbuf;
+   ctx.translate = nv50->vertex->translate;
+   ctx.packet_vertex_limit = nv50->vertex->packet_vertex_limit;
+   ctx.vertex_words = nv50->vertex->vertex_size;
+
+   /* point the translate object at the (CPU-visible) vertex buffers */
+   for (i = 0; i < nv50->num_vtxbufs; ++i) {
+      const struct pipe_vertex_buffer *vb = &nv50->vtxbuf[i];
+      const uint8_t *data;
+
+      if (unlikely(vb->buffer))
+         data = nouveau_resource_map_offset(&nv50->base,
+            nv04_resource(vb->buffer), vb->buffer_offset, NOUVEAU_BO_RD);
+      else
+         data = vb->user_buffer;
+
+      /* apply index bias up front, except for per-instance buffers */
+      if (apply_bias && likely(!(nv50->vertex->instance_bufs & (1 << i))))
+         data += (ptrdiff_t)info->index_bias * vb->stride;
+
+      ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
+   }
+
+   if (info->indexed) {
+      if (nv50->idxbuf.buffer) {
+         ctx.idxbuf = nouveau_resource_map_offset(&nv50->base,
+            nv04_resource(nv50->idxbuf.buffer), nv50->idxbuf.offset,
+            NOUVEAU_BO_RD);
+      } else {
+         ctx.idxbuf = nv50->idxbuf.user_buffer;
+      }
+      if (!ctx.idxbuf)
+         return;
+      index_size = nv50->idxbuf.index_size;
+      ctx.primitive_restart = info->primitive_restart;
+      ctx.restart_index = info->restart_index;
+   } else {
+      /* draw_auto: read the vertex count back from the SO counter query */
+      if (unlikely(info->count_from_stream_output)) {
+         struct pipe_context *pipe = &nv50->base.pipe;
+         struct nv50_so_target *targ;
+         targ = nv50_so_target(info->count_from_stream_output);
+         if (!targ->pq) {
+            NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+            return;
+         }
+         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         vert_count /= targ->stride;
+      }
+      ctx.idxbuf = NULL;
+      index_size = 0;
+      ctx.primitive_restart = FALSE;
+      ctx.restart_index = 0;
+   }
+
+   ctx.instance_id = info->start_instance;
+   ctx.prim = nv50_prim_gl(info->mode);
+
+   /* only touch the restart state when it changed */
+   if (info->primitive_restart) {
+      BEGIN_NV04(ctx.push, NV50_3D(PRIM_RESTART_ENABLE), 2);
+      PUSH_DATA (ctx.push, 1);
+      PUSH_DATA (ctx.push, info->restart_index);
+   } else
+   if (nv50->state.prim_restart) {
+      BEGIN_NV04(ctx.push, NV50_3D(PRIM_RESTART_ENABLE), 1);
+      PUSH_DATA (ctx.push, 0);
+   }
+   nv50->state.prim_restart = info->primitive_restart;
+
+   while (inst_count--) {
+      BEGIN_NV04(ctx.push, NV50_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (ctx.push, ctx.prim);
+      switch (index_size) {
+      case 0:
+         emit_vertices_seq(&ctx, info->start, vert_count);
+         break;
+      case 1:
+         emit_vertices_i08(&ctx, info->start, vert_count);
+         break;
+      case 2:
+         emit_vertices_i16(&ctx, info->start, vert_count);
+         break;
+      case 4:
+         emit_vertices_i32(&ctx, info->start, vert_count);
+         break;
+      default:
+         assert(0);
+         break;
+      }
+      BEGIN_NV04(ctx.push, NV50_3D(VERTEX_END_GL), 1);
+      PUSH_DATA (ctx.push, 0);
+
+      /* subsequent instances keep the same topology with INSTANCE_NEXT set */
+      ctx.instance_id++;
+      ctx.prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+   }
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
new file mode 100644
index 00000000000..6f25a0822c4
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright 2011 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Christoph Bumiller
+ */
+
+#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nv50/nv50_context.h"
+#include "nv_object.xml.h"
+
+/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
+ * (since we use only a single GPU channel per screen) will not work properly.
+ *
+ * The first is not that big of an issue because OpenGL does not allow nested
+ * queries anyway.
+ */
+
+/* CPU-side state of a single hardware query. Result slots live in a small
+ * GART buffer; "data" is the CPU mapping of the slot currently in use and
+ * "offset" its byte offset within the buffer object.
+ */
+struct nv50_query {
+   uint32_t *data;                   /* mapped pointer to the result slot */
+   uint16_t type;                    /* PIPE_QUERY_* / NVA0_QUERY_* */
+   uint16_t index;                   /* SO buffer index (NVA0 offset query) */
+   uint32_t sequence;                /* sequence the GPU writes on completion */
+   struct nouveau_bo *bo;            /* backing buffer object (GART) */
+   uint32_t base;                    /* start of our allocation inside bo */
+   uint32_t offset; /* base + i * 16 */
+   boolean ready;                    /* result known to be available */
+   boolean flushed;                  /* pushbuf kicked while polling (!wait) */
+   boolean is64bit;                  /* 64-bit counter pairs, no sequence word */
+   struct nouveau_mm_allocation *mm;
+};
+
+/* Bytes of result storage per query; occlusion queries cycle through this
+ * space in 16-byte steps (see nv50_query_begin).
+ */
+#define NV50_QUERY_ALLOC_SPACE 128
+
+/* Cast helper: pipe_query handles created by this driver are nv50_query. */
+static INLINE struct nv50_query *
+nv50_query(struct pipe_query *pipe)
+{
+   return (struct nv50_query *)pipe;
+}
+
+/* (Re)allocate the query's result buffer.
+ * size == 0 just releases the current storage. If the old buffer might still
+ * be written by the GPU (!ready), its memory is freed via fence work only
+ * once the current fence signals, instead of immediately.
+ * Returns FALSE on allocation or mapping failure (storage is then released).
+ */
+static boolean
+nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
+{
+   struct nv50_screen *screen = nv50->screen;
+   int ret;
+
+   if (q->bo) {
+      nouveau_bo_ref(NULL, &q->bo);
+      if (q->mm) {
+         if (q->ready)
+            nouveau_mm_free(q->mm);
+         else
+            nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work,
+                               q->mm);
+      }
+   }
+   if (size) {
+      q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
+      if (!q->bo)
+         return FALSE;
+      q->offset = q->base;
+
+      /* keep the buffer CPU-mapped for result readback */
+      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
+      if (ret) {
+         nv50_query_allocate(nv50, q, 0);
+         return FALSE;
+      }
+      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
+   }
+   return TRUE;
+}
+
+/* Release the query's GPU result storage, then the query object itself. */
+static void
+nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv50_query *q = nv50_query(pq);
+
+   nv50_query_allocate(nv50_context(pipe), q, 0);
+   FREE(q);
+}
+
+/* pipe_context::create_query - allocate a query object of the given type
+ * together with its GPU result storage.
+ */
+static struct pipe_query *
+nv50_query_create(struct pipe_context *pipe, unsigned type)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nv50_query *q;
+
+   q = CALLOC_STRUCT(nv50_query);
+   if (!q)
+      return NULL;
+
+   if (!nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE)) {
+      FREE(q);
+      return NULL;
+   }
+
+   /* These types report 64-bit (count, time) pairs and have no 32-bit
+    * sequence slot, so readiness cannot be polled via the sequence word.
+    */
+   q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED ||
+                 type == PIPE_QUERY_PRIMITIVES_EMITTED ||
+                 type == PIPE_QUERY_SO_STATISTICS);
+   q->type = type;
+
+   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
+      q->offset -= 16;
+      q->data -= 16 / sizeof(*q->data); /* we advance before query_begin ! */
+   }
+
+   return (struct pipe_query *)q;
+}
+
+/* Emit a QUERY_GET: ask the GPU to write the result selected by "get" into
+ * the query's buffer at byte q->offset + offset, tagged with q->sequence.
+ */
+static void
+nv50_query_get(struct nouveau_pushbuf *push, struct nv50_query *q,
+               unsigned offset, uint32_t get)
+{
+   offset += q->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+   BEGIN_NV04(push, NV50_3D(QUERY_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, q->bo->offset + offset);
+   PUSH_DATA (push, q->bo->offset + offset);
+   PUSH_DATA (push, q->sequence);
+   PUSH_DATA (push, get);
+}
+
+/* pipe_context::begin_query - prepare result storage and record the "begin"
+ * counter snapshots for this query type.
+ */
+static void
+nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_query *q = nv50_query(pq);
+
+   /* For occlusion queries we have to change the storage, because a previous
+    * query might set the initial render condition to FALSE even *after* we re-
+    * initialized it to TRUE.
+    */
+   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
+      q->offset += 16;
+      q->data += 16 / sizeof(*q->data);
+      /* allocate fresh storage once the current buffer is used up */
+      if (q->offset - q->base == NV50_QUERY_ALLOC_SPACE)
+         nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE);
+
+      /* XXX: can we do this with the GPU, and sync with respect to a previous
+       * query ?
+       */
+      q->data[1] = 1; /* initial render condition = TRUE */
+   }
+   if (!q->is64bit)
+      q->data[0] = q->sequence++; /* the previously used one */
+
+   /* The magic 0x....5002 values are QUERY_GET encodings selecting which
+    * counter the GPU writes; "begin" snapshots land at offsets 0x10 and up
+    * so get_query_result can compute end - begin.
+    */
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      PUSH_SPACE(push, 4);
+      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
+      PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 1);
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_query_get(push, q, 0x10, 0x06805002);
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nv50_query_get(push, q, 0x10, 0x05805002);
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nv50_query_get(push, q, 0x20, 0x05805002);
+      nv50_query_get(push, q, 0x30, 0x06805002);
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      nv50_query_get(push, q, 0x10, 0x00005002);
+      break;
+   default:
+      break;
+   }
+   q->ready = FALSE;
+}
+
+/* pipe_context::end_query - record the "end" counter values at offset 0
+ * (plus 0x10 for SO statistics) and bump the sequence for fence-style
+ * queries; the GPU's write of q->sequence later marks the result ready.
+ */
+static void
+nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_query *q = nv50_query(pq);
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      nv50_query_get(push, q, 0, 0x0100f002);
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_query_get(push, q, 0, 0x06805002);
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nv50_query_get(push, q, 0, 0x05805002);
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nv50_query_get(push, q, 0x00, 0x05805002);
+      nv50_query_get(push, q, 0x10, 0x06805002);
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      q->sequence++;
+      /* fall through */
+   case PIPE_QUERY_TIME_ELAPSED:
+      nv50_query_get(push, q, 0, 0x00005002);
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+      q->sequence++;
+      nv50_query_get(push, q, 0, 0x1000f010);
+      break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      /* q->index selects which SO buffer's offset is captured */
+      nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* nothing to record: the result is a constant (see get_query_result) */
+      break;
+   default:
+      assert(0);
+      break;
+   }
+   q->ready = q->flushed = FALSE;
+}
+
+/* TRUE if the result can be read without waiting: it was either fetched
+ * already, or (32-bit queries) the GPU has stored the expected sequence
+ * number in the result slot.
+ */
+static INLINE boolean
+nv50_query_ready(struct nv50_query *q)
+{
+   if (q->ready)
+      return TRUE;
+   return !q->is64bit && q->data[0] == q->sequence;
+}
+
+/* pipe_context::get_query_result - translate the raw GPU result words into
+ * the gallium result union. With wait == FALSE this may kick the pushbuf
+ * once and return FALSE, so apps spinning on availability make progress.
+ */
+static boolean
+nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+                  boolean wait, union pipe_query_result *result)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nv50_query *q = nv50_query(pq);
+   uint64_t *res64 = (uint64_t *)result;
+   uint32_t *res32 = (uint32_t *)result;
+   boolean *res8 = (boolean *)result;
+   uint64_t *data64 = (uint64_t *)q->data;
+
+   if (!q->ready) /* update ? */
+      q->ready = nv50_query_ready(q);
+   if (!q->ready) {
+      if (!wait) {
+         /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
+         if (!q->flushed) {
+            q->flushed = TRUE;
+            PUSH_KICK(nv50->base.pushbuf);
+         }
+         return FALSE;
+      }
+      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
+         return FALSE;
+   }
+   q->ready = TRUE;
+
+   /* 64-bit queries store (count, time) pairs: even data64 indices are
+    * counters, odd ones timestamps; "end" values precede "begin" values.
+    */
+   switch (q->type) {
+   case PIPE_QUERY_GPU_FINISHED:
+      res8[0] = TRUE;
+      break;
+   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
+      res64[0] = q->data[1];
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
+   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
+      res64[0] = data64[0] - data64[2];
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      res64[0] = data64[1];
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* u64 frequency, then the disjoint boolean at byte offset 8 */
+      res64[0] = 1000000000;
+      res8[8] = FALSE;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      res64[0] = data64[1] - data64[3];
+      break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      res32[0] = q->data[1];
+      break;
+   default:
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+/* Insert a FIFO semaphore-acquire that stalls command processing until the
+ * query's sequence number has been written to its result slot (NV84+).
+ */
+void
+nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
+{
+   struct nv50_query *q = nv50_query(pq);
+   unsigned offset = q->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, q->bo->offset + offset);
+   PUSH_DATA (push, q->bo->offset + offset);
+   PUSH_DATA (push, q->sequence);
+   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
+
+/* pipe_context::render_condition - make subsequent rendering conditional on
+ * a query result; pq == NULL restores unconditional rendering.
+ * NOTE(review): "condition" is recorded in the context but COND_MODE is
+ * always programmed as RES_NON_ZERO here — inverted conditions appear to be
+ * unhandled; confirm against callers.
+ */
+static void
+nv50_render_condition(struct pipe_context *pipe,
+                      struct pipe_query *pq,
+                      boolean condition, uint mode)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_query *q;
+
+   /* remember the state so it can be re-applied later */
+   nv50->cond_query = pq;
+   nv50->cond_cond = condition;
+   nv50->cond_mode = mode;
+
+   PUSH_SPACE(push, 6);
+
+   if (!pq) {
+      BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+      PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+      return;
+   }
+   q = nv50_query(pq);
+
+   /* serialize the 3D engine for the WAIT variants so the query result is
+    * complete before it gets evaluated
+    */
+   if (mode == PIPE_RENDER_COND_WAIT ||
+       mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   BEGIN_NV04(push, NV50_3D(COND_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, q->bo->offset + q->offset);
+   PUSH_DATA (push, q->bo->offset + q->offset);
+   PUSH_DATA (push, NV50_3D_COND_MODE_RES_NON_ZERO);
+}
+
+/* Push 4 bytes of the query's result buffer directly into the command
+ * stream as an indirect pushbuf (IB) entry.
+ */
+void
+nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
+                          struct pipe_query *pq, unsigned result_offset)
+{
+   struct nv50_query *q = nv50_query(pq);
+
+   /* XXX: does this exist ? */
+#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
+
+   nouveau_pushbuf_space(push, 0, 0, 1);
+   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
+                        NV50_IB_ENTRY_1_NO_PREFETCH);
+}
+
+/* Capture the current stream-output buffer offset into the target's query
+ * (NVA0+), optionally serializing the 3D engine first so all prior SO
+ * writes have landed before the offset is read.
+ */
+void
+nva0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, boolean serialize)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+
+   if (serialize) {
+      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   /* the query's index selects which SO buffer the GPU reports */
+   nv50_query(targ->pq)->index = index;
+   nv50_query_end(pipe, targ->pq);
+}
+
+/* Hook the query entry points into the gallium context vtable. */
+void
+nv50_init_query_functions(struct nv50_context *nv50)
+{
+   struct pipe_context *const pipe = &nv50->base.pipe;
+
+   pipe->create_query     = nv50_query_create;
+   pipe->destroy_query    = nv50_query_destroy;
+   pipe->begin_query      = nv50_query_begin;
+   pipe->end_query        = nv50_query_end;
+   pipe->get_query_result = nv50_query_result;
+   pipe->render_condition = nv50_render_condition;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
new file mode 100644
index 00000000000..7fbb0a92bf6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
@@ -0,0 +1,104 @@
+
+#include "pipe/p_context.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nouveau_screen.h"
+
+#include "nv50/nv50_resource.h"
+
+/* pipe_screen::resource_create - buffers use the shared nouveau buffer
+ * path; every other target kind is a miptree.
+ */
+static struct pipe_resource *
+nv50_resource_create(struct pipe_screen *screen,
+                     const struct pipe_resource *templ)
+{
+   if (templ->target == PIPE_BUFFER)
+      return nouveau_buffer_create(screen, templ);
+   return nv50_miptree_create(screen, templ);
+}
+
+/* pipe_screen::resource_from_handle - import a shared resource; only
+ * miptrees can be imported, buffers cannot.
+ */
+static struct pipe_resource *
+nv50_resource_from_handle(struct pipe_screen * screen,
+                          const struct pipe_resource *templ,
+                          struct winsys_handle *whandle)
+{
+   if (templ->target == PIPE_BUFFER)
+      return NULL;
+   return nv50_miptree_from_handle(screen, templ, whandle);
+}
+
+/* Wrap a buffer resource in a pipe_surface so it can be bound as a render
+ * target (used for rendering to buffer ranges).
+ */
+struct pipe_surface *
+nv50_surface_from_buffer(struct pipe_context *pipe,
+                         struct pipe_resource *pbuf,
+                         const struct pipe_surface *templ)
+{
+   struct nv50_surface *sf = CALLOC_STRUCT(nv50_surface);
+   if (!sf)
+      return NULL;
+
+   pipe_reference_init(&sf->base.reference, 1);
+   pipe_resource_reference(&sf->base.texture, pbuf);
+
+   sf->base.format = templ->format;
+   sf->base.writable = templ->writable;
+   sf->base.u.buf.first_element = templ->u.buf.first_element;
+   sf->base.u.buf.last_element = templ->u.buf.last_element;
+
+   sf->offset =
+      templ->u.buf.first_element * util_format_get_blocksize(sf->base.format);
+
+   sf->offset &= ~0x7f; /* FIXME: RT_ADDRESS requires 128 byte alignment */
+
+   /* element count of the view; buffers are treated as 1-high, 1-deep */
+   sf->width = templ->u.buf.last_element - templ->u.buf.first_element + 1;
+   sf->height = 1;
+   sf->depth = 1;
+
+   sf->base.width = sf->width;
+   sf->base.height = sf->height;
+
+   sf->base.context = pipe;
+   return &sf->base;
+}
+
+/* pipe_context::create_surface - buffers take the render-to-buffer path,
+ * everything else becomes a miptree surface view.
+ */
+static struct pipe_surface *
+nv50_surface_create(struct pipe_context *pipe,
+                    struct pipe_resource *pres,
+                    const struct pipe_surface *templ)
+{
+   if (pres->target == PIPE_BUFFER)
+      return nv50_surface_from_buffer(pipe, pres, templ);
+   return nv50_miptree_surface_new(pipe, pres, templ);
+}
+
+/* pipe_context::surface_destroy - drop the surface's reference on its
+ * resource and free the wrapper object.
+ */
+void
+nv50_surface_destroy(struct pipe_context *pipe, struct pipe_surface *ps)
+{
+   struct nv50_surface *sf = nv50_surface(ps);
+
+   pipe_resource_reference(&sf->base.texture, NULL);
+   FREE(sf);
+}
+
+/* Install per-context resource hooks: generic vtbl-based transfers plus
+ * the nv50 surface create/destroy pair.
+ */
+void
+nv50_init_resource_functions(struct pipe_context *pcontext)
+{
+   pcontext->transfer_map          = u_transfer_map_vtbl;
+   pcontext->transfer_flush_region = u_transfer_flush_region_vtbl;
+   pcontext->transfer_unmap        = u_transfer_unmap_vtbl;
+   pcontext->transfer_inline_write = u_transfer_inline_write_vtbl;
+   pcontext->create_surface        = nv50_surface_create;
+   pcontext->surface_destroy       = nv50_surface_destroy;
+}
+
+/* Install per-screen resource hooks (creation/import plus the generic
+ * vtbl-based handle export and destruction).
+ */
+void
+nv50_screen_init_resource_functions(struct pipe_screen *pscreen)
+{
+   pscreen->resource_create      = nv50_resource_create;
+   pscreen->resource_from_handle = nv50_resource_from_handle;
+   pscreen->resource_get_handle  = u_resource_get_handle_vtbl;
+   pscreen->resource_destroy     = u_resource_destroy_vtbl;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
new file mode 100644
index 00000000000..c06daa31c5d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -0,0 +1,153 @@
+
+#ifndef __NV50_RESOURCE_H__
+#define __NV50_RESOURCE_H__
+
+#include "util/u_transfer.h"
+#include "util/u_double_list.h"
+
+#include "nouveau_winsys.h"
+#include "nouveau_buffer.h"
+
+#ifndef __NVC0_RESOURCE_H__ /* make sure we don't use these in nvc0: */
+
+void
+nv50_init_resource_functions(struct pipe_context *pcontext);
+
+void
+nv50_screen_init_resource_functions(struct pipe_screen *pscreen);
+
+#define NV50_RESOURCE_FLAG_VIDEO (NOUVEAU_RESOURCE_FLAG_DRV_PRIV << 0)
+#define NV50_RESOURCE_FLAG_NOALLOC (NOUVEAU_RESOURCE_FLAG_DRV_PRIV << 1)
+
+/* Decode an nv50 tile_mode value (m): the X tile extent is fixed at 64
+ * bytes; bits 4..7 hold log2 of the Y extent (in units of 4 lines) and
+ * bits 8..11 hold log2 of the Z extent.
+ */
+#define NV50_TILE_SHIFT_X(m) 6
+#define NV50_TILE_SHIFT_Y(m) ((((m) >> 4) & 0xf) + 2)
+#define NV50_TILE_SHIFT_Z(m) ((((m) >> 8) & 0xf) + 0)
+
+#define NV50_TILE_SIZE_X(m) 64
+#define NV50_TILE_SIZE_Y(m) ( 4 << (((m) >> 4) & 0xf))
+#define NV50_TILE_SIZE_Z(m) ( 1 << (((m) >> 8) & 0xf))
+
+/* bytes covered by one 2D tile, resp. one full 3D tile */
+#define NV50_TILE_SIZE_2D(m) (NV50_TILE_SIZE_X(m) << NV50_TILE_SHIFT_Y(m))
+
+#define NV50_TILE_SIZE(m) (NV50_TILE_SIZE_2D(m) << NV50_TILE_SHIFT_Z(m))
+
+#endif /* __NVC0_RESOURCE_H__ */
+
+uint32_t
+nv50_tex_choose_tile_dims_helper(unsigned nx, unsigned ny, unsigned nz);
+
+/* Per-mip-level layout information. */
+struct nv50_miptree_level {
+   uint32_t offset;    /* byte offset of the level inside the miptree */
+   uint32_t pitch;     /* row pitch in bytes (linear layouts) */
+   uint32_t tile_mode; /* hw tiling mode of this level */
+};
+
+#define NV50_MAX_TEXTURE_LEVELS 16
+
+/* A texture / render-target resource with per-level layout data. */
+struct nv50_miptree {
+   struct nv04_resource base;
+   struct nv50_miptree_level level[NV50_MAX_TEXTURE_LEVELS];
+   uint32_t total_size;   /* allocation size in bytes — per layer or whole
+                           * resource; TODO confirm against miptree layout */
+   uint32_t layer_stride; /* byte distance between array layers */
+   boolean layout_3d; /* TRUE if layer count varies with mip level */
+   uint8_t ms_x; /* log2 of number of samples in x/y dimension */
+   uint8_t ms_y;
+   uint8_t ms_mode;       /* hw multisample mode value */
+};
+
+/* Cast helper: pipe_resource -> nv50_miptree. */
+static INLINE struct nv50_miptree *
+nv50_miptree(struct pipe_resource *pt)
+{
+   return (struct nv50_miptree *)pt;
+}
+
+
+#define NV50_TEXVIEW_SCALED_COORDS (1 << 0)
+#define NV50_TEXVIEW_FILTER_MSAA8 (1 << 1)
+#define NV50_TEXVIEW_ACCESS_RESOLVE (1 << 2)
+
+
+/* Internal functions:
+ */
+boolean
+nv50_miptree_init_layout_linear(struct nv50_miptree *mt, unsigned pitch_align);
+
+struct pipe_resource *
+nv50_miptree_create(struct pipe_screen *pscreen,
+ const struct pipe_resource *tmp);
+
+void
+nv50_miptree_destroy(struct pipe_screen *pscreen, struct pipe_resource *pt);
+
+struct pipe_resource *
+nv50_miptree_from_handle(struct pipe_screen *pscreen,
+ const struct pipe_resource *template,
+ struct winsys_handle *whandle);
+
+boolean
+nv50_miptree_get_handle(struct pipe_screen *pscreen,
+ struct pipe_resource *pt,
+ struct winsys_handle *whandle);
+
+/* A surface view plus its driver-resolved location within the resource. */
+struct nv50_surface {
+   struct pipe_surface base;
+   uint32_t offset;  /* byte offset of the view inside the resource */
+   uint32_t width;   /* view dimensions */
+   uint16_t height;
+   uint16_t depth;
+};
+
+/* Cast helper: pipe_surface -> nv50_surface. */
+static INLINE struct nv50_surface *
+nv50_surface(struct pipe_surface *ps)
+{
+   return (struct nv50_surface *)ps;
+}
+
+/* Map a combined depth/stencil format to its stencil-only view format;
+ * formats without a stencil component are returned unchanged.
+ */
+static INLINE enum pipe_format
+nv50_zs_to_s_format(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+      return PIPE_FORMAT_X24S8_UINT;
+   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+      return PIPE_FORMAT_S8X24_UINT;
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      return PIPE_FORMAT_X32_S8X24_UINT;
+   default:
+      return format;
+   }
+}
+
+#ifndef __NVC0_RESOURCE_H__
+
+unsigned
+nv50_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z);
+
+struct pipe_surface *
+nv50_miptree_surface_new(struct pipe_context *,
+ struct pipe_resource *,
+ const struct pipe_surface *templ);
+
+void *
+nv50_miptree_transfer_map(struct pipe_context *pctx,
+ struct pipe_resource *res,
+ unsigned level,
+ unsigned usage,
+ const struct pipe_box *box,
+ struct pipe_transfer **ptransfer);
+void
+nv50_miptree_transfer_unmap(struct pipe_context *pcontext,
+ struct pipe_transfer *ptx);
+
+#endif /* __NVC0_RESOURCE_H__ */
+
+struct nv50_surface *
+nv50_surface_from_miptree(struct nv50_miptree *mt,
+ const struct pipe_surface *templ);
+
+struct pipe_surface *
+nv50_surface_from_buffer(struct pipe_context *pipe,
+ struct pipe_resource *pt,
+ const struct pipe_surface *templ);
+
+void
+nv50_surface_destroy(struct pipe_context *, struct pipe_surface *);
+
+#endif /* __NV50_RESOURCE_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
new file mode 100644
index 00000000000..f454ec77656
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "pipe/p_screen.h"
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_screen.h"
+
+#include "nouveau_vp3_video.h"
+
+#include "nv_object.xml.h"
+#include <errno.h>
+
+#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
+# define NOUVEAU_GETPARAM_GRAPH_UNITS 13
+#endif
+
+/* affected by LOCAL_WARPS_LOG_ALLOC / LOCAL_WARPS_NO_CLAMP */
+#define LOCAL_WARPS_ALLOC 32
+/* affected by STACK_WARPS_LOG_ALLOC / STACK_WARPS_NO_CLAMP */
+#define STACK_WARPS_ALLOC 32
+
+#define THREADS_IN_WARP 32
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
+/* pipe_screen::is_format_supported - check whether a format can be used
+ * with the requested bindings and sample count on this chipset.
+ */
+static boolean
+nv50_screen_is_format_supported(struct pipe_screen *pscreen,
+                                enum pipe_format format,
+                                enum pipe_texture_target target,
+                                unsigned sample_count,
+                                unsigned bindings)
+{
+   if (sample_count > 8)
+      return FALSE;
+   if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
+      return FALSE;
+   /* 8x MSAA is rejected for formats of 128 bits per pixel and wider */
+   if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128)
+      return FALSE;
+
+   if (!util_format_is_supported(format, bindings))
+      return FALSE;
+
+   switch (format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      /* Z16 requires NVA0 or newer */
+      if (nv50_screen(pscreen)->tesla->oclass < NVA0_3D_CLASS)
+         return FALSE;
+      break;
+   default:
+      break;
+   }
+
+   /* transfers & shared are always supported */
+   bindings &= ~(PIPE_BIND_TRANSFER_READ |
+                 PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_SHARED);
+
+   return (nv50_format_table[format].usage & bindings) == bindings;
+}
+
+/* pipe_screen::get_param - integer/boolean capability table for nv50
+ * (tesla) class hardware; some caps depend on the exact 3D object class.
+ */
+static int
+nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
+   switch (param) {
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 64;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return 14;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 12;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 14;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return 512;
+   case PIPE_CAP_MIN_TEXEL_OFFSET:
+      return -8;
+   case PIPE_CAP_MAX_TEXEL_OFFSET:
+      return 7;
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_SCALED_RESOLVE:
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+      return 65536;
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+      /* only NVA0 and up expose the seamless cube map switch */
+      return nv50_screen(pscreen)->tesla->oclass >= NVA0_3D_CLASS;
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+      return 0;
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+      return 0;
+   /*
+      return nv50_screen(pscreen)->tesla->oclass >= NVA3_3D_CLASS;
+   */
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_SM3:
+      return 1;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL:
+      return 140;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 8;
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+   case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+      return 1;
+   case PIPE_CAP_QUERY_TIMESTAMP:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return 1;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+      return 4;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+      return 64;
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+      /* needs the NVA0 SO buffer offset query (see nv50_query.c) */
+      return (class_3d >= NVA0_3D_CLASS) ? 1 : 0;
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+      return 1;
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+      return nv50_screen(pscreen)->tesla->oclass >= NVA3_3D_CLASS;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 0;
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+      return 0;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+   case PIPE_CAP_CONDITIONAL_RENDER:
+   case PIPE_CAP_TEXTURE_BARRIER:
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+   case PIPE_CAP_START_INSTANCE:
+      return 1;
+   case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+      return 0; /* state trackers will know better */
+   case PIPE_CAP_USER_CONSTANT_BUFFERS:
+   case PIPE_CAP_USER_INDEX_BUFFERS:
+   case PIPE_CAP_USER_VERTEX_BUFFERS:
+      return 1;
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+      return 256;
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+      return 1; /* 256 for binding as RT, but that's not possible in GL */
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_TGSI_TEXCOORD:
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+      return 0;
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return 1;
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+      return 0;
+   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+      return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50;
+   case PIPE_CAP_ENDIANNESS:
+      return PIPE_ENDIAN_LITTLE;
+   default:
+      NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
+      return 0;
+   }
+}
+
+/* pipe_screen::get_shader_param - per-stage shader capability limits;
+ * only VS/GS/FS are supported stages on this hardware.
+ */
+static int
+nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
+                             enum pipe_shader_cap param)
+{
+   switch (shader) {
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_GEOMETRY:
+   case PIPE_SHADER_FRAGMENT:
+      break;
+   default:
+      return 0;
+   }
+
+   switch (param) {
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+      return 16384;
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      return 4;
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      if (shader == PIPE_SHADER_VERTEX)
+         return 32;
+      /* 0x300 bytes of input space, 16 bytes per vec4 slot */
+      return 0x300 / 16;
+   case PIPE_SHADER_CAP_MAX_CONSTS:
+      /* 64 KiB constant buffer, 16 bytes per vec4 constant */
+      return 65536 / 16;
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      return NV50_MAX_PIPE_CONSTBUFS;
+   case PIPE_SHADER_CAP_MAX_ADDRS:
+      return 1;
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      return shader != PIPE_SHADER_FRAGMENT;
+   case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      return 1;
+   case PIPE_SHADER_CAP_MAX_PREDS:
+      return 0;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return nv50_screen(pscreen)->max_tls_space / ONE_TEMP_SIZE;
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+      return 0;
+   case PIPE_SHADER_CAP_SUBROUTINES:
+      return 0; /* please inline, or provide function declarations */
+   case PIPE_SHADER_CAP_INTEGERS:
+      return 1;
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+      return 32;
+   default:
+      NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
+      return 0;
+   }
+}
+
+/* pipe_screen::get_paramf - floating-point capability limits. */
+static float
+nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+   switch (param) {
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      return 10.0f;
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return 64.0f;
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return 16.0f;
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 4.0f;
+   default:
+      NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
+      return 0.0f;
+   }
+}
+
+/* pipe_screen::destroy - tear down all screen-wide GPU objects. Waits on
+ * the last fence first so buffers are idle before they are released.
+ */
+static void
+nv50_screen_destroy(struct pipe_screen *pscreen)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+
+   if (screen->base.fence.current) {
+      nouveau_fence_wait(screen->base.fence.current);
+      nouveau_fence_ref (NULL, &screen->base.fence.current);
+   }
+   /* unhook ourselves from the pushbuf before it can be reused */
+   if (screen->base.pushbuf)
+      screen->base.pushbuf->user_priv = NULL;
+
+   if (screen->blitter)
+      nv50_blitter_destroy(screen);
+
+   nouveau_bo_ref(NULL, &screen->code);
+   nouveau_bo_ref(NULL, &screen->tls_bo);
+   nouveau_bo_ref(NULL, &screen->stack_bo);
+   nouveau_bo_ref(NULL, &screen->txc);
+   nouveau_bo_ref(NULL, &screen->uniforms);
+   nouveau_bo_ref(NULL, &screen->fence.bo);
+
+   nouveau_heap_destroy(&screen->vp_code_heap);
+   nouveau_heap_destroy(&screen->gp_code_heap);
+   nouveau_heap_destroy(&screen->fp_code_heap);
+
+   FREE(screen->tic.entries);
+
+   nouveau_object_del(&screen->tesla);
+   nouveau_object_del(&screen->eng2d);
+   nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->sync);
+
+   nouveau_screen_fini(&screen->base);
+
+   FREE(screen);
+}
+
+/* Emit a QUERY_GET that makes the GPU write the next fence sequence number
+ * into the screen's fence buffer; nv50_screen_fence_update reads it back.
+ */
+static void
+nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+
+   /* we need to do it after possible flush in MARK_RING */
+   *sequence = ++screen->base.fence.sequence;
+
+   PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
+   PUSH_DATAh(push, screen->fence.bo->offset);
+   PUSH_DATA (push, screen->fence.bo->offset);
+   PUSH_DATA (push, *sequence);
+   PUSH_DATA (push, NV50_3D_QUERY_GET_MODE_WRITE_UNK0 |
+                    NV50_3D_QUERY_GET_UNK4 |
+                    NV50_3D_QUERY_GET_UNIT_CROP |
+                    NV50_3D_QUERY_GET_TYPE_QUERY |
+                    NV50_3D_QUERY_GET_QUERY_SELECT_ZERO |
+                    NV50_3D_QUERY_GET_SHORT);
+}
+
+/* Read back the most recent fence sequence the GPU has written. */
+static u32
+nv50_screen_fence_update(struct pipe_screen *pscreen)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+
+   return screen->fence.map[0];
+}
+
+static void
+nv50_screen_init_hwctx(struct nv50_screen *screen)
+{
+ struct nouveau_pushbuf *push = screen->base.pushbuf;
+ struct nv04_fifo *fifo;
+ unsigned i;
+
+ fifo = (struct nv04_fifo *)screen->base.channel->data;
+
+ BEGIN_NV04(push, SUBC_M2MF(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push, screen->m2mf->handle);
+ BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_DMA_NOTIFY), 3);
+ PUSH_DATA (push, screen->sync->handle);
+ PUSH_DATA (push, fifo->vram);
+ PUSH_DATA (push, fifo->vram);
+
+ BEGIN_NV04(push, SUBC_2D(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push, screen->eng2d->handle);
+ BEGIN_NV04(push, NV50_2D(DMA_NOTIFY), 4);
+ PUSH_DATA (push, screen->sync->handle);
+ PUSH_DATA (push, fifo->vram);
+ PUSH_DATA (push, fifo->vram);
+ PUSH_DATA (push, fifo->vram);
+ BEGIN_NV04(push, NV50_2D(OPERATION), 1);
+ PUSH_DATA (push, NV50_2D_OPERATION_SRCCOPY);
+ BEGIN_NV04(push, NV50_2D(CLIP_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_2D(COLOR_KEY_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, SUBC_2D(0x0888), 1);
+ PUSH_DATA (push, 1);
+
+ BEGIN_NV04(push, SUBC_3D(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push, screen->tesla->handle);
+
+ BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+ PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
+ BEGIN_NV04(push, NV50_3D(DMA_NOTIFY), 1);
+ PUSH_DATA (push, screen->sync->handle);
+ BEGIN_NV04(push, NV50_3D(DMA_ZETA), 11);
+ for (i = 0; i < 11; ++i)
+ PUSH_DATA(push, fifo->vram);
+ BEGIN_NV04(push, NV50_3D(DMA_COLOR(0)), NV50_3D_DMA_COLOR__LEN);
+ for (i = 0; i < NV50_3D_DMA_COLOR__LEN; ++i)
+ PUSH_DATA(push, fifo->vram);
+
+ BEGIN_NV04(push, NV50_3D(REG_MODE), 1);
+ PUSH_DATA (push, NV50_3D_REG_MODE_STRIPED);
+ BEGIN_NV04(push, NV50_3D(UNK1400_LANES), 1);
+ PUSH_DATA (push, 0xf);
+
+ if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+ BEGIN_NV04(push, NV50_3D(WATCHDOG_TIMER), 1);
+ PUSH_DATA (push, 0x18);
+ }
+
+ BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
+ PUSH_DATA (push, 1);
+
+ BEGIN_NV04(push, NV50_3D(CSAA_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(MULTISAMPLE_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+ PUSH_DATA (push, NV50_3D_MULTISAMPLE_MODE_MS1);
+ BEGIN_NV04(push, NV50_3D(MULTISAMPLE_CTRL), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(LINE_LAST_PIXEL), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(BLEND_SEPARATE_ALPHA), 1);
+ PUSH_DATA (push, 1);
+
+ if (screen->tesla->oclass >= NVA0_3D_CLASS) {
+ BEGIN_NV04(push, SUBC_3D(NVA0_3D_TEX_MISC), 1);
+ PUSH_DATA (push, NVA0_3D_TEX_MISC_SEAMLESS_CUBE_MAP);
+ }
+
+ BEGIN_NV04(push, NV50_3D(SCREEN_Y_CONTROL), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(WINDOW_OFFSET_X), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(ZCULL_REGION), 1);
+ PUSH_DATA (push, 0x3f);
+
+ BEGIN_NV04(push, NV50_3D(VP_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->code->offset + (0 << NV50_CODE_BO_SIZE_LOG2));
+ PUSH_DATA (push, screen->code->offset + (0 << NV50_CODE_BO_SIZE_LOG2));
+
+ BEGIN_NV04(push, NV50_3D(FP_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->code->offset + (1 << NV50_CODE_BO_SIZE_LOG2));
+ PUSH_DATA (push, screen->code->offset + (1 << NV50_CODE_BO_SIZE_LOG2));
+
+ BEGIN_NV04(push, NV50_3D(GP_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->code->offset + (2 << NV50_CODE_BO_SIZE_LOG2));
+ PUSH_DATA (push, screen->code->offset + (2 << NV50_CODE_BO_SIZE_LOG2));
+
+ BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->tls_bo->offset);
+ PUSH_DATA (push, screen->tls_bo->offset);
+ PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
+
+ BEGIN_NV04(push, NV50_3D(STACK_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->stack_bo->offset);
+ PUSH_DATA (push, screen->stack_bo->offset);
+ PUSH_DATA (push, 4);
+
+ BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->uniforms->offset + (0 << 16));
+ PUSH_DATA (push, screen->uniforms->offset + (0 << 16));
+ PUSH_DATA (push, (NV50_CB_PVP << 16) | 0x0000);
+
+ BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->uniforms->offset + (1 << 16));
+ PUSH_DATA (push, screen->uniforms->offset + (1 << 16));
+ PUSH_DATA (push, (NV50_CB_PGP << 16) | 0x0000);
+
+ BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->uniforms->offset + (2 << 16));
+ PUSH_DATA (push, screen->uniforms->offset + (2 << 16));
+ PUSH_DATA (push, (NV50_CB_PFP << 16) | 0x0000);
+
+ BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->uniforms->offset + (3 << 16));
+ PUSH_DATA (push, screen->uniforms->offset + (3 << 16));
+ PUSH_DATA (push, (NV50_CB_AUX << 16) | 0x0200);
+
+ BEGIN_NI04(push, NV50_3D(SET_PROGRAM_CB), 3);
+ PUSH_DATA (push, (NV50_CB_AUX << 12) | 0xf01);
+ PUSH_DATA (push, (NV50_CB_AUX << 12) | 0xf21);
+ PUSH_DATA (push, (NV50_CB_AUX << 12) | 0xf31);
+
+ /* return { 0.0, 0.0, 0.0, 0.0 } on out-of-bounds vtxbuf access */
+ BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
+ PUSH_DATA (push, ((1 << 9) << 6) | NV50_CB_AUX);
+ BEGIN_NI04(push, NV50_3D(CB_DATA(0)), 4);
+ PUSH_DATAf(push, 0.0f);
+ PUSH_DATAf(push, 0.0f);
+ PUSH_DATAf(push, 0.0f);
+ PUSH_DATAf(push, 0.0f);
+ BEGIN_NV04(push, NV50_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->uniforms->offset + (3 << 16) + (1 << 9));
+ PUSH_DATA (push, screen->uniforms->offset + (3 << 16) + (1 << 9));
+
+ /* max TIC (bits 4:8) & TSC bindings, per program type */
+ for (i = 0; i < 3; ++i) {
+ BEGIN_NV04(push, NV50_3D(TEX_LIMITS(i)), 1);
+ PUSH_DATA (push, 0x54);
+ }
+
+ BEGIN_NV04(push, NV50_3D(TIC_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->txc->offset);
+ PUSH_DATA (push, screen->txc->offset);
+ PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
+
+ BEGIN_NV04(push, NV50_3D(TSC_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->txc->offset + 65536);
+ PUSH_DATA (push, screen->txc->offset + 65536);
+ PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
+
+ BEGIN_NV04(push, NV50_3D(LINKED_TSC), 1);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, NV50_3D(CLIP_RECTS_EN), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(CLIP_RECTS_MODE), 1);
+ PUSH_DATA (push, NV50_3D_CLIP_RECTS_MODE_INSIDE_ANY);
+ BEGIN_NV04(push, NV50_3D(CLIP_RECT_HORIZ(0)), 8 * 2);
+ for (i = 0; i < 8 * 2; ++i)
+ PUSH_DATA(push, 0);
+ BEGIN_NV04(push, NV50_3D(CLIPID_ENABLE), 1);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_TRANSFORM_EN), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(DEPTH_RANGE_NEAR(0)), 2);
+ PUSH_DATAf(push, 0.0f);
+ PUSH_DATAf(push, 1.0f);
+
+ BEGIN_NV04(push, NV50_3D(VIEW_VOLUME_CLIP_CTRL), 1);
+#ifdef NV50_SCISSORS_CLIPPING
+ PUSH_DATA (push, 0x0000);
+#else
+ PUSH_DATA (push, 0x1080);
+#endif
+
+ BEGIN_NV04(push, NV50_3D(CLEAR_FLAGS), 1);
+ PUSH_DATA (push, NV50_3D_CLEAR_FLAGS_CLEAR_RECT_VIEWPORT);
+
+ /* We use scissors instead of exact view volume clipping,
+ * so they're always enabled.
+ */
+ BEGIN_NV04(push, NV50_3D(SCISSOR_ENABLE(0)), 3);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 8192 << 16);
+ PUSH_DATA (push, 8192 << 16);
+
+ BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(POINT_RASTER_RULES), 1);
+ PUSH_DATA (push, NV50_3D_POINT_RASTER_RULES_OGL);
+ BEGIN_NV04(push, NV50_3D(FRAG_COLOR_CLAMP_EN), 1);
+ PUSH_DATA (push, 0x11111111);
+ BEGIN_NV04(push, NV50_3D(EDGEFLAG), 1);
+ PUSH_DATA (push, 1);
+
+ PUSH_KICK (push);
+}
+
+/* Allocate the thread-local storage buffer that holds shader temporaries.
+ * The requested space is rounded up to a power of two in ONE_TEMP_SIZE
+ * units, then scaled for every TP/MP/warp/thread so each lane gets its
+ * own slice.  The result lands in screen->tls_bo and its total size is
+ * returned through *tls_size.
+ * Returns 0 on success or the negative error from nouveau_bo_new.
+ */
+static int nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space,
+ uint64_t *tls_size)
+{
+ struct nouveau_device *dev = screen->base.device;
+ int ret;
+
+ /* Record the rounded-up per-thread space; consumers read cur_tls_space
+ * (e.g. the LOCAL_ADDRESS_HIGH setup) rather than the raw request. */
+ screen->cur_tls_space = util_next_power_of_two(tls_space / ONE_TEMP_SIZE) *
+ ONE_TEMP_SIZE;
+ if (nouveau_mesa_debug)
+ debug_printf("allocating space for %u temps\n",
+ util_next_power_of_two(tls_space / ONE_TEMP_SIZE));
+ *tls_size = screen->cur_tls_space * util_next_power_of_two(screen->TPs) *
+ screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP;
+
+ /* 64 KiB alignment as required for the LOCAL_ADDRESS registers. */
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+ *tls_size, NULL, &screen->tls_bo);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Grow the TLS buffer when a shader needs more temporary space than is
+ * currently allocated.  Returns 0 if the current buffer already suffices,
+ * 1 if a new buffer was allocated and programmed (callers must treat the
+ * old TLS contents as gone), or a negative error code on failure.
+ * NOTE(review): an exactly-equal request (tls_space == cur_tls_space)
+ * falls through and reallocates -- confirm this is intended.
+ */
+int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
+{
+ struct nouveau_pushbuf *push = screen->base.pushbuf;
+ int ret;
+ uint64_t tls_size;
+
+ if (tls_space < screen->cur_tls_space)
+ return 0;
+ if (tls_space > screen->max_tls_space) {
+ /* fixable by limiting number of warps (LOCAL_WARPS_LOG_ALLOC /
+ * LOCAL_WARPS_NO_CLAMP) */
+ NOUVEAU_ERR("Unsupported number of temporaries (%u > %u). Fixable if someone cares.\n",
+ (unsigned)(tls_space / ONE_TEMP_SIZE),
+ (unsigned)(screen->max_tls_space / ONE_TEMP_SIZE));
+ return -ENOMEM;
+ }
+
+ /* Drop the old buffer first; nv50_tls_alloc writes a fresh one into
+ * screen->tls_bo. */
+ nouveau_bo_ref(NULL, &screen->tls_bo);
+ ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+ if (ret)
+ return ret;
+
+ /* Point the hardware at the new local memory area.  The last word is
+ * log2 of the per-thread size in 8-byte units. */
+ BEGIN_NV04(push, NV50_3D(LOCAL_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, screen->tls_bo->offset);
+ PUSH_DATA (push, screen->tls_bo->offset);
+ PUSH_DATA (push, util_logbase2(screen->cur_tls_space / 8));
+
+ return 1;
+}
+
+/* Create and initialize the nv50 pipe_screen: base screen setup, fence
+ * notifier, PGRAPH objects (M2MF, 2D, 3D), and the VRAM buffers backing
+ * shader code, the stack, TLS, uniforms and the TIC/TSC tables.
+ * Returns the new screen or NULL on failure; on any failure path the
+ * partially constructed screen is torn down via nv50_screen_destroy.
+ */
+struct pipe_screen *
+nv50_screen_create(struct nouveau_device *dev)
+{
+ struct nv50_screen *screen;
+ struct pipe_screen *pscreen;
+ struct nouveau_object *chan;
+ uint64_t value;
+ uint32_t tesla_class;
+ unsigned stack_size;
+ int ret;
+
+ screen = CALLOC_STRUCT(nv50_screen);
+ if (!screen)
+ return NULL;
+ pscreen = &screen->base.base;
+
+ ret = nouveau_screen_init(&screen->base, dev);
+ if (ret) {
+ NOUVEAU_ERR("nouveau_screen_init failed: %d\n", ret);
+ goto fail;
+ }
+
+ /* TODO: Prevent FIFO prefetch before transfer of index buffers and
+ * admit them to VRAM.
+ */
+ screen->base.vidmem_bindings |= PIPE_BIND_CONSTANT_BUFFER |
+ PIPE_BIND_VERTEX_BUFFER;
+ screen->base.sysmem_bindings |=
+ PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
+
+ screen->base.pushbuf->user_priv = screen;
+ screen->base.pushbuf->rsvd_kick = 5;
+
+ chan = screen->base.channel;
+
+ /* Hook up the pipe_screen vtable. */
+ pscreen->destroy = nv50_screen_destroy;
+ pscreen->context_create = nv50_create;
+ pscreen->is_format_supported = nv50_screen_is_format_supported;
+ pscreen->get_param = nv50_screen_get_param;
+ pscreen->get_shader_param = nv50_screen_get_shader_param;
+ pscreen->get_paramf = nv50_screen_get_paramf;
+
+ nv50_screen_init_resource_functions(pscreen);
+
+ /* Pick the video decode backend by chipset generation. */
+ if (screen->base.device->chipset < 0x84 ||
+ debug_get_bool_option("NOUVEAU_PMPEG", FALSE)) {
+ /* PMPEG */
+ nouveau_screen_init_vdec(&screen->base);
+ } else if (screen->base.device->chipset < 0x98 ||
+ screen->base.device->chipset == 0xa0) {
+ /* VP2 */
+ screen->base.base.get_video_param = nv84_screen_get_video_param;
+ screen->base.base.is_video_format_supported = nv84_screen_video_supported;
+ } else {
+ /* VP3/4 */
+ screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param;
+ screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
+ }
+
+ /* CPU-mappable fence buffer in GART. */
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096,
+ NULL, &screen->fence.bo);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate fence bo: %d\n", ret);
+ goto fail;
+ }
+
+ /* NOTE(review): nouveau_bo_map return value is ignored here; a failed
+ * map would leave fence.map NULL -- confirm callers tolerate that. */
+ nouveau_bo_map(screen->fence.bo, 0, NULL);
+ screen->fence.map = screen->fence.bo->map;
+ screen->base.fence.emit = nv50_screen_fence_emit;
+ screen->base.fence.update = nv50_screen_fence_update;
+
+ ret = nouveau_object_new(chan, 0xbeef0301, NOUVEAU_NOTIFIER_CLASS,
+ &(struct nv04_notify){ .length = 32 },
+ sizeof(struct nv04_notify), &screen->sync);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate notifier: %d\n", ret);
+ goto fail;
+ }
+
+ ret = nouveau_object_new(chan, 0xbeef5039, NV50_M2MF_CLASS,
+ NULL, 0, &screen->m2mf);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate PGRAPH context for M2MF: %d\n", ret);
+ goto fail;
+ }
+
+ ret = nouveau_object_new(chan, 0xbeef502d, NV50_2D_CLASS,
+ NULL, 0, &screen->eng2d);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate PGRAPH context for 2D: %d\n", ret);
+ goto fail;
+ }
+
+ /* Select the Tesla (3D) class matching the chipset. */
+ switch (dev->chipset & 0xf0) {
+ case 0x50:
+ tesla_class = NV50_3D_CLASS;
+ break;
+ case 0x80:
+ case 0x90:
+ tesla_class = NV84_3D_CLASS;
+ break;
+ case 0xa0:
+ switch (dev->chipset) {
+ case 0xa0:
+ case 0xaa:
+ case 0xac:
+ tesla_class = NVA0_3D_CLASS;
+ break;
+ case 0xaf:
+ tesla_class = NVAF_3D_CLASS;
+ break;
+ default:
+ tesla_class = NVA3_3D_CLASS;
+ break;
+ }
+ break;
+ default:
+ NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", dev->chipset);
+ goto fail;
+ }
+ screen->base.class_3d = tesla_class;
+
+ ret = nouveau_object_new(chan, 0xbeef5097, tesla_class,
+ NULL, 0, &screen->tesla);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate PGRAPH context for 3D: %d\n", ret);
+ goto fail;
+ }
+
+ /* One code BO holds three equal regions: VP, FP, GP code heaps. */
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
+ 3 << NV50_CODE_BO_SIZE_LOG2, NULL, &screen->code);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate code bo: %d\n", ret);
+ goto fail;
+ }
+
+ nouveau_heap_init(&screen->vp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2);
+ nouveau_heap_init(&screen->gp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2);
+ nouveau_heap_init(&screen->fp_code_heap, 0, 1 << NV50_CODE_BO_SIZE_LOG2);
+
+ /* NOTE(review): nouveau_getparam failure is not checked; 'value' would
+ * be used uninitialized in that case -- confirm the call cannot fail
+ * on supported kernels. */
+ nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
+
+ /* GRAPH_UNITS packs a TP mask in the low 16 bits and an MP-per-TP
+ * mask in bits 24..27. */
+ screen->TPs = util_bitcount(value & 0xffff);
+ screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
+
+ stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
+ STACK_WARPS_ALLOC * 64 * 8;
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, stack_size, NULL,
+ &screen->stack_bo);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate stack bo: %d\n", ret);
+ goto fail;
+ }
+
+ /* Bound the TLS allocation: at most half of VRAM, and no more than
+ * the 64 KiB per thread the hardware can address. */
+ uint64_t size_of_one_temp = util_next_power_of_two(screen->TPs) *
+ screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP *
+ ONE_TEMP_SIZE;
+ screen->max_tls_space = dev->vram_size / size_of_one_temp * ONE_TEMP_SIZE;
+ screen->max_tls_space /= 2; /* half of vram */
+
+ /* hw can address max 64 KiB */
+ screen->max_tls_space = MIN2(screen->max_tls_space, 64 << 10);
+
+ uint64_t tls_size;
+ unsigned tls_space = 4/*temps*/ * ONE_TEMP_SIZE;
+ ret = nv50_tls_alloc(screen, tls_space, &tls_size);
+ if (ret)
+ goto fail;
+
+ if (nouveau_mesa_debug)
+ debug_printf("TPs = %u, MPsInTP = %u, VRAM = %"PRIu64" MiB, tls_size = %"PRIu64" KiB\n",
+ screen->TPs, screen->MPsInTP, dev->vram_size >> 20, tls_size >> 10);
+
+ /* Four 64 KiB constant buffer regions (PVP/PGP/PFP/AUX). */
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 4 << 16, NULL,
+ &screen->uniforms);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate uniforms bo: %d\n", ret);
+ goto fail;
+ }
+
+ /* TIC at offset 0, TSC at offset 65536 (see nv50_screen.h). */
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16, 3 << 16, NULL,
+ &screen->txc);
+ if (ret) {
+ NOUVEAU_ERR("Failed to allocate TIC/TSC bo: %d\n", ret);
+ goto fail;
+ }
+
+ /* One array backs both tables: 2048 TIC slots, then 2048 TSC slots.
+ * NOTE(review): CALLOC result is not NULL-checked here. */
+ screen->tic.entries = CALLOC(4096, sizeof(void *));
+ screen->tsc.entries = screen->tic.entries + 2048;
+
+ if (!nv50_blitter_create(screen))
+ goto fail;
+
+ nv50_screen_init_hwctx(screen);
+
+ nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+
+ return pscreen;
+
+fail:
+ nv50_screen_destroy(pscreen);
+ return NULL;
+}
+
+/* Allocate a TIC (texture image control) slot for 'entry'.  Scans from
+ * the round-robin cursor for an unlocked slot, evicting whatever entry
+ * currently occupies it (the evicted entry's id is set to -1 so it will
+ * re-upload on next use).  Returns the slot index.
+ * NOTE(review): if every slot were locked this loop would not
+ * terminate -- presumably bounded by the number of simultaneously bound
+ * textures being far below NV50_TIC_MAX_ENTRIES.
+ */
+int
+nv50_screen_tic_alloc(struct nv50_screen *screen, void *entry)
+{
+ int i = screen->tic.next;
+
+ while (screen->tic.lock[i / 32] & (1 << (i % 32)))
+ i = (i + 1) & (NV50_TIC_MAX_ENTRIES - 1);
+
+ screen->tic.next = (i + 1) & (NV50_TIC_MAX_ENTRIES - 1);
+
+ if (screen->tic.entries[i])
+ nv50_tic_entry(screen->tic.entries[i])->id = -1;
+
+ screen->tic.entries[i] = entry;
+ return i;
+}
+
+/* Allocate a TSC (texture sampler control) slot for 'entry'.  Same
+ * round-robin eviction scheme as nv50_screen_tic_alloc, but over the
+ * sampler table.  Returns the slot index.
+ */
+int
+nv50_screen_tsc_alloc(struct nv50_screen *screen, void *entry)
+{
+ int i = screen->tsc.next;
+
+ while (screen->tsc.lock[i / 32] & (1 << (i % 32)))
+ i = (i + 1) & (NV50_TSC_MAX_ENTRIES - 1);
+
+ screen->tsc.next = (i + 1) & (NV50_TSC_MAX_ENTRIES - 1);
+
+ if (screen->tsc.entries[i])
+ nv50_tsc_entry(screen->tsc.entries[i])->id = -1;
+
+ screen->tsc.entries[i] = entry;
+ return i;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
new file mode 100644
index 00000000000..091a3921a4b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -0,0 +1,153 @@
+#ifndef __NV50_SCREEN_H__
+#define __NV50_SCREEN_H__
+
+#include "nouveau_screen.h"
+#include "nouveau_fence.h"
+#include "nouveau_mm.h"
+#include "nouveau_heap.h"
+
+#include "nv50/nv50_winsys.h"
+#include "nv50/nv50_stateobj.h"
+
+/* Sizes of the hardware texture-image (TIC) and sampler (TSC) tables. */
+#define NV50_TIC_MAX_ENTRIES 2048
+#define NV50_TSC_MAX_ENTRIES 2048
+
+/* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
+#define NV50_MAX_PIPE_CONSTBUFS 14
+
+struct nv50_context;
+
+/* log2 of the per-stage shader code region (512 KiB each for VP/FP/GP). */
+#define NV50_CODE_BO_SIZE_LOG2 19
+
+#define NV50_SCREEN_RESIDENT_BO_COUNT 5
+
+struct nv50_blitter;
+
+/* Per-device screen state for the nv50 (Tesla) driver: the GPU objects,
+ * VRAM buffers and slot allocators shared by all contexts. */
+struct nv50_screen {
+ struct nouveau_screen base;
+
+ struct nv50_context *cur_ctx;
+
+ struct nouveau_bo *code; /* shader code, one region per stage */
+ struct nouveau_bo *uniforms; /* constant buffer storage */
+ struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
+ struct nouveau_bo *stack_bo; /* shader call/control-flow stack */
+ struct nouveau_bo *tls_bo; /* thread-local storage (temporaries) */
+
+ unsigned TPs; /* texture processor count (from GRAPH_UNITS) */
+ unsigned MPsInTP; /* multiprocessors per TP */
+ unsigned max_tls_space; /* upper bound accepted by nv50_tls_realloc */
+ unsigned cur_tls_space; /* currently allocated per-thread space */
+
+ struct nouveau_heap *vp_code_heap;
+ struct nouveau_heap *gp_code_heap;
+ struct nouveau_heap *fp_code_heap;
+
+ struct nv50_blitter *blitter;
+
+ /* TIC/TSC slot allocators: 'lock' bits mark slots pinned by bound
+ * state; 'next' is the round-robin cursor used on allocation. */
+ struct {
+ void **entries;
+ int next;
+ uint32_t lock[NV50_TIC_MAX_ENTRIES / 32];
+ } tic;
+
+ struct {
+ void **entries;
+ int next;
+ uint32_t lock[NV50_TSC_MAX_ENTRIES / 32];
+ } tsc;
+
+ /* CPU-visible fence buffer (mapped in nv50_screen_create). */
+ struct {
+ uint32_t *map;
+ struct nouveau_bo *bo;
+ } fence;
+
+ struct nouveau_object *sync; /* notifier object */
+
+ struct nouveau_object *tesla; /* 3D engine */
+ struct nouveau_object *eng2d; /* 2D engine */
+ struct nouveau_object *m2mf; /* memory-to-memory copy engine */
+};
+
+/* Downcast a pipe_screen to the driver's nv50_screen (valid because
+ * nouveau_screen/pipe_screen is the first member). */
+static INLINE struct nv50_screen *
+nv50_screen(struct pipe_screen *screen)
+{
+ return (struct nv50_screen *)screen;
+}
+
+/* Blitter lifecycle and TIC/TSC slot allocation; defined in the screen
+ * implementation files. */
+boolean nv50_blitter_create(struct nv50_screen *);
+void nv50_blitter_destroy(struct nv50_screen *);
+
+int nv50_screen_tic_alloc(struct nv50_screen *, void *);
+int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
+
+/* Attach the screen's current fence to a suballocated resource so its
+ * backing memory is not recycled while the GPU may still access it.
+ * Write accesses (NOUVEAU_BO_WR) additionally update the write fence. */
+static INLINE void
+nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
+{
+ struct nv50_screen *screen = nv50_screen(res->base.screen);
+
+ /* Only suballocated resources (res->mm) need fencing here. */
+ if (res->mm) {
+ nouveau_fence_ref(screen->base.fence.current, &res->fence);
+ if (flags & NOUVEAU_BO_WR)
+ nouveau_fence_ref(screen->base.fence.current, &res->fence_wr);
+ }
+}
+
+/* Mark a resource as in use by the GPU for the given access flags
+ * (setting the reading/writing/dirty status bits) and fence it. */
+static INLINE void
+nv50_resource_validate(struct nv04_resource *res, uint32_t flags)
+{
+ if (likely(res->bo)) {
+ if (flags & NOUVEAU_BO_WR)
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING |
+ NOUVEAU_BUFFER_STATUS_DIRTY;
+ if (flags & NOUVEAU_BO_RD)
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ nv50_resource_fence(res, flags);
+ }
+}
+
+/* Per-pipe-format hardware encodings: render target, texture (TIC) and
+ * vertex fetch formats, plus the supported-usage bind flags. */
+struct nv50_format {
+ uint32_t rt;
+ uint32_t tic;
+ uint32_t vtx;
+ uint32_t usage;
+};
+
+/* Indexed by pipe_format; defined in the format table source file. */
+extern const struct nv50_format nv50_format_table[];
+
+/* Clear the lock bit for a TIC entry so its slot may be reused;
+ * id < 0 means the entry currently has no slot. */
+static INLINE void
+nv50_screen_tic_unlock(struct nv50_screen *screen, struct nv50_tic_entry *tic)
+{
+ if (tic->id >= 0)
+ screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
+}
+
+/* Clear the lock bit for a TSC entry so its slot may be reused;
+ * id < 0 means the entry currently has no slot. */
+static INLINE void
+nv50_screen_tsc_unlock(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
+{
+ if (tsc->id >= 0)
+ screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
+}
+
+/* Release a TIC entry's slot entirely: drop the back-pointer from the
+ * table and clear the lock bit. */
+static INLINE void
+nv50_screen_tic_free(struct nv50_screen *screen, struct nv50_tic_entry *tic)
+{
+ if (tic->id >= 0) {
+ screen->tic.entries[tic->id] = NULL;
+ screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
+ }
+}
+
+/* Release a TSC entry's slot entirely: drop the back-pointer from the
+ * table and clear the lock bit. */
+static INLINE void
+nv50_screen_tsc_free(struct nv50_screen *screen, struct nv50_tsc_entry *tsc)
+{
+ if (tsc->id >= 0) {
+ screen->tsc.entries[tsc->id] = NULL;
+ screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
+ }
+}
+
+extern int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
new file mode 100644
index 00000000000..9144fc48d95
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "nv50/nv50_context.h"
+
+/* Re-bind all dirty constant buffers for the three shader stages.
+ * User (CPU-side) buffers are only supported in slot 0 and are streamed
+ * inline through CB_ADDR/CB_DATA; resource-backed buffers are bound by
+ * address via CB_DEF_ADDRESS + SET_PROGRAM_CB. */
+void
+nv50_constbufs_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned s;
+
+ for (s = 0; s < 3; ++s) {
+ unsigned p;
+
+ if (s == PIPE_SHADER_FRAGMENT)
+ p = NV50_3D_SET_PROGRAM_CB_PROGRAM_FRAGMENT;
+ else
+ if (s == PIPE_SHADER_GEOMETRY)
+ p = NV50_3D_SET_PROGRAM_CB_PROGRAM_GEOMETRY;
+ else
+ p = NV50_3D_SET_PROGRAM_CB_PROGRAM_VERTEX;
+
+ /* Process dirty slots lowest-first until none remain. */
+ while (nv50->constbuf_dirty[s]) {
+ const int i = ffs(nv50->constbuf_dirty[s]) - 1;
+ nv50->constbuf_dirty[s] &= ~(1 << i);
+
+ if (nv50->constbuf[s][i].user) {
+ /* User constbuf: stream the data into the per-stage buffer
+ * region (NV50_CB_PVP + stage). */
+ const unsigned b = NV50_CB_PVP + s;
+ unsigned start = 0;
+ unsigned words = nv50->constbuf[s][0].size / 4;
+ if (i) {
+ NOUVEAU_ERR("user constbufs only supported in slot 0\n");
+ continue;
+ }
+ if (!nv50->state.uniform_buffer_bound[s]) {
+ nv50->state.uniform_buffer_bound[s] = TRUE;
+ BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
+ PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
+ }
+ /* Upload in chunks bounded by pushbuf space and the maximum
+ * FIFO packet length. */
+ while (words) {
+ unsigned nr;
+
+ if (!PUSH_SPACE(push, 16))
+ break;
+ nr = PUSH_AVAIL(push);
+ assert(nr >= 16);
+ nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN);
+
+ BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
+ PUSH_DATA (push, (start << 8) | b);
+ BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr);
+ PUSH_DATAp(push, &nv50->constbuf[s][0].u.data[start * 4], nr);
+
+ start += nr;
+ words -= nr;
+ }
+ } else {
+ struct nv04_resource *res =
+ nv04_resource(nv50->constbuf[s][i].u.buf);
+ if (res) {
+ /* TODO: allocate persistent bindings */
+ const unsigned b = s * 16 + i;
+
+ assert(nouveau_resource_mapped_by_gpu(&res->base));
+
+ BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+ PUSH_DATAh(push, res->address + nv50->constbuf[s][i].offset);
+ PUSH_DATA (push, res->address + nv50->constbuf[s][i].offset);
+ PUSH_DATA (push, (b << 16) |
+ (nv50->constbuf[s][i].size & 0xffff));
+ BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
+ PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
+
+ BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+ } else {
+ /* NULL buffer: unbind the slot (valid bit cleared). */
+ BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
+ PUSH_DATA (push, (i << 8) | p | 0);
+ }
+ if (i == 0)
+ nv50->state.uniform_buffer_bound[s] = FALSE;
+ }
+ }
+ }
+}
+
+/* Ensure a program is translated to hardware code and uploaded.
+ * Returns FALSE if translation fails, TRUE once prog->mem is valid. */
+static boolean
+nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
+{
+ if (!prog->translated) {
+ prog->translated = nv50_program_translate(
+ prog, nv50->screen->base.device->chipset);
+ if (!prog->translated)
+ return FALSE;
+ } else
+ if (prog->mem)
+ return TRUE; /* already translated and uploaded */
+
+ return nv50_program_upload_code(nv50, prog);
+}
+
+/* Track per-stage TLS usage: reference the TLS buffer in the bufctx when
+ * the first stage needing it appears (or after a realloc), and drop the
+ * reference when the last stage using it goes away.  'stage' is a bit
+ * index into state.tls_required (0 = VP, 1 = FP, 2 = GP). */
+static INLINE void
+nv50_program_update_context_state(struct nv50_context *nv50,
+ struct nv50_program *prog, int stage)
+{
+ const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+
+ if (prog && prog->tls_space) {
+ /* A realloc replaced the BO, so the old reference must go. */
+ if (nv50->state.new_tls_space)
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+ if (!nv50->state.tls_required || nv50->state.new_tls_space)
+ BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
+ nv50->state.new_tls_space = FALSE;
+ nv50->state.tls_required |= 1 << stage;
+ } else {
+ /* Only drop the BO reference if this was the sole user. */
+ if (nv50->state.tls_required == (1 << stage))
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+ nv50->state.tls_required &= ~(1 << stage);
+ }
+}
+
+/* Validate and bind the current vertex program: attribute enable masks,
+ * register allocation counts and the code start offset. */
+void
+nv50_vertprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_program *vp = nv50->vertprog;
+
+ if (!nv50_program_validate(nv50, vp))
+ return;
+ nv50_program_update_context_state(nv50, vp, 0);
+
+ BEGIN_NV04(push, NV50_3D(VP_ATTR_EN(0)), 2);
+ PUSH_DATA (push, vp->vp.attrs[0]);
+ PUSH_DATA (push, vp->vp.attrs[1]);
+ BEGIN_NV04(push, NV50_3D(VP_REG_ALLOC_RESULT), 1);
+ PUSH_DATA (push, vp->max_out);
+ BEGIN_NV04(push, NV50_3D(VP_REG_ALLOC_TEMP), 1);
+ PUSH_DATA (push, vp->max_gpr);
+ BEGIN_NV04(push, NV50_3D(VP_START_ID), 1);
+ PUSH_DATA (push, vp->code_base);
+}
+
+/* Validate and bind the current fragment program: register allocation,
+ * result count, FP control flags and the code start offset. */
+void
+nv50_fragprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_program *fp = nv50->fragprog;
+
+ if (!nv50_program_validate(nv50, fp))
+ return;
+ nv50_program_update_context_state(nv50, fp, 1);
+
+ BEGIN_NV04(push, NV50_3D(FP_REG_ALLOC_TEMP), 1);
+ PUSH_DATA (push, fp->max_gpr);
+ BEGIN_NV04(push, NV50_3D(FP_RESULT_COUNT), 1);
+ PUSH_DATA (push, fp->max_out);
+ BEGIN_NV04(push, NV50_3D(FP_CONTROL), 1);
+ PUSH_DATA (push, fp->fp.flags[0]);
+ BEGIN_NV04(push, NV50_3D(FP_CTRL_UNK196C), 1);
+ PUSH_DATA (push, fp->fp.flags[1]);
+ BEGIN_NV04(push, NV50_3D(FP_START_ID), 1);
+ PUSH_DATA (push, fp->code_base);
+}
+
+/* Validate and bind the current geometry program, if any.  Unlike the
+ * VP/FP paths this emits state only when a GP is bound; the TLS context
+ * state is updated either way so a removed GP releases its TLS use.
+ * NOTE(review): gp is not run through nv50_program_validate here --
+ * presumably callers guarantee it is already translated/uploaded.
+ */
+void
+nv50_gmtyprog_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_program *gp = nv50->gmtyprog;
+
+ if (gp) {
+ BEGIN_NV04(push, NV50_3D(GP_REG_ALLOC_TEMP), 1);
+ PUSH_DATA (push, gp->max_gpr);
+ BEGIN_NV04(push, NV50_3D(GP_REG_ALLOC_RESULT), 1);
+ PUSH_DATA (push, gp->max_out);
+ BEGIN_NV04(push, NV50_3D(GP_OUTPUT_PRIMITIVE_TYPE), 1);
+ PUSH_DATA (push, gp->gp.prim_type);
+ BEGIN_NV04(push, NV50_3D(GP_VERTEX_OUTPUT_COUNT), 1);
+ PUSH_DATA (push, gp->gp.vert_count);
+ BEGIN_NV04(push, NV50_3D(GP_START_ID), 1);
+ PUSH_DATA (push, gp->code_base);
+
+ nv50->state.prim_size = gp->gp.prim_type; /* enum matches vertex count */
+ }
+ nv50_program_update_context_state(nv50, gp, 2);
+
+ /* GP_ENABLE is updated in linkage validation */
+}
+
+/* Program point-sprite texture-coordinate replacement.  When point-quad
+ * rasterization is off, any previously set replacement map is cleared;
+ * otherwise a map is built that marks, per FP input component, which
+ * coordinate (s/t/r/q, encoded as 1..4 in 4-bit fields) replaces it for
+ * the GENERIC inputs selected by sprite_coord_enable. */
+static void
+nv50_sprite_coords_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ uint32_t pntc[8], mode;
+ struct nv50_program *fp = nv50->fragprog;
+ unsigned i, c;
+ /* 'm' starts at the FP input map offset stored in interpolant_ctrl. */
+ unsigned m = (nv50->state.interpolant_ctrl >> 8) & 0xff;
+
+ if (!nv50->rast->pipe.point_quad_rasterization) {
+ if (nv50->state.point_sprite) {
+ /* Sprites were on before: zero the whole replacement map. */
+ BEGIN_NV04(push, NV50_3D(POINT_COORD_REPLACE_MAP(0)), 8);
+ for (i = 0; i < 8; ++i)
+ PUSH_DATA(push, 0);
+
+ nv50->state.point_sprite = FALSE;
+ }
+ return;
+ } else {
+ nv50->state.point_sprite = TRUE;
+ }
+
+ memset(pntc, 0, sizeof(pntc));
+
+ for (i = 0; i < fp->in_nr; i++) {
+ unsigned n = util_bitcount(fp->in[i].mask);
+
+ /* Non-replaced inputs just advance the map position. */
+ if (fp->in[i].sn != TGSI_SEMANTIC_GENERIC) {
+ m += n;
+ continue;
+ }
+ if (!(nv50->rast->pipe.sprite_coord_enable & (1 << fp->in[i].si))) {
+ m += n;
+ continue;
+ }
+
+ /* Eight 4-bit entries per 32-bit pntc word. */
+ for (c = 0; c < 4; ++c) {
+ if (fp->in[i].mask & (1 << c)) {
+ pntc[m / 8] |= (c + 1) << ((m % 8) * 4);
+ ++m;
+ }
+ }
+ }
+
+ if (nv50->rast->pipe.sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+ mode = 0x00;
+ else
+ mode = 0x10;
+
+ BEGIN_NV04(push, NV50_3D(POINT_SPRITE_CTRL), 1);
+ PUSH_DATA (push, mode);
+
+ BEGIN_NV04(push, NV50_3D(POINT_COORD_REPLACE_MAP(0)), 8);
+ PUSH_DATAp(push, pntc, 8);
+}
+
+/* Validate state derived from shaders and the rasterizer cso. */
+void
+nv50_validate_derived_rs(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ uint32_t color, psize;
+
+ nv50_sprite_coords_validate(nv50);
+
+ if (nv50->state.rasterizer_discard != nv50->rast->pipe.rasterizer_discard) {
+ nv50->state.rasterizer_discard = nv50->rast->pipe.rasterizer_discard;
+ BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1);
+ PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
+ }
+
+ /* If the FP changed, the semantic words will be rewritten wholesale
+ * by nv50_fp_linkage_validate -- nothing more to patch here. */
+ if (nv50->dirty & NV50_NEW_FRAGPROG)
+ return;
+ psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
+ color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
+
+ if (nv50->rast->pipe.clamp_vertex_color)
+ color |= NV50_3D_SEMANTIC_COLOR_CLMP_EN;
+
+ /* Only emit when the clamp bit actually changed. */
+ if (color != nv50->state.semantic_color) {
+ nv50->state.semantic_color = color;
+ BEGIN_NV04(push, NV50_3D(SEMANTIC_COLOR), 1);
+ PUSH_DATA (push, color);
+ }
+
+ if (nv50->rast->pipe.point_size_per_vertex)
+ psize |= NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
+
+ if (psize != nv50->state.semantic_psize) {
+ nv50->state.semantic_psize = psize;
+ BEGIN_NV04(push, NV50_3D(SEMANTIC_PTSZ), 1);
+ PUSH_DATA (push, psize);
+ }
+}
+
+/* Fill the FP input map for one vec4 varying: for every component the FP
+ * reads (mask 'mf'), write the VP output register id when the VP actually
+ * writes it ('mv'), otherwise leave the default placeholder; the
+ * placeholder for a missing .w gets bit 0 set (0x40 -> 0x41, i.e. the
+ * constant 1.0 slot).  Linear (non-perspective) inputs are recorded in
+ * the 'lin' bitmap.  Returns the updated map position. */
+static int
+nv50_vec4_map(uint8_t *map, int mid, uint32_t lin[4],
+ struct nv50_varying *in, struct nv50_varying *out)
+{
+ int c;
+ uint8_t mv = out->mask, mf = in->mask, oid = out->hw;
+
+ for (c = 0; c < 4; ++c) {
+ if (mf & 1) {
+ if (in->linear)
+ lin[mid / 32] |= 1 << (mid % 32);
+ if (mv & 1)
+ map[mid] = oid;
+ else
+ if (c == 3)
+ map[mid] |= 1; /* use constant 1.0 for an unwritten .w */
+ ++mid;
+ }
+
+ /* VP register ids advance only over components the VP writes. */
+ oid += mv & 1;
+ mf >>= 1;
+ mv >>= 1;
+ }
+
+ return mid;
+}
+
+/* Build and upload the (VP or GP) output -> FP input result map: HPOS,
+ * clip distances, front/back colors, generic varyings, PrimitiveID and
+ * point size, plus the linear-interpolation bitmap, semantic words and
+ * (if transform feedback is active) the stream-output map. */
+void
+nv50_fp_linkage_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ /* The FP is fed by the GP when one is bound, else by the VP. */
+ struct nv50_program *vp = nv50->gmtyprog ? nv50->gmtyprog : nv50->vertprog;
+ struct nv50_program *fp = nv50->fragprog;
+ struct nv50_varying dummy;
+ int i, n, c, m;
+ uint32_t primid = 0;
+ uint32_t psiz = 0x000;
+ uint32_t interp = fp->fp.interp;
+ uint32_t colors = fp->fp.colors;
+ uint32_t lin[4];
+ uint8_t map[64];
+ uint8_t so_map[64];
+
+ /* If no stage changed, the map is still valid unless the two-sided
+ * lighting setting disagrees with the recorded FFC0/BFC0 ids. */
+ if (!(nv50->dirty & (NV50_NEW_VERTPROG |
+ NV50_NEW_FRAGPROG |
+ NV50_NEW_GMTYPROG))) {
+ uint8_t bfc, ffc;
+ ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
+ bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
+ >> 8;
+ if (nv50->rast->pipe.light_twoside == ((ffc == bfc) ? 0 : 1))
+ return;
+ }
+
+ memset(lin, 0x00, sizeof(lin));
+
+ /* XXX: in buggy-endian mode, is the first element of map (u32)0x000000xx
+ * or is it the first byte ?
+ */
+ /* Default slot value: 0x40 (VP const) / 0x80 (GP) placeholder. */
+ memset(map, nv50->gmtyprog ? 0x80 : 0x40, sizeof(map));
+
+ dummy.mask = 0xf; /* map all components of HPOS */
+ dummy.linear = 0;
+ m = nv50_vec4_map(map, 0, lin, &dummy, &vp->out[0]);
+
+ /* Clip distances follow HPOS in the map. */
+ for (c = 0; c < vp->vp.clpd_nr; ++c)
+ map[m++] = vp->vp.clpd[c / 4] + (c % 4);
+
+ colors |= m << 8; /* adjust BFC0 id */
+
+ dummy.mask = 0x0;
+
+ /* if light_twoside is active, FFC0_ID == BFC0_ID is invalid */
+ if (nv50->rast->pipe.light_twoside) {
+ for (i = 0; i < 2; ++i) {
+ n = vp->vp.bfc[i];
+ if (fp->vp.bfc[i] >= fp->in_nr)
+ continue;
+ m = nv50_vec4_map(map, m, lin, &fp->in[fp->vp.bfc[i]],
+ (n < vp->out_nr) ? &vp->out[n] : &dummy);
+ }
+ }
+ colors += m - 4; /* adjust FFC0 id */
+ interp |= m << 8; /* set map id where 'normal' FP inputs start */
+
+ /* Match each remaining FP input to a VP output by semantic name/index;
+ * unmatched inputs map against the zero-mask dummy. */
+ for (i = 0; i < fp->in_nr; ++i) {
+ for (n = 0; n < vp->out_nr; ++n)
+ if (vp->out[n].sn == fp->in[i].sn &&
+ vp->out[n].si == fp->in[i].si)
+ break;
+ m = nv50_vec4_map(map, m, lin,
+ &fp->in[i], (n < vp->out_nr) ? &vp->out[n] : &dummy);
+ }
+
+ /* PrimitiveID either is replaced by the system value, or
+ * written by the geometry shader into an output register
+ */
+ if (fp->gp.primid < 0x80) {
+ primid = m;
+ map[m++] = vp->gp.primid;
+ }
+
+ if (nv50->rast->pipe.point_size_per_vertex) {
+ psiz = (m << 4) | 1;
+ map[m++] = vp->vp.psiz;
+ }
+
+ if (nv50->rast->pipe.clamp_vertex_color)
+ colors |= NV50_3D_SEMANTIC_COLOR_CLMP_EN;
+
+ if (unlikely(vp->so)) {
+ /* Slot i in STRMOUT_MAP specifies the offset where slot i in RESULT_MAP
+ * gets written.
+ *
+ * TODO:
+ * Inverting vp->so->map (output -> offset) would probably speed this up.
+ */
+ memset(so_map, 0, sizeof(so_map));
+ for (i = 0; i < vp->so->map_size; ++i) {
+ if (vp->so->map[i] == 0xff)
+ continue;
+ /* Find the result-map slot carrying this output, or append it. */
+ for (c = 0; c < m; ++c)
+ if (map[c] == vp->so->map[i] && !so_map[c])
+ break;
+ if (c == m) {
+ c = m;
+ map[m++] = vp->so->map[i];
+ }
+ so_map[c] = 0x80 | i;
+ }
+ /* Pad to a full 32-bit word. */
+ for (c = m; c & 3; ++c)
+ so_map[c] = 0;
+ }
+
+ n = (m + 3) / 4;
+ assert(m <= 64);
+
+ /* When a GP is bound it owns the result map; otherwise the VP does,
+ * together with the built-in-attribute enables and SEMANTIC_PRIM_ID. */
+ if (unlikely(nv50->gmtyprog)) {
+ BEGIN_NV04(push, NV50_3D(GP_RESULT_MAP_SIZE), 1);
+ PUSH_DATA (push, m);
+ BEGIN_NV04(push, NV50_3D(GP_RESULT_MAP(0)), n);
+ PUSH_DATAp(push, map, n);
+ } else {
+ BEGIN_NV04(push, NV50_3D(VP_GP_BUILTIN_ATTR_EN), 1);
+ PUSH_DATA (push, vp->vp.attrs[2]);
+
+ BEGIN_NV04(push, NV50_3D(SEMANTIC_PRIM_ID), 1);
+ PUSH_DATA (push, primid);
+
+ BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP_SIZE), 1);
+ PUSH_DATA (push, m);
+ BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n);
+ PUSH_DATAp(push, map, n);
+ }
+
+ BEGIN_NV04(push, NV50_3D(SEMANTIC_COLOR), 4);
+ PUSH_DATA (push, colors);
+ PUSH_DATA (push, (vp->vp.clpd_nr << 8) | 4);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, psiz);
+
+ BEGIN_NV04(push, NV50_3D(FP_INTERPOLANT_CTRL), 1);
+ PUSH_DATA (push, interp);
+
+ /* Cache the emitted words for the cheap-update paths above and in
+ * nv50_validate_derived_rs. */
+ nv50->state.interpolant_ctrl = interp;
+
+ nv50->state.semantic_color = colors;
+ nv50->state.semantic_psize = psiz;
+
+ BEGIN_NV04(push, NV50_3D(NOPERSPECTIVE_BITMAP(0)), 4);
+ PUSH_DATAp(push, lin, 4);
+
+ BEGIN_NV04(push, NV50_3D(GP_ENABLE), 1);
+ PUSH_DATA (push, nv50->gmtyprog ? 1 : 0);
+
+ if (vp->so) {
+ BEGIN_NV04(push, NV50_3D(STRMOUT_MAP(0)), n);
+ PUSH_DATAp(push, so_map, n);
+ }
+}
+
+/* Build the VP output -> GP input map starting at position 'm'.  For each
+ * GP input component: the matching VP output register when the VP writes
+ * it, otherwise the 0x40 placeholder (0x41 for .w, i.e. constant 1.0).
+ * Returns the updated map position. */
+static int
+nv50_vp_gp_mapping(uint8_t *map, int m,
+ struct nv50_program *vp, struct nv50_program *gp)
+{
+ int i, j, c;
+
+ for (i = 0; i < gp->in_nr; ++i) {
+ uint8_t oid = 0, mv = 0, mg = gp->in[i].mask;
+
+ /* Find the VP output with the same semantic name and index. */
+ for (j = 0; j < vp->out_nr; ++j) {
+ if (vp->out[j].sn == gp->in[i].sn &&
+ vp->out[j].si == gp->in[i].si) {
+ mv = vp->out[j].mask;
+ oid = vp->out[j].hw;
+ break;
+ }
+ }
+
+ for (c = 0; c < 4; ++c, mv >>= 1, mg >>= 1) {
+ if (mg & mv & 1)
+ map[m++] = oid;
+ else
+ if (mg & 1)
+ map[m++] = (c == 3) ? 0x41 : 0x40;
+ oid += mv & 1; /* ids advance over VP-written components only */
+ }
+ }
+ return m;
+}
+
+/* Upload the VP result map feeding the geometry shader (no-op when no
+ * GP is bound; the FP linkage path handles that configuration). */
+void
+nv50_gp_linkage_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_program *vp = nv50->vertprog;
+ struct nv50_program *gp = nv50->gmtyprog;
+ int m = 0;
+ int n;
+ uint8_t map[64];
+
+ if (!gp)
+ return;
+ memset(map, 0, sizeof(map));
+
+ m = nv50_vp_gp_mapping(map, m, vp, gp);
+
+ n = (m + 3) / 4; /* number of 32-bit words holding m byte entries */
+
+ BEGIN_NV04(push, NV50_3D(VP_GP_BUILTIN_ATTR_EN), 1);
+ PUSH_DATA (push, vp->vp.attrs[2] | gp->vp.attrs[2]);
+
+ BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP_SIZE), 1);
+ PUSH_DATA (push, m);
+ BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n);
+ PUSH_DATAp(push, map, n);
+}
+
+/* (Re)configure transform feedback: disable streamout, then either leave
+ * it off (no SO state or no targets) or bind each target buffer and
+ * re-enable.  NVA0+ supports per-buffer byte limits and resumable
+ * offsets; older chips instead get a global primitive limit computed
+ * from the smallest target buffer. */
+void
+nv50_stream_output_validate(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_stream_output_state *so;
+ uint32_t ctrl;
+ unsigned i;
+ unsigned prims = ~0;
+
+ so = nv50->gmtyprog ? nv50->gmtyprog->so : nv50->vertprog->so;
+
+ BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ if (!so || !nv50->num_so_targets) {
+ if (nv50->screen->base.class_3d < NVA0_3D_CLASS) {
+ BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+ PUSH_DATA (push, 0);
+ }
+ BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+ PUSH_DATA (push, 1);
+ return;
+ }
+
+ /* previous TFB needs to complete */
+ if (nv50->screen->base.class_3d < NVA0_3D_CLASS) {
+ BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ ctrl = so->ctrl;
+ if (nv50->screen->base.class_3d >= NVA0_3D_CLASS)
+ ctrl |= NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET;
+
+ BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
+ PUSH_DATA (push, ctrl);
+
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
+
+ for (i = 0; i < nv50->num_so_targets; ++i) {
+ struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
+ struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
+
+ /* NVA0+ takes a 4th word (buffer size); older chips take 3. */
+ const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3;
+
+ /* Resuming into a dirty target: wait for its offset query. */
+ if (n == 4 && !targ->clean)
+ nv84_query_fifo_wait(push, targ->pq);
+ BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
+ PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
+ PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
+ PUSH_DATA (push, so->num_attribs[i]);
+ if (n == 4) {
+ PUSH_DATA(push, targ->pipe.buffer_size);
+
+ /* Start offset: 0 for a clean buffer, else the saved offset
+ * fetched from the target's query object. */
+ BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
+ if (!targ->clean) {
+ assert(targ->pq);
+ nv50_query_pushbuf_submit(push, targ->pq, 0x4);
+ } else {
+ PUSH_DATA(push, 0);
+ targ->clean = FALSE;
+ }
+ } else {
+ /* Pre-NVA0: clamp by the most restrictive buffer capacity. */
+ const unsigned limit = targ->pipe.buffer_size /
+ (so->stride[i] * nv50->state.prim_size);
+ prims = MIN2(prims, limit);
+ }
+ BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+ }
+ if (prims != ~0) {
+ BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+ PUSH_DATA (push, prims);
+ }
+ BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+ PUSH_DATA (push, 1);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
new file mode 100644
index 00000000000..7dceb51c19e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -0,0 +1,1110 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_format_srgb.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv50/nv50_stateobj.h"
+#include "nv50/nv50_context.h"
+
+#include "nv50/nv50_3d.xml.h"
+#include "nv50/nv50_texture.xml.h"
+
+#include "nouveau_gldefs.h"
+
+/* Caveats:
+ * ! pipe_sampler_state.normalized_coords is ignored - rectangle textures will
+ * use non-normalized coordinates, everything else won't
+ * (The relevant bit is in the TIC entry and not the TSC entry.)
+ *
+ * ! pipe_sampler_state.seamless_cube_map is ignored - seamless filtering is
+ * always activated on NVA0 +
+ * (Give me the global bit, otherwise it's not worth the CPU work.)
+ *
+ * ! pipe_sampler_state.border_color is not swizzled according to the texture
+ * swizzle in pipe_sampler_view
+ * (This will be ugly with indirect independent texture/sampler access,
+ * we'd have to emulate the logic in the shader. GL doesn't have that,
+ * D3D doesn't have swizzle, if we knew what we were implementing we'd be
+ * good.)
+ *
+ * ! pipe_rasterizer_state.line_last_pixel is ignored - it is never drawn
+ *
+ * ! pipe_rasterizer_state.flatshade_first also applies to QUADS
+ * (There's a GL query for that, forcing an exception is just ridiculous.)
+ *
+ * ! pipe_rasterizer_state.half_pixel_center is ignored - pixel centers
+ * are always at half integer coordinates and the top-left rule applies
+ * (There does not seem to be a hardware switch for this.)
+ *
+ * ! pipe_rasterizer_state.sprite_coord_enable is masked with 0xff on NVC0
+ * (The hardware only has 8 slots meant for TexCoord and we have to assign
+ * in advance to maintain elegant separate shader objects.)
+ */
+
+/* Convert a gallium PIPE_MASK_* colour write mask into the packed
+ * per-component layout of the NV50 COLOR_MASK method: one enable bit
+ * per nibble (R = bit 0, G = bit 4, B = bit 8, A = bit 12).
+ */
+static INLINE uint32_t
+nv50_colormask(unsigned mask)
+{
+   uint32_t ret = 0;
+
+   if (mask & PIPE_MASK_R)
+      ret |= 0x0001;
+   if (mask & PIPE_MASK_G)
+      ret |= 0x0010;
+   if (mask & PIPE_MASK_B)
+      ret |= 0x0100;
+   if (mask & PIPE_MASK_A)
+      ret |= 0x1000;
+
+   return ret;
+}
+
+#define NV50_BLEND_FACTOR_CASE(a, b) \
+   case PIPE_BLENDFACTOR_##a: return NV50_3D_BLEND_FACTOR_##b
+
+/* Translate a PIPE_BLENDFACTOR_* value into the corresponding
+ * NV50_3D_BLEND_FACTOR_* hardware enum.  Unrecognized factors fall
+ * back to ZERO rather than emitting an undefined value.
+ */
+static INLINE uint32_t
+nv50_blend_fac(unsigned factor)
+{
+   switch (factor) {
+   NV50_BLEND_FACTOR_CASE(ONE, ONE);
+   NV50_BLEND_FACTOR_CASE(SRC_COLOR, SRC_COLOR);
+   NV50_BLEND_FACTOR_CASE(SRC_ALPHA, SRC_ALPHA);
+   NV50_BLEND_FACTOR_CASE(DST_ALPHA, DST_ALPHA);
+   NV50_BLEND_FACTOR_CASE(DST_COLOR, DST_COLOR);
+   NV50_BLEND_FACTOR_CASE(SRC_ALPHA_SATURATE, SRC_ALPHA_SATURATE);
+   NV50_BLEND_FACTOR_CASE(CONST_COLOR, CONSTANT_COLOR);
+   NV50_BLEND_FACTOR_CASE(CONST_ALPHA, CONSTANT_ALPHA);
+   NV50_BLEND_FACTOR_CASE(SRC1_COLOR, SRC1_COLOR);
+   NV50_BLEND_FACTOR_CASE(SRC1_ALPHA, SRC1_ALPHA);
+   NV50_BLEND_FACTOR_CASE(ZERO, ZERO);
+   NV50_BLEND_FACTOR_CASE(INV_SRC_COLOR, ONE_MINUS_SRC_COLOR);
+   NV50_BLEND_FACTOR_CASE(INV_SRC_ALPHA, ONE_MINUS_SRC_ALPHA);
+   NV50_BLEND_FACTOR_CASE(INV_DST_ALPHA, ONE_MINUS_DST_ALPHA);
+   NV50_BLEND_FACTOR_CASE(INV_DST_COLOR, ONE_MINUS_DST_COLOR);
+   NV50_BLEND_FACTOR_CASE(INV_CONST_COLOR, ONE_MINUS_CONSTANT_COLOR);
+   NV50_BLEND_FACTOR_CASE(INV_CONST_ALPHA, ONE_MINUS_CONSTANT_ALPHA);
+   NV50_BLEND_FACTOR_CASE(INV_SRC1_COLOR, ONE_MINUS_SRC1_COLOR);
+   NV50_BLEND_FACTOR_CASE(INV_SRC1_ALPHA, ONE_MINUS_SRC1_ALPHA);
+   default:
+      return NV50_3D_BLEND_FACTOR_ZERO;
+   }
+}
+
+/* Create an immutable blend CSO: pre-encode the pushbuffer commands for
+ * this pipe_blend_state into so->state via the SB_* macros, so binding
+ * is a cheap copy at validate time.
+ */
+static void *
+nv50_blend_state_create(struct pipe_context *pipe,
+                        const struct pipe_blend_state *cso)
+{
+   struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
+   int i;
+   /* the "common" (RT-independent) blend func is needed whenever any
+    * enabled RT cannot use per-RT funcs (pre-NVA3, or RT 0 only)
+    */
+   boolean emit_common_func = cso->rt[0].blend_enable;
+   uint32_t ms;
+
+   /* per-RT independent blending only exists on NVA3+ */
+   if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
+      SB_BEGIN_3D(so, BLEND_INDEPENDENT, 1);
+      SB_DATA (so, cso->independent_blend_enable);
+   }
+
+   so->pipe = *cso;
+
+   SB_BEGIN_3D(so, COLOR_MASK_COMMON, 1);
+   SB_DATA (so, !cso->independent_blend_enable);
+
+   SB_BEGIN_3D(so, BLEND_ENABLE_COMMON, 1);
+   SB_DATA (so, !cso->independent_blend_enable);
+
+   if (cso->independent_blend_enable) {
+      SB_BEGIN_3D(so, BLEND_ENABLE(0), 8);
+      for (i = 0; i < 8; ++i) {
+         SB_DATA(so, cso->rt[i].blend_enable);
+         if (cso->rt[i].blend_enable)
+            emit_common_func = TRUE;
+      }
+
+      if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
+         /* NVA3+ has real per-RT equations/factors, so the common
+          * func registers don't need to be programmed at all
+          */
+         emit_common_func = FALSE;
+
+         for (i = 0; i < 8; ++i) {
+            if (!cso->rt[i].blend_enable)
+               continue;
+            SB_BEGIN_3D_(so, NVA3_3D_IBLEND_EQUATION_RGB(i), 6);
+            SB_DATA (so, nvgl_blend_eqn(cso->rt[i].rgb_func));
+            SB_DATA (so, nv50_blend_fac(cso->rt[i].rgb_src_factor));
+            SB_DATA (so, nv50_blend_fac(cso->rt[i].rgb_dst_factor));
+            SB_DATA (so, nvgl_blend_eqn(cso->rt[i].alpha_func));
+            SB_DATA (so, nv50_blend_fac(cso->rt[i].alpha_src_factor));
+            SB_DATA (so, nv50_blend_fac(cso->rt[i].alpha_dst_factor));
+         }
+      }
+   } else {
+      SB_BEGIN_3D(so, BLEND_ENABLE(0), 1);
+      SB_DATA (so, cso->rt[0].blend_enable);
+   }
+
+   if (emit_common_func) {
+      SB_BEGIN_3D(so, BLEND_EQUATION_RGB, 5);
+      SB_DATA (so, nvgl_blend_eqn(cso->rt[0].rgb_func));
+      SB_DATA (so, nv50_blend_fac(cso->rt[0].rgb_src_factor));
+      SB_DATA (so, nv50_blend_fac(cso->rt[0].rgb_dst_factor));
+      SB_DATA (so, nvgl_blend_eqn(cso->rt[0].alpha_func));
+      SB_DATA (so, nv50_blend_fac(cso->rt[0].alpha_src_factor));
+      /* FUNC_DST_ALPHA is not contiguous with the 5 methods above */
+      SB_BEGIN_3D(so, BLEND_FUNC_DST_ALPHA, 1);
+      SB_DATA (so, nv50_blend_fac(cso->rt[0].alpha_dst_factor));
+   }
+
+   if (cso->logicop_enable) {
+      SB_BEGIN_3D(so, LOGIC_OP_ENABLE, 2);
+      SB_DATA (so, 1);
+      SB_DATA (so, nvgl_logicop_func(cso->logicop_func));
+   } else {
+      SB_BEGIN_3D(so, LOGIC_OP_ENABLE, 1);
+      SB_DATA (so, 0);
+   }
+
+   if (cso->independent_blend_enable) {
+      SB_BEGIN_3D(so, COLOR_MASK(0), 8);
+      for (i = 0; i < 8; ++i)
+         SB_DATA(so, nv50_colormask(cso->rt[i].colormask));
+   } else {
+      SB_BEGIN_3D(so, COLOR_MASK(0), 1);
+      SB_DATA (so, nv50_colormask(cso->rt[0].colormask));
+   }
+
+   ms = 0;
+   if (cso->alpha_to_coverage)
+      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+   if (cso->alpha_to_one)
+      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+
+   SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
+   SB_DATA (so, ms);
+
+   /* NOTE(review): CALLOC_STRUCT result is used without a NULL check;
+    * presumably allocation failure is considered fatal here -- confirm.
+    */
+   assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+   return so;
+}
+
+/* Bind a blend CSO; the pre-encoded commands are emitted at validate time. */
+static void
+nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->blend = hwcso;
+   nv50->dirty |= NV50_NEW_BLEND;
+}
+
+/* Destroy a blend CSO (plain allocation, nothing else to release). */
+static void
+nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* NOTE: ignoring line_last_pixel, using FALSE (set on screen init) */
+/* Create an immutable rasterizer CSO: pre-encode all rasterizer methods
+ * into so->state so binding only marks NV50_NEW_RASTERIZER dirty.
+ */
+static void *
+nv50_rasterizer_state_create(struct pipe_context *pipe,
+                             const struct pipe_rasterizer_state *cso)
+{
+   struct nv50_rasterizer_stateobj *so;
+   uint32_t reg;
+
+   so = CALLOC_STRUCT(nv50_rasterizer_stateobj);
+   if (!so)
+      return NULL;
+   so->pipe = *cso;
+
+   /* when scissors implement window clipping, the scissor enable is
+    * managed by the context instead of by this state object
+    */
+#ifndef NV50_SCISSORS_CLIPPING
+   SB_BEGIN_3D(so, SCISSOR_ENABLE(0), 1);
+   SB_DATA (so, cso->scissor);
+#endif
+
+   SB_BEGIN_3D(so, SHADE_MODEL, 1);
+   SB_DATA (so, cso->flatshade ? NV50_3D_SHADE_MODEL_FLAT :
+                                 NV50_3D_SHADE_MODEL_SMOOTH);
+   SB_BEGIN_3D(so, PROVOKING_VERTEX_LAST, 1);
+   SB_DATA (so, !cso->flatshade_first);
+   SB_BEGIN_3D(so, VERTEX_TWO_SIDE_ENABLE, 1);
+   SB_DATA (so, cso->light_twoside);
+
+   /* one clamp-enable bit per RT (0x1 replicated over 8 nibbles) */
+   SB_BEGIN_3D(so, FRAG_COLOR_CLAMP_EN, 1);
+   SB_DATA (so, cso->clamp_fragment_color ? 0x11111111 : 0x00000000);
+
+   SB_BEGIN_3D(so, MULTISAMPLE_ENABLE, 1);
+   SB_DATA (so, cso->multisample);
+
+   SB_BEGIN_3D(so, LINE_WIDTH, 1);
+   SB_DATA (so, fui(cso->line_width));
+   SB_BEGIN_3D(so, LINE_SMOOTH_ENABLE, 1);
+   SB_DATA (so, cso->line_smooth);
+
+   SB_BEGIN_3D(so, LINE_STIPPLE_ENABLE, 1);
+   if (cso->line_stipple_enable) {
+      SB_DATA (so, 1);
+      SB_BEGIN_3D(so, LINE_STIPPLE, 1);
+      SB_DATA (so, (cso->line_stipple_pattern << 8) |
+               cso->line_stipple_factor);
+   } else {
+      SB_DATA (so, 0);
+   }
+
+   /* with per-vertex point size the POINT_SIZE register is irrelevant */
+   if (!cso->point_size_per_vertex) {
+      SB_BEGIN_3D(so, POINT_SIZE, 1);
+      SB_DATA (so, fui(cso->point_size));
+   }
+   SB_BEGIN_3D(so, POINT_SPRITE_ENABLE, 1);
+   SB_DATA (so, cso->point_quad_rasterization);
+   SB_BEGIN_3D(so, POINT_SMOOTH_ENABLE, 1);
+   SB_DATA (so, cso->point_smooth);
+
+   SB_BEGIN_3D(so, POLYGON_MODE_FRONT, 3);
+   SB_DATA (so, nvgl_polygon_mode(cso->fill_front));
+   SB_DATA (so, nvgl_polygon_mode(cso->fill_back));
+   SB_DATA (so, cso->poly_smooth);
+
+   SB_BEGIN_3D(so, CULL_FACE_ENABLE, 3);
+   SB_DATA (so, cso->cull_face != PIPE_FACE_NONE);
+   SB_DATA (so, cso->front_ccw ? NV50_3D_FRONT_FACE_CCW :
+                                 NV50_3D_FRONT_FACE_CW);
+   switch (cso->cull_face) {
+   case PIPE_FACE_FRONT_AND_BACK:
+      SB_DATA(so, NV50_3D_CULL_FACE_FRONT_AND_BACK);
+      break;
+   case PIPE_FACE_FRONT:
+      SB_DATA(so, NV50_3D_CULL_FACE_FRONT);
+      break;
+   case PIPE_FACE_BACK:
+   default:
+      SB_DATA(so, NV50_3D_CULL_FACE_BACK);
+      break;
+   }
+
+   SB_BEGIN_3D(so, POLYGON_STIPPLE_ENABLE, 1);
+   SB_DATA (so, cso->poly_stipple_enable);
+   SB_BEGIN_3D(so, POLYGON_OFFSET_POINT_ENABLE, 3);
+   SB_DATA (so, cso->offset_point);
+   SB_DATA (so, cso->offset_line);
+   SB_DATA (so, cso->offset_tri);
+
+   if (cso->offset_point || cso->offset_line || cso->offset_tri) {
+      SB_BEGIN_3D(so, POLYGON_OFFSET_FACTOR, 1);
+      SB_DATA (so, fui(cso->offset_scale));
+      /* NOTE(review): units are doubled here -- presumably to convert
+       * gallium units into the hw's resolvable-unit scale; confirm.
+       */
+      SB_BEGIN_3D(so, POLYGON_OFFSET_UNITS, 1);
+      SB_DATA (so, fui(cso->offset_units * 2.0f));
+      SB_BEGIN_3D(so, POLYGON_OFFSET_CLAMP, 1);
+      SB_DATA (so, fui(cso->offset_clamp));
+   }
+
+   /* depth clamping (near+far) replaces depth clipping when disabled */
+   if (cso->depth_clip) {
+      reg = 0;
+   } else {
+      reg =
+         NV50_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_NEAR |
+         NV50_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_FAR |
+         NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK1;
+   }
+#ifndef NV50_SCISSORS_CLIPPING
+   reg |=
+      NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK7 |
+      NV50_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK1;
+#endif
+   SB_BEGIN_3D(so, VIEW_VOLUME_CLIP_CTRL, 1);
+   SB_DATA (so, reg);
+
+   assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+   return (void *)so;
+}
+
+/* Bind a rasterizer CSO; commands are emitted at validate time. */
+static void
+nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->rast = hwcso;
+   nv50->dirty |= NV50_NEW_RASTERIZER;
+}
+
+/* Destroy a rasterizer CSO (plain allocation). */
+static void
+nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* Create an immutable depth/stencil/alpha CSO: pre-encode the methods
+ * into so->state.  Enable methods are only followed by their parameter
+ * methods when the corresponding test is actually enabled.
+ */
+static void *
+nv50_zsa_state_create(struct pipe_context *pipe,
+                      const struct pipe_depth_stencil_alpha_state *cso)
+{
+   /* NOTE(review): CALLOC_STRUCT result is dereferenced without a NULL
+    * check, unlike the rasterizer variant -- confirm this is intended.
+    */
+   struct nv50_zsa_stateobj *so = CALLOC_STRUCT(nv50_zsa_stateobj);
+
+   so->pipe = *cso;
+
+   SB_BEGIN_3D(so, DEPTH_WRITE_ENABLE, 1);
+   SB_DATA (so, cso->depth.writemask);
+   SB_BEGIN_3D(so, DEPTH_TEST_ENABLE, 1);
+   if (cso->depth.enabled) {
+      SB_DATA (so, 1);
+      SB_BEGIN_3D(so, DEPTH_TEST_FUNC, 1);
+      SB_DATA (so, nvgl_comparison_op(cso->depth.func));
+   } else {
+      SB_DATA (so, 0);
+   }
+
+   /* stencil[0] = front face state */
+   if (cso->stencil[0].enabled) {
+      SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
+      SB_DATA (so, 1);
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+      SB_DATA (so, nvgl_comparison_op(cso->stencil[0].func));
+      SB_BEGIN_3D(so, STENCIL_FRONT_MASK, 2);
+      SB_DATA (so, cso->stencil[0].writemask);
+      SB_DATA (so, cso->stencil[0].valuemask);
+   } else {
+      SB_BEGIN_3D(so, STENCIL_ENABLE, 1);
+      SB_DATA (so, 0);
+   }
+
+   /* stencil[1] = back face state; only valid if front is enabled */
+   if (cso->stencil[1].enabled) {
+      assert(cso->stencil[0].enabled);
+      SB_BEGIN_3D(so, STENCIL_TWO_SIDE_ENABLE, 5);
+      SB_DATA (so, 1);
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+      SB_DATA (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+      SB_DATA (so, nvgl_comparison_op(cso->stencil[1].func));
+      SB_BEGIN_3D(so, STENCIL_BACK_MASK, 2);
+      SB_DATA (so, cso->stencil[1].writemask);
+      SB_DATA (so, cso->stencil[1].valuemask);
+   } else {
+      SB_BEGIN_3D(so, STENCIL_TWO_SIDE_ENABLE, 1);
+      SB_DATA (so, 0);
+   }
+
+   SB_BEGIN_3D(so, ALPHA_TEST_ENABLE, 1);
+   if (cso->alpha.enabled) {
+      SB_DATA (so, 1);
+      SB_BEGIN_3D(so, ALPHA_TEST_REF, 2);
+      SB_DATA (so, fui(cso->alpha.ref_value));
+      SB_DATA (so, nvgl_comparison_op(cso->alpha.func));
+   } else {
+      SB_DATA (so, 0);
+   }
+
+   assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+   return (void *)so;
+}
+
+/* Bind a depth/stencil/alpha CSO; emitted at validate time. */
+static void
+nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->zsa = hwcso;
+   nv50->dirty |= NV50_NEW_ZSA;
+}
+
+/* Destroy a depth/stencil/alpha CSO (plain allocation). */
+static void
+nv50_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   FREE(hwcso);
+}
+
+/* ====================== SAMPLERS AND TEXTURES ================================
+ */
+
+#define NV50_TSC_WRAP_CASE(n) \
+   case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
+
+/* Translate a PIPE_TEX_WRAP_* mode to the NV50 TSC wrap field value;
+ * unknown modes log an error and fall back to REPEAT.
+ */
+static INLINE unsigned
+nv50_tsc_wrap_mode(unsigned wrap)
+{
+   switch (wrap) {
+   NV50_TSC_WRAP_CASE(REPEAT);
+   NV50_TSC_WRAP_CASE(MIRROR_REPEAT);
+   NV50_TSC_WRAP_CASE(CLAMP_TO_EDGE);
+   NV50_TSC_WRAP_CASE(CLAMP_TO_BORDER);
+   NV50_TSC_WRAP_CASE(CLAMP);
+   NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_EDGE);
+   NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_BORDER);
+   NV50_TSC_WRAP_CASE(MIRROR_CLAMP);
+   default:
+       NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+       return NV50_TSC_WRAP_REPEAT;
+   }
+}
+
+/* Build the 8-dword TSC (texture sampler control) entry for a sampler
+ * state.  The entry is uploaded to the TSC table later; id == -1 marks
+ * it as not yet assigned a table slot.  Non-static: shared with nvc0.
+ */
+void *
+nv50_sampler_state_create(struct pipe_context *pipe,
+                          const struct pipe_sampler_state *cso)
+{
+   /* NOTE(review): MALLOC_STRUCT result is used without a NULL check --
+    * presumably allocation failure is treated as fatal; confirm.
+    */
+   struct nv50_tsc_entry *so = MALLOC_STRUCT(nv50_tsc_entry);
+   float f[2];
+
+   so->id = -1;
+
+   so->tsc[0] = (0x00026000 |
+                 (nv50_tsc_wrap_mode(cso->wrap_s) << 0) |
+                 (nv50_tsc_wrap_mode(cso->wrap_t) << 3) |
+                 (nv50_tsc_wrap_mode(cso->wrap_r) << 6));
+
+   switch (cso->mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      so->tsc[1] = NV50_TSC_1_MAGF_LINEAR;
+      break;
+   case PIPE_TEX_FILTER_NEAREST:
+   default:
+      so->tsc[1] = NV50_TSC_1_MAGF_NEAREST;
+      break;
+   }
+
+   switch (cso->min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      so->tsc[1] |= NV50_TSC_1_MINF_LINEAR;
+      break;
+   case PIPE_TEX_FILTER_NEAREST:
+   default:
+      so->tsc[1] |= NV50_TSC_1_MINF_NEAREST;
+      break;
+   }
+
+   switch (cso->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      so->tsc[1] |= NV50_TSC_1_MIPF_LINEAR;
+      break;
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      so->tsc[1] |= NV50_TSC_1_MIPF_NEAREST;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+   default:
+      so->tsc[1] |= NV50_TSC_1_MIPF_NONE;
+      break;
+   }
+
+   /* only NVE4+ has TSC bits for seamless cube maps and forcing
+    * non-normalized coordinates (cf. the caveats at the top of the file)
+    */
+   if (nouveau_screen(pipe->screen)->class_3d >= NVE4_3D_CLASS) {
+      if (cso->seamless_cube_map)
+         so->tsc[1] |= NVE4_TSC_1_CUBE_SEAMLESS;
+      if (!cso->normalized_coords)
+         so->tsc[1] |= NVE4_TSC_1_FORCE_NONNORMALIZED_COORDS;
+   }
+
+   /* max anisotropy is encoded in 3 bits at tsc[0] bits 20-22 */
+   if (cso->max_anisotropy >= 16)
+      so->tsc[0] |= (7 << 20);
+   else
+   if (cso->max_anisotropy >= 12)
+      so->tsc[0] |= (6 << 20);
+   else {
+      so->tsc[0] |= (cso->max_anisotropy >> 1) << 20;
+
+      if (cso->max_anisotropy >= 4)
+         so->tsc[1] |= NV50_TSC_1_UNKN_ANISO_35;
+      else
+      if (cso->max_anisotropy >= 2)
+         so->tsc[1] |= NV50_TSC_1_UNKN_ANISO_15;
+   }
+
+   if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+      /* NOTE: must be deactivated for non-shadow textures */
+      so->tsc[0] |= (1 << 9);
+      so->tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7) << 10;
+   }
+
+   /* LOD bias: signed 8.8 fixed point in 13 bits of tsc[1] */
+   f[0] = CLAMP(cso->lod_bias, -16.0f, 15.0f);
+   so->tsc[1] |= ((int)(f[0] * 256.0f) & 0x1fff) << 12;
+
+   /* min/max LOD: unsigned 4.8 fixed point packed into tsc[2] */
+   f[0] = CLAMP(cso->min_lod, 0.0f, 15.0f);
+   f[1] = CLAMP(cso->max_lod, 0.0f, 15.0f);
+   so->tsc[2] =
+      (((int)(f[1] * 256.0f) & 0xfff) << 12) | ((int)(f[0] * 256.0f) & 0xfff);
+
+   /* sRGB-encoded border color copy for sRGB texture formats */
+   so->tsc[2] |=
+      util_format_linear_float_to_srgb_8unorm(cso->border_color.f[0]) << 24;
+   so->tsc[3] =
+      util_format_linear_float_to_srgb_8unorm(cso->border_color.f[1]) << 12;
+   so->tsc[3] |=
+      util_format_linear_float_to_srgb_8unorm(cso->border_color.f[2]) << 20;
+
+   /* linear float border color (see caveat: not swizzled per view) */
+   so->tsc[4] = fui(cso->border_color.f[0]);
+   so->tsc[5] = fui(cso->border_color.f[1]);
+   so->tsc[6] = fui(cso->border_color.f[2]);
+   so->tsc[7] = fui(cso->border_color.f[3]);
+
+   return (void *)so;
+}
+
+/* Destroy a sampler CSO: clear any context slots still pointing at it,
+ * release its TSC table entry, then free the allocation.
+ */
+static void
+nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   unsigned s, i;
+
+   /* 3 shader stages (VP/GP/FP) may still reference this sampler */
+   for (s = 0; s < 3; ++s)
+      for (i = 0; i < nv50_context(pipe)->num_samplers[s]; ++i)
+         if (nv50_context(pipe)->samplers[s][i] == hwcso)
+            nv50_context(pipe)->samplers[s][i] = NULL;
+
+   nv50_screen_tsc_free(nv50_context(pipe)->screen, nv50_tsc_entry(hwcso));
+
+   FREE(hwcso);
+}
+
+/* Bind nr sampler CSOs to shader stage s (0 = VP, 1 = GP, 2 = FP),
+ * unlocking the TSC entries of any samplers being replaced or dropped.
+ */
+static INLINE void
+nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
+                               unsigned nr, void **hwcso)
+{
+   unsigned i;
+
+   for (i = 0; i < nr; ++i) {
+      struct nv50_tsc_entry *old = nv50->samplers[s][i];
+
+      nv50->samplers[s][i] = nv50_tsc_entry(hwcso[i]);
+      if (old)
+         nv50_screen_tsc_unlock(nv50->screen, old);
+   }
+   /* unlock samplers beyond the new count */
+   for (; i < nv50->num_samplers[s]; ++i)
+      if (nv50->samplers[s][i])
+         nv50_screen_tsc_unlock(nv50->screen, nv50->samplers[s][i]);
+
+   nv50->num_samplers[s] = nr;
+
+   nv50->dirty |= NV50_NEW_SAMPLERS;
+}
+
+/* pipe_context hook: bind vertex-stage samplers (stage index 0). */
+static void
+nv50_vp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+   nv50_stage_sampler_states_bind(nv50_context(pipe), 0, nr, s);
+}
+
+/* pipe_context hook: bind fragment-stage samplers (stage index 2). */
+static void
+nv50_fp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+   nv50_stage_sampler_states_bind(nv50_context(pipe), 2, nr, s);
+}
+
+/* pipe_context hook: bind geometry-stage samplers (stage index 1). */
+static void
+nv50_gp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+   nv50_stage_sampler_states_bind(nv50_context(pipe), 1, nr, s);
+}
+
+/* NOTE: only called when not referenced anywhere, won't be bound */
+/* Destroy a sampler view: drop the texture reference, release the TIC
+ * table entry, free the view.
+ */
+static void
+nv50_sampler_view_destroy(struct pipe_context *pipe,
+                          struct pipe_sampler_view *view)
+{
+   pipe_resource_reference(&view->texture, NULL);
+
+   nv50_screen_tic_free(nv50_context(pipe)->screen, nv50_tic_entry(view));
+
+   FREE(nv50_tic_entry(view));
+}
+
+/* Set nr sampler views on shader stage s, unlocking the TIC entries of
+ * replaced views and unreferencing views beyond the new count.
+ */
+static INLINE void
+nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
+                             unsigned nr,
+                             struct pipe_sampler_view **views)
+{
+   unsigned i;
+
+   for (i = 0; i < nr; ++i) {
+      struct nv50_tic_entry *old = nv50_tic_entry(nv50->textures[s][i]);
+      if (old)
+         nv50_screen_tic_unlock(nv50->screen, old);
+
+      pipe_sampler_view_reference(&nv50->textures[s][i], views[i]);
+   }
+
+   /* release views past the new count */
+   for (i = nr; i < nv50->num_textures[s]; ++i) {
+      struct nv50_tic_entry *old = nv50_tic_entry(nv50->textures[s][i]);
+      if (!old)
+         continue;
+      nv50_screen_tic_unlock(nv50->screen, old);
+
+      pipe_sampler_view_reference(&nv50->textures[s][i], NULL);
+   }
+
+   nv50->num_textures[s] = nr;
+
+   /* texture buffers need to be revalidated into the bufctx */
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+
+   nv50->dirty |= NV50_NEW_TEXTURES;
+}
+
+/* pipe_context hook: set vertex-stage sampler views (stage index 0). */
+static void
+nv50_vp_set_sampler_views(struct pipe_context *pipe,
+                          unsigned nr,
+                          struct pipe_sampler_view **views)
+{
+   nv50_stage_set_sampler_views(nv50_context(pipe), 0, nr, views);
+}
+
+/* pipe_context hook: set fragment-stage sampler views (stage index 2). */
+static void
+nv50_fp_set_sampler_views(struct pipe_context *pipe,
+                          unsigned nr,
+                          struct pipe_sampler_view **views)
+{
+   nv50_stage_set_sampler_views(nv50_context(pipe), 2, nr, views);
+}
+
+/* pipe_context hook: set geometry-stage sampler views (stage index 1). */
+static void
+nv50_gp_set_sampler_views(struct pipe_context *pipe,
+                          unsigned nr,
+                          struct pipe_sampler_view **views)
+{
+   nv50_stage_set_sampler_views(nv50_context(pipe), 1, nr, views);
+}
+
+/* ============================= SHADERS =======================================
+ */
+
+/* Common shader CSO constructor: duplicate the TGSI tokens and record
+ * the stage type; actual compilation is deferred until first use.
+ */
+static void *
+nv50_sp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso, unsigned type)
+{
+   struct nv50_program *prog;
+
+   prog = CALLOC_STRUCT(nv50_program);
+   if (!prog)
+      return NULL;
+
+   prog->type = type;
+   prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+   if (cso->stream_output.num_outputs)
+      prog->pipe.stream_output = cso->stream_output;
+
+   return (void *)prog;
+}
+
+/* Common shader CSO destructor: release compiled code/resources, the
+ * duplicated TGSI tokens, and the program struct itself.
+ */
+static void
+nv50_sp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_program *prog = (struct nv50_program *)hwcso;
+
+   nv50_program_destroy(nv50_context(pipe), prog);
+
+   FREE((void *)prog->pipe.tokens);
+   FREE(prog);
+}
+
+/* pipe_context hook: create a vertex shader CSO. */
+static void *
+nv50_vp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   return nv50_sp_state_create(pipe, cso, PIPE_SHADER_VERTEX);
+}
+
+/* pipe_context hook: bind the vertex program. */
+static void
+nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->vertprog = hwcso;
+   nv50->dirty |= NV50_NEW_VERTPROG;
+}
+
+/* pipe_context hook: create a fragment shader CSO. */
+static void *
+nv50_fp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   return nv50_sp_state_create(pipe, cso, PIPE_SHADER_FRAGMENT);
+}
+
+/* pipe_context hook: bind the fragment program. */
+static void
+nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->fragprog = hwcso;
+   nv50->dirty |= NV50_NEW_FRAGPROG;
+}
+
+/* pipe_context hook: create a geometry shader CSO. */
+static void *
+nv50_gp_state_create(struct pipe_context *pipe,
+                     const struct pipe_shader_state *cso)
+{
+   return nv50_sp_state_create(pipe, cso, PIPE_SHADER_GEOMETRY);
+}
+
+/* pipe_context hook: bind the geometry program. */
+static void
+nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->gmtyprog = hwcso;
+   nv50->dirty |= NV50_NEW_GMTYPROG;
+}
+
+/* Set constant buffer `index` for `shader`.  Supports both user-memory
+ * buffers (cb->user_buffer, uploaded at validate) and pipe_resource
+ * backed buffers.  Compute stage is not handled here.
+ */
+static void
+nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+                         struct pipe_constant_buffer *cb)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct pipe_resource *res = cb ? cb->buffer : NULL;
+   const unsigned s = nv50_context_shader_stage(shader);
+   const unsigned i = index;
+
+   if (shader == PIPE_SHADER_COMPUTE)
+      return;
+
+   /* user buffers hold a borrowed pointer, not a resource reference;
+    * clear u.buf so the reference update below starts from NULL
+    */
+   if (nv50->constbuf[s][i].user)
+      nv50->constbuf[s][i].u.buf = NULL;
+   else
+   if (nv50->constbuf[s][i].u.buf)
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+
+   pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
+
+   nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+   if (nv50->constbuf[s][i].user) {
+      nv50->constbuf[s][i].u.data = cb->user_buffer;
+      nv50->constbuf[s][i].size = cb->buffer_size;
+      nv50->constbuf_valid[s] |= 1 << i;
+   } else
+   if (res) {
+      nv50->constbuf[s][i].offset = cb->buffer_offset;
+      /* size is aligned up to the hw's 256 byte granularity */
+      nv50->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+      nv50->constbuf_valid[s] |= 1 << i;
+   } else {
+      nv50->constbuf_valid[s] &= ~(1 << i);
+   }
+   nv50->constbuf_dirty[s] |= 1 << i;
+
+   nv50->dirty |= NV50_NEW_CONSTBUF;
+}
+
+/* =============================================================================
+ */
+
+/* Record the blend constant colour; emitted at validate time. */
+static void
+nv50_set_blend_color(struct pipe_context *pipe,
+                     const struct pipe_blend_color *bcol)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->blend_colour = *bcol;
+   nv50->dirty |= NV50_NEW_BLEND_COLOUR;
+}
+
+/* Record front/back stencil reference values; emitted at validate time. */
+static void
+nv50_set_stencil_ref(struct pipe_context *pipe,
+                     const struct pipe_stencil_ref *sr)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->stencil_ref = *sr;
+   nv50->dirty |= NV50_NEW_STENCIL_REF;
+}
+
+/* Record the user clip plane equations; emitted at validate time. */
+static void
+nv50_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));
+
+   nv50->dirty |= NV50_NEW_CLIP;
+}
+
+/* Record the multisample sample mask; emitted at validate time. */
+static void
+nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->sample_mask = sample_mask;
+   nv50->dirty |= NV50_NEW_SAMPLE_MASK;
+}
+
+
+/* Update the framebuffer state: re-reference colour and depth/stencil
+ * surfaces and drop references to surfaces beyond the new cbuf count.
+ * Actual RT setup happens in nv50_validate_fb.
+ */
+static void
+nv50_set_framebuffer_state(struct pipe_context *pipe,
+                           const struct pipe_framebuffer_state *fb)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+
+   /* old FB buffers must be re-added to the bufctx on validation */
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+
+   for (i = 0; i < fb->nr_cbufs; ++i)
+      pipe_surface_reference(&nv50->framebuffer.cbufs[i], fb->cbufs[i]);
+   for (; i < nv50->framebuffer.nr_cbufs; ++i)
+      pipe_surface_reference(&nv50->framebuffer.cbufs[i], NULL);
+
+   nv50->framebuffer.nr_cbufs = fb->nr_cbufs;
+
+   nv50->framebuffer.width = fb->width;
+   nv50->framebuffer.height = fb->height;
+
+   pipe_surface_reference(&nv50->framebuffer.zsbuf, fb->zsbuf);
+
+   nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+}
+
+/* Record the 32x32 polygon stipple pattern; emitted at validate time. */
+static void
+nv50_set_polygon_stipple(struct pipe_context *pipe,
+                         const struct pipe_poly_stipple *stipple)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->stipple = *stipple;
+   nv50->dirty |= NV50_NEW_STIPPLE;
+}
+
+/* Record the scissor rectangle.  Only a single scissor is supported:
+ * start_slot and num_scissors are ignored.
+ */
+static void
+nv50_set_scissor_states(struct pipe_context *pipe,
+                        unsigned start_slot,
+                        unsigned num_scissors,
+                        const struct pipe_scissor_state *scissor)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->scissor = *scissor;
+   nv50->dirty |= NV50_NEW_SCISSOR;
+}
+
+/* Record the viewport transform.  Only a single viewport is supported:
+ * start_slot and num_viewports are ignored.
+ */
+static void
+nv50_set_viewport_states(struct pipe_context *pipe,
+                         unsigned start_slot,
+                         unsigned num_viewports,
+                         const struct pipe_viewport_state *vpt)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->viewport = *vpt;
+   nv50->dirty |= NV50_NEW_VIEWPORT;
+}
+
+/* Set vertex buffers [start_slot, start_slot+count) and maintain two
+ * per-slot bitmasks: vbo_user (buffer lives in user memory) and
+ * vbo_constant (user buffer with stride 0, i.e. a constant attribute).
+ */
+static void
+nv50_set_vertex_buffers(struct pipe_context *pipe,
+                        unsigned start_slot, unsigned count,
+                        const struct pipe_vertex_buffer *vb)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+
+   util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
+                                 start_slot, count);
+
+   /* NULL vb unbinds the whole range: just clear the mask bits */
+   if (!vb) {
+      nv50->vbo_user &= ~(((1ull << count) - 1) << start_slot);
+      nv50->vbo_constant &= ~(((1ull << count) - 1) << start_slot);
+      return;
+   }
+
+   for (i = 0; i < count; ++i) {
+      unsigned dst_index = start_slot + i;
+
+      if (!vb[i].buffer && vb[i].user_buffer) {
+         nv50->vbo_user |= 1 << dst_index;
+         if (!vb[i].stride)
+            nv50->vbo_constant |= 1 << dst_index;
+         else
+            nv50->vbo_constant &= ~(1 << dst_index);
+      } else {
+         nv50->vbo_user &= ~(1 << dst_index);
+         nv50->vbo_constant &= ~(1 << dst_index);
+      }
+   }
+
+   /* vertex buffers must be re-added to the bufctx on validation */
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+
+   nv50->dirty |= NV50_NEW_ARRAYS;
+}
+
+/* Set the index buffer: either a pipe_resource (referenced and added to
+ * the bufctx for reading) or a user-memory pointer.
+ */
+static void
+nv50_set_index_buffer(struct pipe_context *pipe,
+                      const struct pipe_index_buffer *ib)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   if (nv50->idxbuf.buffer)
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
+
+   if (ib) {
+      pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
+      nv50->idxbuf.index_size = ib->index_size;
+      if (ib->buffer) {
+         nv50->idxbuf.offset = ib->offset;
+         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD);
+      } else {
+         nv50->idxbuf.user_buffer = ib->user_buffer;
+      }
+   } else {
+      pipe_resource_reference(&nv50->idxbuf.buffer, NULL);
+   }
+}
+
+/* Bind a vertex-elements CSO; emitted at validate time. */
+static void
+nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->vertex = hwcso;
+   nv50->dirty |= NV50_NEW_VERTEX;
+}
+
+/* Create a stream-output (transform feedback) target.  On NVA0+ a
+ * buffer-offset query object is attached so a paused target's write
+ * position can be saved and resumed; pre-NVA0 has no such query.
+ */
+static struct pipe_stream_output_target *
+nv50_so_target_create(struct pipe_context *pipe,
+                      struct pipe_resource *res,
+                      unsigned offset, unsigned size)
+{
+   struct nv50_so_target *targ = MALLOC_STRUCT(nv50_so_target);
+   if (!targ)
+      return NULL;
+
+   if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) {
+      targ->pq = pipe->create_query(pipe,
+                                    NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET);
+      if (!targ->pq) {
+         FREE(targ);
+         return NULL;
+      }
+   } else {
+      targ->pq = NULL;
+   }
+   /* clean = no data written yet, so no saved offset needs restoring */
+   targ->clean = TRUE;
+
+   targ->pipe.buffer_size = size;
+   targ->pipe.buffer_offset = offset;
+   targ->pipe.context = pipe;
+   targ->pipe.buffer = NULL;
+   pipe_resource_reference(&targ->pipe.buffer, res);
+   pipe_reference_init(&targ->pipe.reference, 1);
+
+   return &targ->pipe;
+}
+
+/* Destroy a stream-output target: free its offset query (if any), drop
+ * the buffer reference, free the struct.
+ */
+static void
+nv50_so_target_destroy(struct pipe_context *pipe,
+                       struct pipe_stream_output_target *ptarg)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+   if (targ->pq)
+      pipe->destroy_query(pipe, targ->pq);
+   pipe_resource_reference(&targ->pipe.buffer, NULL);
+   FREE(targ);
+}
+
+/* Bind up to 4 stream-output targets.  On NVA0+ (can_resume) the
+ * current write offset of any target being replaced is saved through
+ * its query so appending can resume later; the graph is serialized only
+ * before the first such save.  append_mask bit i set means target i
+ * should continue at its saved offset rather than restart at 0.
+ */
+static void
+nv50_set_stream_output_targets(struct pipe_context *pipe,
+                               unsigned num_targets,
+                               struct pipe_stream_output_target **targets,
+                               unsigned append_mask)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+   boolean serialize = TRUE;
+   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+
+   assert(num_targets <= 4);
+
+   for (i = 0; i < num_targets; ++i) {
+      const boolean changed = nv50->so_target[i] != targets[i];
+      /* same target, appending: nothing to do for this slot */
+      if (!changed && (append_mask & (1 << i)))
+         continue;
+      nv50->so_targets_dirty |= 1 << i;
+
+      if (can_resume && changed && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+
+      /* restarting from offset 0 marks the target clean again */
+      if (targets[i] && !(append_mask & (1 << i)))
+         nv50_so_target(targets[i])->clean = TRUE;
+
+      pipe_so_target_reference(&nv50->so_target[i], targets[i]);
+   }
+   /* unbind targets past the new count, saving their offsets too */
+   for (; i < nv50->num_so_targets; ++i) {
+      if (can_resume && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+      pipe_so_target_reference(&nv50->so_target[i], NULL);
+      nv50->so_targets_dirty |= 1 << i;
+   }
+   nv50->num_so_targets = num_targets;
+
+   if (nv50->so_targets_dirty)
+      nv50->dirty |= NV50_NEW_STRMOUT;
+}
+
+/* Install all nv50 state-setting entry points into the pipe_context
+ * vtable.  Called once at context creation.
+ */
+void
+nv50_init_state_functions(struct nv50_context *nv50)
+{
+   struct pipe_context *pipe = &nv50->base.pipe;
+
+   pipe->create_blend_state = nv50_blend_state_create;
+   pipe->bind_blend_state = nv50_blend_state_bind;
+   pipe->delete_blend_state = nv50_blend_state_delete;
+
+   pipe->create_rasterizer_state = nv50_rasterizer_state_create;
+   pipe->bind_rasterizer_state = nv50_rasterizer_state_bind;
+   pipe->delete_rasterizer_state = nv50_rasterizer_state_delete;
+
+   pipe->create_depth_stencil_alpha_state = nv50_zsa_state_create;
+   pipe->bind_depth_stencil_alpha_state = nv50_zsa_state_bind;
+   pipe->delete_depth_stencil_alpha_state = nv50_zsa_state_delete;
+
+   pipe->create_sampler_state = nv50_sampler_state_create;
+   pipe->delete_sampler_state = nv50_sampler_state_delete;
+   pipe->bind_vertex_sampler_states   = nv50_vp_sampler_states_bind;
+   pipe->bind_fragment_sampler_states = nv50_fp_sampler_states_bind;
+   pipe->bind_geometry_sampler_states = nv50_gp_sampler_states_bind;
+
+   /* sampler view creation is implemented in nv50_tex.c */
+   pipe->create_sampler_view = nv50_create_sampler_view;
+   pipe->sampler_view_destroy = nv50_sampler_view_destroy;
+   pipe->set_vertex_sampler_views   = nv50_vp_set_sampler_views;
+   pipe->set_fragment_sampler_views = nv50_fp_set_sampler_views;
+   pipe->set_geometry_sampler_views = nv50_gp_set_sampler_views;
+
+   pipe->create_vs_state = nv50_vp_state_create;
+   pipe->create_fs_state = nv50_fp_state_create;
+   pipe->create_gs_state = nv50_gp_state_create;
+   pipe->bind_vs_state = nv50_vp_state_bind;
+   pipe->bind_fs_state = nv50_fp_state_bind;
+   pipe->bind_gs_state = nv50_gp_state_bind;
+   pipe->delete_vs_state = nv50_sp_state_delete;
+   pipe->delete_fs_state = nv50_sp_state_delete;
+   pipe->delete_gs_state = nv50_sp_state_delete;
+
+   pipe->set_blend_color = nv50_set_blend_color;
+   pipe->set_stencil_ref = nv50_set_stencil_ref;
+   pipe->set_clip_state = nv50_set_clip_state;
+   pipe->set_sample_mask = nv50_set_sample_mask;
+   pipe->set_constant_buffer = nv50_set_constant_buffer;
+   pipe->set_framebuffer_state = nv50_set_framebuffer_state;
+   pipe->set_polygon_stipple = nv50_set_polygon_stipple;
+   pipe->set_scissor_states = nv50_set_scissor_states;
+   pipe->set_viewport_states = nv50_set_viewport_states;
+
+   /* vertex element state lives in nv50_vbo.c */
+   pipe->create_vertex_elements_state = nv50_vertex_state_create;
+   pipe->delete_vertex_elements_state = nv50_vertex_state_delete;
+   pipe->bind_vertex_elements_state = nv50_vertex_state_bind;
+
+   pipe->set_vertex_buffers = nv50_set_vertex_buffers;
+   pipe->set_index_buffer = nv50_set_index_buffer;
+
+   pipe->create_stream_output_target = nv50_so_target_create;
+   pipe->stream_output_target_destroy = nv50_so_target_destroy;
+   pipe->set_stream_output_targets = nv50_set_stream_output_targets;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
new file mode 100644
index 00000000000..866829ca22d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -0,0 +1,414 @@
+
+#include "nv50/nv50_context.h"
+#include "os/os_time.h"
+
+/* Emit render-target (colour and zeta) state for the currently bound
+ * framebuffer, and track GPU read/write status on the backing miptrees so
+ * that a draw can be serialized against pending texture reads of the same
+ * surface (state.rt_serialize).
+ */
+static void
+nv50_validate_fb(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+ unsigned i;
+ unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
+ uint32_t array_size = 0xffff, array_mode = 0;
+
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+
+ /* NOTE: 076543210 is octal on purpose: RT_CONTROL packs eight 3-bit
+  * RT indices (identity mapping 0..7) above the RT count.
+  */
+ BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
+ PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
+ BEGIN_NV04(push, NV50_3D(SCREEN_SCISSOR_HORIZ), 2);
+ PUSH_DATA (push, fb->width << 16);
+ PUSH_DATA (push, fb->height << 16);
+
+ for (i = 0; i < fb->nr_cbufs; ++i) {
+ struct nv50_miptree *mt = nv50_miptree(fb->cbufs[i]->texture);
+ struct nv50_surface *sf = nv50_surface(fb->cbufs[i]);
+ struct nouveau_bo *bo = mt->base.bo;
+
+ array_size = MIN2(array_size, sf->depth);
+ if (mt->layout_3d)
+ array_mode = NV50_3D_RT_ARRAY_MODE_MODE_3D; /* 1 << 16 */
+
+ /* can't mix 3D with ARRAY or have RTs of different depth/array_size */
+ assert(mt->layout_3d || !array_mode || array_size == 1);
+
+ BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(i)), 5);
+ PUSH_DATAh(push, bo->offset + sf->offset);
+ PUSH_DATA (push, bo->offset + sf->offset);
+ PUSH_DATA (push, nv50_format_table[sf->base.format].rt);
+ if (likely(nouveau_bo_memtype(bo))) {
+ PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA (push, mt->layer_stride >> 2);
+ BEGIN_NV04(push, NV50_3D(RT_HORIZ(i)), 2);
+ PUSH_DATA (push, sf->width);
+ PUSH_DATA (push, sf->height);
+ BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
+ PUSH_DATA (push, array_mode | array_size);
+ } else {
+ /* Linear (non-tiled) RT: pitch goes in RT_HORIZ, no layers, and
+  * linear colour cannot be combined with a zeta buffer or MS.
+  */
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(RT_HORIZ(i)), 2);
+ PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | mt->level[0].pitch);
+ PUSH_DATA (push, sf->height);
+ BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
+ PUSH_DATA (push, 0);
+
+ assert(!fb->zsbuf);
+ assert(!mt->ms_mode);
+ }
+
+ ms_mode = mt->ms_mode;
+
+ if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
+ nv50->state.rt_serialize = TRUE;
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ /* FIX: clear the READING bit (was "&= READING", which wrongly wiped
+  * the WRITING bit set on the previous line).
+  */
+ mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ /* only register for writing, otherwise we'd always serialize here */
+ BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+ }
+
+ if (fb->zsbuf) {
+ struct nv50_miptree *mt = nv50_miptree(fb->zsbuf->texture);
+ struct nv50_surface *sf = nv50_surface(fb->zsbuf);
+ struct nouveau_bo *bo = mt->base.bo;
+ int unk = mt->base.base.target == PIPE_TEXTURE_3D || sf->depth == 1;
+
+ BEGIN_NV04(push, NV50_3D(ZETA_ADDRESS_HIGH), 5);
+ PUSH_DATAh(push, bo->offset + sf->offset);
+ PUSH_DATA (push, bo->offset + sf->offset);
+ PUSH_DATA (push, nv50_format_table[fb->zsbuf->format].rt);
+ PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA (push, mt->layer_stride >> 2);
+ BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(ZETA_HORIZ), 3);
+ PUSH_DATA (push, sf->width);
+ PUSH_DATA (push, sf->height);
+ PUSH_DATA (push, (unk << 16) | sf->depth);
+
+ ms_mode = mt->ms_mode;
+
+ if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
+ nv50->state.rt_serialize = TRUE;
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ /* FIX: clear the READING bit, same as for the colour targets above. */
+ mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+ } else {
+ BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+ PUSH_DATA (push, ms_mode);
+
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
+ PUSH_DATA (push, fb->width << 16);
+ PUSH_DATA (push, fb->height << 16);
+}
+
+/* Upload the constant blend colour (four RGBA floats). */
+static void
+nv50_validate_blend_colour(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ BEGIN_NV04(push, NV50_3D(BLEND_COLOR(0)), 4);
+ PUSH_DATAf(push, nv50->blend_colour.color[0]);
+ PUSH_DATAf(push, nv50->blend_colour.color[1]);
+ PUSH_DATAf(push, nv50->blend_colour.color[2]);
+ PUSH_DATAf(push, nv50->blend_colour.color[3]);
+}
+
+/* Upload the front/back stencil reference values. */
+static void
+nv50_validate_stencil_ref(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ BEGIN_NV04(push, NV50_3D(STENCIL_FRONT_FUNC_REF), 1);
+ PUSH_DATA (push, nv50->stencil_ref.ref_value[0]);
+ BEGIN_NV04(push, NV50_3D(STENCIL_BACK_FUNC_REF), 1);
+ PUSH_DATA (push, nv50->stencil_ref.ref_value[1]);
+}
+
+/* Upload the 32x32 polygon stipple pattern; each row is byte-swapped
+ * before being handed to the hardware.
+ */
+static void
+nv50_validate_stipple(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned i;
+
+ BEGIN_NV04(push, NV50_3D(POLYGON_STIPPLE_PATTERN(0)), 32);
+ for (i = 0; i < 32; ++i)
+ PUSH_DATA(push, util_bswap32(nv50->stipple.stipple[i]));
+}
+
+/* Emit scissor state. With NV50_SCISSORS_CLIPPING the hardware scissor is
+ * also used to implement viewport clipping, so the rect is intersected
+ * with the viewport extents (or the full framebuffer when the rasterizer
+ * scissor test is disabled); without it the pipe scissor is sent as-is.
+ */
+static void
+nv50_validate_scissor(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct pipe_scissor_state *s = &nv50->scissor;
+#ifdef NV50_SCISSORS_CLIPPING
+ struct pipe_viewport_state *vp = &nv50->viewport;
+ int minx, maxx, miny, maxy;
+
+ /* Skip the update when none of the contributing states changed and the
+  * scissor-enable flag is unchanged too.
+  */
+ if (!(nv50->dirty &
+ (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
+ nv50->state.scissor == nv50->rast->pipe.scissor)
+ return;
+ nv50->state.scissor = nv50->rast->pipe.scissor;
+
+ if (nv50->state.scissor) {
+ minx = s->minx;
+ maxx = s->maxx;
+ miny = s->miny;
+ maxy = s->maxy;
+ } else {
+ minx = 0;
+ maxx = nv50->framebuffer.width;
+ miny = 0;
+ maxy = nv50->framebuffer.height;
+ }
+
+ /* Clamp against the viewport rectangle (translate +/- |scale|). */
+ minx = MAX2(minx, (int)(vp->translate[0] - fabsf(vp->scale[0])));
+ maxx = MIN2(maxx, (int)(vp->translate[0] + fabsf(vp->scale[0])));
+ miny = MAX2(miny, (int)(vp->translate[1] - fabsf(vp->scale[1])));
+ maxy = MIN2(maxy, (int)(vp->translate[1] + fabsf(vp->scale[1])));
+
+ BEGIN_NV04(push, NV50_3D(SCISSOR_HORIZ(0)), 2);
+ PUSH_DATA (push, (maxx << 16) | minx);
+ PUSH_DATA (push, (maxy << 16) | miny);
+#else
+ BEGIN_NV04(push, NV50_3D(SCISSOR_HORIZ(0)), 2);
+ PUSH_DATA (push, (s->maxx << 16) | s->minx);
+ PUSH_DATA (push, (s->maxy << 16) | s->miny);
+#endif
+}
+
+/* Emit the viewport transform; with NV50_SCISSORS_CLIPPING the depth range
+ * derived from the viewport's z translate/scale is emitted as well.
+ */
+static void
+nv50_validate_viewport(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ float zmin, zmax;
+
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_TRANSLATE_X(0)), 3);
+ PUSH_DATAf(push, nv50->viewport.translate[0]);
+ PUSH_DATAf(push, nv50->viewport.translate[1]);
+ PUSH_DATAf(push, nv50->viewport.translate[2]);
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_SCALE_X(0)), 3);
+ PUSH_DATAf(push, nv50->viewport.scale[0]);
+ PUSH_DATAf(push, nv50->viewport.scale[1]);
+ PUSH_DATAf(push, nv50->viewport.scale[2]);
+
+ /* depth range = z translate +/- |z scale| */
+ zmin = nv50->viewport.translate[2] - fabsf(nv50->viewport.scale[2]);
+ zmax = nv50->viewport.translate[2] + fabsf(nv50->viewport.scale[2]);
+
+#ifdef NV50_SCISSORS_CLIPPING
+ BEGIN_NV04(push, NV50_3D(DEPTH_RANGE_NEAR(0)), 2);
+ PUSH_DATAf(push, zmin);
+ PUSH_DATAf(push, zmax);
+#endif
+}
+
+/* Ensure the last vertex-stage program (vp or gp) emits at least as many
+ * clip distances as the enabled user clip planes require (mask is the
+ * clip_plane_enable bitmask). If not, destroy the compiled program, bump
+ * its clip-distance count and re-validate so it gets rebuilt.
+ */
+static INLINE void
+nv50_check_program_ucps(struct nv50_context *nv50,
+ struct nv50_program *vp, uint8_t mask)
+{
+ const unsigned n = util_logbase2(mask) + 1; /* highest enabled plane + 1 */
+
+ if (vp->vp.clpd_nr >= n)
+ return;
+ nv50_program_destroy(nv50, vp);
+
+ vp->vp.clpd_nr = n;
+ if (likely(vp == nv50->vertprog)) {
+ nv50->dirty |= NV50_NEW_VERTPROG;
+ nv50_vertprog_validate(nv50);
+ } else {
+ nv50->dirty |= NV50_NEW_GMTYPROG;
+ nv50_gmtyprog_validate(nv50);
+ }
+ /* shader outputs changed, so FP input linkage must be redone */
+ nv50_fp_linkage_validate(nv50);
+}
+
+/* Upload user clip planes to the aux constant buffer (when they changed)
+ * and program the clip-distance enable mask. The program that feeds the
+ * rasterizer is the geometry program if bound, else the vertex program.
+ */
+static void
+nv50_validate_clip(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_program *vp;
+ uint8_t clip_enable;
+
+ if (nv50->dirty & NV50_NEW_CLIP) {
+ BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
+ PUSH_DATA (push, (0 << 8) | NV50_CB_AUX);
+ BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
+ PUSH_DATAp(push, &nv50->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
+ }
+
+ vp = nv50->gmtyprog;
+ if (likely(!vp))
+ vp = nv50->vertprog;
+
+ clip_enable = nv50->rast->pipe.clip_plane_enable;
+
+ BEGIN_NV04(push, NV50_3D(CLIP_DISTANCE_ENABLE), 1);
+ PUSH_DATA (push, clip_enable);
+
+ /* make sure the program writes enough clip distances */
+ if (clip_enable)
+ nv50_check_program_ucps(nv50, vp, clip_enable);
+}
+
+/* Replay the pre-baked blend state object's command words. */
+static void
+nv50_validate_blend(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ PUSH_SPACE(push, nv50->blend->size);
+ PUSH_DATAp(push, nv50->blend->state, nv50->blend->size);
+}
+
+/* Replay the pre-baked depth/stencil/alpha state object's command words. */
+static void
+nv50_validate_zsa(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ PUSH_SPACE(push, nv50->zsa->size);
+ PUSH_DATAp(push, nv50->zsa->state, nv50->zsa->size);
+}
+
+/* Replay the pre-baked rasterizer state object's command words. */
+static void
+nv50_validate_rasterizer(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ PUSH_SPACE(push, nv50->rast->size);
+ PUSH_DATAp(push, nv50->rast->state, nv50->rast->size);
+}
+
+/* Program the multisample mask: the hardware takes four mask words, each
+ * of which receives the low 16 bits of the context's sample mask.
+ */
+static void
+nv50_validate_sample_mask(struct nv50_context *nv50)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ const unsigned m = nv50->sample_mask & 0xffff;
+ unsigned i;
+
+ BEGIN_NV04(push, NV50_3D(MSAA_MASK(0)), 4);
+ for (i = 0; i < 4; ++i)
+ PUSH_DATA (push, m);
+}
+
+/* Make ctx_to the screen's current context: inherit the hardware state
+ * tracking of the previous context, mark everything dirty, then clear
+ * dirty bits for state categories that ctx_to has never set (so the
+ * validate functions don't dereference NULL state objects).
+ */
+static void
+nv50_switch_pipe_context(struct nv50_context *ctx_to)
+{
+ struct nv50_context *ctx_from = ctx_to->screen->cur_ctx;
+
+ if (ctx_from)
+ ctx_to->state = ctx_from->state;
+
+ ctx_to->dirty = ~0;
+
+ if (!ctx_to->vertex)
+ ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
+
+ if (!ctx_to->vertprog)
+ ctx_to->dirty &= ~NV50_NEW_VERTPROG;
+ if (!ctx_to->fragprog)
+ ctx_to->dirty &= ~NV50_NEW_FRAGPROG;
+
+ if (!ctx_to->blend)
+ ctx_to->dirty &= ~NV50_NEW_BLEND;
+ if (!ctx_to->rast)
+#ifdef NV50_SCISSORS_CLIPPING
+ ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
+#else
+ ctx_to->dirty &= ~NV50_NEW_RASTERIZER;
+#endif
+ if (!ctx_to->zsa)
+ ctx_to->dirty &= ~NV50_NEW_ZSA;
+
+ ctx_to->screen->cur_ctx = ctx_to;
+}
+
+/* Table of validate functions with the dirty-bit masks that trigger them.
+ * NOTE(review): entries appear to be order-sensitive (framebuffer first,
+ * program linkage and derived state after the programs themselves) —
+ * confirm before reordering.
+ */
+static struct state_validate {
+ void (*func)(struct nv50_context *);
+ uint32_t states;
+} validate_list[] = {
+ { nv50_validate_fb, NV50_NEW_FRAMEBUFFER },
+ { nv50_validate_blend, NV50_NEW_BLEND },
+ { nv50_validate_zsa, NV50_NEW_ZSA },
+ { nv50_validate_sample_mask, NV50_NEW_SAMPLE_MASK },
+ { nv50_validate_rasterizer, NV50_NEW_RASTERIZER },
+ { nv50_validate_blend_colour, NV50_NEW_BLEND_COLOUR },
+ { nv50_validate_stencil_ref, NV50_NEW_STENCIL_REF },
+ { nv50_validate_stipple, NV50_NEW_STIPPLE },
+#ifdef NV50_SCISSORS_CLIPPING
+ { nv50_validate_scissor, NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT |
+ NV50_NEW_RASTERIZER |
+ NV50_NEW_FRAMEBUFFER },
+#else
+ { nv50_validate_scissor, NV50_NEW_SCISSOR },
+#endif
+ { nv50_validate_viewport, NV50_NEW_VIEWPORT },
+ { nv50_vertprog_validate, NV50_NEW_VERTPROG },
+ { nv50_gmtyprog_validate, NV50_NEW_GMTYPROG },
+ { nv50_fragprog_validate, NV50_NEW_FRAGPROG },
+ { nv50_fp_linkage_validate, NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
+ NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
+ { nv50_gp_linkage_validate, NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
+ { nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
+ NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
+ { nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
+ NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
+ { nv50_constbufs_validate, NV50_NEW_CONSTBUF },
+ { nv50_validate_textures, NV50_NEW_TEXTURES },
+ { nv50_validate_samplers, NV50_NEW_SAMPLERS },
+ { nv50_stream_output_validate, NV50_NEW_STRMOUT |
+ NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
+ { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS }
+};
+/* number of entries in validate_list */
+#define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
+
+/* Run all validate functions whose state bits intersect (dirty & mask),
+ * clear those dirty bits, then validate the push buffer context. Returns
+ * TRUE on success, FALSE if push-buffer validation failed.
+ * NOTE(review): the 'words' parameter is not referenced in this body.
+ */
+boolean
+nv50_state_validate(struct nv50_context *nv50, uint32_t mask, unsigned words)
+{
+ uint32_t state_mask;
+ int ret;
+ unsigned i;
+
+ /* adopt this context if another one was last active on the screen */
+ if (nv50->screen->cur_ctx != nv50)
+ nv50_switch_pipe_context(nv50);
+
+ state_mask = nv50->dirty & mask;
+
+ if (state_mask) {
+ for (i = 0; i < validate_list_len; ++i) {
+ struct state_validate *validate = &validate_list[i];
+
+ if (state_mask & validate->states)
+ validate->func(nv50);
+ }
+ nv50->dirty &= ~state_mask;
+
+ /* flush pending RT reads before the new binding takes effect */
+ if (nv50->state.rt_serialize) {
+ nv50->state.rt_serialize = FALSE;
+ BEGIN_NV04(nv50->base.pushbuf, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (nv50->base.pushbuf, 0);
+ }
+
+ nv50_bufctx_fence(nv50->bufctx_3d, FALSE);
+ }
+ nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
+ ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
+
+ /* if validation flushed, buffers need to be re-fenced */
+ if (unlikely(nv50->state.flushed)) {
+ nv50->state.flushed = FALSE;
+ nv50_bufctx_fence(nv50->bufctx_3d, TRUE);
+ }
+ return !ret;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
new file mode 100644
index 00000000000..238951733cf
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -0,0 +1,78 @@
+
+#ifndef __NV50_STATEOBJ_H__
+#define __NV50_STATEOBJ_H__
+
+#include "pipe/p_state.h"
+
+/* when defined, the hardware scissor is also used for viewport clipping */
+#define NV50_SCISSORS_CLIPPING
+
+/* append a 3D-class method header for s data words to a state object */
+#define SB_BEGIN_3D(so, m, s) \
+ (so)->state[(so)->size++] = NV50_FIFO_PKHDR(NV50_3D(m), s)
+
+/* same, but with a raw method offset on the 3D subchannel */
+#define SB_BEGIN_3D_(so, m, s) \
+ (so)->state[(so)->size++] = NV50_FIFO_PKHDR(SUBC_3D(m), s)
+
+/* append one data word to a state object */
+#define SB_DATA(so, u) (so)->state[(so)->size++] = (u)
+
+#include "nv50/nv50_stateobj_tex.h"
+
+/* pre-baked command words for a pipe blend state */
+struct nv50_blend_stateobj {
+ struct pipe_blend_state pipe;
+ int size;
+ uint32_t state[84]; // TODO: allocate less if !independent_blend_enable
+};
+
+/* pre-baked command words for a pipe rasterizer state */
+struct nv50_rasterizer_stateobj {
+ struct pipe_rasterizer_state pipe;
+ int size;
+ uint32_t state[48];
+};
+
+/* pre-baked command words for a pipe depth/stencil/alpha state */
+struct nv50_zsa_stateobj {
+ struct pipe_depth_stencil_alpha_state pipe;
+ int size;
+ uint32_t state[29];
+};
+
+/* a bound constant buffer: either a resource or a user-memory pointer */
+struct nv50_constbuf {
+ union {
+ struct pipe_resource *buf;
+ const uint8_t *data;
+ } u;
+ uint32_t size; /* max 65536 */
+ uint32_t offset;
+ boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+};
+
+/* one vertex element plus its hardware format word */
+struct nv50_vertex_element {
+ struct pipe_vertex_element pipe;
+ uint32_t state;
+};
+
+/* vertex-elements CSO; element[] is a flexible trailing array */
+struct nv50_vertex_stateobj {
+ uint32_t min_instance_div[PIPE_MAX_ATTRIBS];
+ uint16_t vb_access_size[PIPE_MAX_ATTRIBS];
+ struct translate *translate;
+ unsigned num_elements;
+ uint32_t instance_elts;
+ uint32_t instance_bufs;
+ boolean need_conversion;
+ unsigned vertex_size;
+ unsigned packet_vertex_limit;
+ struct nv50_vertex_element element[0];
+};
+
+/* stream-output target with a query to track the amount written */
+struct nv50_so_target {
+ struct pipe_stream_output_target pipe;
+ struct pipe_query *pq;
+ unsigned stride;
+ boolean clean;
+};
+
+/* downcast helper for pipe_stream_output_target */
+static INLINE struct nv50_so_target *
+nv50_so_target(struct pipe_stream_output_target *ptarg)
+{
+ return (struct nv50_so_target *)ptarg;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
new file mode 100644
index 00000000000..99548cbdb42
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj_tex.h
@@ -0,0 +1,34 @@
+
+#ifndef __NV50_STATEOBJ_TEX_H__
+#define __NV50_STATEOBJ_TEX_H__
+
+#include "pipe/p_state.h"
+
+/* sampler state: slot id (-1 when not resident) plus 8 TSC words */
+struct nv50_tsc_entry {
+ int id;
+ uint32_t tsc[8];
+};
+
+/* downcast helper for a sampler CSO handle */
+static INLINE struct nv50_tsc_entry *
+nv50_tsc_entry(void *hwcso)
+{
+ return (struct nv50_tsc_entry *)hwcso;
+}
+
+/* sampler view: slot id plus 8 TIC words */
+struct nv50_tic_entry {
+ struct pipe_sampler_view pipe;
+ int id;
+ uint32_t tic[8];
+};
+
+/* downcast helper for pipe_sampler_view */
+static INLINE struct nv50_tic_entry *
+nv50_tic_entry(struct pipe_sampler_view *view)
+{
+ return (struct nv50_tic_entry *)view;
+}
+
+extern void *
+nv50_sampler_state_create(struct pipe_context *,
+ const struct pipe_sampler_state *);
+
+#endif /* __NV50_STATEOBJ_TEX_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
new file mode 100644
index 00000000000..dcc1fce41c5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -0,0 +1,1353 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+
+#include "pipe/p_defines.h"
+
+#include "util/u_inlines.h"
+#include "util/u_pack_color.h"
+#include "util/u_format.h"
+#include "util/u_surface.h"
+
+#include "tgsi/tgsi_ureg.h"
+
+#include "os/os_thread.h"
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_resource.h"
+
+#include "nv50/nv50_defs.xml.h"
+#include "nv50/nv50_texture.xml.h"
+
+/* these are used in nv50_blit.h */
+#define NV50_ENG2D_SUPPORTED_FORMATS 0xff0843e080608409ULL
+#define NV50_ENG2D_NOCONVERT_FORMATS 0x0008402000000000ULL
+#define NV50_ENG2D_LUMINANCE_FORMATS 0x0008402000000000ULL
+#define NV50_ENG2D_INTENSITY_FORMATS 0x0000000000000000ULL
+#define NV50_ENG2D_OPERATION_FORMATS 0x060001c000608000ULL
+
+#define NOUVEAU_DRIVER 0x50
+#include "nv50/nv50_blit.h"
+
+/* Return the 2D-engine surface format for a pipe format, or 0 if there is
+ * none. When the RT format is not supported by the 2D engine, fall back to
+ * a bit-compatible format of the same block size — valid only when source
+ * and destination formats are identical (asserted).
+ */
+static INLINE uint8_t
+nv50_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+{
+ const uint8_t id = nv50_format_table[format].rt;
+
+ /* Hardware values for color formats range from 0xc0 to 0xff,
+  * but the 2D engine doesn't support all of them.
+  */
+ if (id >= 0xc0 && (NV50_ENG2D_SUPPORTED_FORMATS & (1ULL << (id - 0xc0))))
+ return id;
+ assert(dst_src_equal);
+
+ /* pick a plain format of matching block size */
+ if (util_format_get_blocksize(format) == 1)
+ return NV50_SURFACE_FORMAT_R8_UNORM;
+ if (util_format_get_blocksize(format) == 2)
+ return NV50_SURFACE_FORMAT_R16_UNORM;
+ if (util_format_get_blocksize(format) == 4)
+ return NV50_SURFACE_FORMAT_BGRA8_UNORM;
+ return 0;
+}
+
+/* Program the 2D engine's source or destination surface descriptor from a
+ * miptree level/layer. dst selects the DST vs SRC method range. Returns 0
+ * on success, 1 when the format is not representable by the 2D engine.
+ */
+static int
+nv50_2d_texture_set(struct nouveau_pushbuf *push, int dst,
+ struct nv50_miptree *mt, unsigned level, unsigned layer,
+ enum pipe_format pformat, boolean dst_src_pformat_equal)
+{
+ struct nouveau_bo *bo = mt->base.bo;
+ uint32_t width, height, depth;
+ uint32_t format;
+ uint32_t mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT;
+ uint32_t offset = mt->level[level].offset;
+
+ format = nv50_2d_format(pformat, dst, dst_src_pformat_equal);
+ if (!format) {
+ NOUVEAU_ERR("invalid/unsupported surface format: %s\n",
+ util_format_name(pformat));
+ return 1;
+ }
+
+ width = u_minify(mt->base.base.width0, level) << mt->ms_x;
+ height = u_minify(mt->base.base.height0, level) << mt->ms_y;
+ depth = u_minify(mt->base.base.depth0, level);
+
+ /* (a redundant re-assignment of offset was removed here; it is already
+  * initialized to mt->level[level].offset above)
+  */
+ if (!mt->layout_3d) {
+ /* 2D-array layout: address the layer directly */
+ offset += mt->layer_stride * layer;
+ depth = 1;
+ layer = 0;
+ } else
+ if (!dst) {
+ /* 3D source: the engine can't address z-slices, bake it in */
+ offset += nv50_mt_zslice_offset(mt, level, layer);
+ layer = 0;
+ }
+
+ if (!nouveau_bo_memtype(bo)) {
+ /* linear surface: pitch-based addressing */
+ BEGIN_NV04(push, SUBC_2D(mthd), 2);
+ PUSH_DATA (push, format);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, SUBC_2D(mthd + 0x14), 5);
+ PUSH_DATA (push, mt->level[level].pitch);
+ PUSH_DATA (push, width);
+ PUSH_DATA (push, height);
+ PUSH_DATAh(push, bo->offset + offset);
+ PUSH_DATA (push, bo->offset + offset);
+ } else {
+ /* tiled surface: tile mode, depth and layer index */
+ BEGIN_NV04(push, SUBC_2D(mthd), 5);
+ PUSH_DATA (push, format);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, mt->level[level].tile_mode);
+ PUSH_DATA (push, depth);
+ PUSH_DATA (push, layer);
+ BEGIN_NV04(push, SUBC_2D(mthd + 0x18), 4);
+ PUSH_DATA (push, width);
+ PUSH_DATA (push, height);
+ PUSH_DATAh(push, bo->offset + offset);
+ PUSH_DATA (push, bo->offset + offset);
+ }
+
+#if 0
+ if (dst) {
+ BEGIN_NV04(push, SUBC_2D(NV50_2D_CLIP_X), 4);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, width);
+ PUSH_DATA (push, height);
+ }
+#endif
+ return 0;
+}
+
+/* Copy a w x h rectangle between two miptree layers with the 2D engine
+ * using a point-sampled blit. Returns 0 on success, non-zero on error.
+ */
+static int
+nv50_2d_texture_do_copy(struct nouveau_pushbuf *push,
+ struct nv50_miptree *dst, unsigned dst_level,
+ unsigned dx, unsigned dy, unsigned dz,
+ struct nv50_miptree *src, unsigned src_level,
+ unsigned sx, unsigned sy, unsigned sz,
+ unsigned w, unsigned h)
+{
+ const enum pipe_format dfmt = dst->base.base.format;
+ const enum pipe_format sfmt = src->base.base.format;
+ int ret;
+ boolean eqfmt = dfmt == sfmt;
+
+ if (!PUSH_SPACE(push, 2 * 16 + 32))
+ return PIPE_ERROR;
+
+ ret = nv50_2d_texture_set(push, 1, dst, dst_level, dz, dfmt, eqfmt);
+ if (ret)
+ return ret;
+
+ ret = nv50_2d_texture_set(push, 0, src, src_level, sz, sfmt, eqfmt);
+ if (ret)
+ return ret;
+
+ /* coordinates are scaled by the MS block factors of each surface */
+ BEGIN_NV04(push, NV50_2D(BLIT_CONTROL), 1);
+ PUSH_DATA (push, NV50_2D_BLIT_CONTROL_FILTER_POINT_SAMPLE);
+ BEGIN_NV04(push, NV50_2D(BLIT_DST_X), 4);
+ PUSH_DATA (push, dx << dst->ms_x);
+ PUSH_DATA (push, dy << dst->ms_y);
+ PUSH_DATA (push, w << dst->ms_x);
+ PUSH_DATA (push, h << dst->ms_y);
+ /* du/dx = dv/dy = 1 => unscaled copy */
+ BEGIN_NV04(push, NV50_2D(BLIT_DU_DX_FRACT), 4);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_2D(BLIT_SRC_X_FRACT), 4);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, sx << src->ms_x);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, sy << src->ms_y);
+
+ return 0;
+}
+
+/* pipe_context::resource_copy_region. Buffer-to-buffer copies go through
+ * the generic nouveau copy path; same-format (or same block size) texture
+ * copies use M2MF per layer; everything else is blitted with the 2D
+ * engine.
+ */
+static void
+nv50_resource_copy_region(struct pipe_context *pipe,
+ struct pipe_resource *dst, unsigned dst_level,
+ unsigned dstx, unsigned dsty, unsigned dstz,
+ struct pipe_resource *src, unsigned src_level,
+ const struct pipe_box *src_box)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ int ret;
+ boolean m2mf;
+ unsigned dst_layer = dstz, src_layer = src_box->z;
+
+ if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+ nouveau_copy_buffer(&nv50->base,
+ nv04_resource(dst), dstx,
+ nv04_resource(src), src_box->x, src_box->width);
+ return;
+ }
+
+ /* 0 and 1 are equal, only supporting 0/1, 2, 4 and 8 */
+ assert((src->nr_samples | 1) == (dst->nr_samples | 1));
+
+ m2mf = (src->format == dst->format) ||
+ (util_format_get_blocksizebits(src->format) ==
+ util_format_get_blocksizebits(dst->format));
+
+ nv04_resource(dst)->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+
+ if (m2mf) {
+ struct nv50_m2mf_rect drect, srect;
+ unsigned i;
+ unsigned nx = util_format_get_nblocksx(src->format, src_box->width);
+ unsigned ny = util_format_get_nblocksy(src->format, src_box->height);
+
+ nv50_m2mf_rect_setup(&drect, dst, dst_level, dstx, dsty, dstz);
+ nv50_m2mf_rect_setup(&srect, src, src_level,
+ src_box->x, src_box->y, src_box->z);
+
+ /* copy layer by layer, advancing z for 3D or base for array */
+ for (i = 0; i < src_box->depth; ++i) {
+ nv50_m2mf_transfer_rect(nv50, &drect, &srect, nx, ny);
+
+ if (nv50_miptree(dst)->layout_3d)
+ drect.z++;
+ else
+ drect.base += nv50_miptree(dst)->layer_stride;
+
+ if (nv50_miptree(src)->layout_3d)
+ srect.z++;
+ else
+ srect.base += nv50_miptree(src)->layer_stride;
+ }
+ return;
+ }
+
+ assert((src->format == dst->format) ||
+ (nv50_2d_src_format_faithful(src->format) &&
+ nv50_2d_dst_format_faithful(dst->format)));
+
+ BCTX_REFN(nv50->bufctx, 2D, nv04_resource(src), RD);
+ BCTX_REFN(nv50->bufctx, 2D, nv04_resource(dst), WR);
+ nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx);
+ nouveau_pushbuf_validate(nv50->base.pushbuf);
+
+ for (; dst_layer < dstz + src_box->depth; ++dst_layer, ++src_layer) {
+ ret = nv50_2d_texture_do_copy(nv50->base.pushbuf,
+ nv50_miptree(dst), dst_level,
+ dstx, dsty, dst_layer,
+ nv50_miptree(src), src_level,
+ src_box->x, src_box->y, src_layer,
+ src_box->width, src_box->height);
+ if (ret)
+ break;
+ }
+ nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D);
+}
+
+/* pipe_context::clear_render_target — bind the surface as sole RT and use
+ * the 3D engine's CLEAR_BUFFERS with a scissoring viewport. Framebuffer
+ * state is marked dirty afterwards because the RT bindings were clobbered.
+ */
+static void
+nv50_clear_render_target(struct pipe_context *pipe,
+ struct pipe_surface *dst,
+ const union pipe_color_union *color,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_miptree *mt = nv50_miptree(dst->texture);
+ struct nv50_surface *sf = nv50_surface(dst);
+ struct nouveau_bo *bo = mt->base.bo;
+ unsigned z;
+
+ BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
+ PUSH_DATAf(push, color->f[0]);
+ PUSH_DATAf(push, color->f[1]);
+ PUSH_DATAf(push, color->f[2]);
+ PUSH_DATAf(push, color->f[3]);
+
+ if (nouveau_pushbuf_space(push, 32 + sf->depth, 1, 0))
+ return;
+
+ PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
+
+ BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5);
+ PUSH_DATAh(push, bo->offset + sf->offset);
+ PUSH_DATA (push, bo->offset + sf->offset);
+ PUSH_DATA (push, nv50_format_table[dst->format].rt);
+ PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2);
+ if (nouveau_bo_memtype(bo))
+ PUSH_DATA(push, sf->width);
+ else
+ PUSH_DATA(push, NV50_3D_RT_HORIZ_LINEAR | mt->level[0].pitch);
+ PUSH_DATA (push, sf->height);
+ BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
+ PUSH_DATA (push, 1);
+
+ /* linear RTs cannot coexist with a zeta buffer */
+ if (!nouveau_bo_memtype(bo)) {
+ BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */
+
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
+ PUSH_DATA (push, (width << 16) | dstx);
+ PUSH_DATA (push, (height << 16) | dsty);
+
+ /* 0x3c = clear R, G, B and A; one clear per layer */
+ BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
+ for (z = 0; z < sf->depth; ++z) {
+ PUSH_DATA (push, 0x3c |
+ (z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
+ }
+
+ nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+}
+
+/* pipe_context::clear_depth_stencil — bind the surface as the zeta buffer
+ * and clear depth and/or stencil via CLEAR_BUFFERS. Framebuffer state is
+ * marked dirty afterwards because the zeta binding was clobbered.
+ */
+static void
+nv50_clear_depth_stencil(struct pipe_context *pipe,
+ struct pipe_surface *dst,
+ unsigned clear_flags,
+ double depth,
+ unsigned stencil,
+ unsigned dstx, unsigned dsty,
+ unsigned width, unsigned height)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_miptree *mt = nv50_miptree(dst->texture);
+ struct nv50_surface *sf = nv50_surface(dst);
+ struct nouveau_bo *bo = mt->base.bo;
+ uint32_t mode = 0;
+ unsigned z;
+
+ assert(nouveau_bo_memtype(bo)); /* ZETA cannot be linear */
+
+ if (clear_flags & PIPE_CLEAR_DEPTH) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_DEPTH), 1);
+ PUSH_DATAf(push, depth);
+ mode |= NV50_3D_CLEAR_BUFFERS_Z;
+ }
+
+ if (clear_flags & PIPE_CLEAR_STENCIL) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_STENCIL), 1);
+ PUSH_DATA (push, stencil & 0xff);
+ mode |= NV50_3D_CLEAR_BUFFERS_S;
+ }
+
+ if (nouveau_pushbuf_space(push, 32 + sf->depth, 1, 0))
+ return;
+
+ PUSH_REFN(push, bo, mt->base.domain | NOUVEAU_BO_WR);
+
+ BEGIN_NV04(push, NV50_3D(ZETA_ADDRESS_HIGH), 5);
+ PUSH_DATAh(push, bo->offset + sf->offset);
+ PUSH_DATA (push, bo->offset + sf->offset);
+ PUSH_DATA (push, nv50_format_table[dst->format].rt);
+ PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NV04(push, NV50_3D(ZETA_HORIZ), 3);
+ PUSH_DATA (push, sf->width);
+ PUSH_DATA (push, sf->height);
+ PUSH_DATA (push, (1 << 16) | 1);
+
+ BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
+ PUSH_DATA (push, (width << 16) | dstx);
+ PUSH_DATA (push, (height << 16) | dsty);
+
+ /* one clear per layer of the surface */
+ BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
+ for (z = 0; z < sf->depth; ++z) {
+ PUSH_DATA (push, mode |
+ (z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
+ }
+
+ nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+}
+
+/* pipe_context::clear — clear the currently bound framebuffer (colour,
+ * depth and/or stencil) after validating framebuffer state.
+ */
+void
+nv50_clear(struct pipe_context *pipe, unsigned buffers,
+ const union pipe_color_union *color,
+ double depth, unsigned stencil)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+ unsigned i;
+ uint32_t mode = 0;
+
+ /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */
+ if (!nv50_state_validate(nv50, NV50_NEW_FRAMEBUFFER, 9 + (fb->nr_cbufs * 2)))
+ return;
+
+ if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4);
+ PUSH_DATAf(push, color->f[0]);
+ PUSH_DATAf(push, color->f[1]);
+ PUSH_DATAf(push, color->f[2]);
+ PUSH_DATAf(push, color->f[3]);
+ mode =
+ NV50_3D_CLEAR_BUFFERS_R | NV50_3D_CLEAR_BUFFERS_G |
+ NV50_3D_CLEAR_BUFFERS_B | NV50_3D_CLEAR_BUFFERS_A;
+ }
+
+ if (buffers & PIPE_CLEAR_DEPTH) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_DEPTH), 1);
+ PUSH_DATA (push, fui(depth));
+ mode |= NV50_3D_CLEAR_BUFFERS_Z;
+ }
+
+ if (buffers & PIPE_CLEAR_STENCIL) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_STENCIL), 1);
+ PUSH_DATA (push, stencil & 0xff);
+ mode |= NV50_3D_CLEAR_BUFFERS_S;
+ }
+
+ /* first CLEAR_BUFFERS covers RT 0 plus depth/stencil */
+ BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
+ PUSH_DATA (push, mode);
+
+ /* remaining colour RTs get a colour-only clear (0x3c = RGBA) */
+ for (i = 1; i < fb->nr_cbufs; i++) {
+ BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
+ PUSH_DATA (push, (i << 6) | 0x3c);
+ }
+}
+
+
+/* =============================== BLIT CODE ===================================
+ */
+
+/* Per-screen blitter: cached fragment programs per texture-type/mode
+ * combination, a shared pass-through vertex program, and two samplers.
+ * The mutex guards lazy creation of the fragment programs.
+ */
+struct nv50_blitter
+{
+ struct nv50_program *fp[NV50_BLIT_MAX_TEXTURE_TYPES][NV50_BLIT_MODES];
+ struct nv50_program vp;
+
+ struct nv50_tsc_entry sampler[2]; /* nearest, bilinear */
+
+ pipe_mutex mutex;
+};
+
+/* Per-context blit state: the selected fragment program and parameters,
+ * plus a 'saved' snapshot of the context state a blit clobbers so it can
+ * be restored afterwards.
+ */
+struct nv50_blitctx
+{
+ struct nv50_context *nv50;
+ struct nv50_program *fp;
+ uint8_t mode;
+ uint16_t color_mask;
+ uint8_t filter;
+ enum pipe_texture_target target;
+ struct {
+ struct pipe_framebuffer_state fb;
+ struct nv50_rasterizer_stateobj *rast;
+ struct nv50_program *vp;
+ struct nv50_program *gp;
+ struct nv50_program *fp;
+ unsigned num_textures[3];
+ unsigned num_samplers[3];
+ struct pipe_sampler_view *texture[2];
+ struct nv50_tsc_entry *sampler[2];
+ uint32_t dirty;
+ } saved;
+ struct nv50_rasterizer_stateobj rast;
+};
+
+/* Build the blitter's pass-through vertex program from a hand-assembled
+ * code blob: it copies position (x, y) and a 3-component texture
+ * coordinate straight from input to output.
+ */
+static void
+nv50_blitter_make_vp(struct nv50_blitter *blit)
+{
+ static const uint32_t code[] =
+ {
+ 0x10000001, 0x0423c788, /* mov b32 o[0x00] s[0x00] */ /* HPOS.x */
+ 0x10000205, 0x0423c788, /* mov b32 o[0x04] s[0x04] */ /* HPOS.y */
+ 0x10000409, 0x0423c788, /* mov b32 o[0x08] s[0x08] */ /* TEXC.x */
+ 0x1000060d, 0x0423c788, /* mov b32 o[0x0c] s[0x0c] */ /* TEXC.y */
+ 0x10000811, 0x0423c789, /* mov b32 o[0x10] s[0x10] */ /* TEXC.z */
+ };
+
+ blit->vp.type = PIPE_SHADER_VERTEX;
+ blit->vp.translated = TRUE;
+ /* casting away const is safe: code has static storage duration */
+ blit->vp.code = (uint32_t *)code; /* const_cast */
+ blit->vp.code_size = sizeof(code);
+ blit->vp.max_gpr = 4;
+ blit->vp.max_out = 5;
+ blit->vp.out_nr = 2;
+ blit->vp.out[0].mask = 0x3;
+ blit->vp.out[0].sn = TGSI_SEMANTIC_POSITION;
+ blit->vp.out[1].hw = 2;
+ blit->vp.out[1].mask = 0x7;
+ blit->vp.out[1].sn = TGSI_SEMANTIC_GENERIC;
+ blit->vp.out[1].si = 0;
+ blit->vp.vp.attrs[0] = 0x73;
+ blit->vp.vp.psiz = 0x40;
+ blit->vp.vp.edgeflag = 0x40;
+}
+
+/* Build (via TGSI ureg) the fragment program for one blit mode and texture
+ * target. Depending on the mode it samples depth (sampler 0), stencil
+ * (sampler 1), or both, and optionally converts a 24-bit depth value to or
+ * from three UNORM8 channels so Z24S8-style formats can be copied through
+ * colour render targets. Returns the CSO from ureg_create_shader_and_destroy.
+ */
+void *
+nv50_blitter_make_fp(struct pipe_context *pipe,
+ unsigned mode,
+ enum pipe_texture_target ptarg)
+{
+ struct ureg_program *ureg;
+ struct ureg_src tc;
+ struct ureg_dst out;
+ struct ureg_dst data;
+
+ const unsigned target = nv50_blit_get_tgsi_texture_target(ptarg);
+
+ boolean tex_rgbaz = FALSE; /* sample colour/depth via sampler 0 */
+ boolean tex_s = FALSE; /* sample stencil via sampler 1 */
+ boolean cvt_un8 = FALSE; /* convert z24 to/from un8 channels */
+
+ if (mode != NV50_BLIT_MODE_PASS &&
+ mode != NV50_BLIT_MODE_Z24X8 &&
+ mode != NV50_BLIT_MODE_X8Z24)
+ tex_s = TRUE;
+
+ if (mode != NV50_BLIT_MODE_X24S8 &&
+ mode != NV50_BLIT_MODE_S8X24 &&
+ mode != NV50_BLIT_MODE_XS)
+ tex_rgbaz = TRUE;
+
+ if (mode != NV50_BLIT_MODE_PASS &&
+ mode != NV50_BLIT_MODE_ZS &&
+ mode != NV50_BLIT_MODE_XS)
+ cvt_un8 = TRUE;
+
+ ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+ if (!ureg)
+ return NULL;
+
+ out = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
+ tc = ureg_DECL_fs_input(
+ ureg, TGSI_SEMANTIC_GENERIC, 0, TGSI_INTERPOLATE_LINEAR);
+
+ data = ureg_DECL_temporary(ureg);
+
+ /* stencil sample lands in data.x, copied to data.y */
+ if (tex_s) {
+ ureg_TEX(ureg, ureg_writemask(data, TGSI_WRITEMASK_X),
+ target, tc, ureg_DECL_sampler(ureg, 1));
+ ureg_MOV(ureg, ureg_writemask(data, TGSI_WRITEMASK_Y),
+ ureg_scalar(ureg_src(data), TGSI_SWIZZLE_X));
+ }
+ if (tex_rgbaz) {
+ const unsigned mask = (mode == NV50_BLIT_MODE_PASS) ?
+ TGSI_WRITEMASK_XYZW : TGSI_WRITEMASK_X;
+ ureg_TEX(ureg, ureg_writemask(data, mask),
+ target, tc, ureg_DECL_sampler(ureg, 0));
+ }
+
+ if (cvt_un8) {
+ struct ureg_src mask;
+ struct ureg_src scale;
+ struct ureg_dst outz;
+ struct ureg_dst outs;
+ struct ureg_dst zdst3 = ureg_writemask(data, TGSI_WRITEMASK_XYZ);
+ struct ureg_dst zdst = ureg_writemask(data, TGSI_WRITEMASK_X);
+ struct ureg_dst sdst = ureg_writemask(data, TGSI_WRITEMASK_Y);
+ struct ureg_src zsrc3 = ureg_src(data);
+ struct ureg_src zsrc = ureg_scalar(zsrc3, TGSI_SWIZZLE_X);
+ struct ureg_src ssrc = ureg_scalar(zsrc3, TGSI_SWIZZLE_Y);
+ struct ureg_src zshuf;
+
+ /* per-channel byte masks and scale factors for the z24 split */
+ mask = ureg_imm3u(ureg, 0x0000ff, 0x00ff00, 0xff0000);
+ scale = ureg_imm4f(ureg,
+ 1.0f / 0x0000ff, 1.0f / 0x00ff00, 1.0f / 0xff0000,
+ (1 << 24) - 1);
+
+ /* choose output channel layout by where stencil sits in the format */
+ if (mode == NV50_BLIT_MODE_Z24S8 ||
+ mode == NV50_BLIT_MODE_X24S8 ||
+ mode == NV50_BLIT_MODE_Z24X8) {
+ outz = ureg_writemask(out, TGSI_WRITEMASK_XYZ);
+ outs = ureg_writemask(out, TGSI_WRITEMASK_W);
+ zshuf = ureg_src(data);
+ } else {
+ outz = ureg_writemask(out, TGSI_WRITEMASK_YZW);
+ outs = ureg_writemask(out, TGSI_WRITEMASK_X);
+ zshuf = ureg_swizzle(zsrc3, TGSI_SWIZZLE_W,
+ TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z);
+ }
+
+ if (tex_s) {
+ ureg_I2F(ureg, sdst, ssrc);
+ ureg_MUL(ureg, outs, ssrc, ureg_scalar(scale, TGSI_SWIZZLE_X));
+ }
+
+ /* scale z to 24-bit int, split into 3 bytes, normalize each */
+ if (tex_rgbaz) {
+ ureg_MUL(ureg, zdst, zsrc, ureg_scalar(scale, TGSI_SWIZZLE_W));
+ ureg_F2I(ureg, zdst, zsrc);
+ ureg_AND(ureg, zdst3, zsrc, mask);
+ ureg_I2F(ureg, zdst3, zsrc3);
+ ureg_MUL(ureg, zdst3, zsrc3, scale);
+ ureg_MOV(ureg, outz, zshuf);
+ }
+ } else {
+ unsigned mask = TGSI_WRITEMASK_XYZW;
+
+ if (mode != NV50_BLIT_MODE_PASS) {
+ mask &= ~TGSI_WRITEMASK_ZW;
+ if (!tex_s)
+ mask = TGSI_WRITEMASK_X;
+ if (!tex_rgbaz)
+ mask = TGSI_WRITEMASK_Y;
+ }
+ ureg_MOV(ureg, ureg_writemask(out, mask), ureg_src(data));
+ }
+ ureg_END(ureg);
+
+ return ureg_create_shader_and_destroy(ureg, pipe);
+}
+
+/* Initialize the blitter's two immutable sampler (TSC) entries:
+ * [0] nearest and [1] bilinear filtering, both clamping to edge on all
+ * axes with mip filtering disabled (min/max lod 0).
+ */
+static void
+nv50_blitter_make_sampler(struct nv50_blitter *blit)
+{
+   const uint32_t tsc0_clamp =
+      NV50_TSC_0_SRGB_CONVERSION_ALLOWED |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPS__SHIFT) |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPT__SHIFT) |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPR__SHIFT);
+
+   /* nearest filtering */
+   blit->sampler[0].id = -1;
+   blit->sampler[0].tsc[0] = tsc0_clamp;
+   blit->sampler[0].tsc[1] =
+      NV50_TSC_1_MAGF_NEAREST | NV50_TSC_1_MINF_NEAREST | NV50_TSC_1_MIPF_NONE;
+
+   /* bilinear filtering */
+   blit->sampler[1].id = -1;
+   blit->sampler[1].tsc[0] = tsc0_clamp;
+   blit->sampler[1].tsc[1] =
+      NV50_TSC_1_MAGF_LINEAR | NV50_TSC_1_MINF_LINEAR | NV50_TSC_1_MIPF_NONE;
+}
+
+/* Map the destination format and depth/stencil write mask to one of the
+ * NV50_BLIT_MODE_* values.  Pure color blits always use PASS.
+ */
+unsigned
+nv50_blit_select_mode(const struct pipe_blit_info *info)
+{
+   const unsigned zs = info->mask & PIPE_MASK_ZS;
+
+   switch (info->dst.resource->format) {
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      if (zs == PIPE_MASK_ZS)
+         return NV50_BLIT_MODE_Z24S8;
+      if (zs == PIPE_MASK_Z)
+         return NV50_BLIT_MODE_Z24X8;
+      return NV50_BLIT_MODE_X24S8;
+   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+      if (zs == PIPE_MASK_ZS)
+         return NV50_BLIT_MODE_S8Z24;
+      if (zs == PIPE_MASK_Z)
+         return NV50_BLIT_MODE_X8Z24;
+      return NV50_BLIT_MODE_S8X24;
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      if (zs == PIPE_MASK_ZS)
+         return NV50_BLIT_MODE_ZS;
+      if (zs == PIPE_MASK_Z)
+         return NV50_BLIT_MODE_PASS;
+      return NV50_BLIT_MODE_XS;
+   default:
+      return NV50_BLIT_MODE_PASS;
+   }
+}
+
+/* Look up (or lazily create) the blit fragment program for the current
+ * blit mode and the source's texture type, storing it in ctx->fp.
+ *
+ * Uses double-checked locking on the screen-wide blitter mutex so that
+ * multiple contexts sharing one screen create each program only once.
+ */
+static void
+nv50_blit_select_fp(struct nv50_blitctx *ctx, const struct pipe_blit_info *info)
+{
+   struct nv50_blitter *blitter = ctx->nv50->screen->blitter;
+
+   const enum pipe_texture_target ptarg =
+      nv50_blit_reinterpret_pipe_texture_target(info->src.resource->target);
+
+   const unsigned targ = nv50_blit_texture_type(ptarg);
+   const unsigned mode = ctx->mode;
+
+   if (!blitter->fp[targ][mode]) {
+      pipe_mutex_lock(blitter->mutex);
+      /* re-check under the lock */
+      if (!blitter->fp[targ][mode])
+         blitter->fp[targ][mode] =
+            nv50_blitter_make_fp(&ctx->nv50->base.pipe, mode, ptarg);
+      pipe_mutex_unlock(blitter->mutex);
+   }
+   ctx->fp = blitter->fp[targ][mode];
+}
+
+/* Bind the blit destination as color render target 0.
+ *
+ * Depth/stencil destinations are rebound through an equivalent color
+ * format so the blit fragment program can write them.  @layer == -1
+ * selects all layers (or all depth slices of a 3D texture).
+ *
+ * NOTE(review): templ is not zero-initialized; presumably
+ * nv50_miptree_surface_new reads only the fields set here -- confirm.
+ */
+static void
+nv50_blit_set_dst(struct nv50_blitctx *ctx,
+                  struct pipe_resource *res, unsigned level, unsigned layer,
+                  enum pipe_format format)
+{
+   struct nv50_context *nv50 = ctx->nv50;
+   struct pipe_context *pipe = &nv50->base.pipe;
+   struct pipe_surface templ;
+
+   if (util_format_is_depth_or_stencil(format))
+      templ.format = nv50_blit_zeta_to_colour_format(format);
+   else
+      templ.format = format;
+
+   templ.u.tex.level = level;
+   templ.u.tex.first_layer = templ.u.tex.last_layer = layer;
+
+   if (layer == -1) {
+      templ.u.tex.first_layer = 0;
+      templ.u.tex.last_layer =
+         (res->target == PIPE_TEXTURE_3D ? res->depth0 : res->array_size) - 1;
+   }
+
+   nv50->framebuffer.cbufs[0] = nv50_miptree_surface_new(pipe, res, &templ);
+   nv50->framebuffer.nr_cbufs = 1;
+   nv50->framebuffer.zsbuf = NULL;
+   nv50->framebuffer.width = nv50->framebuffer.cbufs[0]->width;
+   nv50->framebuffer.height = nv50->framebuffer.cbufs[0]->height;
+}
+
+/* Bind sampler views for the blit source in texture slot set 2.
+ *
+ * View [0] samples the resource with the requested format; if the format
+ * has a distinct stencil-only interpretation (nv50_zs_to_s_format returns
+ * something different), view [1] is added so the fragment program can
+ * fetch stencil through sampler 1.  @layer == -1 selects all layers.
+ *
+ * NOTE(review): templ is not zero-initialized; presumably
+ * nv50_create_texture_view reads only the fields set here -- confirm.
+ */
+static void
+nv50_blit_set_src(struct nv50_blitctx *blit,
+                  struct pipe_resource *res, unsigned level, unsigned layer,
+                  enum pipe_format format, const uint8_t filter)
+{
+   struct nv50_context *nv50 = blit->nv50;
+   struct pipe_context *pipe = &nv50->base.pipe;
+   struct pipe_sampler_view templ;
+   uint32_t flags;
+   enum pipe_texture_target target;
+
+   target = nv50_blit_reinterpret_pipe_texture_target(res->target);
+
+   templ.format = format;
+   templ.u.tex.first_level = templ.u.tex.last_level = level;
+   templ.u.tex.first_layer = templ.u.tex.last_layer = layer;
+   templ.swizzle_r = PIPE_SWIZZLE_RED;
+   templ.swizzle_g = PIPE_SWIZZLE_GREEN;
+   templ.swizzle_b = PIPE_SWIZZLE_BLUE;
+   templ.swizzle_a = PIPE_SWIZZLE_ALPHA;
+
+   if (layer == -1) {
+      templ.u.tex.first_layer = 0;
+      templ.u.tex.last_layer =
+         (res->target == PIPE_TEXTURE_3D ? res->depth0 : res->array_size) - 1;
+   }
+
+   /* mip-mapped sources must use normalized coordinates */
+   flags = res->last_level ? 0 : NV50_TEXVIEW_SCALED_COORDS;
+   flags |= NV50_TEXVIEW_ACCESS_RESOLVE;
+   if (filter && res->nr_samples == 8)
+      flags |= NV50_TEXVIEW_FILTER_MSAA8;
+
+   nv50->textures[2][0] = nv50_create_texture_view(
+      pipe, res, &templ, flags, target);
+   nv50->textures[2][1] = NULL;
+
+   nv50->num_textures[0] = nv50->num_textures[1] = 0;
+   nv50->num_textures[2] = 1;
+
+   /* extra view for fetching stencil out of packed depth/stencil */
+   templ.format = nv50_zs_to_s_format(format);
+   if (templ.format != res->format) {
+      nv50->textures[2][1] = nv50_create_texture_view(
+         pipe, res, &templ, flags, target);
+      nv50->num_textures[2] = 2;
+   }
+}
+
+/* Emit the fixed raw 3D state the blit depends on, bypassing the normal
+ * CSO validation paths: unconditional rendering, plain blending with
+ * the blit's color mask, and rasterizer/zsa state that cannot cull or
+ * modify the blit quad.
+ */
+static void
+nv50_blitctx_prepare_state(struct nv50_blitctx *blit)
+{
+   struct nouveau_pushbuf *push = blit->nv50->base.pushbuf;
+
+   /* ignore the context's conditional-render query during the blit */
+   if (blit->nv50->cond_query) {
+      BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+      PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+   }
+
+   /* blend state */
+   BEGIN_NV04(push, NV50_3D(COLOR_MASK(0)), 1);
+   PUSH_DATA (push, blit->color_mask);
+   BEGIN_NV04(push, NV50_3D(BLEND_ENABLE(0)), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(LOGIC_OP_ENABLE), 1);
+   PUSH_DATA (push, 0);
+
+   /* rasterizer state */
+#ifndef NV50_SCISSORS_CLIPPING
+   BEGIN_NV04(push, NV50_3D(SCISSOR_ENABLE(0)), 1);
+   PUSH_DATA (push, 1);
+#endif
+   BEGIN_NV04(push, NV50_3D(VERTEX_TWO_SIDE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(FRAG_COLOR_CLAMP_EN), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(MSAA_MASK(0)), 4);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   BEGIN_NV04(push, NV50_3D(POLYGON_MODE_FRONT), 3);
+   PUSH_DATA (push, NV50_3D_POLYGON_MODE_FRONT_FILL);
+   PUSH_DATA (push, NV50_3D_POLYGON_MODE_BACK_FILL);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(CULL_FACE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(POLYGON_STIPPLE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(POLYGON_OFFSET_FILL_ENABLE), 1);
+   PUSH_DATA (push, 0);
+
+   /* zsa state */
+   BEGIN_NV04(push, NV50_3D(DEPTH_TEST_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(STENCIL_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(ALPHA_TEST_ENABLE), 1);
+   PUSH_DATA (push, 0);
+}
+
+/* Save the context state the blit will clobber and plug in the blitter's
+ * own vertex/fragment programs, rasterizer state, textures and samplers.
+ * Textures and samplers go into slot set 2 -- presumably the fragment
+ * stage (TODO confirm).  Undone by nv50_blitctx_post_blit().
+ */
+static void
+nv50_blitctx_pre_blit(struct nv50_blitctx *ctx)
+{
+   struct nv50_context *nv50 = ctx->nv50;
+   struct nv50_blitter *blitter = nv50->screen->blitter;
+   int s;
+
+   /* save framebuffer, rasterizer and shader bindings */
+   ctx->saved.fb.width = nv50->framebuffer.width;
+   ctx->saved.fb.height = nv50->framebuffer.height;
+   ctx->saved.fb.nr_cbufs = nv50->framebuffer.nr_cbufs;
+   ctx->saved.fb.cbufs[0] = nv50->framebuffer.cbufs[0];
+   ctx->saved.fb.zsbuf = nv50->framebuffer.zsbuf;
+
+   ctx->saved.rast = nv50->rast;
+
+   ctx->saved.vp = nv50->vertprog;
+   ctx->saved.gp = nv50->gmtyprog;
+   ctx->saved.fp = nv50->fragprog;
+
+   nv50->rast = &ctx->rast;
+
+   nv50->vertprog = &blitter->vp;
+   nv50->gmtyprog = NULL;
+   nv50->fragprog = ctx->fp;
+
+   for (s = 0; s < 3; ++s) {
+      ctx->saved.num_textures[s] = nv50->num_textures[s];
+      ctx->saved.num_samplers[s] = nv50->num_samplers[s];
+   }
+   /* only slot set 2 entries [0] and [1] are replaced by the blit */
+   ctx->saved.texture[0] = nv50->textures[2][0];
+   ctx->saved.texture[1] = nv50->textures[2][1];
+   ctx->saved.sampler[0] = nv50->samplers[2][0];
+   ctx->saved.sampler[1] = nv50->samplers[2][1];
+
+   /* ctx->filter selects nearest (0) or bilinear (1) blitter sampler */
+   nv50->samplers[2][0] = &blitter->sampler[ctx->filter];
+   nv50->samplers[2][1] = &blitter->sampler[ctx->filter];
+
+   nv50->num_samplers[0] = nv50->num_samplers[1] = 0;
+   nv50->num_samplers[2] = 2;
+
+   ctx->saved.dirty = nv50->dirty;
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+
+   /* revalidate everything the blit replaced */
+   nv50->dirty =
+      NV50_NEW_FRAMEBUFFER |
+      NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | NV50_NEW_GMTYPROG |
+      NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS;
+}
+
+/* Restore the context state saved by nv50_blitctx_pre_blit() and mark
+ * everything the blit may have touched as dirty for revalidation.
+ */
+static void
+nv50_blitctx_post_blit(struct nv50_blitctx *blit)
+{
+   struct nv50_context *nv50 = blit->nv50;
+   int s;
+
+   /* drop the temporary render-target surface created for the blit */
+   pipe_surface_reference(&nv50->framebuffer.cbufs[0], NULL);
+
+   nv50->framebuffer.width = blit->saved.fb.width;
+   nv50->framebuffer.height = blit->saved.fb.height;
+   nv50->framebuffer.nr_cbufs = blit->saved.fb.nr_cbufs;
+   nv50->framebuffer.cbufs[0] = blit->saved.fb.cbufs[0];
+   nv50->framebuffer.zsbuf = blit->saved.fb.zsbuf;
+
+   nv50->rast = blit->saved.rast;
+
+   nv50->vertprog = blit->saved.vp;
+   nv50->gmtyprog = blit->saved.gp;
+   nv50->fragprog = blit->saved.fp;
+
+   /* drop the temporary source views before restoring the saved ones */
+   pipe_sampler_view_reference(&nv50->textures[2][0], NULL);
+   pipe_sampler_view_reference(&nv50->textures[2][1], NULL);
+
+   for (s = 0; s < 3; ++s) {
+      nv50->num_textures[s] = blit->saved.num_textures[s];
+      nv50->num_samplers[s] = blit->saved.num_samplers[s];
+   }
+   nv50->textures[2][0] = blit->saved.texture[0];
+   nv50->textures[2][1] = blit->saved.texture[1];
+   nv50->samplers[2][0] = blit->saved.sampler[0];
+   nv50->samplers[2][1] = blit->saved.sampler[1];
+
+   /* re-arm conditional rendering that the blit disabled */
+   if (nv50->cond_query)
+      nv50->base.pipe.render_condition(&nv50->base.pipe, nv50->cond_query,
+                                       nv50->cond_cond, nv50->cond_mode);
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+
+   nv50->dirty = blit->saved.dirty |
+      (NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR | NV50_NEW_SAMPLE_MASK |
+       NV50_NEW_RASTERIZER | NV50_NEW_ZSA | NV50_NEW_BLEND |
+       NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS |
+       NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG | NV50_NEW_FRAGPROG);
+}
+
+
+/* Blit via the 3D engine: bind source views and destination render
+ * target, then draw one oversized screen-space triangle per destination
+ * layer.  The texture coordinates are arranged so that the interpolated
+ * mapping gives the requested src offset/scale; the scissor clips to
+ * the destination rectangle.
+ */
+static void
+nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info)
+{
+   struct nv50_blitctx *blit = nv50->blit;
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct pipe_resource *src = info->src.resource;
+   struct pipe_resource *dst = info->dst.resource;
+   int32_t minx, maxx, miny, maxy;
+   int32_t i;
+   float x0, x1, y0, y1, z;
+   float dz;
+   float x_range, y_range;
+
+   blit->mode = nv50_blit_select_mode(info);
+   blit->color_mask = nv50_blit_derive_color_mask(info);
+   blit->filter = nv50_blit_get_filter(info);
+
+   nv50_blit_select_fp(blit, info);
+   nv50_blitctx_pre_blit(blit);
+
+   nv50_blit_set_dst(blit, dst, info->dst.level, -1, info->dst.format);
+   nv50_blit_set_src(blit, src, info->src.level, -1, info->src.format,
+                     blit->filter);
+
+   nv50_blitctx_prepare_state(blit);
+
+   nv50_state_validate(nv50, ~0, 36);
+
+   /* texcoord = x0/y0 + range * dst_pos; the triangle spans 16384px */
+   x_range = (float)info->src.box.width / (float)info->dst.box.width;
+   y_range = (float)info->src.box.height / (float)info->dst.box.height;
+
+   x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x;
+   y0 = (float)info->src.box.y - y_range * (float)info->dst.box.y;
+
+   x1 = x0 + 16384.0f * x_range;
+   y1 = y0 + 16384.0f * y_range;
+
+   /* scale to sample positions for multisampled sources */
+   x0 *= (float)(1 << nv50_miptree(src)->ms_x);
+   x1 *= (float)(1 << nv50_miptree(src)->ms_x);
+   y0 *= (float)(1 << nv50_miptree(src)->ms_y);
+   y1 *= (float)(1 << nv50_miptree(src)->ms_y);
+
+   if (src->last_level > 0) {
+      /* If there are mip maps, GPU always assumes normalized coordinates. */
+      const unsigned l = info->src.level;
+      const float fh = u_minify(src->width0 << nv50_miptree(src)->ms_x, l);
+      const float fv = u_minify(src->height0 << nv50_miptree(src)->ms_y, l);
+      x0 /= fh;
+      x1 /= fh;
+      y0 /= fv;
+      y1 /= fv;
+   }
+
+   /* XXX: multiply by 6 for cube arrays ? */
+   dz = (float)info->src.box.depth / (float)info->dst.box.depth;
+   z = (float)info->src.box.z;
+   if (nv50_miptree(src)->layout_3d)
+      z += 0.5f * dz; /* sample at the slice center */
+
+   BEGIN_NV04(push, NV50_3D(VIEWPORT_TRANSFORM_EN), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(VIEW_VOLUME_CLIP_CTRL), 1);
+   PUSH_DATA (push, 0x1);
+
+   /* Draw a large triangle in screen coordinates covering the whole
+    * render target, with scissors defining the destination region.
+    * The vertex is supplied with non-normalized texture coordinates
+    * arranged in a way to yield the desired offset and scale.
+    */
+
+   minx = info->dst.box.x;
+   maxx = info->dst.box.x + info->dst.box.width;
+   miny = info->dst.box.y;
+   maxy = info->dst.box.y + info->dst.box.height;
+   if (info->scissor_enable) {
+      minx = MAX2(minx, info->scissor.minx);
+      maxx = MIN2(maxx, info->scissor.maxx);
+      miny = MAX2(miny, info->scissor.miny);
+      maxy = MIN2(maxy, info->scissor.maxy);
+   }
+   BEGIN_NV04(push, NV50_3D(SCISSOR_HORIZ(0)), 2);
+   PUSH_DATA (push, (maxx << 16) | minx);
+   PUSH_DATA (push, (maxy << 16) | miny);
+
+   /* one triangle per destination layer; z steps through the source */
+   for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
+      if (info->dst.box.z + i) {
+         BEGIN_NV04(push, NV50_3D(LAYER), 1);
+         PUSH_DATA (push, info->dst.box.z + i);
+      }
+      PUSH_SPACE(push, 32);
+      BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_3F_X(1)), 3);
+      PUSH_DATAf(push, x0);
+      PUSH_DATAf(push, y0);
+      PUSH_DATAf(push, z);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(0)), 2);
+      PUSH_DATAf(push, 0.0f);
+      PUSH_DATAf(push, 0.0f);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_3F_X(1)), 3);
+      PUSH_DATAf(push, x1);
+      PUSH_DATAf(push, y0);
+      PUSH_DATAf(push, z);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(0)), 2);
+      PUSH_DATAf(push, 16384 << nv50_miptree(dst)->ms_x);
+      PUSH_DATAf(push, 0.0f);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_3F_X(1)), 3);
+      PUSH_DATAf(push, x0);
+      PUSH_DATAf(push, y1);
+      PUSH_DATAf(push, z);
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(0)), 2);
+      PUSH_DATAf(push, 0.0f);
+      PUSH_DATAf(push, 16384 << nv50_miptree(dst)->ms_y);
+      BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+      PUSH_DATA (push, 0);
+   }
+   /* reset the layer index if the loop changed it */
+   if (info->dst.box.z + info->dst.box.depth - 1) {
+      BEGIN_NV04(push, NV50_3D(LAYER), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   /* re-enable normally constant state */
+
+   BEGIN_NV04(push, NV50_3D(VIEWPORT_TRANSFORM_EN), 1);
+   PUSH_DATA (push, 1);
+
+   nv50_blitctx_post_blit(blit);
+}
+
+/* Blit via the 2D engine.  Scaling uses 32.32 fixed-point increments
+ * (du_dx, dv_dy); there is no scaling in the z direction, so layers are
+ * copied 1:1.  Partial component masks are implemented with a DPSDxax
+ * ROP against a pattern color.
+ */
+static void
+nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_miptree *dst = nv50_miptree(info->dst.resource);
+   struct nv50_miptree *src = nv50_miptree(info->src.resource);
+   /* negative box sizes indicate a flipped blit */
+   const int32_t srcx_adj = info->src.box.width < 0 ? -1 : 0;
+   const int32_t srcy_adj = info->src.box.height < 0 ? -1 : 0;
+   const int32_t dz = info->dst.box.z;
+   const int32_t sz = info->src.box.z;
+   uint32_t dstw, dsth;
+   int32_t dstx, dsty;
+   int64_t srcx, srcy;      /* 32.32 fixed-point source position */
+   int64_t du_dx, dv_dy;    /* 32.32 fixed-point source increments */
+   int i;
+   uint32_t mode;
+   uint32_t mask = nv50_blit_eng2d_get_mask(info);
+   boolean b;
+
+   mode = nv50_blit_get_filter(info) ?
+      NV50_2D_BLIT_CONTROL_FILTER_BILINEAR :
+      NV50_2D_BLIT_CONTROL_FILTER_POINT_SAMPLE;
+   /* corner origin for downsampling resolves, center otherwise */
+   mode |= (src->base.base.nr_samples > dst->base.base.nr_samples) ?
+      NV50_2D_BLIT_CONTROL_ORIGIN_CORNER : NV50_2D_BLIT_CONTROL_ORIGIN_CENTER;
+
+   du_dx = ((int64_t)info->src.box.width << 32) / info->dst.box.width;
+   dv_dy = ((int64_t)info->src.box.height << 32) / info->dst.box.height;
+
+   b = info->dst.format == info->src.format;
+   nv50_2d_texture_set(push, 1, dst, info->dst.level, dz, info->dst.format, b);
+   nv50_2d_texture_set(push, 0, src, info->src.level, sz, info->src.format, b);
+
+   if (info->scissor_enable) {
+      BEGIN_NV04(push, NV50_2D(CLIP_X), 5);
+      PUSH_DATA (push, info->scissor.minx << dst->ms_x);
+      PUSH_DATA (push, info->scissor.miny << dst->ms_y);
+      PUSH_DATA (push, (info->scissor.maxx - info->scissor.minx) << dst->ms_x);
+      PUSH_DATA (push, (info->scissor.maxy - info->scissor.miny) << dst->ms_y);
+      PUSH_DATA (push, 1); /* enable */
+   }
+
+   if (mask != 0xffffffff) {
+      /* partial mask: blend src and dst through a ROP with the mask as
+       * pattern color
+       */
+      BEGIN_NV04(push, NV50_2D(ROP), 1);
+      PUSH_DATA (push, 0xca); /* DPSDxax */
+      BEGIN_NV04(push, NV50_2D(PATTERN_COLOR_FORMAT), 1);
+      PUSH_DATA (push, NV50_2D_PATTERN_COLOR_FORMAT_32BPP);
+      BEGIN_NV04(push, NV50_2D(PATTERN_COLOR(0)), 4);
+      PUSH_DATA (push, 0x00000000);
+      PUSH_DATA (push, mask);
+      PUSH_DATA (push, 0xffffffff);
+      PUSH_DATA (push, 0xffffffff);
+      BEGIN_NV04(push, NV50_2D(OPERATION), 1);
+      PUSH_DATA (push, NV50_2D_OPERATION_ROP);
+   } else
+   if (info->src.format != info->dst.format) {
+      if (info->src.format == PIPE_FORMAT_R8_UNORM ||
+          info->src.format == PIPE_FORMAT_R16_UNORM ||
+          info->src.format == PIPE_FORMAT_R16_FLOAT ||
+          info->src.format == PIPE_FORMAT_R32_FLOAT) {
+         mask = 0xffff0000; /* also makes condition for OPERATION reset true */
+         BEGIN_NV04(push, NV50_2D(BETA4), 2);
+         PUSH_DATA (push, mask);
+         PUSH_DATA (push, NV50_2D_OPERATION_SRCCOPY_PREMULT);
+      }
+   }
+
+   /* adjust increments for differing sample grids */
+   if (src->ms_x > dst->ms_x || src->ms_y > dst->ms_y) {
+      /* ms_x is always >= ms_y */
+      du_dx <<= src->ms_x - dst->ms_x;
+      dv_dy <<= src->ms_y - dst->ms_y;
+   } else {
+      du_dx >>= dst->ms_x - src->ms_x;
+      dv_dy >>= dst->ms_y - src->ms_y;
+   }
+
+   srcx = (int64_t)(info->src.box.x + srcx_adj) << (src->ms_x + 32);
+   srcy = (int64_t)(info->src.box.y + srcy_adj) << (src->ms_y + 32);
+
+   if (src->base.base.nr_samples > dst->base.base.nr_samples) {
+      /* center src coordinates for proper MS resolve filtering */
+      srcx += (int64_t)src->ms_x << 32;
+      srcy += (int64_t)src->ms_y << 32;
+   }
+
+   dstx = info->dst.box.x << dst->ms_x;
+   dsty = info->dst.box.y << dst->ms_y;
+
+   dstw = info->dst.box.width << dst->ms_x;
+   dsth = info->dst.box.height << dst->ms_y;
+
+   /* clip negative destination origins, advancing the source to match */
+   if (dstx < 0) {
+      dstw += dstx;
+      srcx -= du_dx * dstx;
+      dstx = 0;
+   }
+   if (dsty < 0) {
+      dsth += dsty;
+      srcy -= dv_dy * dsty;
+      dsty = 0;
+   }
+
+   BEGIN_NV04(push, NV50_2D(BLIT_CONTROL), 1);
+   PUSH_DATA (push, mode);
+   BEGIN_NV04(push, NV50_2D(BLIT_DST_X), 4);
+   PUSH_DATA (push, dstx);
+   PUSH_DATA (push, dsty);
+   PUSH_DATA (push, dstw);
+   PUSH_DATA (push, dsth);
+   BEGIN_NV04(push, NV50_2D(BLIT_DU_DX_FRACT), 4);
+   PUSH_DATA (push, du_dx);
+   PUSH_DATA (push, du_dx >> 32);
+   PUSH_DATA (push, dv_dy);
+   PUSH_DATA (push, dv_dy >> 32);
+
+   BCTX_REFN(nv50->bufctx, 2D, &dst->base, WR);
+   BCTX_REFN(nv50->bufctx, 2D, &src->base, RD);
+   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx);
+   if (nouveau_pushbuf_validate(nv50->base.pushbuf))
+      return;
+
+   /* layer 0 is kicked off by writing the full src coords; subsequent
+    * layers rebind the surfaces and re-trigger via BLIT_SRC_Y_INT
+    */
+   for (i = 0; i < info->dst.box.depth; ++i) {
+      if (i > 0) {
+         /* no scaling in z-direction possible for eng2d blits */
+         if (dst->layout_3d) {
+            BEGIN_NV04(push, NV50_2D(DST_LAYER), 1);
+            PUSH_DATA (push, info->dst.box.z + i);
+         } else {
+            const unsigned z = info->dst.box.z + i;
+            BEGIN_NV04(push, NV50_2D(DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+         }
+         if (src->layout_3d) {
+            /* not possible because of depth tiling */
+            assert(0);
+         } else {
+            const unsigned z = info->src.box.z + i;
+            BEGIN_NV04(push, NV50_2D(SRC_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
+            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+         }
+         BEGIN_NV04(push, NV50_2D(BLIT_SRC_Y_INT), 1); /* trigger */
+         PUSH_DATA (push, srcy >> 32);
+      } else {
+         BEGIN_NV04(push, NV50_2D(BLIT_SRC_X_FRACT), 4);
+         PUSH_DATA (push, srcx);
+         PUSH_DATA (push, srcx >> 32);
+         PUSH_DATA (push, srcy);
+         PUSH_DATA (push, srcy >> 32);
+      }
+   }
+   nv50_bufctx_fence(nv50->bufctx, FALSE);
+
+   nouveau_bufctx_reset(nv50->bufctx, NV50_BIND_2D);
+
+   /* restore state changed for this blit */
+   if (info->scissor_enable) {
+      BEGIN_NV04(push, NV50_2D(CLIP_ENABLE), 1);
+      PUSH_DATA (push, 0);
+   }
+   if (mask != 0xffffffff) {
+      BEGIN_NV04(push, NV50_2D(OPERATION), 1);
+      PUSH_DATA (push, NV50_2D_OPERATION_SRCCOPY);
+   }
+}
+
+/* Generic blit entry point (pipe->blit).
+ *
+ * Decides whether the blit can run on the 2D engine (eng2d) or must use
+ * the 3D engine's textured-quad path.  The 3D path is forced for:
+ *  - Z32F-family depth/stencil destinations and filtered Z/S blits
+ *  - partial color write masks
+ *  - 3D-tiled (layout_3d) sources and depth-resizing array blits
+ *  - format conversions the 2D engine cannot handle faithfully
+ *  - multisampled sources or destinations, and any scaling
+ */
+static void
+nv50_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   boolean eng3d = FALSE;
+
+   if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
+      if (!(info->mask & PIPE_MASK_ZS))
+         return;
+      if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
+          info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+         eng3d = TRUE;
+      if (info->filter != PIPE_TEX_FILTER_NEAREST)
+         eng3d = TRUE;
+   } else {
+      if (!(info->mask & PIPE_MASK_RGBA))
+         return;
+      if (info->mask != PIPE_MASK_RGBA)
+         eng3d = TRUE;
+   }
+
+   if (nv50_miptree(info->src.resource)->layout_3d) {
+      eng3d = TRUE;
+   } else
+   if (info->src.box.depth != info->dst.box.depth) {
+      eng3d = TRUE;
+      debug_printf("blit: cannot filter array or cube textures in z direction");
+   }
+
+   if (!eng3d && info->dst.format != info->src.format) {
+      /* Bug fix: only test the destination format here.  The source
+       * format check was previously OR'd into this condition as well,
+       * which made the "else if (!nv50_2d_src_format_faithful(...))"
+       * branch below unreachable dead code.
+       */
+      if (!nv50_2d_dst_format_faithful(info->dst.format)) {
+         eng3d = TRUE;
+      } else
+      if (!nv50_2d_src_format_faithful(info->src.format)) {
+         if (!util_format_is_luminance(info->src.format)) {
+            if (util_format_is_intensity(info->src.format))
+               eng3d = TRUE;
+            else
+            if (!nv50_2d_dst_format_ops_supported(info->dst.format))
+               eng3d = TRUE;
+            else
+               eng3d = !nv50_2d_format_supported(info->src.format);
+         }
+      } else
+      if (util_format_is_luminance_alpha(info->src.format))
+         eng3d = TRUE;
+   }
+
+   /* 8x MSAA resolve needs the shader-based FILTER_MSAA8 path */
+   if (info->src.resource->nr_samples == 8 &&
+       info->dst.resource->nr_samples <= 1)
+      eng3d = TRUE;
+
+   /* FIXME: can't make this work with eng2d anymore */
+   if (info->src.resource->nr_samples > 1 ||
+       info->dst.resource->nr_samples > 1)
+      eng3d = TRUE;
+
+   /* FIXME: find correct src coordinate adjustments */
+   if ((info->src.box.width != info->dst.box.width &&
+        info->src.box.width != -info->dst.box.width) ||
+       (info->src.box.height != info->dst.box.height &&
+        info->src.box.height != -info->dst.box.height))
+      eng3d = TRUE;
+
+   if (!eng3d)
+      nv50_blit_eng2d(nv50, info);
+   else
+      nv50_blit_3d(nv50, info);
+}
+
+/* Allocate and initialize the screen-wide blitter: the shared vertex
+ * program and the two blit samplers.  Fragment programs are created
+ * lazily by nv50_blit_select_fp().  Returns FALSE on allocation failure.
+ */
+boolean
+nv50_blitter_create(struct nv50_screen *screen)
+{
+   struct nv50_blitter *blitter = CALLOC_STRUCT(nv50_blitter);
+
+   if (!blitter) {
+      NOUVEAU_ERR("failed to allocate blitter struct\n");
+      return FALSE;
+   }
+   screen->blitter = blitter;
+
+   pipe_mutex_init(blitter->mutex);
+
+   nv50_blitter_make_vp(blitter);
+   nv50_blitter_make_sampler(blitter);
+
+   return TRUE;
+}
+
+/* Free the screen's blitter and every fragment program it created. */
+void
+nv50_blitter_destroy(struct nv50_screen *screen)
+{
+   struct nv50_blitter *blitter = screen->blitter;
+   unsigned i, m;
+
+   for (i = 0; i < NV50_BLIT_MAX_TEXTURE_TYPES; ++i)
+      for (m = 0; m < NV50_BLIT_MODES; ++m) {
+         struct nv50_program *prog = blitter->fp[i][m];
+
+         if (!prog)
+            continue;
+         nv50_program_destroy(NULL, prog);
+         FREE((void *)prog->pipe.tokens);
+         FREE(prog);
+      }
+
+   FREE(blitter);
+}
+
+/* Allocate the per-context blit state.  Returns FALSE on allocation
+ * failure.
+ */
+boolean
+nv50_blitctx_create(struct nv50_context *nv50)
+{
+   struct nv50_blitctx *ctx = CALLOC_STRUCT(nv50_blitctx);
+
+   if (!ctx) {
+      NOUVEAU_ERR("failed to allocate blit context\n");
+      return FALSE;
+   }
+
+   ctx->nv50 = nv50;
+   /* the blit quad is specified in pixel coordinates */
+   ctx->rast.pipe.half_pixel_center = 1;
+
+   nv50->blit = ctx;
+
+   return TRUE;
+}
+
+/* Install the surface-related entry points on the context. */
+void
+nv50_init_surface_functions(struct nv50_context *nv50)
+{
+   nv50->base.pipe.resource_copy_region = nv50_resource_copy_region;
+   nv50->base.pipe.blit = nv50_blit;
+   nv50->base.pipe.clear_render_target = nv50_clear_render_target;
+   nv50->base.pipe.clear_depth_stencil = nv50_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
new file mode 100644
index 00000000000..9e512928381
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_resource.h"
+#include "nv50/nv50_texture.xml.h"
+#include "nv50/nv50_defs.xml.h"
+
+#include "util/u_format.h"
+
+#define NV50_TIC_0_SWIZZLE__MASK \
+ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \
+ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
+
+/* Translate a gallium swizzle selector into the hardware map value,
+ * extracting the per-component map from the format's default TIC word
+ * @tc for R/G/B/A selectors.
+ */
+static INLINE uint32_t
+nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+{
+   if (swz == PIPE_SWIZZLE_RED)
+      return (tc & NV50_TIC_0_MAPR__MASK) >> NV50_TIC_0_MAPR__SHIFT;
+   if (swz == PIPE_SWIZZLE_GREEN)
+      return (tc & NV50_TIC_0_MAPG__MASK) >> NV50_TIC_0_MAPG__SHIFT;
+   if (swz == PIPE_SWIZZLE_BLUE)
+      return (tc & NV50_TIC_0_MAPB__MASK) >> NV50_TIC_0_MAPB__SHIFT;
+   if (swz == PIPE_SWIZZLE_ALPHA)
+      return (tc & NV50_TIC_0_MAPA__MASK) >> NV50_TIC_0_MAPA__SHIFT;
+   if (swz == PIPE_SWIZZLE_ONE)
+      return tex_int ? NV50_TIC_MAP_ONE_INT : NV50_TIC_MAP_ONE_FLOAT;
+   /* PIPE_SWIZZLE_ZERO and anything unexpected */
+   return NV50_TIC_MAP_ZERO;
+}
+
+/* pipe->create_sampler_view: forward to nv50_create_texture_view with
+ * the resource's own target; RECT and BUFFER resources use unnormalized
+ * (scaled) coordinates.
+ */
+struct pipe_sampler_view *
+nv50_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *res,
+                         const struct pipe_sampler_view *templ)
+{
+   const boolean scaled =
+      res->target == PIPE_TEXTURE_RECT || res->target == PIPE_BUFFER;
+
+   return nv50_create_texture_view(pipe, res, templ,
+                                   scaled ? NV50_TEXVIEW_SCALED_COORDS : 0,
+                                   res->target);
+}
+
+/* Create a sampler view with explicit texture view flags and target.
+ *
+ * Fills in the 8-dword TIC (texture image control) entry describing the
+ * resource.  Linear (non-memtype) resources take the early BUFFER/RECT
+ * path; tiled miptrees get the full target/size/level setup.
+ *
+ * Returns NULL on allocation failure or invalid target.
+ */
+struct pipe_sampler_view *
+nv50_create_texture_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ,
+                         uint32_t flags,
+                         enum pipe_texture_target target)
+{
+   const struct util_format_description *desc;
+   uint64_t addr;
+   uint32_t *tic;
+   uint32_t swz[4];
+   uint32_t depth;
+   struct nv50_tic_entry *view;
+   struct nv50_miptree *mt = nv50_miptree(texture);
+   boolean tex_int;
+
+   view = MALLOC_STRUCT(nv50_tic_entry);
+   if (!view)
+      return NULL;
+
+   view->pipe = *templ;
+   view->pipe.reference.count = 1;
+   view->pipe.texture = NULL;
+   view->pipe.context = pipe;
+
+   view->id = -1; /* no TIC slot allocated yet */
+
+   pipe_resource_reference(&view->pipe.texture, texture);
+
+   tic = &view->tic[0];
+
+   desc = util_format_description(view->pipe.format);
+
+   /* TIC[0]: format plus component swizzle */
+
+   tic[0] = nv50_format_table[view->pipe.format].tic;
+
+   tex_int = util_format_is_pure_integer(view->pipe.format);
+
+   swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r, tex_int);
+   swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g, tex_int);
+   swz[2] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_b, tex_int);
+   swz[3] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_a, tex_int);
+   tic[0] = (tic[0] & ~NV50_TIC_0_SWIZZLE__MASK) |
+      (swz[0] << NV50_TIC_0_MAPR__SHIFT) |
+      (swz[1] << NV50_TIC_0_MAPG__SHIFT) |
+      (swz[2] << NV50_TIC_0_MAPB__SHIFT) |
+      (swz[3] << NV50_TIC_0_MAPA__SHIFT);
+
+   addr = mt->base.address;
+
+   if (mt->base.base.target == PIPE_TEXTURE_1D_ARRAY ||
+       mt->base.base.target == PIPE_TEXTURE_2D_ARRAY) {
+      /* view into a layer range: offset to the first layer */
+      addr += view->pipe.u.tex.first_layer * mt->layer_stride;
+      depth = view->pipe.u.tex.last_layer - view->pipe.u.tex.first_layer + 1;
+   } else {
+      depth = mt->base.base.depth0;
+   }
+
+   tic[2] = 0x10001000 | NV50_TIC_2_NO_BORDER;
+
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+      tic[2] |= NV50_TIC_2_COLORSPACE_SRGB;
+
+   if (!(flags & NV50_TEXVIEW_SCALED_COORDS))
+      tic[2] |= NV50_TIC_2_NORMALIZED_COORDS;
+
+   /* linear (non-tiled) memory: only buffers and rect textures */
+   if (unlikely(!nouveau_bo_memtype(nv04_resource(texture)->bo))) {
+      if (target == PIPE_BUFFER) {
+         addr += view->pipe.u.buf.first_element * desc->block.bits / 8;
+         tic[2] |= NV50_TIC_2_LINEAR | NV50_TIC_2_TARGET_BUFFER;
+         tic[3] = 0;
+         tic[4] = /* width */
+            view->pipe.u.buf.last_element - view->pipe.u.buf.first_element + 1;
+         tic[5] = 0;
+      } else {
+         tic[2] |= NV50_TIC_2_LINEAR | NV50_TIC_2_TARGET_RECT;
+         tic[3] = mt->level[0].pitch;
+         tic[4] = mt->base.base.width0;
+         tic[5] = (1 << 16) | mt->base.base.height0;
+      }
+      tic[6] =
+      tic[7] = 0;
+      tic[1] = addr;
+      tic[2] |= addr >> 32;
+      return &view->pipe;
+   }
+
+   tic[1] = addr;
+   tic[2] |= (addr >> 32) & 0xff;
+
+   tic[2] |=
+      ((mt->level[0].tile_mode & 0x0f0) << (22 - 4)) |
+      ((mt->level[0].tile_mode & 0xf00) << (25 - 8));
+
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+      tic[2] |= NV50_TIC_2_TARGET_1D;
+      break;
+   case PIPE_TEXTURE_2D:
+      tic[2] |= NV50_TIC_2_TARGET_2D;
+      break;
+   case PIPE_TEXTURE_RECT:
+      tic[2] |= NV50_TIC_2_TARGET_RECT;
+      break;
+   case PIPE_TEXTURE_3D:
+      tic[2] |= NV50_TIC_2_TARGET_3D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      depth /= 6;
+      tic[2] |= NV50_TIC_2_TARGET_CUBE;
+      break;
+   case PIPE_TEXTURE_1D_ARRAY:
+      tic[2] |= NV50_TIC_2_TARGET_1D_ARRAY;
+      break;
+   case PIPE_TEXTURE_2D_ARRAY:
+      tic[2] |= NV50_TIC_2_TARGET_2D_ARRAY;
+      break;
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      depth /= 6;
+      tic[2] |= NV50_TIC_2_TARGET_CUBE_ARRAY;
+      break;
+   case PIPE_BUFFER:
+      assert(0); /* should be linear and handled above ! */
+      tic[2] |= NV50_TIC_2_TARGET_BUFFER | NV50_TIC_2_LINEAR;
+      break;
+   default:
+      NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target);
+      /* Bug fix: release the view (and its resource reference) and
+       * return NULL instead of FALSE from a pointer-returning function.
+       */
+      pipe_resource_reference(&view->pipe.texture, NULL);
+      FREE(view);
+      return NULL;
+   }
+
+   tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
+
+   tic[4] = (1 << 31) | (mt->base.base.width0 << mt->ms_x);
+
+   tic[5] = (mt->base.base.height0 << mt->ms_y) & 0xffff;
+   tic[5] |= depth << 16;
+   tic[5] |= mt->base.base.last_level << NV50_TIC_5_LAST_LEVEL__SHIFT;
+
+   tic[6] = (mt->ms_x > 1) ? 0x88000000 : 0x03000000; /* sampling points */
+
+   tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+
+   if (unlikely(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS)))
+      if (mt->base.base.last_level)
+         tic[5] &= ~NV50_TIC_5_LAST_LEVEL__MASK;
+
+   return &view->pipe;
+}
+
+/* Upload/bind the texture image control (TIC) entries for shader stage
+ * @s.  Newly seen views get a TIC slot allocated and their 8-dword entry
+ * uploaded into the TXC area via the 2D engine's SIFC path; already
+ * uploaded views that were written by the GPU get a texture cache flush.
+ * Returns TRUE if a TIC_FLUSH is required afterwards.
+ */
+static boolean
+nv50_validate_tic(struct nv50_context *nv50, int s)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nouveau_bo *txc = nv50->screen->txc;
+   unsigned i;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nv50->num_textures[s]; ++i) {
+      struct nv50_tic_entry *tic = nv50_tic_entry(nv50->textures[s][i]);
+      struct nv04_resource *res;
+
+      if (!tic) {
+         /* unbind this slot */
+         BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1);
+         PUSH_DATA (push, (i << 1) | 0);
+         continue;
+      }
+      res = &nv50_miptree(tic->pipe.texture)->base;
+
+      if (tic->id < 0) {
+         tic->id = nv50_screen_tic_alloc(nv50->screen, tic);
+
+         /* upload the 32-byte TIC entry through the 2D engine (SIFC) */
+         BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
+         PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+         PUSH_DATA (push, 1);
+         BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
+         PUSH_DATA (push, 262144);
+         PUSH_DATA (push, 65536);
+         PUSH_DATA (push, 1);
+         PUSH_DATAh(push, txc->offset);
+         PUSH_DATA (push, txc->offset);
+         BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+         BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
+         PUSH_DATA (push, 32);
+         PUSH_DATA (push, 1);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 1);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 1);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, tic->id * 32);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         BEGIN_NI04(push, NV50_2D(SIFC_DATA), 8);
+         PUSH_DATAp(push, &tic->tic[0], 8);
+
+         need_flush = TRUE;
+      } else
+      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+         BEGIN_NV04(push, NV50_3D(TEX_CACHE_CTL), 1);
+         PUSH_DATA (push, 0x20);
+      }
+
+      nv50->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+      /* Bug fix: clear the GPU_WRITING bit and mark the resource as
+       * being read.  The missing '~' previously cleared every status
+       * bit EXCEPT GPU_WRITING, which is the opposite of the intent.
+       */
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+      BCTX_REFN(nv50->bufctx_3d, TEXTURES, res, RD);
+
+      BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1);
+      PUSH_DATA (push, (tic->id << 9) | (i << 1) | 1);
+   }
+   /* unbind trailing slots that were bound last time */
+   for (; i < nv50->state.num_textures[s]; ++i) {
+      BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1);
+      PUSH_DATA (push, (i << 1) | 0);
+   }
+   nv50->state.num_textures[s] = nv50->num_textures[s];
+
+   return need_flush;
+}
+
+/* Validate textures for the stages that use them and emit a TIC_FLUSH
+ * if any TIC entry was (re)uploaded.
+ */
+void nv50_validate_textures(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   boolean need_flush;
+
+   need_flush = nv50_validate_tic(nv50, 0);
+   need_flush |= nv50_validate_tic(nv50, 2);
+
+   if (need_flush) {
+      BEGIN_NV04(push, NV50_3D(TIC_FLUSH), 1);
+      PUSH_DATA (push, 0);
+   }
+}
+
+/* Upload/bind the texture sampler control (TSC) entries for shader
+ * stage @s.  Newly seen samplers get a TSC slot allocated and their
+ * 32-byte entry uploaded into the TXC area (at offset 65536 + id * 32).
+ * Returns TRUE if a TSC_FLUSH is required afterwards.
+ */
+static boolean
+nv50_validate_tsc(struct nv50_context *nv50, int s)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   unsigned i;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nv50->num_samplers[s]; ++i) {
+      struct nv50_tsc_entry *tsc = nv50_tsc_entry(nv50->samplers[s][i]);
+
+      if (!tsc) {
+         /* unbind this slot */
+         BEGIN_NV04(push, NV50_3D(BIND_TSC(s)), 1);
+         PUSH_DATA (push, (i << 4) | 0);
+         continue;
+      }
+      if (tsc->id < 0) {
+         tsc->id = nv50_screen_tsc_alloc(nv50->screen, tsc);
+
+         nv50_sifc_linear_u8(&nv50->base, nv50->screen->txc,
+                             65536 + tsc->id * 32,
+                             NOUVEAU_BO_VRAM, 32, tsc->tsc);
+         need_flush = TRUE;
+      }
+      nv50->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
+
+      BEGIN_NV04(push, NV50_3D(BIND_TSC(s)), 1);
+      PUSH_DATA (push, (tsc->id << 12) | (i << 4) | 1);
+   }
+   /* unbind trailing slots that were bound last time */
+   for (; i < nv50->state.num_samplers[s]; ++i) {
+      BEGIN_NV04(push, NV50_3D(BIND_TSC(s)), 1);
+      PUSH_DATA (push, (i << 4) | 0);
+   }
+   nv50->state.num_samplers[s] = nv50->num_samplers[s];
+
+   return need_flush;
+}
+
+/* Validate samplers for the stages that use them and emit a TSC_FLUSH
+ * if any TSC entry was (re)uploaded.
+ */
+void nv50_validate_samplers(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   boolean need_flush;
+
+   need_flush = nv50_validate_tsc(nv50, 0);
+   need_flush |= nv50_validate_tsc(nv50, 2);
+
+   if (need_flush) {
+      BEGIN_NV04(push, NV50_3D(TSC_FLUSH), 1);
+      PUSH_DATA (push, 0);
+   }
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_texture.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_texture.xml.h
new file mode 100644
index 00000000000..31eab9b5d87
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_texture.xml.h
@@ -0,0 +1,306 @@
+#ifndef NV50_TEXTURE_XML
+#define NV50_TEXTURE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/nv50_texture.xml ( 8648 bytes, from 2013-04-13 12:49:11)
+- rnndb/copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12)
+- rnndb/nvchipsets.xml ( 3954 bytes, from 2013-03-26 01:26:43)
+- rnndb/nv50_defs.xml ( 16652 bytes, from 2013-04-04 10:57:15)
+
+Copyright (C) 2006-2013 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#define NV50_TIC_MAP_ZERO 0x00000000
+#define NV50_TIC_MAP_C0 0x00000002
+#define NV50_TIC_MAP_C1 0x00000003
+#define NV50_TIC_MAP_C2 0x00000004
+#define NV50_TIC_MAP_C3 0x00000005
+#define NV50_TIC_MAP_ONE_INT 0x00000006
+#define NV50_TIC_MAP_ONE_FLOAT 0x00000007
+#define NV50_TIC_TYPE_SNORM 0x00000001
+#define NV50_TIC_TYPE_UNORM 0x00000002
+#define NV50_TIC_TYPE_SINT 0x00000003
+#define NV50_TIC_TYPE_UINT 0x00000004
+#define NV50_TIC_TYPE_SSCALED 0x00000005
+#define NV50_TIC_TYPE_USCALED 0x00000006
+#define NV50_TIC_TYPE_FLOAT 0x00000007
+#define NV50_TSC_WRAP_REPEAT 0x00000000
+#define NV50_TSC_WRAP_MIRROR_REPEAT 0x00000001
+#define NV50_TSC_WRAP_CLAMP_TO_EDGE 0x00000002
+#define NV50_TSC_WRAP_CLAMP_TO_BORDER 0x00000003
+#define NV50_TSC_WRAP_CLAMP 0x00000004
+#define NV50_TSC_WRAP_MIRROR_CLAMP_TO_EDGE 0x00000005
+#define NV50_TSC_WRAP_MIRROR_CLAMP_TO_BORDER 0x00000006
+#define NV50_TSC_WRAP_MIRROR_CLAMP 0x00000007
+#define NV50_TIC__SIZE 0x00000020
+#define NV50_TIC_0 0x00000000
+#define NV50_TIC_0_MAPA__MASK 0x38000000
+#define NV50_TIC_0_MAPA__SHIFT 27
+#define NV50_TIC_0_MAPB__MASK 0x07000000
+#define NV50_TIC_0_MAPB__SHIFT 24
+#define NV50_TIC_0_MAPG__MASK 0x00e00000
+#define NV50_TIC_0_MAPG__SHIFT 21
+#define NV50_TIC_0_MAPR__MASK 0x001c0000
+#define NV50_TIC_0_MAPR__SHIFT 18
+#define NV50_TIC_0_TYPE3__MASK 0x00038000
+#define NV50_TIC_0_TYPE3__SHIFT 15
+#define NV50_TIC_0_TYPE2__MASK 0x00007000
+#define NV50_TIC_0_TYPE2__SHIFT 12
+#define NV50_TIC_0_TYPE1__MASK 0x00000e00
+#define NV50_TIC_0_TYPE1__SHIFT 9
+#define NV50_TIC_0_TYPE0__MASK 0x000001c0
+#define NV50_TIC_0_TYPE0__SHIFT 6
+#define NV50_TIC_0_FMT__MASK 0x0000003f
+#define NV50_TIC_0_FMT__SHIFT 0
+#define NV50_TIC_0_FMT_32_32_32_32 0x00000001
+#define NVC0_TIC_0_FMT_32_32_32 0x00000002
+#define NV50_TIC_0_FMT_16_16_16_16 0x00000003
+#define NV50_TIC_0_FMT_32_32 0x00000004
+#define NV50_TIC_0_FMT_32_8_X24 0x00000005
+#define NV50_TIC_0_FMT_8_8_8_8 0x00000008
+#define NV50_TIC_0_FMT_10_10_10_2 0x00000009
+#define NV50_TIC_0_FMT_16_16 0x0000000c
+#define NV50_TIC_0_FMT_24_8 0x0000000d
+#define NV50_TIC_0_FMT_8_24 0x0000000e
+#define NV50_TIC_0_FMT_32 0x0000000f
+#define NV50_TIC_0_FMT_BPTC_FLOAT 0x00000010
+#define NV50_TIC_0_FMT_BPTC_UFLOAT 0x00000011
+#define NV50_TIC_0_FMT_4_4_4_4 0x00000012
+#define NV50_TIC_0_FMT_1_5_5_5 0x00000013
+#define NV50_TIC_0_FMT_5_5_5_1 0x00000014
+#define NV50_TIC_0_FMT_5_6_5 0x00000015
+#define NV50_TIC_0_FMT_5_5_6 0x00000016
+#define NV50_TIC_0_FMT_BPTC 0x00000017
+#define NV50_TIC_0_FMT_8_8 0x00000018
+#define NV50_TIC_0_FMT_16 0x0000001b
+#define NV50_TIC_0_FMT_8 0x0000001d
+#define NV50_TIC_0_FMT_4_4 0x0000001e
+#define NV50_TIC_0_FMT_BITMAP 0x0000001f
+#define NV50_TIC_0_FMT_9_9_9_E5 0x00000020
+#define NV50_TIC_0_FMT_11_11_10 0x00000021
+#define NV50_TIC_0_FMT_U8_YA8_V8_YB8 0x00000022
+#define NV50_TIC_0_FMT_YA8_U8_YB8_V8 0x00000023
+#define NV50_TIC_0_FMT_DXT1 0x00000024
+#define NV50_TIC_0_FMT_DXT3 0x00000025
+#define NV50_TIC_0_FMT_DXT5 0x00000026
+#define NV50_TIC_0_FMT_RGTC1 0x00000027
+#define NV50_TIC_0_FMT_RGTC2 0x00000028
+#define NV50_TIC_0_FMT_S8_Z24 0x00000029
+#define NV50_TIC_0_FMT_Z24_X8 0x0000002a
+#define NV50_TIC_0_FMT_Z24_S8 0x0000002b
+#define NV50_TIC_0_FMT_Z24_C8_MS4_CS4 0x0000002c
+#define NV50_TIC_0_FMT_Z24_C8_MS8_CS8 0x0000002d
+#define NV50_TIC_0_FMT_Z24_C8_MS4_CS12 0x0000002e
+#define NV50_TIC_0_FMT_Z32 0x0000002f
+#define NV50_TIC_0_FMT_Z32_S8_X24 0x00000030
+#define NV50_TIC_0_FMT_Z24_X8_S8_C8_X16_MS4_CS4 0x00000031
+#define NV50_TIC_0_FMT_Z24_X8_S8_C8_X16_MS8_CS8 0x00000032
+#define NV50_TIC_0_FMT_Z32_X8_C8_X16_MS4_CS4 0x00000033
+#define NV50_TIC_0_FMT_Z32_X8_C8_X16_MS8_CS8 0x00000034
+#define NV50_TIC_0_FMT_Z32_S8_C8_X16_MS4_CS4 0x00000035
+#define NV50_TIC_0_FMT_Z32_S8_C8_X16_MS8_CS8 0x00000036
+#define NV50_TIC_0_FMT_Z24_X8_S8_C8_X16_MS4_CS12 0x00000037
+#define NV50_TIC_0_FMT_Z32_X8_C8_X16_MS4_CS12 0x00000038
+#define NV50_TIC_0_FMT_Z32_S8_C8_X16_MS4_CS12 0x00000039
+#define NV50_TIC_0_FMT_Z16 0x0000003a
+
+#define NV50_TIC_1 0x00000004
+#define NV50_TIC_1_OFFSET_LOW__MASK 0xffffffff
+#define NV50_TIC_1_OFFSET_LOW__SHIFT 0
+
+#define NV50_TIC_2 0x00000008
+#define NV50_TIC_2_OFFSET_HIGH__MASK 0x000000ff
+#define NV50_TIC_2_OFFSET_HIGH__SHIFT 0
+#define NV50_TIC_2_COLORSPACE_SRGB 0x00000400
+#define NV50_TIC_2_TARGET__MASK 0x0003c000
+#define NV50_TIC_2_TARGET__SHIFT 14
+#define NV50_TIC_2_TARGET_1D 0x00000000
+#define NV50_TIC_2_TARGET_2D 0x00004000
+#define NV50_TIC_2_TARGET_3D 0x00008000
+#define NV50_TIC_2_TARGET_CUBE 0x0000c000
+#define NV50_TIC_2_TARGET_1D_ARRAY 0x00010000
+#define NV50_TIC_2_TARGET_2D_ARRAY 0x00014000
+#define NV50_TIC_2_TARGET_BUFFER 0x00018000
+#define NV50_TIC_2_TARGET_RECT 0x0001c000
+#define NV50_TIC_2_TARGET_CUBE_ARRAY 0x00020000
+#define NV50_TIC_2_LINEAR 0x00040000
+#define NV50_TIC_2_TILE_MODE_X__MASK 0x00380000
+#define NV50_TIC_2_TILE_MODE_X__SHIFT 19
+#define NV50_TIC_2_TILE_MODE_Y__MASK 0x01c00000
+#define NV50_TIC_2_TILE_MODE_Y__SHIFT 22
+#define NV50_TIC_2_TILE_MODE_Z__MASK 0x0e000000
+#define NV50_TIC_2_TILE_MODE_Z__SHIFT 25
+#define NV50_TIC_2_2D_UNK0258__MASK 0x30000000
+#define NV50_TIC_2_2D_UNK0258__SHIFT 28
+#define NV50_TIC_2_NO_BORDER 0x40000000
+#define NV50_TIC_2_NORMALIZED_COORDS 0x80000000
+
+#define NV50_TIC_3 0x0000000c
+#define NV50_TIC_3_PITCH__MASK 0xffffffff
+#define NV50_TIC_3_PITCH__SHIFT 0
+
+#define NV50_TIC_4 0x00000010
+#define NV50_TIC_4_WIDTH__MASK 0xffffffff
+#define NV50_TIC_4_WIDTH__SHIFT 0
+
+#define NV50_TIC_5 0x00000014
+#define NV50_TIC_5_LAST_LEVEL__MASK 0xf0000000
+#define NV50_TIC_5_LAST_LEVEL__SHIFT 28
+#define NV50_TIC_5_DEPTH__MASK 0x0fff0000
+#define NV50_TIC_5_DEPTH__SHIFT 16
+#define NV50_TIC_5_HEIGHT__MASK 0x0000ffff
+#define NV50_TIC_5_HEIGHT__SHIFT 0
+
+#define NV50_TIC_7 0x0000001c
+#define NV50_TIC_7_BASE_LEVEL__MASK 0x0000000f
+#define NV50_TIC_7_BASE_LEVEL__SHIFT 0
+#define NV50_TIC_7_MAX_LEVEL__MASK 0x000000f0
+#define NV50_TIC_7_MAX_LEVEL__SHIFT 4
+#define NV50_TIC_7_MS_MODE__MASK 0x0000f000
+#define NV50_TIC_7_MS_MODE__SHIFT 12
+#define NV50_TIC_7_MS_MODE_MS1 0x00000000
+#define NV50_TIC_7_MS_MODE_MS2 0x00001000
+#define NV50_TIC_7_MS_MODE_MS4 0x00002000
+#define NV50_TIC_7_MS_MODE_MS8 0x00003000
+#define NVA3_TIC_7_MS_MODE_MS8_ALT 0x00004000
+#define NVA3_TIC_7_MS_MODE_MS2_ALT 0x00005000
+#define NVC0_TIC_7_MS_MODE_UNK6 0x00006000
+#define NV50_TIC_7_MS_MODE_MS4_CS4 0x00008000
+#define NV50_TIC_7_MS_MODE_MS4_CS12 0x00009000
+#define NV50_TIC_7_MS_MODE_MS8_CS8 0x0000a000
+#define NVC0_TIC_7_MS_MODE_MS8_CS24 0x0000b000
+
+#define NV50_TSC__SIZE 0x00000020
+#define NV50_TSC_0 0x00000000
+#define NV50_TSC_0_WRAPS__MASK 0x00000007
+#define NV50_TSC_0_WRAPS__SHIFT 0
+#define NV50_TSC_0_WRAPT__MASK 0x00000038
+#define NV50_TSC_0_WRAPT__SHIFT 3
+#define NV50_TSC_0_WRAPR__MASK 0x000001c0
+#define NV50_TSC_0_WRAPR__SHIFT 6
+#define NV50_TSC_0_SHADOW_COMPARE_ENABLE 0x00000200
+#define NV50_TSC_0_SHADOW_COMPARE_FUNC__MASK 0x00001c00
+#define NV50_TSC_0_SHADOW_COMPARE_FUNC__SHIFT 10
+#define NV50_TSC_0_SRGB_CONVERSION_ALLOWED 0x00002000
+#define NV50_TSC_0_BOX_S__MASK 0x0001c000
+#define NV50_TSC_0_BOX_S__SHIFT 14
+#define NV50_TSC_0_BOX_T__MASK 0x000e0000
+#define NV50_TSC_0_BOX_T__SHIFT 17
+#define NV50_TSC_0_ANISOTROPY_MASK__MASK 0x00700000
+#define NV50_TSC_0_ANISOTROPY_MASK__SHIFT 20
+
+#define NV50_TSC_1 0x00000004
+#define NV50_TSC_1_UNKN_ANISO_15 0x10000000
+#define NV50_TSC_1_UNKN_ANISO_35 0x18000000
+#define NV50_TSC_1_MAGF__MASK 0x00000003
+#define NV50_TSC_1_MAGF__SHIFT 0
+#define NV50_TSC_1_MAGF_NEAREST 0x00000001
+#define NV50_TSC_1_MAGF_LINEAR 0x00000002
+#define NV50_TSC_1_MINF__MASK 0x00000030
+#define NV50_TSC_1_MINF__SHIFT 4
+#define NV50_TSC_1_MINF_NEAREST 0x00000010
+#define NV50_TSC_1_MINF_LINEAR 0x00000020
+#define NV50_TSC_1_MIPF__MASK 0x000000c0
+#define NV50_TSC_1_MIPF__SHIFT 6
+#define NV50_TSC_1_MIPF_NONE 0x00000040
+#define NV50_TSC_1_MIPF_NEAREST 0x00000080
+#define NV50_TSC_1_MIPF_LINEAR 0x000000c0
+#define NVE4_TSC_1_CUBE_SEAMLESS 0x00000200
+#define NV50_TSC_1_LOD_BIAS__MASK 0x01fff000
+#define NV50_TSC_1_LOD_BIAS__SHIFT 12
+#define NVE4_TSC_1_FORCE_NONNORMALIZED_COORDS 0x02000000
+
+#define NV50_TSC_2 0x00000008
+#define NV50_TSC_2_MIN_LOD__MASK 0x00000fff
+#define NV50_TSC_2_MIN_LOD__SHIFT 0
+#define NV50_TSC_2_MAX_LOD__MASK 0x00fff000
+#define NV50_TSC_2_MAX_LOD__SHIFT 12
+#define NV50_TSC_2_BORDER_COLOR_SRGB_RED__MASK 0xff000000
+#define NV50_TSC_2_BORDER_COLOR_SRGB_RED__SHIFT 24
+
+#define NV50_TSC_3 0x0000000c
+#define NV50_TSC_3_BORDER_COLOR_SRGB_GREEN__MASK 0x000ff000
+#define NV50_TSC_3_BORDER_COLOR_SRGB_GREEN__SHIFT 12
+#define NV50_TSC_3_BORDER_COLOR_SRGB_BLUE__MASK 0x0ff00000
+#define NV50_TSC_3_BORDER_COLOR_SRGB_BLUE__SHIFT 20
+
+#define NV50_TSC_4 0x00000010
+#define NV50_TSC_4_BORDER_COLOR_RED__MASK 0xffffffff
+#define NV50_TSC_4_BORDER_COLOR_RED__SHIFT 0
+
+#define NV50_TSC_5 0x00000014
+#define NV50_TSC_5_BORDER_COLOR_GREEN__MASK 0xffffffff
+#define NV50_TSC_5_BORDER_COLOR_GREEN__SHIFT 0
+
+#define NV50_TSC_6 0x00000018
+#define NV50_TSC_6_BORDER_COLOR_BLUE__MASK 0xffffffff
+#define NV50_TSC_6_BORDER_COLOR_BLUE__SHIFT 0
+
+#define NV50_TSC_7 0x0000001c
+#define NV50_TSC_7_BORDER_COLOR_ALPHA__MASK 0xffffffff
+#define NV50_TSC_7_BORDER_COLOR_ALPHA__SHIFT 0
+
+
+#endif /* NV50_TEXTURE_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_transfer.c b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
new file mode 100644
index 00000000000..a9906829fec
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.c
@@ -0,0 +1,412 @@
+
+#include "util/u_format.h"
+
+#include "nv50/nv50_context.h"
+
+#include "nv50/nv50_defs.xml.h"
+
+/* Driver-private transfer object: wraps the gallium pipe_transfer and keeps
+ * the two M2MF copy rectangles used to shuttle data between the resource
+ * (rect[0]) and the linear staging bo (rect[1]).
+ */
+struct nv50_transfer {
+   struct pipe_transfer base;
+   struct nv50_m2mf_rect rect[2];  /* [0] = miptree side, [1] = staging bo */
+   uint32_t nblocksx;              /* transfer width in format blocks */
+   uint32_t nblocksy;              /* transfer height in format blocks */
+};
+
+/* Fill in an M2MF copy rectangle describing mip level @l of @res at block
+ * position (x, y, z).
+ *
+ * For plain (non-compressed) formats the sizes and coordinates are scaled by
+ * the per-axis multisample shifts (ms_x/ms_y); for block formats they are
+ * converted to block units.  2D array resources fold the layer index into the
+ * base offset (z becomes 0, depth 1), while true 3D layouts keep z/depth.
+ */
+void
+nv50_m2mf_rect_setup(struct nv50_m2mf_rect *rect,
+                     struct pipe_resource *restrict res, unsigned l,
+                     unsigned x, unsigned y, unsigned z)
+{
+   struct nv50_miptree *mt = nv50_miptree(res);
+   const unsigned w = u_minify(res->width0, l);
+   const unsigned h = u_minify(res->height0, l);
+
+   rect->bo = mt->base.bo;
+   rect->domain = mt->base.domain;
+   rect->base = mt->level[l].offset;
+   rect->pitch = mt->level[l].pitch;
+   if (util_format_is_plain(res->format)) {
+      /* account for multisampling: one pixel covers 2^ms_x x 2^ms_y samples */
+      rect->width = w << mt->ms_x;
+      rect->height = h << mt->ms_y;
+      rect->x = x << mt->ms_x;
+      rect->y = y << mt->ms_y;
+   } else {
+      rect->width = util_format_get_nblocksx(res->format, w);
+      rect->height = util_format_get_nblocksy(res->format, h);
+      rect->x = util_format_get_nblocksx(res->format, x);
+      rect->y = util_format_get_nblocksy(res->format, y);
+   }
+   rect->tile_mode = mt->level[l].tile_mode;
+   rect->cpp = util_format_get_blocksize(res->format);
+
+   if (mt->layout_3d) {
+      rect->z = z;
+      rect->depth = u_minify(res->depth0, l);
+   } else {
+      /* array layers are laid out as separate 2D images */
+      rect->base += z * mt->layer_stride;
+      rect->z = 0;
+      rect->depth = 1;
+   }
+}
+
+/* Copy an nblocksx x nblocksy block rectangle from @src to @dst using the
+ * M2MF engine.  Either side may be tiled (bo has a memtype) or linear; tiled
+ * sides are described to the engine via the TILING_* methods, linear sides by
+ * a pitch plus a pre-computed byte offset.  The copy is split into chunks of
+ * at most 2047 lines per M2MF submission.
+ */
+void
+nv50_m2mf_transfer_rect(struct nv50_context *nv50,
+                        const struct nv50_m2mf_rect *dst,
+                        const struct nv50_m2mf_rect *src,
+                        uint32_t nblocksx, uint32_t nblocksy)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nouveau_bufctx *bctx = nv50->bufctx;
+   const int cpp = dst->cpp;
+   uint32_t src_ofst = src->base;
+   uint32_t dst_ofst = dst->base;
+   uint32_t height = nblocksy;
+   uint32_t sy = src->y;
+   uint32_t dy = dst->y;
+
+   /* M2MF copies raw blocks; both sides must agree on the block size */
+   assert(dst->cpp == src->cpp);
+
+   nouveau_bufctx_refn(bctx, 0, src->bo, src->domain | NOUVEAU_BO_RD);
+   nouveau_bufctx_refn(bctx, 0, dst->bo, dst->domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   if (nouveau_bo_memtype(src->bo)) {
+      /* tiled source: describe the tiled surface layout */
+      BEGIN_NV04(push, NV50_M2MF(LINEAR_IN), 6);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, src->tile_mode);
+      PUSH_DATA (push, src->width * cpp);
+      PUSH_DATA (push, src->height);
+      PUSH_DATA (push, src->depth);
+      PUSH_DATA (push, src->z);
+   } else {
+      /* linear source: fold the start coordinate into the byte offset */
+      src_ofst += src->y * src->pitch + src->x * cpp;
+
+      BEGIN_NV04(push, NV50_M2MF(LINEAR_IN), 1);
+      PUSH_DATA (push, 1);
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_PITCH_IN), 1);
+      PUSH_DATA (push, src->pitch);
+   }
+
+   if (nouveau_bo_memtype(dst->bo)) {
+      BEGIN_NV04(push, NV50_M2MF(LINEAR_OUT), 6);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, dst->tile_mode);
+      PUSH_DATA (push, dst->width * cpp);
+      PUSH_DATA (push, dst->height);
+      PUSH_DATA (push, dst->depth);
+      PUSH_DATA (push, dst->z);
+   } else {
+      dst_ofst += dst->y * dst->pitch + dst->x * cpp;
+
+      BEGIN_NV04(push, NV50_M2MF(LINEAR_OUT), 1);
+      PUSH_DATA (push, 1);
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_PITCH_OUT), 1);
+      PUSH_DATA (push, dst->pitch);
+   }
+
+   while (height) {
+      /* the engine's line count field is limited, copy <= 2047 lines at once */
+      int line_count = height > 2047 ? 2047 : height;
+
+      BEGIN_NV04(push, NV50_M2MF(OFFSET_IN_HIGH), 2);
+      PUSH_DATAh(push, src->bo->offset + src_ofst);
+      PUSH_DATAh(push, dst->bo->offset + dst_ofst);
+
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_OFFSET_IN), 2);
+      PUSH_DATA (push, src->bo->offset + src_ofst);
+      PUSH_DATA (push, dst->bo->offset + dst_ofst);
+
+      if (nouveau_bo_memtype(src->bo)) {
+         /* tiled side advances via the (x, y) tiling position */
+         BEGIN_NV04(push, NV50_M2MF(TILING_POSITION_IN), 1);
+         PUSH_DATA (push, (sy << 16) | (src->x * cpp));
+      } else {
+         /* linear side advances by bumping the byte offset */
+         src_ofst += line_count * src->pitch;
+      }
+      if (nouveau_bo_memtype(dst->bo)) {
+         BEGIN_NV04(push, NV50_M2MF(TILING_POSITION_OUT), 1);
+         PUSH_DATA (push, (dy << 16) | (dst->x * cpp));
+      } else {
+         dst_ofst += line_count * dst->pitch;
+      }
+
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_LINE_LENGTH_IN), 4);
+      PUSH_DATA (push, nblocksx * cpp);
+      PUSH_DATA (push, line_count);
+      PUSH_DATA (push, (1 << 8) | (1 << 0));
+      PUSH_DATA (push, 0);
+
+      height -= line_count;
+      sy += line_count;
+      dy += line_count;
+   }
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+/* Upload @size bytes of CPU @data into @dst at @offset using the 2D engine's
+ * SIFC (sifc = "scaled image from cpu"-style inline data) path.
+ *
+ * The destination is programmed as a huge linear R8 surface; the low 8 bits
+ * of the offset become the x coordinate and the rest is 256-byte aligned into
+ * the surface base.  Data is streamed inline through the pushbuf in 32-bit
+ * words.  NOTE(review): (size + 3) / 4 words are read from @data, so up to 3
+ * bytes past the end of the source buffer may be read when size is not a
+ * multiple of 4 — presumably callers pad their data; verify.
+ */
+void
+nv50_sifc_linear_u8(struct nouveau_context *nv,
+                    struct nouveau_bo *dst, unsigned offset, unsigned domain,
+                    unsigned size, const void *data)
+{
+   struct nv50_context *nv50 = nv50_context(&nv->pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   uint32_t *src = (uint32_t *)data;
+   unsigned count = (size + 3) / 4;
+   unsigned xcoord = offset & 0xff;
+
+   nouveau_bufctx_refn(nv50->bufctx, 0, dst, domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nv50->bufctx);
+   nouveau_pushbuf_validate(push);
+
+   /* surface base must be 256-byte aligned; remainder goes into xcoord */
+   offset &= ~0xff;
+
+   BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2);
+   PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_2D(DST_PITCH), 5);
+   PUSH_DATA (push, 262144);
+   PUSH_DATA (push, 65536);
+   PUSH_DATA (push, 1);
+   PUSH_DATAh(push, dst->offset + offset);
+   PUSH_DATA (push, dst->offset + offset);
+   BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM);
+   BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10);
+   PUSH_DATA (push, size);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, xcoord);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+
+   while (count) {
+      unsigned nr;
+
+      if (!PUSH_SPACE(push, 16))
+         break;
+      nr = PUSH_AVAIL(push);
+      assert(nr >= 16);
+      /* leave room for the method header, cap at max packet length */
+      nr = MIN2(count, nr - 1);
+      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
+
+      BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr);
+      PUSH_DATAp(push, src, nr);
+
+      src += nr;
+      count -= nr;
+   }
+
+   nouveau_bufctx_reset(nv50->bufctx, 0);
+}
+
+/* Linear @size-byte copy from (src, srcoff) to (dst, dstoff) via the M2MF
+ * engine, in chunks of at most 128 KiB (1 << 17) per submission.
+ */
+void
+nv50_m2mf_copy_linear(struct nouveau_context *nv,
+                      struct nouveau_bo *dst, unsigned dstoff, unsigned dstdom,
+                      struct nouveau_bo *src, unsigned srcoff, unsigned srcdom,
+                      unsigned size)
+{
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   struct nouveau_bufctx *bctx = nv50_context(&nv->pipe)->bufctx;
+
+   nouveau_bufctx_refn(bctx, 0, src, srcdom | NOUVEAU_BO_RD);
+   nouveau_bufctx_refn(bctx, 0, dst, dstdom | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   /* both sides are linear (untiled) */
+   BEGIN_NV04(push, NV50_M2MF(LINEAR_IN), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_M2MF(LINEAR_OUT), 1);
+   PUSH_DATA (push, 1);
+
+   while (size) {
+      unsigned bytes = MIN2(size, 1 << 17);
+
+      BEGIN_NV04(push, NV50_M2MF(OFFSET_IN_HIGH), 2);
+      PUSH_DATAh(push, src->offset + srcoff);
+      PUSH_DATAh(push, dst->offset + dstoff);
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_OFFSET_IN), 2);
+      PUSH_DATA (push, src->offset + srcoff);
+      PUSH_DATA (push, dst->offset + dstoff);
+      BEGIN_NV04(push, SUBC_M2MF(NV03_M2MF_LINE_LENGTH_IN), 4);
+      PUSH_DATA (push, bytes);
+      PUSH_DATA (push, 1);
+      PUSH_DATA (push, (1 << 8) | (1 << 0));
+      PUSH_DATA (push, 0);
+
+      srcoff += bytes;
+      dstoff += bytes;
+      size -= bytes;
+   }
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+/* Map a miptree region for CPU access through a linear staging buffer.
+ *
+ * A GART bo of layer_stride * box->depth bytes is allocated as rect[1]; for
+ * readable transfers each layer of the region is blitted from the miptree
+ * (rect[0]) into the staging bo via M2MF before mapping.  Writeback happens
+ * in nv50_miptree_transfer_unmap().  Direct mapping is not supported, so
+ * PIPE_TRANSFER_MAP_DIRECTLY fails with NULL.
+ *
+ * Returns the CPU pointer into the staging bo and stores the transfer object
+ * in *ptransfer, or NULL on failure (allocation or map error).
+ */
+void *
+nv50_miptree_transfer_map(struct pipe_context *pctx,
+                          struct pipe_resource *res,
+                          unsigned level,
+                          unsigned usage,
+                          const struct pipe_box *box,
+                          struct pipe_transfer **ptransfer)
+{
+   struct nv50_screen *screen = nv50_screen(pctx->screen);
+   struct nv50_context *nv50 = nv50_context(pctx);
+   struct nouveau_device *dev = nv50->screen->base.device;
+   const struct nv50_miptree *mt = nv50_miptree(res);
+   struct nv50_transfer *tx;
+   uint32_t size;
+   int ret;
+   unsigned flags = 0;
+
+   if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
+      return NULL;
+
+   tx = CALLOC_STRUCT(nv50_transfer);
+   if (!tx)
+      return NULL;
+
+   pipe_resource_reference(&tx->base.resource, res);
+
+   tx->base.level = level;
+   tx->base.usage = usage;
+   tx->base.box = *box;
+
+   if (util_format_is_plain(res->format)) {
+      tx->nblocksx = box->width << mt->ms_x;
+      /* BUGFIX: the y block count must use the y multisample shift (was
+       * ms_x), matching nv50_m2mf_rect_setup()'s height/y scaling.
+       */
+      tx->nblocksy = box->height << mt->ms_y;
+   } else {
+      tx->nblocksx = util_format_get_nblocksx(res->format, box->width);
+      tx->nblocksy = util_format_get_nblocksy(res->format, box->height);
+   }
+
+   tx->base.stride = tx->nblocksx * util_format_get_blocksize(res->format);
+   tx->base.layer_stride = tx->nblocksy * tx->base.stride;
+
+   nv50_m2mf_rect_setup(&tx->rect[0], res, level, box->x, box->y, box->z);
+
+   size = tx->base.layer_stride;
+
+   /* linear staging buffer covering all requested layers */
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+                        size * tx->base.box.depth, NULL, &tx->rect[1].bo);
+   if (ret) {
+      FREE(tx);
+      return NULL;
+   }
+
+   tx->rect[1].cpp = tx->rect[0].cpp;
+   tx->rect[1].width = tx->nblocksx;
+   tx->rect[1].height = tx->nblocksy;
+   tx->rect[1].depth = 1;
+   tx->rect[1].pitch = tx->base.stride;
+   tx->rect[1].domain = NOUVEAU_BO_GART;
+
+   if (usage & PIPE_TRANSFER_READ) {
+      /* blit each layer into the staging bo, then restore the rects */
+      unsigned base = tx->rect[0].base;
+      unsigned z = tx->rect[0].z;
+      unsigned i;
+      for (i = 0; i < box->depth; ++i) {
+         nv50_m2mf_transfer_rect(nv50, &tx->rect[1], &tx->rect[0],
+                                 tx->nblocksx, tx->nblocksy);
+         if (mt->layout_3d)
+            tx->rect[0].z++;
+         else
+            tx->rect[0].base += mt->layer_stride;
+         tx->rect[1].base += size;
+      }
+      tx->rect[0].z = z;
+      tx->rect[0].base = base;
+      tx->rect[1].base = 0;
+   }
+
+   if (tx->rect[1].bo->map) {
+      /* bo is already mapped (persistent map), reuse it */
+      *ptransfer = &tx->base;
+      return tx->rect[1].bo->map;
+   }
+
+   if (usage & PIPE_TRANSFER_READ)
+      flags = NOUVEAU_BO_RD;
+   if (usage & PIPE_TRANSFER_WRITE)
+      flags |= NOUVEAU_BO_WR;
+
+   ret = nouveau_bo_map(tx->rect[1].bo, flags, screen->base.client);
+   if (ret) {
+      nouveau_bo_ref(NULL, &tx->rect[1].bo);
+      FREE(tx);
+      return NULL;
+   }
+
+   *ptransfer = &tx->base;
+   return tx->rect[1].bo->map;
+}
+
+/* Unmap a miptree transfer: for writable transfers, blit every layer from the
+ * staging bo (rect[1]) back into the miptree (rect[0]), then release the
+ * staging bo, the resource reference and the transfer object itself.
+ */
+void
+nv50_miptree_transfer_unmap(struct pipe_context *pctx,
+                            struct pipe_transfer *transfer)
+{
+   struct nv50_context *nv50 = nv50_context(pctx);
+   struct nv50_transfer *tx = (struct nv50_transfer *)transfer;
+   struct nv50_miptree *mt = nv50_miptree(tx->base.resource);
+   unsigned i;
+
+   if (tx->base.usage & PIPE_TRANSFER_WRITE) {
+      for (i = 0; i < tx->base.box.depth; ++i) {
+         nv50_m2mf_transfer_rect(nv50, &tx->rect[0], &tx->rect[1],
+                                 tx->nblocksx, tx->nblocksy);
+         /* advance to the next layer on the miptree side */
+         if (mt->layout_3d)
+            tx->rect[0].z++;
+         else
+            tx->rect[0].base += mt->layer_stride;
+         tx->rect[1].base += tx->nblocksy * tx->base.stride;
+      }
+   }
+
+   nouveau_bo_ref(NULL, &tx->rect[1].bo);
+   pipe_resource_reference(&transfer->resource, NULL);
+
+   FREE(tx);
+}
+
+/* Push @words 32-bit words of @data into the constant buffer backed by @bo,
+ * starting @offset bytes into a cb window of @size bytes at @base.  The data
+ * is streamed inline through the temporary constbuf slot NV50_CB_TMP via
+ * CB_ADDR/CB_DATA.
+ */
+void
+nv50_cb_push(struct nouveau_context *nv,
+             struct nouveau_bo *bo, unsigned domain,
+             unsigned base, unsigned size,
+             unsigned offset, unsigned words, const uint32_t *data)
+{
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   struct nouveau_bufctx *bctx = nv50_context(&nv->pipe)->bufctx;
+
+   /* CB_ADDR is in units of 4 bytes, so offset must be word-aligned */
+   assert(!(offset & 3));
+   size = align(size, 0x100);
+
+   nouveau_bufctx_refn(bctx, 0, bo, NOUVEAU_BO_WR | domain);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   while (words) {
+      unsigned nr;
+
+      nr = PUSH_AVAIL(push);
+      /* reserve 7 dwords for the CB_DEF/CB_ADDR setup emitted below */
+      nr = MIN2(nr - 7, words);
+      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+
+      BEGIN_NV04(push, NV50_3D(CB_DEF_ADDRESS_HIGH), 3);
+      PUSH_DATAh(push, bo->offset + base);
+      PUSH_DATA (push, bo->offset + base);
+      PUSH_DATA (push, (NV50_CB_TMP << 16) | (size & 0xffff));
+      BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
+      PUSH_DATA (push, (offset << 6) | NV50_CB_TMP);
+      BEGIN_NI04(push, NV50_3D(CB_DATA(0)), nr);
+      PUSH_DATAp(push, data, nr);
+
+      words -= nr;
+      data += nr;
+      offset += nr * 4;
+   }
+
+   nouveau_bufctx_reset(bctx, 0);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_transfer.h b/src/gallium/drivers/nouveau/nv50/nv50_transfer.h
new file mode 100644
index 00000000000..c58cb0008df
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_transfer.h
@@ -0,0 +1,27 @@
+
+#ifndef __NV50_TRANSFER_H__
+#define __NV50_TRANSFER_H__
+
+#include "pipe/p_state.h"
+
+/* One side of an M2MF block-rectangle copy: a bo plus the geometry needed to
+ * address it either linearly (pitch) or tiled (tile_mode, width/height/depth).
+ * All x/y/width/height values are in units of format blocks; cpp is the block
+ * size in bytes.  Filled in by nv50_m2mf_rect_setup().
+ */
+struct nv50_m2mf_rect {
+   struct nouveau_bo *bo;
+   uint32_t base;       /* byte offset of the image within the bo */
+   unsigned domain;     /* NOUVEAU_BO_VRAM or NOUVEAU_BO_GART */
+   uint32_t pitch;      /* row stride in bytes (linear layouts) */
+   uint32_t width;
+   uint32_t x;
+   uint32_t height;
+   uint32_t y;
+   uint16_t depth;
+   uint16_t z;
+   uint16_t tile_mode;  /* hw tiling mode (tiled layouts) */
+   uint16_t cpp;        /* bytes per format block */
+};
+
+void
+nv50_m2mf_rect_setup(struct nv50_m2mf_rect *rect,
+ struct pipe_resource *restrict res, unsigned l,
+ unsigned x, unsigned y, unsigned z);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
new file mode 100644
index 00000000000..c6162b5f415
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_resource.h"
+
+#include "nv50/nv50_3d.xml.h"
+
+/* pipe_context::delete_vertex_elements_state - release the translate object
+ * (if one was created) and free the CSO.
+ */
+void
+nv50_vertex_state_delete(struct pipe_context *pipe,
+                         void *hwcso)
+{
+   struct nv50_vertex_stateobj *so = hwcso;
+
+   if (so->translate)
+      so->translate->release(so->translate);
+   FREE(hwcso);
+}
+
+/* pipe_context::create_vertex_elements_state - build the nv50 vertex element
+ * CSO.
+ *
+ * For each element the hardware VERTEX_ARRAY_ATTRIB state is looked up from
+ * the format table; formats with no hardware encoding fall back to an
+ * equivalent R32*_FLOAT format and mark the CSO as needing CPU-side
+ * conversion.  A translate key covering all elements is always built so user
+ * (non-bo) vertex buffers can be converted/uploaded at draw time.  Also
+ * tracks per-buffer access size and instancing info (instance_elts/bufs,
+ * minimum divisor per buffer).
+ *
+ * Returns the new CSO, or NULL on allocation failure or an element with an
+ * unsupported component count.
+ */
+void *
+nv50_vertex_state_create(struct pipe_context *pipe,
+                         unsigned num_elements,
+                         const struct pipe_vertex_element *elements)
+{
+    struct nv50_vertex_stateobj *so;
+    struct translate_key transkey;
+    unsigned i;
+
+    so = MALLOC(sizeof(*so) +
+                num_elements * sizeof(struct nv50_vertex_element));
+    if (!so)
+        return NULL;
+    so->num_elements = num_elements;
+    so->instance_elts = 0;
+    so->instance_bufs = 0;
+    so->need_conversion = FALSE;
+
+    memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
+
+    for (i = 0; i < PIPE_MAX_ATTRIBS; ++i)
+        so->min_instance_div[i] = 0xffffffff;
+
+    transkey.nr_elements = 0;
+    transkey.output_stride = 0;
+
+    for (i = 0; i < num_elements; ++i) {
+        const struct pipe_vertex_element *ve = &elements[i];
+        const unsigned vbi = ve->vertex_buffer_index;
+        unsigned size;
+        enum pipe_format fmt = ve->src_format;
+
+        so->element[i].pipe = elements[i];
+        so->element[i].state = nv50_format_table[fmt].vtx;
+
+        if (!so->element[i].state) {
+            /* no hw encoding for this format: substitute a float format with
+             * the same component count and convert on the CPU at draw time
+             */
+            switch (util_format_get_nr_components(fmt)) {
+            case 1: fmt = PIPE_FORMAT_R32_FLOAT; break;
+            case 2: fmt = PIPE_FORMAT_R32G32_FLOAT; break;
+            case 3: fmt = PIPE_FORMAT_R32G32B32_FLOAT; break;
+            case 4: fmt = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+            default:
+                assert(0);
+                FREE(so);
+                return NULL;
+            }
+            so->element[i].state = nv50_format_table[fmt].vtx;
+            so->need_conversion = TRUE;
+        }
+        so->element[i].state |= i;
+
+        size = util_format_get_blocksize(fmt);
+        if (so->vb_access_size[vbi] < (ve->src_offset + size))
+            so->vb_access_size[vbi] = ve->src_offset + size;
+
+        /* always add the element to the translate key (if (1) keeps a scope
+         * for the declaration below on pre-C99 compilers)
+         */
+        if (1) {
+            unsigned j = transkey.nr_elements++;
+
+            transkey.element[j].type = TRANSLATE_ELEMENT_NORMAL;
+            transkey.element[j].input_format = ve->src_format;
+            transkey.element[j].input_buffer = vbi;
+            transkey.element[j].input_offset = ve->src_offset;
+            transkey.element[j].instance_divisor = ve->instance_divisor;
+
+            transkey.element[j].output_format = fmt;
+            transkey.element[j].output_offset = transkey.output_stride;
+            /* keep each translated attribute 4-byte aligned */
+            transkey.output_stride += (util_format_get_stride(fmt, 1) + 3) & ~3;
+
+            if (unlikely(ve->instance_divisor)) {
+                so->instance_elts |= 1 << i;
+                so->instance_bufs |= 1 << vbi;
+                if (ve->instance_divisor < so->min_instance_div[vbi])
+                    so->min_instance_div[vbi] = ve->instance_divisor;
+            }
+        }
+    }
+
+    so->translate = translate_create(&transkey);
+    so->vertex_size = transkey.output_stride / 4;
+    so->packet_vertex_limit = NV04_PFIFO_MAX_PACKET_LEN /
+        MAX2(so->vertex_size, 1);
+
+    return so;
+}
+
+/* Attrib state written for a disabled vertex attribute: a constant
+ * 32_32_32_32 float fetch.  The expansion is parenthesized so the
+ * OR-expression cannot be broken apart by operators at the use site.
+ */
+#define NV50_3D_VERTEX_ATTRIB_INACTIVE \
+   (NV50_3D_VERTEX_ARRAY_ATTRIB_TYPE_FLOAT | \
+    NV50_3D_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 | \
+    NV50_3D_VERTEX_ARRAY_ATTRIB_CONST)
+
+/* Emit a constant (stride-0, user-buffer) vertex attribute as an immediate
+ * VTX_ATTR_*F value.  The source data is unpacked to up to 4 floats with the
+ * format's unpack callback (signed/unsigned integer formats are bit-copied
+ * through the float array).  A 1-component attribute that is the vertex
+ * program's edgeflag input additionally updates the EDGEFLAG state.
+ */
+static void
+nv50_emit_vtxattr(struct nv50_context *nv50, struct pipe_vertex_buffer *vb,
+                  struct pipe_vertex_element *ve, unsigned attr)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   const void *data = (const uint8_t *)vb->user_buffer + ve->src_offset;
+   float v[4];
+   const unsigned nc = util_format_get_nr_components(ve->src_format);
+   const struct util_format_description *desc =
+      util_format_description(ve->src_format);
+
+   assert(vb->user_buffer);
+
+   if (desc->channel[0].pure_integer) {
+      /* store integer bits into the float array unchanged */
+      if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+         desc->unpack_rgba_sint((int32_t *)v, 0, data, 0, 1, 1);
+      } else {
+         desc->unpack_rgba_uint((uint32_t *)v, 0, data, 0, 1, 1);
+      }
+   } else {
+      desc->unpack_rgba_float(v, 0, data, 0, 1, 1);
+   }
+
+   switch (nc) {
+   case 4:
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_4F_X(attr)), 4);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      PUSH_DATAf(push, v[2]);
+      PUSH_DATAf(push, v[3]);
+      break;
+   case 3:
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_3F_X(attr)), 3);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      PUSH_DATAf(push, v[2]);
+      break;
+   case 2:
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_2F_X(attr)), 2);
+      PUSH_DATAf(push, v[0]);
+      PUSH_DATAf(push, v[1]);
+      break;
+   case 1:
+      if (attr == nv50->vertprog->vp.edgeflag) {
+         BEGIN_NV04(push, NV50_3D(EDGEFLAG), 1);
+         PUSH_DATA (push, v[0] ? 1 : 0);
+      }
+      BEGIN_NV04(push, NV50_3D(VTX_ATTR_1F(attr)), 1);
+      PUSH_DATAf(push, v[0]);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+/* Compute the byte range [*base, *base + *size) of user vertex buffer @vbi
+ * that the current draw can touch.  Instanced buffers are uploaded whole
+ * (divisor-based narrowing is a TODO); otherwise the range is derived from
+ * the index bounds (vb_elt_first / vb_elt_limit) and the element access size.
+ */
+static INLINE void
+nv50_user_vbuf_range(struct nv50_context *nv50, int vbi,
+                     uint32_t *base, uint32_t *size)
+{
+   if (unlikely(nv50->vertex->instance_bufs & (1 << vbi))) {
+      /* TODO: use min and max instance divisor to get a proper range */
+      *base = 0;
+      *size = nv50->vtxbuf[vbi].buffer->width0;
+   } else {
+      /* NOTE: if there are user buffers, we *must* have index bounds */
+      assert(nv50->vb_elt_limit != ~0);
+      *base = nv50->vb_elt_first * nv50->vtxbuf[vbi].stride;
+      *size = nv50->vb_elt_limit * nv50->vtxbuf[vbi].stride +
+         nv50->vertex->vb_access_size[vbi];
+   }
+}
+
+/* Upload the accessed range of every user (CPU-pointer) vertex buffer into
+ * scratch GPU memory, filling @addrs[b] with the GPU address and @limits[b]
+ * with the inclusive end offset of the range.  Stride-0 buffers are skipped
+ * (they are emitted as immediate attributes instead).  Marks vbo_dirty since
+ * new vertex data became visible to the GPU.
+ */
+static void
+nv50_upload_user_buffers(struct nv50_context *nv50,
+                         uint64_t addrs[], uint32_t limits[])
+{
+   unsigned b;
+
+   for (b = 0; b < nv50->num_vtxbufs; ++b) {
+      struct nouveau_bo *bo;
+      const struct pipe_vertex_buffer *vb = &nv50->vtxbuf[b];
+      uint32_t base, size;
+
+      if (!(nv50->vbo_user & (1 << b)) || !vb->stride)
+         continue;
+      nv50_user_vbuf_range(nv50, b, &base, &size);
+
+      limits[b] = base + size - 1;
+      addrs[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size,
+                                      &bo);
+      if (addrs[b])
+         BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART |
+                      NOUVEAU_BO_RD, bo);
+   }
+   nv50->base.vbo_dirty = TRUE;
+}
+
+/* Re-upload user vertex buffers and re-emit per-element VERTEX_ARRAY
+ * START/LIMIT methods without a full arrays revalidation (used between
+ * draws when only the user-buffer contents / index bounds changed).
+ * Each buffer is uploaded at most once (tracked in `written`), but the
+ * array addresses are re-emitted for every element referencing it.
+ */
+static void
+nv50_update_user_vbufs(struct nv50_context *nv50)
+{
+ uint64_t address[PIPE_MAX_ATTRIBS];
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned i;
+ uint32_t written = 0;
+
+ for (i = 0; i < nv50->vertex->num_elements; ++i) {
+ struct pipe_vertex_element *ve = &nv50->vertex->element[i].pipe;
+ const unsigned b = ve->vertex_buffer_index;
+ struct pipe_vertex_buffer *vb = &nv50->vtxbuf[b];
+ uint32_t base, size;
+
+ if (!(nv50->vbo_user & (1 << b)))
+ continue;
+
+ /* Zero stride means a constant attribute: push the value directly. */
+ if (!vb->stride) {
+ nv50_emit_vtxattr(nv50, vb, ve, i);
+ continue;
+ }
+ nv50_user_vbuf_range(nv50, b, &base, &size);
+
+ if (!(written & (1 << b))) {
+ struct nouveau_bo *bo;
+ const uint32_t bo_flags = NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+ written |= 1 << b;
+ address[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer,
+ base, size, &bo);
+ if (address[b])
+ BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, bo_flags, bo);
+ }
+
+ /* LIMIT is the address of the last fetchable byte, hence the -1. */
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
+ PUSH_DATAh(push, address[b] + base + size - 1);
+ PUSH_DATA (push, address[b] + base + size - 1);
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_START_HIGH(i)), 2);
+ PUSH_DATAh(push, address[b] + ve->src_offset);
+ PUSH_DATA (push, address[b] + ve->src_offset);
+ }
+ nv50->base.vbo_dirty = TRUE;
+}
+
+/* Drop the temporary BO references and scratch allocations created for
+ * user vertex buffer uploads; called after the draw has been submitted.
+ */
+static INLINE void
+nv50_release_user_vbufs(struct nv50_context *nv50)
+{
+ if (nv50->vbo_user) {
+ nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX_TMP);
+ nouveau_scratch_done(&nv50->base);
+ }
+}
+
+/* Validate the full vertex array state: decide between hardware vertex
+ * fetch and CPU push-mode (vbo_fifo), emit the attribute format methods,
+ * per-instance enables, and the per-element fetch/address/limit methods,
+ * uploading user buffers as needed.  Trailing elements from a previously
+ * larger element count are explicitly disabled.
+ */
+void
+nv50_vertex_arrays_validate(struct nv50_context *nv50)
+{
+ uint64_t addrs[PIPE_MAX_ATTRIBS];
+ uint32_t limits[PIPE_MAX_ATTRIBS];
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_vertex_stateobj *vertex = nv50->vertex;
+ struct pipe_vertex_buffer *vb;
+ struct nv50_vertex_element *ve;
+ uint32_t mask;
+ uint32_t refd = 0;
+ unsigned i;
+ /* n covers both the new element count and any stale trailing slots. */
+ const unsigned n = MAX2(vertex->num_elements, nv50->state.num_vtxelts);
+
+ /* Formats requiring conversion force push mode; otherwise non-constant
+ * user buffers follow the per-draw heuristic (vbo_push_hint).
+ */
+ if (unlikely(vertex->need_conversion))
+ nv50->vbo_fifo = ~0;
+ else
+ if (nv50->vbo_user & ~nv50->vbo_constant)
+ nv50->vbo_fifo = nv50->vbo_push_hint ? ~0 : 0;
+ else
+ nv50->vbo_fifo = 0;
+
+ if (!nv50->vbo_fifo) {
+ /* if vertex buffer was written by GPU - flush VBO cache */
+ for (i = 0; i < nv50->num_vtxbufs; ++i) {
+ struct nv04_resource *buf = nv04_resource(nv50->vtxbuf[i].buffer);
+ if (buf && buf->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+ buf->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ nv50->base.vbo_dirty = TRUE;
+ break;
+ }
+ }
+ }
+
+ /* update vertex format state */
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_ATTRIB(0)), n);
+ if (nv50->vbo_fifo) {
+ /* Push mode: formats are still programmed, but all hardware fetch
+ * is disabled; vertices arrive via the FIFO instead.
+ */
+ nv50->state.num_vtxelts = vertex->num_elements;
+ for (i = 0; i < vertex->num_elements; ++i)
+ PUSH_DATA (push, vertex->element[i].state);
+ for (; i < n; ++i)
+ PUSH_DATA (push, NV50_3D_VERTEX_ATTRIB_INACTIVE);
+ for (i = 0; i < n; ++i) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
+ PUSH_DATA (push, 0);
+ }
+ return;
+ }
+ for (i = 0; i < vertex->num_elements; ++i) {
+ const unsigned b = vertex->element[i].pipe.vertex_buffer_index;
+ ve = &vertex->element[i];
+ vb = &nv50->vtxbuf[b];
+
+ /* Zero-stride user buffers are programmed as constant attributes. */
+ if (likely(vb->stride) || !(nv50->vbo_user & (1 << b)))
+ PUSH_DATA(push, ve->state);
+ else
+ PUSH_DATA(push, ve->state | NV50_3D_VERTEX_ARRAY_ATTRIB_CONST);
+ }
+ for (; i < n; ++i)
+ PUSH_DATA(push, NV50_3D_VERTEX_ATTRIB_INACTIVE);
+
+ /* update per-instance enables */
+ mask = vertex->instance_elts ^ nv50->state.instance_elts;
+ while (mask) {
+ const int i = ffs(mask) - 1;
+ mask &= ~(1 << i);
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_PER_INSTANCE(i)), 1);
+ PUSH_DATA (push, (vertex->instance_elts >> i) & 1);
+ }
+ nv50->state.instance_elts = vertex->instance_elts;
+
+ if (nv50->vbo_user & ~nv50->vbo_constant)
+ nv50_upload_user_buffers(nv50, addrs, limits);
+
+ /* update buffers and set constant attributes */
+ for (i = 0; i < vertex->num_elements; ++i) {
+ uint64_t address, limit;
+ const unsigned b = vertex->element[i].pipe.vertex_buffer_index;
+ ve = &vertex->element[i];
+ vb = &nv50->vtxbuf[b];
+
+ if (unlikely(nv50->vbo_constant & (1 << b))) {
+ /* Constant attribute: disable fetch and push the value inline. */
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
+ PUSH_DATA (push, 0);
+ nv50_emit_vtxattr(nv50, vb, &ve->pipe, i);
+ continue;
+ } else
+ if (nv50->vbo_user & (1 << b)) {
+ address = addrs[b] + ve->pipe.src_offset;
+ limit = addrs[b] + limits[b];
+ } else {
+ struct nv04_resource *buf = nv04_resource(vb->buffer);
+ /* Reference each real VBO only once per validation. */
+ if (!(refd & (1 << b))) {
+ refd |= 1 << b;
+ BCTX_REFN(nv50->bufctx_3d, VERTEX, buf, RD);
+ }
+ address = buf->address + vb->buffer_offset + ve->pipe.src_offset;
+ limit = buf->address + buf->base.width0 - 1;
+ }
+
+ if (unlikely(ve->pipe.instance_divisor)) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 4);
+ PUSH_DATA (push, NV50_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
+ PUSH_DATA (push, ve->pipe.instance_divisor);
+ } else {
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 3);
+ PUSH_DATA (push, NV50_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
+ PUSH_DATAh(push, address);
+ PUSH_DATA (push, address);
+ }
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
+ PUSH_DATAh(push, limit);
+ PUSH_DATA (push, limit);
+ }
+ /* Disable fetch for any leftover elements from the previous state. */
+ for (; i < nv50->state.num_vtxelts; ++i) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FETCH(i)), 1);
+ PUSH_DATA (push, 0);
+ }
+ nv50->state.num_vtxelts = vertex->num_elements;
+}
+
+/* Map a PIPE_PRIM_* primitive type to the matching hardware
+ * VERTEX_BEGIN_GL value; the macro keeps the 1:1 cases terse.
+ */
+#define NV50_PRIM_GL_CASE(n) \
+ case PIPE_PRIM_##n: return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
+
+static INLINE unsigned
+nv50_prim_gl(unsigned prim)
+{
+ switch (prim) {
+ NV50_PRIM_GL_CASE(POINTS);
+ NV50_PRIM_GL_CASE(LINES);
+ NV50_PRIM_GL_CASE(LINE_LOOP);
+ NV50_PRIM_GL_CASE(LINE_STRIP);
+ NV50_PRIM_GL_CASE(TRIANGLES);
+ NV50_PRIM_GL_CASE(TRIANGLE_STRIP);
+ NV50_PRIM_GL_CASE(TRIANGLE_FAN);
+ NV50_PRIM_GL_CASE(QUADS);
+ NV50_PRIM_GL_CASE(QUAD_STRIP);
+ NV50_PRIM_GL_CASE(POLYGON);
+ NV50_PRIM_GL_CASE(LINES_ADJACENCY);
+ NV50_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
+ NV50_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
+ NV50_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
+ default:
+ /* Unknown primitive types silently fall back to POINTS.
+ * NOTE(review): the break below is unreachable after the return.
+ */
+ return NV50_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
+ break;
+ }
+}
+
+/* For pre-nva0 transform feedback. */
+/* Vertices per decomposed primitive, indexed by PIPE_PRIM_* type.
+ * NOTE(review): quads/polygons map to 3, presumably because the hardware
+ * decomposes them into triangles for stream output — confirm against the
+ * code consuming state.prim_size.
+ */
+static const uint8_t nv50_pipe_prim_to_prim_size[PIPE_PRIM_MAX + 1] =
+{
+ [PIPE_PRIM_POINTS] = 1,
+ [PIPE_PRIM_LINES] = 2,
+ [PIPE_PRIM_LINE_LOOP] = 2,
+ [PIPE_PRIM_LINE_STRIP] = 2,
+ [PIPE_PRIM_TRIANGLES] = 3,
+ [PIPE_PRIM_TRIANGLE_STRIP] = 3,
+ [PIPE_PRIM_TRIANGLE_FAN] = 3,
+ [PIPE_PRIM_QUADS] = 3,
+ [PIPE_PRIM_QUAD_STRIP] = 3,
+ [PIPE_PRIM_POLYGON] = 3,
+ [PIPE_PRIM_LINES_ADJACENCY] = 2,
+ [PIPE_PRIM_LINE_STRIP_ADJACENCY] = 2,
+ [PIPE_PRIM_TRIANGLES_ADJACENCY] = 3,
+ [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = 3
+};
+
+/* Emit a non-indexed draw.  Instancing is implemented by repeating the
+ * BEGIN/FIRST+COUNT/END sequence with the INSTANCE_NEXT bit set on every
+ * iteration after the first.
+ */
+static void
+nv50_draw_arrays(struct nv50_context *nv50,
+ unsigned mode, unsigned start, unsigned count,
+ unsigned instance_count)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned prim;
+
+ /* Non-indexed draws must not apply a leftover index bias. */
+ if (nv50->state.index_bias) {
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
+ PUSH_DATA (push, 0);
+ nv50->state.index_bias = 0;
+ }
+
+ prim = nv50_prim_gl(mode);
+
+ while (instance_count--) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+ PUSH_DATA (push, prim);
+ BEGIN_NV04(push, NV50_3D(VERTEX_BUFFER_FIRST), 2);
+ PUSH_DATA (push, start);
+ PUSH_DATA (push, count);
+ BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+ PUSH_DATA (push, 0);
+
+ prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+ }
+}
+
+/* Push 8-bit indices inline through the FIFO.  The 1-3 leftover indices
+ * (count % 4) are sent first as individual 32-bit elements so the rest can
+ * be packed four-per-dword into VB_ELEMENT_U8 packets.
+ */
+static void
+nv50_draw_elements_inline_u08(struct nouveau_pushbuf *push, const uint8_t *map,
+ unsigned start, unsigned count)
+{
+ map += start;
+
+ if (count & 3) {
+ unsigned i;
+ BEGIN_NI04(push, NV50_3D(VB_ELEMENT_U32), count & 3);
+ for (i = 0; i < (count & 3); ++i)
+ PUSH_DATA(push, *map++);
+ count &= ~3;
+ }
+ while (count) {
+ /* nr = number of dwords, each carrying 4 indices. */
+ unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 4) / 4;
+
+ BEGIN_NI04(push, NV50_3D(VB_ELEMENT_U8), nr);
+ for (i = 0; i < nr; ++i) {
+ PUSH_DATA(push,
+ (map[3] << 24) | (map[2] << 16) | (map[1] << 8) | map[0]);
+ map += 4;
+ }
+ count -= nr * 4;
+ }
+}
+
+/* Push 16-bit indices inline.  An odd leading index goes out as a single
+ * 32-bit element; the remainder is packed two-per-dword into
+ * VB_ELEMENT_U16 packets.
+ */
+static void
+nv50_draw_elements_inline_u16(struct nouveau_pushbuf *push, const uint16_t *map,
+ unsigned start, unsigned count)
+{
+ map += start;
+
+ if (count & 1) {
+ count &= ~1;
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, *map++);
+ }
+ while (count) {
+ /* nr = number of dwords, each carrying 2 indices. */
+ unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 2) / 2;
+
+ BEGIN_NI04(push, NV50_3D(VB_ELEMENT_U16), nr);
+ for (i = 0; i < nr; ++i) {
+ PUSH_DATA(push, (map[1] << 16) | map[0]);
+ map += 2;
+ }
+ count -= nr * 2;
+ }
+}
+
+/* Push 32-bit indices inline, one per dword, split into packets of at most
+ * NV04_PFIFO_MAX_PACKET_LEN elements.
+ */
+static void
+nv50_draw_elements_inline_u32(struct nouveau_pushbuf *push, const uint32_t *map,
+ unsigned start, unsigned count)
+{
+ map += start;
+
+ while (count) {
+ const unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+
+ BEGIN_NI04(push, NV50_3D(VB_ELEMENT_U32), nr);
+ PUSH_DATAp(push, map, nr);
+
+ map += nr;
+ count -= nr;
+ }
+}
+
+/* Push 32-bit indices whose values are known to fit in 16 bits (caller
+ * checked max_index/restart_index), packing two per dword as U16 elements
+ * to halve the FIFO traffic.
+ */
+static void
+nv50_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
+ const uint32_t *map,
+ unsigned start, unsigned count)
+{
+ map += start;
+
+ if (count & 1) {
+ count--;
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, *map++);
+ }
+ while (count) {
+ unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 2) / 2;
+
+ BEGIN_NI04(push, NV50_3D(VB_ELEMENT_U16), nr);
+ for (i = 0; i < nr; ++i) {
+ PUSH_DATA(push, (map[1] << 16) | map[0]);
+ map += 2;
+ }
+ count -= nr * 2;
+ }
+}
+
+/* Emit an indexed draw.  With a GPU-resident index buffer the indices are
+ * referenced directly via long (NL50) pushbuf data packets, aligning the
+ * transfer window to 4 bytes and telling the hardware how many leading
+ * sub-dword entries to skip.  With a user index buffer the indices are
+ * pushed inline.  `shorten` allows packing 32-bit user indices as 16-bit.
+ */
+static void
+nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
+ unsigned mode, unsigned start, unsigned count,
+ unsigned instance_count, int32_t index_bias)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ unsigned prim;
+ const unsigned index_size = nv50->idxbuf.index_size;
+
+ prim = nv50_prim_gl(mode);
+
+ if (index_bias != nv50->state.index_bias) {
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
+ PUSH_DATA (push, index_bias);
+ nv50->state.index_bias = index_bias;
+ }
+
+ if (nv50->idxbuf.buffer) {
+ struct nv04_resource *buf = nv04_resource(nv50->idxbuf.buffer);
+ unsigned pb_start;
+ unsigned pb_bytes;
+ /* base: dword-aligned start of the index data in the BO. */
+ const unsigned base = (buf->offset + nv50->idxbuf.offset) & ~3;
+
+ /* Fold the sub-dword misalignment into `start` (in index units). */
+ start += ((buf->offset + nv50->idxbuf.offset) & 3) >> (index_size >> 1);
+
+ assert(nouveau_resource_mapped_by_gpu(nv50->idxbuf.buffer));
+
+ while (instance_count--) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+ PUSH_DATA (push, prim);
+
+ /* Reserve space so the long data packet is not split mid-method. */
+ nouveau_pushbuf_space(push, 8, 0, 1);
+
+ switch (index_size) {
+ case 4:
+ BEGIN_NL50(push, NV50_3D(VB_ELEMENT_U32), count);
+ nouveau_pushbuf_data(push, buf->bo, base + start * 4, count * 4);
+ break;
+ case 2:
+ /* Transfer whole dwords; U16_SETUP tells the hardware which
+ * half-dword to start at and how many indices are valid.
+ */
+ pb_start = (start & ~1) * 2;
+ pb_bytes = ((start + count + 1) & ~1) * 2 - pb_start;
+
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U16_SETUP), 1);
+ PUSH_DATA (push, (start << 31) | count);
+ BEGIN_NL50(push, NV50_3D(VB_ELEMENT_U16), pb_bytes / 4);
+ nouveau_pushbuf_data(push, buf->bo, base + pb_start, pb_bytes);
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U16_SETUP), 1);
+ PUSH_DATA (push, 0);
+ break;
+ default:
+ assert(index_size == 1);
+ pb_start = start & ~3;
+ pb_bytes = ((start + count + 3) & ~3) - pb_start;
+
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U8_SETUP), 1);
+ PUSH_DATA (push, (start << 30) | count);
+ BEGIN_NL50(push, NV50_3D(VB_ELEMENT_U8), pb_bytes / 4);
+ nouveau_pushbuf_data(push, buf->bo, base + pb_start, pb_bytes);
+ BEGIN_NV04(push, NV50_3D(VB_ELEMENT_U8_SETUP), 1);
+ PUSH_DATA (push, 0);
+ break;
+ }
+ BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+ PUSH_DATA (push, 0);
+
+ prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+ }
+ } else {
+ /* User (CPU) index buffer: push the indices inline. */
+ const void *data = nv50->idxbuf.user_buffer;
+
+ while (instance_count--) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+ PUSH_DATA (push, prim);
+ switch (index_size) {
+ case 1:
+ nv50_draw_elements_inline_u08(push, data, start, count);
+ break;
+ case 2:
+ nv50_draw_elements_inline_u16(push, data, start, count);
+ break;
+ case 4:
+ if (shorten)
+ nv50_draw_elements_inline_u32_short(push, data, start, count);
+ else
+ nv50_draw_elements_inline_u32(push, data, start, count);
+ break;
+ default:
+ assert(0);
+ return;
+ }
+ BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+ PUSH_DATA (push, 0);
+
+ prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+ }
+ }
+}
+
+/* Draw using the vertex count captured in a stream-output (transform
+ * feedback) target, via the NVA0+ DRAW_TFB methods.  The byte count is
+ * read from the target's query object on the GPU, so no CPU readback is
+ * needed.  Pre-NVA0 chips are rejected.
+ */
+static void
+nva0_draw_stream_output(struct nv50_context *nv50,
+ const struct pipe_draw_info *info)
+{
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+ struct nv50_so_target *so = nv50_so_target(info->count_from_stream_output);
+ struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+ unsigned num_instances = info->instance_count;
+ unsigned mode = nv50_prim_gl(info->mode);
+
+ if (unlikely(nv50->screen->base.class_3d < NVA0_3D_CLASS)) {
+ /* A proper implementation without waiting doesn't seem possible,
+ * so don't bother.
+ */
+ NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+ return;
+ }
+
+ /* Serialize if the SO buffer was just written by the GPU. */
+ if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+ res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ PUSH_SPACE(push, 4);
+ BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ assert(num_instances);
+ do {
+ PUSH_SPACE(push, 8);
+ BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+ PUSH_DATA (push, mode);
+ BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BASE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BYTES), 1);
+ /* The byte count comes from the query result at offset 0x4. */
+ nv50_query_pushbuf_submit(push, so->pq, 0x4);
+ BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+ PUSH_DATA (push, 0);
+
+ mode |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+ } while (--num_instances);
+}
+
+/* Pushbuf kick callback installed for the duration of a draw: update
+ * fence state and re-fence the 3D buffer context after each submission.
+ */
+static void
+nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
+{
+ struct nv50_screen *screen = chan->user_priv;
+
+ nouveau_fence_update(&screen->base, TRUE);
+
+ nv50_bufctx_fence(screen->cur_ctx->bufctx_3d, TRUE);
+}
+
+/* Top-level gallium draw entry point: record index bounds and instancing
+ * parameters, decide between push mode and hardware fetch, validate state,
+ * then dispatch to the arrays / elements / stream-output path.
+ */
+void
+nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{
+ struct nv50_context *nv50 = nv50_context(pipe);
+ struct nouveau_pushbuf *push = nv50->base.pushbuf;
+
+ /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
+ nv50->vb_elt_first = info->min_index + info->index_bias;
+ nv50->vb_elt_limit = info->max_index - info->min_index;
+ nv50->instance_off = info->start_instance;
+ nv50->instance_max = info->instance_count - 1;
+
+ /* For picking only a few vertices from a large user buffer, push is better,
+ * if index count is larger and we expect repeated vertices, suggest upload.
+ */
+ nv50->vbo_push_hint = /* the 64 is heuristic */
+ !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count));
+
+ /* If arrays state is clean but the push-mode decision flipped, force a
+ * revalidation; otherwise just refresh the user buffer uploads.
+ */
+ if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) {
+ if (!!nv50->vbo_fifo != nv50->vbo_push_hint)
+ nv50->dirty |= NV50_NEW_ARRAYS;
+ else
+ if (!nv50->vbo_fifo)
+ nv50_update_user_vbufs(nv50);
+ }
+
+ if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
+ nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
+
+ nv50_state_validate(nv50, ~0, 8); /* 8 as minimum, we use flush_notify */
+
+ push->kick_notify = nv50_draw_vbo_kick_notify;
+
+ if (nv50->vbo_fifo) {
+ /* Push mode: vertices are written into the FIFO by the CPU. */
+ nv50_push_vbo(nv50, info);
+ push->kick_notify = nv50_default_kick_notify;
+ nouveau_pushbuf_bufctx(push, NULL);
+ return;
+ }
+
+ if (nv50->state.instance_base != info->start_instance) {
+ nv50->state.instance_base = info->start_instance;
+ /* NOTE: this does not affect the shader input, should it ? */
+ BEGIN_NV04(push, NV50_3D(VB_INSTANCE_BASE), 1);
+ PUSH_DATA (push, info->start_instance);
+ }
+
+ if (nv50->base.vbo_dirty) {
+ BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
+ PUSH_DATA (push, 0);
+ nv50->base.vbo_dirty = FALSE;
+ }
+
+ if (info->indexed) {
+ /* shorten: 32-bit user indices may be packed as 16-bit, unless the
+ * restart index would not fit.
+ */
+ boolean shorten = info->max_index <= 65535;
+
+ if (info->primitive_restart != nv50->state.prim_restart) {
+ if (info->primitive_restart) {
+ BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 2);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, info->restart_index);
+
+ if (info->restart_index > 65535)
+ shorten = FALSE;
+ } else {
+ BEGIN_NV04(push, NV50_3D(PRIM_RESTART_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ }
+ nv50->state.prim_restart = info->primitive_restart;
+ } else
+ if (info->primitive_restart) {
+ BEGIN_NV04(push, NV50_3D(PRIM_RESTART_INDEX), 1);
+ PUSH_DATA (push, info->restart_index);
+
+ if (info->restart_index > 65535)
+ shorten = FALSE;
+ }
+
+ nv50_draw_elements(nv50, shorten,
+ info->mode, info->start, info->count,
+ info->instance_count, info->index_bias);
+ } else
+ if (unlikely(info->count_from_stream_output)) {
+ nva0_draw_stream_output(nv50, info);
+ } else {
+ nv50_draw_arrays(nv50,
+ info->mode, info->start, info->count,
+ info->instance_count);
+ }
+ push->kick_notify = nv50_default_kick_notify;
+
+ nv50_release_user_vbufs(nv50);
+
+ nouveau_pushbuf_bufctx(push, NULL);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
new file mode 100644
index 00000000000..e8578c8be6f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -0,0 +1,125 @@
+
+#ifndef __NV50_WINSYS_H__
+#define __NV50_WINSYS_H__
+
+#include <stdint.h>
+#include <unistd.h>
+
+#include "pipe/p_defines.h"
+
+#include "nouveau_winsys.h"
+#include "nouveau_buffer.h"
+
+
+#ifndef NV04_PFIFO_MAX_PACKET_LEN
+#define NV04_PFIFO_MAX_PACKET_LEN 2047
+#endif
+
+
+/* Add a raw BO to a bufctx bin with no associated resource (priv = NULL). */
+static INLINE void
+nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
+ unsigned flags, struct nouveau_bo *bo)
+{
+ nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
+}
+
+/* Add a driver resource to a bufctx bin, recording the resource pointer
+ * and access flags on the reference so fencing can find them later.
+ */
+static INLINE void
+nv50_add_bufctx_resident(struct nouveau_bufctx *bufctx, int bin,
+ struct nv04_resource *res, unsigned flags)
+{
+ struct nouveau_bufref *ref =
+ nouveau_bufctx_refn(bufctx, bin, res->bo, flags | res->domain);
+ ref->priv = res;
+ ref->priv_data = flags;
+}
+
+/* Convenience wrappers expanding the bin name to NV50_BIND_<bin>.
+ * NOTE(review): BCTX_REFN_bo's expansion carries a trailing semicolon,
+ * which would break an unbraced if/else around a use of this macro.
+ */
+#define BCTX_REFN_bo(ctx, bin, fl, bo) \
+ nv50_add_bufctx_resident_bo(ctx, NV50_BIND_##bin, fl, bo);
+
+#define BCTX_REFN(bctx, bin, res, acc) \
+ nv50_add_bufctx_resident(bctx, NV50_BIND_##bin, res, NOUVEAU_BO_##acc)
+
+/* Add a single BO reference directly to the pushbuf (outside any bufctx). */
+static INLINE void
+PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
+{
+ struct nouveau_pushbuf_refn ref = { bo, flags };
+ nouveau_pushbuf_refn(push, &ref, 1);
+}
+
+
+#define SUBC_3D(m) 3, (m)
+#define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
+
+#define SUBC_2D(m) 4, (m)
+#define NV50_2D(n) SUBC_2D(NV50_2D_##n)
+
+#define SUBC_M2MF(m) 5, (m)
+#define NV50_M2MF(n) SUBC_M2MF(NV50_M2MF_##n)
+
+#define SUBC_COMPUTE(m) 6, (m)
+#define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n)
+
+
+/* Build an NV50 FIFO packet header for `size` incrementing-method dwords.
+ * Layout: size in bits 18+, subchannel in bits 13-15, method in bits 0-12.
+ */
+static INLINE uint32_t
+NV50_FIFO_PKHDR(int subc, int mthd, unsigned size)
+{
+ return 0x00000000 | (size << 18) | (subc << 13) | mthd;
+}
+
+/* Same layout with the non-incrementing (NI) opcode bit set. */
+static INLINE uint32_t
+NV50_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
+{
+ return 0x40000000 | (size << 18) | (subc << 13) | mthd;
+}
+
+/* Header for a "long" non-incrementing packet; the dword count follows as
+ * a separate word (see BEGIN_NL50).
+ */
+static INLINE uint32_t
+NV50_FIFO_PKHDR_L(int subc, int mthd)
+{
+ return 0x00030000 | (subc << 13) | mthd;
+}
+
+
+/* Return the nv50 storage-type (tiling) field of a BO's allocation config. */
+static INLINE uint32_t
+nouveau_bo_memtype(const struct nouveau_bo *bo)
+{
+ return bo->config.nv50.memtype;
+}
+
+
+/* Emit the high 32 bits of a 64-bit GPU address.  No space check; the
+ * caller's BEGIN_* already reserved room for the method's data words.
+ */
+static INLINE void
+PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
+{
+ *push->cur++ = (uint32_t)(data >> 32);
+}
+
+/* Begin an incrementing-method packet of `size` data words.  Unless the
+ * translation unit opts into explicit space checking, room for header +
+ * data is reserved here.
+ */
+static INLINE void
+BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
+{
+#ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
+ PUSH_SPACE(push, size + 1);
+#endif
+ PUSH_DATA (push, NV50_FIFO_PKHDR(subc, mthd, size));
+}
+
+/* Begin a non-incrementing packet: all `size` words go to one method. */
+static INLINE void
+BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
+{
+#ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
+ PUSH_SPACE(push, size + 1);
+#endif
+ PUSH_DATA (push, NV50_FIFO_PKHDR_NI(subc, mthd, size));
+}
+
+/* long, non-incremental, nv50-only */
+/* The dword count is emitted as a separate word after the header; the
+ * data itself typically follows via nouveau_pushbuf_data.
+ */
+static INLINE void
+BEGIN_NL50(struct nouveau_pushbuf *push, int subc, int mthd, uint32_t size)
+{
+#ifndef NV50_PUSH_EXPLICIT_SPACE_CHECKING
+ PUSH_SPACE(push, 2);
+#endif
+ PUSH_DATA (push, NV50_FIFO_PKHDR_L(subc, mthd));
+ PUSH_DATA (push, size);
+}
+
+#endif /* __NV50_WINSYS_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.c b/src/gallium/drivers/nouveau/nv50/nv84_video.c
new file mode 100644
index 00000000000..3fee6d95f66
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright 2013 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+#include "util/u_format.h"
+#include "util/u_sampler.h"
+#include "vl/vl_zscan.h"
+
+#include "nv50/nv84_video.h"
+
+/* Read exactly `len` bytes of firmware from `path` into `dest`.
+ * Returns 0 on success, 1 on open failure or a short/failed read
+ * (the error is reported to stderr via %m/errno in both cases).
+ */
+static int
+nv84_copy_firmware(const char *path, void *dest, ssize_t len)
+{
+ int fd = open(path, O_RDONLY | O_CLOEXEC);
+ ssize_t r;
+ if (fd < 0) {
+ fprintf(stderr, "opening firmware file %s failed: %m\n", path);
+ return 1;
+ }
+ r = read(fd, dest, len);
+ close(fd);
+
+ if (r != len) {
+ /* Fixed typo in user-facing message: "firwmare" -> "firmware". */
+ fprintf(stderr, "reading firmware file %s failed: %m\n", path);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Return the size of the file at `path` in bytes, or the negative stat()
+ * result on failure.
+ * NOTE(review): st_size (off_t) is truncated to int; fine for the small
+ * firmware blobs this is used on, but not general-purpose.
+ */
+static int
+filesize(const char *path)
+{
+ int ret;
+ struct stat statbuf;
+
+ ret = stat(path, &statbuf);
+ if (ret)
+ return ret;
+ return statbuf.st_size;
+}
+
+/* Load one or two firmware files into a single freshly allocated VRAM BO.
+ * The second file (if any) is placed at a 0x100-aligned offset recorded in
+ * dec->vp_fw2_offset.  Returns the BO (mapping released) or NULL on any
+ * failure; on failure the BO is dereferenced.
+ */
+static struct nouveau_bo *
+nv84_load_firmwares(struct nouveau_device *dev, struct nv84_decoder *dec,
+ const char *fw1, const char *fw2)
+{
+ int ret, size1, size2 = 0;
+ struct nouveau_bo *fw;
+
+ size1 = filesize(fw1);
+ if (fw2)
+ size2 = filesize(fw2);
+ if (size1 < 0 || size2 < 0)
+ return NULL;
+
+ dec->vp_fw2_offset = align(size1, 0x100);
+
+ ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, dec->vp_fw2_offset + size2, NULL, &fw);
+ if (ret)
+ return NULL;
+ ret = nouveau_bo_map(fw, NOUVEAU_BO_WR, dec->client);
+ if (ret)
+ goto error;
+
+ ret = nv84_copy_firmware(fw1, fw->map, size1);
+ if (fw2 && !ret)
+ ret = nv84_copy_firmware(fw2, fw->map + dec->vp_fw2_offset, size2);
+ /* Unmap explicitly; the decoder only needs the GPU copy from here on. */
+ munmap(fw->map, fw->size);
+ fw->map = NULL;
+ if (!ret)
+ return fw;
+error:
+ nouveau_bo_ref(NULL, &fw);
+ return NULL;
+}
+
+/* Load the H.264 bitstream-processor firmware. */
+static struct nouveau_bo *
+nv84_load_bsp_firmware(struct nouveau_device *dev, struct nv84_decoder *dec)
+{
+ return nv84_load_firmwares(
+ dev, dec, "/lib/firmware/nouveau/nv84_bsp-h264", NULL);
+}
+
+/* Load both H.264 video-processor firmware stages into one BO. */
+static struct nouveau_bo *
+nv84_load_vp_firmware(struct nouveau_device *dev, struct nv84_decoder *dec)
+{
+ return nv84_load_firmwares(
+ dev, dec,
+ "/lib/firmware/nouveau/nv84_vp-h264-1",
+ "/lib/firmware/nouveau/nv84_vp-h264-2");
+}
+
+/* Load the MPEG1/2 video-processor firmware. */
+static struct nouveau_bo *
+nv84_load_vp_firmware_mpeg(struct nouveau_device *dev, struct nv84_decoder *dec)
+{
+ return nv84_load_firmwares(
+ dev, dec, "/lib/firmware/nouveau/nv84_vp-mpeg12", NULL);
+}
+
+/* H.264 bitstream decode entry point: run the BSP stage over the slice
+ * data, then kick the VP stage for the target surface.  Targets must be
+ * NV12.
+ */
+static void
+nv84_decoder_decode_bitstream_h264(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *video_target,
+ struct pipe_picture_desc *picture,
+ unsigned num_buffers,
+ const void *const *data,
+ const unsigned *num_bytes)
+{
+ struct nv84_decoder *dec = (struct nv84_decoder *)decoder;
+ struct nv84_video_buffer *target = (struct nv84_video_buffer *)video_target;
+
+ struct pipe_h264_picture_desc *desc = (struct pipe_h264_picture_desc *)picture;
+
+ assert(target->base.buffer_format == PIPE_FORMAT_NV12);
+
+ nv84_decoder_bsp(dec, desc, num_buffers, data, num_bytes, target);
+ nv84_decoder_vp_h264(dec, desc, target);
+}
+
+/* No-op: all work is submitted synchronously in decode_bitstream. */
+static void
+nv84_decoder_flush(struct pipe_video_codec *decoder)
+{
+}
+
+/* No per-frame setup needed for H.264; required interface stub. */
+static void
+nv84_decoder_begin_frame_h264(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *target,
+ struct pipe_picture_desc *picture)
+{
+}
+
+/* No per-frame teardown needed for H.264; required interface stub. */
+static void
+nv84_decoder_end_frame_h264(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *target,
+ struct pipe_picture_desc *picture)
+{
+}
+
+/* MPEG1/2 bitstream decode entry point: hand the slices to the shared
+ * CPU bitstream parser, which calls back into decode_macroblock.
+ */
+static void
+nv84_decoder_decode_bitstream_mpeg12(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *video_target,
+ struct pipe_picture_desc *picture,
+ unsigned num_buffers,
+ const void *const *data,
+ const unsigned *num_bytes)
+{
+ struct nv84_decoder *dec = (struct nv84_decoder *)decoder;
+
+ assert(video_target->buffer_format == PIPE_FORMAT_NV12);
+
+ vl_mpg12_bs_decode(dec->mpeg12_bs,
+ video_target,
+ (struct pipe_mpeg12_picture_desc *)picture,
+ num_buffers,
+ data,
+ num_bytes);
+}
+
+/* Prepare the CPU-visible scratch buffer for one MPEG1/2 frame: wait for
+ * the GPU to release it, set up the macroblock-info and coefficient data
+ * pointers, and de-zigzag the quantization matrices.
+ */
+static void
+nv84_decoder_begin_frame_mpeg12(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *target,
+ struct pipe_picture_desc *picture)
+{
+ struct nv84_decoder *dec = (struct nv84_decoder *)decoder;
+ struct pipe_mpeg12_picture_desc *desc = (struct pipe_mpeg12_picture_desc *)picture;
+ int i;
+
+ nouveau_bo_wait(dec->mpeg12_bo, NOUVEAU_BO_RDWR, dec->client);
+ /* Layout in mpeg12_bo: 0x100 header, then mb info, then coefficients. */
+ dec->mpeg12_mb_info = dec->mpeg12_bo->map + 0x100;
+ dec->mpeg12_data = dec->mpeg12_bo->map + 0x100 +
+ align(0x20 * mb(dec->base.width) * mb(dec->base.height), 0x100);
+ if (desc->intra_matrix) {
+ dec->zscan = desc->alternate_scan ? vl_zscan_alternate : vl_zscan_normal;
+ /* NOTE(review): non_intra_matrix is only guarded by intra_matrix
+ * being set — presumably both are provided together; confirm with
+ * the state tracker.
+ */
+ for (i = 0; i < 64; i++) {
+ dec->mpeg12_intra_matrix[i] = desc->intra_matrix[dec->zscan[i]];
+ dec->mpeg12_non_intra_matrix[i] = desc->non_intra_matrix[dec->zscan[i]];
+ }
+ /* DC coefficient scale depends on intra_dc_precision. */
+ dec->mpeg12_intra_matrix[0] = 1 << (7 - desc->intra_dc_precision);
+ }
+}
+
+/* Submit the accumulated macroblocks of the frame to the VP engine. */
+static void
+nv84_decoder_end_frame_mpeg12(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *target,
+ struct pipe_picture_desc *picture)
+{
+ nv84_decoder_vp_mpeg12(
+ (struct nv84_decoder *)decoder,
+ (struct pipe_mpeg12_picture_desc *)picture,
+ (struct nv84_video_buffer *)target);
+}
+
+/* Record each MPEG1/2 macroblock into the frame's scratch buffer; actual
+ * GPU submission happens in end_frame.
+ */
+static void
+nv84_decoder_decode_macroblock(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *target,
+ struct pipe_picture_desc *picture,
+ const struct pipe_macroblock *macroblocks,
+ unsigned num_macroblocks)
+{
+ const struct pipe_mpeg12_macroblock *mb = (const struct pipe_mpeg12_macroblock *)macroblocks;
+ for (int i = 0; i < num_macroblocks; i++, mb++) {
+ nv84_decoder_vp_mpeg12_mb(
+ (struct nv84_decoder *)decoder,
+ (struct pipe_mpeg12_picture_desc *)picture,
+ mb);
+ }
+}
+
+/* Release every GPU object the decoder owns: BOs first, then the engine
+ * objects, then per-engine bufctx/pushbuf/channel triples, and finally the
+ * client.  All nouveau_*_del/ref(NULL) calls tolerate NULL members, so
+ * this is safe to run on a partially constructed decoder.
+ */
+static void
+nv84_decoder_destroy(struct pipe_video_codec *decoder)
+{
+ struct nv84_decoder *dec = (struct nv84_decoder *)decoder;
+
+ nouveau_bo_ref(NULL, &dec->bsp_fw);
+ nouveau_bo_ref(NULL, &dec->bsp_data);
+ nouveau_bo_ref(NULL, &dec->vp_fw);
+ nouveau_bo_ref(NULL, &dec->vp_data);
+ nouveau_bo_ref(NULL, &dec->mbring);
+ nouveau_bo_ref(NULL, &dec->vpring);
+ nouveau_bo_ref(NULL, &dec->bitstream);
+ nouveau_bo_ref(NULL, &dec->vp_params);
+ nouveau_bo_ref(NULL, &dec->fence);
+
+ nouveau_object_del(&dec->bsp);
+ nouveau_object_del(&dec->vp);
+
+ nouveau_bufctx_del(&dec->bsp_bufctx);
+ nouveau_pushbuf_del(&dec->bsp_pushbuf);
+ nouveau_object_del(&dec->bsp_channel);
+
+ nouveau_bufctx_del(&dec->vp_bufctx);
+ nouveau_pushbuf_del(&dec->vp_pushbuf);
+ nouveau_object_del(&dec->vp_channel);
+
+ nouveau_client_del(&dec->client);
+
+ /* mpeg12_bs is only allocated for bitstream-entrypoint MPEG decoders. */
+ if (dec->mpeg12_bs)
+ FREE(dec->mpeg12_bs);
+ FREE(dec);
+}
+
+struct pipe_video_codec *
+nv84_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ)
+{
+ struct nv50_context *nv50 = (struct nv50_context *)context;
+ struct nouveau_screen *screen = &nv50->screen->base;
+ struct nv84_decoder *dec;
+ struct nouveau_pushbuf *bsp_push, *vp_push;
+ struct nv50_surface surf;
+ struct nv50_miptree mip;
+ union pipe_color_union color;
+ struct nv04_fifo nv04_data = { .vram = 0xbeef0201, .gart = 0xbeef0202 };
+ int ret, i;
+ int is_h264 = u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC;
+ int is_mpeg12 = u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_MPEG12;
+
+ if (getenv("XVMC_VL"))
+ return vl_create_decoder(context, templ);
+
+ if ((is_h264 && templ->entrypoint != PIPE_VIDEO_ENTRYPOINT_BITSTREAM) ||
+ (is_mpeg12 && templ->entrypoint > PIPE_VIDEO_ENTRYPOINT_IDCT)) {
+ debug_printf("%x\n", templ->entrypoint);
+ return NULL;
+ }
+
+ if (!is_h264 && !is_mpeg12) {
+ debug_printf("invalid profile: %x\n", templ->profile);
+ return NULL;
+ }
+
+ dec = CALLOC_STRUCT(nv84_decoder);
+ if (!dec)
+ return NULL;
+
+ dec->base = *templ;
+ dec->base.context = context;
+ dec->base.destroy = nv84_decoder_destroy;
+ dec->base.flush = nv84_decoder_flush;
+ if (is_h264) {
+ dec->base.decode_bitstream = nv84_decoder_decode_bitstream_h264;
+ dec->base.begin_frame = nv84_decoder_begin_frame_h264;
+ dec->base.end_frame = nv84_decoder_end_frame_h264;
+
+ dec->frame_mbs = mb(dec->base.width) * mb_half(dec->base.height) * 2;
+ dec->frame_size = dec->frame_mbs << 8;
+ dec->vpring_deblock = align(0x30 * dec->frame_mbs, 0x100);
+ dec->vpring_residual = 0x2000 + MAX2(0x32000, 0x600 * dec->frame_mbs);
+ dec->vpring_ctrl = MAX2(0x10000, align(0x1080 + 0x144 * dec->frame_mbs, 0x100));
+ } else if (is_mpeg12) {
+ dec->base.decode_macroblock = nv84_decoder_decode_macroblock;
+ dec->base.begin_frame = nv84_decoder_begin_frame_mpeg12;
+ dec->base.end_frame = nv84_decoder_end_frame_mpeg12;
+
+ if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ dec->mpeg12_bs = CALLOC_STRUCT(vl_mpg12_bs);
+ if (!dec->mpeg12_bs)
+ goto fail;
+ vl_mpg12_bs_init(dec->mpeg12_bs, &dec->base);
+ dec->base.decode_bitstream = nv84_decoder_decode_bitstream_mpeg12;
+ }
+ } else {
+ goto fail;
+ }
+
+ ret = nouveau_client_new(screen->device, &dec->client);
+ if (ret)
+ goto fail;
+
+ if (is_h264) {
+ ret = nouveau_object_new(&screen->device->object, 0,
+ NOUVEAU_FIFO_CHANNEL_CLASS,
+ &nv04_data, sizeof(nv04_data), &dec->bsp_channel);
+ if (ret)
+ goto fail;
+
+ ret = nouveau_pushbuf_new(dec->client, dec->bsp_channel, 4,
+ 32 * 1024, true, &dec->bsp_pushbuf);
+ if (ret)
+ goto fail;
+
+ ret = nouveau_bufctx_new(dec->client, 1, &dec->bsp_bufctx);
+ if (ret)
+ goto fail;
+ }
+
+ ret = nouveau_object_new(&screen->device->object, 0,
+ NOUVEAU_FIFO_CHANNEL_CLASS,
+ &nv04_data, sizeof(nv04_data), &dec->vp_channel);
+ if (ret)
+ goto fail;
+ ret = nouveau_pushbuf_new(dec->client, dec->vp_channel, 4,
+ 32 * 1024, true, &dec->vp_pushbuf);
+ if (ret)
+ goto fail;
+
+ ret = nouveau_bufctx_new(dec->client, 1, &dec->vp_bufctx);
+ if (ret)
+ goto fail;
+
+ bsp_push = dec->bsp_pushbuf;
+ vp_push = dec->vp_pushbuf;
+
+ if (is_h264) {
+ dec->bsp_fw = nv84_load_bsp_firmware(screen->device, dec);
+ dec->vp_fw = nv84_load_vp_firmware(screen->device, dec);
+ if (!dec->bsp_fw || !dec->vp_fw)
+ goto fail;
+ }
+ if (is_mpeg12) {
+ dec->vp_fw = nv84_load_vp_firmware_mpeg(screen->device, dec);
+ if (!dec->vp_fw)
+ goto fail;
+ }
+
+ if (is_h264) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP,
+ 0, 0x40000, NULL, &dec->bsp_data);
+ if (ret)
+ goto fail;
+ }
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP,
+ 0, 0x40000, NULL, &dec->vp_data);
+ if (ret)
+ goto fail;
+ if (is_h264) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP,
+ 0,
+ 2 * (dec->vpring_deblock +
+ dec->vpring_residual +
+ dec->vpring_ctrl +
+ 0x1000),
+ NULL, &dec->vpring);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP,
+ 0,
+ (templ->max_references + 1) * dec->frame_mbs * 0x40 +
+ dec->frame_size + 0x2000,
+ NULL, &dec->mbring);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART,
+ 0, 2 * (0x700 + MAX2(0x40000, 0x800 + 0x180 * dec->frame_mbs)),
+ NULL, &dec->bitstream);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_map(dec->bitstream, NOUVEAU_BO_WR, dec->client);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART,
+ 0, 0x2000, NULL, &dec->vp_params);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_map(dec->vp_params, NOUVEAU_BO_WR, dec->client);
+ if (ret)
+ goto fail;
+ }
+ if (is_mpeg12) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART,
+ 0,
+ align(0x20 * mb(templ->width) * mb(templ->height), 0x100) +
+ (6 * 64 * 8) * mb(templ->width) * mb(templ->height) + 0x100,
+ NULL, &dec->mpeg12_bo);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_map(dec->mpeg12_bo, NOUVEAU_BO_WR, dec->client);
+ if (ret)
+ goto fail;
+ }
+
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0, 0x1000, NULL, &dec->fence);
+ if (ret)
+ goto fail;
+ ret = nouveau_bo_map(dec->fence, NOUVEAU_BO_WR, dec->client);
+ if (ret)
+ goto fail;
+ *(uint32_t *)dec->fence->map = 0;
+
+ if (is_h264) {
+ nouveau_pushbuf_bufctx(bsp_push, dec->bsp_bufctx);
+ nouveau_bufctx_refn(dec->bsp_bufctx, 0,
+ dec->bsp_fw, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+ nouveau_bufctx_refn(dec->bsp_bufctx, 0,
+ dec->bsp_data, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+ }
+
+ nouveau_pushbuf_bufctx(vp_push, dec->vp_bufctx);
+ nouveau_bufctx_refn(dec->vp_bufctx, 0, dec->vp_fw,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+ nouveau_bufctx_refn(dec->vp_bufctx, 0, dec->vp_data,
+ NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+
+ if (is_h264 && !ret)
+ ret = nouveau_object_new(dec->bsp_channel, 0xbeef74b0, 0x74b0,
+ NULL, 0, &dec->bsp);
+
+ if (!ret)
+ ret = nouveau_object_new(dec->vp_channel, 0xbeef7476, 0x7476,
+ NULL, 0, &dec->vp);
+
+ if (ret)
+ goto fail;
+
+
+ if (is_h264) {
+ /* Zero out some parts of mbring/vpring. there's gotta be some cleaner way
+ * of doing this... perhaps makes sense to just copy the relevant logic
+ * here. */
+ color.f[0] = color.f[1] = color.f[2] = color.f[3] = 0;
+ surf.offset = dec->frame_size;
+ surf.width = 64;
+ surf.height = (templ->max_references + 1) * dec->frame_mbs / 4;
+ surf.depth = 1;
+ surf.base.format = PIPE_FORMAT_B8G8R8A8_UNORM;
+ surf.base.u.tex.level = 0;
+ surf.base.texture = &mip.base.base;
+ mip.level[0].tile_mode = 0;
+ mip.level[0].pitch = surf.width * 4;
+ mip.base.domain = NOUVEAU_BO_VRAM;
+ mip.base.bo = dec->mbring;
+ context->clear_render_target(context, &surf.base, &color, 0, 0, 64, 4760);
+ surf.offset = dec->vpring->size / 2 - 0x1000;
+ surf.width = 1024;
+ surf.height = 1;
+ mip.level[0].pitch = surf.width * 4;
+ mip.base.bo = dec->vpring;
+ context->clear_render_target(context, &surf.base, &color, 0, 0, 1024, 1);
+ surf.offset = dec->vpring->size - 0x1000;
+ context->clear_render_target(context, &surf.base, &color, 0, 0, 1024, 1);
+
+ PUSH_SPACE(screen->pushbuf, 5);
+ PUSH_REFN(screen->pushbuf, dec->fence, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR);
+ /* The clear_render_target is done via 3D engine, so use it to write to a
+ * sempahore to indicate that it's done.
+ */
+ BEGIN_NV04(screen->pushbuf, NV50_3D(QUERY_ADDRESS_HIGH), 4);
+ PUSH_DATAh(screen->pushbuf, dec->fence->offset);
+ PUSH_DATA (screen->pushbuf, dec->fence->offset);
+ PUSH_DATA (screen->pushbuf, 1);
+ PUSH_DATA (screen->pushbuf, 0xf010);
+ PUSH_KICK (screen->pushbuf);
+
+ PUSH_SPACE(bsp_push, 2 + 12 + 2 + 4 + 3);
+
+ BEGIN_NV04(bsp_push, SUBC_BSP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (bsp_push, dec->bsp->handle);
+
+ BEGIN_NV04(bsp_push, SUBC_BSP(0x180), 11);
+ for (i = 0; i < 11; i++)
+ PUSH_DATA(bsp_push, nv04_data.vram);
+ BEGIN_NV04(bsp_push, SUBC_BSP(0x1b8), 1);
+ PUSH_DATA (bsp_push, nv04_data.vram);
+
+ BEGIN_NV04(bsp_push, SUBC_BSP(0x600), 3);
+ PUSH_DATAh(bsp_push, dec->bsp_fw->offset);
+ PUSH_DATA (bsp_push, dec->bsp_fw->offset);
+ PUSH_DATA (bsp_push, dec->bsp_fw->size);
+
+ BEGIN_NV04(bsp_push, SUBC_BSP(0x628), 2);
+ PUSH_DATA (bsp_push, dec->bsp_data->offset >> 8);
+ PUSH_DATA (bsp_push, dec->bsp_data->size);
+ PUSH_KICK (bsp_push);
+ }
+
+ PUSH_SPACE(vp_push, 2 + 12 + 2 + 4 + 3);
+
+ BEGIN_NV04(vp_push, SUBC_VP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (vp_push, dec->vp->handle);
+
+ BEGIN_NV04(vp_push, SUBC_VP(0x180), 11);
+ for (i = 0; i < 11; i++)
+ PUSH_DATA(vp_push, nv04_data.vram);
+
+ BEGIN_NV04(vp_push, SUBC_VP(0x1b8), 1);
+ PUSH_DATA (vp_push, nv04_data.vram);
+
+ BEGIN_NV04(vp_push, SUBC_VP(0x600), 3);
+ PUSH_DATAh(vp_push, dec->vp_fw->offset);
+ PUSH_DATA (vp_push, dec->vp_fw->offset);
+ PUSH_DATA (vp_push, dec->vp_fw->size);
+
+ BEGIN_NV04(vp_push, SUBC_VP(0x628), 2);
+ PUSH_DATA (vp_push, dec->vp_data->offset >> 8);
+ PUSH_DATA (vp_push, dec->vp_data->size);
+ PUSH_KICK (vp_push);
+
+ return &dec->base;
+fail:
+ nv84_decoder_destroy(&dec->base);
+ return NULL;
+}
+
+static struct pipe_sampler_view **
+nv84_video_buffer_sampler_view_planes(struct pipe_video_buffer *buffer)
+{
+   /* Hand back the per-plane (Y, UV) sampler views kept on the wrapper. */
+   return ((struct nv84_video_buffer *)buffer)->sampler_view_planes;
+}
+
+static struct pipe_sampler_view **
+nv84_video_buffer_sampler_view_components(struct pipe_video_buffer *buffer)
+{
+   /* Hand back the per-component (single channel) sampler views. */
+   return ((struct nv84_video_buffer *)buffer)->sampler_view_components;
+}
+
+static struct pipe_surface **
+nv84_video_buffer_surfaces(struct pipe_video_buffer *buffer)
+{
+   /* Hand back the render surfaces (two per plane, one per field). */
+   return ((struct nv84_video_buffer *)buffer)->surfaces;
+}
+
+/* Release all GPU objects owned by an nv84 video buffer and free it. */
+static void
+nv84_video_buffer_destroy(struct pipe_video_buffer *buffer)
+{
+   struct nv84_video_buffer *buf = (struct nv84_video_buffer *)buffer;
+   unsigned i;
+
+   assert(buf);
+
+   /* Drop the per-plane resources/views plus both field surfaces of each
+    * plane; the reference helpers treat NULL entries as no-ops. */
+   for (i = 0; i < VL_NUM_COMPONENTS; ++i) {
+      pipe_resource_reference(&buf->resources[i], NULL);
+      pipe_sampler_view_reference(&buf->sampler_view_planes[i], NULL);
+      pipe_sampler_view_reference(&buf->sampler_view_components[i], NULL);
+      pipe_surface_reference(&buf->surfaces[i * 2], NULL);
+      pipe_surface_reference(&buf->surfaces[i * 2 + 1], NULL);
+   }
+
+   /* Unreference the backing BOs (interlaced holds the planes, full is the
+    * decoder's scratch copy). */
+   nouveau_bo_ref(NULL, &buf->interlaced);
+   nouveau_bo_ref(NULL, &buf->full);
+
+   FREE(buffer);
+}
+
+/*
+ * Create an interlaced NV12 video buffer.  Both planes (Y and UV) live
+ * adjacently in a single VRAM BO, as the VP engine requires; the two pipe
+ * resources are views into that BO.  Anything this path cannot handle is
+ * routed to the generic vl implementation.
+ */
+struct pipe_video_buffer *
+nv84_video_buffer_create(struct pipe_context *pipe,
+                         const struct pipe_video_buffer *template)
+{
+   struct nv84_video_buffer *buffer;
+   struct pipe_resource templ;
+   unsigned i, j, component;
+   struct pipe_sampler_view sv_templ;
+   struct pipe_surface surf_templ;
+   struct nv50_miptree *mt0, *mt1;
+   struct nouveau_bo *empty = NULL;
+   struct nouveau_screen *screen = &((struct nv50_context *)pipe)->screen->base;
+   union nouveau_bo_config cfg;
+   unsigned bo_size;
+
+   /* Fall back to the generic buffer for non-NV12 formats or when the
+    * XVMC_VL environment override is set. */
+   if (getenv("XVMC_VL") || template->buffer_format != PIPE_FORMAT_NV12)
+      return vl_video_buffer_create(pipe, template);
+
+   if (!template->interlaced) {
+      debug_printf("Require interlaced video buffers\n");
+      return NULL;
+   }
+   if (template->chroma_format != PIPE_VIDEO_CHROMA_FORMAT_420) {
+      debug_printf("Must use 4:2:0 format\n");
+      return NULL;
+   }
+
+   /*
+    * Note that there are always going to be exactly two planes, one for Y,
+    * and one for UV. These are also the resources. VP expects these to be
+    * adjacent, so they need to belong to the same BO.
+    */
+
+   buffer = CALLOC_STRUCT(nv84_video_buffer);
+   if (!buffer) return NULL;
+
+   /* No motion-vector index assigned yet; the decoder picks one later. */
+   buffer->mvidx = -1;
+
+   buffer->base.buffer_format = template->buffer_format;
+   buffer->base.context = pipe;
+   buffer->base.destroy = nv84_video_buffer_destroy;
+   buffer->base.chroma_format = template->chroma_format;
+   buffer->base.width = template->width;
+   buffer->base.height = template->height;
+   buffer->base.get_sampler_view_planes = nv84_video_buffer_sampler_view_planes;
+   buffer->base.get_sampler_view_components = nv84_video_buffer_sampler_view_components;
+   buffer->base.get_surfaces = nv84_video_buffer_surfaces;
+   buffer->base.interlaced = true;
+
+   /* Y plane: one array layer per field (height halved, array_size 2).
+    * NOALLOC: we attach our own BO to the miptrees below. */
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D_ARRAY;
+   templ.depth0 = 1;
+   templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   templ.format = PIPE_FORMAT_R8_UNORM;
+   templ.width0 = align(template->width, 2);
+   templ.height0 = align(template->height, 4) / 2;
+   templ.flags = NV50_RESOURCE_FLAG_VIDEO | NV50_RESOURCE_FLAG_NOALLOC;
+   templ.array_size = 2;
+
+   /* NOTE(review): only the nv50 members of the config union are set; the
+    * rest stays uninitialized -- presumably nouveau_bo_new only reads the
+    * nv50 part on this chipset, worth confirming. */
+   cfg.nv50.tile_mode = 0x20;
+   cfg.nv50.memtype = 0x70;
+
+   buffer->resources[0] = pipe->screen->resource_create(pipe->screen, &templ);
+   if (!buffer->resources[0])
+      goto error;
+
+   /* UV plane: interleaved chroma at half resolution in each dimension. */
+   templ.format = PIPE_FORMAT_R8G8_UNORM;
+   templ.width0 /= 2;
+   templ.height0 /= 2;
+   buffer->resources[1] = pipe->screen->resource_create(pipe->screen, &templ);
+   if (!buffer->resources[1])
+      goto error;
+
+   mt0 = nv50_miptree(buffer->resources[0]);
+   mt1 = nv50_miptree(buffer->resources[1]);
+
+   /* One BO big enough for both planes back to back. */
+   bo_size = mt0->total_size + mt1->total_size;
+   if (nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP, 0,
+                      bo_size, &cfg, &buffer->interlaced))
+      goto error;
+   /* XXX Change reference frame management so that this is only allocated in
+    * the decoder when necessary. */
+   if (nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM | NOUVEAU_BO_NOSNOOP, 0,
+                      bo_size, &cfg, &buffer->full))
+      goto error;
+
+   /* Point both miptrees at the shared BO; UV lives right after Y. */
+   mt0->base.bo = buffer->interlaced;
+   mt0->base.domain = NOUVEAU_BO_VRAM;
+   mt0->base.offset = 0;
+   mt0->base.address = buffer->interlaced->offset + mt0->base.offset;
+   nouveau_bo_ref(buffer->interlaced, &empty);
+
+   mt1->base.bo = buffer->interlaced;
+   mt1->base.domain = NOUVEAU_BO_VRAM;
+   mt1->base.offset = mt0->layer_stride * 2;
+   mt1->base.address = buffer->interlaced->offset + mt1->base.offset;
+   /* NOTE(review): `empty` already points at this BO, so this second ref is
+    * a net refcount no-op -- confirm this matches the two miptrees each
+    * dropping a reference on destruction. */
+   nouveau_bo_ref(buffer->interlaced, &empty);
+
+   /* Per-plane views, plus one single-channel view per component. */
+   memset(&sv_templ, 0, sizeof(sv_templ));
+   for (component = 0, i = 0; i < 2; ++i ) {
+      struct pipe_resource *res = buffer->resources[i];
+      unsigned nr_components = util_format_get_nr_components(res->format);
+
+      u_sampler_view_default_template(&sv_templ, res, res->format);
+      buffer->sampler_view_planes[i] =
+         pipe->create_sampler_view(pipe, res, &sv_templ);
+      if (!buffer->sampler_view_planes[i])
+         goto error;
+
+      for (j = 0; j < nr_components; ++j, ++component) {
+         sv_templ.swizzle_r = sv_templ.swizzle_g = sv_templ.swizzle_b =
+            PIPE_SWIZZLE_RED + j;
+         sv_templ.swizzle_a = PIPE_SWIZZLE_ONE;
+
+         buffer->sampler_view_components[component] =
+            pipe->create_sampler_view(pipe, res, &sv_templ);
+         if (!buffer->sampler_view_components[component])
+            goto error;
+      }
+   }
+
+   /* Two render surfaces per plane, one per field (array layer 0 and 1). */
+   memset(&surf_templ, 0, sizeof(surf_templ));
+   for (j = 0; j < 2; ++j) {
+      surf_templ.format = buffer->resources[j]->format;
+      surf_templ.u.tex.first_layer = surf_templ.u.tex.last_layer = 0;
+      buffer->surfaces[j * 2] =
+         pipe->create_surface(pipe, buffer->resources[j], &surf_templ);
+      if (!buffer->surfaces[j * 2])
+         goto error;
+
+      surf_templ.u.tex.first_layer = surf_templ.u.tex.last_layer = 1;
+      buffer->surfaces[j * 2 + 1] =
+         pipe->create_surface(pipe, buffer->resources[j], &surf_templ);
+      if (!buffer->surfaces[j * 2 + 1])
+         goto error;
+   }
+
+   return &buffer->base;
+
+error:
+   /* buffer was CALLOC'd, so unset pointers are NULL and destroy copes with
+    * a partially constructed object. */
+   nv84_video_buffer_destroy(&buffer->base);
+   return NULL;
+}
+
+/* Answer pipe video capability queries for the nv84 BSP/VP decoder. */
+int
+nv84_screen_get_video_param(struct pipe_screen *pscreen,
+                            enum pipe_video_profile profile,
+                            enum pipe_video_entrypoint entrypoint,
+                            enum pipe_video_cap param)
+{
+   switch (param) {
+   case PIPE_VIDEO_CAP_SUPPORTED: {
+      /* Only H.264 and MPEG1/2 are wired up to the hardware engines. */
+      enum pipe_video_format codec = u_reduce_video_profile(profile);
+      return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC ||
+             codec == PIPE_VIDEO_FORMAT_MPEG12;
+   }
+   case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_VIDEO_CAP_MAX_WIDTH:
+   case PIPE_VIDEO_CAP_MAX_HEIGHT:
+      return 2048;
+   case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+      return PIPE_FORMAT_NV12;
+   case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
+   case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
+      return true;
+   case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
+      return false;
+   case PIPE_VIDEO_CAP_MAX_LEVEL:
+      if (profile == PIPE_VIDEO_PROFILE_MPEG1)
+         return 0;
+      if (profile == PIPE_VIDEO_PROFILE_MPEG2_SIMPLE ||
+          profile == PIPE_VIDEO_PROFILE_MPEG2_MAIN)
+         return 3;
+      if (profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE ||
+          profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN ||
+          profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH)
+         return 41;
+      debug_printf("unknown video profile: %d\n", profile);
+      return 0;
+   default:
+      debug_printf("unknown video param: %d\n", param);
+      return 0;
+   }
+}
+
+/* Report which surface formats work for a given profile/entrypoint. */
+boolean
+nv84_screen_video_supported(struct pipe_screen *screen,
+                            enum pipe_format format,
+                            enum pipe_video_profile profile,
+                            enum pipe_video_entrypoint entrypoint)
+{
+   /* With no specific profile, defer to the generic vl format check;
+    * actual decoding only ever runs on NV12 surfaces. */
+   if (profile == PIPE_VIDEO_PROFILE_UNKNOWN)
+      return vl_video_buffer_is_format_supported(screen, format, profile,
+                                                 entrypoint);
+
+   return format == PIPE_FORMAT_NV12;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.h b/src/gallium/drivers/nouveau/nv50/nv84_video.h
new file mode 100644
index 00000000000..2edba389dbf
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2013 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef NV84_VIDEO_H_
+#define NV84_VIDEO_H_
+
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
+#include "vl/vl_types.h"
+
+#include "vl/vl_mpeg12_bitstream.h"
+
+#include "util/u_video.h"
+
+#include "nv50/nv50_context.h"
+
+/* These are expected to be on their own pushbufs */
+#define SUBC_BSP(m) 2, (m)
+#define SUBC_VP(m) 2, (m)
+
+/* Convenience view of the codec-specific picture descriptor handed in
+ * through the generic pipe_picture_desc pointer. */
+union pipe_desc {
+   struct pipe_picture_desc *base;
+   struct pipe_mpeg12_picture_desc *mpeg12;
+   struct pipe_mpeg4_picture_desc *mpeg4;
+   struct pipe_vc1_picture_desc *vc1;
+   struct pipe_h264_picture_desc *h264;
+};
+
+struct nv84_video_buffer {
+   struct pipe_video_buffer base;
+   /* Two planes in practice (Y and UV), exposed as resources/views. */
+   struct pipe_resource *resources[VL_NUM_COMPONENTS];
+   struct pipe_sampler_view *sampler_view_planes[VL_NUM_COMPONENTS];
+   struct pipe_sampler_view *sampler_view_components[VL_NUM_COMPONENTS];
+   /* Two surfaces per plane: one for each interlaced field (layer 0/1). */
+   struct pipe_surface *surfaces[VL_NUM_COMPONENTS * 2];
+
+   /* interlaced backs both planes in a single BO; full is extra decoder
+    * scratch (see the XXX about reference frame management in creation). */
+   struct nouveau_bo *interlaced, *full;
+   /* Motion-vector index slot assigned by the decoder; -1 until used. */
+   int mvidx;
+   /* H.264 frame_num bookkeeping, tracked relative to the last IDR. */
+   unsigned frame_num, frame_num_max;
+};
+
+struct nv84_decoder {
+   struct pipe_video_codec base;
+   struct nouveau_client *client;
+   struct nouveau_object *bsp_channel, *vp_channel, *bsp, *vp;
+   struct nouveau_pushbuf *bsp_pushbuf, *vp_pushbuf;
+   struct nouveau_bufctx *bsp_bufctx, *vp_bufctx;
+
+   /* Firmware images and per-engine scratch data for BSP and VP. */
+   struct nouveau_bo *bsp_fw, *bsp_data;
+   struct nouveau_bo *vp_fw, *vp_data;
+   struct nouveau_bo *mbring, *vpring;
+
+   /*
+    * Synchronization semaphore; states:
+    * 0: init
+    * 1: vpring/mbring cleared, bsp is ready
+    * 2: bsp is done, vp is ready
+    * and then VP sets it back to 1 when it finishes.
+    */
+   struct nouveau_bo *fence;
+
+   /* GART staging: picture params + slice data for BSP, and VP params. */
+   struct nouveau_bo *bitstream;
+   struct nouveau_bo *vp_params;
+
+   size_t vp_fw2_offset;
+
+   unsigned frame_mbs, frame_size;
+   /* VPRING layout:
+      RESIDUAL
+      CTRL
+      DEBLOCK
+      0x1000
+   */
+   unsigned vpring_deblock, vpring_residual, vpring_ctrl;
+
+   /* MPEG1/2 path: CPU bitstream parser plus a staging BO for macroblock
+    * info and coefficient data. */
+   struct vl_mpg12_bs *mpeg12_bs;
+
+   struct nouveau_bo *mpeg12_bo;
+   void *mpeg12_mb_info;
+   uint16_t *mpeg12_data;
+   const int *zscan;
+   uint8_t mpeg12_intra_matrix[64];
+   uint8_t mpeg12_non_intra_matrix[64];
+};
+
+/* Pixel dimension -> count of 16-pixel macroblocks, rounded up. */
+static INLINE uint32_t mb(uint32_t coord)
+{
+   return (coord + 15) / 16;
+}
+
+/* Pixel dimension -> count of 32-pixel units (macroblock pairs), rounded
+ * up. */
+static INLINE uint32_t mb_half(uint32_t coord)
+{
+   return (coord + 31) / 32;
+}
+
+/* Upload one H.264 frame's slice data and kick the BSP parsing stage
+ * (implemented in nv84_video_bsp.c). */
+int
+nv84_decoder_bsp(struct nv84_decoder *dec,
+                 struct pipe_h264_picture_desc *desc,
+                 unsigned num_buffers,
+                 const void *const *data,
+                 const unsigned *num_bytes,
+                 struct nv84_video_buffer *dest);
+
+/* H.264 VP (reconstruction) stage, implemented in nv84_video_vp.c. */
+void
+nv84_decoder_vp_h264(struct nv84_decoder *dec,
+                     struct pipe_h264_picture_desc *desc,
+                     struct nv84_video_buffer *dest);
+
+/* Stage one MPEG1/2 macroblock into the decoder's staging buffer. */
+void
+nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
+                          struct pipe_mpeg12_picture_desc *desc,
+                          const struct pipe_mpeg12_macroblock *mb);
+
+/* MPEG1/2 VP stage for the macroblocks staged via the _mb() call. */
+void
+nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
+                       struct pipe_mpeg12_picture_desc *desc,
+                       struct nv84_video_buffer *dest);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_bsp.c
new file mode 100644
index 00000000000..86047b5f463
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video_bsp.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright 2013 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv84_video.h"
+
+/* Picture/sequence parameter block consumed by the BSP firmware.  It is
+ * copied to offset 0 of dec->bitstream before the engine is kicked (see
+ * nv84_decoder_bsp); sizeof must be exactly 0x530.  Trailing hex comments
+ * are the byte offsets the firmware expects. */
+struct iparm {
+   struct iseqparm {
+      uint32_t chroma_format_idc; // 00
+      uint32_t pad[(0x128 - 0x4) / 4];
+      uint32_t log2_max_frame_num_minus4; // 128
+      uint32_t pic_order_cnt_type; // 12c
+      uint32_t log2_max_pic_order_cnt_lsb_minus4; // 130
+      uint32_t delta_pic_order_always_zero_flag; // 134
+      uint32_t num_ref_frames; // 138
+      uint32_t pic_width_in_mbs_minus1; // 13c
+      uint32_t pic_height_in_map_units_minus1; // 140
+      uint32_t frame_mbs_only_flag; // 144
+      uint32_t mb_adaptive_frame_field_flag; // 148
+      uint32_t direct_8x8_inference_flag; // 14c
+   } iseqparm; // 000
+   struct ipicparm {
+      uint32_t entropy_coding_mode_flag; // 00
+      uint32_t pic_order_present_flag; // 04
+      uint32_t num_slice_groups_minus1; // 08
+      uint32_t slice_group_map_type; // 0c
+      uint32_t pad1[0x60 / 4];
+      uint32_t u70; // 70
+      uint32_t u74; // 74
+      uint32_t u78; // 78
+      uint32_t num_ref_idx_l0_active_minus1; // 7c
+      uint32_t num_ref_idx_l1_active_minus1; // 80
+      uint32_t weighted_pred_flag; // 84
+      uint32_t weighted_bipred_idc; // 88
+      uint32_t pic_init_qp_minus26; // 8c
+      uint32_t chroma_qp_index_offset; // 90
+      uint32_t deblocking_filter_control_present_flag; // 94
+      uint32_t constrained_intra_pred_flag; // 98
+      uint32_t redundant_pic_cnt_present_flag; // 9c
+      uint32_t transform_8x8_mode_flag; // a0
+      uint32_t pad2[(0x1c8 - 0xa0 - 4) / 4];
+      uint32_t second_chroma_qp_index_offset; // 1c8
+      uint32_t u1cc; // 1cc
+      uint32_t curr_pic_order_cnt; // 1d0
+      uint32_t field_order_cnt[2]; // 1d4
+      uint32_t curr_mvidx; // 1dc
+      /* One entry per H.264 reference frame (DPB slot). */
+      struct iref {
+         uint32_t u00; // 00
+         uint32_t field_is_ref; // 04 // bit0: top, bit1: bottom
+         uint8_t is_long_term; // 08
+         uint8_t non_existing; // 09
+         uint32_t frame_idx; // 0c
+         uint32_t field_order_cnt[2]; // 10
+         uint32_t mvidx; // 18
+         uint8_t field_pic_flag; // 1c
+         // 20
+      } refs[0x10]; // 1e0
+   } ipicparm; // 150
+};
+
+/*
+ * Upload picture parameters and slice data for one H.264 frame into the
+ * bitstream BO, then kick the BSP engine to parse it into mbring/vpring
+ * for the subsequent VP stage.  The kick is asynchronous; completion is
+ * signalled through dec->fence.  Always returns 0.
+ */
+int
+nv84_decoder_bsp(struct nv84_decoder *dec,
+                 struct pipe_h264_picture_desc *desc,
+                 unsigned num_buffers,
+                 const void *const *data,
+                 const unsigned *num_bytes,
+                 struct nv84_video_buffer *dest)
+{
+   struct iparm params;
+   uint32_t more_params[0x44 / 4] = {0};
+   unsigned total_bytes = 0;
+   int i;
+   /* Terminator pattern appended after the slice data. */
+   static const uint32_t end[] = {0x0b010000, 0, 0x0b010000, 0};
+   /* Tracks which mvidx slots are taken by the reference frames. */
+   char indexes[17] = {0};
+   struct nouveau_pushbuf *push = dec->bsp_pushbuf;
+   struct nouveau_pushbuf_refn bo_refs[] = {
+      { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+      { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+      { dec->bitstream, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
+      { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+   };
+
+   /* Make sure any previous frame is done before overwriting the shared
+    * staging buffers. */
+   nouveau_bo_wait(dec->fence, NOUVEAU_BO_RDWR, dec->client);
+
+   /* The firmware depends on this exact layout. */
+   STATIC_ASSERT(sizeof(struct iparm) == 0x530);
+
+   memset(&params, 0, sizeof(params));
+
+   dest->frame_num = dest->frame_num_max = desc->frame_num;
+
+   /* Fill in one iref entry per reference frame in the descriptor. */
+   for (i = 0; i < 16; i++) {
+      struct iref *ref = &params.ipicparm.refs[i];
+      struct nv84_video_buffer *frame = (struct nv84_video_buffer *)desc->ref[i];
+      if (!frame) break;
+      /* The frame index is relative to the last IDR frame. So once the frame
+       * num goes back to 0, previous reference frames need to have a negative
+       * index.
+       */
+      if (desc->frame_num >= frame->frame_num_max) {
+         frame->frame_num_max = desc->frame_num;
+      } else {
+         frame->frame_num -= frame->frame_num_max + 1;
+         frame->frame_num_max = desc->frame_num;
+      }
+      ref->non_existing = 0;
+      ref->field_is_ref = (desc->top_is_reference[i] ? 1 : 0) |
+                          (desc->bottom_is_reference[i] ? 2 : 0);
+      ref->is_long_term = desc->is_long_term[i];
+      ref->field_order_cnt[0] = desc->field_order_cnt_list[i][0];
+      ref->field_order_cnt[1] = desc->field_order_cnt_list[i][1];
+      ref->frame_idx = frame->frame_num;
+      ref->u00 = ref->mvidx = frame->mvidx;
+      ref->field_pic_flag = desc->field_pic_flag;
+      indexes[frame->mvidx] = 1;
+   }
+
+   /* Needs to be adjusted if we ever support non-4:2:0 videos */
+   params.iseqparm.chroma_format_idc = 1;
+
+   params.iseqparm.pic_width_in_mbs_minus1 = mb(dec->base.width) - 1;
+   if (desc->field_pic_flag || desc->mb_adaptive_frame_field_flag)
+      params.iseqparm.pic_height_in_map_units_minus1 = mb_half(dec->base.height) - 1;
+   else
+      params.iseqparm.pic_height_in_map_units_minus1 = mb(dec->base.height) - 1;
+
+   if (desc->bottom_field_flag)
+      params.ipicparm.curr_pic_order_cnt = desc->field_order_cnt[1];
+   else
+      params.ipicparm.curr_pic_order_cnt = desc->field_order_cnt[0];
+   params.ipicparm.field_order_cnt[0] = desc->field_order_cnt[0];
+   params.ipicparm.field_order_cnt[1] = desc->field_order_cnt[1];
+   if (desc->is_reference) {
+      /* Grab the first free mvidx slot for this frame. */
+      if (dest->mvidx < 0) {
+         for (i = 0; i < desc->num_ref_frames + 1; i++) {
+            if (!indexes[i]) {
+               dest->mvidx = i;
+               break;
+            }
+         }
+         assert(i != desc->num_ref_frames + 1);
+      }
+
+      params.ipicparm.u1cc = params.ipicparm.curr_mvidx = dest->mvidx;
+   }
+
+   /* Straight copies of the remaining SPS/PPS fields. */
+   params.iseqparm.num_ref_frames = desc->num_ref_frames;
+   params.iseqparm.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
+   params.ipicparm.constrained_intra_pred_flag = desc->constrained_intra_pred_flag;
+   params.ipicparm.weighted_pred_flag = desc->weighted_pred_flag;
+   params.ipicparm.weighted_bipred_idc = desc->weighted_bipred_idc;
+   params.iseqparm.frame_mbs_only_flag = desc->frame_mbs_only_flag;
+   params.ipicparm.transform_8x8_mode_flag = desc->transform_8x8_mode_flag;
+   params.ipicparm.chroma_qp_index_offset = desc->chroma_qp_index_offset;
+   params.ipicparm.second_chroma_qp_index_offset = desc->second_chroma_qp_index_offset;
+   params.ipicparm.pic_init_qp_minus26 = desc->pic_init_qp_minus26;
+   params.ipicparm.num_ref_idx_l0_active_minus1 = desc->num_ref_idx_l0_active_minus1;
+   params.ipicparm.num_ref_idx_l1_active_minus1 = desc->num_ref_idx_l1_active_minus1;
+   params.iseqparm.log2_max_frame_num_minus4 = desc->log2_max_frame_num_minus4;
+   params.iseqparm.pic_order_cnt_type = desc->pic_order_cnt_type;
+   params.iseqparm.log2_max_pic_order_cnt_lsb_minus4 = desc->log2_max_pic_order_cnt_lsb_minus4;
+   params.iseqparm.delta_pic_order_always_zero_flag = desc->delta_pic_order_always_zero_flag;
+   params.iseqparm.direct_8x8_inference_flag = desc->direct_8x8_inference_flag;
+   params.ipicparm.entropy_coding_mode_flag = desc->entropy_coding_mode_flag;
+   params.ipicparm.pic_order_present_flag = desc->pic_order_present_flag;
+   params.ipicparm.deblocking_filter_control_present_flag = desc->deblocking_filter_control_present_flag;
+   params.ipicparm.redundant_pic_cnt_present_flag = desc->redundant_pic_cnt_present_flag;
+
+   /* Bitstream BO layout: 0x000 iparm, 0x600 more_params, 0x700 slice
+    * data (only the first half of the BO is used, see the TODO below). */
+   memcpy(dec->bitstream->map, &params, sizeof(params));
+   for (i = 0; i < num_buffers; i++) {
+      assert(total_bytes + num_bytes[i] < dec->bitstream->size / 2 - 0x700);
+      memcpy(dec->bitstream->map + 0x700 + total_bytes, data[i], num_bytes[i]);
+      total_bytes += num_bytes[i];
+   }
+   memcpy(dec->bitstream->map + 0x700 + total_bytes, end, sizeof(end));
+   total_bytes += sizeof(end);
+   /* Total length of the uploaded bitstream. */
+   more_params[1] = total_bytes;
+   memcpy(dec->bitstream->map + 0x600, more_params, sizeof(more_params));
+
+   PUSH_SPACE(push, 5 + 21 + 3 + 2 + 4 + 2);
+   nouveau_pushbuf_refn(push, bo_refs, sizeof(bo_refs)/sizeof(bo_refs[0]));
+
+   /* Wait for the fence = 1 */
+   BEGIN_NV04(push, SUBC_BSP(0x10), 4);
+   PUSH_DATAh(push, dec->fence->offset);
+   PUSH_DATA (push, dec->fence->offset);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 1);
+
+   /* TODO: Use both halves of bitstream/vpring for alternating frames */
+
+   /* Kick off the BSP */
+   BEGIN_NV04(push, SUBC_BSP(0x400), 20);
+   PUSH_DATA (push, dec->bitstream->offset >> 8);
+   PUSH_DATA (push, (dec->bitstream->offset >> 8) + 7);
+   PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
+   PUSH_DATA (push, (dec->bitstream->offset >> 8) + 6);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, dec->mbring->offset >> 8);
+   PUSH_DATA (push, dec->frame_size);
+   PUSH_DATA (push, (dec->mbring->offset + dec->frame_size) >> 8);
+   PUSH_DATA (push, dec->vpring->offset >> 8);
+   PUSH_DATA (push, dec->vpring->size / 2);
+   PUSH_DATA (push, dec->vpring_residual);
+   PUSH_DATA (push, dec->vpring_ctrl);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, dec->vpring_residual);
+   PUSH_DATA (push, dec->vpring_residual + dec->vpring_ctrl);
+   PUSH_DATA (push, dec->vpring_deblock);
+   PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
+                     dec->vpring_residual + dec->vpring_deblock) >> 8);
+   PUSH_DATA (push, 0x654321);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0x100008);
+
+   BEGIN_NV04(push, SUBC_BSP(0x620), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NV04(push, SUBC_BSP(0x300), 1);
+   PUSH_DATA (push, 0);
+
+   /* Write fence = 2, intr */
+   BEGIN_NV04(push, SUBC_BSP(0x610), 3);
+   PUSH_DATAh(push, dec->fence->offset);
+   PUSH_DATA (push, dec->fence->offset);
+   PUSH_DATA (push, 2);
+
+   BEGIN_NV04(push, SUBC_BSP(0x304), 1);
+   PUSH_DATA (push, 0x101);
+   PUSH_KICK (push);
+   return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
new file mode 100644
index 00000000000..619aa4e7a40
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video_vp.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2013 Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv84_video.h"
+
+#include "util/u_sse.h"
+
+struct h264_iparm1 {
+ uint8_t scaling_lists_4x4[6][16]; // 00
+ uint8_t scaling_lists_8x8[2][64]; // 60
+ uint32_t width; // e0
+ uint32_t height; // e4
+ uint64_t ref1_addrs[16]; // e8
+ uint64_t ref2_addrs[16]; // 168
+ uint32_t unk1e8;
+ uint32_t unk1ec;
+ uint32_t w1; // 1f0
+ uint32_t w2; // 1f4
+ uint32_t w3; // 1f8
+ uint32_t h1; // 1fc
+ uint32_t h2; // 200
+ uint32_t h3; // 204
+ uint32_t mb_adaptive_frame_field_flag; // 208
+ uint32_t field_pic_flag; // 20c
+ uint32_t format; // 210
+ uint32_t unk214; // 214
+};
+
+struct h264_iparm2 {
+ uint32_t width; // 00
+ uint32_t height; // 04
+ uint32_t mbs; // 08
+ uint32_t w1; // 0c
+ uint32_t w2; // 10
+ uint32_t w3; // 14
+ uint32_t h1; // 18
+ uint32_t h2; // 1c
+ uint32_t h3; // 20
+ uint32_t unk24;
+ uint32_t mb_adaptive_frame_field_flag; // 28
+ uint32_t top; // 2c
+ uint32_t bottom; // 30
+ uint32_t is_reference; // 34
+};
+
+void
+nv84_decoder_vp_h264(struct nv84_decoder *dec,
+ struct pipe_h264_picture_desc *desc,
+ struct nv84_video_buffer *dest)
+{
+ struct h264_iparm1 param1;
+ struct h264_iparm2 param2;
+ int i, width = align(dest->base.width, 16),
+ height = align(dest->base.height, 16);
+
+ struct nouveau_pushbuf *push = dec->vp_pushbuf;
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { dest->full, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { dec->vpring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { dec->mbring, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { dec->vp_params, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
+ { dec->fence, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ };
+ int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
+ bool is_ref = desc->is_reference;
+
+ STATIC_ASSERT(sizeof(struct h264_iparm1) == 0x218);
+ STATIC_ASSERT(sizeof(struct h264_iparm2) == 0x38);
+
+ memset(&param1, 0, sizeof(param1));
+ memset(&param2, 0, sizeof(param2));
+
+ memcpy(&param1.scaling_lists_4x4, desc->scaling_lists_4x4,
+ sizeof(param1.scaling_lists_4x4));
+ memcpy(&param1.scaling_lists_8x8, desc->scaling_lists_8x8,
+ sizeof(param1.scaling_lists_8x8));
+
+ param1.width = width;
+ param1.w1 = param1.w2 = param1.w3 = align(width, 64);
+ param1.height = param1.h2 = height;
+ param1.h1 = param1.h3 = align(height, 32);
+ param1.format = 0x3231564e; /* 'NV12' */
+ param1.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
+ param1.field_pic_flag = desc->field_pic_flag;
+
+ param2.width = width;
+ param2.w1 = param2.w2 = param2.w3 = param1.w1;
+ if (desc->field_pic_flag)
+ param2.height = align(height, 32) / 2;
+ else
+ param2.height = height;
+ param2.h1 = param2.h2 = align(height, 32);
+ param2.h3 = height;
+ param2.mbs = width * height >> 8;
+ if (desc->field_pic_flag) {
+ param2.top = desc->bottom_field_flag ? 2 : 1;
+ param2.bottom = desc->bottom_field_flag;
+ }
+ param2.mb_adaptive_frame_field_flag = desc->mb_adaptive_frame_field_flag;
+ param2.is_reference = desc->is_reference;
+
+ PUSH_SPACE(push, 5 + 16 + 3 + 2 + 6 + (is_ref ? 2 : 0) + 3 + 2 + 4 + 2);
+
+ struct nouveau_bo *ref2_default = dest->full;
+
+ for (i = 0; i < 16; i++) {
+ struct nv84_video_buffer *buf = (struct nv84_video_buffer *)desc->ref[i];
+ struct nouveau_bo *bo1, *bo2;
+ if (buf) {
+ bo1 = buf->interlaced;
+ bo2 = buf->full;
+ if (i == 0)
+ ref2_default = buf->full;
+ } else {
+ bo1 = dest->interlaced;
+ bo2 = ref2_default;
+ }
+ param1.ref1_addrs[i] = bo1->offset;
+ param1.ref2_addrs[i] = bo2->offset;
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { bo1, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { bo2, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ };
+ nouveau_pushbuf_refn(push, bo_refs, sizeof(bo_refs)/sizeof(bo_refs[0]));
+ }
+
+ memcpy(dec->vp_params->map, &param1, sizeof(param1));
+ memcpy(dec->vp_params->map + 0x400, &param2, sizeof(param2));
+
+ nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+ /* Wait for BSP to have completed */
+ BEGIN_NV04(push, SUBC_VP(0x10), 4);
+ PUSH_DATAh(push, dec->fence->offset);
+ PUSH_DATA (push, dec->fence->offset);
+ PUSH_DATA (push, 2);
+ PUSH_DATA (push, 1); /* wait for sem == 2 */
+
+ /* VP step 1 */
+ BEGIN_NV04(push, SUBC_VP(0x400), 15);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, param2.mbs);
+ PUSH_DATA (push, 0x3987654); /* each nibble probably a dma index */
+ PUSH_DATA (push, 0x55001); /* constant */
+ PUSH_DATA (push, dec->vp_params->offset >> 8);
+ PUSH_DATA (push, (dec->vpring->offset + dec->vpring_residual) >> 8);
+ PUSH_DATA (push, dec->vpring_ctrl);
+ PUSH_DATA (push, dec->vpring->offset >> 8);
+ PUSH_DATA (push, dec->bitstream->size / 2 - 0x700);
+ PUSH_DATA (push, (dec->mbring->offset + dec->mbring->size - 0x2000) >> 8);
+ PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
+ dec->vpring_residual + dec->vpring_deblock) >> 8);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0x100008);
+ PUSH_DATA (push, dest->interlaced->offset >> 8);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, SUBC_VP(0x620), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, SUBC_VP(0x300), 1);
+ PUSH_DATA (push, 0);
+
+ /* VP step 2 */
+ BEGIN_NV04(push, SUBC_VP(0x400), 5);
+ PUSH_DATA (push, 0x54530201);
+ PUSH_DATA (push, (dec->vp_params->offset >> 8) + 0x4);
+ PUSH_DATA (push, (dec->vpring->offset + dec->vpring_ctrl +
+ dec->vpring_residual) >> 8);
+ PUSH_DATA (push, dest->interlaced->offset >> 8);
+ PUSH_DATA (push, dest->interlaced->offset >> 8);
+
+ if (is_ref) {
+ BEGIN_NV04(push, SUBC_VP(0x414), 1);
+ PUSH_DATA (push, dest->full->offset >> 8);
+ }
+
+ BEGIN_NV04(push, SUBC_VP(0x620), 2);
+ PUSH_DATAh(push, dec->vp_fw2_offset);
+ PUSH_DATA (push, dec->vp_fw2_offset);
+
+ BEGIN_NV04(push, SUBC_VP(0x300), 1);
+ PUSH_DATA (push, 0);
+
+ /* Set the semaphore back to 1 */
+ BEGIN_NV04(push, SUBC_VP(0x610), 3);
+ PUSH_DATAh(push, dec->fence->offset);
+ PUSH_DATA (push, dec->fence->offset);
+ PUSH_DATA (push, 1);
+
+ /* Write to the semaphore location, intr */
+ BEGIN_NV04(push, SUBC_VP(0x304), 1);
+ PUSH_DATA (push, 0x101);
+
+ for (i = 0; i < 2; i++) {
+ struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ }
+
+ PUSH_KICK (push);
+}
+
+static INLINE int16_t inverse_quantize(int16_t val, uint8_t quant, int mpeg1) {
+ int16_t ret = val * quant / 16;
+ if (mpeg1 && ret) {
+ if (ret > 0)
+ ret = (ret - 1) | 1;
+ else
+ ret = (ret + 1) | 1;
+ }
+ if (ret < -2048)
+ ret = -2048;
+ else if (ret > 2047)
+ ret = 2047;
+ return ret;
+}
+
+struct mpeg12_mb_info {
+ uint32_t index;
+ uint8_t unk4;
+ uint8_t unk5;
+ uint16_t coded_block_pattern;
+ uint8_t block_counts[6];
+ uint16_t PMV[8];
+ uint16_t skipped;
+};
+
+void
+nv84_decoder_vp_mpeg12_mb(struct nv84_decoder *dec,
+ struct pipe_mpeg12_picture_desc *desc,
+ const struct pipe_mpeg12_macroblock *macrob)
+{
+ STATIC_ASSERT(sizeof(struct mpeg12_mb_info) == 32);
+
+ struct mpeg12_mb_info info = {0};
+ int i, sum = 0, mask, block_index, count;
+ const int16_t *blocks;
+ int intra = macrob->macroblock_type & PIPE_MPEG12_MB_TYPE_INTRA;
+ int motion = macrob->macroblock_type &
+ (PIPE_MPEG12_MB_TYPE_MOTION_FORWARD | PIPE_MPEG12_MB_TYPE_MOTION_BACKWARD);
+ const uint8_t *quant_matrix = intra ? dec->mpeg12_intra_matrix :
+ dec->mpeg12_non_intra_matrix;
+ int mpeg1 = dec->base.profile == PIPE_VIDEO_PROFILE_MPEG1;
+
+ info.index = macrob->y * mb(dec->base.width) + macrob->x;
+ info.unk4 = motion;
+ if (intra)
+ info.unk4 |= 1;
+ if (macrob->macroblock_modes.bits.dct_type)
+ info.unk4 |= 0x20;
+ info.unk5 = (macrob->motion_vertical_field_select << 4) |
+ (macrob->macroblock_modes.value & 0xf);
+ info.coded_block_pattern = macrob->coded_block_pattern;
+ if (motion) {
+ memcpy(info.PMV, macrob->PMV, sizeof(info.PMV));
+ }
+ blocks = macrob->blocks;
+ for (mask = 0x20, block_index = 0; mask > 0; mask >>= 1, block_index++) {
+ if ((macrob->coded_block_pattern & mask) == 0)
+ continue;
+
+ count = 0;
+
+ /*
+ * The observation here is that there are a lot of 0's, and things go
+ * a lot faster if one skips over them.
+ */
+
+#if defined(PIPE_ARCH_SSE) && defined(PIPE_ARCH_X86_64)
+/* Note that the SSE implementation is much more tuned to X86_64. As it's not
+ * benchmarked on X86_32, disable it there. I suspect that the code needs to
+ * be reorganized in terms of 32-bit wide data in order to be more
+ * efficient. NV84+ were released well into the 64-bit CPU era, so it should
+ * be a minority case.
+ */
+
+/* This returns a 16-bit bit-mask, each 2 bits are both 1 or both 0, depending
+ * on whether the corresponding (16-bit) word in blocks is zero or non-zero. */
+#define wordmask(blocks, zero) \
+ (uint64_t)(_mm_movemask_epi8( \
+ _mm_cmpeq_epi16( \
+ zero, _mm_load_si128((__m128i *)(blocks)))))
+
+ __m128i zero = _mm_setzero_si128();
+
+ /* TODO: Look into doing the inverse quantization in terms of SSE
+ * operations unconditionally, when necessary. */
+ uint64_t bmask0 = wordmask(blocks, zero);
+ bmask0 |= wordmask(blocks + 8, zero) << 16;
+ bmask0 |= wordmask(blocks + 16, zero) << 32;
+ bmask0 |= wordmask(blocks + 24, zero) << 48;
+ uint64_t bmask1 = wordmask(blocks + 32, zero);
+ bmask1 |= wordmask(blocks + 40, zero) << 16;
+ bmask1 |= wordmask(blocks + 48, zero) << 32;
+ bmask1 |= wordmask(blocks + 56, zero) << 48;
+
+ /* The wordmask macro returns the inverse of what we want, since it
+ * returns a 1 for equal-to-zero. Invert. */
+ bmask0 = ~bmask0;
+ bmask1 = ~bmask1;
+
+ /* Note that the bitmask is actually sequences of 2 bits for each block
+ * index. This is because there is no movemask_epi16. That means that
+ * (a) ffs will never return 64, since the prev bit will always be set
+ * in that case, and (b) we need to do an extra bit shift. Or'ing the
+ * bitmasks together is faster than having a loop that computes them one
+ * at a time and processes them, on a Core i7-920. Trying to put bmask
+ * into an array and then looping also slows things down.
+ */
+
+ /* shift needs to be the same width as i, and unsigned so that / 2
+ * becomes a rshift operation */
+ uint32_t shift;
+ i = 0;
+
+ if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ int16_t tmp;
+ while ((shift = __builtin_ffsll(bmask0))) {
+ i += (shift - 1) / 2;
+ bmask0 >>= shift - 1;
+ *dec->mpeg12_data++ = dec->zscan[i] * 2;
+ tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
+ *dec->mpeg12_data++ = tmp;
+ sum += tmp;
+ count++;
+ i++;
+ bmask0 >>= 2;
+ }
+ i = 32;
+ while ((shift = __builtin_ffsll(bmask1))) {
+ i += (shift - 1) / 2;
+ bmask1 >>= shift - 1;
+ *dec->mpeg12_data++ = dec->zscan[i] * 2;
+ tmp = inverse_quantize(blocks[i], quant_matrix[i], mpeg1);
+ *dec->mpeg12_data++ = tmp;
+ sum += tmp;
+ count++;
+ i++;
+ bmask1 >>= 2;
+ }
+ } else {
+ while ((shift = __builtin_ffsll(bmask0))) {
+ i += (shift - 1) / 2;
+ bmask0 >>= shift - 1;
+ *dec->mpeg12_data++ = i * 2;
+ *dec->mpeg12_data++ = blocks[i];
+ count++;
+ i++;
+ bmask0 >>= 2;
+ }
+ i = 32;
+ while ((shift = __builtin_ffsll(bmask1))) {
+ i += (shift - 1) / 2;
+ bmask1 >>= shift - 1;
+ *dec->mpeg12_data++ = i * 2;
+ *dec->mpeg12_data++ = blocks[i];
+ count++;
+ i++;
+ bmask1 >>= 2;
+ }
+ }
+#undef wordmask
+#else
+
+ /*
+ * This loop looks ridiculously written... and it is. I tried a lot of
+ * different ways of achieving this scan, and this was the fastest, at
+ * least on a Core i7-920. Note that it's not necessary to skip the 0's,
+ * the firmware will deal with those just fine. But it's faster to skip
+ * them. Note to people trying benchmarks: make sure to use realistic
+ * mpeg data, which can often be a single data point first followed by
+ * 63 0's, or <data> 7x <0> <data> 7x <0> etc.
+ */
+ i = 0;
+ if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ while (true) {
+ int16_t tmp;
+ while (likely(i < 64 && !(tmp = blocks[i]))) i++;
+ if (i >= 64) break;
+ *dec->mpeg12_data++ = dec->zscan[i] * 2;
+ tmp = inverse_quantize(tmp, quant_matrix[i], mpeg1);
+ *dec->mpeg12_data++ = tmp;
+ sum += tmp;
+ count++;
+ i++;
+ }
+ } else {
+ while (true) {
+ int16_t tmp;
+ while (likely(i < 64 && !(tmp = blocks[i]))) i++;
+ if (i >= 64) break;
+ *dec->mpeg12_data++ = i * 2;
+ *dec->mpeg12_data++ = tmp;
+ count++;
+ i++;
+ }
+ }
+
+#endif
+
+ if (dec->base.entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ if (!mpeg1 && (sum & 1) == 0) {
+ if (count && *(dec->mpeg12_data - 2) == 63 * 2) {
+ uint16_t *val = dec->mpeg12_data - 1;
+ if (*val & 1) *val -= 1;
+ else *val += 1;
+ } else {
+ *dec->mpeg12_data++ = 63 * 2;
+ *dec->mpeg12_data++ = 1;
+ count++;
+ }
+ }
+ }
+
+ if (count) {
+ *(dec->mpeg12_data - 2) |= 1;
+ } else {
+ *dec->mpeg12_data++ = 1;
+ *dec->mpeg12_data++ = 0;
+ count = 1;
+ }
+ info.block_counts[block_index] = count;
+ blocks += 64;
+ }
+
+ memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
+ dec->mpeg12_mb_info += sizeof(info);
+
+ if (macrob->num_skipped_macroblocks) {
+ info.index++;
+ info.coded_block_pattern = 0;
+ info.skipped = macrob->num_skipped_macroblocks - 1;
+ memset(info.block_counts, 0, sizeof(info.block_counts));
+ memcpy(dec->mpeg12_mb_info, &info, sizeof(info));
+ dec->mpeg12_mb_info += sizeof(info);
+ }
+}
+
+struct mpeg12_header {
+ uint32_t luma_top_size; // 00
+ uint32_t luma_bottom_size; // 04
+ uint32_t chroma_top_size; // 08
+ uint32_t mbs; // 0c
+ uint32_t mb_info_size; // 10
+ uint32_t mb_width_minus1; // 14
+ uint32_t mb_height_minus1; // 18
+ uint32_t width; // 1c
+ uint32_t height; // 20
+ uint8_t progressive; // 24
+ uint8_t mocomp_only; // 25
+ uint8_t frames; // 26
+ uint8_t picture_structure; // 27
+ uint32_t unk28; // 28 -- 0x50100
+ uint32_t unk2c; // 2c
+ uint32_t pad[4 * 13];
+};
+
+void
+nv84_decoder_vp_mpeg12(struct nv84_decoder *dec,
+ struct pipe_mpeg12_picture_desc *desc,
+ struct nv84_video_buffer *dest)
+{
+ struct nouveau_pushbuf *push = dec->vp_pushbuf;
+ struct nv84_video_buffer *ref1 = (struct nv84_video_buffer *)desc->ref[0];
+ struct nv84_video_buffer *ref2 = (struct nv84_video_buffer *)desc->ref[1];
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { dest->interlaced, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { NULL, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ { dec->mpeg12_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_GART },
+ };
+ int i, num_refs = sizeof(bo_refs) / sizeof(*bo_refs);
+ struct mpeg12_header header = {0};
+ struct nv50_miptree *y = nv50_miptree(dest->resources[0]);
+ struct nv50_miptree *uv = nv50_miptree(dest->resources[1]);
+
+ STATIC_ASSERT(sizeof(struct mpeg12_header) == 0x100);
+
+ if (ref1 == NULL)
+ ref1 = dest;
+ if (ref2 == NULL)
+ ref2 = dest;
+ bo_refs[1].bo = ref1->interlaced;
+ bo_refs[2].bo = ref2->interlaced;
+
+ header.luma_top_size = y->layer_stride;
+ header.luma_bottom_size = y->layer_stride;
+ header.chroma_top_size = uv->layer_stride;
+ header.mbs = mb(dec->base.width) * mb(dec->base.height);
+ header.mb_info_size = dec->mpeg12_mb_info - dec->mpeg12_bo->map - 0x100;
+ header.mb_width_minus1 = mb(dec->base.width) - 1;
+ header.mb_height_minus1 = mb(dec->base.height) - 1;
+ header.width = align(dec->base.width, 16);
+ header.height = align(dec->base.height, 16);
+ header.progressive = desc->frame_pred_frame_dct;
+ header.frames = 1 + (desc->ref[0] != NULL) + (desc->ref[1] != NULL);
+ header.picture_structure = desc->picture_structure;
+ header.unk28 = 0x50100;
+
+ memcpy(dec->mpeg12_bo->map, &header, sizeof(header));
+
+ PUSH_SPACE(push, 10 + 3 + 2);
+
+ nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+ BEGIN_NV04(push, SUBC_VP(0x400), 9);
+ PUSH_DATA (push, 0x543210); /* each nibble possibly a dma index */
+ PUSH_DATA (push, 0x555001); /* constant */
+ PUSH_DATA (push, dec->mpeg12_bo->offset >> 8);
+ PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100) >> 8);
+ PUSH_DATA (push, (dec->mpeg12_bo->offset + 0x100 +
+ align(0x20 * mb(dec->base.width) *
+ mb(dec->base.height), 0x100)) >> 8);
+ PUSH_DATA (push, dest->interlaced->offset >> 8);
+ PUSH_DATA (push, ref1->interlaced->offset >> 8);
+ PUSH_DATA (push, ref2->interlaced->offset >> 8);
+ PUSH_DATA (push, 6 * 64 * 8 * header.mbs);
+
+ BEGIN_NV04(push, SUBC_VP(0x620), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NV04(push, SUBC_VP(0x300), 1);
+ PUSH_DATA (push, 0);
+
+ for (i = 0; i < 2; i++) {
+ struct nv50_miptree *mt = nv50_miptree(dest->resources[i]);
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ }
+ PUSH_KICK (push);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video.c b/src/gallium/drivers/nouveau/nv50/nv98_video.c
new file mode 100644
index 00000000000..069481de207
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst, Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv98_video.h"
+
+#include "util/u_sampler.h"
+#include "util/u_format.h"
+
+static void
+nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *video_target,
+ struct pipe_picture_desc *picture,
+ unsigned num_buffers,
+ const void *const *data,
+ const unsigned *num_bytes)
+{
+ struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+ struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
+ uint32_t comm_seq = ++dec->fence_seq;
+ union pipe_desc desc;
+
+ unsigned vp_caps, is_ref, ret;
+ struct nouveau_vp3_video_buffer *refs[16] = {};
+
+ desc.base = picture;
+
+ assert(target->base.buffer_format == PIPE_FORMAT_NV12);
+
+ ret = nv98_decoder_bsp(dec, desc, target, comm_seq,
+ num_buffers, data, num_bytes,
+ &vp_caps, &is_ref, refs);
+
+ /* did we decode bitstream correctly? */
+ assert(ret == 2);
+
+ nv98_decoder_vp(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
+ nv98_decoder_ppp(dec, desc, target, comm_seq);
+}
+
+struct pipe_video_codec *
+nv98_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ)
+{
+ struct nouveau_screen *screen = &((struct nv50_context *)context)->screen->base;
+ struct nouveau_vp3_decoder *dec;
+ struct nouveau_pushbuf **push;
+ struct nv04_fifo nv04_data = {.vram = 0xbeef0201, .gart = 0xbeef0202};
+ union nouveau_bo_config cfg;
+
+ cfg.nv50.tile_mode = 0x20;
+ cfg.nv50.memtype = 0x70;
+
+ int ret, i;
+ uint32_t codec = 1, ppp_codec = 3;
+ uint32_t timeout;
+ u32 tmp_size = 0;
+
+ if (getenv("XVMC_VL"))
+ return vl_create_decoder(context, templ);
+
+ if (templ->entrypoint != PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ debug_printf("%x\n", templ->entrypoint);
+ return NULL;
+ }
+
+ dec = CALLOC_STRUCT(nouveau_vp3_decoder);
+ if (!dec)
+ return NULL;
+ dec->client = screen->client;
+ dec->base = *templ;
+ nouveau_vp3_decoder_init_common(&dec->base);
+
+ dec->bsp_idx = 5;
+ dec->vp_idx = 6;
+ dec->ppp_idx = 7;
+
+ ret = nouveau_object_new(&screen->device->object, 0,
+ NOUVEAU_FIFO_CHANNEL_CLASS,
+ &nv04_data, sizeof(nv04_data), &dec->channel[0]);
+
+ if (!ret)
+ ret = nouveau_pushbuf_new(screen->client, dec->channel[0], 4,
+ 32 * 1024, true, &dec->pushbuf[0]);
+
+ for (i = 1; i < 3; ++i) {
+ dec->channel[i] = dec->channel[0];
+ dec->pushbuf[i] = dec->pushbuf[0];
+ }
+ push = dec->pushbuf;
+
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x85b1, NULL, 0, &dec->bsp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x85b2, NULL, 0, &dec->vp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x85b3, NULL, 0, &dec->ppp);
+ if (ret)
+ goto fail;
+
+ BEGIN_NV04(push[0], SUBC_BSP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[0], dec->bsp->handle);
+
+ BEGIN_NV04(push[0], SUBC_BSP(0x180), 5);
+ for (i = 0; i < 5; i++)
+ PUSH_DATA (push[0], nv04_data.vram);
+
+ BEGIN_NV04(push[1], SUBC_VP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[1], dec->vp->handle);
+
+ BEGIN_NV04(push[1], SUBC_VP(0x180), 6);
+ for (i = 0; i < 6; i++)
+ PUSH_DATA (push[1], nv04_data.vram);
+
+ BEGIN_NV04(push[2], SUBC_PPP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[2], dec->ppp->handle);
+
+ BEGIN_NV04(push[2], SUBC_PPP(0x180), 5);
+ for (i = 0; i < 5; i++)
+ PUSH_DATA (push[2], nv04_data.vram);
+
+ dec->base.context = context;
+ dec->base.decode_bitstream = nv98_decoder_decode_bitstream;
+
+ for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0, 1 << 20, NULL, &dec->bsp_bo[i]);
+ if (!ret)
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0x100, 4 << 20, NULL, &dec->inter_bo[0]);
+ if (!ret)
+ nouveau_bo_ref(dec->inter_bo[0], &dec->inter_bo[1]);
+ if (ret)
+ goto fail;
+
+ switch (u_reduce_video_profile(templ->profile)) {
+ case PIPE_VIDEO_FORMAT_MPEG12: {
+ codec = 1;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_MPEG4: {
+ codec = 4;
+ tmp_size = mb(templ->height)*16 * mb(templ->width)*16;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_VC1: {
+ ppp_codec = codec = 2;
+ tmp_size = mb(templ->height)*16 * mb(templ->width)*16;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+ codec = 3;
+ dec->tmp_stride = 16 * mb_half(templ->width) * nouveau_vp3_video_align(templ->height) * 3 / 2;
+ tmp_size = dec->tmp_stride * (templ->max_references + 1);
+ assert(templ->max_references <= 16);
+ break;
+ }
+ default:
+ fprintf(stderr, "invalid codec\n");
+ goto fail;
+ }
+
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ 0x4000, NULL, &dec->fw_bo);
+ if (ret)
+ goto fail;
+
+ ret = nouveau_vp3_load_firmware(dec, templ->profile, screen->device->chipset);
+ if (ret)
+ goto fw_fail;
+
+ if (codec != 3) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ 0x400, NULL, &dec->bitplane_bo);
+ if (ret)
+ goto fail;
+ }
+
+ dec->ref_stride = mb(templ->width)*16 * (mb_half(templ->height)*32 + nouveau_vp3_video_align(templ->height)/2);
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ dec->ref_stride * (templ->max_references+2) + tmp_size,
+ &cfg, &dec->ref_bo);
+ if (ret)
+ goto fail;
+
+ timeout = 0;
+
+ BEGIN_NV04(push[0], SUBC_BSP(0x200), 2);
+ PUSH_DATA (push[0], codec);
+ PUSH_DATA (push[0], timeout);
+
+ BEGIN_NV04(push[1], SUBC_VP(0x200), 2);
+ PUSH_DATA (push[1], codec);
+ PUSH_DATA (push[1], timeout);
+
+ BEGIN_NV04(push[2], SUBC_PPP(0x200), 2);
+ PUSH_DATA (push[2], ppp_codec);
+ PUSH_DATA (push[2], timeout);
+
+ ++dec->fence_seq;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART|NOUVEAU_BO_MAP,
+ 0, 0x1000, NULL, &dec->fence_bo);
+ if (ret)
+ goto fail;
+
+ nouveau_bo_map(dec->fence_bo, NOUVEAU_BO_RDWR, screen->client);
+ dec->fence_map = dec->fence_bo->map;
+ dec->fence_map[0] = dec->fence_map[4] = dec->fence_map[8] = 0;
+ dec->comm = (struct comm *)(dec->fence_map + (COMM_OFFSET/sizeof(*dec->fence_map)));
+
+ /* So let's test whether the fence is working */
+ nouveau_pushbuf_space(push[0], 6, 1, 0);
+ PUSH_REFN (push[0], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NV04(push[0], SUBC_BSP(0x240), 3);
+ PUSH_DATAh(push[0], dec->fence_bo->offset);
+ PUSH_DATA (push[0], dec->fence_bo->offset);
+ PUSH_DATA (push[0], dec->fence_seq);
+
+ BEGIN_NV04(push[0], SUBC_BSP(0x304), 1);
+ PUSH_DATA (push[0], 0);
+ PUSH_KICK (push[0]);
+
+ nouveau_pushbuf_space(push[1], 6, 1, 0);
+ PUSH_REFN (push[1], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NV04(push[1], SUBC_VP(0x240), 3);
+ PUSH_DATAh(push[1], (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push[1], (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push[1], dec->fence_seq);
+
+ BEGIN_NV04(push[1], SUBC_VP(0x304), 1);
+ PUSH_DATA (push[1], 0);
+ PUSH_KICK (push[1]);
+
+ nouveau_pushbuf_space(push[2], 6, 1, 0);
+ PUSH_REFN (push[2], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NV04(push[2], SUBC_PPP(0x240), 3);
+ PUSH_DATAh(push[2], (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push[2], (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push[2], dec->fence_seq);
+
+ BEGIN_NV04(push[2], SUBC_PPP(0x304), 1);
+ PUSH_DATA (push[2], 0);
+ PUSH_KICK (push[2]);
+
+ usleep(100);
+ while (dec->fence_seq > dec->fence_map[0] ||
+ dec->fence_seq > dec->fence_map[4] ||
+ dec->fence_seq > dec->fence_map[8]) {
+ debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+ usleep(100);
+ }
+ debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+#endif
+
+ return &dec->base;
+
+fw_fail:
+ debug_printf("Cannot create decoder without firmware..\n");
+ dec->base.destroy(&dec->base);
+ return NULL;
+
+fail:
+ debug_printf("Creation failed: %s (%i)\n", strerror(-ret), ret);
+ dec->base.destroy(&dec->base);
+ return NULL;
+}
+
+struct pipe_video_buffer *
+nv98_video_buffer_create(struct pipe_context *pipe,
+ const struct pipe_video_buffer *templat)
+{
+ return nouveau_vp3_video_buffer_create(
+ pipe, templat, NV50_RESOURCE_FLAG_VIDEO);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video.h b/src/gallium/drivers/nouveau/nv50/nv98_video.h
new file mode 100644
index 00000000000..cec761df4ab
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst, Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_screen.h"
+#include "nouveau_vp3_video.h"
+
+#include "vl/vl_decoder.h"
+#include "vl/vl_types.h"
+
+#include "util/u_video.h"
+
+extern unsigned
+nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target,
+ unsigned comm_seq, unsigned num_buffers,
+ const void *const *data, const unsigned *num_bytes,
+ unsigned *vp_caps, unsigned *is_ref,
+ struct nouveau_vp3_video_buffer *refs[16]);
+
+extern void
+nv98_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+ unsigned caps, unsigned is_ref,
+ struct nouveau_vp3_video_buffer *refs[16]);
+
+extern void
+nv98_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target, unsigned comm_seq);
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
new file mode 100644
index 00000000000..97d4119b6d1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_bsp.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst, Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv98_video.h"
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+/* Debug-only helper (compiled under NOUVEAU_VP3_DEBUG_FENCE): dump the BSP
+ * engine's communication block — current sequence number, bitstream byte
+ * offset, and the status/position slot for the current index. */
+static void dump_comm_bsp(struct comm *comm)
+{
+ /* status[]/pos[] are rings of 16 entries, indexed by the low 4 bits
+  * of the running sequence counter. */
+ unsigned idx = comm->bsp_cur_index & 0xf;
+ debug_printf("Cur seq: %x, bsp byte ofs: %x\n", comm->bsp_cur_index, comm->byte_ofs);
+ debug_printf("Status: %08x, pos: %08x\n", comm->status[idx], comm->pos[idx]);
+}
+#endif
+
+/* Run the BSP (bitstream parser) stage of an nv98 decode.
+ *
+ * Uploads the compressed bitstream into the per-sequence BSP buffer,
+ * derives the caps words for this and the following VP stage
+ * (nouveau_vp3_vp_caps fills vp_caps/is_ref/refs for the caller), then
+ * emits the BSP command stream and kicks it.
+ *
+ * Returns the expected BSP status on success (2, or the real comm status
+ * in the fence-debug build) and -1 if mapping the bitstream BO fails.
+ * NOTE(review): the return type is unsigned, so -1 comes back as
+ * UINT_MAX — confirm callers test for that and not for "< 0". */
+unsigned
+nv98_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target,
+ unsigned comm_seq, unsigned num_buffers,
+ const void *const *data, const unsigned *num_bytes,
+ unsigned *vp_caps, unsigned *is_ref,
+ struct nouveau_vp3_video_buffer *refs[16])
+{
+ struct nouveau_pushbuf *push = dec->pushbuf[0];
+ enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+ uint32_t bsp_addr, comm_addr, inter_addr;
+ uint32_t slice_size, bucket_size, ring_size;
+ uint32_t caps;
+ int ret;
+ /* Bitstream BOs cycle through a small queue keyed on the sequence
+  * number; inter (BSP->VP handoff) BOs are double-buffered on parity. */
+ struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+ struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+ unsigned fence_extra = 0;
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+ { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+ { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+ { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
+ };
+ int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
+
+ /* The bitplane BO is deliberately the last bo_refs entry so it can be
+  * dropped from the list simply by shrinking the count. */
+ if (!dec->bitplane_bo)
+ num_refs--;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ fence_extra = 4;
+#endif
+
+ ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
+ if (ret) {
+ debug_printf("map failed: %i %s\n", ret, strerror(-ret));
+ return -1;
+ }
+
+ /* Copy the bitstream data into bsp_bo and build the BSP caps word. */
+ caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
+ num_buffers, data, num_bytes);
+
+ /* Prepare the VP-stage parameters for the caller. */
+ nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
+
+ nouveau_pushbuf_space(push, 6 + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC ? 9 : 8) + fence_extra + 2, num_refs, 0);
+ nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+ /* Engine addresses are programmed in 256-byte units. */
+ bsp_addr = bsp_bo->offset >> 8;
+ inter_addr = inter_bo->offset >> 8;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ /* In the debug build the comm block lives in the (CPU-visible) fence
+  * BO so it can be polled and dumped from the host. */
+ memset(dec->comm, 0, 0x200);
+ comm_addr = (dec->fence_bo->offset + COMM_OFFSET) >> 8;
+#else
+ comm_addr = bsp_addr + (COMM_OFFSET>>8);
+#endif
+
+ BEGIN_NV04(push, SUBC_BSP(0x700), 5);
+ PUSH_DATA (push, caps); // 700 cmd
+ PUSH_DATA (push, bsp_addr + 1); // 704 strparm_bsp
+ PUSH_DATA (push, bsp_addr + 7); // 708 str addr
+ PUSH_DATA (push, comm_addr); // 70c comm
+ PUSH_DATA (push, comm_seq); // 710 seq
+
+ if (codec != PIPE_VIDEO_FORMAT_MPEG4_AVC) {
+ u32 bitplane_addr;
+ int mpeg12 = (codec == PIPE_VIDEO_FORMAT_MPEG12);
+
+ bitplane_addr = dec->bitplane_bo->offset >> 8;
+
+ /* Non-H.264 codecs use a single slice; MPEG1/2 additionally skip
+  * the VC-1 bitplane registers (410/414). */
+ nouveau_vp3_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);
+ BEGIN_NV04(push, SUBC_BSP(0x400), mpeg12 ? 5 : 7);
+ PUSH_DATA (push, bsp_addr); // 400 picparm addr
+ PUSH_DATA (push, inter_addr); // 404 interparm addr
+ PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 408 interdata addr
+ PUSH_DATA (push, ring_size << 8); // 40c interdata_size
+ if (!mpeg12) {
+ PUSH_DATA (push, bitplane_addr); // 410 BITPLANE_DATA
+ PUSH_DATA (push, 0x400); // 414 BITPLANE_DATA_SIZE
+ }
+ PUSH_DATA (push, 0); // dma idx
+ } else {
+ nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
+ BEGIN_NV04(push, SUBC_BSP(0x400), 8);
+ PUSH_DATA (push, bsp_addr); // 400 picparm addr
+ PUSH_DATA (push, inter_addr); // 404 interparm addr
+ PUSH_DATA (push, slice_size << 8); // 408 interparm size?
+ PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 40c interdata addr
+ PUSH_DATA (push, ring_size << 8); // 410 interdata size
+ PUSH_DATA (push, inter_addr + slice_size); // 414 bucket?
+ PUSH_DATA (push, bucket_size << 8); // 418 bucket size? unshifted..
+ PUSH_DATA (push, 0); // 41c targets
+ // TODO: Double check 414 / 418 with nvidia trace
+ }
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ /* Debug build: have the engine write a fence value, kick, then
+  * busy-wait on the host until the engine reaches this sequence,
+  * dumping the comm block periodically. */
+ BEGIN_NV04(push, SUBC_BSP(0x240), 3);
+ PUSH_DATAh(push, dec->fence_bo->offset);
+ PUSH_DATA (push, dec->fence_bo->offset);
+ PUSH_DATA (push, dec->fence_seq);
+
+ BEGIN_NV04(push, SUBC_BSP(0x300), 1);
+ PUSH_DATA (push, 1);
+ PUSH_KICK (push);
+
+ {
+ unsigned spin = 0;
+ do {
+ usleep(100);
+ if ((spin++ & 0xff) == 0xff) {
+ debug_printf("b%u: %u\n", dec->fence_seq, dec->fence_map[0]);
+ dump_comm_bsp(dec->comm);
+ }
+ } while (dec->fence_seq > dec->fence_map[0]);
+ }
+
+ dump_comm_bsp(dec->comm);
+ return dec->comm->status[comm_seq & 0xf];
+#else
+ BEGIN_NV04(push, SUBC_BSP(0x300), 1);
+ PUSH_DATA (push, 0);
+ PUSH_KICK (push);
+ /* 2 is the expected "success" status the VP stage checks against. */
+ return 2;
+#endif
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c
new file mode 100644
index 00000000000..6b0b7148dcb
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_ppp.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst, Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv98_video.h"
+
+/* Common setup for the PPP (post-processing) stage: program input strides,
+ * macroblock dimensions, the four input plane addresses (luma/chroma for
+ * both fields), and the output NV12 plane addresses for both target
+ * resources. low700 is the codec-specific low bits of register 0x700. */
+static void
+nv98_decoder_setup_ppp(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target, uint32_t low700) {
+ struct nouveau_pushbuf *push = dec->pushbuf[2];
+
+ /* All widths/heights here are in macroblock (16-pixel) units. */
+ uint32_t stride_in = mb(dec->base.width);
+ uint32_t stride_out = mb(target->resources[0]->width0);
+ uint32_t dec_h = mb(dec->base.height);
+ uint32_t dec_w = mb(dec->base.width);
+ uint64_t in_addr;
+ uint32_t y2, cbcr, cbcr2, i;
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+ { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+ { dec->ref_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+ { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+ };
+ unsigned num_refs = sizeof(bo_refs)/sizeof(*bo_refs);
+
+ /* Fill in the two output-plane BOs from the target video buffer. */
+ for (i = 0; i < 2; ++i) {
+ struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];
+ bo_refs[i].bo = mt->base.bo;
+ }
+
+ nouveau_pushbuf_refn(push, bo_refs, num_refs);
+ /* Offsets of the second luma field and the two chroma fields within
+  * the decoded picture in ref_bo. */
+ nouveau_vp3_ycbcr_offsets(dec, &y2, &cbcr, &cbcr2);
+
+ BEGIN_NV04(push, SUBC_PPP(0x700), 10);
+ in_addr = nouveau_vp3_video_addr(dec, target) >> 8;
+
+ PUSH_DATA (push, (stride_out << 24) | (stride_out << 16) | low700); // 700
+ PUSH_DATA (push, (stride_in << 24) | (stride_in << 16) | (dec_h << 8) | dec_w); // 704
+ assert(dec_w == stride_in);
+
+ /* Input: */
+ PUSH_DATA (push, in_addr); // 708
+ PUSH_DATA (push, in_addr + y2); // 70c
+ PUSH_DATA (push, in_addr + cbcr); // 710
+ PUSH_DATA (push, in_addr + cbcr2); // 714
+
+ /* Output: luma plane address, then chroma at the half-way point of
+  * each resource; mark both as GPU-written for later sync. */
+ for (i = 0; i < 2; ++i) {
+ struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];
+
+ PUSH_DATA (push, mt->base.address >> 8);
+ PUSH_DATA (push, (mt->base.address + mt->total_size/2) >> 8);
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ }
+}
+
+/* VC-1 specific PPP setup on top of the common path: programs the pquant
+ * value and returns the caps word for this stage (0x10 here as well).
+ * In-loop deblocking is not supported (asserted off). */
+static uint32_t
+nv98_decoder_vc1_ppp(struct nouveau_vp3_decoder *dec, struct pipe_vc1_picture_desc *desc, struct nouveau_vp3_video_buffer *target) {
+ struct nouveau_pushbuf *push = dec->pushbuf[2];
+
+ nv98_decoder_setup_ppp(dec, target, 0x1412);
+ assert(!desc->deblockEnable);
+ /* Dimensions must be macroblock (16-pixel) aligned. */
+ assert(!(dec->base.width & 0xf));
+ assert(!(dec->base.height & 0xf));
+
+ BEGIN_NV04(push, SUBC_PPP(0x400), 1);
+ PUSH_DATA (push, desc->pquant << 11);
+
+ // 728 = wtf?
+ return 0x10;
+}
+
+/* Run the PPP (post-processing / output copy) stage of an nv98 decode:
+ * dispatch to the per-codec setup, write the sequence number and caps,
+ * then kick the engine. In the fence-debug build this also busy-waits
+ * for the engine to complete. */
+void
+nv98_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, struct nouveau_vp3_video_buffer *target, unsigned comm_seq) {
+ enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+ struct nouveau_pushbuf *push = dec->pushbuf[2];
+ unsigned ppp_caps = 0x10;
+ unsigned fence_extra = 0;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ fence_extra = 4;
+#endif
+
+ nouveau_pushbuf_space(push, 11 + (codec == PIPE_VIDEO_FORMAT_VC1 ? 2 : 0) + 3 + fence_extra + 2, 4, 0);
+
+ /* The low bits of PPP register 0x700 select the codec mode; only VC-1
+  * needs extra per-codec state (and may override the caps word). */
+ switch (codec) {
+ case PIPE_VIDEO_FORMAT_MPEG12: {
+ unsigned mpeg2 = dec->base.profile != PIPE_VIDEO_PROFILE_MPEG1;
+ nv98_decoder_setup_ppp(dec, target, 0x1410 | mpeg2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_MPEG4: nv98_decoder_setup_ppp(dec, target, 0x1414); break;
+ case PIPE_VIDEO_FORMAT_VC1: ppp_caps = nv98_decoder_vc1_ppp(dec, desc.vc1, target); break;
+ case PIPE_VIDEO_FORMAT_MPEG4_AVC: nv98_decoder_setup_ppp(dec, target, 0x1413); break;
+ default: assert(0);
+ }
+ BEGIN_NV04(push, SUBC_PPP(0x734), 2);
+ PUSH_DATA (push, comm_seq);
+ PUSH_DATA (push, ppp_caps);
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ /* Debug build: PPP fence values land at fence_bo offset 0x20, i.e.
+  * fence_map[8]; spin until the engine reaches this sequence. */
+ BEGIN_NV04(push, SUBC_PPP(0x240), 3);
+ PUSH_DATAh(push, (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push, (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push, dec->fence_seq);
+
+ BEGIN_NV04(push, SUBC_PPP(0x300), 1);
+ PUSH_DATA (push, 1);
+ PUSH_KICK (push);
+
+ {
+ unsigned spin = 0;
+
+ do {
+ usleep(100);
+ if ((spin++ & 0xff) == 0xff)
+ debug_printf("p%u: %u\n", dec->fence_seq, dec->fence_map[8]);
+ } while (dec->fence_seq > dec->fence_map[8]);
+ }
+#else
+ BEGIN_NV04(push, SUBC_PPP(0x300), 1);
+ PUSH_DATA (push, 0);
+ PUSH_KICK (push);
+#endif
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
new file mode 100644
index 00000000000..9b756ea73f5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst, Ilia Mirkin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv98_video.h"
+#include <sys/mman.h>
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+/* Debug-only helper (compiled under NOUVEAU_VP3_DEBUG_FENCE): dump the VP
+ * engine's comm block state, and if the falcon appears stuck mid-stage,
+ * hexdump the BSP->VP intermediate buffer before asserting. */
+static void dump_comm_vp(struct nouveau_vp3_decoder *dec, struct comm *comm, u32 comm_seq,
+ struct nouveau_bo *inter_bo, unsigned slice_size)
+{
+ unsigned i, idx = comm->pvp_cur_index & 0xf;
+ debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
+#if 0
+ debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
+ debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
+
+ for (i = 0; i != comm->irq_index; ++i)
+ debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
+ for (i = 0; i != comm->parse_endpos_index; ++i)
+ debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
+#endif
+ debug_printf("mb_y = %u\n", comm->mb_y[idx]);
+ if (comm->status_vp[idx] == 1)
+ return;
+
+ /* pvp_stage low byte != 0xff means the falcon stopped mid-stage. */
+ if ((comm->pvp_stage & 0xff) != 0xff) {
+ unsigned *map;
+ /* NOTE(review): the nouveau_bo_map call has a side effect inside
+  * assert(); with NDEBUG the map never happens and the dereference
+  * below reads a stale/NULL map. Debug-only code, but fragile. */
+ assert(nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client) >= 0);
+ map = inter_bo->map;
+ for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
+ debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
+ }
+ munmap(inter_bo->map, inter_bo->size);
+ inter_bo->map = NULL;
+ }
+ assert((comm->pvp_stage & 0xff) == 0xff);
+}
+#endif
+
+/* Drop `target` from the decoder's reference-frame table so its slot can
+ * be reused (called when the just-decoded picture is not a reference). */
+static void
+nv98_decoder_kick_ref(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target)
+{
+ dec->refs[target->valid_ref].vidbuf = NULL;
+ dec->refs[target->valid_ref].last_used = 0;
+// debug_printf("Unreffed %p\n", target);
+}
+
+/* Run the VP (video processor) stage of an nv98 decode: resolve the
+ * reference-picture addresses, then program picture parameters, inter
+ * buffer layout, comm block, microcode and target/reference addresses,
+ * and kick the engine. `caps` and `is_ref`/`refs` come from the BSP
+ * stage (nouveau_vp3_vp_caps). */
+void
+nv98_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+ unsigned caps, unsigned is_ref,
+ struct nouveau_vp3_video_buffer *refs[16])
+{
+ struct nouveau_pushbuf *push = dec->pushbuf[1];
+ /* pic_addr[0..15] are reference pictures, pic_addr[16] is the target. */
+ uint32_t bsp_addr, comm_addr, inter_addr, ucode_addr, pic_addr[17], last_addr, null_addr;
+ uint32_t slice_size, bucket_size, ring_size, i;
+ enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
+ struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
+ struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
+ u32 fence_extra = 0, codec_extra = 0;
+ struct nouveau_pushbuf_refn bo_refs[] = {
+ { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+ { dec->ref_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
+ { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+#if NOUVEAU_VP3_DEBUG_FENCE
+ { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
+#endif
+ { dec->fw_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
+ };
+ /* fw_bo is the last entry, so it is dropped by shrinking the count. */
+ int num_refs = sizeof(bo_refs)/sizeof(*bo_refs) - !dec->fw_bo;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ fence_extra = 4;
+#endif
+
+ if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
+ nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
+ codec_extra += 2;
+ } else
+ nouveau_vp3_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);
+
+ if (dec->base.max_references > 2)
+ codec_extra += 1 + (dec->base.max_references - 2);
+
+ pic_addr[16] = nouveau_vp3_video_addr(dec, target) >> 8;
+ last_addr = null_addr = nouveau_vp3_video_addr(dec, NULL) >> 8;
+
+ /* Resolve each reference: a missing entry repeats the last valid
+  * address; a stale entry (buffer no longer in the ref table) falls
+  * back to the null picture. */
+ for (i = 0; i < dec->base.max_references; ++i) {
+ if (!refs[i])
+ pic_addr[i] = last_addr;
+ else if (dec->refs[refs[i]->valid_ref].vidbuf == refs[i])
+ last_addr = pic_addr[i] = nouveau_vp3_video_addr(dec, refs[i]) >> 8;
+ else
+ pic_addr[i] = null_addr;
+ }
+ if (!is_ref)
+ nv98_decoder_kick_ref(dec, target);
+
+ nouveau_pushbuf_space(push, 8 + 3 * (codec != PIPE_VIDEO_FORMAT_MPEG12) +
+ 6 + codec_extra + fence_extra + 2, num_refs, 0);
+
+ nouveau_pushbuf_refn(push, bo_refs, num_refs);
+
+ /* Engine addresses are programmed in 256-byte units. */
+ bsp_addr = bsp_bo->offset >> 8;
+#if NOUVEAU_VP3_DEBUG_FENCE
+ comm_addr = (dec->fence_bo->offset + COMM_OFFSET)>>8;
+#else
+ comm_addr = bsp_addr + (COMM_OFFSET>>8);
+#endif
+ inter_addr = inter_bo->offset >> 8;
+ if (dec->fw_bo)
+ ucode_addr = dec->fw_bo->offset >> 8;
+ else
+ ucode_addr = 0;
+
+ BEGIN_NV04(push, SUBC_VP(0x700), 7);
+ PUSH_DATA (push, caps); // 700
+ PUSH_DATA (push, comm_seq); // 704
+ PUSH_DATA (push, 0); // 708 fuc targets, ignored for nv98
+ PUSH_DATA (push, dec->fw_sizes); // 70c
+ PUSH_DATA (push, bsp_addr+(VP_OFFSET>>8)); // 710 picparm_addr
+ PUSH_DATA (push, inter_addr); // 714 inter_parm
+ PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 718 inter_data_ofs
+
+ if (bucket_size) {
+ uint64_t tmpimg_addr = dec->ref_bo->offset + dec->ref_stride * (dec->base.max_references+2);
+
+ BEGIN_NV04(push, SUBC_VP(0x71c), 2);
+ PUSH_DATA (push, tmpimg_addr >> 8); // 71c
+ PUSH_DATA (push, inter_addr + slice_size); // 720 bucket_ofs
+ }
+
+ /* NOTE(review): this burst writes sequential methods 724,728,72c,730,734,
+  * but the labels below read 734/72c/730 — either the labels or the data
+  * order looks off; confirm against a trace. */
+ BEGIN_NV04(push, SUBC_VP(0x724), 5);
+ PUSH_DATA (push, comm_addr); // 724
+ PUSH_DATA (push, ucode_addr); // 728
+ PUSH_DATA (push, pic_addr[16]); // 734
+ PUSH_DATA (push, pic_addr[0]); // 72c
+ PUSH_DATA (push, pic_addr[1]); // 730
+
+ if (dec->base.max_references > 2) {
+ int i;
+
+ /* Additional references beyond the first two go in 0x400..0x434. */
+ BEGIN_NV04(push, SUBC_VP(0x400), dec->base.max_references - 2);
+ for (i = 2; i < dec->base.max_references; ++i) {
+ assert(0x400 + (i - 2) * 4 < 0x438);
+ PUSH_DATA (push, pic_addr[i]);
+ }
+ }
+
+ if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
+ BEGIN_NV04(push, SUBC_VP(0x438), 1);
+ PUSH_DATA (push, desc.h264->slice_count);
+ }
+
+ //debug_printf("Decoding %08lx with %08lx and %08lx\n", pic_addr[16], pic_addr[0], pic_addr[1]);
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ /* Debug build: VP fence values land at fence_bo offset 0x10, i.e.
+  * fence_map[4]; spin until the engine reaches this sequence, dumping
+  * the comm block periodically. */
+ BEGIN_NV04(push, SUBC_VP(0x240), 3);
+ PUSH_DATAh(push, (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push, (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push, dec->fence_seq);
+
+ BEGIN_NV04(push, SUBC_VP(0x300), 1);
+ PUSH_DATA (push, 1);
+ PUSH_KICK(push);
+
+ {
+ unsigned spin = 0;
+ do {
+ usleep(100);
+ if ((spin++ & 0xff) == 0xff) {
+ debug_printf("v%u: %u\n", dec->fence_seq, dec->fence_map[4]);
+ dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
+ }
+ } while (dec->fence_seq > dec->fence_map[4]);
+ }
+ dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
+#else
+ BEGIN_NV04(push, SUBC_VP(0x300), 1);
+ PUSH_DATA (push, 0);
+ PUSH_KICK (push);
+#endif
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_2d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_2d.xml.h
new file mode 100644
index 00000000000..9a488c17be1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_2d.xml.h
@@ -0,0 +1,380 @@
+#ifndef NVC0_2D_XML
+#define NVC0_2D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nvc0_2d.xml ( 9454 bytes, from 2010-10-16 16:03:11)
+- copyright.xml ( 6498 bytes, from 2010-10-03 13:18:37)
+- nv_object.xml ( 11379 bytes, from 2010-10-16 11:43:24)
+- nvchipsets.xml ( 2907 bytes, from 2010-10-15 16:28:21)
+- nv_defs.xml ( 4437 bytes, from 2010-07-06 07:43:58)
+- nv50_defs.xml ( 4482 bytes, from 2010-10-03 13:18:37)
+
+Copyright (C) 2006-2010 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro, curro_, currojerez)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVC0_2D_DST_FORMAT 0x00000200
+
+#define NVC0_2D_DST_LINEAR 0x00000204
+
+#define NVC0_2D_DST_TILE_MODE 0x00000208
+
+#define NVC0_2D_DST_DEPTH 0x0000020c
+
+#define NVC0_2D_DST_LAYER 0x00000210
+
+#define NVC0_2D_DST_PITCH 0x00000214
+
+#define NVC0_2D_DST_WIDTH 0x00000218
+
+#define NVC0_2D_DST_HEIGHT 0x0000021c
+
+#define NVC0_2D_DST_ADDRESS_HIGH 0x00000220
+
+#define NVC0_2D_DST_ADDRESS_LOW 0x00000224
+
+#define NVC0_2D_UNK228 0x00000228
+
+#define NVC0_2D_SRC_FORMAT 0x00000230
+
+#define NVC0_2D_SRC_LINEAR 0x00000234
+
+#define NVC0_2D_SRC_TILE_MODE 0x00000238
+
+#define NVC0_2D_SRC_DEPTH 0x0000023c
+
+#define NVC0_2D_SRC_LAYER 0x00000240
+
+#define NVC0_2D_SRC_PITCH 0x00000244
+#define NVC0_2D_SRC_PITCH__MAX 0x00040000
+
+#define NVC0_2D_SRC_WIDTH 0x00000248
+#define NVC0_2D_SRC_WIDTH__MAX 0x00010000
+
+#define NVC0_2D_SRC_HEIGHT 0x0000024c
+#define NVC0_2D_SRC_HEIGHT__MAX 0x00010000
+
+#define NVC0_2D_SRC_ADDRESS_HIGH 0x00000250
+
+#define NVC0_2D_SRC_ADDRESS_LOW 0x00000254
+
+#define NVC0_2D_UNK258 0x00000258
+
+#define NVC0_2D_SINGLE_GPC 0x00000260
+
+#define NVC0_2D_COND_ADDRESS_HIGH 0x00000264
+
+#define NVC0_2D_COND_ADDRESS_LOW 0x00000268
+
+#define NVC0_2D_COND_MODE 0x0000026c
+#define NVC0_2D_COND_MODE_NEVER 0x00000000
+#define NVC0_2D_COND_MODE_ALWAYS 0x00000001
+#define NVC0_2D_COND_MODE_RES_NON_ZERO 0x00000002
+#define NVC0_2D_COND_MODE_EQUAL 0x00000003
+#define NVC0_2D_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NVC0_2D_CLIP_X 0x00000280
+
+#define NVC0_2D_CLIP_Y 0x00000284
+
+#define NVC0_2D_CLIP_W 0x00000288
+
+#define NVC0_2D_CLIP_H 0x0000028c
+
+#define NVC0_2D_CLIP_ENABLE 0x00000290
+
+#define NVC0_2D_COLOR_KEY_FORMAT 0x00000294
+#define NVC0_2D_COLOR_KEY_FORMAT_16BPP 0x00000000
+#define NVC0_2D_COLOR_KEY_FORMAT_15BPP 0x00000001
+#define NVC0_2D_COLOR_KEY_FORMAT_24BPP 0x00000002
+#define NVC0_2D_COLOR_KEY_FORMAT_30BPP 0x00000003
+#define NVC0_2D_COLOR_KEY_FORMAT_8BPP 0x00000004
+#define NVC0_2D_COLOR_KEY_FORMAT_16BPP2 0x00000005
+#define NVC0_2D_COLOR_KEY_FORMAT_32BPP 0x00000006
+
+#define NVC0_2D_COLOR_KEY 0x00000298
+
+#define NVC0_2D_COLOR_KEY_ENABLE 0x0000029c
+
+#define NVC0_2D_ROP 0x000002a0
+
+#define NVC0_2D_BETA1 0x000002a4
+
+#define NVC0_2D_BETA4 0x000002a8
+
+#define NVC0_2D_OPERATION 0x000002ac
+#define NVC0_2D_OPERATION_SRCCOPY_AND 0x00000000
+#define NVC0_2D_OPERATION_ROP_AND 0x00000001
+#define NVC0_2D_OPERATION_BLEND 0x00000002
+#define NVC0_2D_OPERATION_SRCCOPY 0x00000003
+#define NVC0_2D_OPERATION_ROP 0x00000004
+#define NVC0_2D_OPERATION_SRCCOPY_PREMULT 0x00000005
+#define NVC0_2D_OPERATION_BLEND_PREMULT 0x00000006
+
+#define NVC0_2D_UNK2B0 0x000002b0
+#define NVC0_2D_UNK2B0_UNK0__MASK 0x0000003f
+#define NVC0_2D_UNK2B0_UNK0__SHIFT 0
+#define NVC0_2D_UNK2B0_UNK1__MASK 0x00003f00
+#define NVC0_2D_UNK2B0_UNK1__SHIFT 8
+
+#define NVC0_2D_PATTERN_SELECT 0x000002b4
+#define NVC0_2D_PATTERN_SELECT_MONO_8X8 0x00000000
+#define NVC0_2D_PATTERN_SELECT_MONO_64X1 0x00000001
+#define NVC0_2D_PATTERN_SELECT_MONO_1X64 0x00000002
+#define NVC0_2D_PATTERN_SELECT_COLOR 0x00000003
+
+#define NVC0_2D_PATTERN_COLOR_FORMAT 0x000002e8
+#define NVC0_2D_PATTERN_COLOR_FORMAT_16BPP 0x00000000
+#define NVC0_2D_PATTERN_COLOR_FORMAT_15BPP 0x00000001
+#define NVC0_2D_PATTERN_COLOR_FORMAT_32BPP 0x00000002
+#define NVC0_2D_PATTERN_COLOR_FORMAT_8BPP 0x00000003
+#define NVC0_2D_PATTERN_COLOR_FORMAT_UNK4 0x00000004
+#define NVC0_2D_PATTERN_COLOR_FORMAT_UNK5 0x00000005
+
+#define NVC0_2D_PATTERN_MONO_FORMAT 0x000002ec
+#define NVC0_2D_PATTERN_MONO_FORMAT_CGA6 0x00000000
+#define NVC0_2D_PATTERN_MONO_FORMAT_LE 0x00000001
+
+#define NVC0_2D_PATTERN_COLOR(i0) (0x000002f0 + 0x4*(i0))
+#define NVC0_2D_PATTERN_COLOR__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_COLOR__LEN 0x00000002
+
+#define NVC0_2D_PATTERN_BITMAP(i0) (0x000002f8 + 0x4*(i0))
+#define NVC0_2D_PATTERN_BITMAP__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_BITMAP__LEN 0x00000002
+
+#define NVC0_2D_PATTERN_X8R8G8B8(i0) (0x00000300 + 0x4*(i0))
+#define NVC0_2D_PATTERN_X8R8G8B8__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_X8R8G8B8__LEN 0x00000040
+#define NVC0_2D_PATTERN_X8R8G8B8_B__MASK 0x000000ff
+#define NVC0_2D_PATTERN_X8R8G8B8_B__SHIFT 0
+#define NVC0_2D_PATTERN_X8R8G8B8_G__MASK 0x0000ff00
+#define NVC0_2D_PATTERN_X8R8G8B8_G__SHIFT 8
+#define NVC0_2D_PATTERN_X8R8G8B8_R__MASK 0x00ff0000
+#define NVC0_2D_PATTERN_X8R8G8B8_R__SHIFT 16
+
+#define NVC0_2D_PATTERN_R5G6B5(i0) (0x00000400 + 0x4*(i0))
+#define NVC0_2D_PATTERN_R5G6B5__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_R5G6B5__LEN 0x00000020
+#define NVC0_2D_PATTERN_R5G6B5_B0__MASK 0x0000001f
+#define NVC0_2D_PATTERN_R5G6B5_B0__SHIFT 0
+#define NVC0_2D_PATTERN_R5G6B5_G0__MASK 0x000007e0
+#define NVC0_2D_PATTERN_R5G6B5_G0__SHIFT 5
+#define NVC0_2D_PATTERN_R5G6B5_R0__MASK 0x0000f800
+#define NVC0_2D_PATTERN_R5G6B5_R0__SHIFT 11
+#define NVC0_2D_PATTERN_R5G6B5_B1__MASK 0x001f0000
+#define NVC0_2D_PATTERN_R5G6B5_B1__SHIFT 16
+#define NVC0_2D_PATTERN_R5G6B5_G1__MASK 0x07e00000
+#define NVC0_2D_PATTERN_R5G6B5_G1__SHIFT 21
+#define NVC0_2D_PATTERN_R5G6B5_R1__MASK 0xf8000000
+#define NVC0_2D_PATTERN_R5G6B5_R1__SHIFT 27
+
+#define NVC0_2D_PATTERN_X1R5G5B5(i0) (0x00000480 + 0x4*(i0))
+#define NVC0_2D_PATTERN_X1R5G5B5__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_X1R5G5B5__LEN 0x00000020
+#define NVC0_2D_PATTERN_X1R5G5B5_B0__MASK 0x0000001f
+#define NVC0_2D_PATTERN_X1R5G5B5_B0__SHIFT 0
+#define NVC0_2D_PATTERN_X1R5G5B5_G0__MASK 0x000003e0
+#define NVC0_2D_PATTERN_X1R5G5B5_G0__SHIFT 5
+#define NVC0_2D_PATTERN_X1R5G5B5_R0__MASK 0x00007c00
+#define NVC0_2D_PATTERN_X1R5G5B5_R0__SHIFT 10
+#define NVC0_2D_PATTERN_X1R5G5B5_B1__MASK 0x001f0000
+#define NVC0_2D_PATTERN_X1R5G5B5_B1__SHIFT 16
+#define NVC0_2D_PATTERN_X1R5G5B5_G1__MASK 0x03e00000
+#define NVC0_2D_PATTERN_X1R5G5B5_G1__SHIFT 21
+#define NVC0_2D_PATTERN_X1R5G5B5_R1__MASK 0x7c000000
+#define NVC0_2D_PATTERN_X1R5G5B5_R1__SHIFT 26
+
+#define NVC0_2D_PATTERN_Y8(i0) (0x00000500 + 0x4*(i0))
+#define NVC0_2D_PATTERN_Y8__ESIZE 0x00000004
+#define NVC0_2D_PATTERN_Y8__LEN 0x00000010
+#define NVC0_2D_PATTERN_Y8_Y0__MASK 0x000000ff
+#define NVC0_2D_PATTERN_Y8_Y0__SHIFT 0
+#define NVC0_2D_PATTERN_Y8_Y1__MASK 0x0000ff00
+#define NVC0_2D_PATTERN_Y8_Y1__SHIFT 8
+#define NVC0_2D_PATTERN_Y8_Y2__MASK 0x00ff0000
+#define NVC0_2D_PATTERN_Y8_Y2__SHIFT 16
+#define NVC0_2D_PATTERN_Y8_Y3__MASK 0xff000000
+#define NVC0_2D_PATTERN_Y8_Y3__SHIFT 24
+
+#define NVC0_2D_DRAW_SHAPE 0x00000580
+#define NVC0_2D_DRAW_SHAPE_POINTS 0x00000000
+#define NVC0_2D_DRAW_SHAPE_LINES 0x00000001
+#define NVC0_2D_DRAW_SHAPE_LINE_STRIP 0x00000002
+#define NVC0_2D_DRAW_SHAPE_TRIANGLES 0x00000003
+#define NVC0_2D_DRAW_SHAPE_RECTANGLES 0x00000004
+
+#define NVC0_2D_DRAW_COLOR_FORMAT 0x00000584
+
+#define NVC0_2D_DRAW_COLOR 0x00000588
+
+#define NVC0_2D_UNK58C 0x0000058c
+#define NVC0_2D_UNK58C_0 0x00000001
+#define NVC0_2D_UNK58C_1 0x00000010
+#define NVC0_2D_UNK58C_2 0x00000100
+#define NVC0_2D_UNK58C_3 0x00001000
+
+#define NVC0_2D_DRAW_POINT16 0x000005e0
+#define NVC0_2D_DRAW_POINT16_X__MASK 0x0000ffff
+#define NVC0_2D_DRAW_POINT16_X__SHIFT 0
+#define NVC0_2D_DRAW_POINT16_Y__MASK 0xffff0000
+#define NVC0_2D_DRAW_POINT16_Y__SHIFT 16
+
+#define NVC0_2D_DRAW_POINT32_X(i0) (0x00000600 + 0x8*(i0))
+#define NVC0_2D_DRAW_POINT32_X__ESIZE 0x00000008
+#define NVC0_2D_DRAW_POINT32_X__LEN 0x00000040
+
+#define NVC0_2D_DRAW_POINT32_Y(i0) (0x00000604 + 0x8*(i0))
+#define NVC0_2D_DRAW_POINT32_Y__ESIZE 0x00000008
+#define NVC0_2D_DRAW_POINT32_Y__LEN 0x00000040
+
+#define NVC0_2D_SIFC_BITMAP_ENABLE 0x00000800
+
+#define NVC0_2D_SIFC_FORMAT 0x00000804
+
+#define NVC0_2D_SIFC_BITMAP_FORMAT 0x00000808
+#define NVC0_2D_SIFC_BITMAP_FORMAT_I1 0x00000000
+#define NVC0_2D_SIFC_BITMAP_FORMAT_I4 0x00000001
+#define NVC0_2D_SIFC_BITMAP_FORMAT_I8 0x00000002
+
+#define NVC0_2D_SIFC_BITMAP_LSB_FIRST 0x0000080c
+
+#define NVC0_2D_SIFC_BITMAP_LINE_PACK_MODE 0x00000810
+#define NVC0_2D_SIFC_BITMAP_LINE_PACK_MODE_PACKED 0x00000000
+#define NVC0_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_BYTE 0x00000001
+#define NVC0_2D_SIFC_BITMAP_LINE_PACK_MODE_ALIGN_WORD 0x00000002
+
+#define NVC0_2D_SIFC_BITMAP_COLOR_BIT0 0x00000814
+
+#define NVC0_2D_SIFC_BITMAP_COLOR_BIT1 0x00000818
+
+#define NVC0_2D_SIFC_BITMAP_WRITE_BIT0_ENABLE 0x0000081c
+
+#define NVC0_2D_SIFC_WIDTH 0x00000838
+
+#define NVC0_2D_SIFC_HEIGHT 0x0000083c
+
+#define NVC0_2D_SIFC_DX_DU_FRACT 0x00000840
+
+#define NVC0_2D_SIFC_DX_DU_INT 0x00000844
+
+#define NVC0_2D_SIFC_DY_DV_FRACT 0x00000848
+
+#define NVC0_2D_SIFC_DY_DV_INT 0x0000084c
+
+#define NVC0_2D_SIFC_DST_X_FRACT 0x00000850
+
+#define NVC0_2D_SIFC_DST_X_INT 0x00000854
+
+#define NVC0_2D_SIFC_DST_Y_FRACT 0x00000858
+
+#define NVC0_2D_SIFC_DST_Y_INT 0x0000085c
+
+#define NVC0_2D_SIFC_DATA 0x00000860
+
+#define NVC0_2D_UNK0870 0x00000870
+
+#define NVC0_2D_UNK0880 0x00000880
+
+#define NVC0_2D_UNK0884 0x00000884
+
+#define NVC0_2D_UNK0888 0x00000888
+
+#define NVC0_2D_BLIT_CONTROL 0x0000088c
+#define NVC0_2D_BLIT_CONTROL_ORIGIN__MASK 0x00000001
+#define NVC0_2D_BLIT_CONTROL_ORIGIN__SHIFT 0
+#define NVC0_2D_BLIT_CONTROL_ORIGIN_CENTER 0x00000000
+#define NVC0_2D_BLIT_CONTROL_ORIGIN_CORNER 0x00000001
+#define NVC0_2D_BLIT_CONTROL_FILTER__MASK 0x00000010
+#define NVC0_2D_BLIT_CONTROL_FILTER__SHIFT 4
+#define NVC0_2D_BLIT_CONTROL_FILTER_POINT_SAMPLE 0x00000000
+#define NVC0_2D_BLIT_CONTROL_FILTER_BILINEAR 0x00000010
+
+#define NVC0_2D_BLIT_DST_X 0x000008b0
+
+#define NVC0_2D_BLIT_DST_Y 0x000008b4
+
+#define NVC0_2D_BLIT_DST_W 0x000008b8
+
+#define NVC0_2D_BLIT_DST_H 0x000008bc
+
+#define NVC0_2D_BLIT_DU_DX_FRACT 0x000008c0
+
+#define NVC0_2D_BLIT_DU_DX_INT 0x000008c4
+
+#define NVC0_2D_BLIT_DV_DY_FRACT 0x000008c8
+
+#define NVC0_2D_BLIT_DV_DY_INT 0x000008cc
+
+#define NVC0_2D_BLIT_SRC_X_FRACT 0x000008d0
+
+#define NVC0_2D_BLIT_SRC_X_INT 0x000008d4
+
+#define NVC0_2D_BLIT_SRC_Y_FRACT 0x000008d8
+
+#define NVC0_2D_BLIT_SRC_Y_INT 0x000008dc
+
+
+#endif /* NVC0_2D_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
new file mode 100644
index 00000000000..d3f719d333f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
@@ -0,0 +1,1350 @@
+#ifndef NVC0_3D_XML
+#define NVC0_3D_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nvc0_3d.xml ( 30827 bytes, from 2011-01-13 18:23:07)
+- copyright.xml ( 6452 bytes, from 2010-11-25 23:28:20)
+- nv_defs.xml ( 4437 bytes, from 2010-07-06 07:43:58)
+- nv_3ddefs.xml ( 16394 bytes, from 2010-12-17 15:10:40)
+- nv_object.xml ( 11898 bytes, from 2010-12-23 14:14:20)
+- nvchipsets.xml ( 3074 bytes, from 2010-11-07 00:36:28)
+- nv50_defs.xml ( 4487 bytes, from 2010-12-10 00:37:17)
+
+Copyright (C) 2006-2011 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVC0_3D_NOTIFY_ADDRESS_HIGH 0x00000104
+#define NVC0_3D_NOTIFY_ADDRESS_LOW 0x00000108
+#define NVC0_3D_NOTIFY 0x0000010c
+
+#define NVC0_3D_SERIALIZE 0x00000110
+
+#define NVC0_3D_LINE_WIDTH_SEPARATE 0x0000020c
+
+#define NVC0_3D_FORCE_EARLY_FRAGMENT_TESTS 0x00000210
+
+#define NVC0_3D_MEM_BARRIER 0x0000021c
+#define NVC0_3D_MEM_BARRIER_UNK0 0x00000001
+#define NVC0_3D_MEM_BARRIER_UNK1 0x00000002
+#define NVC0_3D_MEM_BARRIER_UNK2 0x00000004
+#define NVC0_3D_MEM_BARRIER_UNK4 0x00000010
+#define NVC0_3D_MEM_BARRIER_UNK8 0x00000100
+#define NVC0_3D_MEM_BARRIER_UNK12 0x00001000
+
+#define NVC0_3D_CACHE_SPLIT 0x00000308
+#define NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1 0x00000001
+#define NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1 0x00000002
+#define NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1 0x00000003
+
+#define NVC0_3D_TESS_MODE 0x00000320
+#define NVC0_3D_TESS_MODE_PRIM__MASK 0x0000000f
+#define NVC0_3D_TESS_MODE_PRIM__SHIFT 0
+#define NVC0_3D_TESS_MODE_PRIM_ISOLINES 0x00000000
+#define NVC0_3D_TESS_MODE_PRIM_TRIANGLES 0x00000001
+#define NVC0_3D_TESS_MODE_PRIM_QUADS 0x00000002
+#define NVC0_3D_TESS_MODE_SPACING__MASK 0x000000f0
+#define NVC0_3D_TESS_MODE_SPACING__SHIFT 4
+#define NVC0_3D_TESS_MODE_SPACING_EQUAL 0x00000000
+#define NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD 0x00000010
+#define NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN 0x00000020
+#define NVC0_3D_TESS_MODE_CW 0x00000100
+#define NVC0_3D_TESS_MODE_CONNECTED 0x00000200
+
+#define NVC0_3D_TESS_LEVEL_OUTER(i0) (0x00000324 + 0x4*(i0))
+#define NVC0_3D_TESS_LEVEL_OUTER__ESIZE 0x00000004
+#define NVC0_3D_TESS_LEVEL_OUTER__LEN 0x00000004
+
+#define NVC0_3D_TESS_LEVEL_INNER(i0) (0x00000334 + 0x4*(i0))
+#define NVC0_3D_TESS_LEVEL_INNER__ESIZE 0x00000004
+#define NVC0_3D_TESS_LEVEL_INNER__LEN 0x00000002
+
+#define NVC0_3D_RASTERIZE_ENABLE 0x0000037c
+
+#define NVC0_3D_TFB(i0) (0x00000380 + 0x20*(i0))
+#define NVC0_3D_TFB__ESIZE 0x00000020
+#define NVC0_3D_TFB__LEN 0x00000004
+
+#define NVC0_3D_TFB_BUFFER_ENABLE(i0) (0x00000380 + 0x20*(i0))
+
+#define NVC0_3D_TFB_ADDRESS_HIGH(i0) (0x00000384 + 0x20*(i0))
+
+#define NVC0_3D_TFB_ADDRESS_LOW(i0) (0x00000388 + 0x20*(i0))
+
+#define NVC0_3D_TFB_BUFFER_SIZE(i0) (0x0000038c + 0x20*(i0))
+
+#define NVC0_3D_TFB_BUFFER_OFFSET(i0) (0x00000390 + 0x20*(i0))
+
+#define NVC0_3D_TFB_STREAM(i0) (0x00000700 + 0x10*(i0))
+#define NVC0_3D_TFB_STREAM__ESIZE 0x00000010
+#define NVC0_3D_TFB_STREAM__LEN 0x00000004
+
+#define NVC0_3D_TFB_VARYING_COUNT(i0) (0x00000704 + 0x10*(i0))
+#define NVC0_3D_TFB_VARYING_COUNT__ESIZE 0x00000010
+#define NVC0_3D_TFB_VARYING_COUNT__LEN 0x00000004
+
+#define NVC0_3D_TFB_BUFFER_STRIDE(i0) (0x00000708 + 0x10*(i0))
+#define NVC0_3D_TFB_BUFFER_STRIDE__ESIZE 0x00000010
+#define NVC0_3D_TFB_BUFFER_STRIDE__LEN 0x00000004
+
+#define NVC0_3D_TFB_ENABLE 0x00000744
+
+#define NVC0_3D_SAMPLE_SHADING 0x00000754
+#define NVC0_3D_SAMPLE_SHADING_MIN_SAMPLES__MASK 0x0000000f
+#define NVC0_3D_SAMPLE_SHADING_MIN_SAMPLES__SHIFT 0
+#define NVC0_3D_SAMPLE_SHADING_ENABLE 0x00000010
+
+#define NVC0_3D_LOCAL_BASE 0x0000077c
+
+#define NVC0_3D_TEMP_ADDRESS_HIGH 0x00000790
+
+#define NVC0_3D_TEMP_ADDRESS_LOW 0x00000794
+
+#define NVC0_3D_TEMP_SIZE_HIGH 0x00000798
+
+#define NVC0_3D_TEMP_SIZE_LOW 0x0000079c
+
+#define NVC0_3D_WARP_TEMP_ALLOC 0x000007a0
+
+#define NVC0_3D_ZCULL_WIDTH 0x000007c0
+
+#define NVC0_3D_ZCULL_HEIGHT 0x000007c4
+
+#define NVC0_3D_ZCULL_ADDRESS_HIGH 0x000007e8
+
+#define NVC0_3D_ZCULL_ADDRESS_LOW 0x000007ec
+
+#define NVC0_3D_ZCULL_LIMIT_HIGH 0x000007f0
+
+#define NVC0_3D_ZCULL_LIMIT_LOW 0x000007f4
+
+#define NVC0_3D_RT(i0) (0x00000800 + 0x40*(i0))
+#define NVC0_3D_RT__ESIZE 0x00000040
+#define NVC0_3D_RT__LEN 0x00000008
+
+#define NVC0_3D_RT_ADDRESS_HIGH(i0) (0x00000800 + 0x40*(i0))
+
+#define NVC0_3D_RT_ADDRESS_LOW(i0) (0x00000804 + 0x40*(i0))
+
+#define NVC0_3D_RT_HORIZ(i0) (0x00000808 + 0x40*(i0))
+
+#define NVC0_3D_RT_VERT(i0) (0x0000080c + 0x40*(i0))
+
+#define NVC0_3D_RT_FORMAT(i0) (0x00000810 + 0x40*(i0))
+
+#define NVC0_3D_RT_TILE_MODE(i0) (0x00000814 + 0x40*(i0))
+#define NVC0_3D_RT_TILE_MODE_X 0x00000001
+#define NVC0_3D_RT_TILE_MODE_Y__MASK 0x00000070
+#define NVC0_3D_RT_TILE_MODE_Y__SHIFT 4
+#define NVC0_3D_RT_TILE_MODE_Z__MASK 0x00000700
+#define NVC0_3D_RT_TILE_MODE_Z__SHIFT 8
+#define NVC0_3D_RT_TILE_MODE_LINEAR 0x00001000
+#define NVC0_3D_RT_TILE_MODE_UNK16 0x00010000
+
+#define NVC0_3D_RT_ARRAY_MODE(i0) (0x00000818 + 0x40*(i0))
+#define NVC0_3D_RT_ARRAY_MODE_LAYERS__MASK 0x0000ffff
+#define NVC0_3D_RT_ARRAY_MODE_LAYERS__SHIFT 0
+#define NVC0_3D_RT_ARRAY_MODE_VOLUME 0x00010000
+
+#define NVC0_3D_RT_LAYER_STRIDE(i0) (0x0000081c + 0x40*(i0))
+
+#define NVC0_3D_RT_BASE_LAYER(i0) (0x00000820 + 0x40*(i0))
+
+#define NVC0_3D_RT_UNK14(i0) (0x00000824 + 0x40*(i0))
+
+#define NVC0_3D_VIEWPORT_SCALE_X(i0) (0x00000a00 + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_SCALE_X__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_SCALE_X__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_SCALE_Y(i0) (0x00000a04 + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_SCALE_Y__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_SCALE_Y__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_SCALE_Z(i0) (0x00000a08 + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_SCALE_Z__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_SCALE_Z__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_TRANSLATE_X(i0) (0x00000a0c + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_TRANSLATE_X__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_TRANSLATE_X__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_TRANSLATE_Y(i0) (0x00000a10 + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_TRANSLATE_Y__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_TRANSLATE_Y__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_TRANSLATE_Z(i0) (0x00000a14 + 0x20*(i0))
+#define NVC0_3D_VIEWPORT_TRANSLATE_Z__ESIZE 0x00000020
+#define NVC0_3D_VIEWPORT_TRANSLATE_Z__LEN 0x00000010
+
+#define NVC0_3D_VIEWPORT_HORIZ(i0) (0x00000c00 + 0x10*(i0))
+#define NVC0_3D_VIEWPORT_HORIZ__ESIZE 0x00000010
+#define NVC0_3D_VIEWPORT_HORIZ__LEN 0x00000010
+#define NVC0_3D_VIEWPORT_HORIZ_X__MASK 0x0000ffff
+#define NVC0_3D_VIEWPORT_HORIZ_X__SHIFT 0
+#define NVC0_3D_VIEWPORT_HORIZ_W__MASK 0xffff0000
+#define NVC0_3D_VIEWPORT_HORIZ_W__SHIFT 16
+
+#define NVC0_3D_VIEWPORT_VERT(i0) (0x00000c04 + 0x10*(i0))
+#define NVC0_3D_VIEWPORT_VERT__ESIZE 0x00000010
+#define NVC0_3D_VIEWPORT_VERT__LEN 0x00000010
+#define NVC0_3D_VIEWPORT_VERT_Y__MASK 0x0000ffff
+#define NVC0_3D_VIEWPORT_VERT_Y__SHIFT 0
+#define NVC0_3D_VIEWPORT_VERT_H__MASK 0xffff0000
+#define NVC0_3D_VIEWPORT_VERT_H__SHIFT 16
+
+#define NVC0_3D_DEPTH_RANGE_NEAR(i0) (0x00000c08 + 0x10*(i0))
+#define NVC0_3D_DEPTH_RANGE_NEAR__ESIZE 0x00000010
+#define NVC0_3D_DEPTH_RANGE_NEAR__LEN 0x00000010
+
+#define NVC0_3D_DEPTH_RANGE_FAR(i0) (0x00000c0c + 0x10*(i0))
+#define NVC0_3D_DEPTH_RANGE_FAR__ESIZE 0x00000010
+#define NVC0_3D_DEPTH_RANGE_FAR__LEN 0x00000010
+
+#define NVC0_3D_CLIP_RECT_HORIZ(i0) (0x00000d00 + 0x8*(i0))
+#define NVC0_3D_CLIP_RECT_HORIZ__ESIZE 0x00000008
+#define NVC0_3D_CLIP_RECT_HORIZ__LEN 0x00000008
+#define NVC0_3D_CLIP_RECT_HORIZ_MIN__MASK 0x0000ffff
+#define NVC0_3D_CLIP_RECT_HORIZ_MIN__SHIFT 0
+#define NVC0_3D_CLIP_RECT_HORIZ_MAX__MASK 0xffff0000
+#define NVC0_3D_CLIP_RECT_HORIZ_MAX__SHIFT 16
+
+#define NVC0_3D_CLIP_RECT_VERT(i0) (0x00000d04 + 0x8*(i0))
+#define NVC0_3D_CLIP_RECT_VERT__ESIZE 0x00000008
+#define NVC0_3D_CLIP_RECT_VERT__LEN 0x00000008
+#define NVC0_3D_CLIP_RECT_VERT_MIN__MASK 0x0000ffff
+#define NVC0_3D_CLIP_RECT_VERT_MIN__SHIFT 0
+#define NVC0_3D_CLIP_RECT_VERT_MAX__MASK 0xffff0000
+#define NVC0_3D_CLIP_RECT_VERT_MAX__SHIFT 16
+
+#define NVC0_3D_CLIPID_REGION_HORIZ(i0) (0x00000d40 + 0x8*(i0))
+#define NVC0_3D_CLIPID_REGION_HORIZ__ESIZE 0x00000008
+#define NVC0_3D_CLIPID_REGION_HORIZ__LEN 0x00000004
+#define NVC0_3D_CLIPID_REGION_HORIZ_X__MASK 0x0000ffff
+#define NVC0_3D_CLIPID_REGION_HORIZ_X__SHIFT 0
+#define NVC0_3D_CLIPID_REGION_HORIZ_W__MASK 0xffff0000
+#define NVC0_3D_CLIPID_REGION_HORIZ_W__SHIFT 16
+
+#define NVC0_3D_CLIPID_REGION_VERT(i0) (0x00000d44 + 0x8*(i0))
+#define NVC0_3D_CLIPID_REGION_VERT__ESIZE 0x00000008
+#define NVC0_3D_CLIPID_REGION_VERT__LEN 0x00000004
+#define NVC0_3D_CLIPID_REGION_VERT_Y__MASK 0x0000ffff
+#define NVC0_3D_CLIPID_REGION_VERT_Y__SHIFT 0
+#define NVC0_3D_CLIPID_REGION_VERT_H__MASK 0xffff0000
+#define NVC0_3D_CLIPID_REGION_VERT_H__SHIFT 16
+
+#define NVC0_3D_CALL_LIMIT_LOG 0x00000d64
+
+#define NVC0_3D_COUNTER_ENABLE 0x00000d68
+#define NVC0_3D_COUNTER_ENABLE_UNK00 0x00000001
+#define NVC0_3D_COUNTER_ENABLE_UNK01 0x00000002
+#define NVC0_3D_COUNTER_ENABLE_UNK02 0x00000004
+#define NVC0_3D_COUNTER_ENABLE_UNK03 0x00000008
+#define NVC0_3D_COUNTER_ENABLE_UNK04 0x00000010
+#define NVC0_3D_COUNTER_ENABLE_EMITTED_PRIMITIVES 0x00000020
+#define NVC0_3D_COUNTER_ENABLE_UNK06 0x00000040
+#define NVC0_3D_COUNTER_ENABLE_UNK07 0x00000080
+#define NVC0_3D_COUNTER_ENABLE_UNK08 0x00000100
+#define NVC0_3D_COUNTER_ENABLE_UNK09 0x00000200
+#define NVC0_3D_COUNTER_ENABLE_GENERATED_PRIMITIVES 0x00000400
+#define NVC0_3D_COUNTER_ENABLE_UNK0B 0x00000800
+#define NVC0_3D_COUNTER_ENABLE_UNK0C 0x00001000
+#define NVC0_3D_COUNTER_ENABLE_UNK0D 0x00002000
+#define NVC0_3D_COUNTER_ENABLE_UNK0E 0x00004000
+#define NVC0_3D_COUNTER_ENABLE_UNK0F 0x00008000
+
+#define NVC0_3D_VERTEX_BUFFER_FIRST 0x00000d74
+
+#define NVC0_3D_VERTEX_BUFFER_COUNT 0x00000d78
+
+#define NVC0_3D_CLEAR_COLOR(i0) (0x00000d80 + 0x4*(i0))
+#define NVC0_3D_CLEAR_COLOR__ESIZE 0x00000004
+#define NVC0_3D_CLEAR_COLOR__LEN 0x00000004
+
+#define NVC0_3D_CLEAR_DEPTH 0x00000d90
+
+#define NVC0_3D_CLEAR_STENCIL 0x00000da0
+
+#define NVC0_3D_POLYGON_SMOOTH_ENABLE 0x00000db4
+
+#define NVC0_3D_POLYGON_OFFSET_POINT_ENABLE 0x00000dc0
+
+#define NVC0_3D_POLYGON_OFFSET_LINE_ENABLE 0x00000dc4
+
+#define NVC0_3D_POLYGON_OFFSET_FILL_ENABLE 0x00000dc8
+
+#define NVC0_3D_PATCH_VERTICES 0x00000dcc
+
+#define NVC0_3D_WATCHDOG_TIMER 0x00000de4
+
+#define NVC0_3D_WINDOW_OFFSET_X 0x00000df8
+
+#define NVC0_3D_WINDOW_OFFSET_Y 0x00000dfc
+
+#define NVC0_3D_SCISSOR_ENABLE(i0) (0x00000e00 + 0x10*(i0))
+#define NVC0_3D_SCISSOR_ENABLE__ESIZE 0x00000010
+#define NVC0_3D_SCISSOR_ENABLE__LEN 0x00000010
+
+#define NVC0_3D_SCISSOR_HORIZ(i0) (0x00000e04 + 0x10*(i0))
+#define NVC0_3D_SCISSOR_HORIZ__ESIZE 0x00000010
+#define NVC0_3D_SCISSOR_HORIZ__LEN 0x00000010
+#define NVC0_3D_SCISSOR_HORIZ_MIN__MASK 0x0000ffff
+#define NVC0_3D_SCISSOR_HORIZ_MIN__SHIFT 0
+#define NVC0_3D_SCISSOR_HORIZ_MAX__MASK 0xffff0000
+#define NVC0_3D_SCISSOR_HORIZ_MAX__SHIFT 16
+
+#define NVC0_3D_SCISSOR_VERT(i0) (0x00000e08 + 0x10*(i0))
+#define NVC0_3D_SCISSOR_VERT__ESIZE 0x00000010
+#define NVC0_3D_SCISSOR_VERT__LEN 0x00000010
+#define NVC0_3D_SCISSOR_VERT_MIN__MASK 0x0000ffff
+#define NVC0_3D_SCISSOR_VERT_MIN__SHIFT 0
+#define NVC0_3D_SCISSOR_VERT_MAX__MASK 0xffff0000
+#define NVC0_3D_SCISSOR_VERT_MAX__SHIFT 16
+
+#define NVC0_3D_STENCIL_BACK_FUNC_REF 0x00000f54
+
+#define NVC0_3D_STENCIL_BACK_MASK 0x00000f58
+
+#define NVC0_3D_STENCIL_BACK_FUNC_MASK 0x00000f5c
+
+#define NVC0_3D_VERTEX_RUNOUT_ADDRESS_HIGH 0x00000f84
+
+#define NVC0_3D_VERTEX_RUNOUT_ADDRESS_LOW 0x00000f88
+
+#define NVC0_3D_COLOR_MASK_COMMON 0x00000f90
+
+#define NVC0_3D_DEPTH_BOUNDS(i0) (0x00000f9c + 0x4*(i0))
+#define NVC0_3D_DEPTH_BOUNDS__ESIZE 0x00000004
+#define NVC0_3D_DEPTH_BOUNDS__LEN 0x00000002
+
+#define NVC0_3D_RT_SEPARATE_FRAG_DATA 0x00000fac
+
+#define NVC0_3D_MSAA_MASK(i0) (0x00000fbc + 0x4*(i0))
+#define NVC0_3D_MSAA_MASK__ESIZE 0x00000004
+#define NVC0_3D_MSAA_MASK__LEN 0x00000004
+
+#define NVC0_3D_CLIPID_ADDRESS_HIGH 0x00000fcc
+
+#define NVC0_3D_CLIPID_ADDRESS_LOW 0x00000fd0
+
+#define NVC0_3D_ZETA_ADDRESS_HIGH 0x00000fe0
+
+#define NVC0_3D_ZETA_ADDRESS_LOW 0x00000fe4
+
+#define NVC0_3D_ZETA_FORMAT 0x00000fe8
+
+#define NVC0_3D_ZETA_TILE_MODE 0x00000fec
+
+#define NVC0_3D_ZETA_LAYER_STRIDE 0x00000ff0
+
+#define NVC0_3D_SCREEN_SCISSOR_HORIZ 0x00000ff4
+#define NVC0_3D_SCREEN_SCISSOR_HORIZ_W__MASK 0xffff0000
+#define NVC0_3D_SCREEN_SCISSOR_HORIZ_W__SHIFT 16
+#define NVC0_3D_SCREEN_SCISSOR_HORIZ_X__MASK 0x0000ffff
+#define NVC0_3D_SCREEN_SCISSOR_HORIZ_X__SHIFT 0
+
+#define NVC0_3D_SCREEN_SCISSOR_VERT 0x00000ff8
+#define NVC0_3D_SCREEN_SCISSOR_VERT_H__MASK 0xffff0000
+#define NVC0_3D_SCREEN_SCISSOR_VERT_H__SHIFT 16
+#define NVC0_3D_SCREEN_SCISSOR_VERT_Y__MASK 0x0000ffff
+#define NVC0_3D_SCREEN_SCISSOR_VERT_Y__SHIFT 0
+
+#define NVC0_3D_CLEAR_FLAGS 0x000010f8
+#define NVC0_3D_CLEAR_FLAGS_STENCIL_MASK 0x00000001
+#define NVC0_3D_CLEAR_FLAGS_UNK4 0x00000010
+#define NVC0_3D_CLEAR_FLAGS_SCISSOR 0x00000100
+#define NVC0_3D_CLEAR_FLAGS_VIEWPORT 0x00001000
+
+#define NVC0_3D_VERTEX_ID 0x00001118
+
+#define NVC0_3D_VTX_ATTR_DEFINE 0x0000114c
+#define NVC0_3D_VTX_ATTR_DEFINE_ATTR__MASK 0x000000ff
+#define NVC0_3D_VTX_ATTR_DEFINE_ATTR__SHIFT 0
+#define NVC0_3D_VTX_ATTR_DEFINE_COMP__MASK 0x00000700
+#define NVC0_3D_VTX_ATTR_DEFINE_COMP__SHIFT 8
+#define NVC0_3D_VTX_ATTR_DEFINE_COMP__MIN 0x00000001
+#define NVC0_3D_VTX_ATTR_DEFINE_COMP__MAX 0x00000004
+#define NVC0_3D_VTX_ATTR_DEFINE_SIZE__MASK 0x00007000
+#define NVC0_3D_VTX_ATTR_DEFINE_SIZE__SHIFT 12
+#define NVC0_3D_VTX_ATTR_DEFINE_SIZE_8 0x00001000
+#define NVC0_3D_VTX_ATTR_DEFINE_SIZE_16 0x00002000
+#define NVC0_3D_VTX_ATTR_DEFINE_SIZE_32 0x00004000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE__MASK 0x00070000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE__SHIFT 16
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_SNORM 0x00010000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_UNORM 0x00020000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_SINT 0x00030000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_UINT 0x00040000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_USCALED 0x00050000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_SSCALED 0x00060000
+#define NVC0_3D_VTX_ATTR_DEFINE_TYPE_FLOAT 0x00070000
+
+#define NVC0_3D_VTX_ATTR_DATA(i0) (0x00001150 + 0x4*(i0))
+#define NVC0_3D_VTX_ATTR_DATA__ESIZE 0x00000004
+#define NVC0_3D_VTX_ATTR_DATA__LEN 0x00000004
+
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT(i0) (0x00001160 + 0x4*(i0))
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT__ESIZE 0x00000004
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT__LEN 0x00000020
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__MASK 0x0000001f
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT 0
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST 0x00000040
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_OFFSET__MASK 0x001fff80
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_OFFSET__SHIFT 7
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE__MASK 0x07e00000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE__SHIFT 21
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32_32_32_32 0x00200000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32_32_32 0x00400000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_16_16_16_16 0x00600000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32_32 0x00800000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_16_16_16 0x00a00000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_8_8_8_8 0x01400000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_16_16 0x01e00000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 0x02400000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_8_8_8 0x02600000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_8_8 0x03000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_16 0x03600000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_8 0x03a00000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_10_10_10_2 0x06000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE__MASK 0x38000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE__SHIFT 27
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_SNORM 0x08000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_UNORM 0x10000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_SINT 0x18000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_UINT 0x20000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_USCALED 0x28000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_SSCALED 0x30000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT 0x38000000
+#define NVC0_3D_VERTEX_ATTRIB_FORMAT_BGRA 0x80000000
+
+#define NVC0_3D_RT_CONTROL 0x0000121c
+#define NVC0_3D_RT_CONTROL_COUNT__MASK 0x0000000f
+#define NVC0_3D_RT_CONTROL_COUNT__SHIFT 0
+#define NVC0_3D_RT_CONTROL_MAP0__MASK 0x00000070
+#define NVC0_3D_RT_CONTROL_MAP0__SHIFT 4
+#define NVC0_3D_RT_CONTROL_MAP1__MASK 0x00000380
+#define NVC0_3D_RT_CONTROL_MAP1__SHIFT 7
+#define NVC0_3D_RT_CONTROL_MAP2__MASK 0x00001c00
+#define NVC0_3D_RT_CONTROL_MAP2__SHIFT 10
+#define NVC0_3D_RT_CONTROL_MAP3__MASK 0x0000e000
+#define NVC0_3D_RT_CONTROL_MAP3__SHIFT 13
+#define NVC0_3D_RT_CONTROL_MAP4__MASK 0x00070000
+#define NVC0_3D_RT_CONTROL_MAP4__SHIFT 16
+#define NVC0_3D_RT_CONTROL_MAP5__MASK 0x00380000
+#define NVC0_3D_RT_CONTROL_MAP5__SHIFT 19
+#define NVC0_3D_RT_CONTROL_MAP6__MASK 0x01c00000
+#define NVC0_3D_RT_CONTROL_MAP6__SHIFT 22
+#define NVC0_3D_RT_CONTROL_MAP7__MASK 0x0e000000
+#define NVC0_3D_RT_CONTROL_MAP7__SHIFT 25
+
+#define NVC0_3D_ZETA_HORIZ 0x00001228
+
+#define NVC0_3D_ZETA_VERT 0x0000122c
+
+#define NVC0_3D_ZETA_ARRAY_MODE 0x00001230
+#define NVC0_3D_ZETA_ARRAY_MODE_LAYERS__MASK 0x0000ffff
+#define NVC0_3D_ZETA_ARRAY_MODE_LAYERS__SHIFT 0
+#define NVC0_3D_ZETA_ARRAY_MODE_UNK 0x00010000
+
+#define NVC0_3D_LINKED_TSC 0x00001234
+
+#define NVC0_3D_DRAW_TFB_BYTES 0x0000123c
+
+#define NVC0_3D_FP_RESULT_COUNT 0x00001298
+
+#define NVC0_3D_DEPTH_TEST_ENABLE 0x000012cc
+
+#define NVC0_3D_D3D_FILL_MODE 0x000012d0
+#define NVC0_3D_D3D_FILL_MODE_POINT 0x00000001
+#define NVC0_3D_D3D_FILL_MODE_WIREFRAME 0x00000002
+#define NVC0_3D_D3D_FILL_MODE_SOLID 0x00000003
+
+#define NVC0_3D_SHADE_MODEL 0x000012d4
+#define NVC0_3D_SHADE_MODEL_FLAT 0x00001d00
+#define NVC0_3D_SHADE_MODEL_SMOOTH 0x00001d01
+
+#define NVC0_3D_BLEND_INDEPENDENT 0x000012e4
+
+#define NVC0_3D_DEPTH_WRITE_ENABLE 0x000012e8
+
+#define NVC0_3D_ALPHA_TEST_ENABLE 0x000012ec
+
+#define NVC0_3D_VB_ELEMENT_U8_SETUP 0x00001300
+#define NVC0_3D_VB_ELEMENT_U8_SETUP_OFFSET__MASK 0xc0000000
+#define NVC0_3D_VB_ELEMENT_U8_SETUP_OFFSET__SHIFT 30
+#define NVC0_3D_VB_ELEMENT_U8_SETUP_COUNT__MASK 0x3fffffff
+#define NVC0_3D_VB_ELEMENT_U8_SETUP_COUNT__SHIFT 0
+
+#define NVC0_3D_VB_ELEMENT_U8 0x00001304
+#define NVC0_3D_VB_ELEMENT_U8_I0__MASK 0x000000ff
+#define NVC0_3D_VB_ELEMENT_U8_I0__SHIFT 0
+#define NVC0_3D_VB_ELEMENT_U8_I1__MASK 0x0000ff00
+#define NVC0_3D_VB_ELEMENT_U8_I1__SHIFT 8
+#define NVC0_3D_VB_ELEMENT_U8_I2__MASK 0x00ff0000
+#define NVC0_3D_VB_ELEMENT_U8_I2__SHIFT 16
+#define NVC0_3D_VB_ELEMENT_U8_I3__MASK 0xff000000
+#define NVC0_3D_VB_ELEMENT_U8_I3__SHIFT 24
+
+#define NVC0_3D_D3D_CULL_MODE 0x00001308
+#define NVC0_3D_D3D_CULL_MODE_NONE 0x00000001
+#define NVC0_3D_D3D_CULL_MODE_FRONT 0x00000002
+#define NVC0_3D_D3D_CULL_MODE_BACK 0x00000003
+
+#define NVC0_3D_DEPTH_TEST_FUNC 0x0000130c
+#define NVC0_3D_DEPTH_TEST_FUNC_NEVER 0x00000200
+#define NVC0_3D_DEPTH_TEST_FUNC_LESS 0x00000201
+#define NVC0_3D_DEPTH_TEST_FUNC_EQUAL 0x00000202
+#define NVC0_3D_DEPTH_TEST_FUNC_LEQUAL 0x00000203
+#define NVC0_3D_DEPTH_TEST_FUNC_GREATER 0x00000204
+#define NVC0_3D_DEPTH_TEST_FUNC_NOTEQUAL 0x00000205
+#define NVC0_3D_DEPTH_TEST_FUNC_GEQUAL 0x00000206
+#define NVC0_3D_DEPTH_TEST_FUNC_ALWAYS 0x00000207
+
+#define NVC0_3D_ALPHA_TEST_REF 0x00001310
+
+#define NVC0_3D_ALPHA_TEST_FUNC 0x00001314
+#define NVC0_3D_ALPHA_TEST_FUNC_NEVER 0x00000200
+#define NVC0_3D_ALPHA_TEST_FUNC_LESS 0x00000201
+#define NVC0_3D_ALPHA_TEST_FUNC_EQUAL 0x00000202
+#define NVC0_3D_ALPHA_TEST_FUNC_LEQUAL 0x00000203
+#define NVC0_3D_ALPHA_TEST_FUNC_GREATER 0x00000204
+#define NVC0_3D_ALPHA_TEST_FUNC_NOTEQUAL 0x00000205
+#define NVC0_3D_ALPHA_TEST_FUNC_GEQUAL 0x00000206
+#define NVC0_3D_ALPHA_TEST_FUNC_ALWAYS 0x00000207
+
+#define NVC0_3D_DRAW_TFB_STRIDE 0x00001318
+#define NVC0_3D_DRAW_TFB_STRIDE__MIN 0x00000001
+#define NVC0_3D_DRAW_TFB_STRIDE__MAX 0x00000fff
+
+#define NVC0_3D_BLEND_COLOR(i0) (0x0000131c + 0x4*(i0))
+#define NVC0_3D_BLEND_COLOR__ESIZE 0x00000004
+#define NVC0_3D_BLEND_COLOR__LEN 0x00000004
+
+#define NVC0_3D_TSC_FLUSH 0x00001330
+#define NVC0_3D_TSC_FLUSH_SPECIFIC 0x00000001
+#define NVC0_3D_TSC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVC0_3D_TSC_FLUSH_ENTRY__SHIFT 4
+
+#define NVC0_3D_TIC_FLUSH 0x00001334
+#define NVC0_3D_TIC_FLUSH_SPECIFIC 0x00000001
+#define NVC0_3D_TIC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVC0_3D_TIC_FLUSH_ENTRY__SHIFT 4
+
+#define NVC0_3D_TEX_CACHE_CTL 0x00001338
+#define NVC0_3D_TEX_CACHE_CTL_UNK1__MASK 0x00000030
+#define NVC0_3D_TEX_CACHE_CTL_UNK1__SHIFT 4
+
+#define NVC0_3D_BLEND_SEPARATE_ALPHA 0x0000133c
+
+#define NVC0_3D_BLEND_EQUATION_RGB 0x00001340
+#define NVC0_3D_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006
+#define NVC0_3D_BLEND_EQUATION_RGB_MIN 0x00008007
+#define NVC0_3D_BLEND_EQUATION_RGB_MAX 0x00008008
+#define NVC0_3D_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a
+#define NVC0_3D_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVC0_3D_BLEND_FUNC_SRC_RGB 0x00001344
+
+#define NVC0_3D_BLEND_FUNC_DST_RGB 0x00001348
+
+#define NVC0_3D_BLEND_EQUATION_ALPHA 0x0000134c
+#define NVC0_3D_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006
+#define NVC0_3D_BLEND_EQUATION_ALPHA_MIN 0x00008007
+#define NVC0_3D_BLEND_EQUATION_ALPHA_MAX 0x00008008
+#define NVC0_3D_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a
+#define NVC0_3D_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVC0_3D_BLEND_FUNC_SRC_ALPHA 0x00001350
+
+#define NVC0_3D_BLEND_FUNC_DST_ALPHA 0x00001358
+
+#define NVC0_3D_BLEND_ENABLE_COMMON 0x0000135c
+
+#define NVC0_3D_BLEND_ENABLE(i0) (0x00001360 + 0x4*(i0))
+#define NVC0_3D_BLEND_ENABLE__ESIZE 0x00000004
+#define NVC0_3D_BLEND_ENABLE__LEN 0x00000008
+
+#define NVC0_3D_STENCIL_ENABLE 0x00001380
+
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL 0x00001384
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL 0x00001388
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS 0x0000138c
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC 0x00001390
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206
+#define NVC0_3D_STENCIL_FRONT_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NVC0_3D_STENCIL_FRONT_FUNC_REF 0x00001394
+
+#define NVC0_3D_STENCIL_FRONT_FUNC_MASK 0x00001398
+
+#define NVC0_3D_STENCIL_FRONT_MASK 0x0000139c
+
+#define NVC0_3D_DRAW_TFB_BASE 0x000013a4
+
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN 0x000013a8
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_0 0x00000001
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_1 0x00000010
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_2 0x00000100
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_3 0x00001000
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_4 0x00010000
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_5 0x00100000
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_6 0x01000000
+#define NVC0_3D_FRAG_COLOR_CLAMP_EN_7 0x10000000
+
+#define NVC0_3D_SCREEN_Y_CONTROL 0x000013ac
+#define NVC0_3D_SCREEN_Y_CONTROL_Y_NEGATE 0x00000001
+#define NVC0_3D_SCREEN_Y_CONTROL_TRIANGLE_RAST_FLIP 0x00000010
+
+#define NVC0_3D_LINE_WIDTH_SMOOTH 0x000013b0
+
+#define NVC0_3D_LINE_WIDTH_ALIASED 0x000013b4
+
+#define NVC0_3D_GP_VERTEX_OUTPUT_COUNT 0x00001420
+#define NVC0_3D_GP_VERTEX_OUTPUT_COUNT__MIN 0x00000001
+#define NVC0_3D_GP_VERTEX_OUTPUT_COUNT__MAX 0x00000400
+
+#define NVC0_3D_VERTEX_ARRAY_FLUSH 0x0000142c
+
+#define NVC0_3D_VB_ELEMENT_BASE 0x00001434
+
+#define NVC0_3D_VB_INSTANCE_BASE 0x00001438
+
+#define NVC0_3D_CODE_CB_FLUSH 0x00001440
+
+#define NVC0_3D_CLIPID_HEIGHT 0x00001504
+#define NVC0_3D_CLIPID_HEIGHT__MAX 0x00002000
+
+#define NVC0_3D_CLIPID_FILL_RECT_HORIZ 0x00001508
+#define NVC0_3D_CLIPID_FILL_RECT_HORIZ_LOW__MASK 0x0000ffff
+#define NVC0_3D_CLIPID_FILL_RECT_HORIZ_LOW__SHIFT 0
+#define NVC0_3D_CLIPID_FILL_RECT_HORIZ_HIGH__MASK 0xffff0000
+#define NVC0_3D_CLIPID_FILL_RECT_HORIZ_HIGH__SHIFT 16
+
+#define NVC0_3D_CLIPID_FILL_RECT_VERT 0x0000150c
+#define NVC0_3D_CLIPID_FILL_RECT_VERT_LOW__MASK 0x0000ffff
+#define NVC0_3D_CLIPID_FILL_RECT_VERT_LOW__SHIFT 0
+#define NVC0_3D_CLIPID_FILL_RECT_VERT_HIGH__MASK 0xffff0000
+#define NVC0_3D_CLIPID_FILL_RECT_VERT_HIGH__SHIFT 16
+
+#define NVC0_3D_CLIP_DISTANCE_ENABLE 0x00001510
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_0 0x00000001
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_1 0x00000002
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_2 0x00000004
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_3 0x00000008
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_4 0x00000010
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_5 0x00000020
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_6 0x00000040
+#define NVC0_3D_CLIP_DISTANCE_ENABLE_7 0x00000080
+
+#define NVC0_3D_SAMPLECNT_ENABLE 0x00001514
+
+#define NVC0_3D_POINT_SIZE 0x00001518
+
+#define NVC0_3D_ZCULL_STATCTRS_ENABLE 0x0000151c
+
+#define NVC0_3D_POINT_SPRITE_ENABLE 0x00001520
+
+#define NVC0_3D_COUNTER_RESET 0x00001530
+#define NVC0_3D_COUNTER_RESET_SAMPLECNT 0x00000001
+#define NVC0_3D_COUNTER_RESET_UNK02 0x00000002
+#define NVC0_3D_COUNTER_RESET_UNK03 0x00000003
+#define NVC0_3D_COUNTER_RESET_UNK04 0x00000004
+#define NVC0_3D_COUNTER_RESET_EMITTED_PRIMITIVES 0x00000010
+#define NVC0_3D_COUNTER_RESET_UNK11 0x00000011
+#define NVC0_3D_COUNTER_RESET_UNK12 0x00000012
+#define NVC0_3D_COUNTER_RESET_UNK13 0x00000013
+#define NVC0_3D_COUNTER_RESET_UNK15 0x00000015
+#define NVC0_3D_COUNTER_RESET_UNK16 0x00000016
+#define NVC0_3D_COUNTER_RESET_UNK17 0x00000017
+#define NVC0_3D_COUNTER_RESET_UNK18 0x00000018
+#define NVC0_3D_COUNTER_RESET_UNK1A 0x0000001a
+#define NVC0_3D_COUNTER_RESET_UNK1B 0x0000001b
+#define NVC0_3D_COUNTER_RESET_UNK1C 0x0000001c
+#define NVC0_3D_COUNTER_RESET_UNK1D 0x0000001d
+#define NVC0_3D_COUNTER_RESET_UNK1E 0x0000001e
+#define NVC0_3D_COUNTER_RESET_GENERATED_PRIMITIVES 0x0000001f
+
+#define NVC0_3D_MULTISAMPLE_ENABLE 0x00001534
+
+#define NVC0_3D_ZETA_ENABLE 0x00001538
+
+#define NVC0_3D_MULTISAMPLE_CTRL 0x0000153c
+#define NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE 0x00000001
+#define NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE 0x00000010
+
+#define NVC0_3D_COND_ADDRESS_HIGH 0x00001550
+
+#define NVC0_3D_COND_ADDRESS_LOW 0x00001554
+
+#define NVC0_3D_COND_MODE 0x00001558
+#define NVC0_3D_COND_MODE_NEVER 0x00000000
+#define NVC0_3D_COND_MODE_ALWAYS 0x00000001
+#define NVC0_3D_COND_MODE_RES_NON_ZERO 0x00000002
+#define NVC0_3D_COND_MODE_EQUAL 0x00000003
+#define NVC0_3D_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NVC0_3D_TSC_ADDRESS_HIGH 0x0000155c
+
+#define NVC0_3D_TSC_ADDRESS_LOW 0x00001560
+#define NVC0_3D_TSC_ADDRESS_LOW__ALIGN 0x00000020
+
+#define NVC0_3D_TSC_LIMIT 0x00001564
+#define NVC0_3D_TSC_LIMIT__MAX 0x00001fff
+
+#define NVC0_3D_POLYGON_OFFSET_FACTOR 0x0000156c
+
+#define NVC0_3D_LINE_SMOOTH_ENABLE 0x00001570
+
+#define NVC0_3D_TIC_ADDRESS_HIGH 0x00001574
+
+#define NVC0_3D_TIC_ADDRESS_LOW 0x00001578
+
+#define NVC0_3D_TIC_LIMIT 0x0000157c
+
+#define NVC0_3D_ZCULL_REGION 0x00001590
+
+#define NVC0_3D_STENCIL_TWO_SIDE_ENABLE 0x00001594
+
+#define NVC0_3D_STENCIL_BACK_OP_FAIL 0x00001598
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_ZERO 0x00000000
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_INCR 0x00001e02
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_DECR 0x00001e03
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL 0x0000159c
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS 0x000015a0
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507
+#define NVC0_3D_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508
+
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC 0x000015a4
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206
+#define NVC0_3D_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207
+
+#define NVC0_3D_CSAA_ENABLE 0x000015b4
+
+#define NVC0_3D_FRAMEBUFFER_SRGB 0x000015b8
+
+#define NVC0_3D_POLYGON_OFFSET_UNITS 0x000015bc
+
+#define NVC0_3D_LAYER 0x000015cc
+#define NVC0_3D_LAYER_IDX__MASK 0x0000ffff
+#define NVC0_3D_LAYER_IDX__SHIFT 0
+#define NVC0_3D_LAYER_USE_GP 0x00010000
+
+#define NVC0_3D_MULTISAMPLE_MODE 0x000015d0
+#define NVC0_3D_MULTISAMPLE_MODE_MS1 0x00000000
+#define NVC0_3D_MULTISAMPLE_MODE_MS2 0x00000001
+#define NVC0_3D_MULTISAMPLE_MODE_MS4 0x00000002
+#define NVC0_3D_MULTISAMPLE_MODE_MS8 0x00000003
+#define NVC0_3D_MULTISAMPLE_MODE_MS8_ALT 0x00000004
+#define NVC0_3D_MULTISAMPLE_MODE_MS2_ALT 0x00000005
+#define NVC0_3D_MULTISAMPLE_MODE_UNK6 0x00000006
+#define NVC0_3D_MULTISAMPLE_MODE_MS4_CS4 0x00000008
+#define NVC0_3D_MULTISAMPLE_MODE_MS4_CS12 0x00000009
+#define NVC0_3D_MULTISAMPLE_MODE_MS8_CS8 0x0000000a
+#define NVC0_3D_MULTISAMPLE_MODE_MS8_CS24 0x0000000b
+
+#define NVC0_3D_VERTEX_BEGIN_D3D 0x000015d4
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE__MASK 0x0fffffff
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE__SHIFT 0
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_POINTS 0x00000001
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINES 0x00000002
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINE_STRIP 0x00000003
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLES 0x00000004
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLE_STRIP 0x00000005
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINES_ADJACENCY 0x0000000a
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_LINE_STRIP_ADJACENCY 0x0000000b
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLES_ADJACENCY 0x0000000c
+#define NVC0_3D_VERTEX_BEGIN_D3D_PRIMITIVE_TRIANGLE_STRIP_ADJACENCY 0x0000000d
+#define NVC0_3D_VERTEX_BEGIN_D3D_INSTANCE_NEXT 0x10000000
+
+#define NVC0_3D_VERTEX_END_D3D 0x000015d8
+#define NVC0_3D_VERTEX_END_D3D_UNK0 0x00000001
+#define NVC0_3D_VERTEX_END_D3D_UNK1 0x00000002
+
+#define NVC0_3D_EDGEFLAG 0x000015e4
+
+#define NVC0_3D_VB_ELEMENT_U32 0x000015e8
+
+#define NVC0_3D_VB_ELEMENT_U16_SETUP 0x000015ec
+#define NVC0_3D_VB_ELEMENT_U16_SETUP_OFFSET__MASK 0xc0000000
+#define NVC0_3D_VB_ELEMENT_U16_SETUP_OFFSET__SHIFT 30
+#define NVC0_3D_VB_ELEMENT_U16_SETUP_COUNT__MASK 0x3fffffff
+#define NVC0_3D_VB_ELEMENT_U16_SETUP_COUNT__SHIFT 0
+
+#define NVC0_3D_VB_ELEMENT_U16 0x000015f0
+#define NVC0_3D_VB_ELEMENT_U16_I0__MASK 0x0000ffff
+#define NVC0_3D_VB_ELEMENT_U16_I0__SHIFT 0
+#define NVC0_3D_VB_ELEMENT_U16_I1__MASK 0xffff0000
+#define NVC0_3D_VB_ELEMENT_U16_I1__SHIFT 16
+
+#define NVC0_3D_VERTEX_BASE_HIGH 0x000015f4
+
+#define NVC0_3D_VERTEX_BASE_LOW 0x000015f8
+
+#define NVC0_3D_ZCULL_WINDOW_OFFSET_X 0x000015fc
+
+#define NVC0_3D_ZCULL_WINDOW_OFFSET_Y 0x00001600
+
+#define NVC0_3D_POINT_COORD_REPLACE 0x00001604
+#define NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN__MASK 0x00000004
+#define NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN__SHIFT 2
+#define NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN_LOWER_LEFT 0x00000000
+#define NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN_UPPER_LEFT 0x00000004
+#define NVC0_3D_POINT_COORD_REPLACE_ENABLE__MASK 0x000007f8
+#define NVC0_3D_POINT_COORD_REPLACE_ENABLE__SHIFT 3
+
+#define NVC0_3D_CODE_ADDRESS_HIGH 0x00001608
+
+#define NVC0_3D_CODE_ADDRESS_LOW 0x0000160c
+
+#define NVC0_3D_VERTEX_END_GL 0x00001614
+#define NVC0_3D_VERTEX_END_GL_UNK0 0x00000001
+#define NVC0_3D_VERTEX_END_GL_UNK1 0x00000002
+
+#define NVC0_3D_VERTEX_BEGIN_GL 0x00001618
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE__MASK 0x0fffffff
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE__SHIFT 0
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS 0x00000000
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINES 0x00000001
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_LOOP 0x00000002
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_STRIP 0x00000003
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES 0x00000004
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_STRIP 0x00000005
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_FAN 0x00000006
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_QUADS 0x00000007
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_QUAD_STRIP 0x00000008
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POLYGON 0x00000009
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINES_ADJACENCY 0x0000000a
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_LINE_STRIP_ADJACENCY 0x0000000b
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES_ADJACENCY 0x0000000c
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLE_STRIP_ADJACENCY 0x0000000d
+#define NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_PATCHES 0x0000000e
+#define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT 0x04000000
+#define NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT 0x08000000
+
+#define NVC0_3D_VERTEX_ID_REPLACE 0x0000161c
+#define NVC0_3D_VERTEX_ID_REPLACE_ENABLE 0x00000001
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__MASK 0x00000ff0
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE__SHIFT 4
+
+#define NVC0_3D_VERTEX_DATA 0x00001640
+
+#define NVC0_3D_PRIM_RESTART_ENABLE 0x00001644
+
+#define NVC0_3D_PRIM_RESTART_INDEX 0x00001648
+
+#define NVC0_3D_VP_GP_BUILTIN_ATTR_EN 0x0000164c
+#define NVC0_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID 0x00000001
+#define NVC0_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID 0x00000010
+#define NVC0_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID 0x00000100
+#define NVC0_3D_VP_GP_BUILTIN_ATTR_EN_UNK12 0x00001000
+
+#define NVC0_3D_POINT_SMOOTH_ENABLE 0x00001658
+
+#define NVC0_3D_POINT_RASTER_RULES 0x0000165c
+#define NVC0_3D_POINT_RASTER_RULES_OGL 0x00000000
+#define NVC0_3D_POINT_RASTER_RULES_D3D 0x00000001
+
+#define NVC0_3D_TEX_MISC 0x00001664
+#define NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP 0x00000004
+
+#define NVC0_3D_LINE_STIPPLE_ENABLE 0x0000166c
+
+#define NVC0_3D_LINE_STIPPLE_PATTERN 0x00001680
+
+#define NVC0_3D_PROVOKING_VERTEX_LAST 0x00001684
+
+#define NVC0_3D_VERTEX_TWO_SIDE_ENABLE 0x00001688
+
+#define NVC0_3D_POLYGON_STIPPLE_ENABLE 0x0000168c
+
+#define NVC0_3D_POLYGON_STIPPLE_PATTERN(i0) (0x00001700 + 0x4*(i0))
+#define NVC0_3D_POLYGON_STIPPLE_PATTERN__ESIZE 0x00000004
+#define NVC0_3D_POLYGON_STIPPLE_PATTERN__LEN 0x00000020
+
+#define NVC0_3D_ZETA_BASE_LAYER 0x0000179c
+
+#define NVC0_3D_VERTEX_QUARANTINE_ADDRESS_HIGH 0x000017bc
+
+#define NVC0_3D_VERTEX_QUARANTINE_ADDRESS_LOW 0x000017c0
+
+#define NVC0_3D_VERTEX_QUARANTINE_SIZE 0x000017c4
+#define NVC0_3D_VERTEX_QUARANTINE_SIZE_16K 0x00000001
+#define NVC0_3D_VERTEX_QUARANTINE_SIZE_32K 0x00000002
+#define NVC0_3D_VERTEX_QUARANTINE_SIZE_64K 0x00000003
+
+#define NVC0_3D_STRMOUT_UNK1780(i0) (0x00001780 + 0x4*(i0))
+#define NVC0_3D_STRMOUT_UNK1780__ESIZE 0x00000004
+#define NVC0_3D_STRMOUT_UNK1780__LEN 0x00000004
+
+#define NVC0_3D_UNK17BC_ADDRESS_HIGH 0x000017bc
+
+#define NVC0_3D_UNK17BC_ADDRESS_LOW 0x000017c0
+
+#define NVC0_3D_UNK17BC_LIMIT 0x000017c4
+
+#define NVC0_3D_INDEX_ARRAY_START_HIGH 0x000017c8
+
+#define NVC0_3D_INDEX_ARRAY_START_LOW 0x000017cc
+
+#define NVC0_3D_INDEX_ARRAY_LIMIT_HIGH 0x000017d0
+
+#define NVC0_3D_INDEX_ARRAY_LIMIT_LOW 0x000017d4
+
+#define NVC0_3D_INDEX_LOG2_SIZE 0x000017d8
+
+#define NVC0_3D_INDEX_BATCH_FIRST 0x000017dc
+
+#define NVC0_3D_INDEX_BATCH_COUNT 0x000017e0
+
+#define NVC0_3D_POLYGON_OFFSET_CLAMP 0x0000187c
+
+#define NVC0_3D_VERTEX_ARRAY_PER_INSTANCE(i0) (0x00001880 + 0x4*(i0))
+#define NVC0_3D_VERTEX_ARRAY_PER_INSTANCE__ESIZE 0x00000004
+#define NVC0_3D_VERTEX_ARRAY_PER_INSTANCE__LEN 0x00000020
+
+#define NVC0_3D_VP_POINT_SIZE_EN 0x00001910
+
+#define NVC0_3D_CULL_FACE_ENABLE 0x00001918
+
+#define NVC0_3D_FRONT_FACE 0x0000191c
+#define NVC0_3D_FRONT_FACE_CW 0x00000900
+#define NVC0_3D_FRONT_FACE_CCW 0x00000901
+
+#define NVC0_3D_CULL_FACE 0x00001920
+#define NVC0_3D_CULL_FACE_FRONT 0x00000404
+#define NVC0_3D_CULL_FACE_BACK 0x00000405
+#define NVC0_3D_CULL_FACE_FRONT_AND_BACK 0x00000408
+
+#define NVC0_3D_LINE_LAST_PIXEL 0x00001924
+
+#define NVC0_3D_VIEWPORT_TRANSFORM_EN 0x0000192c
+
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL 0x0000193c
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_RANGE_0_1 0x00000001
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1__MASK 0x00000006
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1__SHIFT 1
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK0 0x00000000
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK1 0x00000002
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK2 0x00000004
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_NEAR 0x00000008
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_FAR 0x00000010
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK7 0x00000080
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK10 0x00000400
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK11 0x00000800
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12__MASK 0x00003000
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12__SHIFT 12
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK0 0x00000000
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK1 0x00001000
+#define NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK2 0x00002000
+
+#define NVC0_3D_CLIP_DISTANCE_MODE 0x00001940
+#define NVC0_3D_CLIP_DISTANCE_MODE_0__MASK 0x00000001
+#define NVC0_3D_CLIP_DISTANCE_MODE_0__SHIFT 0
+#define NVC0_3D_CLIP_DISTANCE_MODE_0_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_0_CULL 0x00000001
+#define NVC0_3D_CLIP_DISTANCE_MODE_1__MASK 0x00000010
+#define NVC0_3D_CLIP_DISTANCE_MODE_1__SHIFT 4
+#define NVC0_3D_CLIP_DISTANCE_MODE_1_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_1_CULL 0x00000010
+#define NVC0_3D_CLIP_DISTANCE_MODE_2__MASK 0x00000100
+#define NVC0_3D_CLIP_DISTANCE_MODE_2__SHIFT 8
+#define NVC0_3D_CLIP_DISTANCE_MODE_2_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_2_CULL 0x00000100
+#define NVC0_3D_CLIP_DISTANCE_MODE_3__MASK 0x00001000
+#define NVC0_3D_CLIP_DISTANCE_MODE_3__SHIFT 12
+#define NVC0_3D_CLIP_DISTANCE_MODE_3_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_3_CULL 0x00001000
+#define NVC0_3D_CLIP_DISTANCE_MODE_4__MASK 0x00010000
+#define NVC0_3D_CLIP_DISTANCE_MODE_4__SHIFT 16
+#define NVC0_3D_CLIP_DISTANCE_MODE_4_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_4_CULL 0x00010000
+#define NVC0_3D_CLIP_DISTANCE_MODE_5__MASK 0x00100000
+#define NVC0_3D_CLIP_DISTANCE_MODE_5__SHIFT 20
+#define NVC0_3D_CLIP_DISTANCE_MODE_5_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_5_CULL 0x00100000
+#define NVC0_3D_CLIP_DISTANCE_MODE_6__MASK 0x01000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_6__SHIFT 24
+#define NVC0_3D_CLIP_DISTANCE_MODE_6_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_6_CULL 0x01000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_7__MASK 0x10000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_7__SHIFT 28
+#define NVC0_3D_CLIP_DISTANCE_MODE_7_CLIP 0x00000000
+#define NVC0_3D_CLIP_DISTANCE_MODE_7_CULL 0x10000000
+
+#define NVC0_3D_CLIP_RECTS_EN 0x0000194c
+
+#define NVC0_3D_CLIP_RECTS_MODE 0x00001950
+#define NVC0_3D_CLIP_RECTS_MODE_INSIDE_ANY 0x00000000
+#define NVC0_3D_CLIP_RECTS_MODE_OUTSIDE_ALL 0x00000001
+#define NVC0_3D_CLIP_RECTS_MODE_NEVER 0x00000002
+
+#define NVC0_3D_ZCULL_INVALIDATE 0x00001958
+
+#define NVC0_3D_ZCULL_TEST_MASK 0x0000196c
+#define NVC0_3D_ZCULL_TEST_MASK_FAIL_GT_PASS_LT 0x00000001
+#define NVC0_3D_ZCULL_TEST_MASK_PASS_GT_FAIL_LT 0x00000010
+
+#define NVC0_3D_FP_ZORDER_CTRL 0x0000196c
+#define NVC0_3D_FP_ZORDER_CTRL_0 0x00000001
+#define NVC0_3D_FP_ZORDER_CTRL_1 0x00000010
+
+#define NVC0_3D_CLIPID_ENABLE 0x0000197c
+
+#define NVC0_3D_CLIPID_WIDTH 0x00001980
+#define NVC0_3D_CLIPID_WIDTH__MAX 0x00002000
+#define NVC0_3D_CLIPID_WIDTH__ALIGN 0x00000040
+
+#define NVC0_3D_CLIPID_ID 0x00001984
+
+#define NVC0_3D_DEPTH_BOUNDS_EN 0x000019bc
+
+#define NVC0_3D_LOGIC_OP_ENABLE 0x000019c4
+
+#define NVC0_3D_LOGIC_OP 0x000019c8
+#define NVC0_3D_LOGIC_OP_CLEAR 0x00001500
+#define NVC0_3D_LOGIC_OP_AND 0x00001501
+#define NVC0_3D_LOGIC_OP_AND_REVERSE 0x00001502
+#define NVC0_3D_LOGIC_OP_COPY 0x00001503
+#define NVC0_3D_LOGIC_OP_AND_INVERTED 0x00001504
+#define NVC0_3D_LOGIC_OP_NOOP 0x00001505
+#define NVC0_3D_LOGIC_OP_XOR 0x00001506
+#define NVC0_3D_LOGIC_OP_OR 0x00001507
+#define NVC0_3D_LOGIC_OP_NOR 0x00001508
+#define NVC0_3D_LOGIC_OP_EQUIV 0x00001509
+#define NVC0_3D_LOGIC_OP_INVERT 0x0000150a
+#define NVC0_3D_LOGIC_OP_OR_REVERSE 0x0000150b
+#define NVC0_3D_LOGIC_OP_COPY_INVERTED 0x0000150c
+#define NVC0_3D_LOGIC_OP_OR_INVERTED 0x0000150d
+#define NVC0_3D_LOGIC_OP_NAND 0x0000150e
+#define NVC0_3D_LOGIC_OP_SET 0x0000150f
+
+#define NVC0_3D_ZETA_COMP_ENABLE 0x000019cc
+
+#define NVC0_3D_CLEAR_BUFFERS 0x000019d0
+#define NVC0_3D_CLEAR_BUFFERS_Z 0x00000001
+#define NVC0_3D_CLEAR_BUFFERS_S 0x00000002
+#define NVC0_3D_CLEAR_BUFFERS_R 0x00000004
+#define NVC0_3D_CLEAR_BUFFERS_G 0x00000008
+#define NVC0_3D_CLEAR_BUFFERS_B 0x00000010
+#define NVC0_3D_CLEAR_BUFFERS_A 0x00000020
+#define NVC0_3D_CLEAR_BUFFERS_RT__MASK 0x000003c0
+#define NVC0_3D_CLEAR_BUFFERS_RT__SHIFT 6
+#define NVC0_3D_CLEAR_BUFFERS_LAYER__MASK 0x001ffc00
+#define NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT 10
+
+#define NVC0_3D_CLIPID_FILL 0x000019d4
+
+#define NVC0_3D_RT_COMP_ENABLE(i0) (0x000019e0 + 0x4*(i0))
+#define NVC0_3D_RT_COMP_ENABLE__ESIZE 0x00000004
+#define NVC0_3D_RT_COMP_ENABLE__LEN 0x00000008
+
+#define NVC0_3D_COLOR_MASK(i0) (0x00001a00 + 0x4*(i0))
+#define NVC0_3D_COLOR_MASK__ESIZE 0x00000004
+#define NVC0_3D_COLOR_MASK__LEN 0x00000008
+#define NVC0_3D_COLOR_MASK_R 0x0000000f
+#define NVC0_3D_COLOR_MASK_G 0x000000f0
+#define NVC0_3D_COLOR_MASK_B 0x00000f00
+#define NVC0_3D_COLOR_MASK_A 0x0000f000
+
+#define NVC0_3D_QUERY_ADDRESS_HIGH 0x00001b00
+
+#define NVC0_3D_QUERY_ADDRESS_LOW 0x00001b04
+
+#define NVC0_3D_QUERY_SEQUENCE 0x00001b08
+
+#define NVC0_3D_QUERY_GET 0x00001b0c
+#define NVC0_3D_QUERY_GET_MODE__MASK 0x00000003
+#define NVC0_3D_QUERY_GET_MODE__SHIFT 0
+#define NVC0_3D_QUERY_GET_MODE_WRITE_UNK0 0x00000000
+#define NVC0_3D_QUERY_GET_MODE_SYNC 0x00000001
+#define NVC0_3D_QUERY_GET_MODE_WRITE_UNK2 0x00000002
+#define NVC0_3D_QUERY_GET_FENCE 0x00000010
+#define NVC0_3D_QUERY_GET_STREAM__MASK 0x000000e0
+#define NVC0_3D_QUERY_GET_STREAM__SHIFT 5
+#define NVC0_3D_QUERY_GET_UNK8 0x00000100
+#define NVC0_3D_QUERY_GET_UNIT__MASK 0x0000f000
+#define NVC0_3D_QUERY_GET_UNIT__SHIFT 12
+#define NVC0_3D_QUERY_GET_SYNC_COND__MASK 0x00010000
+#define NVC0_3D_QUERY_GET_SYNC_COND__SHIFT 16
+#define NVC0_3D_QUERY_GET_SYNC_COND_NEQUAL 0x00000000
+#define NVC0_3D_QUERY_GET_SYNC_COND_GREATER 0x00010000
+#define NVC0_3D_QUERY_GET_INTR 0x00100000
+#define NVC0_3D_QUERY_GET_UNK21 0x00200000
+#define NVC0_3D_QUERY_GET_SELECT__MASK 0x0f800000
+#define NVC0_3D_QUERY_GET_SELECT__SHIFT 23
+#define NVC0_3D_QUERY_GET_SELECT_ZERO 0x00000000
+#define NVC0_3D_QUERY_GET_SELECT_SAMPLECNT 0x01000000
+#define NVC0_3D_QUERY_GET_SELECT_EMITTED_PRIMS 0x05800000
+#define NVC0_3D_QUERY_GET_SELECT_GENERATED_PRIMS 0x09000000
+#define NVC0_3D_QUERY_GET_SHORT 0x10000000
+
+#define NVC0_3D_VERTEX_ARRAY_FETCH(i0) (0x00001c00 + 0x10*(i0))
+#define NVC0_3D_VERTEX_ARRAY_FETCH__ESIZE 0x00000010
+#define NVC0_3D_VERTEX_ARRAY_FETCH__LEN 0x00000020
+#define NVC0_3D_VERTEX_ARRAY_FETCH_STRIDE__MASK 0x00000fff
+#define NVC0_3D_VERTEX_ARRAY_FETCH_STRIDE__SHIFT 0
+#define NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE 0x00001000
+
+#define NVC0_3D_VERTEX_ARRAY_START_HIGH(i0) (0x00001c04 + 0x10*(i0))
+#define NVC0_3D_VERTEX_ARRAY_START_HIGH__ESIZE 0x00000010
+#define NVC0_3D_VERTEX_ARRAY_START_HIGH__LEN 0x00000020
+
+#define NVC0_3D_VERTEX_ARRAY_START_LOW(i0) (0x00001c08 + 0x10*(i0))
+#define NVC0_3D_VERTEX_ARRAY_START_LOW__ESIZE 0x00000010
+#define NVC0_3D_VERTEX_ARRAY_START_LOW__LEN 0x00000020
+
+#define NVC0_3D_VERTEX_ARRAY_DIVISOR(i0) (0x00001c0c + 0x10*(i0))
+#define NVC0_3D_VERTEX_ARRAY_DIVISOR__ESIZE 0x00000010
+#define NVC0_3D_VERTEX_ARRAY_DIVISOR__LEN 0x00000020
+
+#define NVC0_3D_IBLEND(i0) (0x00001e00 + 0x20*(i0))
+#define NVC0_3D_IBLEND__ESIZE 0x00000020
+#define NVC0_3D_IBLEND__LEN 0x00000008
+
+#define NVC0_3D_IBLEND_EQUATION_RGB(i0) (0x00001e04 + 0x20*(i0))
+#define NVC0_3D_IBLEND_EQUATION_RGB_FUNC_ADD 0x00008006
+#define NVC0_3D_IBLEND_EQUATION_RGB_MIN 0x00008007
+#define NVC0_3D_IBLEND_EQUATION_RGB_MAX 0x00008008
+#define NVC0_3D_IBLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a
+#define NVC0_3D_IBLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVC0_3D_IBLEND_FUNC_SRC_RGB(i0) (0x00001e08 + 0x20*(i0))
+
+#define NVC0_3D_IBLEND_FUNC_DST_RGB(i0) (0x00001e0c + 0x20*(i0))
+
+#define NVC0_3D_IBLEND_EQUATION_ALPHA(i0) (0x00001e10 + 0x20*(i0))
+#define NVC0_3D_IBLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006
+#define NVC0_3D_IBLEND_EQUATION_ALPHA_MIN 0x00008007
+#define NVC0_3D_IBLEND_EQUATION_ALPHA_MAX 0x00008008
+#define NVC0_3D_IBLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a
+#define NVC0_3D_IBLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b
+
+#define NVC0_3D_IBLEND_FUNC_SRC_ALPHA(i0) (0x00001e14 + 0x20*(i0))
+
+#define NVC0_3D_IBLEND_FUNC_DST_ALPHA(i0) (0x00001e18 + 0x20*(i0))
+
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_HIGH(i0) (0x00001f00 + 0x8*(i0))
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_HIGH__ESIZE 0x00000008
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_HIGH__LEN 0x00000020
+
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_LOW(i0) (0x00001f04 + 0x8*(i0))
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_LOW__ESIZE 0x00000008
+#define NVC0_3D_VERTEX_ARRAY_LIMIT_LOW__LEN 0x00000020
+
+#define NVC0_3D_SP(i0) (0x00002000 + 0x40*(i0))
+#define NVC0_3D_SP__ESIZE 0x00000040
+#define NVC0_3D_SP__LEN 0x00000006
+
+#define NVC0_3D_SP_SELECT(i0) (0x00002000 + 0x40*(i0))
+#define NVC0_3D_SP_SELECT_ENABLE 0x00000001
+#define NVC0_3D_SP_SELECT_PROGRAM__MASK 0x00000070
+#define NVC0_3D_SP_SELECT_PROGRAM__SHIFT 4
+#define NVC0_3D_SP_SELECT_PROGRAM_VP_A 0x00000000
+#define NVC0_3D_SP_SELECT_PROGRAM_VP_B 0x00000010
+#define NVC0_3D_SP_SELECT_PROGRAM_TCP 0x00000020
+#define NVC0_3D_SP_SELECT_PROGRAM_TEP 0x00000030
+#define NVC0_3D_SP_SELECT_PROGRAM_GP 0x00000040
+#define NVC0_3D_SP_SELECT_PROGRAM_FP 0x00000050
+
+#define NVC0_3D_SP_START_ID(i0) (0x00002004 + 0x40*(i0))
+
+#define NVC0_3D_SP_GPR_ALLOC(i0) (0x0000200c + 0x40*(i0))
+
+#define NVC0_3D_TEX_LIMITS(i0) (0x00002200 + 0x10*(i0))
+#define NVC0_3D_TEX_LIMITS__ESIZE 0x00000010
+#define NVC0_3D_TEX_LIMITS__LEN 0x00000005
+
+#define NVC0_3D_FIRMWARE(i0) (0x00002300 + 0x4*(i0))
+#define NVC0_3D_FIRMWARE__ESIZE 0x00000004
+#define NVC0_3D_FIRMWARE__LEN 0x00000020
+
+#define NVC0_3D_CB_SIZE 0x00002380
+
+#define NVC0_3D_CB_ADDRESS_HIGH 0x00002384
+
+#define NVC0_3D_CB_ADDRESS_LOW 0x00002388
+
+#define NVC0_3D_CB_POS 0x0000238c
+
+#define NVC0_3D_CB_DATA(i0) (0x00002390 + 0x4*(i0))
+#define NVC0_3D_CB_DATA__ESIZE 0x00000004
+#define NVC0_3D_CB_DATA__LEN 0x00000010
+
+#define NVC0_3D_BIND_TSC(i0) (0x00002400 + 0x20*(i0))
+#define NVC0_3D_BIND_TSC__ESIZE 0x00000020
+#define NVC0_3D_BIND_TSC__LEN 0x00000005
+#define NVC0_3D_BIND_TSC_ACTIVE 0x00000001
+#define NVC0_3D_BIND_TSC_SAMPLER__MASK 0x00000ff0
+#define NVC0_3D_BIND_TSC_SAMPLER__SHIFT 4
+#define NVC0_3D_BIND_TSC_TSC__MASK 0x01fff000
+#define NVC0_3D_BIND_TSC_TSC__SHIFT 12
+
+#define NVC0_3D_BIND_TIC(i0) (0x00002404 + 0x20*(i0))
+#define NVC0_3D_BIND_TIC__ESIZE 0x00000020
+#define NVC0_3D_BIND_TIC__LEN 0x00000005
+#define NVC0_3D_BIND_TIC_ACTIVE 0x00000001
+#define NVC0_3D_BIND_TIC_TEXTURE__MASK 0x000001fe
+#define NVC0_3D_BIND_TIC_TEXTURE__SHIFT 1
+#define NVC0_3D_BIND_TIC_TIC__MASK 0x7ffffe00
+#define NVC0_3D_BIND_TIC_TIC__SHIFT 9
+
+#define NVC0_3D_CB_BIND(i0) (0x00002410 + 0x20*(i0))
+#define NVC0_3D_CB_BIND__ESIZE 0x00000020
+#define NVC0_3D_CB_BIND__LEN 0x00000005
+#define NVC0_3D_CB_BIND_VALID 0x00000001
+#define NVC0_3D_CB_BIND_INDEX__MASK 0x000000f0
+#define NVC0_3D_CB_BIND_INDEX__SHIFT 4
+
+#define NVC0_3D_VERT_COLOR_CLAMP_EN 0x00002600
+
+#define NVE4_3D_TEX_CB_INDEX 0x00002608
+#define NVE4_3D_TEX_CB_INDEX__MIN 0x00000000
+#define NVE4_3D_TEX_CB_INDEX__MAX 0x00000010
+
+#define NVC0_3D_TFB_VARYING_LOCS(i0, i1) (0x00002800 + 0x80*(i0) + 0x4*(i1))
+#define NVC0_3D_TFB_VARYING_LOCS__ESIZE 0x00000004
+#define NVC0_3D_TFB_VARYING_LOCS__LEN 0x00000020
+
+#define NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE 0x00003800
+
+#define NVC0_3D_MACRO_VERTEX_ARRAY_SELECT 0x00003808
+
+#define NVC0_3D_MACRO_BLEND_ENABLES 0x00003810
+
+#define NVC0_3D_MACRO_POLYGON_MODE_FRONT 0x00003818
+#define NVC0_3D_MACRO_POLYGON_MODE_FRONT_POINT 0x00001b00
+#define NVC0_3D_MACRO_POLYGON_MODE_FRONT_LINE 0x00001b01
+#define NVC0_3D_MACRO_POLYGON_MODE_FRONT_FILL 0x00001b02
+
+#define NVC0_3D_MACRO_POLYGON_MODE_BACK 0x00003820
+#define NVC0_3D_MACRO_POLYGON_MODE_BACK_POINT 0x00001b00
+#define NVC0_3D_MACRO_POLYGON_MODE_BACK_LINE 0x00001b01
+#define NVC0_3D_MACRO_POLYGON_MODE_BACK_FILL 0x00001b02
+
+#define NVC0_3D_MACRO_GP_SELECT 0x00003828
+
+#define NVC0_3D_MACRO_TEP_SELECT 0x00003830
+
+
+#endif /* NVC0_3D_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3ddefs.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3ddefs.xml.h
new file mode 100644
index 00000000000..84b152213a2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3ddefs.xml.h
@@ -0,0 +1,98 @@
+#ifndef NV_3DDEFS_XML
+#define NV_3DDEFS_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nvc0_3d.xml ( 26312 bytes, from 2010-10-08 10:10:01)
+- copyright.xml ( 6498 bytes, from 2010-10-03 13:18:37)
+- nv_defs.xml ( 4437 bytes, from 2010-07-06 07:43:58)
+- nv_3ddefs.xml ( 16397 bytes, from 2010-10-08 13:30:38)
+- nv_object.xml ( 11249 bytes, from 2010-10-07 15:31:28)
+- nvchipsets.xml ( 2824 bytes, from 2010-07-07 13:41:20)
+- nv50_defs.xml ( 4482 bytes, from 2010-10-03 13:18:37)
+
+Copyright (C) 2006-2010 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro, curro_, currojerez)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+#define NV50_3D_BLEND_FACTOR_ZERO 0x00004000
+#define NV50_3D_BLEND_FACTOR_ONE 0x00004001
+#define NV50_3D_BLEND_FACTOR_SRC_COLOR 0x00004300
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC_COLOR 0x00004301
+#define NV50_3D_BLEND_FACTOR_SRC_ALPHA 0x00004302
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA 0x00004303
+#define NV50_3D_BLEND_FACTOR_DST_ALPHA 0x00004304
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_DST_ALPHA 0x00004305
+#define NV50_3D_BLEND_FACTOR_DST_COLOR 0x00004306
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_DST_COLOR 0x00004307
+#define NV50_3D_BLEND_FACTOR_SRC_ALPHA_SATURATE 0x00004308
+#define NV50_3D_BLEND_FACTOR_CONSTANT_COLOR 0x0000c001
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR 0x0000c002
+#define NV50_3D_BLEND_FACTOR_CONSTANT_ALPHA 0x0000c003
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA 0x0000c004
+#define NV50_3D_BLEND_FACTOR_SRC1_COLOR 0x0000c900
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR 0x0000c901
+#define NV50_3D_BLEND_FACTOR_SRC1_ALPHA 0x0000c902
+#define NV50_3D_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA 0x0000c903
+
+#endif /* NV_3DDEFS_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
new file mode 100644
index 00000000000..b49f1aecfec
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2013 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Christoph Bumiller, Samuel Pitoiset
+ */
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_compute.h"
+
+/* One-time per-screen initialization of the compute engine: allocates the
+ * compute class object and a parameter buffer, then emits the static state
+ * setup (memory windows, local/shared memory, code segment, param constbuf).
+ * Returns 0 on success, negative error code (or -1 for unknown chipset) on
+ * failure.
+ */
+int
+nvc0_screen_compute_setup(struct nvc0_screen *screen,
+                          struct nouveau_pushbuf *push)
+{
+   struct nouveau_object *chan = screen->base.channel;
+   struct nouveau_device *dev = screen->base.device;
+   uint32_t obj_class;
+   int ret;
+   int i;
+
+   /* Select the compute class by chipset generation. */
+   switch (dev->chipset & 0xf0) {
+   case 0xc0:
+      if (dev->chipset == 0xc8)
+         obj_class = NVC8_COMPUTE_CLASS;
+      else
+         obj_class = NVC0_COMPUTE_CLASS;
+      break;
+   case 0xd0:
+      obj_class = NVC0_COMPUTE_CLASS;
+      break;
+   default:
+      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+      return -1;
+   }
+
+   ret = nouveau_object_new(chan, 0xbeef90c0, obj_class, NULL, 0,
+                            &screen->compute);
+   if (ret) {
+      NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
+      return ret;
+   }
+
+   /* 4 KiB VRAM buffer that will hold kernel input parameters (bound below
+    * as constant buffer index 0). */
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 1 << 12, NULL,
+                        &screen->parm);
+   if (ret)
+      return ret;
+
+   /* Bind the compute object on its subchannel. */
+   BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->compute->oclass);
+
+   /* hardware limit */
+   BEGIN_NVC0(push, NVC0_COMPUTE(MP_LIMIT), 1);
+   PUSH_DATA (push, screen->mp_count);
+   BEGIN_NVC0(push, NVC0_COMPUTE(CALL_LIMIT_LOG), 1);
+   PUSH_DATA (push, 0xf);
+
+   /* NOTE(review): 0x02a0 is an undocumented method; value taken as-is. */
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1);
+   PUSH_DATA (push, 0x8000);
+
+   /* global memory setup */
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+   PUSH_DATA (push, 0);
+   /* Program all 256 GLOBAL windows identity-mapped; 0xc << 28 presumably
+    * selects the addressing mode — confirm against rnndb/blob traces. */
+   BEGIN_NIC0(push, NVC0_COMPUTE(GLOBAL_BASE), 0x100);
+   for (i = 0; i <= 0xff; i++)
+      PUSH_DATA (push, (0xc << 28) | (i << 16) | i);
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
+   PUSH_DATA (push, 1);
+
+   /* local memory and cstack setup */
+   BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->tls->offset);
+   PUSH_DATA (push, screen->tls->offset);
+   BEGIN_NVC0(push, NVC0_COMPUTE(TEMP_SIZE_HIGH), 2);
+   PUSH_DATAh(push, screen->tls->size);
+   PUSH_DATA (push, screen->tls->size);
+   BEGIN_NVC0(push, NVC0_COMPUTE(WARP_TEMP_ALLOC), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1);
+   PUSH_DATA (push, 1 << 24);
+
+   /* shared memory setup */
+   BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1);
+   PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1);
+   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1);
+   PUSH_DATA (push, 2 << 24);
+   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1);
+   PUSH_DATA (push, 0);
+
+   /* code segment setup */
+   BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->text->offset);
+   PUSH_DATA (push, screen->text->offset);
+
+   /* bind parameters buffer */
+   BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
+   PUSH_DATA (push, screen->parm->size);
+   PUSH_DATAh(push, screen->parm->offset);
+   PUSH_DATA (push, screen->parm->offset);
+   BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+   PUSH_DATA (push, (0 << 8) | 1);
+
+   /* TODO: textures & samplers */
+
+   return 0;
+}
+
+/* Ensure the currently bound compute program is translated and uploaded to
+ * the code segment; a code-cache flush is emitted after a fresh upload.
+ * Returns TRUE if the program is ready to be launched, FALSE on failure.
+ */
+boolean
+nvc0_compute_validate_program(struct nvc0_context *nvc0)
+{
+   struct nvc0_program *prog = nvc0->compprog;
+
+   /* Already resident in the code segment: nothing to do. */
+   if (prog->mem)
+      return TRUE;
+
+   if (!prog->translated) {
+      prog->translated = nvc0_program_translate(
+         prog, nvc0->screen->base.device->chipset);
+      if (!prog->translated)
+         return FALSE;
+   }
+   if (unlikely(!prog->code_size))
+      return FALSE;
+
+   /* code_size is known non-zero here, so the former
+    * "if (likely(prog->code_size))" re-check was redundant and has been
+    * dropped. */
+   if (nvc0_program_upload_code(nvc0, prog)) {
+      struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+      /* flush the code cache so the MPs see the freshly uploaded code */
+      BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
+      PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CODE);
+      return TRUE;
+   }
+   return FALSE;
+}
+
+/* Validate all state required for a compute grid launch.
+ * Currently only the compute program itself is validated (see TODO);
+ * also attaches the compute bufctx and validates the push buffer. */
+static boolean
+nvc0_compute_state_validate(struct nvc0_context *nvc0)
+{
+   if (!nvc0_compute_validate_program(nvc0))
+      return FALSE;
+
+   /* TODO: textures, samplers, surfaces, global memory buffers */
+
+   nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+
+   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
+   /* non-zero return indicates validation failure */
+   if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
+      return FALSE;
+   /* NOTE(review): if validation implicitly kicked the pushbuf, fences
+    * are re-applied to the new buffer list — confirm against
+    * nvc0_bufctx_fence semantics */
+   if (unlikely(nvc0->state.flushed))
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+
+   return TRUE;
+
+}
+
+/* Upload the kernel input parameters into the screen's parameters
+ * buffer and bind it as constant buffer 0 on the compute engine.
+ * No-op when the program declares no parameters (parm_size == 0). */
+static void
+nvc0_compute_upload_input(struct nvc0_context *nvc0, const void *input)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   struct nvc0_program *cp = nvc0->compprog;
+
+   if (cp->parm_size) {
+      /* point CB 0 at the parameters buffer (size aligned to 256 B) */
+      BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
+      PUSH_DATA (push, align(cp->parm_size, 0x100));
+      PUSH_DATAh(push, screen->parm->offset);
+      PUSH_DATA (push, screen->parm->offset);
+      BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
+      PUSH_DATA (push, (0 << 8) | 1);
+      /* NOTE: size is limited to 4 KiB, which is < NV04_PFIFO_MAX_PACKET_LEN */
+      BEGIN_1IC0(push, NVC0_COMPUTE(CB_POS), 1 + cp->parm_size / 4);
+      PUSH_DATA (push, 0);
+      PUSH_DATAp(push, input, cp->parm_size / 4);
+
+      /* flush the constant buffer cache so the kernel sees the data */
+      BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
+      PUSH_DATA (push, NVC0_COMPUTE_FLUSH_CB);
+   }
+}
+
+/* Launch a compute grid on Fermi (NVC0).
+ * block_layout / grid_layout are {x, y, z} triples; label selects the
+ * kernel entry point within the compute program; input points at the
+ * kernel arguments to upload. */
+void
+nvc0_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label,
+                 const void *input)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_program *cp = nvc0->compprog;
+   unsigned s, i;
+   int ret;
+
+   ret = !nvc0_compute_state_validate(nvc0);
+   if (ret)
+      goto out;
+
+   /* upload the kernel arguments and bind them as CB 0 */
+   nvc0_compute_upload_input(nvc0, input);
+
+   /* entry point of the selected kernel in the code segment */
+   BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1);
+   PUSH_DATA (push, nvc0_program_symbol_offset(cp, label));
+
+   /* per-thread local memory and warp call-stack allocation */
+   BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_POS_ALLOC), 3);
+   PUSH_DATA (push, align(cp->cp.lmem_size, 0x10));
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0x800); /* WARP_CSTACK_SIZE */
+
+   /* shared memory size, threads per block, and barrier count */
+   BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 3);
+   PUSH_DATA (push, align(cp->cp.smem_size, 0x100));
+   PUSH_DATA (push, block_layout[0] * block_layout[1] * block_layout[2]);
+   PUSH_DATA (push, cp->num_barriers);
+   BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1);
+   PUSH_DATA (push, cp->num_gprs);
+
+   /* grid/block setup */
+   BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2);
+   PUSH_DATA (push, (grid_layout[1] << 16) | grid_layout[0]);
+   PUSH_DATA (push, grid_layout[2]);
+   BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2);
+   PUSH_DATA (push, (block_layout[1] << 16) | block_layout[0]);
+   PUSH_DATA (push, block_layout[2]);
+
+   /* launch preliminary setup */
+   BEGIN_NVC0(push, NVC0_COMPUTE(GRIDID), 1);
+   PUSH_DATA (push, 0x1);
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_COMPUTE(FLUSH), 1);
+   PUSH_DATA (push, NVC0_COMPUTE_FLUSH_GLOBAL | NVC0_COMPUTE_FLUSH_UNK8);
+
+   /* kernel launching */
+   BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_BEGIN), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0x1000);
+   BEGIN_NVC0(push, NVC0_COMPUTE(COMPUTE_END), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1);
+   PUSH_DATA (push, 0x1);
+
+   /* rebind all the 3D constant buffers
+    * (looks like binding a CB on COMPUTE clobbers 3D state) */
+   nvc0->dirty |= NVC0_NEW_CONSTBUF;
+   for (s = 0; s < 6; s++) {
+      for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; i++)
+         if (nvc0->constbuf[s][i].u.buf)
+            nvc0->constbuf_dirty[s] |= 1 << i;
+   }
+   /* forget which uniform buffers were bound for 3D */
+   memset(nvc0->state.uniform_buffer_bound, 0,
+          sizeof(nvc0->state.uniform_buffer_bound));
+
+out:
+   if (ret)
+      NOUVEAU_ERR("Failed to launch grid !\n");
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
new file mode 100644
index 00000000000..9a1a71760d7
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.h
@@ -0,0 +1,10 @@
+#ifndef NVC0_COMPUTE_H
+#define NVC0_COMPUTE_H
+
+#include "nv50/nv50_defs.xml.h"
+#include "nvc0/nvc0_compute.xml.h"
+
+boolean
+nvc0_compute_validate_program(struct nvc0_context *nvc0);
+
+#endif /* NVC0_COMPUTE_H */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.xml.h
new file mode 100644
index 00000000000..35e6bfdbea2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.xml.h
@@ -0,0 +1,410 @@
+#ifndef NVC0_COMPUTE_XML
+#define NVC0_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nvc0_compute.xml ( 11145 bytes, from 2013-04-27 14:00:13)
+- copyright.xml ( 6452 bytes, from 2013-02-27 22:13:22)
+- nvchipsets.xml ( 3954 bytes, from 2013-04-27 14:00:13)
+- nv_object.xml ( 14395 bytes, from 2013-04-27 14:00:13)
+- nv_defs.xml ( 4437 bytes, from 2013-02-27 22:13:22)
+- nv50_defs.xml ( 16652 bytes, from 2013-06-20 13:45:33)
+
+Copyright (C) 2006-2013 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVC0_COMPUTE_LOCAL_POS_ALLOC 0x00000204
+
+#define NVC0_COMPUTE_LOCAL_NEG_ALLOC 0x00000208
+
+#define NVC0_COMPUTE_WARP_CSTACK_SIZE 0x0000020c
+
+#define NVC0_COMPUTE_TEX_LIMITS 0x00000210
+#define NVC0_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MASK 0x0000000f
+#define NVC0_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__SHIFT 0
+#define NVC0_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MIN 0x00000000
+#define NVC0_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MAX 0x00000004
+#define NVC0_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MASK 0x000000f0
+#define NVC0_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__SHIFT 4
+#define NVC0_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MIN 0x00000000
+#define NVC0_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MAX 0x00000007
+
+#define NVC0_COMPUTE_SHARED_BASE 0x00000214
+
+#define NVC0_COMPUTE_MEM_BARRIER 0x0000021c
+#define NVC0_COMPUTE_MEM_BARRIER_UNK0 0x00000001
+#define NVC0_COMPUTE_MEM_BARRIER_UNK1 0x00000002
+#define NVC0_COMPUTE_MEM_BARRIER_UNK2 0x00000004
+#define NVC0_COMPUTE_MEM_BARRIER_UNK4 0x00000010
+#define NVC0_COMPUTE_MEM_BARRIER_UNK8 0x00000100
+#define NVC0_COMPUTE_MEM_BARRIER_UNK12 0x00001000
+
+#define NVC0_COMPUTE_BIND_TSC 0x00000228
+#define NVC0_COMPUTE_BIND_TSC_ACTIVE 0x00000001
+#define NVC0_COMPUTE_BIND_TSC_SAMPLER__MASK 0x00000ff0
+#define NVC0_COMPUTE_BIND_TSC_SAMPLER__SHIFT 4
+#define NVC0_COMPUTE_BIND_TSC_TSC__MASK 0x01fff000
+#define NVC0_COMPUTE_BIND_TSC_TSC__SHIFT 12
+
+#define NVC0_COMPUTE_BIND_TIC 0x0000022c
+#define NVC0_COMPUTE_BIND_TIC_ACTIVE 0x00000001
+#define NVC0_COMPUTE_BIND_TIC_TEXTURE__MASK 0x000001fe
+#define NVC0_COMPUTE_BIND_TIC_TEXTURE__SHIFT 1
+#define NVC0_COMPUTE_BIND_TIC_TIC__MASK 0x7ffffe00
+#define NVC0_COMPUTE_BIND_TIC_TIC__SHIFT 9
+
+#define NVC0_COMPUTE_BIND_TSC2 0x00000230
+#define NVC0_COMPUTE_BIND_TSC2_ACTIVE 0x00000001
+#define NVC0_COMPUTE_BIND_TSC2_SAMPLER__MASK 0x00000010
+#define NVC0_COMPUTE_BIND_TSC2_SAMPLER__SHIFT 4
+#define NVC0_COMPUTE_BIND_TSC2_TSC__MASK 0x01fff000
+#define NVC0_COMPUTE_BIND_TSC2_TSC__SHIFT 12
+
+#define NVC0_COMPUTE_BIND_TIC2 0x00000234
+#define NVC0_COMPUTE_BIND_TIC2_ACTIVE 0x00000001
+#define NVC0_COMPUTE_BIND_TIC2_TEXTURE__MASK 0x00000002
+#define NVC0_COMPUTE_BIND_TIC2_TEXTURE__SHIFT 1
+#define NVC0_COMPUTE_BIND_TIC2_TIC__MASK 0x7ffffe00
+#define NVC0_COMPUTE_BIND_TIC2_TIC__SHIFT 9
+
+#define NVC0_COMPUTE_GRIDDIM_YX 0x00000238
+#define NVC0_COMPUTE_GRIDDIM_YX_X__MASK 0x0000ffff
+#define NVC0_COMPUTE_GRIDDIM_YX_X__SHIFT 0
+#define NVC0_COMPUTE_GRIDDIM_YX_Y__MASK 0xffff0000
+#define NVC0_COMPUTE_GRIDDIM_YX_Y__SHIFT 16
+
+#define NVC0_COMPUTE_GRIDDIM_Z 0x0000023c
+
+#define NVC0_COMPUTE_UNK244_TIC_FLUSH 0x00000244
+
+#define NVC0_COMPUTE_SHARED_SIZE 0x0000024c
+
+#define NVC0_COMPUTE_THREADS_ALLOC 0x00000250
+
+#define NVC0_COMPUTE_BARRIER_ALLOC 0x00000254
+
+#define NVC0_COMPUTE_UNK028C 0x0000028c
+
+#define NVC0_COMPUTE_COMPUTE_BEGIN 0x0000029c
+#define NVC0_COMPUTE_COMPUTE_BEGIN_UNK0 0x00000001
+
+#define NVC0_COMPUTE_UNK02A0 0x000002a0
+
+#define NVC0_COMPUTE_CP_GPR_ALLOC 0x000002c0
+
+#define NVC0_COMPUTE_UNK02C4 0x000002c4
+
+#define NVC0_COMPUTE_GLOBAL_BASE 0x000002c8
+#define NVC0_COMPUTE_GLOBAL_BASE_HIGH__MASK 0x000000ff
+#define NVC0_COMPUTE_GLOBAL_BASE_HIGH__SHIFT 0
+#define NVC0_COMPUTE_GLOBAL_BASE_INDEX__MASK 0x00ff0000
+#define NVC0_COMPUTE_GLOBAL_BASE_INDEX__SHIFT 16
+#define NVC0_COMPUTE_GLOBAL_BASE_READ_OK 0x40000000
+#define NVC0_COMPUTE_GLOBAL_BASE_WRITE_OK 0x80000000
+
+#define NVC8_COMPUTE_UNK02E0 0x000002e0
+
+#define NVC0_COMPUTE_CACHE_SPLIT 0x00000308
+#define NVC0_COMPUTE_CACHE_SPLIT_16K_SHARED_48K_L1 0x00000001
+#define NVC0_COMPUTE_CACHE_SPLIT_48K_SHARED_16K_L1 0x00000003
+
+#define NVC0_COMPUTE_UNK030C 0x0000030c
+
+#define NVC0_COMPUTE_UNK0360 0x00000360
+#define NVC0_COMPUTE_UNK0360_UNK0 0x00000001
+#define NVC0_COMPUTE_UNK0360_UNK8__MASK 0x00000300
+#define NVC0_COMPUTE_UNK0360_UNK8__SHIFT 8
+#define NVC8_COMPUTE_UNK0360_UNK10__MASK 0x00000c00
+#define NVC8_COMPUTE_UNK0360_UNK10__SHIFT 10
+
+#define NVC0_COMPUTE_LAUNCH 0x00000368
+
+#define NVC0_COMPUTE_UNK036C 0x0000036c
+#define NVC0_COMPUTE_UNK036C_UNK0__MASK 0x00000003
+#define NVC0_COMPUTE_UNK036C_UNK0__SHIFT 0
+#define NVC8_COMPUTE_UNK036C_UNK2__MASK 0x0000000c
+#define NVC8_COMPUTE_UNK036C_UNK2__SHIFT 2
+
+#define NVC0_COMPUTE_BLOCKDIM_YX 0x000003ac
+#define NVC0_COMPUTE_BLOCKDIM_YX_X__MASK 0x0000ffff
+#define NVC0_COMPUTE_BLOCKDIM_YX_X__SHIFT 0
+#define NVC0_COMPUTE_BLOCKDIM_YX_Y__MASK 0xffff0000
+#define NVC0_COMPUTE_BLOCKDIM_YX_Y__SHIFT 16
+
+#define NVC0_COMPUTE_BLOCKDIM_Z 0x000003b0
+
+#define NVC0_COMPUTE_CP_START_ID 0x000003b4
+
+#define NVC0_COMPUTE_FIRMWARE(i0) (0x00000500 + 0x4*(i0))
+#define NVC0_COMPUTE_FIRMWARE__ESIZE 0x00000004
+#define NVC0_COMPUTE_FIRMWARE__LEN 0x00000020
+
+#define NVC0_COMPUTE_MP_LIMIT 0x00000758
+
+#define NVC0_COMPUTE_LOCAL_BASE 0x0000077c
+
+#define NVC0_COMPUTE_GRIDID 0x00000780
+
+#define NVC0_COMPUTE_TEMP_ADDRESS_HIGH 0x00000790
+
+#define NVC0_COMPUTE_TEMP_ADDRESS_LOW 0x00000794
+
+#define NVC0_COMPUTE_TEMP_SIZE_HIGH 0x00000798
+
+#define NVC0_COMPUTE_TEMP_SIZE_LOW 0x0000079c
+
+#define NVC0_COMPUTE_WARP_TEMP_ALLOC 0x000007a0
+
+#define NVC0_COMPUTE_COMPUTE_END 0x00000a04
+#define NVC0_COMPUTE_COMPUTE_END_UNK0 0x00000001
+
+#define NVC0_COMPUTE_UNK0A08 0x00000a08
+
+#define NVC0_COMPUTE_CALL_LIMIT_LOG 0x00000d64
+
+#define NVC0_COMPUTE_UNK0D94 0x00000d94
+
+#define NVC0_COMPUTE_WATCHDOG_TIMER 0x00000de4
+
+#define NVC0_COMPUTE_UNK10F4 0x000010f4
+#define NVC0_COMPUTE_UNK10F4_UNK0 0x00000001
+#define NVC0_COMPUTE_UNK10F4_UNK4 0x00000010
+#define NVC0_COMPUTE_UNK10F4_UNK8 0x00000100
+
+#define NVC0_COMPUTE_LINKED_TSC 0x00001234
+
+#define NVC0_COMPUTE_UNK1288_TIC_FLUSH 0x00001288
+
+#define NVC0_COMPUTE_UNK12AC 0x000012ac
+
+#define NVC0_COMPUTE_TSC_FLUSH 0x00001330
+#define NVC0_COMPUTE_TSC_FLUSH_SPECIFIC 0x00000001
+#define NVC0_COMPUTE_TSC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVC0_COMPUTE_TSC_FLUSH_ENTRY__SHIFT 4
+
+#define NVC0_COMPUTE_TIC_FLUSH 0x00001334
+#define NVC0_COMPUTE_TIC_FLUSH_SPECIFIC 0x00000001
+#define NVC0_COMPUTE_TIC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVC0_COMPUTE_TIC_FLUSH_ENTRY__SHIFT 4
+
+#define NVC0_COMPUTE_TEX_CACHE_CTL 0x00001338
+#define NVC0_COMPUTE_TEX_CACHE_CTL_UNK0__MASK 0x00000007
+#define NVC0_COMPUTE_TEX_CACHE_CTL_UNK0__SHIFT 0
+#define NVC0_COMPUTE_TEX_CACHE_CTL_ENTRY__MASK 0x03fffff0
+#define NVC0_COMPUTE_TEX_CACHE_CTL_ENTRY__SHIFT 4
+
+#define NVC0_COMPUTE_UNK1354 0x00001354
+
+#define NVC0_COMPUTE_UNK1424_TSC_FLUSH 0x00001424
+
+#define NVC0_COMPUTE_COND_ADDRESS_HIGH 0x00001550
+
+#define NVC0_COMPUTE_COND_ADDRESS_LOW 0x00001554
+
+#define NVC0_COMPUTE_COND_MODE 0x00001558
+#define NVC0_COMPUTE_COND_MODE_NEVER 0x00000000
+#define NVC0_COMPUTE_COND_MODE_ALWAYS 0x00000001
+#define NVC0_COMPUTE_COND_MODE_RES_NON_ZERO 0x00000002
+#define NVC0_COMPUTE_COND_MODE_EQUAL 0x00000003
+#define NVC0_COMPUTE_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NVC0_COMPUTE_TSC_ADDRESS_HIGH 0x0000155c
+
+#define NVC0_COMPUTE_TSC_ADDRESS_LOW 0x00001560
+
+#define NVC0_COMPUTE_TSC_LIMIT 0x00001564
+
+#define NVC0_COMPUTE_TIC_ADDRESS_HIGH 0x00001574
+
+#define NVC0_COMPUTE_TIC_ADDRESS_LOW 0x00001578
+
+#define NVC0_COMPUTE_TIC_LIMIT 0x0000157c
+
+#define NVC0_COMPUTE_CODE_ADDRESS_HIGH 0x00001608
+
+#define NVC0_COMPUTE_CODE_ADDRESS_LOW 0x0000160c
+
+#define NVC0_COMPUTE_TEX_MISC 0x00001664
+#define NVC0_COMPUTE_TEX_MISC_UNK 0x00000001
+#define NVC0_COMPUTE_TEX_MISC_SEAMLESS_CUBE_MAP 0x00000002
+
+#define NVC0_COMPUTE_UNK1690 0x00001690
+#define NVC0_COMPUTE_UNK1690_ALWAYS_DERIV 0x00000001
+#define NVC0_COMPUTE_UNK1690_UNK16 0x00010000
+
+#define NVC0_COMPUTE_CB_BIND 0x00001694
+#define NVC0_COMPUTE_CB_BIND_VALID 0x00000001
+#define NVC0_COMPUTE_CB_BIND_INDEX__MASK 0x00001f00
+#define NVC0_COMPUTE_CB_BIND_INDEX__SHIFT 8
+
+#define NVC0_COMPUTE_FLUSH 0x00001698
+#define NVC0_COMPUTE_FLUSH_CODE 0x00000001
+#define NVC0_COMPUTE_FLUSH_GLOBAL 0x00000010
+#define NVC0_COMPUTE_FLUSH_UNK8 0x00000100
+#define NVC0_COMPUTE_FLUSH_CB 0x00001000
+
+#define NVC0_COMPUTE_UNK1930 0x00001930
+
+#define NVC0_COMPUTE_UNK1944 0x00001944
+
+#define NVC0_COMPUTE_DELAY 0x00001a24
+
+#define NVC0_COMPUTE_UNK1A2C(i0) (0x00001a2c + 0x4*(i0))
+#define NVC0_COMPUTE_UNK1A2C__ESIZE 0x00000004
+#define NVC0_COMPUTE_UNK1A2C__LEN 0x00000005
+
+#define NVC0_COMPUTE_QUERY_ADDRESS_HIGH 0x00001b00
+
+#define NVC0_COMPUTE_QUERY_ADDRESS_LOW 0x00001b04
+
+#define NVC0_COMPUTE_QUERY_SEQUENCE 0x00001b08
+
+#define NVC0_COMPUTE_QUERY_GET 0x00001b0c
+#define NVC0_COMPUTE_QUERY_GET_MODE__MASK 0x00000003
+#define NVC0_COMPUTE_QUERY_GET_MODE__SHIFT 0
+#define NVC0_COMPUTE_QUERY_GET_MODE_WRITE 0x00000000
+#define NVC0_COMPUTE_QUERY_GET_MODE_WRITE_INTR_NRHOST 0x00000003
+#define NVC0_COMPUTE_QUERY_GET_INTR 0x00100000
+#define NVC0_COMPUTE_QUERY_GET_SHORT 0x10000000
+
+#define NVC0_COMPUTE_CB_SIZE 0x00002380
+
+#define NVC0_COMPUTE_CB_ADDRESS_HIGH 0x00002384
+
+#define NVC0_COMPUTE_CB_ADDRESS_LOW 0x00002388
+
+#define NVC0_COMPUTE_CB_POS 0x0000238c
+
+#define NVC0_COMPUTE_CB_DATA(i0) (0x00002390 + 0x4*(i0))
+#define NVC0_COMPUTE_CB_DATA__ESIZE 0x00000004
+#define NVC0_COMPUTE_CB_DATA__LEN 0x00000010
+
+#define NVC0_COMPUTE_IMAGE(i0) (0x00002700 + 0x20*(i0))
+#define NVC0_COMPUTE_IMAGE__ESIZE 0x00000020
+#define NVC0_COMPUTE_IMAGE__LEN 0x00000008
+
+#define NVC0_COMPUTE_IMAGE_ADDRESS_HIGH(i0) (0x00002700 + 0x20*(i0))
+
+#define NVC0_COMPUTE_IMAGE_ADDRESS_LOW(i0) (0x00002704 + 0x20*(i0))
+
+#define NVC0_COMPUTE_IMAGE_WIDTH(i0) (0x00002708 + 0x20*(i0))
+
+#define NVC0_COMPUTE_IMAGE_HEIGHT(i0) (0x0000270c + 0x20*(i0))
+#define NVC0_COMPUTE_IMAGE_HEIGHT_HEIGHT__MASK 0x0000ffff
+#define NVC0_COMPUTE_IMAGE_HEIGHT_HEIGHT__SHIFT 0
+#define NVC0_COMPUTE_IMAGE_HEIGHT_UNK16 0x00010000
+#define NVC0_COMPUTE_IMAGE_HEIGHT_LINEAR 0x00100000
+
+#define NVC0_COMPUTE_IMAGE_FORMAT(i0) (0x00002710 + 0x20*(i0))
+#define NVC0_COMPUTE_IMAGE_FORMAT_UNK0 0x00000001
+#define NVC0_COMPUTE_IMAGE_FORMAT_FORMAT_COLOR__MASK 0x00000ff0
+#define NVC0_COMPUTE_IMAGE_FORMAT_FORMAT_COLOR__SHIFT 4
+#define NVC0_COMPUTE_IMAGE_FORMAT_FORMAT_ZETA__MASK 0x0001f000
+#define NVC0_COMPUTE_IMAGE_FORMAT_FORMAT_ZETA__SHIFT 12
+
+#define NVC0_COMPUTE_IMAGE_TILE_MODE(i0) (0x00002714 + 0x20*(i0))
+
+#define NVC0_COMPUTE_MP_PM_SET(i0) (0x0000335c + 0x4*(i0))
+#define NVC0_COMPUTE_MP_PM_SET__ESIZE 0x00000004
+#define NVC0_COMPUTE_MP_PM_SET__LEN 0x00000008
+
+#define NVC0_COMPUTE_MP_PM_SIGSEL(i0) (0x0000337c + 0x4*(i0))
+#define NVC0_COMPUTE_MP_PM_SIGSEL__ESIZE 0x00000004
+#define NVC0_COMPUTE_MP_PM_SIGSEL__LEN 0x00000008
+
+#define NVC0_COMPUTE_MP_PM_SRCSEL(i0) (0x0000339c + 0x4*(i0))
+#define NVC0_COMPUTE_MP_PM_SRCSEL__ESIZE 0x00000004
+#define NVC0_COMPUTE_MP_PM_SRCSEL__LEN 0x00000008
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP0__MASK 0x00000007
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP0__SHIFT 0
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG0__MASK 0x00000070
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG0__SHIFT 4
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP1__MASK 0x00000700
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP1__SHIFT 8
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG1__MASK 0x00007000
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG1__SHIFT 12
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP2__MASK 0x00070000
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP2__SHIFT 16
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG2__MASK 0x00700000
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG2__SHIFT 20
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP3__MASK 0x07000000
+#define NVC0_COMPUTE_MP_PM_SRCSEL_GRP3__SHIFT 24
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG3__MASK 0x70000000
+#define NVC0_COMPUTE_MP_PM_SRCSEL_SIG3__SHIFT 28
+
+#define NVC0_COMPUTE_MP_PM_OP(i0) (0x000033bc + 0x4*(i0))
+#define NVC0_COMPUTE_MP_PM_OP__ESIZE 0x00000004
+#define NVC0_COMPUTE_MP_PM_OP__LEN 0x00000008
+#define NVC0_COMPUTE_MP_PM_OP_MODE__MASK 0x00000001
+#define NVC0_COMPUTE_MP_PM_OP_MODE__SHIFT 0
+#define NVC0_COMPUTE_MP_PM_OP_MODE_LOGOP 0x00000000
+#define NVC0_COMPUTE_MP_PM_OP_MODE_LOGOP_PULSE 0x00000001
+#define NVC0_COMPUTE_MP_PM_OP_FUNC__MASK 0x000ffff0
+#define NVC0_COMPUTE_MP_PM_OP_FUNC__SHIFT 4
+
+#define NVC0_COMPUTE_MP_PM_UNK33DC 0x000033dc
+
+
+#endif /* NVC0_COMPUTE_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
new file mode 100644
index 00000000000..e0c2b74e196
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_framebuffer.h"
+
+#ifdef NVC0_WITH_DRAW_MODULE
+#include "draw/draw_context.h"
+#endif
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_screen.h"
+#include "nvc0/nvc0_resource.h"
+
+/* pipe_context::flush — kick the queued commands to the GPU and
+ * optionally hand back a fence for the flushed work. */
+static void
+nvc0_flush(struct pipe_context *pipe,
+           struct pipe_fence_handle **fence,
+           unsigned flags)
+{
+   struct nvc0_context *ctx = nvc0_context(pipe);
+
+   if (fence) {
+      struct nouveau_screen *nscreen = &ctx->screen->base;
+      nouveau_fence_ref(nscreen->fence.current,
+                        (struct nouveau_fence **)fence);
+   }
+
+   /* fencing itself is handled in kick_notify */
+   PUSH_KICK(ctx->base.pushbuf);
+
+   nouveau_context_update_frame_stats(&ctx->base);
+}
+
+/* pipe_context::texture_barrier — serialize previous rendering and
+ * invalidate the texture cache so later texture fetches observe prior
+ * framebuffer writes. */
+static void
+nvc0_texture_barrier(struct pipe_context *pipe)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
+   IMMED_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 0);
+}
+
+/* Drop every reference this context holds: buffer contexts, framebuffer
+ * state, vertex/index buffers, sampler views, constant buffers,
+ * surfaces, stream-output targets, and global compute residents. */
+static void
+nvc0_context_unreference_resources(struct nvc0_context *nvc0)
+{
+   unsigned s, i;
+
+   nouveau_bufctx_del(&nvc0->bufctx_3d);
+   nouveau_bufctx_del(&nvc0->bufctx);
+   nouveau_bufctx_del(&nvc0->bufctx_cp);
+
+   util_unreference_framebuffer_state(&nvc0->framebuffer);
+
+   for (i = 0; i < nvc0->num_vtxbufs; ++i)
+      pipe_resource_reference(&nvc0->vtxbuf[i].buffer, NULL);
+
+   pipe_resource_reference(&nvc0->idxbuf.buffer, NULL);
+
+   for (s = 0; s < 6; ++s) {
+      for (i = 0; i < nvc0->num_textures[s]; ++i)
+         pipe_sampler_view_reference(&nvc0->textures[s][i], NULL);
+
+      /* user constbufs do not hold a resource reference */
+      for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; ++i)
+         if (!nvc0->constbuf[s][i].user)
+            pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, NULL);
+   }
+
+   for (s = 0; s < 2; ++s) {
+      for (i = 0; i < NVC0_MAX_SURFACE_SLOTS; ++i)
+         pipe_surface_reference(&nvc0->surfaces[s][i], NULL);
+   }
+
+   for (i = 0; i < nvc0->num_tfbbufs; ++i)
+      pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
+
+   /* release resources made resident via global bindings */
+   for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource **res = util_dynarray_element(
+         &nvc0->global_residents, struct pipe_resource *, i);
+      pipe_resource_reference(res, NULL);
+   }
+   util_dynarray_fini(&nvc0->global_residents);
+}
+
+/* pipe_context::destroy — detach this context from the screen (if it is
+ * the current one), flush outstanding commands, then release all
+ * references and free the context. */
+static void
+nvc0_destroy(struct pipe_context *pipe)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+   if (nvc0->screen->cur_ctx == nvc0) {
+      /* stop kick notifications referencing us and detach our bufctx */
+      nvc0->base.pushbuf->kick_notify = NULL;
+      nvc0->screen->cur_ctx = NULL;
+      nouveau_pushbuf_bufctx(nvc0->base.pushbuf, NULL);
+   }
+   /* flush whatever is still queued before tearing down state */
+   nouveau_pushbuf_kick(nvc0->base.pushbuf, nvc0->base.pushbuf->channel);
+
+   nvc0_context_unreference_resources(nvc0);
+   nvc0_blitctx_destroy(nvc0);
+
+#ifdef NVC0_WITH_DRAW_MODULE
+   draw_destroy(nvc0->draw);
+#endif
+
+   nouveau_context_destroy(&nvc0->base);
+}
+
+/* Default push buffer kick notification: advance/update fences and
+ * remember that a flush happened so state validation can re-emit
+ * buffer fences. */
+void
+nvc0_default_kick_notify(struct nouveau_pushbuf *push)
+{
+   struct nvc0_screen *screen = push->user_priv;
+
+   if (screen) {
+      nouveau_fence_next(&screen->base);
+      nouveau_fence_update(&screen->base, TRUE);
+      if (screen->cur_ctx)
+         screen->cur_ctx->state.flushed = TRUE;
+      /* screen may be NULL (no user_priv), so the stat update must stay
+       * inside the NULL check to avoid dereferencing a NULL screen */
+      NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
+   }
+}
+
+/* Notification that a resource's storage was reallocated or destroyed:
+ * mark every binding point referencing it dirty so state validation
+ * re-emits it, and reset the affected bufctx slots.
+ * ref is the number of references left to account for; returns early
+ * with the remaining count once it reaches zero. */
+static int
+nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
+                                 struct pipe_resource *res,
+                                 int ref)
+{
+   struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned s, i;
+
+   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+      for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
+         if (nvc0->framebuffer.cbufs[i] &&
+             nvc0->framebuffer.cbufs[i]->texture == res) {
+            nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+            nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+            if (!--ref)
+               return ref;
+         }
+      }
+   }
+   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+      if (nvc0->framebuffer.zsbuf &&
+          nvc0->framebuffer.zsbuf->texture == res) {
+         nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+         if (!--ref)
+            return ref;
+      }
+   }
+
+   if (res->bind & PIPE_BIND_VERTEX_BUFFER) {
+      for (i = 0; i < nvc0->num_vtxbufs; ++i) {
+         if (nvc0->vtxbuf[i].buffer == res) {
+            nvc0->dirty |= NVC0_NEW_ARRAYS;
+            nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+            if (!--ref)
+               return ref;
+         }
+      }
+   }
+   if (res->bind & PIPE_BIND_INDEX_BUFFER) {
+      if (nvc0->idxbuf.buffer == res) {
+         nvc0->dirty |= NVC0_NEW_IDXBUF;
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_IDX);
+         if (!--ref)
+            return ref;
+      }
+   }
+
+   if (res->bind & PIPE_BIND_SAMPLER_VIEW) {
+      for (s = 0; s < 5; ++s) {
+         for (i = 0; i < nvc0->num_textures[s]; ++i) {
+            if (nvc0->textures[s][i] &&
+                nvc0->textures[s][i]->texture == res) {
+               nvc0->textures_dirty[s] |= 1 << i;
+               nvc0->dirty |= NVC0_NEW_TEXTURES;
+               nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
+               if (!--ref)
+                  return ref;
+            }
+         }
+      }
+   }
+
+   if (res->bind & PIPE_BIND_CONSTANT_BUFFER) {
+      for (s = 0; s < 5; ++s) {
+         /* BUGFIX: scan all constbuf slots; the old bound num_vtxbufs
+          * was a copy-paste from the vertex buffer loop above */
+         for (i = 0; i < NVC0_MAX_PIPE_CONSTBUFS; ++i) {
+            if (!nvc0->constbuf[s][i].user &&
+                nvc0->constbuf[s][i].u.buf == res) {
+               nvc0->dirty |= NVC0_NEW_CONSTBUF;
+               nvc0->constbuf_dirty[s] |= 1 << i;
+               nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i));
+               if (!--ref)
+                  return ref;
+            }
+         }
+      }
+   }
+
+   return ref;
+}
+
+static void
+nvc0_context_get_sample_position(struct pipe_context *, unsigned, unsigned,
+ float *);
+
+/* Create an NVC0 (Fermi and up) pipe_context for the given screen.
+ * Allocates the context, its buffer contexts, wires up the pipe
+ * callbacks, and pins the screen's permanently resident buffers.
+ * Returns NULL on failure with everything cleaned up. */
+struct pipe_context *
+nvc0_create(struct pipe_screen *pscreen, void *priv)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   struct nvc0_context *nvc0;
+   struct pipe_context *pipe;
+   int ret;
+   uint32_t flags;
+
+   nvc0 = CALLOC_STRUCT(nvc0_context);
+   if (!nvc0)
+      return NULL;
+   pipe = &nvc0->base.pipe;
+
+   if (!nvc0_blitctx_create(nvc0))
+      goto out_err;
+
+   nvc0->base.pushbuf = screen->base.pushbuf;
+   nvc0->base.client = screen->base.client;
+
+   /* a small generic bufctx plus dedicated ones for 3D and compute */
+   ret = nouveau_bufctx_new(screen->base.client, 2, &nvc0->bufctx);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_3D_COUNT,
+                               &nvc0->bufctx_3d);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NVC0_BIND_CP_COUNT,
+                               &nvc0->bufctx_cp);
+   if (ret)
+      goto out_err;
+
+   nvc0->screen = screen;
+   nvc0->base.screen = &screen->base;
+
+   pipe->screen = pscreen;
+   pipe->priv = priv;
+
+   pipe->destroy = nvc0_destroy;
+
+   pipe->draw_vbo = nvc0_draw_vbo;
+   pipe->clear = nvc0_clear;
+   /* NVE4+ class uses its own compute launch path */
+   pipe->launch_grid = (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) ?
+      nve4_launch_grid : nvc0_launch_grid;
+
+   pipe->flush = nvc0_flush;
+   pipe->texture_barrier = nvc0_texture_barrier;
+   pipe->get_sample_position = nvc0_context_get_sample_position;
+
+   if (!screen->cur_ctx) {
+      screen->cur_ctx = nvc0;
+      nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
+   }
+   screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
+
+   nvc0_init_query_functions(nvc0);
+   nvc0_init_surface_functions(nvc0);
+   nvc0_init_state_functions(nvc0);
+   nvc0_init_transfer_functions(nvc0);
+   nvc0_init_resource_functions(pipe);
+
+   nvc0->base.invalidate_resource_storage = nvc0_invalidate_resource_storage;
+
+#ifdef NVC0_WITH_DRAW_MODULE
+   /* no software fallbacks implemented */
+   nvc0->draw = draw_create(pipe);
+   assert(nvc0->draw);
+   draw_set_rasterize_stage(nvc0->draw, nvc0_draw_render_stage(nvc0));
+#endif
+
+   pipe->create_video_codec = nvc0_create_decoder;
+   pipe->create_video_buffer = nvc0_video_buffer_create;
+
+   /* shader builtin library is per-screen, but we need a context for m2mf */
+   nvc0_program_library_upload(nvc0);
+
+   /* add permanently resident buffers to bufctxts */
+
+   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+
+   BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text);
+   BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo);
+   BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->txc);
+   if (screen->compute) {
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->text);
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->txc);
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm);
+   }
+
+   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+
+   BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache);
+   if (screen->compute)
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->tls);
+
+   flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
+
+   BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->fence.bo);
+   BCTX_REFN_bo(nvc0->bufctx, FENCE, flags, screen->fence.bo);
+   if (screen->compute)
+      BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
+
+   nvc0->base.scratch.bo_size = 2 << 20;
+
+   /* presumably ~0 marks tex handles as not yet cached — TODO confirm */
+   memset(nvc0->tex_handles, ~0, sizeof(nvc0->tex_handles));
+
+   util_dynarray_init(&nvc0->global_residents);
+
+   return pipe;
+
+out_err:
+   if (nvc0) {
+      if (nvc0->bufctx_3d)
+         nouveau_bufctx_del(&nvc0->bufctx_3d);
+      if (nvc0->bufctx_cp)
+         nouveau_bufctx_del(&nvc0->bufctx_cp);
+      if (nvc0->bufctx)
+         nouveau_bufctx_del(&nvc0->bufctx);
+      if (nvc0->blit)
+         FREE(nvc0->blit);
+      FREE(nvc0);
+   }
+   return NULL;
+}
+
+/* Walk a bufctx reference list and validate each attached resource
+ * (updating its fencing/status flags).
+ * on_flush selects the 'current' list (after a flush) instead of the
+ * 'pending' one. */
+void
+nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx,
+                  boolean on_flush)
+{
+   struct nouveau_list *list = on_flush ? &bufctx->current : &bufctx->pending;
+   struct nouveau_list *it;
+   NOUVEAU_DRV_STAT_IFD(unsigned count = 0);
+
+   for (it = list->next; it != list; it = it->next) {
+      struct nouveau_bufref *ref = (struct nouveau_bufref *)it;
+      struct nv04_resource *res = ref->priv;
+      /* refs without a priv resource are raw BOs, nothing to validate */
+      if (res)
+         nvc0_resource_validate(res, (unsigned)ref->priv_data);
+      NOUVEAU_DRV_STAT_IFD(count++);
+   }
+   NOUVEAU_DRV_STAT(&nvc0->screen->base, resource_validate_count, count);
+}
+
+/* pipe_context::get_sample_position — return the sub-pixel position of
+ * sample sample_index for the given sample_count, expressed on a
+ * 1/16th-pixel grid scaled into [0, 1). */
+static void
+nvc0_context_get_sample_position(struct pipe_context *pipe,
+                                 unsigned sample_count, unsigned sample_index,
+                                 float *xy)
+{
+   static const uint8_t ms1[1][2] = { { 0x8, 0x8 } };
+   static const uint8_t ms2[2][2] = {
+      { 0x4, 0x4 }, { 0xc, 0xc } }; /* surface coords (0,0), (1,0) */
+   static const uint8_t ms4[4][2] = {
+      { 0x6, 0x2 }, { 0xe, 0x6 }, /* (0,0), (1,0) */
+      { 0x2, 0xa }, { 0xa, 0xe } }; /* (0,1), (1,1) */
+   static const uint8_t ms8[8][2] = {
+      { 0x1, 0x7 }, { 0x5, 0x3 }, /* (0,0), (1,0) */
+      { 0x3, 0xd }, { 0x7, 0xb }, /* (0,1), (1,1) */
+      { 0x9, 0x5 }, { 0xf, 0x1 }, /* (2,0), (3,0) */
+      { 0xb, 0xf }, { 0xd, 0x9 } }; /* (2,1), (3,1) */
+   /* NOTE: there are alternative modes for MS2 and MS8, currently not used */
+
+   const uint8_t (*pos)[2];
+
+   if (sample_count <= 1) {
+      pos = ms1;
+   } else if (sample_count == 2) {
+      pos = ms2;
+   } else if (sample_count == 4) {
+      pos = ms4;
+   } else if (sample_count == 8) {
+      pos = ms8;
+   } else {
+      assert(0);
+      return; /* bad sample count -> undefined locations */
+   }
+
+   xy[0] = pos[sample_index][0] * 0.0625f;
+   xy[1] = pos[sample_index][1] * 0.0625f;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
new file mode 100644
index 00000000000..3fbecdc1391
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -0,0 +1,357 @@
+#ifndef __NVC0_CONTEXT_H__
+#define __NVC0_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+#include "util/u_dynarray.h"
+
+#ifdef NVC0_WITH_DRAW_MODULE
+#include "draw/draw_vertex.h"
+#endif
+
+#include "nv50/nv50_debug.h"
+#include "nvc0/nvc0_winsys.h"
+#include "nvc0/nvc0_stateobj.h"
+#include "nvc0/nvc0_screen.h"
+#include "nvc0/nvc0_program.h"
+#include "nvc0/nvc0_resource.h"
+
+#include "nv50/nv50_transfer.h"
+
+#include "nouveau_context.h"
+
+#include "nvc0/nvc0_3ddefs.xml.h"
+#include "nvc0/nvc0_3d.xml.h"
+#include "nvc0/nvc0_2d.xml.h"
+#include "nvc0/nvc0_m2mf.xml.h"
+#include "nvc0/nve4_p2mf.xml.h"
+
+/* NOTE: must keep NVC0_NEW_...PROG in consecutive bits in this order */
+#define NVC0_NEW_BLEND (1 << 0)
+#define NVC0_NEW_RASTERIZER (1 << 1)
+#define NVC0_NEW_ZSA (1 << 2)
+#define NVC0_NEW_VERTPROG (1 << 3)
+#define NVC0_NEW_TCTLPROG (1 << 4)
+#define NVC0_NEW_TEVLPROG (1 << 5)
+#define NVC0_NEW_GMTYPROG (1 << 6)
+#define NVC0_NEW_FRAGPROG (1 << 7)
+#define NVC0_NEW_BLEND_COLOUR (1 << 8)
+#define NVC0_NEW_STENCIL_REF (1 << 9)
+#define NVC0_NEW_CLIP (1 << 10)
+#define NVC0_NEW_SAMPLE_MASK (1 << 11)
+#define NVC0_NEW_FRAMEBUFFER (1 << 12)
+#define NVC0_NEW_STIPPLE (1 << 13)
+#define NVC0_NEW_SCISSOR (1 << 14)
+#define NVC0_NEW_VIEWPORT (1 << 15)
+#define NVC0_NEW_ARRAYS (1 << 16)
+#define NVC0_NEW_VERTEX (1 << 17)
+#define NVC0_NEW_CONSTBUF (1 << 18)
+#define NVC0_NEW_TEXTURES (1 << 19)
+#define NVC0_NEW_SAMPLERS (1 << 20)
+#define NVC0_NEW_TFB_TARGETS (1 << 21)
+#define NVC0_NEW_IDXBUF (1 << 22)
+#define NVC0_NEW_SURFACES (1 << 23)
+
+#define NVC0_NEW_CP_PROGRAM (1 << 0)
+#define NVC0_NEW_CP_SURFACES (1 << 1)
+#define NVC0_NEW_CP_TEXTURES (1 << 2)
+#define NVC0_NEW_CP_SAMPLERS (1 << 3)
+#define NVC0_NEW_CP_CONSTBUF (1 << 4)
+#define NVC0_NEW_CP_GLOBALS (1 << 5)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
+#define NVC0_BIND_FB 0
+#define NVC0_BIND_VTX 1
+#define NVC0_BIND_VTX_TMP 2
+#define NVC0_BIND_IDX 3
+#define NVC0_BIND_TEX(s, i) ( 4 + 32 * (s) + (i))
+#define NVC0_BIND_CB(s, i) (164 + 16 * (s) + (i))
+#define NVC0_BIND_TFB 244
+#define NVC0_BIND_SUF 245
+#define NVC0_BIND_SCREEN 246
+#define NVC0_BIND_TLS 247
+#define NVC0_BIND_3D_COUNT 248
+
+/* compute bufctx (during launch_grid) */
+#define NVC0_BIND_CP_CB(i) ( 0 + (i))
+#define NVC0_BIND_CP_TEX(i) ( 16 + (i))
+#define NVC0_BIND_CP_SUF 48
+#define NVC0_BIND_CP_GLOBAL 49
+#define NVC0_BIND_CP_DESC 50
+#define NVC0_BIND_CP_SCREEN 51
+#define NVC0_BIND_CP_QUERY 52
+#define NVC0_BIND_CP_COUNT 53
+
+/* bufctx for other operations */
+#define NVC0_BIND_2D 0
+#define NVC0_BIND_M2MF 0
+#define NVC0_BIND_FENCE 1
+
+
+struct nvc0_blitctx;
+
+boolean nvc0_blitctx_create(struct nvc0_context *);
+void nvc0_blitctx_destroy(struct nvc0_context *);
+
+/* Per-context driver state for the NVC0 (Fermi-class) Gallium driver. */
+struct nvc0_context {
+   struct nouveau_context base;
+
+   /* buffer contexts: 3d draws, misc operations (2D/M2MF/fence), compute */
+   struct nouveau_bufctx *bufctx_3d;
+   struct nouveau_bufctx *bufctx;
+   struct nouveau_bufctx *bufctx_cp;
+
+   struct nvc0_screen *screen;
+
+   /* hook for tiled surface copies; implementation varies per chipset */
+   void (*m2mf_copy_rect)(struct nvc0_context *,
+                          const struct nv50_m2mf_rect *dst,
+                          const struct nv50_m2mf_rect *src,
+                          uint32_t nblocksx, uint32_t nblocksy);
+
+   uint32_t dirty;    /* NVC0_NEW_* flags for 3d state */
+   uint32_t dirty_cp; /* dirty flags for compute state */
+
+   /* mirror of state currently programmed in hardware, kept to avoid
+    * redundant pushbuf emission */
+   struct {
+      boolean flushed;
+      boolean rasterizer_discard;
+      boolean early_z_forced;
+      boolean prim_restart;
+      uint32_t instance_elts; /* bitmask of per-instance elements */
+      uint32_t instance_base;
+      uint32_t constant_vbos;
+      uint32_t constant_elts;
+      int32_t index_bias;
+      uint16_t scissor;
+      uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
+      uint8_t num_vtxbufs;
+      uint8_t num_vtxelts;
+      uint8_t num_textures[6];
+      uint8_t num_samplers[6];
+      uint8_t tls_required; /* bitmask of shader types using l[] */
+      uint8_t c14_bound; /* whether immediate array constbuf is bound */
+      uint8_t clip_enable;
+      uint32_t clip_mode;
+      uint32_t uniform_buffer_bound[5];
+      struct nvc0_transform_feedback_state *tfb;
+   } state;
+
+   /* bound CSO state objects */
+   struct nvc0_blend_stateobj *blend;
+   struct nvc0_rasterizer_stateobj *rast;
+   struct nvc0_zsa_stateobj *zsa;
+   struct nvc0_vertex_stateobj *vertex;
+
+   struct nvc0_program *vertprog;
+   struct nvc0_program *tctlprog;
+   struct nvc0_program *tevlprog;
+   struct nvc0_program *gmtyprog;
+   struct nvc0_program *fragprog;
+   struct nvc0_program *compprog;
+
+   /* [6] arrays are indexed by shader stage, see nvc0_shader_stage() */
+   struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
+   uint16_t constbuf_dirty[6];
+
+   struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+   unsigned num_vtxbufs;
+   struct pipe_index_buffer idxbuf;
+   uint32_t constant_vbos;
+   uint32_t vbo_user; /* bitmask of vertex buffers pointing to user memory */
+   uint32_t vb_elt_first; /* from pipe_draw_info, for vertex upload */
+   uint32_t vb_elt_limit; /* max - min element (count - 1) */
+   uint32_t instance_off; /* current base vertex for instanced arrays */
+   uint32_t instance_max; /* last instance for current draw call */
+
+   struct pipe_sampler_view *textures[6][PIPE_MAX_SAMPLERS];
+   unsigned num_textures[6];
+   uint32_t textures_dirty[6];
+   struct nv50_tsc_entry *samplers[6][PIPE_MAX_SAMPLERS];
+   unsigned num_samplers[6];
+   uint16_t samplers_dirty[6];
+
+   uint32_t tex_handles[6][PIPE_MAX_SAMPLERS]; /* for nve4 */
+
+   struct pipe_framebuffer_state framebuffer;
+   struct pipe_blend_color blend_colour;
+   struct pipe_stencil_ref stencil_ref;
+   struct pipe_poly_stipple stipple;
+   struct pipe_scissor_state scissor;
+   struct pipe_viewport_state viewport;
+   struct pipe_clip_state clip;
+
+   unsigned sample_mask;
+
+   boolean vbo_push_hint;
+
+   uint8_t tfbbuf_dirty;
+   struct pipe_stream_output_target *tfbbuf[4];
+   unsigned num_tfbbufs;
+
+   /* conditional-rendering query state */
+   struct pipe_query *cond_query;
+   boolean cond_cond;
+   uint cond_mode;
+
+   struct nvc0_blitctx *blit;
+
+   /* shader-image surface slots; presumably [0]=3d, [1]=compute --
+    * TODO(review): confirm index meaning against nvc0_tex.c */
+   struct pipe_surface *surfaces[2][NVC0_MAX_SURFACE_SLOTS];
+   uint16_t surfaces_dirty[2];
+   uint16_t surfaces_valid[2];
+   uint32_t vport_int[2];
+
+   struct util_dynarray global_residents;
+
+#ifdef NVC0_WITH_DRAW_MODULE
+   struct draw_context *draw;
+#endif
+};
+
+/* Downcast a pipe_context created by this driver to its nvc0_context. */
+static INLINE struct nvc0_context *
+nvc0_context(struct pipe_context *pipe)
+{
+   return (struct nvc0_context *)pipe;
+}
+
+/* Map a PIPE_SHADER_* enum to the driver's shader stage index
+ * (vp = 0, tcp = 1, tep = 2, gp = 3, fp = 4, cp = 5); the tessellation
+ * cases are commented out because they are not supported yet.
+ */
+static INLINE unsigned
+nvc0_shader_stage(unsigned pipe)
+{
+   switch (pipe) {
+   case PIPE_SHADER_VERTEX: return 0;
+/* case PIPE_SHADER_TESSELLATION_CONTROL: return 1; */
+/* case PIPE_SHADER_TESSELLATION_EVALUATION: return 2; */
+   case PIPE_SHADER_GEOMETRY: return 3;
+   case PIPE_SHADER_FRAGMENT: return 4;
+   case PIPE_SHADER_COMPUTE: return 5;
+   default:
+      assert(!"invalid PIPE_SHADER type");
+      return 0;
+   }
+}
+
+
+/* nvc0_context.c */
+struct pipe_context *nvc0_create(struct pipe_screen *, void *);
+void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *,
+ boolean on_flush);
+void nvc0_default_kick_notify(struct nouveau_pushbuf *);
+
+/* nvc0_draw.c */
+extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
+
+/* nvc0_program.c */
+boolean nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+boolean nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
+void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
+void nvc0_program_library_upload(struct nvc0_context *);
+uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
+ uint32_t label);
+
+/* nvc0_query.c */
+void nvc0_init_query_functions(struct nvc0_context *);
+void nvc0_query_pushbuf_submit(struct nouveau_pushbuf *,
+ struct pipe_query *, unsigned result_offset);
+void nvc0_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
+void nvc0_so_target_save_offset(struct pipe_context *,
+ struct pipe_stream_output_target *, unsigned i,
+ boolean *serialize);
+
+#define NVC0_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
+
+/* nvc0_shader_state.c */
+void nvc0_vertprog_validate(struct nvc0_context *);
+void nvc0_tctlprog_validate(struct nvc0_context *);
+void nvc0_tevlprog_validate(struct nvc0_context *);
+void nvc0_gmtyprog_validate(struct nvc0_context *);
+void nvc0_fragprog_validate(struct nvc0_context *);
+
+void nvc0_tfb_validate(struct nvc0_context *);
+
+/* nvc0_state.c */
+extern void nvc0_init_state_functions(struct nvc0_context *);
+
+/* nvc0_state_validate.c */
+void nvc0_validate_global_residents(struct nvc0_context *,
+ struct nouveau_bufctx *, int bin);
+extern boolean nvc0_state_validate(struct nvc0_context *, uint32_t state_mask,
+ unsigned space_words);
+
+/* nvc0_surface.c */
+extern void nvc0_clear(struct pipe_context *, unsigned buffers,
+ const union pipe_color_union *color,
+ double depth, unsigned stencil);
+extern void nvc0_init_surface_functions(struct nvc0_context *);
+
+/* nvc0_tex.c */
+boolean nve4_validate_tsc(struct nvc0_context *nvc0, int s);
+void nvc0_validate_textures(struct nvc0_context *);
+void nvc0_validate_samplers(struct nvc0_context *);
+void nve4_set_tex_handles(struct nvc0_context *);
+void nvc0_validate_surfaces(struct nvc0_context *);
+void nve4_set_surface_info(struct nouveau_pushbuf *, struct pipe_surface *,
+ struct nvc0_screen *);
+
+struct pipe_sampler_view *
+nvc0_create_texture_view(struct pipe_context *,
+ struct pipe_resource *,
+ const struct pipe_sampler_view *,
+ uint32_t flags,
+ enum pipe_texture_target);
+struct pipe_sampler_view *
+nvc0_create_sampler_view(struct pipe_context *,
+ struct pipe_resource *,
+ const struct pipe_sampler_view *);
+
+/* nvc0_transfer.c */
+void
+nvc0_init_transfer_functions(struct nvc0_context *);
+
+void
+nvc0_m2mf_push_linear(struct nouveau_context *nv,
+ struct nouveau_bo *dst, unsigned offset, unsigned domain,
+ unsigned size, const void *data);
+void
+nve4_p2mf_push_linear(struct nouveau_context *nv,
+ struct nouveau_bo *dst, unsigned offset, unsigned domain,
+ unsigned size, const void *data);
+void
+nvc0_cb_push(struct nouveau_context *,
+ struct nouveau_bo *bo, unsigned domain,
+ unsigned base, unsigned size,
+ unsigned offset, unsigned words, const uint32_t *data);
+
+/* nvc0_vbo.c */
+void nvc0_draw_vbo(struct pipe_context *, const struct pipe_draw_info *);
+
+void *
+nvc0_vertex_state_create(struct pipe_context *pipe,
+ unsigned num_elements,
+ const struct pipe_vertex_element *elements);
+void
+nvc0_vertex_state_delete(struct pipe_context *pipe, void *hwcso);
+
+void nvc0_vertex_arrays_validate(struct nvc0_context *);
+
+void nvc0_idxbuf_validate(struct nvc0_context *);
+
+/* nvc0_video.c */
+struct pipe_video_codec *
+nvc0_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ);
+
+struct pipe_video_buffer *
+nvc0_video_buffer_create(struct pipe_context *pipe,
+ const struct pipe_video_buffer *templat);
+
+/* nvc0_push.c */
+void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *);
+
+/* nve4_compute.c */
+void nve4_launch_grid(struct pipe_context *,
+ const uint *, const uint *, uint32_t, const void *);
+
+/* nvc0_compute.c */
+void nvc0_launch_grid(struct pipe_context *,
+ const uint *, const uint *, uint32_t, const void *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_draw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_draw.c
new file mode 100644
index 00000000000..e261d5058fc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_draw.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "draw/draw_pipe.h"
+
+#include "nvc0/nvc0_context.h"
+
+/* Draw-module pipeline stage that carries a pointer back to the context. */
+struct nvc0_render_stage {
+   struct draw_stage stage;
+   struct nvc0_context *nvc0;
+};
+
+/* Downcast a draw_stage embedded in an nvc0_render_stage. */
+static INLINE struct nvc0_render_stage *
+nvc0_render_stage(struct draw_stage *stage)
+{
+   return (struct nvc0_render_stage *)stage;
+}
+
+/* Point emission callback: not implemented, only logs that it was hit. */
+static void
+nvc0_render_point(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* Line emission callback: not implemented, only logs that it was hit. */
+static void
+nvc0_render_line(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* Triangle emission callback: not implemented, only logs that it was hit. */
+static void
+nvc0_render_tri(struct draw_stage *stage, struct prim_header *prim)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* Flush callback: nothing is buffered by this stage, so it is a no-op. */
+static void
+nvc0_render_flush(struct draw_stage *stage, unsigned flags)
+{
+}
+
+/* Stipple-counter reset callback: not implemented, only logs. */
+static void
+nvc0_render_reset_stipple_counter(struct draw_stage *stage)
+{
+   NOUVEAU_ERR("\n");
+}
+
+/* Destroy callback: frees the stage allocated in nvc0_draw_render_stage. */
+static void
+nvc0_render_destroy(struct draw_stage *stage)
+{
+   FREE(stage);
+}
+
+/* Create the draw-module render stage for this context.
+ *
+ * All rasterization callbacks are stubs that only log; the stage exists so
+ * the draw pipeline can be wired up.  Returns NULL on allocation failure.
+ */
+struct draw_stage *
+nvc0_draw_render_stage(struct nvc0_context *nvc0)
+{
+   struct nvc0_render_stage *rs = CALLOC_STRUCT(nvc0_render_stage);
+
+   /* CALLOC_STRUCT may fail; don't dereference a NULL result */
+   if (!rs)
+      return NULL;
+
+   rs->nvc0 = nvc0;
+   rs->stage.draw = nvc0->draw;
+   rs->stage.destroy = nvc0_render_destroy;
+   rs->stage.point = nvc0_render_point;
+   rs->stage.line = nvc0_render_line;
+   rs->stage.tri = nvc0_render_tri;
+   rs->stage.flush = nvc0_render_flush;
+   rs->stage.reset_stipple_counter = nvc0_render_reset_stipple_counter;
+
+   return &rs->stage;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_formats.c b/src/gallium/drivers/nouveau/nvc0/nvc0_formats.c
new file mode 100644
index 00000000000..2bfdb0e076c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_formats.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NOUVEAU_DRIVER 0xc0
+
+#include "nv50/nv50_formats.c"
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_graph_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_graph_macros.h
new file mode 100644
index 00000000000..f009980c629
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_graph_macros.h
@@ -0,0 +1,236 @@
+
+#ifndef __NVC0_PGRAPH_MACROS_H__
+#define __NVC0_PGRAPH_MACROS_H__
+
+/* extrinsrt r1, r2, src, size, dst: replace bits [dst:dst+size) in r1
+ * with bits [src:src+size) in r2
+ *
+ * bra(n)z annul: no delay slot
+ */
+
+/* Bitfield version of NVC0_3D_VERTEX_ARRAY_PER_INSTANCE[].
+ * Args: size, bitfield
+ *
+ * Loops `size` ($r1) times, each iteration extracting one bit of the
+ * bitfield ($r3) at position $r2 and sending it to the auto-incrementing
+ * method address.
+ */
+static const uint32_t nvc0_9097_per_instance_bf[] =
+{
+   0x00000301, /* parm $r3 (the bitfield) */
+   0x00000211, /* mov $r2 0 */
+   0x05880021, /* maddr [NVC0_3D_VERTEX_ARRAY_PER_INSTANCE(0), increment = 4] */
+   0xffffc911, /* mov $r1 (add $r1 -0x1) */
+   0x0040d043, /* send (extrshl $r3 $r2 0x1 0) */
+   0xffff8897, /* exit branz $r1 0x3 */
+   0x00005211 /* mov $r2 (add $r2 0x1) */
+};
+
+/* The comments above the macros describe what they *should* be doing,
+ * but we use less functionality for now.
+ */
+
+/*
+ * for (i = 0; i < 8; ++i)
+ * [NVC0_3D_BLEND_ENABLE(i)] = BIT(i of arg);
+ *
+ * [3428] = arg;
+ *
+ * if (arg == 0 || [NVC0_3D_MULTISAMPLE_ENABLE] == 0)
+ * [0d9c] = 0;
+ * else
+ * [0d9c] = [342c];
+ */
+/* NOTE(review): the send at 0x08 sits after the `exit` at 0x07 but still
+ * appears to execute (all 8 enables are written) -- presumably the macro
+ * processor's exit has a delay slot; confirm against MME docs.
+ */
+static const uint32_t nvc0_9097_blend_enables[] =
+{
+   0x05360021, /* 0x00: maddr [NVC0_3D_BLEND_ENABLE(0), increment = 4] */
+   0x00404042, /* 0x01: send extrinsrt 0 $r1 0 0x1 0 */
+   0x00424042, /* 0x02: send extrinsrt 0 $r1 0x1 0x1 0 */
+   0x00444042, /* 0x03: send extrinsrt 0 $r1 0x2 0x1 0 */
+   0x00464042, /* 0x04: send extrinsrt 0 $r1 0x3 0x1 0 */
+   0x00484042, /* 0x05: send extrinsrt 0 $r1 0x4 0x1 0 */
+   0x004a4042, /* 0x06: send extrinsrt 0 $r1 0x5 0x1 0 */
+   0x004c40c2, /* 0x07: exit send extrinsrt 0 $r1 0x6 0x1 0 */
+   0x004e4042, /* 0x08: send extrinsrt 0 $r1 0x7 0x1 0 */
+};
+
+/*
+ * uint64 limit = (parm(0) << 32) | parm(1);
+ * uint64 start = (parm(2) << 32);
+ *
+ * if (limit) {
+ * start |= parm(3);
+ * --limit;
+ * } else {
+ * start |= 1;
+ * }
+ *
+ * [0x1c04 + (arg & 0xf) * 16 + 0] = (start >> 32) & 0xff;
+ * [0x1c04 + (arg & 0xf) * 16 + 4] = start & 0xffffffff;
+ * [0x1f00 + (arg & 0xf) * 8 + 0] = (limit >> 32) & 0xff;
+ * [0x1f00 + (arg & 0xf) * 8 + 4] = limit & 0xffffffff;
+ */
+/* Instruction listing; the 0xNN: prefixes in the comments are macro PC
+ * values.  $r1 holds the macro argument (the array index), $r2..$r5 the
+ * four parm() values described in the pseudo-code above.
+ */
+static const uint32_t nvc0_9097_vertex_array_select[] =
+{
+   0x00000201, /* 0x00: parm $r2 */
+   0x00000301, /* 0x01: parm $r3 */
+   0x00000401, /* 0x02: parm $r4 */
+   0x00000501, /* 0x03: parm $r5 */
+   0x11004612, /* 0x04: mov $r6 extrinsrt 0 $r1 0 4 2 */
+   0x09004712, /* 0x05: mov $r7 extrinsrt 0 $r1 0 4 1 */
+   0x05c07621, /* 0x06: maddr $r6 add $6 0x1701 */
+   0x00002041, /* 0x07: send $r4 */
+   0x00002841, /* 0x08: send $r5 */
+   0x05f03f21, /* 0x09: maddr $r7 add $7 0x17c0 */
+   0x000010c1, /* 0x0a: exit send $r2 */
+   0x00001841, /* 0x0b: send $r3 */
+};
+
+/*
+ * [GL_POLYGON_MODE_FRONT] = arg;
+ *
+ * if (BIT(31 of [0x3410]))
+ * [1a24] = 0x7353;
+ *
+ * if ([NVC0_3D_SP_SELECT(3)] == 0x31 || [NVC0_3D_SP_SELECT(4)] == 0x41)
+ * [02ec] = 0;
+ * else
+ * if ([GL_POLYGON_MODE_BACK] == GL_LINE || arg == GL_LINE)
+ * [02ec] = BYTE(1 of [0x3410]) << 4;
+ * else
+ * [02ec] = BYTE(0 of [0x3410]) << 4;
+ */
+/* $r1 = macro argument (the new front polygon mode); $r7 is scratch.
+ * See the pseudo-code above for the intended register/method semantics.
+ */
+static const uint32_t nvc0_9097_poly_mode_front[] =
+{
+   0x00db0215, /* 0x00: read $r2 [NVC0_3D_POLYGON_MODE_BACK] */
+   0x020c0315, /* 0x01: read $r3 [NVC0_3D_SP_SELECT(3)] */
+   0x00128f10, /* 0x02: mov $r7 or $r1 $r2 */
+   0x02100415, /* 0x03: read $r4 [NVC0_3D_SP_SELECT(4)] */
+   0x00004211, /* 0x04: mov $r2 0x1 */
+   0x00180611, /* 0x05: mov $r6 0x60 */
+   0x0014bf10, /* 0x06: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x07: braz $r7 0xa */
+   0x00dac021, /* 0x08: maddr 0x36b */
+   0x00800611, /* 0x09: mov $r6 0x200 */
+   0x00131f10, /* 0x0a: mov $r7 or $r3 $r4 */
+   0x0014bf10, /* 0x0b: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x0c: braz $r7 0xf */
+   0x00000841, /* 0x0d: send $r1 */
+   0x00000611, /* 0x0e: mov $r6 0 */
+   0x002ec0a1, /* 0x0f: exit maddr [02ec] */
+   0x00003041 /* 0x10: send $r6 */
+};
+
+/*
+ * [GL_POLYGON_MODE_BACK] = arg;
+ *
+ * if (BIT(31 of [0x3410]))
+ * [1a24] = 0x7353;
+ *
+ * if ([NVC0_3D_SP_SELECT(3)] == 0x31 || [NVC0_3D_SP_SELECT(4)] == 0x41)
+ * [02ec] = 0;
+ * else
+ * if ([GL_POLYGON_MODE_FRONT] == GL_LINE || arg == GL_LINE)
+ * [02ec] = BYTE(1 of [0x3410]) << 4;
+ * else
+ * [02ec] = BYTE(0 of [0x3410]) << 4;
+ */
+/* NOTE: 0x3410 = 0x80002006 by default,
+ * POLYGON_MODE == GL_LINE check replaced by (MODE & 1)
+ * SP_SELECT(i) == (i << 4) | 1 check replaced by SP_SELECT(i) & 1
+ */
+/* Mirror of nvc0_9097_poly_mode_front with FRONT/BACK reads swapped
+ * (compare PCs 0x00 and 0x08); $r1 = macro argument (new back mode).
+ */
+static const uint32_t nvc0_9097_poly_mode_back[] =
+{
+   0x00dac215, /* 0x00: read $r2 [NVC0_3D_POLYGON_MODE_FRONT] */
+   0x020c0315, /* 0x01: read $r3 [NVC0_3D_SP_SELECT(3)] */
+   0x00128f10, /* 0x02: mov $r7 or $r1 $r2 */
+   0x02100415, /* 0x03: read $r4 [NVC0_3D_SP_SELECT(4)] */
+   0x00004211, /* 0x04: mov $r2 0x1 */
+   0x00180611, /* 0x05: mov $r6 0x60 */
+   0x0014bf10, /* 0x06: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x07: braz $r7 0xa */
+   0x00db0021, /* 0x08: maddr 0x36c */
+   0x00800611, /* 0x09: mov $r6 0x200 */
+   0x00131f10, /* 0x0a: mov $r7 or $r3 $r4 */
+   0x0014bf10, /* 0x0b: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x0c: braz $r7 0xf */
+   0x00000841, /* 0x0d: send $r1 */
+   0x00000611, /* 0x0e: mov $r6 0 */
+   0x002ec0a1, /* 0x0f: exit maddr [02ec] */
+   0x00003041 /* 0x10: send $r6 */
+};
+
+/*
+ * [NVC0_3D_SP_SELECT(4)] = arg
+ *
+ * if BIT(31 of [0x3410]) == 0
+ * [1a24] = 0x7353;
+ *
+ * if ([NVC0_3D_SP_SELECT(3)] == 0x31 || arg == 0x41)
+ * [02ec] = 0
+ * else
+ * if (any POLYGON MODE == LINE)
+ * [02ec] = BYTE(1 of [3410]) << 4;
+ * else
+ * [02ec] = BYTE(0 of [3410]) << 4; // 02ec valid bits are 0xff1
+ */
+/* $r1 = macro argument (new SP_SELECT(4) value).  Method addresses in the
+ * comments are word offsets: 0x36b/0x36c = POLYGON_MODE_FRONT/BACK,
+ * 0x830/0x840 = SP_SELECT(3)/(4) as byte addresses -- see poly_mode above.
+ */
+static const uint32_t nvc0_9097_gp_select[] = /* 0x0f */
+{
+   0x00dac215, /* 0x00: read $r2 0x36b */
+   0x00db0315, /* 0x01: read $r3 0x36c */
+   0x0012d710, /* 0x02: mov $r7 or $r2 $r3 */
+   0x020c0415, /* 0x03: read $r4 0x830 */
+   0x00004211, /* 0x04: mov $r2 0x1 */
+   0x00180611, /* 0x05: mov $r6 0x60 */
+   0x0014bf10, /* 0x06: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x07: braz $r7 0xa */
+   0x02100021, /* 0x08: maddr 0x840 */
+   0x00800611, /* 0x09: mov $r6 0x200 */
+   0x00130f10, /* 0x0a: mov $r7 or $r1 $r4 */
+   0x0014bf10, /* 0x0b: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x0c: braz $r7 0xf */
+   0x00000841, /* 0x0d: send $r1 */
+   0x00000611, /* 0x0e: mov $r6 0 */
+   0x002ec0a1, /* 0x0f: exit maddr 0xbb */
+   0x00003041, /* 0x10: send $r6 */
+};
+
+/*
+ * [NVC0_3D_SP_SELECT(3)] = arg
+ *
+ * if BIT(31 of [0x3410]) == 0
+ * [1a24] = 0x7353;
+ *
+ * if (arg == 0x31) {
+ * if (BIT(2 of [0x3430])) {
+ * int i = 15; do { --i; } while(i);
+ * [0x1a2c] = 0;
+ * }
+ * }
+ *
+ * if ([NVC0_3D_SP_SELECT(4)] == 0x41 || arg == 0x31)
+ * [02ec] = 0
+ * else
+ * if ([any POLYGON_MODE] == GL_LINE)
+ * [02ec] = BYTE(1 of [3410]) << 4;
+ * else
+ * [02ec] = BYTE(0 of [3410]) << 4;
+ */
+/* $r1 = macro argument (new SP_SELECT(3) value); identical to gp_select
+ * except reading SP_SELECT(4) (0x840) and writing SP_SELECT(3) (0x830).
+ */
+static const uint32_t nvc0_9097_tep_select[] = /* 0x10 */
+{
+   0x00dac215, /* 0x00: read $r2 0x36b */
+   0x00db0315, /* 0x01: read $r3 0x36c */
+   0x0012d710, /* 0x02: mov $r7 or $r2 $r3 */
+   0x02100415, /* 0x03: read $r4 0x840 */
+   0x00004211, /* 0x04: mov $r2 0x1 */
+   0x00180611, /* 0x05: mov $r6 0x60 */
+   0x0014bf10, /* 0x06: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x07: braz $r7 0xa */
+   0x020c0021, /* 0x08: maddr 0x830 */
+   0x00800611, /* 0x09: mov $r6 0x200 */
+   0x00130f10, /* 0x0a: mov $r7 or $r1 $r4 */
+   0x0014bf10, /* 0x0b: mov $r7 and $r7 $r2 */
+   0x0000f807, /* 0x0c: braz $r7 0xf */
+   0x00000841, /* 0x0d: send $r1 */
+   0x00000611, /* 0x0e: mov $r6 0 */
+   0x002ec0a1, /* 0x0f: exit maddr 0xbb */
+   0x00003041, /* 0x10: send $r6 */
+};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_m2mf.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_m2mf.xml.h
new file mode 100644
index 00000000000..3bf628d425e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_m2mf.xml.h
@@ -0,0 +1,138 @@
+#ifndef NVC0_M2MF_XML
+#define NVC0_M2MF_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nvc0_m2mf.xml ( 2227 bytes, from 2010-10-16 16:10:29)
+- copyright.xml ( 6498 bytes, from 2010-10-03 13:18:37)
+- nv_object.xml ( 11379 bytes, from 2010-10-16 11:43:24)
+- nvchipsets.xml ( 2907 bytes, from 2010-10-15 16:28:21)
+- nv_defs.xml ( 4437 bytes, from 2010-07-06 07:43:58)
+
+Copyright (C) 2006-2010 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro, curro_, currojerez)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVC0_M2MF_TILING_MODE_IN 0x00000204
+
+#define NVC0_M2MF_TILING_PITCH_IN 0x00000208
+
+#define NVC0_M2MF_TILING_HEIGHT_IN 0x0000020c
+
+#define NVC0_M2MF_TILING_DEPTH_IN 0x00000210
+
+#define NVC0_M2MF_TILING_POSITION_IN_Z 0x00000214
+
+#define NVC0_M2MF_TILING_MODE_OUT 0x00000220
+
+#define NVC0_M2MF_TILING_PITCH_OUT 0x00000224
+
+#define NVC0_M2MF_TILING_HEIGHT_OUT 0x00000228
+
+#define NVC0_M2MF_TILING_DEPTH_OUT 0x0000022c
+
+#define NVC0_M2MF_TILING_POSITION_OUT_Z 0x00000230
+
+#define NVC0_M2MF_OFFSET_OUT_HIGH 0x00000238
+
+#define NVC0_M2MF_OFFSET_OUT_LOW 0x0000023c
+
+#define NVC0_M2MF_EXEC 0x00000300
+#define NVC0_M2MF_EXEC_PUSH 0x00000001
+#define NVC0_M2MF_EXEC_LINEAR_IN 0x00000010
+#define NVC0_M2MF_EXEC_LINEAR_OUT 0x00000100
+#define NVC0_M2MF_EXEC_NOTIFY 0x00002000
+#define NVC0_M2MF_EXEC_INC__MASK 0x00f00000
+#define NVC0_M2MF_EXEC_INC__SHIFT 20
+
+#define NVC0_M2MF_DATA 0x00000304
+
+#define NVC0_M2MF_OFFSET_IN_HIGH 0x0000030c
+
+#define NVC0_M2MF_OFFSET_IN_LOW 0x00000310
+
+#define NVC0_M2MF_PITCH_IN 0x00000314
+
+#define NVC0_M2MF_PITCH_OUT 0x00000318
+
+#define NVC0_M2MF_LINE_LENGTH_IN 0x0000031c
+
+#define NVC0_M2MF_LINE_COUNT 0x00000320
+
+#define NVC0_M2MF_NOTIFY_ADDRESS_HIGH 0x0000032c
+
+#define NVC0_M2MF_NOTIFY_ADDRESS_LOW 0x00000330
+
+#define NVC0_M2MF_NOTIFY 0x00000334
+
+#define NVC0_M2MF_TILING_POSITION_IN_X 0x00000344
+
+#define NVC0_M2MF_TILING_POSITION_IN_Y 0x00000348
+
+#define NVC0_M2MF_TILING_POSITION_OUT_X 0x0000034c
+
+#define NVC0_M2MF_TILING_POSITION_OUT_Y 0x00000350
+
+
+#endif /* NVC0_M2MF_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
new file mode 100644
index 00000000000..79c9390b78f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+
+static uint32_t
+nvc0_tex_choose_tile_dims(unsigned nx, unsigned ny, unsigned nz)
+{
+ return nv50_tex_choose_tile_dims_helper(nx, ny, nz);
+}
+
+static uint32_t
+nvc0_mt_choose_storage_type(struct nv50_miptree *mt, boolean compressed)
+{
+ const unsigned ms = util_logbase2(mt->base.base.nr_samples);
+
+ uint32_t tile_flags;
+
+ if (unlikely(mt->base.base.bind & PIPE_BIND_CURSOR))
+ return 0;
+ if (unlikely(mt->base.base.flags & NOUVEAU_RESOURCE_FLAG_LINEAR))
+ return 0;
+
+ switch (mt->base.base.format) {
+ case PIPE_FORMAT_Z16_UNORM:
+ if (compressed)
+ tile_flags = 0x02 + ms;
+ else
+ tile_flags = 0x01;
+ break;
+ case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+ if (compressed)
+ tile_flags = 0x51 + ms;
+ else
+ tile_flags = 0x46;
+ break;
+ case PIPE_FORMAT_Z24X8_UNORM:
+ case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+ if (compressed)
+ tile_flags = 0x17 + ms;
+ else
+ tile_flags = 0x11;
+ break;
+ case PIPE_FORMAT_Z32_FLOAT:
+ if (compressed)
+ tile_flags = 0x86 + ms;
+ else
+ tile_flags = 0x7b;
+ break;
+ case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+ if (compressed)
+ tile_flags = 0xce + ms;
+ else
+ tile_flags = 0xc3;
+ break;
+ default:
+ switch (util_format_get_blocksizebits(mt->base.base.format)) {
+ case 128:
+ if (compressed)
+ tile_flags = 0xf4 + ms * 2;
+ else
+ tile_flags = 0xfe;
+ break;
+ case 64:
+ if (compressed) {
+ switch (ms) {
+ case 0: tile_flags = 0xe6; break;
+ case 1: tile_flags = 0xeb; break;
+ case 2: tile_flags = 0xed; break;
+ case 3: tile_flags = 0xf2; break;
+ default:
+ return 0;
+ }
+ } else {
+ tile_flags = 0xfe;
+ }
+ break;
+ case 32:
+ if (compressed && ms) {
+ switch (ms) {
+ /* This one makes things blurry:
+ case 0: tile_flags = 0xdb; break;
+ */
+ case 1: tile_flags = 0xdd; break;
+ case 2: tile_flags = 0xdf; break;
+ case 3: tile_flags = 0xe4; break;
+ default:
+ return 0;
+ }
+ } else {
+ tile_flags = 0xfe;
+ }
+ break;
+ case 16:
+ case 8:
+ tile_flags = 0xfe;
+ break;
+ default:
+ return 0;
+ }
+ break;
+ }
+
+ return tile_flags;
+}
+
+static INLINE boolean
+nvc0_miptree_init_ms_mode(struct nv50_miptree *mt)
+{
+ switch (mt->base.base.nr_samples) {
+ case 8:
+ mt->ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS8;
+ mt->ms_x = 2;
+ mt->ms_y = 1;
+ break;
+ case 4:
+ mt->ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS4;
+ mt->ms_x = 1;
+ mt->ms_y = 1;
+ break;
+ case 2:
+ mt->ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS2;
+ mt->ms_x = 1;
+ break;
+ case 1:
+ case 0:
+ mt->ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
+ break;
+ default:
+ NOUVEAU_ERR("invalid nr_samples: %u\n", mt->base.base.nr_samples);
+ return FALSE;
+ }
+ return TRUE;
+}
+
+static void
+nvc0_miptree_init_layout_video(struct nv50_miptree *mt)
+{
+ const struct pipe_resource *pt = &mt->base.base;
+ const unsigned blocksize = util_format_get_blocksize(pt->format);
+
+ assert(pt->last_level == 0);
+ assert(mt->ms_x == 0 && mt->ms_y == 0);
+ assert(!util_format_is_compressed(pt->format));
+
+ mt->layout_3d = pt->target == PIPE_TEXTURE_3D;
+
+ mt->level[0].tile_mode = 0x10;
+ mt->level[0].pitch = align(pt->width0 * blocksize, 64);
+ mt->total_size = align(pt->height0, 16) * mt->level[0].pitch * (mt->layout_3d ? pt->depth0 : 1);
+
+ if (pt->array_size > 1) {
+ mt->layer_stride = align(mt->total_size, NVC0_TILE_SIZE(0x10));
+ mt->total_size = mt->layer_stride * pt->array_size;
+ }
+}
+
+static void
+nvc0_miptree_init_layout_tiled(struct nv50_miptree *mt)
+{
+ struct pipe_resource *pt = &mt->base.base;
+ unsigned w, h, d, l;
+ const unsigned blocksize = util_format_get_blocksize(pt->format);
+
+ mt->layout_3d = pt->target == PIPE_TEXTURE_3D;
+
+ w = pt->width0 << mt->ms_x;
+ h = pt->height0 << mt->ms_y;
+
+ /* For 3D textures, a mipmap is spanned by all the layers, for array
+ * textures and cube maps, each layer contains its own mipmaps.
+ */
+ d = mt->layout_3d ? pt->depth0 : 1;
+
+ assert(!mt->ms_mode || !pt->last_level);
+
+ for (l = 0; l <= pt->last_level; ++l) {
+ struct nv50_miptree_level *lvl = &mt->level[l];
+ unsigned tsx, tsy, tsz;
+ unsigned nbx = util_format_get_nblocksx(pt->format, w);
+ unsigned nby = util_format_get_nblocksy(pt->format, h);
+
+ lvl->offset = mt->total_size;
+
+ lvl->tile_mode = nvc0_tex_choose_tile_dims(nbx, nby, d);
+
+ tsx = NVC0_TILE_SIZE_X(lvl->tile_mode); /* x is tile row pitch in bytes */
+ tsy = NVC0_TILE_SIZE_Y(lvl->tile_mode);
+ tsz = NVC0_TILE_SIZE_Z(lvl->tile_mode);
+
+ lvl->pitch = align(nbx * blocksize, tsx);
+
+ mt->total_size += lvl->pitch * align(nby, tsy) * align(d, tsz);
+
+ w = u_minify(w, 1);
+ h = u_minify(h, 1);
+ d = u_minify(d, 1);
+ }
+
+ if (pt->array_size > 1) {
+ mt->layer_stride = align(mt->total_size,
+ NVC0_TILE_SIZE(mt->level[0].tile_mode));
+ mt->total_size = mt->layer_stride * pt->array_size;
+ }
+}
+
+const struct u_resource_vtbl nvc0_miptree_vtbl =
+{
+ nv50_miptree_get_handle, /* get_handle */
+ nv50_miptree_destroy, /* resource_destroy */
+ nvc0_miptree_transfer_map, /* transfer_map */
+ u_default_transfer_flush_region, /* transfer_flush_region */
+ nvc0_miptree_transfer_unmap, /* transfer_unmap */
+ u_default_transfer_inline_write /* transfer_inline_write */
+};
+
+struct pipe_resource *
+nvc0_miptree_create(struct pipe_screen *pscreen,
+ const struct pipe_resource *templ)
+{
+ struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+ struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
+ struct pipe_resource *pt = &mt->base.base;
+ boolean compressed = dev->drm_version >= 0x01000101;
+ int ret;
+ union nouveau_bo_config bo_config;
+ uint32_t bo_flags;
+
+ if (!mt)
+ return NULL;
+
+ mt->base.vtbl = &nvc0_miptree_vtbl;
+ *pt = *templ;
+ pipe_reference_init(&pt->reference, 1);
+ pt->screen = pscreen;
+
+ if (pt->usage == PIPE_USAGE_STAGING) {
+ switch (pt->target) {
+ case PIPE_TEXTURE_1D:
+ case PIPE_TEXTURE_2D:
+ case PIPE_TEXTURE_RECT:
+ if (pt->last_level == 0 &&
+ !util_format_is_depth_or_stencil(pt->format) &&
+ pt->nr_samples <= 1)
+ pt->flags |= NOUVEAU_RESOURCE_FLAG_LINEAR;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (pt->bind & PIPE_BIND_LINEAR)
+ pt->flags |= NOUVEAU_RESOURCE_FLAG_LINEAR;
+
+ bo_config.nvc0.memtype = nvc0_mt_choose_storage_type(mt, compressed);
+
+ if (!nvc0_miptree_init_ms_mode(mt)) {
+ FREE(mt);
+ return NULL;
+ }
+
+ if (unlikely(pt->flags & NVC0_RESOURCE_FLAG_VIDEO)) {
+ nvc0_miptree_init_layout_video(mt);
+ } else
+ if (likely(bo_config.nvc0.memtype)) {
+ nvc0_miptree_init_layout_tiled(mt);
+ } else
+ if (!nv50_miptree_init_layout_linear(mt, 128)) {
+ FREE(mt);
+ return NULL;
+ }
+ bo_config.nvc0.tile_mode = mt->level[0].tile_mode;
+
+ if (!bo_config.nvc0.memtype && pt->usage == PIPE_USAGE_STAGING)
+ mt->base.domain = NOUVEAU_BO_GART;
+ else
+ mt->base.domain = NOUVEAU_BO_VRAM;
+
+ bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP;
+
+ if (mt->base.base.bind & (PIPE_BIND_CURSOR | PIPE_BIND_DISPLAY_TARGET))
+ bo_flags |= NOUVEAU_BO_CONTIG;
+
+ ret = nouveau_bo_new(dev, bo_flags, 4096, mt->total_size, &bo_config,
+ &mt->base.bo);
+ if (ret) {
+ FREE(mt);
+ return NULL;
+ }
+ mt->base.address = mt->base.bo->offset;
+
+ NOUVEAU_DRV_STAT(nouveau_screen(pscreen), tex_obj_current_count, 1);
+ NOUVEAU_DRV_STAT(nouveau_screen(pscreen), tex_obj_current_bytes,
+ mt->total_size);
+
+ return pt;
+}
+
+/* Offset of zslice @z from start of level @l. */
+INLINE unsigned
+nvc0_mt_zslice_offset(const struct nv50_miptree *mt, unsigned l, unsigned z)
+{
+ const struct pipe_resource *pt = &mt->base.base;
+
+ unsigned tds = NVC0_TILE_SHIFT_Z(mt->level[l].tile_mode);
+ unsigned ths = NVC0_TILE_SHIFT_Y(mt->level[l].tile_mode);
+
+ unsigned nby = util_format_get_nblocksy(pt->format,
+ u_minify(pt->height0, l));
+
+ /* to next 2D tile slice within a 3D tile */
+ unsigned stride_2d = NVC0_TILE_SIZE_2D(mt->level[l].tile_mode);
+
+ /* to slice in the next (in z direction) 3D tile */
+ unsigned stride_3d = (align(nby, (1 << ths)) * mt->level[l].pitch) << tds;
+
+ return (z & (1 << (tds - 1))) * stride_2d + (z >> tds) * stride_3d;
+}
+
+/* Surface functions.
+ */
+
+struct pipe_surface *
+nvc0_miptree_surface_new(struct pipe_context *pipe,
+ struct pipe_resource *pt,
+ const struct pipe_surface *templ)
+{
+ struct nv50_surface *ns = nv50_surface_from_miptree(nv50_miptree(pt), templ);
+ if (!ns)
+ return NULL;
+ ns->base.context = pipe;
+ return &ns->base;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
new file mode 100644
index 00000000000..71deb3485d5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+
+#include "nvc0/nvc0_context.h"
+
+#include "codegen/nv50_ir_driver.h"
+#include "nvc0/nve4_compute.h"
+
+/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
+ * 124 scalar varying values.
+ */
+static uint32_t
+nvc0_shader_input_address(unsigned sn, unsigned si, unsigned ubase)
+{
+ switch (sn) {
+ case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4;
+ case TGSI_SEMANTIC_PRIMID: return 0x060;
+ case TGSI_SEMANTIC_PSIZE: return 0x06c;
+ case TGSI_SEMANTIC_POSITION: return 0x070;
+ case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10;
+ case TGSI_SEMANTIC_FOG: return 0x2e8;
+ case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10;
+ case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10;
+ case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4;
+ case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10;
+ case TGSI_SEMANTIC_CLIPVERTEX: return 0x270;
+ case TGSI_SEMANTIC_PCOORD: return 0x2e0;
+ case NV50_SEMANTIC_TESSCOORD: return 0x2f0;
+ case TGSI_SEMANTIC_INSTANCEID: return 0x2f8;
+ case TGSI_SEMANTIC_VERTEXID: return 0x2fc;
+ case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10;
+ case TGSI_SEMANTIC_FACE: return 0x3fc;
+ case NV50_SEMANTIC_INVOCATIONID: return ~0;
+ default:
+ assert(!"invalid TGSI input semantic");
+ return ~0;
+ }
+}
+
+static uint32_t
+nvc0_shader_output_address(unsigned sn, unsigned si, unsigned ubase)
+{
+ switch (sn) {
+ case NV50_SEMANTIC_TESSFACTOR: return 0x000 + si * 0x4;
+ case TGSI_SEMANTIC_PRIMID: return 0x060;
+ case NV50_SEMANTIC_LAYER: return 0x064;
+ case NV50_SEMANTIC_VIEWPORTINDEX: return 0x068;
+ case TGSI_SEMANTIC_PSIZE: return 0x06c;
+ case TGSI_SEMANTIC_POSITION: return 0x070;
+ case TGSI_SEMANTIC_GENERIC: return ubase + si * 0x10;
+ case TGSI_SEMANTIC_FOG: return 0x2e8;
+ case TGSI_SEMANTIC_COLOR: return 0x280 + si * 0x10;
+ case TGSI_SEMANTIC_BCOLOR: return 0x2a0 + si * 0x10;
+ case NV50_SEMANTIC_CLIPDISTANCE: return 0x2c0 + si * 0x4;
+ case TGSI_SEMANTIC_CLIPDIST: return 0x2c0 + si * 0x10;
+ case TGSI_SEMANTIC_CLIPVERTEX: return 0x270;
+ case TGSI_SEMANTIC_TEXCOORD: return 0x300 + si * 0x10;
+ case TGSI_SEMANTIC_EDGEFLAG: return ~0;
+ default:
+ assert(!"invalid TGSI output semantic");
+ return ~0;
+ }
+}
+
+static int
+nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
+{
+ unsigned i, c, n;
+
+ for (n = 0, i = 0; i < info->numInputs; ++i) {
+ switch (info->in[i].sn) {
+ case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
+ case TGSI_SEMANTIC_VERTEXID:
+ info->in[i].mask = 0x1;
+ info->in[i].slot[0] =
+ nvc0_shader_input_address(info->in[i].sn, 0, 0) / 4;
+ continue;
+ default:
+ break;
+ }
+ for (c = 0; c < 4; ++c)
+ info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
+ ++n;
+ }
+
+ return 0;
+}
+
+static int
+nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
+{
+ unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
+ unsigned offset;
+ unsigned i, c;
+
+ for (i = 0; i < info->numInputs; ++i) {
+ offset = nvc0_shader_input_address(info->in[i].sn,
+ info->in[i].si, ubase);
+ if (info->in[i].patch && offset >= 0x20)
+ offset = 0x20 + info->in[i].si * 0x10;
+
+ if (info->in[i].sn == NV50_SEMANTIC_TESSCOORD)
+ info->in[i].mask &= 3;
+
+ for (c = 0; c < 4; ++c)
+ info->in[i].slot[c] = (offset + c * 0x4) / 4;
+ }
+
+ return 0;
+}
+
+static int
+nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
+{
+ unsigned count = info->prop.fp.numColourResults * 4;
+ unsigned i, c;
+
+ for (i = 0; i < info->numOutputs; ++i)
+ if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+ for (c = 0; c < 4; ++c)
+ info->out[i].slot[c] = info->out[i].si * 4 + c;
+
+ if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
+ info->out[info->io.sampleMask].slot[0] = count++;
+ else
+ if (info->target >= 0xe0)
+ count++; /* on Kepler, depth is always last colour reg + 2 */
+
+ if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
+ info->out[info->io.fragDepth].slot[2] = count;
+
+ return 0;
+}
+
+static int
+nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
+{
+ unsigned ubase = MAX2(0x80, 0x20 + info->numPatchConstants * 0x10);
+ unsigned offset;
+ unsigned i, c;
+
+ for (i = 0; i < info->numOutputs; ++i) {
+ offset = nvc0_shader_output_address(info->out[i].sn,
+ info->out[i].si, ubase);
+ if (info->out[i].patch && offset >= 0x20)
+ offset = 0x20 + info->out[i].si * 0x10;
+
+ for (c = 0; c < 4; ++c)
+ info->out[i].slot[c] = (offset + c * 0x4) / 4;
+ }
+
+ return 0;
+}
+
+static int
+nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
+{
+ int ret;
+
+ if (info->type == PIPE_SHADER_VERTEX)
+ ret = nvc0_vp_assign_input_slots(info);
+ else
+ ret = nvc0_sp_assign_input_slots(info);
+ if (ret)
+ return ret;
+
+ if (info->type == PIPE_SHADER_FRAGMENT)
+ ret = nvc0_fp_assign_output_slots(info);
+ else
+ ret = nvc0_sp_assign_output_slots(info);
+ return ret;
+}
+
+static INLINE void
+nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
+{
+ uint8_t min = (vp->hdr[4] >> 12) & 0xff;
+ uint8_t max = (vp->hdr[4] >> 24);
+
+ min = MIN2(min, slot);
+ max = MAX2(max, slot);
+
+ vp->hdr[4] = (max << 24) | (min << 12);
+}
+
+/* Common part of header generation for VP, TCP, TEP and GP. */
+static int
+nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
+{
+ unsigned i, c, a;
+
+ for (i = 0; i < info->numInputs; ++i) {
+ if (info->in[i].patch)
+ continue;
+ for (c = 0; c < 4; ++c) {
+ a = info->in[i].slot[c];
+ if (info->in[i].mask & (1 << c)) {
+ if (info->in[i].sn != NV50_SEMANTIC_TESSCOORD)
+ vp->hdr[5 + a / 32] |= 1 << (a % 32);
+ else
+ nvc0_vtgp_hdr_update_oread(vp, info->in[i].slot[c]);
+ }
+ }
+ }
+
+ for (i = 0; i < info->numOutputs; ++i) {
+ if (info->out[i].patch)
+ continue;
+ for (c = 0; c < 4; ++c) {
+ if (!(info->out[i].mask & (1 << c)))
+ continue;
+ assert(info->out[i].slot[c] >= 0x40 / 4);
+ a = info->out[i].slot[c] - 0x40 / 4;
+ vp->hdr[13 + a / 32] |= 1 << (a % 32);
+ if (info->out[i].oread)
+ nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);
+ }
+ }
+
+ for (i = 0; i < info->numSysVals; ++i) {
+ switch (info->sv[i].sn) {
+ case TGSI_SEMANTIC_PRIMID:
+ vp->hdr[5] |= 1 << 24;
+ break;
+ case TGSI_SEMANTIC_INSTANCEID:
+ vp->hdr[10] |= 1 << 30;
+ break;
+ case TGSI_SEMANTIC_VERTEXID:
+ vp->hdr[10] |= 1 << 31;
+ break;
+ default:
+ break;
+ }
+ }
+
+ vp->vp.clip_enable = info->io.clipDistanceMask;
+ for (i = 0; i < 8; ++i)
+ if (info->io.cullDistanceMask & (1 << i))
+ vp->vp.clip_mode |= 1 << (i * 4);
+
+ if (info->io.genUserClip < 0)
+ vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */
+
+ return 0;
+}
+
+static int
+nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
+{
+ vp->hdr[0] = 0x20061 | (1 << 10);
+ vp->hdr[4] = 0xff000;
+
+ vp->hdr[18] = info->io.clipDistanceMask;
+
+ return nvc0_vtgp_gen_header(vp, info);
+}
+
+#if defined(PIPE_SHADER_HULL) || defined(PIPE_SHADER_DOMAIN)
+static void
+nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
+{
+ if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {
+ tp->tp.tess_mode = ~0;
+ return;
+ }
+ switch (info->prop.tp.domain) {
+ case PIPE_PRIM_LINES:
+ tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
+ if (info->prop.tp.winding > 0)
+ tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
+ break;
+ case PIPE_PRIM_QUADS:
+ tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
+ break;
+ default:
+ tp->tp.tess_mode = ~0;
+ return;
+ }
+ if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS)
+ tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
+
+ switch (info->prop.tp.partitioning) {
+ case PIPE_TESS_PART_INTEGER:
+ case PIPE_TESS_PART_POW2:
+ tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
+ break;
+ case PIPE_TESS_PART_FRACT_ODD:
+ tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
+ break;
+ case PIPE_TESS_PART_FRACT_EVEN:
+ tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
+ break;
+ default:
+ assert(!"invalid tessellator partitioning");
+ break;
+ }
+}
+#endif
+
+#ifdef PIPE_SHADER_HULL
+static int
+nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
+{
+ unsigned opcs = 6; /* output patch constants (at least the TessFactors) */
+
+ tcp->tp.input_patch_size = info->prop.tp.inputPatchSize;
+
+ if (info->numPatchConstants)
+ opcs = 8 + info->numPatchConstants * 4;
+
+ tcp->hdr[0] = 0x20061 | (2 << 10);
+
+ tcp->hdr[1] = opcs << 24;
+ tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;
+
+ tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */
+
+ nvc0_vtgp_gen_header(tcp, info);
+
+ nvc0_tp_get_tess_mode(tcp, info);
+
+ return 0;
+}
+#endif
+
+#ifdef PIPE_SHADER_DOMAIN
+static int
+nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
+{
+ tep->tp.input_patch_size = ~0;
+
+ tep->hdr[0] = 0x20061 | (3 << 10);
+ tep->hdr[4] = 0xff000;
+
+ nvc0_vtgp_gen_header(tep, info);
+
+ nvc0_tp_get_tess_mode(tep, info);
+
+ tep->hdr[18] |= 0x3 << 12; /* ? */
+
+ return 0;
+}
+#endif
+
+static int
+nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
+{
+ gp->hdr[0] = 0x20061 | (4 << 10);
+
+ gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;
+
+ switch (info->prop.gp.outputPrim) {
+ case PIPE_PRIM_POINTS:
+ gp->hdr[3] = 0x01000000;
+ gp->hdr[0] |= 0xf0000000;
+ break;
+ case PIPE_PRIM_LINE_STRIP:
+ gp->hdr[3] = 0x06000000;
+ gp->hdr[0] |= 0x10000000;
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ gp->hdr[3] = 0x07000000;
+ gp->hdr[0] |= 0x10000000;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ gp->hdr[4] = info->prop.gp.maxVertices & 0x1ff;
+
+ return nvc0_vtgp_gen_header(gp, info);
+}
+
+#define NVC0_INTERP_FLAT (1 << 0)
+#define NVC0_INTERP_PERSPECTIVE (2 << 0)
+#define NVC0_INTERP_LINEAR (3 << 0)
+#define NVC0_INTERP_CENTROID (1 << 2)
+
+static uint8_t
+nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)
+{
+ if (var->linear)
+ return NVC0_INTERP_LINEAR;
+ if (var->flat)
+ return NVC0_INTERP_FLAT;
+ return NVC0_INTERP_PERSPECTIVE;
+}
+
+static int
+nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
+{
+ unsigned i, c, a, m;
+
+ /* just 00062 on Kepler */
+ fp->hdr[0] = 0x20062 | (5 << 10);
+ fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
+
+ if (info->prop.fp.usesDiscard)
+ fp->hdr[0] |= 0x8000;
+ if (info->prop.fp.numColourResults > 1)
+ fp->hdr[0] |= 0x4000;
+ if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
+ fp->hdr[19] |= 0x1;
+ if (info->prop.fp.writesDepth) {
+ fp->hdr[19] |= 0x2;
+ fp->flags[0] = 0x11; /* deactivate ZCULL */
+ }
+
+ for (i = 0; i < info->numInputs; ++i) {
+ m = nvc0_hdr_interp_mode(&info->in[i]);
+ for (c = 0; c < 4; ++c) {
+ if (!(info->in[i].mask & (1 << c)))
+ continue;
+ a = info->in[i].slot[c];
+ if (info->in[i].slot[0] >= (0x060 / 4) &&
+ info->in[i].slot[0] <= (0x07c / 4)) {
+ fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
+ } else
+ if (info->in[i].slot[0] >= (0x2c0 / 4) &&
+ info->in[i].slot[0] <= (0x2fc / 4)) {
+ fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
+ } else {
+ if (info->in[i].slot[c] < (0x040 / 4) ||
+ info->in[i].slot[c] > (0x380 / 4))
+ continue;
+ a *= 2;
+ if (info->in[i].slot[0] >= (0x300 / 4))
+ a -= 32;
+ fp->hdr[4 + a / 32] |= m << (a % 32);
+ }
+ }
+ }
+
+ for (i = 0; i < info->numOutputs; ++i) {
+ if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+ fp->hdr[18] |= info->out[i].mask << info->out[i].slot[0];
+ }
+
+ fp->fp.early_z = info->prop.fp.earlyFragTests;
+
+ return 0;
+}
+
+static struct nvc0_transform_feedback_state *
+nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
+ const struct pipe_stream_output_info *pso)
+{
+ struct nvc0_transform_feedback_state *tfb;
+ unsigned b, i, c;
+
+ tfb = MALLOC_STRUCT(nvc0_transform_feedback_state);
+ if (!tfb)
+ return NULL;
+ for (b = 0; b < 4; ++b) {
+ tfb->stride[b] = pso->stride[b] * 4;
+ tfb->varying_count[b] = 0;
+ }
+ memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */
+
+ for (i = 0; i < pso->num_outputs; ++i) {
+ unsigned s = pso->output[i].start_component;
+ unsigned p = pso->output[i].dst_offset;
+ b = pso->output[i].output_buffer;
+
+ for (c = 0; c < pso->output[i].num_components; ++c)
+ tfb->varying_index[b][p++] =
+ info->out[pso->output[i].register_index].slot[s + c];
+
+ tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);
+ }
+ for (b = 0; b < 4; ++b) // zero unused indices (looks nicer)
+ for (c = tfb->varying_count[b]; c & 3; ++c)
+ tfb->varying_index[b][c] = 0;
+
+ return tfb;
+}
+
+#ifdef DEBUG
+static void
+nvc0_program_dump(struct nvc0_program *prog)
+{
+ unsigned pos;
+
+ if (prog->type != PIPE_SHADER_COMPUTE) {
+ for (pos = 0; pos < sizeof(prog->hdr) / sizeof(prog->hdr[0]); ++pos)
+ debug_printf("HDR[%02lx] = 0x%08x\n",
+ pos * sizeof(prog->hdr[0]), prog->hdr[pos]);
+ }
+ debug_printf("shader binary code (0x%x bytes):", prog->code_size);
+ for (pos = 0; pos < prog->code_size / 4; ++pos) {
+ if ((pos % 8) == 0)
+ debug_printf("\n");
+ debug_printf("%08x ", prog->code[pos]);
+ }
+ debug_printf("\n");
+}
+#endif
+
+boolean
+nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
+{
+ struct nv50_ir_prog_info *info;
+ int ret;
+
+ info = CALLOC_STRUCT(nv50_ir_prog_info);
+ if (!info)
+ return FALSE;
+
+ info->type = prog->type;
+ info->target = chipset;
+ info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
+ info->bin.source = (void *)prog->pipe.tokens;
+
+ info->io.genUserClip = prog->vp.num_ucps;
+ info->io.ucpBase = 256;
+ info->io.ucpCBSlot = 15;
+
+ if (prog->type == PIPE_SHADER_COMPUTE) {
+ if (chipset >= NVISA_GK104_CHIPSET) {
+ info->io.resInfoCBSlot = 0;
+ info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
+ info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
+ info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
+ }
+ info->io.msInfoCBSlot = 0;
+ info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
+ } else {
+ if (chipset >= NVISA_GK104_CHIPSET) {
+ info->io.resInfoCBSlot = 15;
+ info->io.texBindBase = 0x20;
+ info->io.suInfoBase = 0; /* TODO */
+ }
+ info->io.msInfoCBSlot = 15;
+ info->io.msInfoBase = 0; /* TODO */
+ }
+
+ info->assignSlots = nvc0_program_assign_varying_slots;
+
+#ifdef DEBUG
+ info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
+ info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
+#else
+ info->optLevel = 3;
+#endif
+
+ ret = nv50_ir_generate_code(info);
+ if (ret) {
+ NOUVEAU_ERR("shader translation failed: %i\n", ret);
+ goto out;
+ }
+ if (prog->type != PIPE_SHADER_COMPUTE)
+ FREE(info->bin.syms);
+
+ prog->code = info->bin.code;
+ prog->code_size = info->bin.codeSize;
+ prog->immd_data = info->immd.buf;
+ prog->immd_size = info->immd.bufSize;
+ prog->relocs = info->bin.relocData;
+ prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
+ prog->num_barriers = info->numBarriers;
+
+ prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+
+ if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS)
+ info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */
+ prog->vp.edgeflag = info->io.edgeFlagIn;
+
+ switch (prog->type) {
+ case PIPE_SHADER_VERTEX:
+ ret = nvc0_vp_gen_header(prog, info);
+ break;
+#ifdef PIPE_SHADER_HULL
+ case PIPE_SHADER_HULL:
+ ret = nvc0_tcp_gen_header(prog, info);
+ break;
+#endif
+#ifdef PIPE_SHADER_DOMAIN
+ case PIPE_SHADER_DOMAIN:
+ ret = nvc0_tep_gen_header(prog, info);
+ break;
+#endif
+ case PIPE_SHADER_GEOMETRY:
+ ret = nvc0_gp_gen_header(prog, info);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ ret = nvc0_fp_gen_header(prog, info);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ prog->cp.syms = info->bin.syms;
+ prog->cp.num_syms = info->bin.numSyms;
+ break;
+ default:
+ ret = -1;
+ NOUVEAU_ERR("unknown program type: %u\n", prog->type);
+ break;
+ }
+ if (ret)
+ goto out;
+
+ if (info->bin.tlsSpace) {
+ assert(info->bin.tlsSpace < (1 << 24));
+ prog->hdr[0] |= 1 << 26;
+ prog->hdr[1] |= info->bin.tlsSpace; /* l[] size */
+ prog->need_tls = TRUE;
+ }
+ /* TODO: factor 2 only needed where joinat/precont is used,
+ * and we only have to count non-uniform branches
+ */
+ /*
+ if ((info->maxCFDepth * 2) > 16) {
+ prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
+ prog->need_tls = TRUE;
+ }
+ */
+ if (info->io.globalAccess)
+ prog->hdr[0] |= 1 << 16;
+
+ if (prog->pipe.stream_output.num_outputs)
+ prog->tfb = nvc0_program_create_tfb_state(info,
+ &prog->pipe.stream_output);
+
+out:
+ FREE(info);
+ return !ret;
+}
+
+boolean
+nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ const boolean is_cp = prog->type == PIPE_SHADER_COMPUTE;
+ int ret;
+ uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);
+ uint32_t lib_pos = screen->lib_code->start;
+ uint32_t code_pos;
+
+ /* c[] bindings need to be aligned to 0x100, but we could use relocations
+ * to save space. */
+ if (prog->immd_size) {
+ prog->immd_base = size;
+ size = align(size, 0x40);
+ size += prog->immd_size + 0xc0; /* add 0xc0 for align 0x40 -> 0x100 */
+ }
+ /* On Fermi, SP_START_ID must be aligned to 0x40.
+ * On Kepler, the first instruction must be aligned to 0x80 because
+ * latency information is expected only at certain positions.
+ */
+ if (screen->base.class_3d >= NVE4_3D_CLASS)
+ size = size + (is_cp ? 0x40 : 0x70);
+ size = align(size, 0x40);
+
+ ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
+ if (ret) {
+ struct nouveau_heap *heap = screen->text_heap;
+ struct nouveau_heap *iter;
+ for (iter = heap; iter && iter->next != heap; iter = iter->next) {
+ struct nvc0_program *evict = iter->priv;
+ if (evict)
+ nouveau_heap_free(&evict->mem);
+ }
+ debug_printf("WARNING: out of code space, evicting all shaders.\n");
+ ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
+ if (ret) {
+ NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
+ return FALSE;
+ }
+ IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);
+ }
+ prog->code_base = prog->mem->start;
+ prog->immd_base = align(prog->mem->start + prog->immd_base, 0x100);
+ assert((prog->immd_size == 0) || (prog->immd_base + prog->immd_size <=
+ prog->mem->start + prog->mem->size));
+
+ if (!is_cp) {
+ if (screen->base.class_3d >= NVE4_3D_CLASS) {
+ switch (prog->mem->start & 0xff) {
+ case 0x40: prog->code_base += 0x70; break;
+ case 0x80: prog->code_base += 0x30; break;
+ case 0xc0: prog->code_base += 0x70; break;
+ default:
+ prog->code_base += 0x30;
+ assert((prog->mem->start & 0xff) == 0x00);
+ break;
+ }
+ }
+ code_pos = prog->code_base + NVC0_SHADER_HEADER_SIZE;
+ } else {
+ if (screen->base.class_3d >= NVE4_3D_CLASS) {
+ if (prog->mem->start & 0x40)
+ prog->code_base += 0x40;
+ assert((prog->code_base & 0x7f) == 0x00);
+ }
+ code_pos = prog->code_base;
+ }
+
+ if (prog->relocs)
+ nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
+
+#ifdef DEBUG
+ if (debug_get_bool_option("NV50_PROG_DEBUG", FALSE))
+ nvc0_program_dump(prog);
+#endif
+
+ if (!is_cp)
+ nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
+ NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr);
+ nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
+ NOUVEAU_BO_VRAM, prog->code_size, prog->code);
+ if (prog->immd_size)
+ nvc0->base.push_data(&nvc0->base,
+ screen->text, prog->immd_base, NOUVEAU_BO_VRAM,
+ prog->immd_size, prog->immd_data);
+
+ BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
+ PUSH_DATA (nvc0->base.pushbuf, 0x1011);
+
+ return TRUE;
+}
+
+/* Upload code for builtin functions like integer division emulation. */
+void
+nvc0_program_library_upload(struct nvc0_context *nvc0)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ int ret;
+ uint32_t size;
+ const uint32_t *code;
+
+ if (screen->lib_code)
+ return;
+
+ nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);
+ if (!size)
+ return;
+
+ ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,
+ &screen->lib_code);
+ if (ret)
+ return;
+
+ nvc0->base.push_data(&nvc0->base,
+ screen->text, screen->lib_code->start, NOUVEAU_BO_VRAM,
+ size, code);
+ /* no need for a memory barrier, will be emitted with first program */
+}
+
+void
+nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
+{
+ const struct pipe_shader_state pipe = prog->pipe;
+ const ubyte type = prog->type;
+
+ if (prog->mem)
+ nouveau_heap_free(&prog->mem);
+ if (prog->code)
+ FREE(prog->code); /* may be 0 for hardcoded shaders */
+ FREE(prog->immd_data);
+ FREE(prog->relocs);
+ if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
+ FREE(prog->cp.syms);
+ if (prog->tfb) {
+ if (nvc0->state.tfb == prog->tfb)
+ nvc0->state.tfb = NULL;
+ FREE(prog->tfb);
+ }
+
+ memset(prog, 0, sizeof(*prog));
+
+ prog->pipe = pipe;
+ prog->type = type;
+}
+
+uint32_t
+nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
+{
+ const struct nv50_ir_prog_symbol *syms =
+ (const struct nv50_ir_prog_symbol *)prog->cp.syms;
+ unsigned base = 0;
+ unsigned i;
+ if (prog->type != PIPE_SHADER_COMPUTE)
+ base = NVC0_SHADER_HEADER_SIZE;
+ for (i = 0; i < prog->cp.num_syms; ++i)
+ if (syms[i].label == label)
+ return prog->code_base + base + syms[i].offset;
+ return prog->code_base; /* no symbols or symbol not found */
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
new file mode 100644
index 00000000000..9c184d1f1d5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -0,0 +1,68 @@
+
+#ifndef __NVC0_PROGRAM_H__
+#define __NVC0_PROGRAM_H__
+
+#include "pipe/p_state.h"
+
+#define NVC0_CAP_MAX_PROGRAM_TEMPS 128
+
+
/* Per-program transform feedback configuration, one entry per TFB buffer:
 * output stride, number of captured varyings and their component indices.
 */
struct nvc0_transform_feedback_state {
   uint32_t stride[4];
   uint8_t varying_count[4];
   uint8_t varying_index[4][128];
};
+
+
+#define NVC0_SHADER_HEADER_SIZE (20 * 4)
+
/* A translated (or pending-translation) shader program for the nvc0
 * driver, including generated code, shader header, and per-stage data.
 */
struct nvc0_program {
   struct pipe_shader_state pipe;   /* the state object we were created from */

   ubyte type;                      /* PIPE_SHADER_* stage */
   boolean translated;              /* TGSI -> hw code translation done ? */
   boolean need_tls;                /* requires thread-local storage */
   uint8_t num_gprs;

   uint32_t *code;
   uint32_t *immd_data;
   unsigned code_base;              /* offset in the screen's code segment */
   unsigned code_size;
   unsigned immd_base;
   unsigned immd_size; /* size of immediate array data */
   unsigned parm_size; /* size of non-bindable uniforms (c0[]) */

   uint32_t hdr[20];                /* hw shader header */
   uint32_t flags[2];

   struct {
      uint32_t clip_mode; /* clip/cull selection */
      uint8_t clip_enable; /* mask of defined clip planes */
      uint8_t num_ucps; /* also set to max if ClipDistance is used */
      uint8_t edgeflag; /* attribute index of edgeflag input */
      boolean need_vertex_id;
   } vp;
   struct {
      uint8_t early_z;
      uint8_t in_pos[PIPE_MAX_SHADER_INPUTS];
   } fp;
   struct {
      uint32_t tess_mode; /* ~0 if defined by the other stage */
      uint32_t input_patch_size;
   } tp;
   struct {
      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
      void *syms;         /* array of struct nv50_ir_prog_symbol */
      unsigned num_syms;
   } cp;
   uint8_t num_barriers;

   void *relocs;                    /* code relocations, applied on upload */

   struct nvc0_transform_feedback_state *tfb;

   struct nouveau_heap *mem;        /* allocation in the code segment */
};
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_push.c b/src/gallium/drivers/nouveau/nvc0/nvc0_push.c
new file mode 100644
index 00000000000..15e8be6968d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_push.c
@@ -0,0 +1,409 @@
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+
+#include "nvc0/nvc0_3d.xml.h"
+
/* State for emitting vertex data inline through the push buffer
 * (used when VBOs reside in client memory or need translation).
 */
struct push_context {
   struct nouveau_pushbuf *push;

   void *idxbuf;                    /* mapped index buffer, or NULL */

   uint32_t vertex_words;           /* size of one vertex in 32-bit words */
   uint32_t packet_vertex_limit;    /* max vertices per VERTEX_DATA packet */

   struct translate *translate;     /* vertex format conversion */

   boolean primitive_restart;
   boolean need_vertex_id;          /* must push gl_VertexID manually */
   uint32_t prim;                   /* hw VERTEX_BEGIN_GL value */
   uint32_t restart_index;
   uint32_t instance_id;

   /* source of the edgeflag attribute, if the vertex shader uses one */
   struct {
      int buffer;                   /* vertex buffer index, -1 if unused */
      float value;                  /* last value set in hw */
      uint8_t *data;                /* mapped attribute data */
      unsigned offset;
      unsigned stride;
   } edgeflag;
};
+
/* Initialize @ctx from current nvc0 state.  Edgeflag or manual vertex id
 * handling forces a packet limit of 1 vertex since both are updated with
 * separate methods between vertices.
 */
static void
init_push_context(struct nvc0_context *nvc0, struct push_context *ctx)
{
   struct pipe_vertex_element *ve;

   ctx->push = nvc0->base.pushbuf;
   ctx->translate = nvc0->vertex->translate;

   /* with 32 elements there is no free attribute slot for the vertex id */
   if (likely(nvc0->vertex->num_elements < 32))
      ctx->need_vertex_id = nvc0->vertprog->vp.need_vertex_id;
   else
      ctx->need_vertex_id = FALSE;

   ctx->edgeflag.buffer = -1;
   ctx->edgeflag.value = 0.5f;

   if (unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
      ve = &nvc0->vertex->element[nvc0->vertprog->vp.edgeflag].pipe;
      ctx->edgeflag.buffer = ve->vertex_buffer_index;
      ctx->edgeflag.offset = ve->src_offset;
      ctx->packet_vertex_limit = 1;
   } else {
      ctx->packet_vertex_limit = nvc0->vertex->vtx_per_packet_max;
      if (unlikely(ctx->need_vertex_id))
         ctx->packet_vertex_limit = 1;
   }

   ctx->vertex_words = nvc0->vertex->vtx_size;
}
+
/* Read the edgeflag attribute of vertex @vtx_id from the mapped vertex
 * buffer and update the hw EDGEFLAG state if it changed.
 */
static INLINE void
set_edgeflag(struct push_context *ctx, unsigned vtx_id)
{
   float f = *(float *)(ctx->edgeflag.data + vtx_id * ctx->edgeflag.stride);

   if (ctx->edgeflag.value != f) {
      ctx->edgeflag.value = f;
      IMMED_NVC0(ctx->push, NVC0_3D(EDGEFLAG), f ? 1 : 0);
   }
}
+
/* Push @vtx_id as the value of the extra vertex-id attribute.
 * Must be emitted as the last attribute of the vertex.
 */
static INLINE void
set_vertexid(struct push_context *ctx, uint32_t vtx_id)
{
#if 0
   BEGIN_NVC0(ctx->push, NVC0_3D(VERTEX_ID), 1); /* broken on nvc0 */
#else
   BEGIN_NVC0(ctx->push, NVC0_3D(VERTEX_DATA), 1); /* as last attribute */
#endif
   PUSH_DATA (ctx->push, vtx_id);
}
+
/* Return the position of the first occurrence of the primitive-restart
 * @index within elts[0..push-1], or @push if it does not occur.
 */
static inline unsigned
prim_restart_search_i08(uint8_t *elts, unsigned push, uint8_t index)
{
   unsigned pos = 0;
   while (pos < push && elts[pos] != index)
      ++pos;
   return pos;
}
+
/* Return the position of the first occurrence of the primitive-restart
 * @index within elts[0..push-1], or @push if it does not occur.
 */
static inline unsigned
prim_restart_search_i16(uint16_t *elts, unsigned push, uint16_t index)
{
   unsigned pos = 0;
   while (pos < push && elts[pos] != index)
      ++pos;
   return pos;
}
+
/* Return the position of the first occurrence of the primitive-restart
 * @index within elts[0..push-1], or @push if it does not occur.
 */
static inline unsigned
prim_restart_search_i32(uint32_t *elts, unsigned push, uint32_t index)
{
   unsigned pos = 0;
   while (pos < push && elts[pos] != index)
      ++pos;
   return pos;
}
+
/* Emit translated vertices for 8-bit indices [start, start + count), in
 * packets of at most packet_vertex_limit vertices.  With primitive restart
 * enabled, each packet is cut at the restart index and an END/BEGIN
 * (INSTANCE_CONT) pair is emitted to restart the primitive.
 */
static void
emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
{
   uint8_t *restrict elts = (uint8_t *)ctx->idxbuf + start;

   while (count) {
      unsigned push = MIN2(count, ctx->packet_vertex_limit);
      unsigned size, nr;

      nr = push;
      if (ctx->primitive_restart)
         nr = prim_restart_search_i08(elts, push, ctx->restart_index);

      /* packet limit is 1 here, so elts[0] is the packet's only vertex */
      if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
         set_edgeflag(ctx, elts[0]);

      size = ctx->vertex_words * nr;

      BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);

      ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
                                ctx->push->cur);
      ctx->push->cur += size;

      if (unlikely(ctx->need_vertex_id) && likely(size))
         set_vertexid(ctx, elts[0]);

      count -= nr;
      elts += nr;

      if (nr != push) {
         /* hit the restart index: skip it and restart the primitive */
         count--;
         elts++;
         BEGIN_NVC0(ctx->push, NVC0_3D(VERTEX_END_GL), 2);
         PUSH_DATA (ctx->push, 0);
         PUSH_DATA (ctx->push, NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT |
                    (ctx->prim & ~NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT));
      }
   }
}
+
+static void
+emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
+{
+ uint16_t *restrict elts = (uint16_t *)ctx->idxbuf + start;
+
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size, nr;
+
+ nr = push;
+ if (ctx->primitive_restart)
+ nr = prim_restart_search_i16(elts, push, ctx->restart_index);
+
+ if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
+ set_edgeflag(ctx, elts[0]);
+
+ size = ctx->vertex_words * nr;
+
+ BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
+
+ ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
+ ctx->push->cur);
+ ctx->push->cur += size;
+
+ if (unlikely(ctx->need_vertex_id))
+ set_vertexid(ctx, elts[0]);
+
+ count -= nr;
+ elts += nr;
+
+ if (nr != push) {
+ count--;
+ elts++;
+ BEGIN_NVC0(ctx->push, NVC0_3D(VERTEX_END_GL), 2);
+ PUSH_DATA (ctx->push, 0);
+ PUSH_DATA (ctx->push, NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT |
+ (ctx->prim & ~NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT));
+ }
+ }
+}
+
+static void
+emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
+{
+ uint32_t *restrict elts = (uint32_t *)ctx->idxbuf + start;
+
+ while (count) {
+ unsigned push = MIN2(count, ctx->packet_vertex_limit);
+ unsigned size, nr;
+
+ nr = push;
+ if (ctx->primitive_restart)
+ nr = prim_restart_search_i32(elts, push, ctx->restart_index);
+
+ if (unlikely(ctx->edgeflag.buffer >= 0) && likely(nr))
+ set_edgeflag(ctx, elts[0]);
+
+ size = ctx->vertex_words * nr;
+
+ BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);
+
+ ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
+ ctx->push->cur);
+ ctx->push->cur += size;
+
+ if (unlikely(ctx->need_vertex_id))
+ set_vertexid(ctx, elts[0]);
+
+ count -= nr;
+ elts += nr;
+
+ if (nr != push) {
+ count--;
+ elts++;
+ BEGIN_NVC0(ctx->push, NVC0_3D(VERTEX_END_GL), 2);
+ PUSH_DATA (ctx->push, 0);
+ PUSH_DATA (ctx->push, NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_CONT |
+ (ctx->prim & ~NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT));
+ }
+ }
+}
+
/* Emit translated vertices for the sequential (non-indexed) range
 * [start, start + count), in packets of at most packet_vertex_limit
 * vertices.  No primitive restart handling is needed here.
 */
static void
emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
{
   while (count) {
      unsigned push = MIN2(count, ctx->packet_vertex_limit);
      unsigned size = ctx->vertex_words * push;

      if (unlikely(ctx->edgeflag.buffer >= 0))
         set_edgeflag(ctx, start);

      BEGIN_NIC0(ctx->push, NVC0_3D(VERTEX_DATA), size);

      ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
                          ctx->push->cur);
      ctx->push->cur += size;

      if (unlikely(ctx->need_vertex_id))
         set_vertexid(ctx, start);

      count -= push;
      start += push;
   }
}
+
+
+#define NVC0_PRIM_GL_CASE(n) \
+ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
+
/* Translate a PIPE_PRIM_* primitive type to the hw VERTEX_BEGIN_GL value.
 * Unknown types fall back to POINTS (should not happen for validated
 * draws; PATCHES is not supported yet).
 */
static INLINE unsigned
nvc0_prim_gl(unsigned prim)
{
   switch (prim) {
   NVC0_PRIM_GL_CASE(POINTS);
   NVC0_PRIM_GL_CASE(LINES);
   NVC0_PRIM_GL_CASE(LINE_LOOP);
   NVC0_PRIM_GL_CASE(LINE_STRIP);
   NVC0_PRIM_GL_CASE(TRIANGLES);
   NVC0_PRIM_GL_CASE(TRIANGLE_STRIP);
   NVC0_PRIM_GL_CASE(TRIANGLE_FAN);
   NVC0_PRIM_GL_CASE(QUADS);
   NVC0_PRIM_GL_CASE(QUAD_STRIP);
   NVC0_PRIM_GL_CASE(POLYGON);
   NVC0_PRIM_GL_CASE(LINES_ADJACENCY);
   NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
   NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
   NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
   /*
   NVC0_PRIM_GL_CASE(PATCHES); */
   default:
      return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
      break;
   }
}
+
/* Draw by pushing vertex data inline through the command buffer instead of
 * using hw vertex fetch.  Maps all vertex (and index) buffers, runs the
 * translate module to convert vertices into the push buffer, and emits one
 * VERTEX_BEGIN/END pair per instance.
 */
void
nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
{
   struct push_context ctx;
   unsigned i, index_size;
   unsigned inst_count = info->instance_count;
   unsigned vert_count = info->count;
   /* index_bias is folded into the buffer pointers (per-vertex data only) */
   boolean apply_bias = info->indexed && info->index_bias;

   init_push_context(nvc0, &ctx);

   for (i = 0; i < nvc0->num_vtxbufs; ++i) {
      uint8_t *data;
      struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[i];
      struct nv04_resource *res = nv04_resource(vb->buffer);

      data = nouveau_resource_map_offset(&nvc0->base, res,
                                         vb->buffer_offset, NOUVEAU_BO_RD);

      if (apply_bias && likely(!(nvc0->vertex->instance_bufs & (1 << i))))
         data += info->index_bias * vb->stride;

      ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);

      if (unlikely(i == ctx.edgeflag.buffer)) {
         ctx.edgeflag.data = data + ctx.edgeflag.offset;
         ctx.edgeflag.stride = vb->stride;
      }
   }

   if (info->indexed) {
      ctx.idxbuf =
         nouveau_resource_map_offset(&nvc0->base,
                                     nv04_resource(nvc0->idxbuf.buffer),
                                     nvc0->idxbuf.offset, NOUVEAU_BO_RD);
      /* NOTE(review): this early return skips the vertex buffer unmaps at
       * the bottom — presumably harmless since the maps appear persistent,
       * but worth confirming.
       */
      if (!ctx.idxbuf)
         return;
      index_size = nvc0->idxbuf.index_size;
      ctx.primitive_restart = info->primitive_restart;
      ctx.restart_index = info->restart_index;
   } else {
      ctx.idxbuf = NULL;
      index_size = 0;
      ctx.primitive_restart = FALSE;
      ctx.restart_index = 0;

      /* DrawTransformFeedback: read the vertex count back from the
       * stream output counter query
       */
      if (info->count_from_stream_output) {
         struct pipe_context *pipe = &nvc0->base.pipe;
         struct nvc0_so_target *targ;
         targ = nvc0_so_target(info->count_from_stream_output);
         pipe->get_query_result(pipe, targ->pq, TRUE, (void*)&vert_count);
         vert_count /= targ->stride;
      }
   }

   ctx.instance_id = info->start_instance;
   ctx.prim = nvc0_prim_gl(info->mode);

   /* set up the extra attribute slot used to push gl_VertexID manually */
   if (unlikely(ctx.need_vertex_id)) {
      const unsigned a = nvc0->vertex->num_elements;
      BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
      PUSH_DATA (ctx.push, (a << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT) |
                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
      BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_ID_REPLACE), 1);
      PUSH_DATA (ctx.push, (((0x80 + a * 0x10) / 4) << 4) | 1);
   }

   while (inst_count--) {
      BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_BEGIN_GL), 1);
      PUSH_DATA (ctx.push, ctx.prim);
      switch (index_size) {
      case 0:
         emit_vertices_seq(&ctx, info->start, vert_count);
         break;
      case 1:
         emit_vertices_i08(&ctx, info->start, vert_count);
         break;
      case 2:
         emit_vertices_i16(&ctx, info->start, vert_count);
         break;
      case 4:
         emit_vertices_i32(&ctx, info->start, vert_count);
         break;
      default:
         assert(0);
         break;
      }
      IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_END_GL), 0);

      ctx.instance_id++;
      /* subsequent instances continue with incremented instance id */
      ctx.prim |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
   }

   /* restore default edgeflag state if we left it disabled */
   if (unlikely(ctx.edgeflag.value == 0.0f))
      IMMED_NVC0(ctx.push, NVC0_3D(EDGEFLAG), 1);

   /* undo the vertex id attribute setup */
   if (unlikely(ctx.need_vertex_id)) {
      const unsigned a = nvc0->vertex->num_elements;
      IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_ID_REPLACE), 0);
      BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
      PUSH_DATA (ctx.push,
                 NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST |
                 NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
                 NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
   }

   if (info->indexed)
      nouveau_resource_unmap(nv04_resource(nvc0->idxbuf.buffer));

   for (i = 0; i < nvc0->num_vtxbufs; ++i)
      nouveau_resource_unmap(nv04_resource(nvc0->vtxbuf[i].buffer));
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
new file mode 100644
index 00000000000..21aa3580e7c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -0,0 +1,1448 @@
+/*
+ * Copyright 2011 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Christoph Bumiller
+ */
+
+#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nvc0/nvc0_context.h"
+#include "nv_object.xml.h"
+#include "nvc0/nve4_compute.xml.h"
+#include "nvc0/nvc0_compute.xml.h"
+
+#define NVC0_QUERY_STATE_READY 0
+#define NVC0_QUERY_STATE_ACTIVE 1
+#define NVC0_QUERY_STATE_ENDED 2
+#define NVC0_QUERY_STATE_FLUSHED 3
+
/* Driver-private query object.  Query results are written by the GPU into
 * a GART buffer; occlusion queries rotate through slots inside it so that
 * nested/back-to-back queries do not overwrite each other.
 */
struct nvc0_query {
   uint32_t *data;          /* CPU mapping of the current result slot */
   uint16_t type;           /* PIPE_QUERY_* or driver-specific type */
   uint16_t index;          /* stream / buffer / counter index */
   int8_t ctr[4];
   uint32_t sequence;       /* written by GPU to signal completion */
   struct nouveau_bo *bo;
   uint32_t base;
   uint32_t offset; /* base + i * rotate */
   uint8_t state;           /* NVC0_QUERY_STATE_* */
   boolean is64bit;         /* uses fence, not sequence, for readiness */
   uint8_t rotate;          /* slot size for rotating queries, 0 if fixed */
   int nesting; /* only used for occlusion queries */
   union {
      struct nouveau_mm_allocation *mm;
      uint64_t value;       /* driver statistics snapshot */
   } u;
   struct nouveau_fence *fence;
};
+
+#define NVC0_QUERY_ALLOC_SPACE 256
+
+static void nvc0_mp_pm_query_begin(struct nvc0_context *, struct nvc0_query *);
+static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
+static boolean nvc0_mp_pm_query_result(struct nvc0_context *,
+ struct nvc0_query *, void *, boolean);
+
/* Downcast a gallium pipe_query to the driver's query struct. */
static INLINE struct nvc0_query *
nvc0_query(struct pipe_query *pipe)
{
   return (struct nvc0_query *)pipe;
}
+
/* (Re)allocate @size bytes of GART storage for @q, freeing any previous
 * allocation first (deferred via fence if the GPU may still write it).
 * Passing size == 0 just frees.  Returns FALSE on allocation/map failure.
 */
static boolean
nvc0_query_allocate(struct nvc0_context *nvc0, struct nvc0_query *q, int size)
{
   struct nvc0_screen *screen = nvc0->screen;
   int ret;

   if (q->bo) {
      nouveau_bo_ref(NULL, &q->bo);
      if (q->u.mm) {
         if (q->state == NVC0_QUERY_STATE_READY)
            nouveau_mm_free(q->u.mm);
         else
            /* defer the free until the GPU is done writing results */
            nouveau_fence_work(screen->base.fence.current,
                               nouveau_mm_free_work, q->u.mm);
      }
   }
   if (size) {
      q->u.mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
      if (!q->bo)
         return FALSE;
      q->offset = q->base;

      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
      if (ret) {
         nvc0_query_allocate(nvc0, q, 0);
         return FALSE;
      }
      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
   }
   return TRUE;
}
+
/* pipe_context::destroy_query: release storage, fence and the query. */
static void
nvc0_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
{
   nvc0_query_allocate(nvc0_context(pipe), nvc0_query(pq), 0);
   nouveau_fence_ref(NULL, &nvc0_query(pq)->fence);
   FREE(nvc0_query(pq));
}
+
/* pipe_context::create_query: allocate a query object and GART result
 * storage sized according to the query type.  Returns NULL for unknown
 * types or on allocation failure.
 */
static struct pipe_query *
nvc0_query_create(struct pipe_context *pipe, unsigned type)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q;
   unsigned space = NVC0_QUERY_ALLOC_SPACE;

   q = CALLOC_STRUCT(nvc0_query);
   if (!q)
      return NULL;

   switch (type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      /* rotate through 32-byte slots, see nvc0_query_begin */
      q->rotate = 32;
      space = NVC0_QUERY_ALLOC_SPACE;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      q->is64bit = TRUE;
      space = 512;
      break;
   case PIPE_QUERY_SO_STATISTICS:
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      q->is64bit = TRUE;
      space = 64;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      q->is64bit = TRUE;
      space = 32;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
   case PIPE_QUERY_GPU_FINISHED:
      space = 32;
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      space = 16;
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      /* driver statistics are read from CPU-side counters, no bo needed */
      if (type >= NVC0_QUERY_DRV_STAT(0) && type <= NVC0_QUERY_DRV_STAT_LAST) {
         space = 0;
         q->is64bit = true;
         q->index = type - NVC0_QUERY_DRV_STAT(0);
         break;
      } else
#endif
      if (nvc0->screen->base.device->drm_version >= 0x01000101) {
         if (type >= NVE4_PM_QUERY(0) && type <= NVE4_PM_QUERY_LAST) {
            /* for each MP:
             * [00] = WS0.C0
             * [04] = WS0.C1
             * [08] = WS0.C2
             * [0c] = WS0.C3
             * [10] = WS1.C0
             * [14] = WS1.C1
             * [18] = WS1.C2
             * [1c] = WS1.C3
             * [20] = WS2.C0
             * [24] = WS2.C1
             * [28] = WS2.C2
             * [2c] = WS2.C3
             * [30] = WS3.C0
             * [34] = WS3.C1
             * [38] = WS3.C2
             * [3c] = WS3.C3
             * [40] = MP.C4
             * [44] = MP.C5
             * [48] = MP.C6
             * [4c] = MP.C7
             * [50] = WS0.sequence
             * [54] = WS1.sequence
             * [58] = WS2.sequence
             * [5c] = WS3.sequence
             */
            space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
            break;
         } else
         if (type >= NVC0_PM_QUERY(0) && type <= NVC0_PM_QUERY_LAST) {
            /* for each MP:
             * [00] = MP.C0
             * [04] = MP.C1
             * [08] = MP.C2
             * [0c] = MP.C3
             * [10] = MP.C4
             * [14] = MP.C5
             * [18] = MP.C6
             * [1c] = MP.C7
             * [20] = MP.sequence
             */
            space = (8 + 1) * nvc0->screen->mp_count * sizeof(uint32_t);
            break;
         }
      }
      debug_printf("invalid query type: %u\n", type);
      FREE(q);
      return NULL;
   }
   if (!nvc0_query_allocate(nvc0, q, space)) {
      FREE(q);
      return NULL;
   }

   q->type = type;

   if (q->rotate) {
      /* we advance before query_begin ! */
      q->offset -= q->rotate;
      q->data -= q->rotate / sizeof(*q->data);
   } else
   if (!q->is64bit)
      q->data[0] = 0; /* initialize sequence */

   return (struct pipe_query *)q;
}
+
/* Emit a QUERY_GET: the GPU writes the query result selected by @get
 * (counter select + unit + format bits) plus the sequence number to
 * q->bo at q->offset + @offset.
 */
static void
nvc0_query_get(struct nouveau_pushbuf *push, struct nvc0_query *q,
               unsigned offset, uint32_t get)
{
   offset += q->offset;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, get);
}
+
/* Advance a rotating query to its next result slot, reallocating fresh
 * storage when the current buffer is exhausted.
 */
static void
nvc0_query_rotate(struct nvc0_context *nvc0, struct nvc0_query *q)
{
   q->offset += q->rotate;
   q->data += q->rotate / sizeof(*q->data);
   if (q->offset - q->base == NVC0_QUERY_ALLOC_SPACE)
      nvc0_query_allocate(nvc0, q, NVC0_QUERY_ALLOC_SPACE);
}
+
/* pipe_context::begin_query: record the query's start value (by emitting a
 * QUERY_GET into the upper half of the result slot, to be subtracted at
 * end_query time), and enable counting where necessary.
 */
static void
nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);

   /* For occlusion queries we have to change the storage, because a previous
    * query might set the initial render condition to FALSE even *after* we re-
    * initialized it to TRUE.
    */
   if (q->rotate) {
      nvc0_query_rotate(nvc0, q);

      /* XXX: can we do this with the GPU, and sync with respect to a previous
       * query ?
       */
      q->data[0] = q->sequence; /* initialize sequence */
      q->data[1] = 1; /* initial render condition = TRUE */
      q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
      q->data[5] = 0;
   }
   q->sequence++;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      /* enable the sample counter only for the outermost query */
      q->nesting = nvc0->screen->num_occlusion_queries_active++;
      if (q->nesting) {
         nvc0_query_get(push, q, 0x10, 0x0100f002);
      } else {
         PUSH_SPACE(push, 3);
         BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1);
         PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0x10, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0x10, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x20, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x30, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      nvc0_query_get(push, q, 0x10, 0x03005002 | (q->index << 5));
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0x10, 0x00005002);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0xc0 + 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0xc0 + 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0xc0 + 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0xc0 + 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         /* snapshot the CPU-side counter; end_query computes the delta */
         if (q->index >= 5)
            q->u.value = nvc0->screen->base.stats.v[q->index];
         else
            q->u.value = 0;
      } else
#endif
      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
         nvc0_mp_pm_query_begin(nvc0, q);
      }
      break;
   }
   q->state = NVC0_QUERY_STATE_ACTIVE;
}
+
/* pipe_context::end_query: emit the QUERY_GET(s) for the final value into
 * the lower half of the result slot and disable counting where needed.
 * Works for queries on which begin was never called, too.
 */
static void
nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q = nvc0_query(pq);

   if (q->state != NVC0_QUERY_STATE_ACTIVE) {
      /* some queries don't require 'begin' to be called (e.g. GPU_FINISHED) */
      if (q->rotate)
         nvc0_query_rotate(nvc0, q);
      q->sequence++;
   }
   q->state = NVC0_QUERY_STATE_ENDED;

   switch (q->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      nvc0_query_get(push, q, 0, 0x0100f002);
      if (--nvc0->screen->num_occlusion_queries_active == 0) {
         PUSH_SPACE(push, 1);
         IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 0);
      }
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      nvc0_query_get(push, q, 0, 0x09005002 | (q->index << 5));
      break;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      nvc0_query_get(push, q, 0, 0x05805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x05805002 | (q->index << 5));
      nvc0_query_get(push, q, 0x10, 0x06805002 | (q->index << 5));
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      /* TODO: How do we sum over all streams for render condition ? */
      /* PRIMS_DROPPED doesn't write sequence, use a ZERO query to sync on */
      nvc0_query_get(push, q, 0x00, 0x03005002 | (q->index << 5));
      nvc0_query_get(push, q, 0x20, 0x00005002);
      break;
   case PIPE_QUERY_TIMESTAMP:
   case PIPE_QUERY_TIME_ELAPSED:
      nvc0_query_get(push, q, 0, 0x00005002);
      break;
   case PIPE_QUERY_GPU_FINISHED:
      nvc0_query_get(push, q, 0, 0x1000f010);
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      nvc0_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
      nvc0_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
      nvc0_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
      nvc0_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
      nvc0_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
      nvc0_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
      nvc0_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
      nvc0_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
      nvc0_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */
      nvc0_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      /* indexed by TFB buffer instead of by vertex stream */
      nvc0_query_get(push, q, 0x00, 0x0d005002 | (q->index << 5));
      break;
   default:
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
      if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
          q->type <= NVC0_QUERY_DRV_STAT_LAST) {
         q->u.value = nvc0->screen->base.stats.v[q->index] - q->u.value;
         return;
      } else
#endif
      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
         nvc0_mp_pm_query_end(nvc0, q);
      }
      break;
   }
   /* 64-bit queries have no sequence word; use a fence to detect completion */
   if (q->is64bit)
      nouveau_fence_ref(nvc0->screen->base.fence.current, &q->fence);
}
+
/* Poll (without blocking) whether the query result has been written and
 * update q->state accordingly.
 * NOTE(review): the @cli parameter is currently unused.
 */
static INLINE void
nvc0_query_update(struct nouveau_client *cli, struct nvc0_query *q)
{
   if (q->is64bit) {
      if (nouveau_fence_signalled(q->fence))
         q->state = NVC0_QUERY_STATE_READY;
   } else {
      if (q->data[0] == q->sequence)
         q->state = NVC0_QUERY_STATE_READY;
   }
}
+
/* pipe_context::get_query_result: read back and decode the query result.
 * If @wait is false and the result is not ready yet, kick the push buffer
 * once (so the GPU makes progress) and return FALSE.
 */
static boolean
nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
                  boolean wait, union pipe_query_result *result)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_query *q = nvc0_query(pq);
   uint64_t *res64 = (uint64_t*)result;
   uint32_t *res32 = (uint32_t*)result;
   boolean *res8 = (boolean*)result;
   uint64_t *data64 = (uint64_t *)q->data;
   unsigned i;

#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
   if (q->type >= NVC0_QUERY_DRV_STAT(0) &&
       q->type <= NVC0_QUERY_DRV_STAT_LAST) {
      res64[0] = q->u.value;
      return TRUE;
   } else
#endif
   if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
       (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
      return nvc0_mp_pm_query_result(nvc0, q, result, wait);
   }

   if (q->state != NVC0_QUERY_STATE_READY)
      nvc0_query_update(nvc0->screen->base.client, q);

   if (q->state != NVC0_QUERY_STATE_READY) {
      if (!wait) {
         if (q->state != NVC0_QUERY_STATE_FLUSHED) {
            q->state = NVC0_QUERY_STATE_FLUSHED;
            /* flush for silly apps that spin on GL_QUERY_RESULT_AVAILABLE */
            PUSH_KICK(nvc0->base.pushbuf);
         }
         return FALSE;
      }
      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->screen->base.client))
         return FALSE;
      NOUVEAU_DRV_STAT(&nvc0->screen->base, query_sync_count, 1);
   }
   q->state = NVC0_QUERY_STATE_READY;

   /* decode: counters are stored as end/start pairs, result is the delta */
   switch (q->type) {
   case PIPE_QUERY_GPU_FINISHED:
      res8[0] = TRUE;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
      res64[0] = q->data[1] - q->data[5];
      break;
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      res8[0] = q->data[1] != q->data[5];
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
      res64[0] = data64[0] - data64[2];
      break;
   case PIPE_QUERY_SO_STATISTICS:
      res64[0] = data64[0] - data64[4];
      res64[1] = data64[2] - data64[6];
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      res8[0] = data64[0] != data64[2];
      break;
   case PIPE_QUERY_TIMESTAMP:
      res64[0] = data64[1];
      break;
   case PIPE_QUERY_TIMESTAMP_DISJOINT:
      /* disjoint flag lives at byte offset 8 after the u64 frequency */
      res64[0] = 1000000000;
      res8[8] = FALSE;
      break;
   case PIPE_QUERY_TIME_ELAPSED:
      res64[0] = data64[1] - data64[3];
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      for (i = 0; i < 10; ++i)
         res64[i] = data64[i * 2] - data64[24 + i * 2];
      break;
   case NVC0_QUERY_TFB_BUFFER_OFFSET:
      res32[0] = q->data[1];
      break;
   default:
      assert(0); /* can't happen, we don't create queries with invalid type */
      return FALSE;
   }

   return TRUE;
}
+
/* Make the FIFO stall until the query result has been written, using a
 * semaphore acquire on the query's sequence value.
 */
void
nvc0_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
{
   struct nvc0_query *q = nvc0_query(pq);
   unsigned offset = q->offset;

   /* wait on the extra ZERO query, PRIMS_DROPPED has no sequence */
   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE) offset += 0x20;

   PUSH_SPACE(push, 5);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
   PUSH_DATAh(push, q->bo->offset + offset);
   PUSH_DATA (push, q->bo->offset + offset);
   PUSH_DATA (push, q->sequence);
   PUSH_DATA (push, (1 << 12) |
              NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
}
+
/* pipe_context::render_condition: make subsequent rendering conditional on
 * the result of @pq (or unconditional if @pq is NULL), by programming the
 * hw COND_MODE to compare against the query's result memory.
 */
static void
nvc0_render_condition(struct pipe_context *pipe,
                      struct pipe_query *pq,
                      boolean condition, uint mode)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_query *q;
   uint32_t cond;
   boolean negated = FALSE;
   boolean wait =
      mode != PIPE_RENDER_COND_NO_WAIT &&
      mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT;

   nvc0->cond_query = pq;
   nvc0->cond_cond = condition;
   nvc0->cond_mode = mode;

   if (!pq) {
      PUSH_SPACE(push, 1);
      IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
      return;
   }
   q = nvc0_query(pq);

   /* NOTE: comparison of 2 queries only works if both have completed */
   switch (q->type) {
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      cond = negated ? NVC0_3D_COND_MODE_EQUAL :
                       NVC0_3D_COND_MODE_NOT_EQUAL;
      wait = TRUE;
      break;
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      if (likely(!negated)) {
         if (unlikely(q->nesting))
            cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL :
                          NVC0_3D_COND_MODE_ALWAYS;
         else
            cond = NVC0_3D_COND_MODE_RES_NON_ZERO;
      } else {
         cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS;
      }
      break;
   default:
      assert(!"render condition query not a predicate");
      mode = NVC0_3D_COND_MODE_ALWAYS;
      break;
   }

   if (wait)
      nvc0_query_fifo_wait(push, pq);

   PUSH_SPACE(push, 4);
   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVC0_3D(COND_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, q->bo->offset + q->offset);
   PUSH_DATA (push, q->bo->offset + q->offset);
   PUSH_DATA (push, cond);
}
+
/* Feed the query result (at @result_offset in the slot) directly into the
 * command stream as data, used e.g. for DrawTransformFeedback counts.
 */
void
nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
                          struct pipe_query *pq, unsigned result_offset)
{
   struct nvc0_query *q = nvc0_query(pq);

/* the result must not be prefetched before the GPU has written it */
#define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))

   nouveau_pushbuf_space(push, 0, 0, 1);
   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                        NVC0_IB_ENTRY_1_NO_PREFETCH);
}
+
/* Record the current buffer offset of stream output target @ptarg into its
 * internal query, serializing first if requested (*serialize is cleared).
 */
void
nvc0_so_target_save_offset(struct pipe_context *pipe,
                           struct pipe_stream_output_target *ptarg,
                           unsigned index, boolean *serialize)
{
   struct nvc0_so_target *targ = nvc0_so_target(ptarg);

   if (*serialize) {
      *serialize = FALSE;
      PUSH_SPACE(nvc0_context(pipe)->base.pushbuf, 1);
      IMMED_NVC0(nvc0_context(pipe)->base.pushbuf, NVC0_3D(SERIALIZE), 0);

      NOUVEAU_DRV_STAT(nouveau_screen(pipe->screen), gpu_serialize_count, 1);
   }

   nvc0_query(targ->pq)->index = index;

   nvc0_query_end(pipe, targ->pq);
}
+
+
+/* === DRIVER STATISTICS === */
+
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+
/* Names of the driver statistics counters.
 * NOTE(review): order presumably must match the indices of
 * nouveau_screen::stats.v — verify when adding entries.
 */
static const char *nvc0_drv_stat_names[] =
{
   "drv-tex_obj_current_count",
   "drv-tex_obj_current_bytes",
   "drv-buf_obj_current_count",
   "drv-buf_obj_current_bytes_vid",
   "drv-buf_obj_current_bytes_sys",
   "drv-tex_transfers_rd",
   "drv-tex_transfers_wr",
   "drv-tex_copy_count",
   "drv-tex_blit_count",
   "drv-tex_cache_flush_count",
   "drv-buf_transfers_rd",
   "drv-buf_transfers_wr",
   "drv-buf_read_bytes_staging_vid",
   "drv-buf_write_bytes_direct",
   "drv-buf_write_bytes_staging_vid",
   "drv-buf_write_bytes_staging_sys",
   "drv-buf_copy_bytes",
   "drv-buf_non_kernel_fence_sync_count",
   "drv-any_non_kernel_fence_sync_count",
   "drv-query_sync_count",
   "drv-gpu_serialize_count",
   "drv-draw_calls_array",
   "drv-draw_calls_indexed",
   "drv-draw_calls_fallback_count",
   "drv-user_buffer_upload_bytes",
   "drv-constbuf_upload_count",
   "drv-constbuf_upload_bytes",
   "drv-pushbuf_count",
   "drv-resource_validate_count"
};
+
+#endif /* NOUVEAU_ENABLE_DRIVER_STATISTICS */
+
+
+/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */
+
+/* Code to read out MP counters: They are accessible via mmio, too, but let's
+ * just avoid mapping registers in userspace. We'd have to know which MPs are
+ * enabled/present, too, and that information is not presently exposed.
+ * We could add a kernel interface for it, but reading the counters like this
+ * has the advantage of being async (if get_result isn't called immediately).
+ */
+/* Pre-assembled NVE4 compute shader, one 64-bit word per instruction, that
+ * copies $pm0..$pm7 plus a sequence number (c0[0x8]) to the address given in
+ * c0[0x0..0x7]. The disassembly is kept inline below; the hex words must
+ * stay in sync with it.
+ */
+static const uint64_t nve4_read_mp_pm_counters_code[] =
+{
+ /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
+ * mov b32 $r8 $tidx
+ * mov b32 $r12 $physid
+ * mov b32 $r0 $pm0
+ * mov b32 $r1 $pm1
+ * mov b32 $r2 $pm2
+ * mov b32 $r3 $pm3
+ * mov b32 $r4 $pm4
+ * sched 0x20 0x20 0x23 0x04 0x20 0x04 0x2b
+ * mov b32 $r5 $pm5
+ * mov b32 $r6 $pm6
+ * mov b32 $r7 $pm7
+ * set $p0 0x1 eq u32 $r8 0x0
+ * mov b32 $r10 c0[0x0]
+ * ext u32 $r8 $r12 0x414
+ * mov b32 $r11 c0[0x4]
+ * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
+ * ext u32 $r9 $r12 0x208
+ * (not $p0) exit
+ * set $p1 0x1 eq u32 $r9 0x0
+ * mul $r8 u32 $r8 u32 96
+ * mul $r12 u32 $r9 u32 16
+ * mul $r13 u32 $r9 u32 4
+ * add b32 $r9 $r8 $r13
+ * sched 0x28 0x04 0x2c 0x04 0x2c 0x04 0x2c
+ * add b32 $r8 $r8 $r12
+ * mov b32 $r12 $r10
+ * add b32 $r10 $c $r10 $r8
+ * mov b32 $r13 $r11
+ * add b32 $r11 $r11 0x0 $c
+ * add b32 $r12 $c $r12 $r9
+ * st b128 wt g[$r10d] $r0q
+ * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
+ * mov b32 $r0 c0[0x8]
+ * add b32 $r13 $r13 0x0 $c
+ * $p1 st b128 wt g[$r12d+0x40] $r4q
+ * st b32 wt g[$r12d+0x50] $r0
+ * exit */
+ 0x2202020202020207ULL,
+ 0x2c00000084021c04ULL,
+ 0x2c0000000c031c04ULL,
+ 0x2c00000010001c04ULL,
+ 0x2c00000014005c04ULL,
+ 0x2c00000018009c04ULL,
+ 0x2c0000001c00dc04ULL,
+ 0x2c00000020011c04ULL,
+ 0x22b0420042320207ULL,
+ 0x2c00000024015c04ULL,
+ 0x2c00000028019c04ULL,
+ 0x2c0000002c01dc04ULL,
+ 0x190e0000fc81dc03ULL,
+ 0x2800400000029de4ULL,
+ 0x7000c01050c21c03ULL,
+ 0x280040001002dde4ULL,
+ 0x204282020042e047ULL,
+ 0x7000c00820c25c03ULL,
+ 0x80000000000021e7ULL,
+ 0x190e0000fc93dc03ULL,
+ 0x1000000180821c02ULL,
+ 0x1000000040931c02ULL,
+ 0x1000000010935c02ULL,
+ 0x4800000034825c03ULL,
+ 0x22c042c042c04287ULL,
+ 0x4800000030821c03ULL,
+ 0x2800000028031de4ULL,
+ 0x4801000020a29c03ULL,
+ 0x280000002c035de4ULL,
+ 0x0800000000b2dc42ULL,
+ 0x4801000024c31c03ULL,
+ 0x9400000000a01fc5ULL,
+ 0x200002e04202c047ULL,
+ 0x2800400020001de4ULL,
+ 0x0800000000d35c42ULL,
+ 0x9400000100c107c5ULL,
+ 0x9400000140c01f85ULL,
+ 0x8000000000001de7ULL
+};
+
+/* NOTE: intentionally using the same names as NV */
+/* Indexed by (id - NVC0_QUERY_DRV_STAT_COUNT) in
+ * nvc0_screen_get_driver_query_info(); order must match the
+ * NVE4_PM_QUERY_* numbering and the nve4_mp_pm_queries table below.
+ */
+static const char *nve4_pm_query_names[] =
+{
+ /* MP counters */
+ "prof_trigger_00",
+ "prof_trigger_01",
+ "prof_trigger_02",
+ "prof_trigger_03",
+ "prof_trigger_04",
+ "prof_trigger_05",
+ "prof_trigger_06",
+ "prof_trigger_07",
+ "warps_launched",
+ "threads_launched",
+ "sm_cta_launched",
+ "inst_issued1",
+ "inst_issued2",
+ "inst_executed",
+ "local_load",
+ "local_store",
+ "shared_load",
+ "shared_store",
+ "l1_local_load_hit",
+ "l1_local_load_miss",
+ "l1_local_store_hit",
+ "l1_local_store_miss",
+ "gld_request",
+ "gst_request",
+ "l1_global_load_hit",
+ "l1_global_load_miss",
+ "uncached_global_load_transaction",
+ "global_store_transaction",
+ "branch",
+ "divergent_branch",
+ "active_warps",
+ "active_cycles",
+ "inst_issued",
+ "atom_count",
+ "gred_count",
+ "shared_load_replay",
+ "shared_store_replay",
+ "local_load_transactions",
+ "local_store_transactions",
+ "l1_shared_load_transactions",
+ "l1_shared_store_transactions",
+ "global_ld_mem_divergence_replays",
+ "global_st_mem_divergence_replays",
+ /* metrics, i.e. functions of the MP counters */
+ "metric-ipc", /* inst_executed, clock */
+ "metric-ipac", /* inst_executed, active_cycles */
+ "metric-ipec", /* inst_executed, (bool)inst_executed */
+ "metric-achieved_occupancy", /* active_warps, active_cycles */
+ "metric-sm_efficiency", /* active_cycles, clock */
+ "metric-inst_replay_overhead" /* inst_issued, inst_executed */
+};
+
+/* For simplicity, we will allocate as many group slots as we allocate counter
+ * slots. This means that a single counter which wants to source from 2 groups
+ * will have to be declared as using 2 counter slots. This shouldn't really be
+ * a problem because such queries don't make much sense ... (unless someone is
+ * really creative).
+ */
+/* Hardware configuration of a single MP performance counter slot. */
+struct nvc0_mp_counter_cfg
+{
+ uint32_t func : 16; /* mask or 4-bit logic op (depending on mode) */
+ uint32_t mode : 4; /* LOGOP,B6,LOGOP_B6(_PULSE) */
+ uint32_t num_src : 3; /* number of sources (1 - 6, only for NVC0:NVE4) */
+ uint32_t sig_dom : 1; /* if 0, MP_PM_A (per warp-sched), if 1, MP_PM_B */
+ uint32_t sig_sel : 8; /* signal group */
+ uint64_t src_sel; /* signal selection for up to 6 sources (48 bit) */
+};
+
+/* How the per-counter, per-MP values are combined into the query result
+ * (see nvc0_mp_pm_query_result for the exact formulas). */
+#define NVC0_COUNTER_OPn_SUM 0
+#define NVC0_COUNTER_OPn_OR 1
+#define NVC0_COUNTER_OPn_AND 2
+#define NVC0_COUNTER_OP2_REL_SUM_MM 3 /* (sum(ctr0) - sum(ctr1)) / sum(ctr0) */
+#define NVC0_COUNTER_OP2_DIV_SUM_M0 4 /* sum(ctr0) / ctr1 of MP[0]) */
+#define NVC0_COUNTER_OP2_AVG_DIV_MM 5 /* avg(ctr0 / ctr1) */
+#define NVC0_COUNTER_OP2_AVG_DIV_M0 6 /* avg(ctr0) / ctr1 of MP[0]) */
+
+/* Full query configuration: up to 4 counters combined according to @op,
+ * then scaled by norm[0] / norm[1]. */
+struct nvc0_mp_pm_query_cfg
+{
+ struct nvc0_mp_counter_cfg ctr[4];
+ uint8_t num_counters;
+ uint8_t op;
+ uint8_t norm[2]; /* normalization num,denom */
+};
+
+/* Table-initializer helpers for nve4_mp_pm_queries:
+ * _Q1A/_Q1B: one counter in signal domain A resp. B;
+ * _M2A/_M2B/_M2AB: two-counter metric (both A, both B, or one of each).
+ */
+#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+ { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
+ { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
+ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+ { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
+ { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
+ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+ { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
+ { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
+ {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
+
+/* NOTES:
+ * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps
+ * inst_executed etc.: we only count a single warp scheduler
+ * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
+ * this is inaccurate !
+ */
+/* Counter configurations for NVE4+, indexed by NVE4_PM_QUERY_* (designated
+ * initializers via the _Q1*/_M2* macros above). The signal-select magic
+ * values mirror NVIDIA's counter definitions — do not "clean up".
+ */
+static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
+{
+ _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
+ _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
+ _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
+ _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1),
+ _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1),
+ _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
+ _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
+ _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
+ _Q1A(LAUNCHED_WARPS, 0x0001, B6, LAUNCH, 0x00000004, 1, 1),
+ _Q1A(LAUNCHED_THREADS, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
+ _Q1B(LAUNCHED_CTA, 0x0001, B6, WARP, 0x0000001c, 1, 1),
+ _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1),
+ _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1),
+ _Q1A(INST_ISSUED, 0x0003, B6, ISSUE, 0x00000104, 1, 1),
+ _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1),
+ _Q1A(LD_SHARED, 0x0001, B6, LDST, 0x00000000, 1, 1),
+ _Q1A(ST_SHARED, 0x0001, B6, LDST, 0x00000004, 1, 1),
+ _Q1A(LD_LOCAL, 0x0001, B6, LDST, 0x00000008, 1, 1),
+ _Q1A(ST_LOCAL, 0x0001, B6, LDST, 0x0000000c, 1, 1),
+ _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
+ _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
+ _Q1B(L1_LOCAL_LOAD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1),
+ _Q1B(L1_LOCAL_LOAD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1),
+ _Q1B(L1_LOCAL_STORE_HIT, 0x0001, B6, L1, 0x00000008, 1, 1),
+ _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
+ _Q1B(L1_GLOBAL_LOAD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1),
+ _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
+ _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000, 1, 1),
+ _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1),
+ _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1),
+ _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
+ _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1),
+ _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
+ _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
+ _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
+ _Q1B(LD_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
+ _Q1B(ST_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
+ _Q1B(LD_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
+ _Q1B(ST_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
+ _Q1B(L1_LD_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
+ _Q1B(L1_ST_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
+ _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
+ _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
+ _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
+ _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
+ _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
+ _M2A(INST_REPLAY_OHEAD, 0x3, B6, ISSUE, 0x104, 0x3, B6, EXEC, 0x398, REL_SUM_MM, 100, 1),
+ _M2B(MP_OCCUPANCY, 0x3f, B6, WARP, 0x31483104, 0x01, B6, WARP, 0x0, AVG_DIV_MM, 200, 64),
+ _M2B(MP_EFFICIENCY, 0x01, B6, WARP, 0x0, 0xffff, LOGOP, WARP, 0x0, AVG_DIV_M0, 100, 1),
+};
+
+/* Don't leak the table-initializer helper macros into the rest of the file. */
+#undef _Q1A
+#undef _Q1B
+#undef _M2A
+#undef _M2B
+#undef _M2AB /* was missing: _M2AB is #define'd above and must be undone too */
+
+/* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
+/* Pre-assembled NVC0 compute shader (one 64-bit word per instruction) that
+ * copies $pm0..$pm7 plus the sequence number from c0[0x8] to the address in
+ * c0[0x0..0x7], 0x24 bytes per MP. Keep hex and disassembly in sync.
+ */
+static const uint64_t nvc0_read_mp_pm_counters_code[] =
+{
+ /* mov b32 $r8 $tidx
+ * mov b32 $r9 $physid
+ * mov b32 $r0 $pm0
+ * mov b32 $r1 $pm1
+ * mov b32 $r2 $pm2
+ * mov b32 $r3 $pm3
+ * mov b32 $r4 $pm4
+ * mov b32 $r5 $pm5
+ * mov b32 $r6 $pm6
+ * mov b32 $r7 $pm7
+ * set $p0 0x1 eq u32 $r8 0x0
+ * mov b32 $r10 c0[0x0]
+ * mov b32 $r11 c0[0x4]
+ * ext u32 $r8 $r9 0x414
+ * (not $p0) exit
+ * mul $r8 u32 $r8 u32 36
+ * add b32 $r10 $c $r10 $r8
+ * add b32 $r11 $r11 0x0 $c
+ * mov b32 $r8 c0[0x8]
+ * st b128 wt g[$r10d+0x00] $r0q
+ * st b128 wt g[$r10d+0x10] $r4q
+ * st b32 wt g[$r10d+0x20] $r8
+ * exit */
+ 0x2c00000084021c04ULL,
+ 0x2c0000000c025c04ULL,
+ 0x2c00000010001c04ULL,
+ 0x2c00000014005c04ULL,
+ 0x2c00000018009c04ULL,
+ 0x2c0000001c00dc04ULL,
+ 0x2c00000020011c04ULL,
+ 0x2c00000024015c04ULL,
+ 0x2c00000028019c04ULL,
+ 0x2c0000002c01dc04ULL,
+ 0x190e0000fc81dc03ULL,
+ 0x2800400000029de4ULL,
+ 0x280040001002dde4ULL,
+ 0x7000c01050921c03ULL,
+ 0x80000000000021e7ULL,
+ 0x1000000090821c02ULL,
+ 0x4801000020a29c03ULL,
+ 0x0800000000b2dc42ULL,
+ 0x2800400020021de4ULL,
+ 0x9400000000a01fc5ULL,
+ 0x9400000040a11fc5ULL,
+ 0x9400000080a21f85ULL,
+ 0x8000000000001de7ULL
+};
+
+/* Indexed by (id - NVC0_QUERY_DRV_STAT_COUNT) in
+ * nvc0_screen_get_driver_query_info(); order must match the
+ * NVC0_PM_QUERY_* numbering and the nvc0_mp_pm_queries table below.
+ */
+static const char *nvc0_pm_query_names[] =
+{
+ /* MP counters */
+ "inst_executed",
+ "branch",
+ "divergent_branch",
+ "active_warps",
+ "active_cycles",
+ "warps_launched",
+ "threads_launched",
+ "shared_load",
+ "shared_store",
+ "local_load",
+ "local_store",
+ "gred_count",
+ "atom_count",
+ "gld_request",
+ "gst_request",
+ "inst_issued1_0",
+ "inst_issued1_1",
+ "inst_issued2_0",
+ "inst_issued2_1",
+ "thread_inst_executed_0",
+ "thread_inst_executed_1",
+ "thread_inst_executed_2",
+ "thread_inst_executed_3",
+ "prof_trigger_00",
+ "prof_trigger_01",
+ "prof_trigger_02",
+ "prof_trigger_03",
+ "prof_trigger_04",
+ "prof_trigger_05",
+ "prof_trigger_06",
+ "prof_trigger_07",
+};
+
+/* Table-initializer helper for NVC0:NVE4 counters: one counter with up to
+ * six sources packed byte-wise into src_sel. */
+#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_PM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
+
+/* Counter configurations for NVC0:NVE4, indexed by NVC0_PM_QUERY_*.
+ * Signal-group and source-select values are hardware magic — do not edit
+ * without a reference.
+ */
+static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
+{
+ _Q(INST_EXECUTED, 0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
+ _Q(BRANCH, 0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
+ _Q(BRANCH_DIVERGENT, 0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
+ _Q(ACTIVE_WARPS, 0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
+ _Q(ACTIVE_CYCLES, 0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(LAUNCHED_WARPS, 0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(LAUNCHED_THREADS, 0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
+ _Q(LD_SHARED, 0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(ST_SHARED, 0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(LD_LOCAL, 0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(ST_LOCAL, 0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(GRED_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(ATOM_COUNT, 0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(GLD_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(GST_REQUEST, 0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(INST_ISSUED1_0, 0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(INST_ISSUED1_1, 0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(INST_ISSUED2_0, 0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(INST_ISSUED2_1, 0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(TH_INST_EXECUTED_0, 0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+ _Q(TH_INST_EXECUTED_1, 0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+ _Q(TH_INST_EXECUTED_2, 0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+ _Q(TH_INST_EXECUTED_3, 0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+ _Q(PROF_TRIGGER_0, 0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_1, 0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_2, 0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_3, 0xaaaa, LOGOP, 0x01, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_4, 0xaaaa, LOGOP, 0x01, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_5, 0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_6, 0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
+ _Q(PROF_TRIGGER_7, 0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
+};
+
+#undef _Q
+
+/* Select the counter configuration for @q from the table that matches the
+ * chipset generation (NVE4+ vs. NVC0:NVE4). The two tables use different
+ * query-type bases.
+ */
+static const struct nvc0_mp_pm_query_cfg *
+nvc0_mp_pm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+   const boolean is_nve4 = nvc0->screen->base.class_3d >= NVE4_3D_CLASS;
+
+   if (!is_nve4)
+      return &nvc0_mp_pm_queries[q->type - NVC0_PM_QUERY(0)];
+   return &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+}
+
+/* Start an MP performance-counter query: reserve hardware counter slots in
+ * the required signal domains, program signal/source selection and reset
+ * the counters. Silently does nothing if no free slots remain.
+ */
+void
+nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ const struct nvc0_mp_pm_query_cfg *cfg;
+ unsigned i, c;
+ unsigned num_ab[2] = { 0, 0 };
+
+ cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+
+ /* check if we have enough free counter slots */
+ for (i = 0; i < cfg->num_counters; ++i)
+ num_ab[cfg->ctr[i].sig_dom]++;
+
+ if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
+ screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
+ NOUVEAU_ERR("Not enough free MP counter slots !\n");
+ return;
+ }
+
+ assert(cfg->num_counters <= 4);
+ PUSH_SPACE(push, 4 * 8 + 6);
+
+ /* one-time global enable of MP counting via a kernel SW method.
+  * NOTE(review): the method/value pair (0x06ac / 0x1fcb) is undocumented
+  * magic understood by the nouveau kernel driver. */
+ if (!screen->pm.mp_counters_enabled) {
+ screen->pm.mp_counters_enabled = TRUE;
+ BEGIN_NVC0(push, SUBC_SW(0x06ac), 1);
+ PUSH_DATA (push, 0x1fcb);
+ }
+
+ /* set sequence field to 0 (used to check if result is available) */
+ for (i = 0; i < screen->mp_count; ++i)
+ q->data[i * 10 + 10] = 0;
+
+ for (i = 0; i < cfg->num_counters; ++i) {
+ const unsigned d = cfg->ctr[i].sig_dom;
+
+ /* first counter in this domain: (re)configure the active domain set.
+  * NOTE(review): bit layout of SW method 0x0600 is undocumented. */
+ if (!screen->pm.num_mp_pm_active[d]) {
+ uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
+ if (screen->pm.num_mp_pm_active[!d])
+ m |= 1 << (7 + (8 * d));
+ BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
+ PUSH_DATA (push, m);
+ }
+ screen->pm.num_mp_pm_active[d]++;
+
+ /* grab a free slot in the domain's 4-slot window [d*4, d*4+3] */
+ for (c = d * 4; c < (d * 4 + 4); ++c) {
+ if (!screen->pm.mp_counter[c]) {
+ q->ctr[i] = c;
+ screen->pm.mp_counter[c] = (struct pipe_query *)q;
+ break;
+ }
+ }
+ assert(c <= (d * 4 + 3)); /* must succeed, already checked for space */
+
+ /* configure and reset the counter(s) */
+ if (screen->base.class_3d >= NVE4_3D_CLASS) {
+ if (d == 0)
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_A_SIGSEL(c & 3)), 1);
+ else
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_B_SIGSEL(c & 3)), 1);
+ PUSH_DATA (push, cfg->ctr[i].sig_sel);
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SRCSEL(c)), 1);
+ PUSH_DATA (push, cfg->ctr[i].src_sel + 0x2108421 * (c & 3));
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 1);
+ PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_SET(c)), 1);
+ PUSH_DATA (push, 0);
+ } else {
+ unsigned s;
+
+ /* NVC0:NVE4 programs each of the counter's sources separately */
+ for (s = 0; s < cfg->ctr[i].num_src; s++) {
+ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SIGSEL(s)), 1);
+ PUSH_DATA (push, cfg->ctr[i].sig_sel);
+ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SRCSEL(s)), 1);
+ PUSH_DATA (push, (cfg->ctr[i].src_sel >> (s * 8)) & 0xff);
+ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(s)), 1);
+ PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_SET(s)), 1);
+ PUSH_DATA (push, 0);
+ }
+ }
+ }
+}
+
+/* Finish an MP performance-counter query: freeze all counters, launch the
+ * small read-out compute program (one block per MP) that copies the counter
+ * values into the query buffer, release this query's slots and re-activate
+ * counters still owned by other queries.
+ */
+static void
+nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ struct pipe_context *pipe = &nvc0->base.pipe;
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ const boolean is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
+ uint32_t mask;
+ uint32_t input[3];
+ const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
+ const uint grid[3] = { screen->mp_count, 1, 1 };
+ unsigned c;
+ const struct nvc0_mp_pm_query_cfg *cfg;
+
+ cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+
+ /* lazily wrap the pre-assembled read-out code in a compute program */
+ if (unlikely(!screen->pm.prog)) {
+ struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
+ prog->type = PIPE_SHADER_COMPUTE;
+ prog->translated = TRUE;
+ prog->num_gprs = 14;
+ prog->parm_size = 12;
+ if (is_nve4) {
+ prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
+ prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
+ } else {
+ prog->code = (uint32_t *)nvc0_read_mp_pm_counters_code;
+ prog->code_size = sizeof(nvc0_read_mp_pm_counters_code);
+ }
+ screen->pm.prog = prog;
+ }
+
+ /* disable all counting */
+ PUSH_SPACE(push, 8);
+ for (c = 0; c < 8; ++c)
+ if (screen->pm.mp_counter[c]) {
+ if (is_nve4) {
+ IMMED_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(c)), 0);
+ } else {
+ IMMED_NVC0(push, NVC0_COMPUTE(MP_PM_OP(c)), 0);
+ }
+ }
+ /* release counters for this query */
+ for (c = 0; c < 8; ++c) {
+ if (nvc0_query(screen->pm.mp_counter[c]) == q) {
+ screen->pm.num_mp_pm_active[c / 4]--;
+ screen->pm.mp_counter[c] = NULL;
+ }
+ }
+
+ BCTX_REFN_bo(nvc0->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
+ q->bo);
+
+ PUSH_SPACE(push, 1);
+ IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
+
+ /* launch the read-out program; input = destination address + sequence */
+ pipe->bind_compute_state(pipe, screen->pm.prog);
+ input[0] = (q->bo->offset + q->base);
+ input[1] = (q->bo->offset + q->base) >> 32;
+ input[2] = q->sequence;
+ pipe->launch_grid(pipe, block, grid, 0, input);
+
+ nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_QUERY);
+
+ /* re-activate other counters */
+ PUSH_SPACE(push, 16);
+ mask = 0;
+ /* NOTE: q is reused below to walk the other still-active queries */
+ for (c = 0; c < 8; ++c) {
+ unsigned i;
+ q = nvc0_query(screen->pm.mp_counter[c]);
+ if (!q)
+ continue;
+ cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+ for (i = 0; i < cfg->num_counters; ++i) {
+ /* each counter only needs to be re-enabled once */
+ if (mask & (1 << q->ctr[i]))
+ break;
+ mask |= 1 << q->ctr[i];
+ if (is_nve4) {
+ BEGIN_NVC0(push, NVE4_COMPUTE(MP_PM_FUNC(q->ctr[i])), 1);
+ } else {
+ BEGIN_NVC0(push, NVC0_COMPUTE(MP_PM_OP(q->ctr[i])), 1);
+ }
+ PUSH_DATA (push, (cfg->ctr[i].func << 4) | cfg->ctr[i].mode);
+ }
+ }
+}
+
+/* Copy the NVC0-layout counter results (0x24 bytes per MP) into @count.
+ * Returns FALSE when the results are not available yet and either @wait is
+ * false or waiting on the buffer failed.
+ */
+static INLINE boolean
+nvc0_mp_pm_query_read_data(uint32_t count[32][4],
+                           struct nvc0_context *nvc0, boolean wait,
+                           struct nvc0_query *q,
+                           const struct nvc0_mp_pm_query_cfg *cfg,
+                           unsigned mp_count)
+{
+   unsigned mp, i;
+
+   for (mp = 0; mp < mp_count; ++mp) {
+      const unsigned base = (0x24 / 4) * mp; /* dword offset of this MP's slice */
+
+      for (i = 0; i < cfg->num_counters; ++i) {
+         /* data[base + 8] is the sequence word stored by the read-out
+          * program; results are ready once it matches q->sequence */
+         if (q->data[base + 8] != q->sequence) {
+            if (!wait ||
+                nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
+               return FALSE;
+         }
+         count[mp][i] = q->data[base + q->ctr[i]];
+      }
+   }
+   return TRUE;
+}
+
+/* Copy the NVE4-layout counter results (0x60 bytes per MP) into @count.
+ * NOTE(review): counters with slot index >= 4 (domain B, q->ctr & ~3) have
+ * a single value per MP at dword 16+(ctr&3); domain-A counters are summed
+ * over 4 per-warp-scheduler slices, each guarded by its own sequence word
+ * at dword 20+d — inferred from the indexing below, confirm against the
+ * NVE4 read-out shader above.
+ */
+static INLINE boolean
+nve4_mp_pm_query_read_data(uint32_t count[32][4],
+ struct nvc0_context *nvc0, boolean wait,
+ struct nvc0_query *q,
+ const struct nvc0_mp_pm_query_cfg *cfg,
+ unsigned mp_count)
+{
+ unsigned p, c, d;
+
+ for (p = 0; p < mp_count; ++p) {
+ const unsigned b = (0x60 / 4) * p;
+
+ for (c = 0; c < cfg->num_counters; ++c) {
+ count[p][c] = 0;
+ for (d = 0; d < ((q->ctr[c] & ~3) ? 1 : 4); ++d) {
+ if (q->data[b + 20 + d] != q->sequence) {
+ if (!wait)
+ return FALSE;
+ if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nvc0->base.client))
+ return FALSE;
+ }
+ if (q->ctr[c] & ~0x3)
+ count[p][c] = q->data[b + 16 + (q->ctr[c] & 3)];
+ else
+ count[p][c] += q->data[b + d * 4 + q->ctr[c]];
+ }
+ }
+ }
+ return TRUE;
+}
+
+/* Metric calculations:
+ * sum(x) ... sum of x over all MPs
+ * avg(x) ... average of x over all MPs
+ *
+ * IPC : sum(inst_executed) / clock
+ * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
+ * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles)
+ * MP_EFFICIENCY : avg(active_cycles / clock)
+ *
+ * NOTE: Interpretation of IPC requires knowledge of MP count.
+ */
+/* Compute the final 64-bit result of an MP performance-counter query:
+ * read back the per-MP counter values, combine them according to cfg->op
+ * (see the metric formulas in the comment above) and apply the
+ * norm[0]/norm[1] scaling. Returns FALSE if results are not ready.
+ */
+static boolean
+nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+ void *result, boolean wait)
+{
+ uint32_t count[32][4];
+ uint64_t value = 0;
+ unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
+ unsigned p, c;
+ const struct nvc0_mp_pm_query_cfg *cfg;
+ boolean ret;
+
+ cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+
+ if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
+ ret = nve4_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
+ else
+ ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
+ if (!ret)
+ return FALSE;
+
+ /* simple reductions over all counters and MPs */
+ if (cfg->op == NVC0_COUNTER_OPn_SUM) {
+ for (c = 0; c < cfg->num_counters; ++c)
+ for (p = 0; p < mp_count; ++p)
+ value += count[p][c];
+ value = (value * cfg->norm[0]) / cfg->norm[1];
+ } else
+ if (cfg->op == NVC0_COUNTER_OPn_OR) {
+ uint32_t v = 0;
+ for (c = 0; c < cfg->num_counters; ++c)
+ for (p = 0; p < mp_count; ++p)
+ v |= count[p][c];
+ value = (v * cfg->norm[0]) / cfg->norm[1];
+ } else
+ if (cfg->op == NVC0_COUNTER_OPn_AND) {
+ uint32_t v = ~0;
+ for (c = 0; c < cfg->num_counters; ++c)
+ for (p = 0; p < mp_count; ++p)
+ v &= count[p][c];
+ value = (v * cfg->norm[0]) / cfg->norm[1];
+ } else
+ /* (sum(ctr0) - sum(ctr1)) / sum(ctr0); 0 if ctr0 never fired */
+ if (cfg->op == NVC0_COUNTER_OP2_REL_SUM_MM) {
+ uint64_t v[2] = { 0, 0 };
+ for (p = 0; p < mp_count; ++p) {
+ v[0] += count[p][0];
+ v[1] += count[p][1];
+ }
+ if (v[0])
+ value = ((v[0] - v[1]) * cfg->norm[0]) / (v[0] * cfg->norm[1]);
+ } else
+ /* sum(ctr0) / ctr1 of MP[0] */
+ if (cfg->op == NVC0_COUNTER_OP2_DIV_SUM_M0) {
+ for (p = 0; p < mp_count; ++p)
+ value += count[p][0];
+ if (count[0][1])
+ value = (value * cfg->norm[0]) / (count[0][1] * cfg->norm[1]);
+ else
+ value = 0;
+ } else
+ /* avg(ctr0 / ctr1), averaged over MPs where ctr0 fired */
+ if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_MM) {
+ unsigned mp_used = 0;
+ for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
+ if (count[p][1])
+ value += (count[p][0] * cfg->norm[0]) / count[p][1];
+ if (mp_used)
+ value /= mp_used * cfg->norm[1];
+ } else
+ /* avg(ctr0) / ctr1 of MP[0] */
+ if (cfg->op == NVC0_COUNTER_OP2_AVG_DIV_M0) {
+ unsigned mp_used = 0;
+ for (p = 0; p < mp_count; ++p, mp_used += !!count[p][0])
+ value += count[p][0];
+ if (count[0][1] && mp_used) {
+ value *= cfg->norm[0];
+ value /= count[0][1] * mp_used * cfg->norm[1];
+ } else {
+ value = 0;
+ }
+ }
+
+ *(uint64_t *)result = value;
+ return TRUE;
+}
+
+/* pipe_screen::get_driver_query_info implementation.
+ * When @info is NULL, returns the number of available driver-specific
+ * queries; otherwise fills *info for query @id and returns 1, or 0 for an
+ * out-of-range id.
+ */
+int
+nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
+ unsigned id,
+ struct pipe_driver_query_info *info)
+{
+ struct nvc0_screen *screen = nvc0_screen(pscreen);
+ int count = 0;
+
+ count += NVC0_QUERY_DRV_STAT_COUNT;
+
+ /* MP counters need a recent enough kernel (and NVC0_COMPUTE for Fermi) */
+ if (screen->base.device->drm_version >= 0x01000101) {
+ if (screen->base.class_3d >= NVE4_3D_CLASS) {
+ count += NVE4_PM_QUERY_COUNT;
+ } else
+ if (screen->compute) {
+ count += NVC0_PM_QUERY_COUNT; /* NVC0_COMPUTE is not always enabled */
+ }
+ }
+
+ if (!info)
+ return count;
+
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+ if (id < NVC0_QUERY_DRV_STAT_COUNT) {
+ info->name = nvc0_drv_stat_names[id];
+ info->query_type = NVC0_QUERY_DRV_STAT(id);
+ info->max_value = ~0ULL;
+ info->uses_byte_units = !!strstr(info->name, "bytes");
+ return 1;
+ } else
+#endif
+ if (id < count) {
+ if (screen->base.class_3d >= NVE4_3D_CLASS) {
+ info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+ info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+ /* metric queries from MP_OCCUPANCY on report percentages */
+ info->max_value = (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ?
+ ~0ULL : 100;
+ info->uses_byte_units = FALSE;
+ return 1;
+ } else
+ if (screen->compute) {
+ info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+ info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+ info->max_value = ~0ULL;
+ info->uses_byte_units = FALSE;
+ return 1;
+ }
+ }
+ /* user asked for info about non-existing query */
+ info->name = "this_is_not_the_query_you_are_looking_for";
+ info->query_type = 0xdeadd01d;
+ info->max_value = 0;
+ info->uses_byte_units = FALSE;
+ return 0;
+}
+
+/* Install the query entry points on the context. */
+void
+nvc0_init_query_functions(struct nvc0_context *nvc0)
+{
+   struct pipe_context *const pipe = &nvc0->base.pipe;
+
+   pipe->render_condition = nvc0_render_condition;
+   pipe->get_query_result = nvc0_query_result;
+   pipe->create_query = nvc0_query_create;
+   pipe->destroy_query = nvc0_query_destroy;
+   pipe->begin_query = nvc0_query_begin;
+   pipe->end_query = nvc0_query_end;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
new file mode 100644
index 00000000000..4e70903b538
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
@@ -0,0 +1,62 @@
+
+#include "pipe/p_context.h"
+#include "nvc0/nvc0_resource.h"
+#include "nouveau_screen.h"
+
+
+/* Create a resource: plain buffers are handled by the generic nouveau
+ * buffer code, everything else becomes a miptree. */
+static struct pipe_resource *
+nvc0_resource_create(struct pipe_screen *screen,
+                     const struct pipe_resource *templ)
+{
+   if (templ->target == PIPE_BUFFER)
+      return nouveau_buffer_create(screen, templ);
+
+   return nvc0_miptree_create(screen, templ);
+}
+
+/* Import a resource from a winsys handle (shared/scanout surfaces).
+ * Only miptrees can be imported; buffers are rejected with NULL.
+ */
+static struct pipe_resource *
+nvc0_resource_from_handle(struct pipe_screen * screen,
+                          const struct pipe_resource *templ,
+                          struct winsys_handle *whandle)
+{
+   struct pipe_resource *res;
+
+   if (templ->target == PIPE_BUFFER)
+      return NULL;
+
+   res = nv50_miptree_from_handle(screen, templ, whandle);
+   /* guard against import failure before installing the nvc0 vtbl
+    * (the original dereferenced res unconditionally) */
+   if (res)
+      nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
+   return res;
+}
+
+/* Create a surface view; buffer "surfaces" take the shared nv50 path,
+ * miptree surfaces use the nvc0-specific constructor. */
+static struct pipe_surface *
+nvc0_surface_create(struct pipe_context *pipe,
+                    struct pipe_resource *pres,
+                    const struct pipe_surface *templ)
+{
+   if (pres->target != PIPE_BUFFER)
+      return nvc0_miptree_surface_new(pipe, pres, templ);
+
+   return nv50_surface_from_buffer(pipe, pres, templ);
+}
+
+/* Install the context-level resource hooks: transfers go through the
+ * u_transfer vtbl dispatchers, surfaces through the nvc0/nv50 helpers. */
+void
+nvc0_init_resource_functions(struct pipe_context *pcontext)
+{
+   pcontext->create_surface = nvc0_surface_create;
+   pcontext->surface_destroy = nv50_surface_destroy;
+
+   pcontext->transfer_map = u_transfer_map_vtbl;
+   pcontext->transfer_unmap = u_transfer_unmap_vtbl;
+   pcontext->transfer_flush_region = u_transfer_flush_region_vtbl;
+   pcontext->transfer_inline_write = u_transfer_inline_write_vtbl;
+}
+
+/* Install the screen-level resource hooks; handle export and destruction
+ * are dispatched through the u_resource vtbl. */
+void
+nvc0_screen_init_resource_functions(struct pipe_screen *pscreen)
+{
+   pscreen->resource_from_handle = nvc0_resource_from_handle;
+   pscreen->resource_create = nvc0_resource_create;
+   pscreen->resource_destroy = u_resource_destroy_vtbl;
+   pscreen->resource_get_handle = u_resource_get_handle_vtbl;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h
new file mode 100644
index 00000000000..0d5f026d6e1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.h
@@ -0,0 +1,58 @@
+
+#ifndef __NVC0_RESOURCE_H__
+#define __NVC0_RESOURCE_H__
+
+#include "nv50/nv50_resource.h"
+
+#define NVC0_RESOURCE_FLAG_VIDEO (NOUVEAU_RESOURCE_FLAG_DRV_PRIV << 0)
+
+
+/* Decode the tile-mode nibbles into per-axis shifts and sizes
+ * (base tile: 64 bytes in x, 8 rows in y, 1 layer in z). */
+#define NVC0_TILE_SHIFT_X(m) ((((m) >> 0) & 0xf) + 6)
+#define NVC0_TILE_SHIFT_Y(m) ((((m) >> 4) & 0xf) + 3)
+#define NVC0_TILE_SHIFT_Z(m) ((((m) >> 8) & 0xf) + 0)
+
+#define NVC0_TILE_SIZE_X(m) (64 << (((m) >> 0) & 0xf))
+#define NVC0_TILE_SIZE_Y(m) ( 8 << (((m) >> 4) & 0xf))
+#define NVC0_TILE_SIZE_Z(m) ( 1 << (((m) >> 8) & 0xf))
+
+/* it's ok to mask only in the end because max value is 3 * 5 */
+
+#define NVC0_TILE_SIZE_2D(m) ((64 * 8) << (((m) + ((m) >> 4)) & 0xf))
+
+#define NVC0_TILE_SIZE(m) ((64 * 8) << (((m) + ((m) >> 4) + ((m) >> 8)) & 0xf))
+
+
+void
+nvc0_init_resource_functions(struct pipe_context *pcontext);
+
+void
+nvc0_screen_init_resource_functions(struct pipe_screen *pscreen);
+
+/* Internal functions:
+ */
+struct pipe_resource *
+nvc0_miptree_create(struct pipe_screen *pscreen,
+                    const struct pipe_resource *tmp);
+
+/* Declared extern: without it, every translation unit including this header
+ * emits its own (tentative) definition of the const vtbl object, which
+ * breaks linking without -fcommon. The single definition lives in the
+ * miptree implementation file. */
+extern const struct u_resource_vtbl nvc0_miptree_vtbl;
+
+struct pipe_surface *
+nvc0_miptree_surface_new(struct pipe_context *,
+                         struct pipe_resource *,
+                         const struct pipe_surface *templ);
+
+unsigned
+nvc0_mt_zslice_offset(const struct nv50_miptree *, unsigned l, unsigned z);
+
+void *
+nvc0_miptree_transfer_map(struct pipe_context *pctx,
+                          struct pipe_resource *res,
+                          unsigned level,
+                          unsigned usage,
+                          const struct pipe_box *box,
+                          struct pipe_transfer **ptransfer);
+void
+nvc0_miptree_transfer_unmap(struct pipe_context *pcontext,
+                            struct pipe_transfer *ptx);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
new file mode 100644
index 00000000000..ff7890bbac4
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -0,0 +1,967 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "pipe/p_screen.h"
+
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
+
+#include "nouveau_vp3_video.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_screen.h"
+
+#include "nvc0/nvc0_graph_macros.h"
+
+#ifndef NOUVEAU_GETPARAM_GRAPH_UNITS
+# define NOUVEAU_GETPARAM_GRAPH_UNITS 13
+#endif
+
+/* Report whether the hardware supports `format` for the requested target,
+ * multisample count and set of bindings.
+ */
+static boolean
+nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
+                                enum pipe_format format,
+                                enum pipe_texture_target target,
+                                unsigned sample_count,
+                                unsigned bindings)
+{
+   /* supported sample counts: 0, 1, 2, 4 or 8 (bitmask 0x117) */
+   if (sample_count > 8 || !(0x117 & (1 << sample_count)))
+      return FALSE;
+
+   if (!util_format_is_supported(format, bindings))
+      return FALSE;
+
+   /* no sampling of 96-bit (3 x 32) texel formats, except from buffers */
+   if ((bindings & PIPE_BIND_SAMPLER_VIEW) && target != PIPE_BUFFER &&
+       util_format_get_blocksizebits(format) == 3 * 32)
+      return FALSE;
+
+   /* transfers & shared are always supported */
+   bindings &= ~(PIPE_BIND_TRANSFER_READ |
+                 PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_SHARED);
+
+   return (nvc0_format_table[format].usage & bindings) == bindings;
+}
+
+/* Integer screen capability query (pipe_screen::get_param).
+ * A few caps differ between Fermi (NVC0) and Kepler (NVE4+), distinguished
+ * via the bound 3D object class.
+ */
+static int
+nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
+   switch (param) {
+   case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+      return 16 * 5; /* 16 samplers for each of the 5 shader stages */
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return 15;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return (class_3d >= NVE4_3D_CLASS) ? 13 : 12;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return 2048;
+   case PIPE_CAP_MIN_TEXEL_OFFSET:
+      return -8;
+   case PIPE_CAP_MAX_TEXEL_OFFSET:
+      return 7;
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+      return 65536;
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+      return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; /* Kepler only */
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_TGSI_TEXCOORD:
+      return 1;
+   case PIPE_CAP_SM3:
+      return 1;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL:
+      return 150;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return 8;
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+      return 1;
+   case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+   case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+      return 1;
+   case PIPE_CAP_QUERY_TIMESTAMP:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+      return 1;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+      return 4;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+      return 128;
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 0;
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+      return 0;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+   case PIPE_CAP_CONDITIONAL_RENDER:
+   case PIPE_CAP_TEXTURE_BARRIER:
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+   case PIPE_CAP_START_INSTANCE:
+      return 1;
+   case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+      return 0; /* state trackers will know better */
+   case PIPE_CAP_USER_CONSTANT_BUFFERS:
+   case PIPE_CAP_USER_INDEX_BUFFERS:
+   case PIPE_CAP_USER_VERTEX_BUFFERS:
+      return 1;
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+      return 256;
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+      return 1; /* 256 for binding as RT, but that's not possible in GL */
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return NOUVEAU_MIN_BUFFER_MAP_ALIGN;
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+      return 0;
+   case PIPE_CAP_COMPUTE:
+      return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; /* see nvc0_screen_init_compute */
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return 1;
+   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+      return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50;
+   case PIPE_CAP_ENDIANNESS:
+      return PIPE_ENDIAN_LITTLE;
+   default:
+      NOUVEAU_ERR("unknown PIPE_CAP %d\n", param);
+      return 0;
+   }
+}
+
+/* Per-shader-stage capability query (pipe_screen::get_shader_param).
+ * Unsupported stages (tessellation; compute on pre-Kepler) report 0 for
+ * every cap via the early returns in the first switch.
+ */
+static int
+nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
+                             enum pipe_shader_cap param)
+{
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
+   switch (shader) {
+   case PIPE_SHADER_VERTEX:
+   /*
+   case PIPE_SHADER_TESSELLATION_CONTROL:
+   case PIPE_SHADER_TESSELLATION_EVALUATION:
+   */
+   case PIPE_SHADER_GEOMETRY:
+   case PIPE_SHADER_FRAGMENT:
+      break;
+   case PIPE_SHADER_COMPUTE:
+      /* compute is only exposed on Kepler here */
+      if (class_3d < NVE4_3D_CLASS)
+         return 0;
+      break;
+   default:
+      return 0;
+   }
+
+   switch (param) {
+   case PIPE_SHADER_CAP_PREFERRED_IR:
+      return PIPE_SHADER_IR_TGSI;
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+      return 16384;
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      return 16;
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      if (shader == PIPE_SHADER_VERTEX)
+         return 32;
+      /* NOTE: These only count our slots for GENERIC varyings.
+       * The address space may be larger, but the actual hard limit seems to be
+       * less than what the address space layout permits, so don't add TEXCOORD,
+       * COLOR, etc. here.
+       */
+      if (shader == PIPE_SHADER_FRAGMENT)
+         return 0x1f0 / 16;
+      /* Actually this counts CLIPVERTEX, which occupies the last generic slot,
+       * and excludes 0x60 per-patch inputs.
+       */
+      return 0x200 / 16;
+   case PIPE_SHADER_CAP_MAX_CONSTS:
+      return 65536 / 16; /* one 64 KiB constant buffer, in vec4 units */
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
+         return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
+      return NVC0_MAX_PIPE_CONSTBUFS;
+   case PIPE_SHADER_CAP_MAX_ADDRS:
+      return 1;
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      return shader != PIPE_SHADER_FRAGMENT;
+   case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      return 1;
+   case PIPE_SHADER_CAP_MAX_PREDS:
+      return 0;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return NVC0_CAP_MAX_PROGRAM_TEMPS;
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+      return 0;
+   case PIPE_SHADER_CAP_SUBROUTINES:
+      return 1;
+   case PIPE_SHADER_CAP_INTEGERS:
+      return 1;
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+      return 16; /* would be 32 in linked (OpenGL-style) mode */
+   /*
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLER_VIEWS:
+      return 32;
+   */
+   default:
+      NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
+      return 0;
+   }
+}
+
+/* Float-valued screen capability query (pipe_screen::get_paramf):
+ * rasterizer line/point limits and texture filtering limits.
+ */
+static float
+nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+   switch (param) {
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      return 10.0f;
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+      return 63.0f;
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return 63.375f;
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return 16.0f;
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 15.0f;
+   default:
+      /* this switch handles enum pipe_capf, so say PIPE_CAPF, not PIPE_CAP */
+      NOUVEAU_ERR("unknown PIPE_CAPF %d\n", param);
+      return 0.0f;
+   }
+}
+
+/* Compute capability query (pipe_screen::get_compute_param).
+ * Writes the requested values into `data` as uint64_t's and returns the
+ * number of bytes written (0 for unknown params).
+ */
+static int
+nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
+                              enum pipe_compute_cap param, void *data)
+{
+   uint64_t *data64 = (uint64_t *)data;
+   const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
+
+   switch (param) {
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      data64[0] = 3;
+      return 8;
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      /* Kepler supports much larger grids in x than Fermi */
+      data64[0] = (obj_class >= NVE4_COMPUTE_CLASS) ? 0x7fffffff : 65535;
+      data64[1] = 65535;
+      data64[2] = 65535;
+      return 24;
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      data64[0] = 1024;
+      data64[1] = 1024;
+      data64[2] = 64;
+      return 24;
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      data64[0] = 1024;
+      return 8;
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */
+      data64[0] = (uint64_t)1 << 40;
+      return 8;
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
+      data64[0] = 48 << 10;
+      return 8;
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
+      data64[0] = 512 << 10;
+      return 8;
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
+      data64[0] = 4096;
+      return 8;
+   default:
+      return 0;
+   }
+}
+
+/* Tear down the screen: wait for outstanding GPU work, then release all
+ * buffer objects, heaps and engine objects. Also used as the error path of
+ * nvc0_screen_create, so every resource it touches must tolerate being NULL
+ * or partially initialized.
+ */
+static void
+nvc0_screen_destroy(struct pipe_screen *pscreen)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+
+   /* make sure the GPU is idle before freeing the memory it may still use */
+   if (screen->base.fence.current) {
+      nouveau_fence_wait(screen->base.fence.current);
+      nouveau_fence_ref(NULL, &screen->base.fence.current);
+   }
+   if (screen->base.pushbuf)
+      screen->base.pushbuf->user_priv = NULL;
+
+   if (screen->blitter)
+      nvc0_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nvc0_program_destroy(NULL, screen->pm.prog);
+   }
+
+   nouveau_bo_ref(NULL, &screen->text);
+   nouveau_bo_ref(NULL, &screen->uniform_bo);
+   nouveau_bo_ref(NULL, &screen->tls);
+   nouveau_bo_ref(NULL, &screen->txc);
+   nouveau_bo_ref(NULL, &screen->fence.bo);
+   nouveau_bo_ref(NULL, &screen->poly_cache);
+   nouveau_bo_ref(NULL, &screen->parm);
+
+   nouveau_heap_destroy(&screen->lib_code);
+   nouveau_heap_destroy(&screen->text_heap);
+
+   /* tsc.entries points into the same allocation (see nvc0_screen_create) */
+   FREE(screen->tic.entries);
+
+   nouveau_mm_destroy(screen->mm_VRAM_fe0);
+
+   nouveau_object_del(&screen->eng3d);
+   nouveau_object_del(&screen->eng2d);
+   nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->compute);
+
+   nouveau_screen_fini(&screen->base);
+
+   FREE(screen);
+}
+
+/* Upload a graph macro program (`size` bytes of `data`) into the macro code
+ * store at word position `pos` and bind it to macro method `m`.
+ * Returns the next free upload position so calls can be chained
+ * (see MK_MACRO in nvc0_screen_create).
+ */
+static int
+nvc0_graph_set_macro(struct nvc0_screen *screen, uint32_t m, unsigned pos,
+                     unsigned size, const uint32_t *data)
+{
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+
+   size /= 4; /* bytes -> 32-bit words */
+
+   /* macro methods start at 0x3800, one macro per 8 methods — TODO confirm */
+   BEGIN_NVC0(push, SUBC_3D(NVC0_GRAPH_MACRO_ID), 2);
+   PUSH_DATA (push, (m - 0x3800) / 8);
+   PUSH_DATA (push, pos);
+   BEGIN_1IC0(push, SUBC_3D(NVC0_GRAPH_MACRO_UPLOAD_POS), size + 1);
+   PUSH_DATA (push, pos);
+   PUSH_DATAp(push, data, size);
+
+   return pos + size;
+}
+
+/* Program assorted undocumented ("magic") 3D-class registers to known-good
+ * values. The method offsets have no public names; values were presumably
+ * taken from traces of the binary driver — do not reorder without testing.
+ */
+static void
+nvc0_magic_3d_init(struct nouveau_pushbuf *push, uint16_t obj_class)
+{
+   BEGIN_NVC0(push, SUBC_3D(0x10cc), 1);
+   PUSH_DATA (push, 0xff);
+   BEGIN_NVC0(push, SUBC_3D(0x10e0), 2);
+   PUSH_DATA (push, 0xff);
+   PUSH_DATA (push, 0xff);
+   BEGIN_NVC0(push, SUBC_3D(0x10ec), 2);
+   PUSH_DATA (push, 0xff);
+   PUSH_DATA (push, 0xff);
+   BEGIN_NVC0(push, SUBC_3D(0x074c), 1);
+   PUSH_DATA (push, 0x3f);
+
+   BEGIN_NVC0(push, SUBC_3D(0x16a8), 1);
+   PUSH_DATA (push, (3 << 16) | 3);
+   BEGIN_NVC0(push, SUBC_3D(0x1794), 1);
+   PUSH_DATA (push, (2 << 16) | 2);
+   BEGIN_NVC0(push, SUBC_3D(0x0de8), 1);
+   PUSH_DATA (push, 1);
+
+   BEGIN_NVC0(push, SUBC_3D(0x12ac), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, SUBC_3D(0x0218), 1);
+   PUSH_DATA (push, 0x10);
+   BEGIN_NVC0(push, SUBC_3D(0x10fc), 1);
+   PUSH_DATA (push, 0x10);
+   BEGIN_NVC0(push, SUBC_3D(0x1290), 1);
+   PUSH_DATA (push, 0x10);
+   BEGIN_NVC0(push, SUBC_3D(0x12d8), 2);
+   PUSH_DATA (push, 0x10);
+   PUSH_DATA (push, 0x10);
+   BEGIN_NVC0(push, SUBC_3D(0x1140), 1);
+   PUSH_DATA (push, 0x10);
+   BEGIN_NVC0(push, SUBC_3D(0x1610), 1);
+   PUSH_DATA (push, 0xe);
+
+   BEGIN_NVC0(push, SUBC_3D(0x164c), 1);
+   PUSH_DATA (push, 1 << 12);
+   BEGIN_NVC0(push, SUBC_3D(0x030c), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, SUBC_3D(0x0300), 1);
+   PUSH_DATA (push, 3);
+
+   BEGIN_NVC0(push, SUBC_3D(0x02d0), 1);
+   PUSH_DATA (push, 0x3fffff);
+   BEGIN_NVC0(push, SUBC_3D(0x0fdc), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, SUBC_3D(0x19c0), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, SUBC_3D(0x075c), 1);
+   PUSH_DATA (push, 3);
+
+   /* one additional register only exists on Kepler */
+   if (obj_class >= NVE4_3D_CLASS) {
+      BEGIN_NVC0(push, SUBC_3D(0x07fc), 1);
+      PUSH_DATA (push, 1);
+   }
+
+   /* TODO: find out what software methods 0x1528, 0x1280 and (on nve4) 0x02dc
+    * are supposed to do */
+}
+
+/* Emit a fence: have the 3D engine write the next sequence number into the
+ * fence buffer (a 32-bit QUERY_GET write), so nvc0_screen_fence_update can
+ * poll for completion.
+ */
+static void
+nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
+{
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+
+   /* we need to do it after possible flush in MARK_RING */
+   *sequence = ++screen->base.fence.sequence;
+
+   BEGIN_NVC0(push, NVC0_3D(QUERY_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, screen->fence.bo->offset);
+   PUSH_DATA (push, screen->fence.bo->offset);
+   PUSH_DATA (push, *sequence);
+   PUSH_DATA (push, NVC0_3D_QUERY_GET_FENCE | NVC0_3D_QUERY_GET_SHORT |
+              (0xf << NVC0_3D_QUERY_GET_UNIT__SHIFT));
+}
+
+/* Read back the sequence number most recently written by the GPU into the
+ * fence buffer (see nvc0_screen_fence_emit).
+ */
+static u32
+nvc0_screen_fence_update(struct pipe_screen *pscreen)
+{
+   const struct nvc0_screen *screen = nvc0_screen(pscreen);
+
+   return screen->fence.map[0];
+}
+
+/* Hook up the compute capability query and initialize the compute engine
+ * for the detected chipset generation. Returns 0 on success (including the
+ * deliberate no-op on Fermi), negative on unsupported chipsets.
+ */
+static int
+nvc0_screen_init_compute(struct nvc0_screen *screen)
+{
+   screen->base.base.get_compute_param = nvc0_screen_get_compute_param;
+
+   switch (screen->base.device->chipset & 0xf0) {
+   case 0xc0:
+   case 0xd0:
+      /* Using COMPUTE has weird effects on 3D state, we need to
+       * investigate this further before enabling it by default.
+       */
+      if (debug_get_bool_option("NVC0_COMPUTE", FALSE))
+         return nvc0_screen_compute_setup(screen, screen->base.pushbuf);
+      return 0;
+   case 0xe0:
+   case 0xf0:
+      return nve4_screen_compute_setup(screen, screen->base.pushbuf);
+   default:
+      return -1;
+   }
+}
+
+/* (Re)allocate the thread-local storage area sized for `lpos`/`lneg` local
+ * memory words and a `cstack` call stack per thread.
+ * NOTE(review): the old BO is only dropped after the new allocation
+ * succeeds, so failure leaves the previous TLS area intact.
+ */
+boolean
+nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
+                            uint32_t lpos, uint32_t lneg, uint32_t cstack)
+{
+   struct nouveau_bo *bo = NULL;
+   int ret;
+   /* per-thread size; the * 32 presumably scales to a full 32-thread warp */
+   uint64_t size = (lpos + lneg) * 32 + cstack;
+
+   if (size >= (1 << 20)) {
+      NOUVEAU_ERR("requested TLS size too large: 0x%"PRIx64"\n", size);
+      return FALSE;
+   }
+
+   size *= (screen->base.device->chipset >= 0xe0) ? 64 : 48; /* max warps */
+   size = align(size, 0x8000);
+   size *= screen->mp_count;
+
+   /* align total size to the BO alignment used below */
+   size = align(size, 1 << 17);
+
+   ret = nouveau_bo_new(screen->base.device, NOUVEAU_BO_VRAM, 1 << 17, size,
+                        NULL, &bo);
+   if (ret) {
+      NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size);
+      return FALSE;
+   }
+   nouveau_bo_ref(NULL, &screen->tls);
+   screen->tls = bo;
+   return TRUE;
+}
+
+/* Error helper for nvc0_screen_create: log, tear down and bail out.
+ * Safe because nvc0_screen_destroy tolerates partial initialization.
+ */
+#define FAIL_SCREEN_INIT(str, err)                    \
+   do {                                              \
+      NOUVEAU_ERR(str, err);                         \
+      nvc0_screen_destroy(pscreen);                  \
+      return NULL;                                   \
+   } while(0)
+
+/* Create and fully initialize a Gallium screen for a Fermi/Kepler (nvc0+)
+ * device: allocate engine objects and VRAM buffers, and emit the initial
+ * hardware state into the push buffer. Returns NULL on failure (all partial
+ * state is cleaned up via nvc0_screen_destroy).
+ */
+struct pipe_screen *
+nvc0_screen_create(struct nouveau_device *dev)
+{
+   struct nvc0_screen *screen;
+   struct pipe_screen *pscreen;
+   struct nouveau_object *chan;
+   struct nouveau_pushbuf *push;
+   uint64_t value;
+   uint32_t obj_class;
+   int ret;
+   unsigned i;
+   union nouveau_bo_config mm_config;
+
+   /* only Fermi (0xc0/0xd0) and Kepler (0xe0/0xf0) chipsets are handled */
+   switch (dev->chipset & ~0xf) {
+   case 0xc0:
+   case 0xd0:
+   case 0xe0:
+   case 0xf0:
+      break;
+   default:
+      return NULL;
+   }
+
+   screen = CALLOC_STRUCT(nvc0_screen);
+   if (!screen)
+      return NULL;
+   pscreen = &screen->base.base;
+
+   ret = nouveau_screen_init(&screen->base, dev);
+   if (ret) {
+      nvc0_screen_destroy(pscreen);
+      return NULL;
+   }
+   chan = screen->base.channel;
+   push = screen->base.pushbuf;
+   push->user_priv = screen;
+   push->rsvd_kick = 5;
+
+   screen->base.vidmem_bindings |= PIPE_BIND_CONSTANT_BUFFER |
+      PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
+   screen->base.sysmem_bindings |=
+      PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
+
+   /* fill in the pipe_screen vtable */
+   pscreen->destroy = nvc0_screen_destroy;
+   pscreen->context_create = nvc0_create;
+   pscreen->is_format_supported = nvc0_screen_is_format_supported;
+   pscreen->get_param = nvc0_screen_get_param;
+   pscreen->get_shader_param = nvc0_screen_get_shader_param;
+   pscreen->get_paramf = nvc0_screen_get_paramf;
+   pscreen->get_driver_query_info = nvc0_screen_get_driver_query_info;
+
+   nvc0_screen_init_resource_functions(pscreen);
+
+   screen->base.base.get_video_param = nouveau_vp3_screen_get_video_param;
+   screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
+
+   /* small mappable GART buffer used for fences and 2D notifiers */
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, 4096, NULL,
+                        &screen->fence.bo);
+   if (ret)
+      goto fail;
+   nouveau_bo_map(screen->fence.bo, 0, NULL);
+   screen->fence.map = screen->fence.bo->map;
+   screen->base.fence.emit = nvc0_screen_fence_emit;
+   screen->base.fence.update = nvc0_screen_fence_update;
+
+   /* select the memory-to-memory transfer class for this generation */
+   switch (dev->chipset & 0xf0) {
+   case 0xf0:
+      obj_class = NVF0_P2MF_CLASS;
+      break;
+   case 0xe0:
+      obj_class = NVE4_P2MF_CLASS;
+      break;
+   default:
+      obj_class = NVC0_M2MF_CLASS;
+      break;
+   }
+   ret = nouveau_object_new(chan, 0xbeef323f, obj_class, NULL, 0,
+                            &screen->m2mf);
+   if (ret)
+      FAIL_SCREEN_INIT("Error allocating PGRAPH context for M2MF: %d\n", ret);
+
+   BEGIN_NVC0(push, SUBC_M2MF(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->m2mf->oclass);
+   if (screen->m2mf->oclass == NVE4_P2MF_CLASS) {
+      /* Kepler also needs the separate COPY class on its own subchannel */
+      BEGIN_NVC0(push, SUBC_COPY(NV01_SUBCHAN_OBJECT), 1);
+      PUSH_DATA (push, 0xa0b5);
+   }
+
+   ret = nouveau_object_new(chan, 0xbeef902d, NVC0_2D_CLASS, NULL, 0,
+                            &screen->eng2d);
+   if (ret)
+      FAIL_SCREEN_INIT("Error allocating PGRAPH context for 2D: %d\n", ret);
+
+   /* initial 2D engine state */
+   BEGIN_NVC0(push, SUBC_2D(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->eng2d->oclass);
+   BEGIN_NVC0(push, NVC0_2D(SINGLE_GPC), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_2D(OPERATION), 1);
+   PUSH_DATA (push, NVC0_2D_OPERATION_SRCCOPY);
+   BEGIN_NVC0(push, NVC0_2D(CLIP_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_2D(COLOR_KEY_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, SUBC_2D(0x0884), 1);
+   PUSH_DATA (push, 0x3f);
+   BEGIN_NVC0(push, SUBC_2D(0x0888), 1);
+   PUSH_DATA (push, 1);
+
+   /* 2D notifier lives at offset 16 in the fence BO */
+   BEGIN_NVC0(push, SUBC_2D(NVC0_GRAPH_NOTIFY_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->fence.bo->offset + 16);
+   PUSH_DATA (push, screen->fence.bo->offset + 16);
+
+   /* select the 3D class matching the exact chipset */
+   switch (dev->chipset & 0xf0) {
+   case 0xf0:
+      obj_class = NVF0_3D_CLASS;
+      break;
+   case 0xe0:
+      obj_class = NVE4_3D_CLASS;
+      break;
+   case 0xd0:
+   case 0xc0:
+   default:
+      switch (dev->chipset) {
+      case 0xd9:
+      case 0xc8:
+         obj_class = NVC8_3D_CLASS;
+         break;
+      case 0xc1:
+         obj_class = NVC1_3D_CLASS;
+         break;
+      default:
+         obj_class = NVC0_3D_CLASS;
+         break;
+      }
+      break;
+   }
+   ret = nouveau_object_new(chan, 0xbeef003d, obj_class, NULL, 0,
+                            &screen->eng3d);
+   if (ret)
+      FAIL_SCREEN_INIT("Error allocating PGRAPH context for 3D: %d\n", ret);
+   screen->base.class_3d = obj_class;
+
+   BEGIN_NVC0(push, SUBC_3D(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->eng3d->oclass);
+
+   BEGIN_NVC0(push, NVC0_3D(COND_MODE), 1);
+   PUSH_DATA (push, NVC0_3D_COND_MODE_ALWAYS);
+
+   if (debug_get_bool_option("NOUVEAU_SHADER_WATCHDOG", TRUE)) {
+      /* kill shaders after about 1 second (at 100 MHz) */
+      BEGIN_NVC0(push, NVC0_3D(WATCHDOG_TIMER), 1);
+      PUSH_DATA (push, 0x17);
+   }
+
+   /* RT/zeta compression needs kernel support (drm >= 1.0.1.1) */
+   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), dev->drm_version >= 0x01000101);
+   BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8);
+   for (i = 0; i < 8; ++i)
+      PUSH_DATA(push, dev->drm_version >= 0x01000101);
+
+   BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
+   PUSH_DATA (push, 1);
+
+   BEGIN_NVC0(push, NVC0_3D(CSAA_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_ENABLE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, NVC0_3D_MULTISAMPLE_MODE_MS1);
+   BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_CTRL), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(LINE_WIDTH_SEPARATE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(LINE_LAST_PIXEL), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(BLEND_SEPARATE_ALPHA), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(BLEND_ENABLE_COMMON), 1);
+   PUSH_DATA (push, 0);
+   if (screen->eng3d->oclass < NVE4_3D_CLASS) {
+      BEGIN_NVC0(push, NVC0_3D(TEX_MISC), 1);
+      PUSH_DATA (push, NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP);
+   } else {
+      BEGIN_NVC0(push, NVE4_3D(TEX_CB_INDEX), 1);
+      PUSH_DATA (push, 15);
+   }
+   BEGIN_NVC0(push, NVC0_3D(CALL_LIMIT_LOG), 1);
+   PUSH_DATA (push, 8); /* 128 */
+   BEGIN_NVC0(push, NVC0_3D(ZCULL_STATCTRS_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   if (screen->eng3d->oclass >= NVC1_3D_CLASS) {
+      BEGIN_NVC0(push, NVC0_3D(CACHE_SPLIT), 1);
+      PUSH_DATA (push, NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1);
+   }
+
+   nvc0_magic_3d_init(push, screen->eng3d->oclass);
+
+   /* 1 MiB of VRAM for shader code */
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+                        &screen->text);
+   if (ret)
+      goto fail;
+
+   /* XXX: getting a page fault at the end of the code buffer every few
+    *  launches, don't use the last 256 bytes to work around them - prefetch ?
+    */
+   nouveau_heap_init(&screen->text_heap, 0, (1 << 20) - 0x100);
+
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 12, 6 << 16, NULL,
+                        &screen->uniform_bo);
+   if (ret)
+      goto fail;
+
+   /* bind the driver constant buffer (slot 15) for each of the 5 stages */
+   for (i = 0; i < 5; ++i) {
+      /* TIC and TSC entries for each unit (nve4+ only) */
+      /* auxiliary constants (6 user clip planes, base instance id) */
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
+      PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (i << 9));
+      BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
+      PUSH_DATA (push, (15 << 4) | 1);
+      if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
+         unsigned j;
+         BEGIN_1IC0(push, NVC0_3D(CB_POS), 9);
+         PUSH_DATA (push, 0);
+         for (j = 0; j < 8; ++j)
+            PUSH_DATA(push, j);
+      } else {
+         BEGIN_NVC0(push, NVC0_3D(TEX_LIMITS(i)), 1);
+         PUSH_DATA (push, 0x54);
+      }
+   }
+   BEGIN_NVC0(push, NVC0_3D(LINKED_TSC), 1);
+   PUSH_DATA (push, 0);
+
+   /* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, 256);
+   PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+   PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+   BEGIN_1IC0(push, NVC0_3D(CB_POS), 5);
+   PUSH_DATA (push, 0);
+   PUSH_DATAf(push, 0.0f);
+   PUSH_DATAf(push, 0.0f);
+   PUSH_DATAf(push, 0.0f);
+   PUSH_DATAf(push, 0.0f);
+   BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+   PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
+
+   /* query the MP count from the kernel, or fall back to generation defaults */
+   if (dev->drm_version >= 0x01000101) {
+      ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
+      if (ret) {
+         NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n");
+         goto fail;
+      }
+   } else {
+      if (dev->chipset >= 0xe0 && dev->chipset < 0xf0)
+         value = (8 << 8) | 4;
+      else
+         value = (16 << 8) | 4;
+   }
+   screen->mp_count = value >> 8;
+   screen->mp_count_compute = screen->mp_count;
+
+   nvc0_screen_resize_tls_area(screen, 128 * 16, 0, 0x200);
+
+   BEGIN_NVC0(push, NVC0_3D(CODE_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->text->offset);
+   PUSH_DATA (push, screen->text->offset);
+   BEGIN_NVC0(push, NVC0_3D(TEMP_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, screen->tls->offset);
+   PUSH_DATA (push, screen->tls->offset);
+   PUSH_DATA (push, screen->tls->size >> 32);
+   PUSH_DATA (push, screen->tls->size);
+   BEGIN_NVC0(push, NVC0_3D(WARP_TEMP_ALLOC), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(LOCAL_BASE), 1);
+   PUSH_DATA (push, 0);
+
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+                        &screen->poly_cache);
+   if (ret)
+      goto fail;
+
+   BEGIN_NVC0(push, NVC0_3D(VERTEX_QUARANTINE_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->poly_cache->offset);
+   PUSH_DATA (push, screen->poly_cache->offset);
+   PUSH_DATA (push, 3);
+
+   /* texture image (TIC, offset 0) and sampler (TSC, offset 64 KiB) tables */
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 17, NULL,
+                        &screen->txc);
+   if (ret)
+      goto fail;
+
+   BEGIN_NVC0(push, NVC0_3D(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
+
+   BEGIN_NVC0(push, NVC0_3D(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);
+
+   BEGIN_NVC0(push, NVC0_3D(SCREEN_Y_CONTROL), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(WINDOW_OFFSET_X), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(ZCULL_REGION), 1); /* deactivate ZCULL */
+   PUSH_DATA (push, 0x3f);
+
+   BEGIN_NVC0(push, NVC0_3D(CLIP_RECTS_MODE), 1);
+   PUSH_DATA (push, NVC0_3D_CLIP_RECTS_MODE_INSIDE_ANY);
+   BEGIN_NVC0(push, NVC0_3D(CLIP_RECT_HORIZ(0)), 8 * 2);
+   for (i = 0; i < 8 * 2; ++i)
+      PUSH_DATA(push, 0);
+   BEGIN_NVC0(push, NVC0_3D(CLIP_RECTS_EN), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(CLIPID_ENABLE), 1);
+   PUSH_DATA (push, 0);
+
+   /* neither scissors, viewport nor stencil mask should affect clears */
+   BEGIN_NVC0(push, NVC0_3D(CLEAR_FLAGS), 1);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(DEPTH_RANGE_NEAR(0)), 2);
+   PUSH_DATAf(push, 0.0f);
+   PUSH_DATAf(push, 1.0f);
+   BEGIN_NVC0(push, NVC0_3D(VIEW_VOLUME_CLIP_CTRL), 1);
+   PUSH_DATA (push, NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK1);
+
+   /* We use scissors instead of exact view volume clipping,
+    * so they're always enabled.
+    */
+   BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(0)), 3);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 8192 << 16);
+   PUSH_DATA (push, 8192 << 16);
+
+/* upload a macro and advance i to the next free code position */
+#define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n);
+
+   i = 0;
+   MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_PER_INSTANCE, nvc0_9097_per_instance_bf);
+   MK_MACRO(NVC0_3D_MACRO_BLEND_ENABLES, nvc0_9097_blend_enables);
+   MK_MACRO(NVC0_3D_MACRO_VERTEX_ARRAY_SELECT, nvc0_9097_vertex_array_select);
+   MK_MACRO(NVC0_3D_MACRO_TEP_SELECT, nvc0_9097_tep_select);
+   MK_MACRO(NVC0_3D_MACRO_GP_SELECT, nvc0_9097_gp_select);
+   MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_FRONT, nvc0_9097_poly_mode_front);
+   MK_MACRO(NVC0_3D_MACRO_POLYGON_MODE_BACK, nvc0_9097_poly_mode_back);
+
+   BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(RT_SEPARATE_FRAG_DATA), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
+   PUSH_DATA (push, 0x40);
+   BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1);
+   PUSH_DATA (push, 0x30);
+   BEGIN_NVC0(push, NVC0_3D(PATCH_VERTICES), 1);
+   PUSH_DATA (push, 3);
+   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
+   PUSH_DATA (push, 0x20);
+   BEGIN_NVC0(push, NVC0_3D(SP_SELECT(0)), 1);
+   PUSH_DATA (push, 0x00);
+
+   BEGIN_NVC0(push, NVC0_3D(POINT_COORD_REPLACE), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(POINT_RASTER_RULES), 1);
+   PUSH_DATA (push, NVC0_3D_POINT_RASTER_RULES_OGL);
+
+   IMMED_NVC0(push, NVC0_3D(EDGEFLAG), 1);
+
+   if (nvc0_screen_init_compute(screen))
+      goto fail;
+
+   /* flush the initial state to the hardware */
+   PUSH_KICK (push);
+
+   /* one allocation for both tables: TSC entries start halfway in */
+   screen->tic.entries = CALLOC(4096, sizeof(void *));
+   screen->tsc.entries = screen->tic.entries + 2048;
+
+   mm_config.nvc0.tile_mode = 0;
+   mm_config.nvc0.memtype = 0xfe0;
+   screen->mm_VRAM_fe0 = nouveau_mm_create(dev, NOUVEAU_BO_VRAM, &mm_config);
+
+   if (!nvc0_blitter_create(screen))
+      goto fail;
+
+   nouveau_fence_new(&screen->base, &screen->base.fence.current, FALSE);
+
+   return pscreen;
+
+fail:
+   nvc0_screen_destroy(pscreen);
+   return NULL;
+}
+
+/* Allocate a TIC (texture image control) slot for `entry`, evicting and
+ * invalidating whatever previously occupied it. Returns the slot index.
+ */
+int
+nvc0_screen_tic_alloc(struct nvc0_screen *screen, void *entry)
+{
+   int slot = screen->tic.next;
+
+   /* skip over slots locked by currently bound textures */
+   for (; screen->tic.lock[slot / 32] & (1 << (slot % 32));
+        slot = (slot + 1) & (NVC0_TIC_MAX_ENTRIES - 1))
+      ;
+
+   screen->tic.next = (slot + 1) & (NVC0_TIC_MAX_ENTRIES - 1);
+
+   /* invalidate the previous occupant of this slot */
+   if (screen->tic.entries[slot])
+      nv50_tic_entry(screen->tic.entries[slot])->id = -1;
+
+   screen->tic.entries[slot] = entry;
+   return slot;
+}
+
+/* Allocate a TSC (texture sampler control) slot for `entry`, evicting and
+ * invalidating whatever previously occupied it. Returns the slot index.
+ */
+int
+nvc0_screen_tsc_alloc(struct nvc0_screen *screen, void *entry)
+{
+   int slot = screen->tsc.next;
+
+   /* skip over slots locked by currently bound samplers */
+   for (; screen->tsc.lock[slot / 32] & (1 << (slot % 32));
+        slot = (slot + 1) & (NVC0_TSC_MAX_ENTRIES - 1))
+      ;
+
+   screen->tsc.next = (slot + 1) & (NVC0_TSC_MAX_ENTRIES - 1);
+
+   /* invalidate the previous occupant of this slot */
+   if (screen->tsc.entries[slot])
+      nv50_tsc_entry(screen->tsc.entries[slot])->id = -1;
+
+   screen->tsc.entries[slot] = entry;
+   return slot;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
new file mode 100644
index 00000000000..27a0c5f784d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -0,0 +1,325 @@
+#ifndef __NVC0_SCREEN_H__
+#define __NVC0_SCREEN_H__
+
+#include "nouveau_screen.h"
+#include "nouveau_mm.h"
+#include "nouveau_fence.h"
+#include "nouveau_heap.h"
+
+#include "nv_object.xml.h"
+
+#include "nvc0/nvc0_winsys.h"
+#include "nvc0/nvc0_stateobj.h"
+
+#define NVC0_TIC_MAX_ENTRIES 2048
+#define NVC0_TSC_MAX_ENTRIES 2048
+
+/* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
+#define NVC0_MAX_PIPE_CONSTBUFS 14
+#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7
+
+#define NVC0_MAX_SURFACE_SLOTS 16
+
+struct nvc0_context;
+
+struct nvc0_blitter;
+
+struct nvc0_screen {
+ struct nouveau_screen base;
+
+ struct nvc0_context *cur_ctx;
+
+ int num_occlusion_queries_active;
+
+ struct nouveau_bo *text;
+ struct nouveau_bo *parm; /* for COMPUTE */
+ struct nouveau_bo *uniform_bo; /* for 3D */
+ struct nouveau_bo *tls;
+ struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
+ struct nouveau_bo *poly_cache;
+
+ uint16_t mp_count;
+ uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */
+
+ struct nouveau_heap *text_heap;
+ struct nouveau_heap *lib_code; /* allocated from text_heap */
+
+ struct nvc0_blitter *blitter;
+
+ struct {
+ void **entries;
+ int next;
+ uint32_t lock[NVC0_TIC_MAX_ENTRIES / 32];
+ } tic;
+
+ struct {
+ void **entries;
+ int next;
+ uint32_t lock[NVC0_TSC_MAX_ENTRIES / 32];
+ } tsc;
+
+ struct {
+ struct nouveau_bo *bo;
+ uint32_t *map;
+ } fence;
+
+ struct {
+ struct nvc0_program *prog; /* compute state object to read MP counters */
+ struct pipe_query *mp_counter[8]; /* counter to query allocation */
+ uint8_t num_mp_pm_active[2];
+ boolean mp_counters_enabled;
+ } pm;
+
+ struct nouveau_mman *mm_VRAM_fe0;
+
+ struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
+ struct nouveau_object *eng2d;
+ struct nouveau_object *m2mf;
+ struct nouveau_object *compute;
+};
+
+static INLINE struct nvc0_screen *
+nvc0_screen(struct pipe_screen *screen)
+{
+ return (struct nvc0_screen *)screen;
+}
+
+
+/* Performance counter queries:
+ */
+#define NVE4_PM_QUERY_COUNT 49
+#define NVE4_PM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i))
+#define NVE4_PM_QUERY_LAST NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
+#define NVE4_PM_QUERY_PROF_TRIGGER_0 0
+#define NVE4_PM_QUERY_PROF_TRIGGER_1 1
+#define NVE4_PM_QUERY_PROF_TRIGGER_2 2
+#define NVE4_PM_QUERY_PROF_TRIGGER_3 3
+#define NVE4_PM_QUERY_PROF_TRIGGER_4 4
+#define NVE4_PM_QUERY_PROF_TRIGGER_5 5
+#define NVE4_PM_QUERY_PROF_TRIGGER_6 6
+#define NVE4_PM_QUERY_PROF_TRIGGER_7 7
+#define NVE4_PM_QUERY_LAUNCHED_WARPS 8
+#define NVE4_PM_QUERY_LAUNCHED_THREADS 9
+#define NVE4_PM_QUERY_LAUNCHED_CTA 10
+#define NVE4_PM_QUERY_INST_ISSUED1 11
+#define NVE4_PM_QUERY_INST_ISSUED2 12
+#define NVE4_PM_QUERY_INST_EXECUTED 13
+#define NVE4_PM_QUERY_LD_LOCAL 14
+#define NVE4_PM_QUERY_ST_LOCAL 15
+#define NVE4_PM_QUERY_LD_SHARED 16
+#define NVE4_PM_QUERY_ST_SHARED 17
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT 18
+#define NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS 19
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_HIT 20
+#define NVE4_PM_QUERY_L1_LOCAL_STORE_MISS 21
+#define NVE4_PM_QUERY_GLD_REQUEST 22
+#define NVE4_PM_QUERY_GST_REQUEST 23
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT 24
+#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS 25
+#define NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED 26
+#define NVE4_PM_QUERY_GST_TRANSACTIONS 27
+#define NVE4_PM_QUERY_BRANCH 28
+#define NVE4_PM_QUERY_BRANCH_DIVERGENT 29
+#define NVE4_PM_QUERY_ACTIVE_WARPS 30
+#define NVE4_PM_QUERY_ACTIVE_CYCLES 31
+#define NVE4_PM_QUERY_INST_ISSUED 32
+#define NVE4_PM_QUERY_ATOM_COUNT 33
+#define NVE4_PM_QUERY_GRED_COUNT 34
+#define NVE4_PM_QUERY_LD_SHARED_REPLAY 35
+#define NVE4_PM_QUERY_ST_SHARED_REPLAY 36
+#define NVE4_PM_QUERY_LD_LOCAL_TRANSACTIONS 37
+#define NVE4_PM_QUERY_ST_LOCAL_TRANSACTIONS 38
+#define NVE4_PM_QUERY_L1_LD_SHARED_TRANSACTIONS 39
+#define NVE4_PM_QUERY_L1_ST_SHARED_TRANSACTIONS 40
+#define NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY 41
+#define NVE4_PM_QUERY_GST_MEM_DIV_REPLAY 42
+#define NVE4_PM_QUERY_METRIC_IPC 43
+#define NVE4_PM_QUERY_METRIC_IPAC 44
+#define NVE4_PM_QUERY_METRIC_IPEC 45
+#define NVE4_PM_QUERY_METRIC_MP_OCCUPANCY 46
+#define NVE4_PM_QUERY_METRIC_MP_EFFICIENCY 47
+#define NVE4_PM_QUERY_METRIC_INST_REPLAY_OHEAD 48
+
+/*
+#define NVE4_PM_QUERY_GR_IDLE 50
+#define NVE4_PM_QUERY_BSP_IDLE 51
+#define NVE4_PM_QUERY_VP_IDLE 52
+#define NVE4_PM_QUERY_PPP_IDLE 53
+#define NVE4_PM_QUERY_CE0_IDLE 54
+#define NVE4_PM_QUERY_CE1_IDLE 55
+#define NVE4_PM_QUERY_CE2_IDLE 56
+*/
+/* L2 queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_L2_SUBP_WRITE_L1_SECTOR_QUERIES 57
+...
+*/
+/* TEX queries (PCOUNTER) */
+/*
+#define NVE4_PM_QUERY_TEX0_CACHE_SECTOR_QUERIES 58
+...
+*/
+
+#define NVC0_PM_QUERY_COUNT 31
+#define NVC0_PM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
+#define NVC0_PM_QUERY_LAST NVC0_PM_QUERY(NVC0_PM_QUERY_COUNT - 1)
+#define NVC0_PM_QUERY_INST_EXECUTED 0
+#define NVC0_PM_QUERY_BRANCH 1
+#define NVC0_PM_QUERY_BRANCH_DIVERGENT 2
+#define NVC0_PM_QUERY_ACTIVE_WARPS 3
+#define NVC0_PM_QUERY_ACTIVE_CYCLES 4
+#define NVC0_PM_QUERY_LAUNCHED_WARPS 5
+#define NVC0_PM_QUERY_LAUNCHED_THREADS 6
+#define NVC0_PM_QUERY_LD_SHARED 7
+#define NVC0_PM_QUERY_ST_SHARED 8
+#define NVC0_PM_QUERY_LD_LOCAL 9
+#define NVC0_PM_QUERY_ST_LOCAL 10
+#define NVC0_PM_QUERY_GRED_COUNT 11
+#define NVC0_PM_QUERY_ATOM_COUNT 12
+#define NVC0_PM_QUERY_GLD_REQUEST 13
+#define NVC0_PM_QUERY_GST_REQUEST 14
+#define NVC0_PM_QUERY_INST_ISSUED1_0 15
+#define NVC0_PM_QUERY_INST_ISSUED1_1 16
+#define NVC0_PM_QUERY_INST_ISSUED2_0 17
+#define NVC0_PM_QUERY_INST_ISSUED2_1 18
+#define NVC0_PM_QUERY_TH_INST_EXECUTED_0 19
+#define NVC0_PM_QUERY_TH_INST_EXECUTED_1 20
+#define NVC0_PM_QUERY_TH_INST_EXECUTED_2 21
+#define NVC0_PM_QUERY_TH_INST_EXECUTED_3 22
+#define NVC0_PM_QUERY_PROF_TRIGGER_0 23
+#define NVC0_PM_QUERY_PROF_TRIGGER_1 24
+#define NVC0_PM_QUERY_PROF_TRIGGER_2 25
+#define NVC0_PM_QUERY_PROF_TRIGGER_3 26
+#define NVC0_PM_QUERY_PROF_TRIGGER_4 27
+#define NVC0_PM_QUERY_PROF_TRIGGER_5 28
+#define NVC0_PM_QUERY_PROF_TRIGGER_6 29
+#define NVC0_PM_QUERY_PROF_TRIGGER_7 30
+
+/* Driver statistics queries:
+ */
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+
+#define NVC0_QUERY_DRV_STAT(i) (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
+#define NVC0_QUERY_DRV_STAT_COUNT 29
+#define NVC0_QUERY_DRV_STAT_LAST NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1)
+#define NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT 0
+#define NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES 1
+#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT 2
+#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID 3
+#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS 4
+#define NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ 5
+#define NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE 6
+#define NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT 7
+#define NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT 8
+#define NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT 9
+#define NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ 10
+#define NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE 11
+#define NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID 12
+#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT 13
+#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID 14
+#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS 15
+#define NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES 16
+#define NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT 17
+#define NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT 18
+#define NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT 19
+#define NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT 20
+#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY 21
+#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED 22
+#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT 23
+#define NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES 24
+#define NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT 25
+#define NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES 26
+#define NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT 27
+#define NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT 28
+
+#else
+
+#define NVC0_QUERY_DRV_STAT_COUNT 0
+
+#endif
+
+int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
+ struct pipe_driver_query_info *);
+
+boolean nvc0_blitter_create(struct nvc0_screen *);
+void nvc0_blitter_destroy(struct nvc0_screen *);
+
+void nvc0_screen_make_buffers_resident(struct nvc0_screen *);
+
+int nvc0_screen_tic_alloc(struct nvc0_screen *, void *);
+int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *);
+
+int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
+int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
+
+boolean nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
+ uint32_t lneg, uint32_t cstack);
+
+static INLINE void
+nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
+{
+ struct nvc0_screen *screen = nvc0_screen(res->base.screen);
+
+ if (res->mm) {
+ nouveau_fence_ref(screen->base.fence.current, &res->fence);
+ if (flags & NOUVEAU_BO_WR)
+ nouveau_fence_ref(screen->base.fence.current, &res->fence_wr);
+ }
+}
+
+static INLINE void
+nvc0_resource_validate(struct nv04_resource *res, uint32_t flags)
+{
+ if (likely(res->bo)) {
+ if (flags & NOUVEAU_BO_WR)
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING |
+ NOUVEAU_BUFFER_STATUS_DIRTY;
+ if (flags & NOUVEAU_BO_RD)
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ nvc0_resource_fence(res, flags);
+ }
+}
+
+struct nvc0_format {
+ uint32_t rt;
+ uint32_t tic;
+ uint32_t vtx;
+ uint32_t usage;
+};
+
+extern const struct nvc0_format nvc0_format_table[];
+
+static INLINE void
+nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
+{
+ if (tic->id >= 0)
+ screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
+}
+
+static INLINE void
+nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
+{
+ if (tsc->id >= 0)
+ screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
+}
+
+static INLINE void
+nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
+{
+ if (tic->id >= 0) {
+ screen->tic.entries[tic->id] = NULL;
+ screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
+ }
+}
+
+static INLINE void
+nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
+{
+ if (tsc->id >= 0) {
+ screen->tsc.entries[tsc->id] = NULL;
+ screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
+ }
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
new file mode 100644
index 00000000000..b820ef21df8
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "nvc0/nvc0_context.h"
+
+static INLINE void
+nvc0_program_update_context_state(struct nvc0_context *nvc0,
+ struct nvc0_program *prog, int stage)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+ if (prog && prog->need_tls) {
+ const uint32_t flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+ if (!nvc0->state.tls_required)
+ BCTX_REFN_bo(nvc0->bufctx_3d, TLS, flags, nvc0->screen->tls);
+ nvc0->state.tls_required |= 1 << stage;
+ } else {
+ if (nvc0->state.tls_required == (1 << stage))
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TLS);
+ nvc0->state.tls_required &= ~(1 << stage);
+ }
+
+ if (prog && prog->immd_size) {
+ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+ /* NOTE: may overlap code of a different shader */
+ PUSH_DATA (push, align(prog->immd_size, 0x100));
+ PUSH_DATAh(push, nvc0->screen->text->offset + prog->immd_base);
+ PUSH_DATA (push, nvc0->screen->text->offset + prog->immd_base);
+ BEGIN_NVC0(push, NVC0_3D(CB_BIND(stage)), 1);
+ PUSH_DATA (push, (14 << 4) | 1);
+
+ nvc0->state.c14_bound |= 1 << stage;
+ } else
+ if (nvc0->state.c14_bound & (1 << stage)) {
+ BEGIN_NVC0(push, NVC0_3D(CB_BIND(stage)), 1);
+ PUSH_DATA (push, (14 << 4) | 0);
+
+ nvc0->state.c14_bound &= ~(1 << stage);
+ }
+}
+
+static INLINE boolean
+nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
+{
+ if (prog->mem)
+ return TRUE;
+
+ if (!prog->translated) {
+ prog->translated = nvc0_program_translate(
+ prog, nvc0->screen->base.device->chipset);
+ if (!prog->translated)
+ return FALSE;
+ }
+
+ if (likely(prog->code_size))
+ return nvc0_program_upload_code(nvc0, prog);
+ return TRUE; /* stream output info only */
+}
+
+void
+nvc0_vertprog_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *vp = nvc0->vertprog;
+
+ if (!nvc0_program_validate(nvc0, vp))
+ return;
+ nvc0_program_update_context_state(nvc0, vp, 0);
+
+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(1)), 2);
+ PUSH_DATA (push, 0x11);
+ PUSH_DATA (push, vp->code_base);
+ BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(1)), 1);
+ PUSH_DATA (push, vp->num_gprs);
+
+ // BEGIN_NVC0(push, NVC0_3D_(0x163c), 1);
+ // PUSH_DATA (push, 0);
+}
+
+void
+nvc0_fragprog_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *fp = nvc0->fragprog;
+
+ if (!nvc0_program_validate(nvc0, fp))
+ return;
+ nvc0_program_update_context_state(nvc0, fp, 4);
+
+ if (fp->fp.early_z != nvc0->state.early_z_forced) {
+ nvc0->state.early_z_forced = fp->fp.early_z;
+ IMMED_NVC0(push, NVC0_3D(FORCE_EARLY_FRAGMENT_TESTS), fp->fp.early_z);
+ }
+
+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(5)), 2);
+ PUSH_DATA (push, 0x51);
+ PUSH_DATA (push, fp->code_base);
+ BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(5)), 1);
+ PUSH_DATA (push, fp->num_gprs);
+
+ BEGIN_NVC0(push, SUBC_3D(0x0360), 2);
+ PUSH_DATA (push, 0x20164010);
+ PUSH_DATA (push, 0x20);
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_TEST_MASK), 1);
+ PUSH_DATA (push, fp->flags[0]);
+}
+
+void
+nvc0_tctlprog_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *tp = nvc0->tctlprog;
+
+ if (tp && nvc0_program_validate(nvc0, tp)) {
+ if (tp->tp.tess_mode != ~0) {
+ BEGIN_NVC0(push, NVC0_3D(TESS_MODE), 1);
+ PUSH_DATA (push, tp->tp.tess_mode);
+ }
+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
+ PUSH_DATA (push, 0x21);
+ PUSH_DATA (push, tp->code_base);
+ BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
+ PUSH_DATA (push, tp->num_gprs);
+
+ if (tp->tp.input_patch_size <= 32)
+ IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), tp->tp.input_patch_size);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
+ PUSH_DATA (push, 0x20);
+ }
+ nvc0_program_update_context_state(nvc0, tp, 1);
+}
+
+void
+nvc0_tevlprog_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *tp = nvc0->tevlprog;
+
+ if (tp && nvc0_program_validate(nvc0, tp)) {
+ if (tp->tp.tess_mode != ~0) {
+ BEGIN_NVC0(push, NVC0_3D(TESS_MODE), 1);
+ PUSH_DATA (push, tp->tp.tess_mode);
+ }
+ BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1);
+ PUSH_DATA (push, 0x31);
+ BEGIN_NVC0(push, NVC0_3D(SP_START_ID(3)), 1);
+ PUSH_DATA (push, tp->code_base);
+ BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(3)), 1);
+ PUSH_DATA (push, tp->num_gprs);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(MACRO_TEP_SELECT), 1);
+ PUSH_DATA (push, 0x30);
+ }
+ nvc0_program_update_context_state(nvc0, tp, 2);
+}
+
+void
+nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *gp = nvc0->gmtyprog;
+
+ if (gp)
+ nvc0_program_validate(nvc0, gp);
+
+ /* we allow GPs with no code for specifying stream output state only */
+ if (gp && gp->code_size) {
+ const boolean gp_selects_layer = gp->hdr[13] & (1 << 9);
+
+ BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
+ PUSH_DATA (push, 0x41);
+ BEGIN_NVC0(push, NVC0_3D(SP_START_ID(4)), 1);
+ PUSH_DATA (push, gp->code_base);
+ BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(4)), 1);
+ PUSH_DATA (push, gp->num_gprs);
+ BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
+ PUSH_DATA (push, gp_selects_layer ? NVC0_3D_LAYER_USE_GP : 0);
+ } else {
+ IMMED_NVC0(push, NVC0_3D(LAYER), 0);
+ BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
+ PUSH_DATA (push, 0x40);
+ }
+ nvc0_program_update_context_state(nvc0, gp, 3);
+}
+
+void
+nvc0_tfb_validate(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_transform_feedback_state *tfb;
+ unsigned b;
+
+ if (nvc0->gmtyprog) tfb = nvc0->gmtyprog->tfb;
+ else
+ if (nvc0->tevlprog) tfb = nvc0->tevlprog->tfb;
+ else
+ tfb = nvc0->vertprog->tfb;
+
+ IMMED_NVC0(push, NVC0_3D(TFB_ENABLE), (tfb && nvc0->num_tfbbufs) ? 1 : 0);
+
+ if (tfb && tfb != nvc0->state.tfb) {
+ for (b = 0; b < 4; ++b) {
+ if (tfb->varying_count[b]) {
+ unsigned n = (tfb->varying_count[b] + 3) / 4;
+
+ BEGIN_NVC0(push, NVC0_3D(TFB_STREAM(b)), 3);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, tfb->varying_count[b]);
+ PUSH_DATA (push, tfb->stride[b]);
+ BEGIN_NVC0(push, NVC0_3D(TFB_VARYING_LOCS(b, 0)), n);
+ PUSH_DATAp(push, tfb->varying_index[b], n);
+
+ if (nvc0->tfbbuf[b])
+ nvc0_so_target(nvc0->tfbbuf[b])->stride = tfb->stride[b];
+ } else {
+ IMMED_NVC0(push, NVC0_3D(TFB_VARYING_COUNT(b)), 0);
+ }
+ }
+ }
+ nvc0->state.tfb = tfb;
+
+ if (!(nvc0->dirty & NVC0_NEW_TFB_TARGETS))
+ return;
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TFB);
+
+ for (b = 0; b < nvc0->num_tfbbufs; ++b) {
+ struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]);
+ struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
+
+ if (tfb)
+ targ->stride = tfb->stride[b];
+
+ if (!(nvc0->tfbbuf_dirty & (1 << b)))
+ continue;
+
+ if (!targ->clean)
+ nvc0_query_fifo_wait(push, targ->pq);
+ BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
+ PUSH_DATA (push, 1);
+ PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
+ PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
+ PUSH_DATA (push, targ->pipe.buffer_size);
+ if (!targ->clean) {
+ nvc0_query_pushbuf_submit(push, targ->pq, 0x4);
+ } else {
+ PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
+ targ->clean = FALSE;
+ }
+ BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR);
+ }
+ for (; b < 4; ++b)
+ IMMED_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 0);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
new file mode 100644
index 00000000000..e56ef0160f2
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -0,0 +1,1247 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_defines.h"
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nvc0/nvc0_stateobj.h"
+#include "nvc0/nvc0_context.h"
+
+#include "nvc0/nvc0_3d.xml.h"
+#include "nv50/nv50_texture.xml.h"
+
+#include "nouveau_gldefs.h"
+
+static INLINE uint32_t
+nvc0_colormask(unsigned mask)
+{
+ uint32_t ret = 0;
+
+ if (mask & PIPE_MASK_R)
+ ret |= 0x0001;
+ if (mask & PIPE_MASK_G)
+ ret |= 0x0010;
+ if (mask & PIPE_MASK_B)
+ ret |= 0x0100;
+ if (mask & PIPE_MASK_A)
+ ret |= 0x1000;
+
+ return ret;
+}
+
+#define NVC0_BLEND_FACTOR_CASE(a, b) \
+ case PIPE_BLENDFACTOR_##a: return NV50_3D_BLEND_FACTOR_##b
+
+static INLINE uint32_t
+nvc0_blend_fac(unsigned factor)
+{
+ switch (factor) {
+ NVC0_BLEND_FACTOR_CASE(ONE, ONE);
+ NVC0_BLEND_FACTOR_CASE(SRC_COLOR, SRC_COLOR);
+ NVC0_BLEND_FACTOR_CASE(SRC_ALPHA, SRC_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(DST_ALPHA, DST_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(DST_COLOR, DST_COLOR);
+ NVC0_BLEND_FACTOR_CASE(SRC_ALPHA_SATURATE, SRC_ALPHA_SATURATE);
+ NVC0_BLEND_FACTOR_CASE(CONST_COLOR, CONSTANT_COLOR);
+ NVC0_BLEND_FACTOR_CASE(CONST_ALPHA, CONSTANT_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(SRC1_COLOR, SRC1_COLOR);
+ NVC0_BLEND_FACTOR_CASE(SRC1_ALPHA, SRC1_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(ZERO, ZERO);
+ NVC0_BLEND_FACTOR_CASE(INV_SRC_COLOR, ONE_MINUS_SRC_COLOR);
+ NVC0_BLEND_FACTOR_CASE(INV_SRC_ALPHA, ONE_MINUS_SRC_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(INV_DST_ALPHA, ONE_MINUS_DST_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(INV_DST_COLOR, ONE_MINUS_DST_COLOR);
+ NVC0_BLEND_FACTOR_CASE(INV_CONST_COLOR, ONE_MINUS_CONSTANT_COLOR);
+ NVC0_BLEND_FACTOR_CASE(INV_CONST_ALPHA, ONE_MINUS_CONSTANT_ALPHA);
+ NVC0_BLEND_FACTOR_CASE(INV_SRC1_COLOR, ONE_MINUS_SRC1_COLOR);
+ NVC0_BLEND_FACTOR_CASE(INV_SRC1_ALPHA, ONE_MINUS_SRC1_ALPHA);
+ default:
+ return NV50_3D_BLEND_FACTOR_ZERO;
+ }
+}
+
+static void *
+nvc0_blend_state_create(struct pipe_context *pipe,
+ const struct pipe_blend_state *cso)
+{
+ struct nvc0_blend_stateobj *so = CALLOC_STRUCT(nvc0_blend_stateobj);
+ int i;
+ int r; /* reference */
+ uint32_t ms;
+ uint8_t blend_en = 0;
+ boolean indep_masks = FALSE;
+ boolean indep_funcs = FALSE;
+
+ so->pipe = *cso;
+
+ /* check which states actually have differing values */
+ if (cso->independent_blend_enable) {
+ for (r = 0; r < 8 && !cso->rt[r].blend_enable; ++r);
+ blend_en |= 1 << r;
+ for (i = r + 1; i < 8; ++i) {
+ if (!cso->rt[i].blend_enable)
+ continue;
+ blend_en |= 1 << i;
+ if (cso->rt[i].rgb_func != cso->rt[r].rgb_func ||
+ cso->rt[i].rgb_src_factor != cso->rt[r].rgb_src_factor ||
+ cso->rt[i].rgb_dst_factor != cso->rt[r].rgb_dst_factor ||
+ cso->rt[i].alpha_func != cso->rt[r].alpha_func ||
+ cso->rt[i].alpha_src_factor != cso->rt[r].alpha_src_factor ||
+ cso->rt[i].alpha_dst_factor != cso->rt[r].alpha_dst_factor) {
+ indep_funcs = TRUE;
+ break;
+ }
+ }
+ for (; i < 8; ++i)
+ blend_en |= (cso->rt[i].blend_enable ? 1 : 0) << i;
+
+ for (i = 1; i < 8; ++i) {
+ if (cso->rt[i].colormask != cso->rt[0].colormask) {
+ indep_masks = TRUE;
+ break;
+ }
+ }
+ } else {
+ r = 0;
+ if (cso->rt[0].blend_enable)
+ blend_en = 0xff;
+ }
+
+ if (cso->logicop_enable) {
+ SB_BEGIN_3D(so, LOGIC_OP_ENABLE, 2);
+ SB_DATA (so, 1);
+ SB_DATA (so, nvgl_logicop_func(cso->logicop_func));
+
+ SB_IMMED_3D(so, MACRO_BLEND_ENABLES, 0);
+ } else {
+ SB_IMMED_3D(so, LOGIC_OP_ENABLE, 0);
+
+ SB_IMMED_3D(so, BLEND_INDEPENDENT, indep_funcs);
+ SB_IMMED_3D(so, MACRO_BLEND_ENABLES, blend_en);
+ if (indep_funcs) {
+ for (i = 0; i < 8; ++i) {
+ if (cso->rt[i].blend_enable) {
+ SB_BEGIN_3D(so, IBLEND_EQUATION_RGB(i), 6);
+ SB_DATA (so, nvgl_blend_eqn(cso->rt[i].rgb_func));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[i].rgb_src_factor));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[i].rgb_dst_factor));
+ SB_DATA (so, nvgl_blend_eqn(cso->rt[i].alpha_func));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[i].alpha_src_factor));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[i].alpha_dst_factor));
+ }
+ }
+ } else
+ if (blend_en) {
+ SB_BEGIN_3D(so, BLEND_EQUATION_RGB, 5);
+ SB_DATA (so, nvgl_blend_eqn(cso->rt[r].rgb_func));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[r].rgb_src_factor));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[r].rgb_dst_factor));
+ SB_DATA (so, nvgl_blend_eqn(cso->rt[r].alpha_func));
+ SB_DATA (so, nvc0_blend_fac(cso->rt[r].alpha_src_factor));
+ SB_BEGIN_3D(so, BLEND_FUNC_DST_ALPHA, 1);
+ SB_DATA (so, nvc0_blend_fac(cso->rt[r].alpha_dst_factor));
+ }
+
+ SB_IMMED_3D(so, COLOR_MASK_COMMON, !indep_masks);
+ if (indep_masks) {
+ SB_BEGIN_3D(so, COLOR_MASK(0), 8);
+ for (i = 0; i < 8; ++i)
+ SB_DATA(so, nvc0_colormask(cso->rt[i].colormask));
+ } else {
+ SB_BEGIN_3D(so, COLOR_MASK(0), 1);
+ SB_DATA (so, nvc0_colormask(cso->rt[0].colormask));
+ }
+ }
+
+ ms = 0;
+ if (cso->alpha_to_coverage)
+ ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+ if (cso->alpha_to_one)
+ ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+
+ SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
+ SB_DATA (so, ms);
+
+ assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+ return so;
+}
+
+static void
+nvc0_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->blend = hwcso;
+ nvc0->dirty |= NVC0_NEW_BLEND;
+}
+
+static void
+nvc0_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ FREE(hwcso);
+}
+
+/* NOTE: ignoring line_last_pixel, using FALSE (set on screen init) */
+static void *
+nvc0_rasterizer_state_create(struct pipe_context *pipe,
+ const struct pipe_rasterizer_state *cso)
+{
+ struct nvc0_rasterizer_stateobj *so;
+ uint32_t reg;
+
+ so = CALLOC_STRUCT(nvc0_rasterizer_stateobj);
+ if (!so)
+ return NULL;
+ so->pipe = *cso;
+
+ /* Scissor enables are handled in scissor state, we will not want to
+ * always emit 16 commands, one for each scissor rectangle, here.
+ */
+
+ SB_BEGIN_3D(so, SHADE_MODEL, 1);
+ SB_DATA (so, cso->flatshade ? NVC0_3D_SHADE_MODEL_FLAT :
+ NVC0_3D_SHADE_MODEL_SMOOTH);
+ SB_IMMED_3D(so, PROVOKING_VERTEX_LAST, !cso->flatshade_first);
+ SB_IMMED_3D(so, VERTEX_TWO_SIDE_ENABLE, cso->light_twoside);
+
+ SB_IMMED_3D(so, VERT_COLOR_CLAMP_EN, cso->clamp_vertex_color);
+ SB_BEGIN_3D(so, FRAG_COLOR_CLAMP_EN, 1);
+ SB_DATA (so, cso->clamp_fragment_color ? 0x11111111 : 0x00000000);
+
+ SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample);
+
+ SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth);
+ if (cso->line_smooth)
+ SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1);
+ else
+ SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1);
+ SB_DATA (so, fui(cso->line_width));
+
+ SB_IMMED_3D(so, LINE_STIPPLE_ENABLE, cso->line_stipple_enable);
+ if (cso->line_stipple_enable) {
+ SB_BEGIN_3D(so, LINE_STIPPLE_PATTERN, 1);
+ SB_DATA (so, (cso->line_stipple_pattern << 8) |
+ cso->line_stipple_factor);
+
+ }
+
+ SB_IMMED_3D(so, VP_POINT_SIZE_EN, cso->point_size_per_vertex);
+ if (!cso->point_size_per_vertex) {
+ SB_BEGIN_3D(so, POINT_SIZE, 1);
+ SB_DATA (so, fui(cso->point_size));
+ }
+
+ reg = (cso->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT) ?
+ NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN_UPPER_LEFT :
+ NVC0_3D_POINT_COORD_REPLACE_COORD_ORIGIN_LOWER_LEFT;
+
+ SB_BEGIN_3D(so, POINT_COORD_REPLACE, 1);
+ SB_DATA (so, ((cso->sprite_coord_enable & 0xff) << 3) | reg);
+ SB_IMMED_3D(so, POINT_SPRITE_ENABLE, cso->point_quad_rasterization);
+ SB_IMMED_3D(so, POINT_SMOOTH_ENABLE, cso->point_smooth);
+
+ SB_BEGIN_3D(so, MACRO_POLYGON_MODE_FRONT, 1);
+ SB_DATA (so, nvgl_polygon_mode(cso->fill_front));
+ SB_BEGIN_3D(so, MACRO_POLYGON_MODE_BACK, 1);
+ SB_DATA (so, nvgl_polygon_mode(cso->fill_back));
+ SB_IMMED_3D(so, POLYGON_SMOOTH_ENABLE, cso->poly_smooth);
+
+ SB_BEGIN_3D(so, CULL_FACE_ENABLE, 3);
+ SB_DATA (so, cso->cull_face != PIPE_FACE_NONE);
+ SB_DATA (so, cso->front_ccw ? NVC0_3D_FRONT_FACE_CCW :
+ NVC0_3D_FRONT_FACE_CW);
+ switch (cso->cull_face) {
+ case PIPE_FACE_FRONT_AND_BACK:
+ SB_DATA(so, NVC0_3D_CULL_FACE_FRONT_AND_BACK);
+ break;
+ case PIPE_FACE_FRONT:
+ SB_DATA(so, NVC0_3D_CULL_FACE_FRONT);
+ break;
+ case PIPE_FACE_BACK:
+ default:
+ SB_DATA(so, NVC0_3D_CULL_FACE_BACK);
+ break;
+ }
+
+ SB_IMMED_3D(so, POLYGON_STIPPLE_ENABLE, cso->poly_stipple_enable);
+ SB_BEGIN_3D(so, POLYGON_OFFSET_POINT_ENABLE, 3);
+ SB_DATA (so, cso->offset_point);
+ SB_DATA (so, cso->offset_line);
+ SB_DATA (so, cso->offset_tri);
+
+ if (cso->offset_point || cso->offset_line || cso->offset_tri) {
+ SB_BEGIN_3D(so, POLYGON_OFFSET_FACTOR, 1);
+ SB_DATA (so, fui(cso->offset_scale));
+ SB_BEGIN_3D(so, POLYGON_OFFSET_UNITS, 1);
+ SB_DATA (so, fui(cso->offset_units * 2.0f));
+ SB_BEGIN_3D(so, POLYGON_OFFSET_CLAMP, 1);
+ SB_DATA (so, fui(cso->offset_clamp));
+ }
+
+ if (cso->depth_clip)
+ reg = NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK1;
+ else
+ reg =
+ NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK1_UNK1 |
+ NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_NEAR |
+ NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_CLAMP_FAR |
+ NVC0_3D_VIEW_VOLUME_CLIP_CTRL_UNK12_UNK2;
+
+ SB_BEGIN_3D(so, VIEW_VOLUME_CLIP_CTRL, 1);
+ SB_DATA (so, reg);
+
+ assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+ return (void *)so;
+}
+
+static void
+nvc0_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->rast = hwcso;
+ nvc0->dirty |= NVC0_NEW_RASTERIZER;
+}
+
+static void
+nvc0_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ FREE(hwcso);
+}
+
+static void *
+nvc0_zsa_state_create(struct pipe_context *pipe,
+ const struct pipe_depth_stencil_alpha_state *cso)
+{
+ struct nvc0_zsa_stateobj *so = CALLOC_STRUCT(nvc0_zsa_stateobj);
+
+ so->pipe = *cso;
+
+ SB_IMMED_3D(so, DEPTH_TEST_ENABLE, cso->depth.enabled);
+ if (cso->depth.enabled) {
+ SB_IMMED_3D(so, DEPTH_WRITE_ENABLE, cso->depth.writemask);
+ SB_BEGIN_3D(so, DEPTH_TEST_FUNC, 1);
+ SB_DATA (so, nvgl_comparison_op(cso->depth.func));
+ }
+
+ if (cso->stencil[0].enabled) {
+ SB_BEGIN_3D(so, STENCIL_ENABLE, 5);
+ SB_DATA (so, 1);
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+ SB_DATA (so, nvgl_comparison_op(cso->stencil[0].func));
+ SB_BEGIN_3D(so, STENCIL_FRONT_FUNC_MASK, 2);
+ SB_DATA (so, cso->stencil[0].valuemask);
+ SB_DATA (so, cso->stencil[0].writemask);
+ } else {
+ SB_IMMED_3D(so, STENCIL_ENABLE, 0);
+ }
+
+ if (cso->stencil[1].enabled) {
+ assert(cso->stencil[0].enabled);
+ SB_BEGIN_3D(so, STENCIL_TWO_SIDE_ENABLE, 5);
+ SB_DATA (so, 1);
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+ SB_DATA (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+ SB_DATA (so, nvgl_comparison_op(cso->stencil[1].func));
+ SB_BEGIN_3D(so, STENCIL_BACK_MASK, 2);
+ SB_DATA (so, cso->stencil[1].writemask);
+ SB_DATA (so, cso->stencil[1].valuemask);
+ } else
+ if (cso->stencil[0].enabled) {
+ SB_IMMED_3D(so, STENCIL_TWO_SIDE_ENABLE, 0);
+ }
+
+ SB_IMMED_3D(so, ALPHA_TEST_ENABLE, cso->alpha.enabled);
+ if (cso->alpha.enabled) {
+ SB_BEGIN_3D(so, ALPHA_TEST_REF, 2);
+ SB_DATA (so, fui(cso->alpha.ref_value));
+ SB_DATA (so, nvgl_comparison_op(cso->alpha.func));
+ }
+
+ assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
+ return (void *)so;
+}
+
+/* Bind a ZSA state object and flag it for re-validation on the next draw. */
+static void
+nvc0_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->zsa = hwcso;
+ nvc0->dirty |= NVC0_NEW_ZSA;
+}
+
+/* Free a ZSA state object (caller guarantees it is no longer bound). */
+static void
+nvc0_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ FREE(hwcso);
+}
+
+/* ====================== SAMPLERS AND TEXTURES ================================
+ */
+
+#define NV50_TSC_WRAP_CASE(n) \
+ case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
+
+/* Translate a PIPE_TEX_WRAP_* mode to the NV50_TSC_WRAP_* encoding.
+ * Unknown values log an error and fall back to REPEAT.
+ */
+static INLINE unsigned
+nv50_tsc_wrap_mode(unsigned wrap)
+{
+ switch (wrap) {
+ NV50_TSC_WRAP_CASE(REPEAT);
+ NV50_TSC_WRAP_CASE(MIRROR_REPEAT);
+ NV50_TSC_WRAP_CASE(CLAMP_TO_EDGE);
+ NV50_TSC_WRAP_CASE(CLAMP_TO_BORDER);
+ NV50_TSC_WRAP_CASE(CLAMP);
+ NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_EDGE);
+ NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_BORDER);
+ NV50_TSC_WRAP_CASE(MIRROR_CLAMP);
+ default:
+ NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+ return NV50_TSC_WRAP_REPEAT;
+ }
+}
+
+/* Destroy a sampler state: clear any context slots that still point at it,
+ * release its TSC entry, then free the object.
+ */
+static void
+nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ unsigned s, i;
+
+ /* scan all 5 sampler-capable stages for stale references */
+ for (s = 0; s < 5; ++s)
+ for (i = 0; i < nvc0_context(pipe)->num_samplers[s]; ++i)
+ if (nvc0_context(pipe)->samplers[s][i] == hwcso)
+ nvc0_context(pipe)->samplers[s][i] = NULL;
+
+ nvc0_screen_tsc_free(nvc0_context(pipe)->screen, nv50_tsc_entry(hwcso));
+
+ FREE(hwcso);
+}
+
+/* Bind nr sampler states for stage s: unlock the TSC entries of samplers
+ * being replaced, clear any slots beyond the new count, and flag the stage
+ * dirty.
+ */
+static INLINE void
+nvc0_stage_sampler_states_bind(struct nvc0_context *nvc0, int s,
+ unsigned nr, void **hwcso)
+{
+ unsigned i;
+
+ for (i = 0; i < nr; ++i) {
+ struct nv50_tsc_entry *old = nvc0->samplers[s][i];
+
+ if (hwcso[i] == old)
+ continue;
+ nvc0->samplers_dirty[s] |= 1 << i;
+
+ nvc0->samplers[s][i] = nv50_tsc_entry(hwcso[i]);
+ if (old)
+ nvc0_screen_tsc_unlock(nvc0->screen, old);
+ }
+ /* release samplers left bound above the new count */
+ for (; i < nvc0->num_samplers[s]; ++i) {
+ if (nvc0->samplers[s][i]) {
+ nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]);
+ nvc0->samplers[s][i] = NULL;
+ }
+ }
+
+ nvc0->num_samplers[s] = nr;
+
+ nvc0->dirty |= NVC0_NEW_SAMPLERS;
+}
+
+/* Per-stage wrappers; stage indices used here: 0 = vertex, 3 = geometry,
+ * 4 = fragment.
+ */
+static void
+nvc0_vp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+ nvc0_stage_sampler_states_bind(nvc0_context(pipe), 0, nr, s);
+}
+
+static void
+nvc0_fp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+ nvc0_stage_sampler_states_bind(nvc0_context(pipe), 4, nr, s);
+}
+
+static void
+nvc0_gp_sampler_states_bind(struct pipe_context *pipe, unsigned nr, void **s)
+{
+ nvc0_stage_sampler_states_bind(nvc0_context(pipe), 3, nr, s);
+}
+
+/* Bind sampler states for the slot range [start, start+nr) of stage s;
+ * cso == NULL unbinds the whole range.  num_samplers[s] is recomputed when
+ * the range reaches up to or past the current count.
+ */
+static void
+nvc0_stage_sampler_states_bind_range(struct nvc0_context *nvc0,
+ const unsigned s,
+ unsigned start, unsigned nr, void **cso)
+{
+ const unsigned end = start + nr;
+ int last_valid = -1;
+ unsigned i;
+
+ if (cso) {
+ for (i = start; i < end; ++i) {
+ const unsigned p = i - start;
+ if (cso[p])
+ last_valid = i;
+ if (cso[p] == nvc0->samplers[s][i])
+ continue;
+ nvc0->samplers_dirty[s] |= 1 << i;
+
+ if (nvc0->samplers[s][i])
+ nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]);
+ nvc0->samplers[s][i] = cso[p];
+ }
+ } else {
+ for (i = start; i < end; ++i) {
+ if (nvc0->samplers[s][i]) {
+ nvc0_screen_tsc_unlock(nvc0->screen, nvc0->samplers[s][i]);
+ nvc0->samplers[s][i] = NULL;
+ nvc0->samplers_dirty[s] |= 1 << i;
+ }
+ }
+ }
+
+ if (nvc0->num_samplers[s] <= end) {
+ if (last_valid < 0) {
+ /* range emptied: scan back for the highest remaining sampler */
+ for (i = start; i && !nvc0->samplers[s][i - 1]; --i);
+ nvc0->num_samplers[s] = i;
+ } else {
+ nvc0->num_samplers[s] = last_valid + 1;
+ }
+ }
+}
+
+/* Compute stage uses slot index 5 and its own dirty_cp bits. */
+static void
+nvc0_cp_sampler_states_bind(struct pipe_context *pipe,
+ unsigned start, unsigned nr, void **cso)
+{
+ nvc0_stage_sampler_states_bind_range(nvc0_context(pipe), 5, start, nr, cso);
+
+ nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SAMPLERS;
+}
+
+/* NOTE: only called when not referenced anywhere, won't be bound */
+static void
+nvc0_sampler_view_destroy(struct pipe_context *pipe,
+ struct pipe_sampler_view *view)
+{
+ /* drop the texture reference, release the TIC entry, free the view */
+ pipe_resource_reference(&view->texture, NULL);
+
+ nvc0_screen_tic_free(nvc0_context(pipe)->screen, nv50_tic_entry(view));
+
+ FREE(nv50_tic_entry(view));
+}
+
+/* Set nr sampler views for stage s: views being replaced have their bufctx
+ * binding reset and their TIC entry unlocked; views above the new count are
+ * unreferenced.
+ */
+static INLINE void
+nvc0_stage_set_sampler_views(struct nvc0_context *nvc0, int s,
+ unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ unsigned i;
+
+ for (i = 0; i < nr; ++i) {
+ struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
+
+ if (views[i] == nvc0->textures[s][i])
+ continue;
+ nvc0->textures_dirty[s] |= 1 << i;
+
+ if (old) {
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
+ nvc0_screen_tic_unlock(nvc0->screen, old);
+ }
+
+ pipe_sampler_view_reference(&nvc0->textures[s][i], views[i]);
+ }
+
+ /* release views left bound above the new count */
+ for (i = nr; i < nvc0->num_textures[s]; ++i) {
+ struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
+ if (old) {
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(s, i));
+ nvc0_screen_tic_unlock(nvc0->screen, old);
+ pipe_sampler_view_reference(&nvc0->textures[s][i], NULL);
+ }
+ }
+
+ nvc0->num_textures[s] = nr;
+
+ nvc0->dirty |= NVC0_NEW_TEXTURES;
+}
+
+/* Per-stage wrappers; stage indices match the sampler-state binders above. */
+static void
+nvc0_vp_set_sampler_views(struct pipe_context *pipe,
+ unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ nvc0_stage_set_sampler_views(nvc0_context(pipe), 0, nr, views);
+}
+
+static void
+nvc0_fp_set_sampler_views(struct pipe_context *pipe,
+ unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ nvc0_stage_set_sampler_views(nvc0_context(pipe), 4, nr, views);
+}
+
+static void
+nvc0_gp_set_sampler_views(struct pipe_context *pipe,
+ unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ nvc0_stage_set_sampler_views(nvc0_context(pipe), 3, nr, views);
+}
+
+/* Range version of sampler-view binding, shared with the compute stage
+ * (s == 5, which uses bufctx_cp and the CP_TEX binding points).
+ * views == NULL unbinds the whole range.
+ */
+static void
+nvc0_stage_set_sampler_views_range(struct nvc0_context *nvc0, const unsigned s,
+ unsigned start, unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ struct nouveau_bufctx *bctx = (s == 5) ? nvc0->bufctx_cp : nvc0->bufctx_3d;
+ const unsigned end = start + nr;
+ const unsigned bin = (s == 5) ? NVC0_BIND_CP_TEX(0) : NVC0_BIND_TEX(s, 0);
+ int last_valid = -1;
+ unsigned i;
+
+ if (views) {
+ for (i = start; i < end; ++i) {
+ const unsigned p = i - start;
+ if (views[p])
+ last_valid = i;
+ if (views[p] == nvc0->textures[s][i])
+ continue;
+ nvc0->textures_dirty[s] |= 1 << i;
+
+ if (nvc0->textures[s][i]) {
+ struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
+ nouveau_bufctx_reset(bctx, bin + i);
+ nvc0_screen_tic_unlock(nvc0->screen, old);
+ }
+ pipe_sampler_view_reference(&nvc0->textures[s][i], views[p]);
+ }
+ } else {
+ for (i = start; i < end; ++i) {
+ struct nv50_tic_entry *old = nv50_tic_entry(nvc0->textures[s][i]);
+ if (!old)
+ continue;
+ nvc0->textures_dirty[s] |= 1 << i;
+
+ nvc0_screen_tic_unlock(nvc0->screen, old);
+ pipe_sampler_view_reference(&nvc0->textures[s][i], NULL);
+ nouveau_bufctx_reset(bctx, bin + i);
+ }
+ }
+
+ if (nvc0->num_textures[s] <= end) {
+ if (last_valid < 0) {
+ /* range emptied: find the highest remaining bound texture */
+ for (i = start; i && !nvc0->textures[s][i - 1]; --i);
+ nvc0->num_textures[s] = i;
+ } else {
+ nvc0->num_textures[s] = last_valid + 1;
+ }
+ }
+}
+
+static void
+nvc0_cp_set_sampler_views(struct pipe_context *pipe,
+ unsigned start, unsigned nr,
+ struct pipe_sampler_view **views)
+{
+ nvc0_stage_set_sampler_views_range(nvc0_context(pipe), 5, start, nr, views);
+
+ nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_TEXTURES;
+}
+
+
+/* ============================= SHADERS =======================================
+ */
+
+/* Common shader state constructor: duplicate the TGSI tokens and copy any
+ * stream-output info.  'type' is one of PIPE_SHADER_*.  Returns NULL on
+ * allocation failure.
+ */
+static void *
+nvc0_sp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso, unsigned type)
+{
+ struct nvc0_program *prog;
+
+ prog = CALLOC_STRUCT(nvc0_program);
+ if (!prog)
+ return NULL;
+
+ prog->type = type;
+
+ if (cso->tokens)
+ prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+ if (cso->stream_output.num_outputs)
+ prog->pipe.stream_output = cso->stream_output;
+
+ return (void *)prog;
+}
+
+/* Shared destructor for vertex/fragment/geometry/compute programs. */
+static void
+nvc0_sp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_program *prog = (struct nvc0_program *)hwcso;
+
+ nvc0_program_destroy(nvc0_context(pipe), prog);
+
+ /* tokens were duplicated in nvc0_sp_state_create */
+ FREE((void *)prog->pipe.tokens);
+ FREE(prog);
+}
+
+/* Per-stage create/bind entry points; creation defers to
+ * nvc0_sp_state_create, binding stores the program and sets the stage's
+ * dirty bit.
+ */
+static void *
+nvc0_vp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso)
+{
+ return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_VERTEX);
+}
+
+static void
+nvc0_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->vertprog = hwcso;
+ nvc0->dirty |= NVC0_NEW_VERTPROG;
+}
+
+static void *
+nvc0_fp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso)
+{
+ return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_FRAGMENT);
+}
+
+static void
+nvc0_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->fragprog = hwcso;
+ nvc0->dirty |= NVC0_NEW_FRAGPROG;
+}
+
+static void *
+nvc0_gp_state_create(struct pipe_context *pipe,
+ const struct pipe_shader_state *cso)
+{
+ return nvc0_sp_state_create(pipe, cso, PIPE_SHADER_GEOMETRY);
+}
+
+static void
+nvc0_gp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->gmtyprog = hwcso;
+ nvc0->dirty |= NVC0_NEW_GMTYPROG;
+}
+
+/* Create a compute program: record the requested shared/private/input
+ * memory sizes from the compute CSO and duplicate its TGSI tokens.
+ */
+static void *
+nvc0_cp_state_create(struct pipe_context *pipe,
+ const struct pipe_compute_state *cso)
+{
+ struct nvc0_program *prog;
+
+ prog = CALLOC_STRUCT(nvc0_program);
+ if (!prog)
+ return NULL;
+ prog->type = PIPE_SHADER_COMPUTE;
+
+ prog->cp.smem_size = cso->req_local_mem;
+ prog->cp.lmem_size = cso->req_private_mem;
+ prog->parm_size = cso->req_input_mem;
+
+ prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog);
+
+ return (void *)prog;
+}
+
+static void
+nvc0_cp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->compprog = hwcso;
+ nvc0->dirty_cp |= NVC0_NEW_CP_PROGRAM;
+}
+
+/* Bind constant buffer 'index' for 'shader'.  cb may reference either user
+ * memory (cb->user_buffer, not allowed for compute) or a pipe resource;
+ * cb == NULL unbinds the slot.
+ */
+static void
+nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+ struct pipe_constant_buffer *cb)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct pipe_resource *res = cb ? cb->buffer : NULL;
+ const unsigned s = nvc0_shader_stage(shader);
+ const unsigned i = index;
+
+ if (unlikely(shader == PIPE_SHADER_COMPUTE)) {
+ assert(!cb || !cb->user_buffer);
+ if (nvc0->constbuf[s][i].u.buf)
+ nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_CB(i));
+
+ nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF;
+ } else {
+ /* u.buf and u.data alias in a union; only reset the bufctx slot when
+ * the previous binding was a real resource */
+ if (nvc0->constbuf[s][i].user)
+ nvc0->constbuf[s][i].u.buf = NULL;
+ else
+ if (nvc0->constbuf[s][i].u.buf)
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CB(s, i));
+
+ nvc0->dirty |= NVC0_NEW_CONSTBUF;
+ }
+ nvc0->constbuf_dirty[s] |= 1 << i;
+
+ pipe_resource_reference(&nvc0->constbuf[s][i].u.buf, res);
+
+ nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
+ if (nvc0->constbuf[s][i].user) {
+ nvc0->constbuf[s][i].u.data = cb->user_buffer;
+ nvc0->constbuf[s][i].size = cb->buffer_size;
+ } else
+ if (cb) {
+ nvc0->constbuf[s][i].offset = cb->buffer_offset;
+ /* size is rounded up to a multiple of 0x100 */
+ nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+ }
+}
+
+/* =============================================================================
+ */
+
+/* Simple parameter-state setters: copy the new value into the context and
+ * set the matching dirty bit for the next validation pass.
+ */
+static void
+nvc0_set_blend_color(struct pipe_context *pipe,
+ const struct pipe_blend_color *bcol)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->blend_colour = *bcol;
+ nvc0->dirty |= NVC0_NEW_BLEND_COLOUR;
+}
+
+static void
+nvc0_set_stencil_ref(struct pipe_context *pipe,
+ const struct pipe_stencil_ref *sr)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->stencil_ref = *sr;
+ nvc0->dirty |= NVC0_NEW_STENCIL_REF;
+}
+
+static void
+nvc0_set_clip_state(struct pipe_context *pipe,
+ const struct pipe_clip_state *clip)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ /* only the user clip planes are kept */
+ memcpy(nvc0->clip.ucp, clip->ucp, sizeof(clip->ucp));
+
+ nvc0->dirty |= NVC0_NEW_CLIP;
+}
+
+static void
+nvc0_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->sample_mask = sample_mask;
+ nvc0->dirty |= NVC0_NEW_SAMPLE_MASK;
+}
+
+
+/* Replace the framebuffer state: re-reference colour buffers and zsbuf,
+ * drop references above the new cbuf count, and reset the FB bufctx slot.
+ */
+static void
+nvc0_set_framebuffer_state(struct pipe_context *pipe,
+ const struct pipe_framebuffer_state *fb)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ unsigned i;
+
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+
+ for (i = 0; i < fb->nr_cbufs; ++i)
+ pipe_surface_reference(&nvc0->framebuffer.cbufs[i], fb->cbufs[i]);
+ /* release surfaces left over from a previously larger framebuffer */
+ for (; i < nvc0->framebuffer.nr_cbufs; ++i)
+ pipe_surface_reference(&nvc0->framebuffer.cbufs[i], NULL);
+
+ nvc0->framebuffer.nr_cbufs = fb->nr_cbufs;
+
+ nvc0->framebuffer.width = fb->width;
+ nvc0->framebuffer.height = fb->height;
+
+ pipe_surface_reference(&nvc0->framebuffer.zsbuf, fb->zsbuf);
+
+ nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+}
+
+static void
+nvc0_set_polygon_stipple(struct pipe_context *pipe,
+ const struct pipe_poly_stipple *stipple)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->stipple = *stipple;
+ nvc0->dirty |= NVC0_NEW_STIPPLE;
+}
+
+/* Only a single scissor/viewport is supported here: start_slot and the
+ * count parameters are ignored and slot 0 is always written.
+ */
+static void
+nvc0_set_scissor_states(struct pipe_context *pipe,
+ unsigned start_slot,
+ unsigned num_scissors,
+ const struct pipe_scissor_state *scissor)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->scissor = *scissor;
+ nvc0->dirty |= NVC0_NEW_SCISSOR;
+}
+
+static void
+nvc0_set_viewport_states(struct pipe_context *pipe,
+ unsigned start_slot,
+ unsigned num_viewports,
+ const struct pipe_viewport_state *vpt)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->viewport = *vpt;
+ nvc0->dirty |= NVC0_NEW_VIEWPORT;
+}
+
+/* Set vertex buffers for [start_slot, start_slot+count) and maintain the
+ * vbo_user (user-memory buffer) and constant_vbos (stride == 0) bitmasks.
+ * vb == NULL clears both masks for the range.
+ */
+static void
+nvc0_set_vertex_buffers(struct pipe_context *pipe,
+ unsigned start_slot, unsigned count,
+ const struct pipe_vertex_buffer *vb)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ unsigned i;
+
+ util_set_vertex_buffers_count(nvc0->vtxbuf, &nvc0->num_vtxbufs, vb,
+ start_slot, count);
+
+ if (!vb) {
+ nvc0->vbo_user &= ~(((1ull << count) - 1) << start_slot);
+ nvc0->constant_vbos &= ~(((1ull << count) - 1) << start_slot);
+ return;
+ }
+
+ for (i = 0; i < count; ++i) {
+ unsigned dst_index = start_slot + i;
+
+ if (vb[i].user_buffer) {
+ nvc0->vbo_user |= 1 << dst_index;
+ /* stride 0 means every vertex reads the same data */
+ if (!vb[i].stride)
+ nvc0->constant_vbos |= 1 << dst_index;
+ else
+ nvc0->constant_vbos &= ~(1 << dst_index);
+ } else {
+ nvc0->vbo_user &= ~(1 << dst_index);
+ nvc0->constant_vbos &= ~(1 << dst_index);
+ }
+ }
+
+ nvc0->dirty |= NVC0_NEW_ARRAYS;
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+}
+
+/* Set the index buffer.  A resource-backed buffer sets NVC0_NEW_IDXBUF;
+ * a user-memory buffer (or unbinding) clears it.
+ */
+static void
+nvc0_set_index_buffer(struct pipe_context *pipe,
+ const struct pipe_index_buffer *ib)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ if (nvc0->idxbuf.buffer)
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_IDX);
+
+ if (ib) {
+ pipe_resource_reference(&nvc0->idxbuf.buffer, ib->buffer);
+ nvc0->idxbuf.index_size = ib->index_size;
+ if (ib->buffer) {
+ nvc0->idxbuf.offset = ib->offset;
+ nvc0->dirty |= NVC0_NEW_IDXBUF;
+ } else {
+ nvc0->idxbuf.user_buffer = ib->user_buffer;
+ nvc0->dirty &= ~NVC0_NEW_IDXBUF;
+ }
+ } else {
+ nvc0->dirty &= ~NVC0_NEW_IDXBUF;
+ pipe_resource_reference(&nvc0->idxbuf.buffer, NULL);
+ }
+}
+
+static void
+nvc0_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+
+ nvc0->vertex = hwcso;
+ nvc0->dirty |= NVC0_NEW_VERTEX;
+}
+
+/* Create a stream-output target.  A query object is attached to track the
+ * transform-feedback buffer offset; creation fails if either allocation
+ * does.
+ */
+static struct pipe_stream_output_target *
+nvc0_so_target_create(struct pipe_context *pipe,
+ struct pipe_resource *res,
+ unsigned offset, unsigned size)
+{
+ struct nvc0_so_target *targ = MALLOC_STRUCT(nvc0_so_target);
+ if (!targ)
+ return NULL;
+
+ targ->pq = pipe->create_query(pipe, NVC0_QUERY_TFB_BUFFER_OFFSET);
+ if (!targ->pq) {
+ FREE(targ);
+ return NULL;
+ }
+ targ->clean = TRUE;
+
+ targ->pipe.buffer_size = size;
+ targ->pipe.buffer_offset = offset;
+ targ->pipe.context = pipe;
+ /* MALLOC_STRUCT does not zero: NULL before taking the reference */
+ targ->pipe.buffer = NULL;
+ pipe_resource_reference(&targ->pipe.buffer, res);
+ pipe_reference_init(&targ->pipe.reference, 1);
+
+ return &targ->pipe;
+}
+
+static void
+nvc0_so_target_destroy(struct pipe_context *pipe,
+ struct pipe_stream_output_target *ptarg)
+{
+ struct nvc0_so_target *targ = nvc0_so_target(ptarg);
+ pipe->destroy_query(pipe, targ->pq);
+ pipe_resource_reference(&targ->pipe.buffer, NULL);
+ FREE(targ);
+}
+
+/* Bind transform-feedback targets.  Targets being replaced have their
+ * current offset saved (via their query object); a target not in
+ * append_mask restarts from the beginning (clean = TRUE).
+ */
+static void
+nvc0_set_transform_feedback_targets(struct pipe_context *pipe,
+ unsigned num_targets,
+ struct pipe_stream_output_target **targets,
+ unsigned append_mask)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ unsigned i;
+ boolean serialize = TRUE;
+
+ assert(num_targets <= 4);
+
+ for (i = 0; i < num_targets; ++i) {
+ if (nvc0->tfbbuf[i] == targets[i] && (append_mask & (1 << i)))
+ continue;
+ nvc0->tfbbuf_dirty |= 1 << i;
+
+ if (nvc0->tfbbuf[i] && nvc0->tfbbuf[i] != targets[i])
+ nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
+
+ if (targets[i] && !(append_mask & (1 << i)))
+ nvc0_so_target(targets[i])->clean = TRUE;
+
+ pipe_so_target_reference(&nvc0->tfbbuf[i], targets[i]);
+ }
+ /* save and release targets above the new count */
+ for (; i < nvc0->num_tfbbufs; ++i) {
+ nvc0->tfbbuf_dirty |= 1 << i;
+ nvc0_so_target_save_offset(pipe, nvc0->tfbbuf[i], i, &serialize);
+ pipe_so_target_reference(&nvc0->tfbbuf[i], NULL);
+ }
+ nvc0->num_tfbbufs = num_targets;
+
+ if (nvc0->tfbbuf_dirty)
+ nvc0->dirty |= NVC0_NEW_TFB_TARGETS;
+}
+
+/* Bind a range of surfaces of type t (0 = 3D shader resources, 1 = compute
+ * resources), tracking validity and dirtiness as bitmasks; psurfaces == NULL
+ * unbinds the range.
+ */
+static void
+nvc0_bind_surfaces_range(struct nvc0_context *nvc0, const unsigned t,
+ unsigned start, unsigned nr,
+ struct pipe_surface **psurfaces)
+{
+ const unsigned end = start + nr;
+ const unsigned mask = ((1 << nr) - 1) << start;
+ unsigned i;
+
+ if (psurfaces) {
+ for (i = start; i < end; ++i) {
+ const unsigned p = i - start;
+ if (psurfaces[p])
+ nvc0->surfaces_valid[t] |= (1 << i);
+ else
+ nvc0->surfaces_valid[t] &= ~(1 << i);
+ pipe_surface_reference(&nvc0->surfaces[t][i], psurfaces[p]);
+ }
+ } else {
+ for (i = start; i < end; ++i)
+ pipe_surface_reference(&nvc0->surfaces[t][i], NULL);
+ nvc0->surfaces_valid[t] &= ~mask;
+ }
+ nvc0->surfaces_dirty[t] |= mask;
+
+ if (t == 0)
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_SUF);
+ else
+ nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_SUF);
+}
+
+static void
+nvc0_set_compute_resources(struct pipe_context *pipe,
+ unsigned start, unsigned nr,
+ struct pipe_surface **resources)
+{
+ nvc0_bind_surfaces_range(nvc0_context(pipe), 1, start, nr, resources);
+
+ nvc0_context(pipe)->dirty_cp |= NVC0_NEW_CP_SURFACES;
+}
+
+static void
+nvc0_set_shader_resources(struct pipe_context *pipe,
+ unsigned start, unsigned nr,
+ struct pipe_surface **resources)
+{
+ nvc0_bind_surfaces_range(nvc0_context(pipe), 0, start, nr, resources);
+
+ nvc0_context(pipe)->dirty |= NVC0_NEW_SURFACES;
+}
+
+/* Write the resource's GPU address into *phandle.  The handle is only
+ * 32 bits wide, so the whole buffer must fit below 4 GiB; otherwise (or for
+ * a NULL resource) the handle is zeroed.
+ */
+static INLINE void
+nvc0_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{
+ struct nv04_resource *buf = nv04_resource(res);
+ if (buf) {
+ uint64_t limit = (buf->address + buf->base.width0) - 1;
+ if (limit < (1ULL << 32)) {
+ *phandle = (uint32_t)buf->address;
+ } else {
+ NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+ "resource not contained within 32-bit address space !\n");
+ *phandle = 0;
+ }
+ } else {
+ *phandle = 0;
+ }
+}
+
+/* Bind global compute resources for [start, start+nr) and report each
+ * resource's 32-bit GPU address through handles[i].  The references are
+ * kept in the global_residents dynarray, grown (zero-filled) on demand;
+ * resources == NULL unbinds the range.
+ */
+static void
+nvc0_set_global_bindings(struct pipe_context *pipe,
+ unsigned start, unsigned nr,
+ struct pipe_resource **resources,
+ uint32_t **handles)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct pipe_resource **ptr;
+ unsigned i;
+ const unsigned end = start + nr;
+
+ if (nvc0->global_residents.size <= (end * sizeof(struct pipe_resource *))) {
+ const unsigned old_size = nvc0->global_residents.size;
+ const unsigned req_size = end * sizeof(struct pipe_resource *);
+ util_dynarray_resize(&nvc0->global_residents, req_size);
+ /* newly grown tail must be NULL pointers */
+ memset((uint8_t *)nvc0->global_residents.data + old_size, 0,
+ req_size - old_size);
+ }
+
+ if (resources) {
+ ptr = util_dynarray_element(
+ &nvc0->global_residents, struct pipe_resource *, start);
+ for (i = 0; i < nr; ++i) {
+ pipe_resource_reference(&ptr[i], resources[i]);
+ nvc0_set_global_handle(handles[i], resources[i]);
+ }
+ } else {
+ ptr = util_dynarray_element(
+ &nvc0->global_residents, struct pipe_resource *, start);
+ for (i = 0; i < nr; ++i)
+ pipe_resource_reference(&ptr[i], NULL);
+ }
+
+ nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
+
+ /* was '=', which discarded any other pending compute dirty bits */
+ nvc0->dirty_cp |= NVC0_NEW_CP_GLOBALS;
+}
+
+/* Hook all state-object and parameter-state entry points of this file (and
+ * the shared nv50 sampler/TIC helpers) into the pipe_context vtable.
+ */
+void
+nvc0_init_state_functions(struct nvc0_context *nvc0)
+{
+ struct pipe_context *pipe = &nvc0->base.pipe;
+
+ pipe->create_blend_state = nvc0_blend_state_create;
+ pipe->bind_blend_state = nvc0_blend_state_bind;
+ pipe->delete_blend_state = nvc0_blend_state_delete;
+
+ pipe->create_rasterizer_state = nvc0_rasterizer_state_create;
+ pipe->bind_rasterizer_state = nvc0_rasterizer_state_bind;
+ pipe->delete_rasterizer_state = nvc0_rasterizer_state_delete;
+
+ pipe->create_depth_stencil_alpha_state = nvc0_zsa_state_create;
+ pipe->bind_depth_stencil_alpha_state = nvc0_zsa_state_bind;
+ pipe->delete_depth_stencil_alpha_state = nvc0_zsa_state_delete;
+
+ pipe->create_sampler_state = nv50_sampler_state_create;
+ pipe->delete_sampler_state = nvc0_sampler_state_delete;
+ pipe->bind_vertex_sampler_states = nvc0_vp_sampler_states_bind;
+ pipe->bind_fragment_sampler_states = nvc0_fp_sampler_states_bind;
+ pipe->bind_geometry_sampler_states = nvc0_gp_sampler_states_bind;
+ pipe->bind_compute_sampler_states = nvc0_cp_sampler_states_bind;
+
+ pipe->create_sampler_view = nvc0_create_sampler_view;
+ pipe->sampler_view_destroy = nvc0_sampler_view_destroy;
+ pipe->set_vertex_sampler_views = nvc0_vp_set_sampler_views;
+ pipe->set_fragment_sampler_views = nvc0_fp_set_sampler_views;
+ pipe->set_geometry_sampler_views = nvc0_gp_set_sampler_views;
+ pipe->set_compute_sampler_views = nvc0_cp_set_sampler_views;
+
+ pipe->create_vs_state = nvc0_vp_state_create;
+ pipe->create_fs_state = nvc0_fp_state_create;
+ pipe->create_gs_state = nvc0_gp_state_create;
+ pipe->bind_vs_state = nvc0_vp_state_bind;
+ pipe->bind_fs_state = nvc0_fp_state_bind;
+ pipe->bind_gs_state = nvc0_gp_state_bind;
+ pipe->delete_vs_state = nvc0_sp_state_delete;
+ pipe->delete_fs_state = nvc0_sp_state_delete;
+ pipe->delete_gs_state = nvc0_sp_state_delete;
+
+ pipe->create_compute_state = nvc0_cp_state_create;
+ pipe->bind_compute_state = nvc0_cp_state_bind;
+ pipe->delete_compute_state = nvc0_sp_state_delete;
+
+ pipe->set_blend_color = nvc0_set_blend_color;
+ pipe->set_stencil_ref = nvc0_set_stencil_ref;
+ pipe->set_clip_state = nvc0_set_clip_state;
+ pipe->set_sample_mask = nvc0_set_sample_mask;
+ pipe->set_constant_buffer = nvc0_set_constant_buffer;
+ pipe->set_framebuffer_state = nvc0_set_framebuffer_state;
+ pipe->set_polygon_stipple = nvc0_set_polygon_stipple;
+ pipe->set_scissor_states = nvc0_set_scissor_states;
+ pipe->set_viewport_states = nvc0_set_viewport_states;
+
+ pipe->create_vertex_elements_state = nvc0_vertex_state_create;
+ pipe->delete_vertex_elements_state = nvc0_vertex_state_delete;
+ pipe->bind_vertex_elements_state = nvc0_vertex_state_bind;
+
+ pipe->set_vertex_buffers = nvc0_set_vertex_buffers;
+ pipe->set_index_buffer = nvc0_set_index_buffer;
+
+ pipe->create_stream_output_target = nvc0_so_target_create;
+ pipe->stream_output_target_destroy = nvc0_so_target_destroy;
+ pipe->set_stream_output_targets = nvc0_set_transform_feedback_targets;
+
+ pipe->set_global_binding = nvc0_set_global_bindings;
+ pipe->set_compute_resources = nvc0_set_compute_resources;
+ pipe->set_shader_resources = nvc0_set_shader_resources;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
new file mode 100644
index 00000000000..0ba4bad154a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -0,0 +1,577 @@
+
+#include "util/u_math.h"
+
+#include "nvc0/nvc0_context.h"
+
+#if 0
+/* Disabled/experimental: program a zcull region behind the depth buffer.
+ * Kept under #if 0 — not referenced by the validate list below.
+ */
+static void
+nvc0_validate_zcull(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+ struct nv50_surface *sf = nv50_surface(fb->zsbuf);
+ struct nv50_miptree *mt = nv50_miptree(sf->base.texture);
+ struct nouveau_bo *bo = mt->base.bo;
+ uint32_t size;
+ uint32_t offset = align(mt->total_size, 1 << 17);
+ unsigned width, height;
+
+ assert(mt->base.base.depth0 == 1 && mt->base.base.array_size < 2);
+
+ size = mt->total_size * 2;
+
+ /* height aligned to 32, width rounded up to a multiple of 224 */
+ height = align(fb->height, 32);
+ width = fb->width % 224;
+ if (width)
+ width = fb->width + (224 - width);
+ else
+ width = fb->width;
+
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_REGION), 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, bo->offset + offset);
+ PUSH_DATA (push, bo->offset + offset);
+ offset += 1 << 17;
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_LIMIT_HIGH), 2);
+ PUSH_DATAh(push, bo->offset + offset);
+ PUSH_DATA (push, bo->offset + offset);
+ BEGIN_NVC0(push, SUBC_3D(0x07e0), 2);
+ PUSH_DATA (push, size);
+ PUSH_DATA (push, size >> 16);
+ BEGIN_NVC0(push, SUBC_3D(0x15c8), 1); /* bits 0x3 */
+ PUSH_DATA (push, 2);
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_WIDTH), 4);
+ PUSH_DATA (push, width);
+ PUSH_DATA (push, height);
+ PUSH_DATA (push, 1);
+ PUSH_DATA (push, 0);
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_WINDOW_OFFSET_X), 2);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ BEGIN_NVC0(push, NVC0_3D(ZCULL_INVALIDATE), 1);
+ PUSH_DATA (push, 0);
+}
+#endif
+
+/* Emit the current framebuffer state: render targets, zeta buffer and
+ * multisample mode.  Inserts a SERIALIZE when a buffer transitions from
+ * being read to being written by the GPU.
+ */
+static void
+nvc0_validate_fb(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+ unsigned i;
+ unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
+ boolean serialize = FALSE;
+
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+
+ BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
+ /* NB: 076543210 is octal — the identity RT mapping in 3-bit fields */
+ PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
+ BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
+ PUSH_DATA (push, fb->width << 16);
+ PUSH_DATA (push, fb->height << 16);
+
+ for (i = 0; i < fb->nr_cbufs; ++i) {
+ struct nv50_surface *sf = nv50_surface(fb->cbufs[i]);
+ struct nv04_resource *res = nv04_resource(sf->base.texture);
+ struct nouveau_bo *bo = res->bo;
+
+ BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9);
+ PUSH_DATAh(push, res->address + sf->offset);
+ PUSH_DATA (push, res->address + sf->offset);
+ if (likely(nouveau_bo_memtype(bo))) {
+ /* tiled surface */
+ struct nv50_miptree *mt = nv50_miptree(sf->base.texture);
+
+ assert(sf->base.texture->target != PIPE_BUFFER);
+
+ PUSH_DATA(push, sf->width);
+ PUSH_DATA(push, sf->height);
+ PUSH_DATA(push, nvc0_format_table[sf->base.format].rt);
+ PUSH_DATA(push, (mt->layout_3d << 16) |
+ mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA(push, sf->base.u.tex.first_layer + sf->depth);
+ PUSH_DATA(push, mt->layer_stride >> 2);
+ PUSH_DATA(push, sf->base.u.tex.first_layer);
+
+ ms_mode = mt->ms_mode;
+ } else {
+ /* linear surface or buffer rendering */
+ if (res->base.target == PIPE_BUFFER) {
+ PUSH_DATA(push, 262144);
+ PUSH_DATA(push, 1);
+ } else {
+ PUSH_DATA(push, nv50_miptree(sf->base.texture)->level[0].pitch);
+ PUSH_DATA(push, sf->height);
+ }
+ PUSH_DATA(push, nvc0_format_table[sf->base.format].rt);
+ PUSH_DATA(push, 1 << 12);
+ PUSH_DATA(push, 1);
+ PUSH_DATA(push, 0);
+ PUSH_DATA(push, 0);
+
+ nvc0_resource_fence(res, NOUVEAU_BO_WR);
+
+ assert(!fb->zsbuf);
+ }
+
+ if (res->status & NOUVEAU_BUFFER_STATUS_GPU_READING)
+ serialize = TRUE;
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ /* only register for writing, otherwise we'd always serialize here */
+ BCTX_REFN(nvc0->bufctx_3d, FB, res, WR);
+ }
+
+ if (fb->zsbuf) {
+ struct nv50_miptree *mt = nv50_miptree(fb->zsbuf->texture);
+ struct nv50_surface *sf = nv50_surface(fb->zsbuf);
+ int unk = mt->base.base.target == PIPE_TEXTURE_2D;
+
+ BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5);
+ PUSH_DATAh(push, mt->base.address + sf->offset);
+ PUSH_DATA (push, mt->base.address + sf->offset);
+ PUSH_DATA (push, nvc0_format_table[fb->zsbuf->format].rt);
+ PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+ PUSH_DATA (push, mt->layer_stride >> 2);
+ BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 1);
+ BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3);
+ PUSH_DATA (push, sf->width);
+ PUSH_DATA (push, sf->height);
+ PUSH_DATA (push, (unk << 16) |
+ (sf->base.u.tex.first_layer + sf->depth));
+ BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
+ PUSH_DATA (push, sf->base.u.tex.first_layer);
+
+ ms_mode = mt->ms_mode;
+
+ if (mt->base.status & NOUVEAU_BUFFER_STATUS_GPU_READING)
+ serialize = TRUE;
+ mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ BCTX_REFN(nvc0->bufctx_3d, FB, &mt->base, WR);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1);
+ PUSH_DATA (push, 0);
+ }
+
+ IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode);
+
+ if (serialize)
+ IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
+
+ NOUVEAU_DRV_STAT(&nvc0->screen->base, gpu_serialize_count, serialize);
+}
+
+/* Emit the current blend colour. */
+static void
+nvc0_validate_blend_colour(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+ BEGIN_NVC0(push, NVC0_3D(BLEND_COLOR(0)), 4);
+ PUSH_DATAf(push, nvc0->blend_colour.color[0]);
+ PUSH_DATAf(push, nvc0->blend_colour.color[1]);
+ PUSH_DATAf(push, nvc0->blend_colour.color[2]);
+ PUSH_DATAf(push, nvc0->blend_colour.color[3]);
+}
+
+/* Emit the front (ref[0]) and back (ref[1]) stencil reference values. */
+static void
+nvc0_validate_stencil_ref(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ const ubyte *ref = &nvc0->stencil_ref.ref_value[0];
+
+ IMMED_NVC0(push, NVC0_3D(STENCIL_FRONT_FUNC_REF), ref[0]);
+ IMMED_NVC0(push, NVC0_3D(STENCIL_BACK_FUNC_REF), ref[1]);
+}
+
+/* Emit the 32x32 polygon stipple pattern (rows byte-swapped on upload). */
+static void
+nvc0_validate_stipple(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ unsigned i;
+
+ BEGIN_NVC0(push, NVC0_3D(POLYGON_STIPPLE_PATTERN(0)), 32);
+ for (i = 0; i < 32; ++i)
+ PUSH_DATA(push, util_bswap32(nvc0->stipple.stipple[i]));
+}
+
+/* Emit the scissor rectangle; depends on both the scissor state and the
+ * rasterizer's scissor-enable bit, so re-check when either changed.  With
+ * scissoring disabled the full 0..0xffff range is programmed.
+ */
+static void
+nvc0_validate_scissor(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct pipe_scissor_state *s = &nvc0->scissor;
+
+ if (!(nvc0->dirty & NVC0_NEW_SCISSOR) &&
+ nvc0->rast->pipe.scissor == nvc0->state.scissor)
+ return;
+ nvc0->state.scissor = nvc0->rast->pipe.scissor;
+
+ BEGIN_NVC0(push, NVC0_3D(SCISSOR_HORIZ(0)), 2);
+ if (nvc0->rast->pipe.scissor) {
+ PUSH_DATA(push, (s->maxx << 16) | s->minx);
+ PUSH_DATA(push, (s->maxy << 16) | s->miny);
+ } else {
+ PUSH_DATA(push, (0xffff << 16) | 0);
+ PUSH_DATA(push, (0xffff << 16) | 0);
+ }
+}
+
+/* Emit the viewport transform, then derive the integer viewport rectangle
+ * and depth range from the translate/scale values for clipping.
+ */
+static void
+nvc0_validate_viewport(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct pipe_viewport_state *vp = &nvc0->viewport;
+ int x, y, w, h;
+ float zmin, zmax;
+
+ BEGIN_NVC0(push, NVC0_3D(VIEWPORT_TRANSLATE_X(0)), 3);
+ PUSH_DATAf(push, vp->translate[0]);
+ PUSH_DATAf(push, vp->translate[1]);
+ PUSH_DATAf(push, vp->translate[2]);
+ BEGIN_NVC0(push, NVC0_3D(VIEWPORT_SCALE_X(0)), 3);
+ PUSH_DATAf(push, vp->scale[0]);
+ PUSH_DATAf(push, vp->scale[1]);
+ PUSH_DATAf(push, vp->scale[2]);
+
+ /* now set the viewport rectangle to viewport dimensions for clipping */
+
+ /* rectangle is clamped to x,y >= 0; fabsf handles negated scales */
+ x = util_iround(MAX2(0.0f, vp->translate[0] - fabsf(vp->scale[0])));
+ y = util_iround(MAX2(0.0f, vp->translate[1] - fabsf(vp->scale[1])));
+ w = util_iround(vp->translate[0] + fabsf(vp->scale[0])) - x;
+ h = util_iround(vp->translate[1] + fabsf(vp->scale[1])) - y;
+
+ zmin = vp->translate[2] - fabsf(vp->scale[2]);
+ zmax = vp->translate[2] + fabsf(vp->scale[2]);
+
+ /* cached for later re-emission (e.g. by other validation code) */
+ nvc0->vport_int[0] = (w << 16) | x;
+ nvc0->vport_int[1] = (h << 16) | y;
+ BEGIN_NVC0(push, NVC0_3D(VIEWPORT_HORIZ(0)), 2);
+ PUSH_DATA (push, nvc0->vport_int[0]);
+ PUSH_DATA (push, nvc0->vport_int[1]);
+ BEGIN_NVC0(push, NVC0_3D(DEPTH_RANGE_NEAR(0)), 2);
+ PUSH_DATAf(push, zmin);
+ PUSH_DATAf(push, zmax);
+}
+
+/* Upload the user clip planes into stage s's slice of the shared uniform
+ * buffer via an inline constant-buffer update.
+ */
+static INLINE void
+nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+
+ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+ PUSH_DATA (push, 512);
+ PUSH_DATAh(push, bo->offset + (5 << 16) + (s << 9));
+ PUSH_DATA (push, bo->offset + (5 << 16) + (s << 9));
+ BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1);
+ PUSH_DATA (push, 256);
+ PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
+}
+
+/* Make sure the last shader stage before rasterization ('vp' may actually
+ * be the vertex, tessellation evaluation or geometry program) is compiled
+ * with at least as many user clip planes as are currently enabled.
+ * If not, the program is destroyed and immediately re-validated (which
+ * recompiles it with the new num_ucps).
+ */
+static INLINE void
+nvc0_check_program_ucps(struct nvc0_context *nvc0,
+                        struct nvc0_program *vp, uint8_t mask)
+{
+   const unsigned n = util_logbase2(mask) + 1;
+
+   if (vp->vp.num_ucps >= n)
+      return;
+   nvc0_program_destroy(nvc0, vp);
+
+   vp->vp.num_ucps = n;
+   if (likely(vp == nvc0->vertprog))
+      nvc0_vertprog_validate(nvc0);
+   else
+   if (likely(vp == nvc0->gmtyprog))
+      nvc0_gmtyprog_validate(nvc0); /* was nvc0_vertprog_validate: wrong stage */
+   else
+      nvc0_tevlprog_validate(nvc0);
+}
+
+/* Validate clip-plane state: pick the last enabled geometry stage, make
+ * sure its program supports enough user clip planes, upload the planes if
+ * needed, and update the hardware clip enable mask and clip mode.
+ */
+static void
+nvc0_validate_clip(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_program *vp;
+   unsigned stage;
+   uint8_t clip_enable = nvc0->rast->pipe.clip_plane_enable;
+
+   /* stage index doubles as the shift for NVC0_NEW_VERTPROG below:
+    * 0 = vertex, 2 = tess eval, 3 = geometry
+    */
+   if (nvc0->gmtyprog) {
+      stage = 3;
+      vp = nvc0->gmtyprog;
+   } else
+   if (nvc0->tevlprog) {
+      stage = 2;
+      vp = nvc0->tevlprog;
+   } else {
+      stage = 0;
+      vp = nvc0->vertprog;
+   }
+
+   if (clip_enable && vp->vp.num_ucps < PIPE_MAX_CLIP_PLANES)
+      nvc0_check_program_ucps(nvc0, vp, clip_enable);
+
+   /* re-upload planes when the clip state or the relevant program changed */
+   if (nvc0->dirty & (NVC0_NEW_CLIP | (NVC0_NEW_VERTPROG << stage)))
+      if (vp->vp.num_ucps > 0 && vp->vp.num_ucps <= PIPE_MAX_CLIP_PLANES)
+         nvc0_upload_uclip_planes(nvc0, stage);
+
+   clip_enable &= vp->vp.clip_enable;
+
+   if (nvc0->state.clip_enable != clip_enable) {
+      nvc0->state.clip_enable = clip_enable;
+      IMMED_NVC0(push, NVC0_3D(CLIP_DISTANCE_ENABLE), clip_enable);
+   }
+   if (nvc0->state.clip_mode != vp->vp.clip_mode) {
+      nvc0->state.clip_mode = vp->vp.clip_mode;
+      BEGIN_NVC0(push, NVC0_3D(CLIP_DISTANCE_MODE), 1);
+      PUSH_DATA (push, vp->vp.clip_mode);
+   }
+}
+
+/* Replay the pre-built blend state object (raw FIFO words) to the GPU. */
+static void
+nvc0_validate_blend(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   PUSH_SPACE(push, nvc0->blend->size);
+   PUSH_DATAp(push, nvc0->blend->state, nvc0->blend->size);
+}
+
+/* Replay the pre-built depth/stencil/alpha state object to the GPU. */
+static void
+nvc0_validate_zsa(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   PUSH_SPACE(push, nvc0->zsa->size);
+   PUSH_DATAp(push, nvc0->zsa->state, nvc0->zsa->size);
+}
+
+/* Replay the pre-built rasterizer state object to the GPU. */
+static void
+nvc0_validate_rasterizer(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   PUSH_SPACE(push, nvc0->rast->size);
+   PUSH_DATAp(push, nvc0->rast->state, nvc0->rast->size);
+}
+
+/* Bind all dirty constant buffers for the 5 graphics shader stages.
+ * Slot 0 with 'user' set is treated specially: its data is streamed into
+ * the screen's uniform_bo instead of binding a resource.
+ */
+static void
+nvc0_constbufs_validate(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned s;
+
+   for (s = 0; s < 5; ++s) {
+      /* process each set dirty bit (one per constbuf slot) */
+      while (nvc0->constbuf_dirty[s]) {
+         int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+         nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+         if (nvc0->constbuf[s][i].user) {
+            struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+            /* per-stage 64 KiB region inside uniform_bo */
+            const unsigned base = s << 16;
+            const unsigned size = nvc0->constbuf[s][0].size;
+            assert(i == 0); /* we really only want OpenGL uniforms here */
+            assert(nvc0->constbuf[s][0].u.data);
+
+            /* grow the bound window (256-byte aligned) only when needed */
+            if (nvc0->state.uniform_buffer_bound[s] < size) {
+               nvc0->state.uniform_buffer_bound[s] = align(size, 0x100);
+
+               BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+               PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]);
+               PUSH_DATAh(push, bo->offset + base);
+               PUSH_DATA (push, bo->offset + base);
+               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
+               PUSH_DATA (push, (0 << 4) | 1);
+            }
+            /* stream the user uniform data into the buffer */
+            nvc0_cb_push(&nvc0->base, bo, NOUVEAU_BO_VRAM,
+                         base, nvc0->state.uniform_buffer_bound[s],
+                         0, (size + 3) / 4,
+                         nvc0->constbuf[s][0].u.data);
+         } else {
+            struct nv04_resource *res =
+               nv04_resource(nvc0->constbuf[s][i].u.buf);
+            if (res) {
+               BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+               PUSH_DATA (push, nvc0->constbuf[s][i].size);
+               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
+               PUSH_DATA (push, (i << 4) | 1);
+
+               BCTX_REFN(nvc0->bufctx_3d, CB(s, i), res, RD);
+            } else {
+               /* NULL resource: unbind the slot */
+               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
+               PUSH_DATA (push, (i << 4) | 0);
+            }
+            /* slot 0 no longer holds streamed user data */
+            if (i == 0)
+               nvc0->state.uniform_buffer_bound[s] = 0;
+         }
+      }
+   }
+}
+
+/* Upload the user sample mask (low 16 bits) and enable sample shading.
+ * The same mask value is replicated into all four MSAA_MASK words.
+ */
+static void
+nvc0_validate_sample_mask(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const unsigned smask = nvc0->sample_mask & 0xffff;
+   unsigned i;
+
+   BEGIN_NVC0(push, NVC0_3D(MSAA_MASK(0)), 4);
+   for (i = 0; i < 4; ++i)
+      PUSH_DATA (push, smask);
+
+   BEGIN_NVC0(push, NVC0_3D(SAMPLE_SHADING), 1);
+   PUSH_DATA (push, 0x01);
+}
+
+/* Add all resources marked as globally resident (e.g. for compute global
+ * memory access) to the given buffer context bin with RDWR access.
+ */
+void
+nvc0_validate_global_residents(struct nvc0_context *nvc0,
+                               struct nouveau_bufctx *bctx, int bin)
+{
+   unsigned i;
+
+   for (i = 0; i < nvc0->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource *res = *util_dynarray_element(
+         &nvc0->global_residents, struct pipe_resource *, i);
+      if (res)
+         nvc0_add_resident(bctx, bin, nv04_resource(res), NOUVEAU_BO_RDWR);
+   }
+}
+
+/* Derive the rasterizer-discard enable: either requested explicitly via
+ * the rasterizer state, or implied when neither depth/stencil tests nor
+ * the fragment program produce any output.
+ */
+static void
+nvc0_validate_derived_1(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   boolean rasterizer_discard;
+
+   if (nvc0->rast && nvc0->rast->pipe.rasterizer_discard) {
+      rasterizer_discard = TRUE;
+   } else {
+      boolean zs = nvc0->zsa &&
+         (nvc0->zsa->pipe.depth.enabled || nvc0->zsa->pipe.stencil[0].enabled);
+      /* hdr[18] presumably encodes FP color outputs -- TODO confirm */
+      rasterizer_discard = !zs &&
+         (!nvc0->fragprog || !nvc0->fragprog->hdr[18]);
+   }
+
+   if (rasterizer_discard != nvc0->state.rasterizer_discard) {
+      nvc0->state.rasterizer_discard = rasterizer_discard;
+      IMMED_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), !rasterizer_discard);
+   }
+}
+
+/* Switch the screen's current hardware context to 'ctx_to': inherit the
+ * cached hardware state from the previous context and mark everything
+ * dirty, except state categories this context has no objects bound for.
+ */
+static void
+nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
+{
+   struct nvc0_context *ctx_from = ctx_to->screen->cur_ctx;
+   unsigned s;
+
+   if (ctx_from)
+      ctx_to->state = ctx_from->state;
+
+   ctx_to->dirty = ~0;
+
+   for (s = 0; s < 5; ++s) {
+      ctx_to->samplers_dirty[s] = ~0;
+      ctx_to->textures_dirty[s] = ~0;
+   }
+
+   /* don't revalidate state that has nothing bound */
+   if (!ctx_to->vertex)
+      ctx_to->dirty &= ~(NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS);
+   if (!ctx_to->idxbuf.buffer)
+      ctx_to->dirty &= ~NVC0_NEW_IDXBUF;
+
+   if (!ctx_to->vertprog)
+      ctx_to->dirty &= ~NVC0_NEW_VERTPROG;
+   if (!ctx_to->fragprog)
+      ctx_to->dirty &= ~NVC0_NEW_FRAGPROG;
+
+   if (!ctx_to->blend)
+      ctx_to->dirty &= ~NVC0_NEW_BLEND;
+   if (!ctx_to->rast)
+      ctx_to->dirty &= ~(NVC0_NEW_RASTERIZER | NVC0_NEW_SCISSOR);
+   if (!ctx_to->zsa)
+      ctx_to->dirty &= ~NVC0_NEW_ZSA;
+
+   ctx_to->screen->cur_ctx = ctx_to;
+}
+
+/* Table-driven state validation: each entry runs its function when any of
+ * the listed dirty bits is set. Entries execute in array order, so ordering
+ * matters (e.g. framebuffer before viewport, programs before clip).
+ */
+static struct state_validate {
+   void (*func)(struct nvc0_context *);
+   uint32_t states;
+} validate_list[] = {
+    { nvc0_validate_fb,            NVC0_NEW_FRAMEBUFFER },
+    { nvc0_validate_blend,         NVC0_NEW_BLEND },
+    { nvc0_validate_zsa,           NVC0_NEW_ZSA },
+    { nvc0_validate_sample_mask,   NVC0_NEW_SAMPLE_MASK },
+    { nvc0_validate_rasterizer,    NVC0_NEW_RASTERIZER },
+    { nvc0_validate_blend_colour,  NVC0_NEW_BLEND_COLOUR },
+    { nvc0_validate_stencil_ref,   NVC0_NEW_STENCIL_REF },
+    { nvc0_validate_stipple,       NVC0_NEW_STIPPLE },
+    { nvc0_validate_scissor,       NVC0_NEW_SCISSOR | NVC0_NEW_RASTERIZER },
+    { nvc0_validate_viewport,      NVC0_NEW_VIEWPORT },
+    { nvc0_vertprog_validate,      NVC0_NEW_VERTPROG },
+    { nvc0_tctlprog_validate,      NVC0_NEW_TCTLPROG },
+    { nvc0_tevlprog_validate,      NVC0_NEW_TEVLPROG },
+    { nvc0_gmtyprog_validate,      NVC0_NEW_GMTYPROG },
+    { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
+    { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
+                                   NVC0_NEW_RASTERIZER },
+    { nvc0_validate_clip,          NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
+                                   NVC0_NEW_VERTPROG |
+                                   NVC0_NEW_TEVLPROG |
+                                   NVC0_NEW_GMTYPROG },
+    { nvc0_constbufs_validate,     NVC0_NEW_CONSTBUF },
+    { nvc0_validate_textures,      NVC0_NEW_TEXTURES },
+    { nvc0_validate_samplers,      NVC0_NEW_SAMPLERS },
+    { nve4_set_tex_handles,        NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS },
+    { nvc0_vertex_arrays_validate, NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS },
+    { nvc0_validate_surfaces,      NVC0_NEW_SURFACES },
+    { nvc0_idxbuf_validate,        NVC0_NEW_IDXBUF },
+    { nvc0_tfb_validate,           NVC0_NEW_TFB_TARGETS | NVC0_NEW_GMTYPROG }
+};
+#define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
+
+/* Run all state validation functions whose dirty bits intersect 'mask',
+ * then validate the pushbuf against the 3D buffer context.
+ * Returns FALSE if pushbuf validation failed.
+ * NOTE(review): 'words' is currently unused here -- presumably reserved
+ * for a PUSH_SPACE reservation; confirm against callers.
+ */
+boolean
+nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, unsigned words)
+{
+   uint32_t state_mask;
+   int ret;
+   unsigned i;
+
+   /* another pipe_context may have been active on this screen */
+   if (nvc0->screen->cur_ctx != nvc0)
+      nvc0_switch_pipe_context(nvc0);
+
+   state_mask = nvc0->dirty & mask;
+
+   if (state_mask) {
+      for (i = 0; i < validate_list_len; ++i) {
+         struct state_validate *validate = &validate_list[i];
+
+         if (state_mask & validate->states)
+            validate->func(nvc0);
+      }
+      nvc0->dirty &= ~state_mask;
+
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, FALSE);
+   }
+
+   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d);
+   ret = nouveau_pushbuf_validate(nvc0->base.pushbuf);
+
+   /* a flush during validation invalidates fences; re-fence everything */
+   if (unlikely(nvc0->state.flushed)) {
+      nvc0->state.flushed = FALSE;
+      nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, TRUE);
+   }
+   return !ret;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
new file mode 100644
index 00000000000..80c33424032
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -0,0 +1,77 @@
+
+#ifndef __NVC0_STATEOBJ_H__
+#define __NVC0_STATEOBJ_H__
+
+#include "pipe/p_state.h"
+
+/* Append a squashed (incrementing-method) FIFO packet header to a state
+ * object's command word array.
+ */
+#define SB_BEGIN_3D(so, m, s) \
+   (so)->state[(so)->size++] = NVC0_FIFO_PKHDR_SQ(NVC0_3D(m), s)
+
+/* Append an inline-data (immediate) FIFO packet to a state object. */
+#define SB_IMMED_3D(so, m, d) \
+   (so)->state[(so)->size++] = NVC0_FIFO_PKHDR_IL(NVC0_3D(m), d)
+
+/* Append a raw data word to a state object. */
+#define SB_DATA(so, u) (so)->state[(so)->size++] = (u)
+
+#include "nv50/nv50_stateobj_tex.h"
+
+/* Pre-encoded blend state: 'state' holds ready-to-push FIFO words. */
+struct nvc0_blend_stateobj {
+   struct pipe_blend_state pipe;
+   int size;
+   uint32_t state[72];
+};
+
+/* Pre-encoded rasterizer state. */
+struct nvc0_rasterizer_stateobj {
+   struct pipe_rasterizer_state pipe;
+   int size;
+   uint32_t state[43];
+};
+
+/* Pre-encoded depth/stencil/alpha state. */
+struct nvc0_zsa_stateobj {
+   struct pipe_depth_stencil_alpha_state pipe;
+   int size;
+   uint32_t state[26];
+};
+
+/* A bound constant buffer: either a pipe resource or raw user data. */
+struct nvc0_constbuf {
+   union {
+      struct pipe_resource *buf;
+      const void *data;
+   } u;
+   uint32_t size;
+   uint32_t offset;
+   boolean user; /* should only be TRUE if u.data is valid and non-NULL */
+};
+
+/* One vertex element with its pre-encoded hardware format word(s). */
+struct nvc0_vertex_element {
+   struct pipe_vertex_element pipe;
+   uint32_t state;
+   uint32_t state_alt; /* buffer 0 and with source offset (for translate) */
+};
+
+/* Vertex format CSO; 'element' is a trailing variable-length array. */
+struct nvc0_vertex_stateobj {
+   uint32_t min_instance_div[PIPE_MAX_ATTRIBS];
+   uint16_t vb_access_size[PIPE_MAX_ATTRIBS];
+   struct translate *translate;
+   unsigned num_elements;
+   uint32_t instance_elts;
+   uint32_t instance_bufs;
+   boolean shared_slots;
+   boolean need_conversion; /* e.g. VFETCH cannot convert f64 to f32 */
+   unsigned size; /* size of vertex in bytes (when packed) */
+   struct nvc0_vertex_element element[0];
+};
+
+/* Stream output (transform feedback) target. */
+struct nvc0_so_target {
+   struct pipe_stream_output_target pipe;
+   struct pipe_query *pq;
+   unsigned stride;
+   boolean clean;
+};
+
+static INLINE struct nvc0_so_target *
+nvc0_so_target(struct pipe_stream_output_target *ptarg)
+{
+   return (struct nvc0_so_target *)ptarg;
+}
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
new file mode 100644
index 00000000000..5070df80671
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+
+#include "pipe/p_defines.h"
+
+#include "util/u_inlines.h"
+#include "util/u_pack_color.h"
+#include "util/u_format.h"
+#include "util/u_surface.h"
+
+#include "os/os_thread.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+
+#include "nv50/nv50_defs.xml.h"
+#include "nv50/nv50_texture.xml.h"
+
+/* these are used in nv50_blit.h */
+#define NV50_ENG2D_SUPPORTED_FORMATS 0xff9ccfe1cce3ccc9ULL
+#define NV50_ENG2D_NOCONVERT_FORMATS 0x009cc02000000000ULL
+#define NV50_ENG2D_LUMINANCE_FORMATS 0x001cc02000000000ULL
+#define NV50_ENG2D_INTENSITY_FORMATS 0x0080000000000000ULL
+#define NV50_ENG2D_OPERATION_FORMATS 0x060001c000638000ULL
+
+#define NOUVEAU_DRIVER 0xc0
+#include "nv50/nv50_blit.h"
+
+/* Map a pipe format to a 2D-engine surface format. If the format is not
+ * directly supported, fall back to a same-blocksize format -- only valid
+ * when src and dst formats are equal (pure copy, no conversion).
+ */
+static INLINE uint8_t
+nvc0_2d_format(enum pipe_format format, boolean dst, boolean dst_src_equal)
+{
+   uint8_t id = nvc0_format_table[format].rt;
+
+   /* A8_UNORM is treated as I8_UNORM as far as the 2D engine is concerned. */
+   if (!dst && unlikely(format == PIPE_FORMAT_I8_UNORM) && !dst_src_equal)
+      return NV50_SURFACE_FORMAT_A8_UNORM;
+
+   /* Hardware values for color formats range from 0xc0 to 0xff,
+    * but the 2D engine doesn't support all of them.
+    */
+   if (nv50_2d_format_supported(format))
+      return id;
+   assert(dst_src_equal);
+
+   /* pick any supported format of matching block size */
+   switch (util_format_get_blocksize(format)) {
+   case 1:
+      return NV50_SURFACE_FORMAT_R8_UNORM;
+   case 2:
+      return NV50_SURFACE_FORMAT_R16_UNORM;
+   case 4:
+      return NV50_SURFACE_FORMAT_BGRA8_UNORM;
+   case 8:
+      return NV50_SURFACE_FORMAT_RGBA16_UNORM;
+   case 16:
+      return NV50_SURFACE_FORMAT_RGBA32_FLOAT;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+/* Program the 2D engine's source or destination surface registers for one
+ * miptree level/layer. Returns non-zero on unsupported format.
+ */
+static int
+nvc0_2d_texture_set(struct nouveau_pushbuf *push, boolean dst,
+                    struct nv50_miptree *mt, unsigned level, unsigned layer,
+                    enum pipe_format pformat, boolean dst_src_pformat_equal)
+{
+   struct nouveau_bo *bo = mt->base.bo;
+   uint32_t width, height, depth;
+   uint32_t format;
+   uint32_t mthd = dst ? NVC0_2D_DST_FORMAT : NVC0_2D_SRC_FORMAT;
+   uint32_t offset = mt->level[level].offset;
+
+   format = nvc0_2d_format(pformat, dst, dst_src_pformat_equal);
+   if (!format) {
+      NOUVEAU_ERR("invalid/unsupported surface format: %s\n",
+                  util_format_name(pformat));
+      return 1;
+   }
+
+   /* dimensions in samples, not pixels, for MSAA surfaces */
+   width = u_minify(mt->base.base.width0, level) << mt->ms_x;
+   height = u_minify(mt->base.base.height0, level) << mt->ms_y;
+   depth = u_minify(mt->base.base.depth0, level);
+
+   /* layer has to be < depth, and depth > tile depth / 2 */
+
+   if (!mt->layout_3d) {
+      /* 2D array: address the layer directly via its stride */
+      offset += mt->layer_stride * layer;
+      layer = 0;
+      depth = 1;
+   } else
+   if (!dst) {
+      /* 3D source: the 2D engine can't address z slices, bake into offset */
+      offset += nvc0_mt_zslice_offset(mt, level, layer);
+      layer = 0;
+   }
+
+   if (!nouveau_bo_memtype(bo)) {
+      /* linear surface: pitch-addressed */
+      BEGIN_NVC0(push, SUBC_2D(mthd), 2);
+      PUSH_DATA (push, format);
+      PUSH_DATA (push, 1);
+      BEGIN_NVC0(push, SUBC_2D(mthd + 0x14), 5);
+      PUSH_DATA (push, mt->level[level].pitch);
+      PUSH_DATA (push, width);
+      PUSH_DATA (push, height);
+      PUSH_DATAh(push, bo->offset + offset);
+      PUSH_DATA (push, bo->offset + offset);
+   } else {
+      /* tiled surface */
+      BEGIN_NVC0(push, SUBC_2D(mthd), 5);
+      PUSH_DATA (push, format);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, mt->level[level].tile_mode);
+      PUSH_DATA (push, depth);
+      PUSH_DATA (push, layer);
+      BEGIN_NVC0(push, SUBC_2D(mthd + 0x18), 4);
+      PUSH_DATA (push, width);
+      PUSH_DATA (push, height);
+      PUSH_DATAh(push, bo->offset + offset);
+      PUSH_DATA (push, bo->offset + offset);
+   }
+
+#if 0
+   if (dst) {
+      BEGIN_NVC0(push, SUBC_2D(NVC0_2D_CLIP_X), 4);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, width);
+      PUSH_DATA (push, height);
+   }
+#endif
+   return 0;
+}
+
+/* Blit a w x h rectangle between two miptree layers using the 2D engine.
+ * Coordinates are in pixels and get scaled to samples via ms_x/ms_y.
+ * Returns 0 on success, non-zero on error (bad format / no pushbuf space).
+ */
+static int
+nvc0_2d_texture_do_copy(struct nouveau_pushbuf *push,
+                        struct nv50_miptree *dst, unsigned dst_level,
+                        unsigned dx, unsigned dy, unsigned dz,
+                        struct nv50_miptree *src, unsigned src_level,
+                        unsigned sx, unsigned sy, unsigned sz,
+                        unsigned w, unsigned h)
+{
+   const enum pipe_format dfmt = dst->base.base.format;
+   const enum pipe_format sfmt = src->base.base.format;
+   int ret;
+   boolean eqfmt = dfmt == sfmt;
+
+   if (!PUSH_SPACE(push, 2 * 16 + 32))
+      return PIPE_ERROR;
+
+   ret = nvc0_2d_texture_set(push, TRUE, dst, dst_level, dz, dfmt, eqfmt);
+   if (ret)
+      return ret;
+
+   ret = nvc0_2d_texture_set(push, FALSE, src, src_level, sz, sfmt, eqfmt);
+   if (ret)
+      return ret;
+
+   IMMED_NVC0(push, NVC0_2D(BLIT_CONTROL), 0x00);
+   BEGIN_NVC0(push, NVC0_2D(BLIT_DST_X), 4);
+   PUSH_DATA (push, dx << dst->ms_x);
+   PUSH_DATA (push, dy << dst->ms_y);
+   PUSH_DATA (push, w << dst->ms_x);
+   PUSH_DATA (push, h << dst->ms_y);
+   /* 1:1 scaling (du/dx = dv/dy = 1.0 in 32.32 fixed point) */
+   BEGIN_NVC0(push, NVC0_2D(BLIT_DU_DX_FRACT), 4);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_X_FRACT), 4);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, sx << src->ms_x);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, sy << src->ms_y); /* fixed: was ms_x; y must use ms_y */
+
+   return 0;
+}
+
+/* pipe_context::resource_copy_region -- copy a box between two resources.
+ * Buffers go through the buffer copy path, same/same-size formats through
+ * M2MF (per-layer loop), everything else through the 2D blitter.
+ */
+static void
+nvc0_resource_copy_region(struct pipe_context *pipe,
+                          struct pipe_resource *dst, unsigned dst_level,
+                          unsigned dstx, unsigned dsty, unsigned dstz,
+                          struct pipe_resource *src, unsigned src_level,
+                          const struct pipe_box *src_box)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   int ret;
+   boolean m2mf;
+   unsigned dst_layer = dstz, src_layer = src_box->z;
+
+   if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+      nouveau_copy_buffer(&nvc0->base,
+                          nv04_resource(dst), dstx,
+                          nv04_resource(src), src_box->x, src_box->width);
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, buf_copy_bytes, src_box->width);
+      return;
+   }
+   NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_copy_count, 1);
+
+   /* 0 and 1 are equal, only supporting 0/1, 2, 4 and 8 */
+   assert((src->nr_samples | 1) == (dst->nr_samples | 1));
+
+   /* M2MF is a raw copy -- usable whenever no format conversion is needed */
+   m2mf = (src->format == dst->format) ||
+      (util_format_get_blocksizebits(src->format) ==
+       util_format_get_blocksizebits(dst->format));
+
+   nv04_resource(dst)->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+
+   if (m2mf) {
+      struct nv50_m2mf_rect drect, srect;
+      unsigned i;
+      unsigned nx = util_format_get_nblocksx(src->format, src_box->width);
+      unsigned ny = util_format_get_nblocksy(src->format, src_box->height);
+
+      nv50_m2mf_rect_setup(&drect, dst, dst_level, dstx, dsty, dstz);
+      nv50_m2mf_rect_setup(&srect, src, src_level,
+                           src_box->x, src_box->y, src_box->z);
+
+      /* advance per layer: z for 3D layouts, layer_stride for arrays */
+      for (i = 0; i < src_box->depth; ++i) {
+         nvc0->m2mf_copy_rect(nvc0, &drect, &srect, nx, ny);
+
+         if (nv50_miptree(dst)->layout_3d)
+            drect.z++;
+         else
+            drect.base += nv50_miptree(dst)->layer_stride;
+
+         if (nv50_miptree(src)->layout_3d)
+            srect.z++;
+         else
+            srect.base += nv50_miptree(src)->layer_stride;
+      }
+      return;
+   }
+
+   assert(nv50_2d_dst_format_faithful(dst->format));
+   assert(nv50_2d_src_format_faithful(src->format));
+
+   BCTX_REFN(nvc0->bufctx, 2D, nv04_resource(src), RD);
+   BCTX_REFN(nvc0->bufctx, 2D, nv04_resource(dst), WR);
+   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx);
+   nouveau_pushbuf_validate(nvc0->base.pushbuf);
+
+   for (; dst_layer < dstz + src_box->depth; ++dst_layer, ++src_layer) {
+      ret = nvc0_2d_texture_do_copy(nvc0->base.pushbuf,
+                                    nv50_miptree(dst), dst_level,
+                                    dstx, dsty, dst_layer,
+                                    nv50_miptree(src), src_level,
+                                    src_box->x, src_box->y, src_layer,
+                                    src_box->width, src_box->height);
+      if (ret)
+         break;
+   }
+   nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+/* pipe_context::clear_render_target -- clear a rectangle of a color surface
+ * by temporarily binding it as RT 0 and issuing CLEAR_BUFFERS per layer.
+ * Clobbers framebuffer state, so NVC0_NEW_FRAMEBUFFER is re-flagged dirty.
+ */
+static void
+nvc0_clear_render_target(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         const union pipe_color_union *color,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv50_surface *sf = nv50_surface(dst);
+   struct nv04_resource *res = nv04_resource(sf->base.texture);
+   unsigned z;
+
+   if (!PUSH_SPACE(push, 32 + sf->depth))
+      return;
+
+   PUSH_REFN (push, res->bo, res->domain | NOUVEAU_BO_WR);
+
+   BEGIN_NVC0(push, NVC0_3D(CLEAR_COLOR(0)), 4);
+   PUSH_DATAf(push, color->f[0]);
+   PUSH_DATAf(push, color->f[1]);
+   PUSH_DATAf(push, color->f[2]);
+   PUSH_DATAf(push, color->f[3]);
+
+   /* restrict the clear to the requested rectangle via the screen scissor */
+   BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
+   PUSH_DATA (push, ( width << 16) | dstx);
+   PUSH_DATA (push, (height << 16) | dsty);
+
+   BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9);
+   PUSH_DATAh(push, res->address + sf->offset);
+   PUSH_DATA (push, res->address + sf->offset);
+   if (likely(nouveau_bo_memtype(res->bo))) {
+      /* tiled render target */
+      struct nv50_miptree *mt = nv50_miptree(dst->texture);
+
+      PUSH_DATA(push, sf->width);
+      PUSH_DATA(push, sf->height);
+      PUSH_DATA(push, nvc0_format_table[dst->format].rt);
+      PUSH_DATA(push, (mt->layout_3d << 16) |
+               mt->level[sf->base.u.tex.level].tile_mode);
+      PUSH_DATA(push, dst->u.tex.first_layer + sf->depth);
+      PUSH_DATA(push, mt->layer_stride >> 2);
+      PUSH_DATA(push, dst->u.tex.first_layer);
+   } else {
+      /* linear render target (buffer or linear miptree) */
+      if (res->base.target == PIPE_BUFFER) {
+         PUSH_DATA(push, 262144);
+         PUSH_DATA(push, 1);
+      } else {
+         PUSH_DATA(push, nv50_miptree(&res->base)->level[0].pitch);
+         PUSH_DATA(push, sf->height);
+      }
+      PUSH_DATA(push, nvc0_format_table[sf->base.format].rt);
+      PUSH_DATA(push, 1 << 12);
+      PUSH_DATA(push, 1);
+      PUSH_DATA(push, 0);
+      PUSH_DATA(push, 0);
+
+      IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+
+      /* tiled textures don't have to be fenced, they're not mapped directly */
+      nvc0_resource_fence(res, NOUVEAU_BO_WR);
+   }
+
+   /* clear all RGBA channels (0x3c) of every layer in the surface */
+   BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
+   for (z = 0; z < sf->depth; ++z) {
+      PUSH_DATA (push, 0x3c |
+                 (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
+   }
+
+   nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+}
+
+/* pipe_context::clear_depth_stencil -- clear a rectangle of a Z/S surface
+ * by binding it as the zeta buffer and issuing CLEAR_BUFFERS per layer.
+ * Clobbers framebuffer state, so NVC0_NEW_FRAMEBUFFER is re-flagged dirty.
+ */
+static void
+nvc0_clear_depth_stencil(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         unsigned clear_flags,
+                         double depth,
+                         unsigned stencil,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv50_miptree *mt = nv50_miptree(dst->texture);
+   struct nv50_surface *sf = nv50_surface(dst);
+   uint32_t mode = 0;
+   /* NOTE(review): 'unk' flags 2D targets in ZETA_HORIZ -- exact hw
+    * semantics not evident from here; verify against class docs.
+    */
+   int unk = mt->base.base.target == PIPE_TEXTURE_2D;
+   unsigned z;
+
+   if (!PUSH_SPACE(push, 32 + sf->depth))
+      return;
+
+   PUSH_REFN (push, mt->base.bo, mt->base.domain | NOUVEAU_BO_WR);
+
+   if (clear_flags & PIPE_CLEAR_DEPTH) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_DEPTH), 1);
+      PUSH_DATAf(push, depth);
+      mode |= NVC0_3D_CLEAR_BUFFERS_Z;
+   }
+
+   if (clear_flags & PIPE_CLEAR_STENCIL) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_STENCIL), 1);
+      PUSH_DATA (push, stencil & 0xff);
+      mode |= NVC0_3D_CLEAR_BUFFERS_S;
+   }
+
+   /* restrict the clear to the requested rectangle via the screen scissor */
+   BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
+   PUSH_DATA (push, ( width << 16) | dstx);
+   PUSH_DATA (push, (height << 16) | dsty);
+
+   BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5);
+   PUSH_DATAh(push, mt->base.address + sf->offset);
+   PUSH_DATA (push, mt->base.address + sf->offset);
+   PUSH_DATA (push, nvc0_format_table[dst->format].rt);
+   PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+   PUSH_DATA (push, mt->layer_stride >> 2);
+   BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3);
+   PUSH_DATA (push, sf->width);
+   PUSH_DATA (push, sf->height);
+   PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
+   BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
+   PUSH_DATA (push, dst->u.tex.first_layer);
+
+   BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
+   for (z = 0; z < sf->depth; ++z) {
+      PUSH_DATA (push, mode |
+                 (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
+   }
+
+   nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+}
+
+/* pipe_context::clear -- clear the currently bound framebuffer.
+ * Requires a validated framebuffer; issues one CLEAR_BUFFERS for RT 0
+ * (plus depth/stencil) and one more per additional color buffer.
+ */
+void
+nvc0_clear(struct pipe_context *pipe, unsigned buffers,
+           const union pipe_color_union *color,
+           double depth, unsigned stencil)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+   unsigned i;
+   uint32_t mode = 0;
+
+   /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */
+   if (!nvc0_state_validate(nvc0, NVC0_NEW_FRAMEBUFFER, 9 + (fb->nr_cbufs * 2)))
+      return;
+
+   if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_COLOR(0)), 4);
+      PUSH_DATAf(push, color->f[0]);
+      PUSH_DATAf(push, color->f[1]);
+      PUSH_DATAf(push, color->f[2]);
+      PUSH_DATAf(push, color->f[3]);
+      mode =
+         NVC0_3D_CLEAR_BUFFERS_R | NVC0_3D_CLEAR_BUFFERS_G |
+         NVC0_3D_CLEAR_BUFFERS_B | NVC0_3D_CLEAR_BUFFERS_A;
+   }
+
+   if (buffers & PIPE_CLEAR_DEPTH) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_DEPTH), 1);
+      PUSH_DATA (push, fui(depth));
+      mode |= NVC0_3D_CLEAR_BUFFERS_Z;
+   }
+
+   if (buffers & PIPE_CLEAR_STENCIL) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_STENCIL), 1);
+      PUSH_DATA (push, stencil & 0xff);
+      mode |= NVC0_3D_CLEAR_BUFFERS_S;
+   }
+
+   /* first CLEAR_BUFFERS covers RT 0 and Z/S */
+   BEGIN_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 1);
+   PUSH_DATA (push, mode);
+
+   /* one extra clear (RGBA only, 0x3c) per remaining color buffer */
+   for (i = 1; i < fb->nr_cbufs; i++) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 1);
+      PUSH_DATA (push, (i << 6) | 0x3c);
+   }
+}
+
+
+/* =============================== BLIT CODE ===================================
+ */
+
+/* Per-screen blitter: lazily-built fragment programs (per texture type and
+ * blit mode), a fixed passthrough vertex program, and two sampler states.
+ */
+struct nvc0_blitter
+{
+   struct nvc0_program *fp[NV50_BLIT_MAX_TEXTURE_TYPES][NV50_BLIT_MODES];
+   struct nvc0_program vp;
+
+   struct nv50_tsc_entry sampler[2]; /* nearest, bilinear */
+
+   pipe_mutex mutex; /* guards lazy fp[][] creation (shared per screen) */
+
+   struct nvc0_screen *screen;
+};
+
+/* Per-context blit state, including the context state saved before a blit
+ * so it can be restored afterwards.
+ */
+struct nvc0_blitctx
+{
+   struct nvc0_context *nvc0;
+   struct nvc0_program *fp;
+   uint8_t mode;
+   uint16_t color_mask;
+   uint8_t filter;
+   enum pipe_texture_target target;
+   struct {
+      struct pipe_framebuffer_state fb;
+      struct nvc0_rasterizer_stateobj *rast;
+      struct nvc0_program *vp;
+      struct nvc0_program *tcp;
+      struct nvc0_program *tep;
+      struct nvc0_program *gp;
+      struct nvc0_program *fp;
+      unsigned num_textures[5];
+      unsigned num_samplers[5];
+      struct pipe_sampler_view *texture[2];
+      struct nv50_tsc_entry *sampler[2];
+      uint32_t dirty;
+   } saved;
+   struct nvc0_rasterizer_stateobj rast;
+};
+
+/* Build the blitter's passthrough vertex program from pre-assembled
+ * machine code (nvc0 or nve4 variant; nve4 needs sched instructions).
+ * It forwards position (a[0x80].xy) and texcoords (a[0x90].xyz).
+ */
+static void
+nvc0_blitter_make_vp(struct nvc0_blitter *blit)
+{
+   static const uint32_t code_nvc0[] =
+   {
+      0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */
+      0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */
+      0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */
+      0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */
+      0x00001de7, 0x80000000, /* exit */
+   };
+   static const uint32_t code_nve4[] =
+   {
+      0x00000007, 0x20000000, /* sched */
+      0xfff11c26, 0x06000080, /* vfetch b64 $r4:$r5 a[0x80] */
+      0xfff01c46, 0x06000090, /* vfetch b96 $r0:$r1:$r2 a[0x90] */
+      0x13f01c26, 0x0a7e0070, /* export b64 o[0x70] $r4:$r5 */
+      0x03f01c46, 0x0a7e0080, /* export b96 o[0x80] $r0:$r1:$r2 */
+      0x00001de7, 0x80000000, /* exit */
+   };
+
+   blit->vp.type = PIPE_SHADER_VERTEX;
+   blit->vp.translated = TRUE;
+   if (blit->screen->base.class_3d >= NVE4_3D_CLASS) {
+      blit->vp.code = (uint32_t *)code_nve4; /* const_cast */
+      blit->vp.code_size = sizeof(code_nve4);
+   } else {
+      blit->vp.code = (uint32_t *)code_nvc0; /* const_cast */
+      blit->vp.code_size = sizeof(code_nvc0);
+   }
+   blit->vp.num_gprs = 6;
+   blit->vp.vp.edgeflag = PIPE_MAX_ATTRIBS;
+
+   blit->vp.hdr[0] = 0x00020461; /* vertprog magic */
+   blit->vp.hdr[4] = 0x000ff000; /* no outputs read */
+   blit->vp.hdr[6] = 0x00000073; /* a[0x80].xy, a[0x90].xyz */
+   blit->vp.hdr[13] = 0x00073000; /* o[0x70].xy, o[0x80].xyz */
+}
+
+/* Build the blitter's two sampler (TSC) entries: [0] nearest, [1] bilinear,
+ * both clamp-to-edge with no mipmapping. id = -1 marks them as unuploaded.
+ */
+static void
+nvc0_blitter_make_sampler(struct nvc0_blitter *blit)
+{
+   /* clamp to edge, min/max lod = 0, nearest filtering */
+
+   blit->sampler[0].id = -1;
+
+   blit->sampler[0].tsc[0] = NV50_TSC_0_SRGB_CONVERSION_ALLOWED |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPS__SHIFT) |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPT__SHIFT) |
+      (NV50_TSC_WRAP_CLAMP_TO_EDGE << NV50_TSC_0_WRAPR__SHIFT);
+   blit->sampler[0].tsc[1] =
+      NV50_TSC_1_MAGF_NEAREST | NV50_TSC_1_MINF_NEAREST | NV50_TSC_1_MIPF_NONE;
+
+   /* clamp to edge, min/max lod = 0, bilinear filtering */
+
+   blit->sampler[1].id = -1;
+
+   blit->sampler[1].tsc[0] = blit->sampler[0].tsc[0];
+   blit->sampler[1].tsc[1] =
+      NV50_TSC_1_MAGF_LINEAR | NV50_TSC_1_MINF_LINEAR | NV50_TSC_1_MIPF_NONE;
+}
+
+/* Select (and lazily create, under the screen-wide mutex) the fragment
+ * program matching the blit's texture type and mode.
+ */
+static void
+nvc0_blit_select_fp(struct nvc0_blitctx *ctx, const struct pipe_blit_info *info)
+{
+   struct nvc0_blitter *blitter = ctx->nvc0->screen->blitter;
+
+   const enum pipe_texture_target ptarg =
+      nv50_blit_reinterpret_pipe_texture_target(info->src.resource->target);
+
+   const unsigned targ = nv50_blit_texture_type(ptarg);
+   const unsigned mode = ctx->mode;
+
+   /* re-check under the lock in case another context created it first */
+   if (!blitter->fp[targ][mode]) {
+      pipe_mutex_lock(blitter->mutex);
+      if (!blitter->fp[targ][mode])
+         blitter->fp[targ][mode] =
+            nv50_blitter_make_fp(&ctx->nvc0->base.pipe, mode, ptarg);
+      pipe_mutex_unlock(blitter->mutex);
+   }
+   ctx->fp = blitter->fp[targ][mode];
+}
+
+/* Bind the blit destination as the context's sole color buffer.
+ * Depth/stencil destinations are rendered to via an equivalent color
+ * format. layer == -1 selects all layers of the resource.
+ */
+static void
+nvc0_blit_set_dst(struct nvc0_blitctx *ctx,
+                  struct pipe_resource *res, unsigned level, unsigned layer,
+                  enum pipe_format format)
+{
+   struct nvc0_context *nvc0 = ctx->nvc0;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct pipe_surface templ;
+
+   if (util_format_is_depth_or_stencil(format))
+      templ.format = nv50_blit_zeta_to_colour_format(format);
+   else
+      templ.format = format;
+
+   templ.u.tex.level = level;
+   templ.u.tex.first_layer = templ.u.tex.last_layer = layer;
+
+   if (layer == -1) {
+      templ.u.tex.first_layer = 0;
+      templ.u.tex.last_layer =
+         (res->target == PIPE_TEXTURE_3D ? res->depth0 : res->array_size) - 1;
+   }
+
+   nvc0->framebuffer.cbufs[0] = nvc0_miptree_surface_new(pipe, res, &templ);
+   nvc0->framebuffer.nr_cbufs = 1;
+   nvc0->framebuffer.zsbuf = NULL;
+   nvc0->framebuffer.width = nvc0->framebuffer.cbufs[0]->width;
+   nvc0->framebuffer.height = nvc0->framebuffer.cbufs[0]->height;
+}
+
+/* Bind the blit source as texture(s) on stage 4 (fragment). For combined
+ * depth/stencil formats a second view is created for the stencil part.
+ * layer == -1 selects all layers of the resource.
+ */
+static void
+nvc0_blit_set_src(struct nvc0_blitctx *ctx,
+                  struct pipe_resource *res, unsigned level, unsigned layer,
+                  enum pipe_format format, const uint8_t filter)
+{
+   struct nvc0_context *nvc0 = ctx->nvc0;
+   struct pipe_context *pipe = &nvc0->base.pipe;
+   struct pipe_sampler_view templ;
+   uint32_t flags;
+   unsigned s;
+   enum pipe_texture_target target;
+
+   target = nv50_blit_reinterpret_pipe_texture_target(res->target);
+
+   templ.format = format;
+   templ.u.tex.first_layer = templ.u.tex.last_layer = layer;
+   templ.u.tex.first_level = templ.u.tex.last_level = level;
+   templ.swizzle_r = PIPE_SWIZZLE_RED;
+   templ.swizzle_g = PIPE_SWIZZLE_GREEN;
+   templ.swizzle_b = PIPE_SWIZZLE_BLUE;
+   templ.swizzle_a = PIPE_SWIZZLE_ALPHA;
+
+   if (layer == -1) {
+      templ.u.tex.first_layer = 0;
+      templ.u.tex.last_layer =
+         (res->target == PIPE_TEXTURE_3D ? res->depth0 : res->array_size) - 1;
+   }
+
+   flags = res->last_level ? 0 : NV50_TEXVIEW_SCALED_COORDS;
+   flags |= NV50_TEXVIEW_ACCESS_RESOLVE;
+   if (filter && res->nr_samples == 8)
+      flags |= NV50_TEXVIEW_FILTER_MSAA8;
+
+   nvc0->textures[4][0] = nvc0_create_texture_view(
+      pipe, res, &templ, flags, target);
+   nvc0->textures[4][1] = NULL;
+
+   /* stages 0..3 don't sample during a blit */
+   for (s = 0; s <= 3; ++s)
+      nvc0->num_textures[s] = 0;
+   nvc0->num_textures[4] = 1;
+
+   /* extra stencil-only view for combined Z/S formats */
+   templ.format = nv50_zs_to_s_format(format);
+   if (templ.format != format) {
+      nvc0->textures[4][1] = nvc0_create_texture_view(
+         pipe, res, &templ, flags, target);
+      nvc0->num_textures[4] = 2;
+   }
+}
+
+/* Emit the fixed pipeline state a blit needs, bypassing the CSO system:
+ * no blending/logic ops, no depth/stencil/alpha tests, fill polygons,
+ * no transform feedback, and unconditional rendering.
+ */
+static void
+nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit)
+{
+   struct nouveau_pushbuf *push = blit->nvc0->base.pushbuf;
+
+   /* TODO: maybe make this a MACRO (if we need more logic) ? */
+
+   /* ignore any active conditional render query during the blit */
+   if (blit->nvc0->cond_query)
+      IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
+   /* blend state */
+   BEGIN_NVC0(push, NVC0_3D(COLOR_MASK(0)), 1);
+   PUSH_DATA (push, blit->color_mask);
+   IMMED_NVC0(push, NVC0_3D(BLEND_ENABLE(0)), 0);
+   IMMED_NVC0(push, NVC0_3D(LOGIC_OP_ENABLE), 0);
+
+   /* rasterizer state */
+   IMMED_NVC0(push, NVC0_3D(FRAG_COLOR_CLAMP_EN), 0);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_ENABLE), 0);
+   BEGIN_NVC0(push, NVC0_3D(MSAA_MASK(0)), 4);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   PUSH_DATA (push, 0xffff);
+   BEGIN_NVC0(push, NVC0_3D(MACRO_POLYGON_MODE_FRONT), 1);
+   PUSH_DATA (push, NVC0_3D_MACRO_POLYGON_MODE_FRONT_FILL);
+   BEGIN_NVC0(push, NVC0_3D(MACRO_POLYGON_MODE_BACK), 1);
+   PUSH_DATA (push, NVC0_3D_MACRO_POLYGON_MODE_BACK_FILL);
+   IMMED_NVC0(push, NVC0_3D(POLYGON_SMOOTH_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(POLYGON_OFFSET_FILL_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(POLYGON_STIPPLE_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(CULL_FACE_ENABLE), 0);
+
+   /* zsa state */
+   IMMED_NVC0(push, NVC0_3D(DEPTH_TEST_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(STENCIL_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(ALPHA_TEST_ENABLE), 0);
+
+   /* disable transform feedback */
+   IMMED_NVC0(push, NVC0_3D(TFB_ENABLE), 0);
+}
+
+/* Save the pieces of context state the blit will clobber (framebuffer,
+ * rasterizer, the five shader stage bindings, slot-4 textures/samplers and
+ * the dirty mask), then install the blitter's own programs and texture
+ * bindings.  Texture slot 4 is reserved for the blitter; slots 0-3 are
+ * temporarily emptied.  Must be paired with nvc0_blitctx_post_blit().
+ */
+static void
+nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx)
+{
+   struct nvc0_context *nvc0 = ctx->nvc0;
+   struct nvc0_blitter *blitter = nvc0->screen->blitter;
+   int s;
+
+   ctx->saved.fb.width = nvc0->framebuffer.width;
+   ctx->saved.fb.height = nvc0->framebuffer.height;
+   ctx->saved.fb.nr_cbufs = nvc0->framebuffer.nr_cbufs;
+   ctx->saved.fb.cbufs[0] = nvc0->framebuffer.cbufs[0];
+   ctx->saved.fb.zsbuf = nvc0->framebuffer.zsbuf;
+
+   ctx->saved.rast = nvc0->rast;
+
+   ctx->saved.vp = nvc0->vertprog;
+   ctx->saved.tcp = nvc0->tctlprog;
+   ctx->saved.tep = nvc0->tevlprog;
+   ctx->saved.gp = nvc0->gmtyprog;
+   ctx->saved.fp = nvc0->fragprog;
+
+   nvc0->rast = &ctx->rast;
+
+   /* only VP + FP are needed for the blit; tessellation/geometry are off */
+   nvc0->vertprog = &blitter->vp;
+   nvc0->tctlprog = NULL;
+   nvc0->tevlprog = NULL;
+   nvc0->gmtyprog = NULL;
+   nvc0->fragprog = ctx->fp;
+
+   for (s = 0; s <= 4; ++s) {
+      ctx->saved.num_textures[s] = nvc0->num_textures[s];
+      ctx->saved.num_samplers[s] = nvc0->num_samplers[s];
+      /* mark all previously bound slots dirty so they get revalidated */
+      nvc0->textures_dirty[s] = (1 << nvc0->num_textures[s]) - 1;
+      nvc0->samplers_dirty[s] = (1 << nvc0->num_samplers[s]) - 1;
+   }
+   ctx->saved.texture[0] = nvc0->textures[4][0];
+   ctx->saved.texture[1] = nvc0->textures[4][1];
+   ctx->saved.sampler[0] = nvc0->samplers[4][0];
+   ctx->saved.sampler[1] = nvc0->samplers[4][1];
+
+   nvc0->samplers[4][0] = &blitter->sampler[ctx->filter];
+   nvc0->samplers[4][1] = &blitter->sampler[ctx->filter];
+
+   for (s = 0; s <= 3; ++s)
+      nvc0->num_samplers[s] = 0;
+   nvc0->num_samplers[4] = 2;
+
+   ctx->saved.dirty = nvc0->dirty;
+
+   nvc0->textures_dirty[4] |= 3;
+   nvc0->samplers_dirty[4] |= 3;
+
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+
+   nvc0->dirty = NVC0_NEW_FRAMEBUFFER |
+      NVC0_NEW_VERTPROG | NVC0_NEW_FRAGPROG |
+      NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG |
+      NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS;
+}
+
+/* Undo nvc0_blitctx_pre_blit(): drop the blitter's surface/texture
+ * references, restore the saved context state, re-arm conditional rendering
+ * if it was active, and OR the saved dirty mask with everything the blit
+ * touched so the next validate re-emits it.
+ */
+static void
+nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
+{
+   struct nvc0_context *nvc0 = blit->nvc0;
+   int s;
+
+   /* release the blit render target before restoring the saved cbuf */
+   pipe_surface_reference(&nvc0->framebuffer.cbufs[0], NULL);
+
+   nvc0->framebuffer.width = blit->saved.fb.width;
+   nvc0->framebuffer.height = blit->saved.fb.height;
+   nvc0->framebuffer.nr_cbufs = blit->saved.fb.nr_cbufs;
+   nvc0->framebuffer.cbufs[0] = blit->saved.fb.cbufs[0];
+   nvc0->framebuffer.zsbuf = blit->saved.fb.zsbuf;
+
+   nvc0->rast = blit->saved.rast;
+
+   nvc0->vertprog = blit->saved.vp;
+   nvc0->tctlprog = blit->saved.tcp;
+   nvc0->tevlprog = blit->saved.tep;
+   nvc0->gmtyprog = blit->saved.gp;
+   nvc0->fragprog = blit->saved.fp;
+
+   /* drop the blitter's slot-4 views created by nvc0_blit_set_{dst,src} */
+   pipe_sampler_view_reference(&nvc0->textures[4][0], NULL);
+   pipe_sampler_view_reference(&nvc0->textures[4][1], NULL);
+
+   for (s = 0; s <= 4; ++s) {
+      nvc0->num_textures[s] = blit->saved.num_textures[s];
+      nvc0->num_samplers[s] = blit->saved.num_samplers[s];
+      nvc0->textures_dirty[s] = (1 << nvc0->num_textures[s]) - 1;
+      nvc0->samplers_dirty[s] = (1 << nvc0->num_samplers[s]) - 1;
+   }
+   nvc0->textures[4][0] = blit->saved.texture[0];
+   nvc0->textures[4][1] = blit->saved.texture[1];
+   nvc0->samplers[4][0] = blit->saved.sampler[0];
+   nvc0->samplers[4][1] = blit->saved.sampler[1];
+
+   nvc0->textures_dirty[4] |= 3;
+   nvc0->samplers_dirty[4] |= 3;
+
+   /* pre_blit forced COND_MODE_ALWAYS; restore the app's render condition */
+   if (nvc0->cond_query)
+      nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
+                                       nvc0->cond_cond, nvc0->cond_mode);
+
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+
+   nvc0->dirty = blit->saved.dirty |
+      (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
+       NVC0_NEW_RASTERIZER | NVC0_NEW_ZSA | NVC0_NEW_BLEND |
+       NVC0_NEW_TEXTURES | NVC0_NEW_SAMPLERS |
+       NVC0_NEW_VERTPROG | NVC0_NEW_FRAGPROG |
+       NVC0_NEW_TCTLPROG | NVC0_NEW_TEVLPROG | NVC0_NEW_GMTYPROG |
+       NVC0_NEW_TFB_TARGETS);
+}
+
+/* Perform a blit with the 3D engine: texture the source and draw one large
+ * screen-space triangle per destination layer, scissored to the destination
+ * box.  Used when the 2D engine cannot handle the operation (3D textures,
+ * scaling, partial channel masks, Z32F formats, 8x MSAA resolve, ...).
+ */
+static void
+nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
+{
+   struct nvc0_blitctx *blit = nvc0->blit;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct pipe_resource *src = info->src.resource;
+   struct pipe_resource *dst = info->dst.resource;
+   int32_t minx, maxx, miny, maxy;
+   int32_t i;
+   float x0, x1, y0, y1, z;
+   float dz;
+   float x_range, y_range;
+
+   blit->mode = nv50_blit_select_mode(info);
+   blit->color_mask = nv50_blit_derive_color_mask(info);
+   blit->filter = nv50_blit_get_filter(info);
+
+   nvc0_blit_select_fp(blit, info);
+   nvc0_blitctx_pre_blit(blit);
+
+   /* layer == -1: bind all layers; per-layer selection happens via LAYER */
+   nvc0_blit_set_dst(blit, dst, info->dst.level, -1, info->dst.format);
+   nvc0_blit_set_src(blit, src, info->src.level, -1, info->src.format,
+                     blit->filter);
+
+   nvc0_blitctx_prepare_state(blit);
+
+   nvc0_state_validate(nvc0, ~0, 48);
+
+   /* Map dst pixel coordinates to src texel coordinates: the triangle's
+    * texcoords are chosen so that scale and offset come out right after
+    * interpolation over the 16384x16384 draw area.
+    */
+   x_range = (float)info->src.box.width / (float)info->dst.box.width;
+   y_range = (float)info->src.box.height / (float)info->dst.box.height;
+
+   x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x;
+   y0 = (float)info->src.box.y - y_range * (float)info->dst.box.y;
+
+   x1 = x0 + 16384.0f * x_range;
+   y1 = y0 + 16384.0f * y_range;
+
+   /* scale to per-sample coordinates for multisampled sources */
+   x0 *= (float)(1 << nv50_miptree(src)->ms_x);
+   x1 *= (float)(1 << nv50_miptree(src)->ms_x);
+   y0 *= (float)(1 << nv50_miptree(src)->ms_y);
+   y1 *= (float)(1 << nv50_miptree(src)->ms_y);
+
+   if (src->last_level > 0) {
+      /* If there are mip maps, GPU always assumes normalized coordinates. */
+      const unsigned l = info->src.level;
+      const float fh = u_minify(src->width0 << nv50_miptree(src)->ms_x, l);
+      const float fv = u_minify(src->height0 << nv50_miptree(src)->ms_y, l);
+      x0 /= fh;
+      x1 /= fh;
+      y0 /= fv;
+      y1 /= fv;
+   }
+
+   dz = (float)info->src.box.depth / (float)info->dst.box.depth;
+   z = (float)info->src.box.z;
+   if (nv50_miptree(src)->layout_3d)
+      z += 0.5f * dz;  /* sample at slice centers for true 3D textures */
+
+   /* bypass the viewport transform; vertices are in window coordinates */
+   IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 0);
+   IMMED_NVC0(push, NVC0_3D(VIEW_VOLUME_CLIP_CTRL), 0x2 |
+              NVC0_3D_VIEW_VOLUME_CLIP_CTRL_DEPTH_RANGE_0_1);
+   BEGIN_NVC0(push, NVC0_3D(VIEWPORT_HORIZ(0)), 2);
+   PUSH_DATA (push, nvc0->framebuffer.width << 16);
+   PUSH_DATA (push, nvc0->framebuffer.height << 16);
+
+   /* Draw a large triangle in screen coordinates covering the whole
+    * render target, with scissors defining the destination region.
+    * The vertex is supplied with non-normalized texture coordinates
+    * arranged in a way to yield the desired offset and scale.
+    */
+
+   minx = info->dst.box.x;
+   maxx = info->dst.box.x + info->dst.box.width;
+   miny = info->dst.box.y;
+   maxy = info->dst.box.y + info->dst.box.height;
+   if (info->scissor_enable) {
+      minx = MAX2(minx, info->scissor.minx);
+      maxx = MIN2(maxx, info->scissor.maxx);
+      miny = MAX2(miny, info->scissor.miny);
+      maxy = MIN2(maxy, info->scissor.maxy);
+   }
+   BEGIN_NVC0(push, NVC0_3D(SCISSOR_HORIZ(0)), 2);
+   PUSH_DATA (push, (maxx << 16) | minx);
+   PUSH_DATA (push, (maxy << 16) | miny);
+
+   for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
+      if (info->dst.box.z + i) {
+         BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
+         PUSH_DATA (push, info->dst.box.z + i);
+      }
+
+      IMMED_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL),
+                       NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_TRIANGLES);
+
+      /* 0x74301 / 0x74200: immediate-mode vertex attribute encodings;
+       * presumably attr index + component count/type — see rnndb for
+       * VTX_ATTR_DEFINE (TODO confirm exact field layout).
+       */
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 4);
+      PUSH_DATA (push, 0x74301);
+      PUSH_DATAf(push, x0);
+      PUSH_DATAf(push, y0);
+      PUSH_DATAf(push, z);
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 3);
+      PUSH_DATA (push, 0x74200);
+      PUSH_DATAf(push, 0.0f);
+      PUSH_DATAf(push, 0.0f);
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 4);
+      PUSH_DATA (push, 0x74301);
+      PUSH_DATAf(push, x1);
+      PUSH_DATAf(push, y0);
+      PUSH_DATAf(push, z);
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 3);
+      PUSH_DATA (push, 0x74200);
+      PUSH_DATAf(push, 16384 << nv50_miptree(dst)->ms_x);
+      PUSH_DATAf(push, 0.0f);
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 4);
+      PUSH_DATA (push, 0x74301);
+      PUSH_DATAf(push, x0);
+      PUSH_DATAf(push, y1);
+      PUSH_DATAf(push, z);
+      BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 3);
+      PUSH_DATA (push, 0x74200);
+      PUSH_DATAf(push, 0.0f);
+      PUSH_DATAf(push, 16384 << nv50_miptree(dst)->ms_y);
+
+      IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
+   }
+   /* reset LAYER if any slice other than 0 was rendered */
+   if (info->dst.box.z + info->dst.box.depth - 1)
+      IMMED_NVC0(push, NVC0_3D(LAYER), 0);
+
+   nvc0_blitctx_post_blit(blit);
+
+   /* restore viewport */
+
+   BEGIN_NVC0(push, NVC0_3D(VIEWPORT_HORIZ(0)), 2);
+   PUSH_DATA (push, nvc0->vport_int[0]);
+   PUSH_DATA (push, nvc0->vport_int[1]);
+   IMMED_NVC0(push, NVC0_3D(VIEWPORT_TRANSFORM_EN), 1);
+}
+
+/* Perform a blit with the 2D (eng2d) engine.  Handles 1:1 (or mirrored)
+ * copies, format conversion via ROP/PREMULT tricks for partial channel
+ * masks, and simple MS resolves.  Source coordinates are 32.32 fixed point.
+ *
+ * Fix: the MS-resolve centering offsets were garbled — they added
+ * (ms_x + 0) << 32 / (ms_y + 1) << 31 instead of half a destination pixel
+ * expressed in per-sample units, which is 1 << (ms + 31) on both axes.
+ */
+static void
+nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv50_miptree *dst = nv50_miptree(info->dst.resource);
+   struct nv50_miptree *src = nv50_miptree(info->src.resource);
+   const int32_t srcx_adj = info->src.box.width < 0 ? -1 : 0;
+   const int32_t srcy_adj = info->src.box.height < 0 ? -1 : 0;
+   const int dz = info->dst.box.z;
+   const int sz = info->src.box.z;
+   uint32_t dstw, dsth;
+   int32_t dstx, dsty;
+   int64_t srcx, srcy;
+   int64_t du_dx, dv_dy;
+   int i;
+   uint32_t mode;
+   uint32_t mask = nv50_blit_eng2d_get_mask(info);
+   boolean b;
+
+   mode = nv50_blit_get_filter(info) ?
+      NVC0_2D_BLIT_CONTROL_FILTER_BILINEAR :
+      NVC0_2D_BLIT_CONTROL_FILTER_POINT_SAMPLE;
+   mode |= (src->base.base.nr_samples > dst->base.base.nr_samples) ?
+      NVC0_2D_BLIT_CONTROL_ORIGIN_CORNER : NVC0_2D_BLIT_CONTROL_ORIGIN_CENTER;
+
+   /* 32.32 fixed-point texel step per destination pixel */
+   du_dx = ((int64_t)info->src.box.width << 32) / info->dst.box.width;
+   dv_dy = ((int64_t)info->src.box.height << 32) / info->dst.box.height;
+
+   b = info->dst.format == info->src.format;
+   nvc0_2d_texture_set(push, 1, dst, info->dst.level, dz, info->dst.format, b);
+   nvc0_2d_texture_set(push, 0, src, info->src.level, sz, info->src.format, b);
+
+   if (info->scissor_enable) {
+      BEGIN_NVC0(push, NVC0_2D(CLIP_X), 5);
+      PUSH_DATA (push, info->scissor.minx << dst->ms_x);
+      PUSH_DATA (push, info->scissor.miny << dst->ms_y);
+      PUSH_DATA (push, (info->scissor.maxx - info->scissor.minx) << dst->ms_x);
+      PUSH_DATA (push, (info->scissor.maxy - info->scissor.miny) << dst->ms_y);
+      PUSH_DATA (push, 1); /* enable */
+   }
+
+   if (mask != 0xffffffff) {
+      /* partial channel mask: use a pattern ROP to merge src and dst */
+      IMMED_NVC0(push, NVC0_2D(ROP), 0xca); /* DPSDxax */
+      IMMED_NVC0(push, NVC0_2D(PATTERN_COLOR_FORMAT),
+                       NVC0_2D_PATTERN_COLOR_FORMAT_32BPP);
+      BEGIN_NVC0(push, NVC0_2D(PATTERN_COLOR(0)), 4);
+      PUSH_DATA (push, 0x00000000);
+      PUSH_DATA (push, mask);
+      PUSH_DATA (push, 0xffffffff);
+      PUSH_DATA (push, 0xffffffff);
+      IMMED_NVC0(push, NVC0_2D(OPERATION), NVC0_2D_OPERATION_ROP);
+   } else
+   if (info->src.format != info->dst.format) {
+      /* single-channel sources: replicate via SRCCOPY_PREMULT with a BETA4
+       * factor so the value lands in the right destination channels
+       */
+      if (info->src.format == PIPE_FORMAT_R8_UNORM ||
+          info->src.format == PIPE_FORMAT_R8_SNORM ||
+          info->src.format == PIPE_FORMAT_R16_UNORM ||
+          info->src.format == PIPE_FORMAT_R16_SNORM ||
+          info->src.format == PIPE_FORMAT_R16_FLOAT ||
+          info->src.format == PIPE_FORMAT_R32_FLOAT) {
+         mask = 0xffff0000; /* also makes condition for OPERATION reset true */
+         BEGIN_NVC0(push, NVC0_2D(BETA4), 2);
+         PUSH_DATA (push, mask);
+         PUSH_DATA (push, NVC0_2D_OPERATION_SRCCOPY_PREMULT);
+      } else
+      if (info->src.format == PIPE_FORMAT_A8_UNORM) {
+         mask = 0xff000000;
+         BEGIN_NVC0(push, NVC0_2D(BETA4), 2);
+         PUSH_DATA (push, mask);
+         PUSH_DATA (push, NVC0_2D_OPERATION_SRCCOPY_PREMULT);
+      }
+   }
+
+   /* adjust the texel step for differing sample layouts */
+   if (src->ms_x > dst->ms_x || src->ms_y > dst->ms_y) {
+      /* ms_x is always >= ms_y */
+      du_dx <<= src->ms_x - dst->ms_x;
+      dv_dy <<= src->ms_y - dst->ms_y;
+   } else {
+      du_dx >>= dst->ms_x - src->ms_x;
+      dv_dy >>= dst->ms_y - src->ms_y;
+   }
+
+   srcx = (int64_t)(info->src.box.x + srcx_adj) << (src->ms_x + 32);
+   srcy = (int64_t)(info->src.box.y + srcy_adj) << (src->ms_y + 32);
+
+   if (src->base.base.nr_samples > dst->base.base.nr_samples) {
+      /* center src coordinates for proper MS resolve filtering:
+       * offset by half a pixel, i.e. 0.5 in 32.32 fixed point scaled
+       * by the per-axis sample count
+       */
+      srcx += (int64_t)1 << (src->ms_x + 31);
+      srcy += (int64_t)1 << (src->ms_y + 31);
+   }
+
+   dstx = info->dst.box.x << dst->ms_x;
+   dsty = info->dst.box.y << dst->ms_y;
+
+   dstw = info->dst.box.width << dst->ms_x;
+   dsth = info->dst.box.height << dst->ms_y;
+
+   /* clamp negative destination origins and shift the source to match */
+   if (dstx < 0) {
+      dstw += dstx;
+      srcx -= du_dx * dstx;
+      dstx = 0;
+   }
+   if (dsty < 0) {
+      dsth += dsty;
+      srcy -= dv_dy * dsty;
+      dsty = 0;
+   }
+
+   IMMED_NVC0(push, NVC0_2D(BLIT_CONTROL), mode);
+   BEGIN_NVC0(push, NVC0_2D(BLIT_DST_X), 4);
+   PUSH_DATA (push, dstx);
+   PUSH_DATA (push, dsty);
+   PUSH_DATA (push, dstw);
+   PUSH_DATA (push, dsth);
+   BEGIN_NVC0(push, NVC0_2D(BLIT_DU_DX_FRACT), 4);
+   PUSH_DATA (push, du_dx);
+   PUSH_DATA (push, du_dx >> 32);
+   PUSH_DATA (push, dv_dy);
+   PUSH_DATA (push, dv_dy >> 32);
+
+   BCTX_REFN(nvc0->bufctx, 2D, &dst->base, WR);
+   BCTX_REFN(nvc0->bufctx, 2D, &src->base, RD);
+   nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx);
+   if (nouveau_pushbuf_validate(nvc0->base.pushbuf))
+      return;
+
+   for (i = 0; i < info->dst.box.depth; ++i) {
+      if (i > 0) {
+         /* no scaling in z-direction possible for eng2d blits */
+         if (dst->layout_3d) {
+            BEGIN_NVC0(push, NVC0_2D(DST_LAYER), 1);
+            PUSH_DATA (push, info->dst.box.z + i);
+         } else {
+            const unsigned z = info->dst.box.z + i;
+            BEGIN_NVC0(push, NVC0_2D(DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+         }
+         if (src->layout_3d) {
+            /* not possible because of depth tiling */
+            assert(0);
+         } else {
+            const unsigned z = info->src.box.z + i;
+            BEGIN_NVC0(push, NVC0_2D(SRC_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
+            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+         }
+         BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_Y_INT), 1); /* trigger */
+         PUSH_DATA (push, srcy >> 32);
+      } else {
+         /* writing the final SRC register kicks off the blit */
+         BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_X_FRACT), 4);
+         PUSH_DATA (push, srcx);
+         PUSH_DATA (push, srcx >> 32);
+         PUSH_DATA (push, srcy);
+         PUSH_DATA (push, srcy >> 32);
+      }
+   }
+   nvc0_resource_validate(&dst->base, NOUVEAU_BO_WR);
+   nvc0_resource_validate(&src->base, NOUVEAU_BO_RD);
+
+   nouveau_bufctx_reset(nvc0->bufctx, NVC0_BIND_2D);
+
+   /* restore 2D engine defaults */
+   if (info->scissor_enable)
+      IMMED_NVC0(push, NVC0_2D(CLIP_ENABLE), 0);
+   if (mask != 0xffffffff)
+      IMMED_NVC0(push, NVC0_2D(OPERATION), NVC0_2D_OPERATION_SRCCOPY);
+}
+
+/* pipe_context::blit entry point: decide between the 2D engine (fast path
+ * for unscaled, full-mask, format-compatible copies) and the 3D engine
+ * (everything else), then dispatch.
+ */
+static void
+nvc0_blit(struct pipe_context *pipe, const struct pipe_blit_info *info)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   boolean eng3d = FALSE;
+
+   if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
+      if (!(info->mask & PIPE_MASK_ZS))
+         return;
+      /* eng2d cannot handle 32-bit float depth formats or filtered ZS */
+      if (info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT ||
+          info->dst.resource->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+         eng3d = TRUE;
+      if (info->filter != PIPE_TEX_FILTER_NEAREST)
+         eng3d = TRUE;
+   } else {
+      if (!(info->mask & PIPE_MASK_RGBA))
+         return;
+      /* partial color masks need the 3D path (per-channel COLOR_MASK) */
+      if (info->mask != PIPE_MASK_RGBA)
+         eng3d = TRUE;
+   }
+
+   if (nv50_miptree(info->src.resource)->layout_3d) {
+      eng3d = TRUE;
+   } else
+   if (info->src.box.depth != info->dst.box.depth) {
+      eng3d = TRUE;
+      debug_printf("blit: cannot filter array or cube textures in z direction");
+   }
+
+   /* format conversion: check whether eng2d can represent both formats
+    * faithfully, falling back to the 3D path if not
+    */
+   if (!eng3d && info->dst.format != info->src.format) {
+      if (!nv50_2d_dst_format_faithful(info->dst.format)) {
+         eng3d = TRUE;
+      } else
+      if (!nv50_2d_src_format_faithful(info->src.format)) {
+         if (!util_format_is_luminance(info->src.format)) {
+            if (util_format_is_intensity(info->src.format))
+               eng3d = info->src.format != PIPE_FORMAT_I8_UNORM;
+            else
+            if (!nv50_2d_dst_format_ops_supported(info->dst.format))
+               eng3d = TRUE;
+            else
+               eng3d = !nv50_2d_format_supported(info->src.format);
+         }
+      } else
+      if (util_format_is_luminance_alpha(info->src.format))
+         eng3d = TRUE;
+   }
+
+   /* 8x MSAA resolves need the texturing path */
+   if (info->src.resource->nr_samples == 8 &&
+       info->dst.resource->nr_samples <= 1)
+      eng3d = TRUE;
+#if 0
+   /* FIXME: can't make this work with eng2d anymore, at least not on nv50 */
+   if (info->src.resource->nr_samples > 1 ||
+       info->dst.resource->nr_samples > 1)
+      eng3d = TRUE;
+#endif
+   /* FIXME: find correct src coordinates adjustments */
+   if ((info->src.box.width != info->dst.box.width &&
+        info->src.box.width != -info->dst.box.width) ||
+       (info->src.box.height != info->dst.box.height &&
+        info->src.box.height != -info->dst.box.height))
+      eng3d = TRUE;
+
+   if (!eng3d)
+      nvc0_blit_eng2d(nvc0, info);
+   else
+      nvc0_blit_3d(nvc0, info);
+
+   NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_blit_count, 1);
+}
+
+/* Allocate the per-screen blitter (shared vertex program and samplers;
+ * fragment programs are built lazily per mode under the mutex).
+ * Returns FALSE on allocation failure.
+ */
+boolean
+nvc0_blitter_create(struct nvc0_screen *screen)
+{
+   screen->blitter = CALLOC_STRUCT(nvc0_blitter);
+   if (!screen->blitter) {
+      NOUVEAU_ERR("failed to allocate blitter struct\n");
+      return FALSE;
+   }
+   screen->blitter->screen = screen;
+
+   pipe_mutex_init(screen->blitter->mutex);
+
+   nvc0_blitter_make_vp(screen->blitter);
+   nvc0_blitter_make_sampler(screen->blitter);
+
+   return TRUE;
+}
+
+/* Free the per-screen blitter: destroy every lazily-created fragment
+ * program (tokens are owned by us, hence the extra FREE) and the struct.
+ * NOTE(review): the mutex from pipe_mutex_init() is not destroyed here,
+ * and blitter->vp's resources are not released — confirm whether those
+ * are cleaned up elsewhere or simply leaked at screen teardown.
+ */
+void
+nvc0_blitter_destroy(struct nvc0_screen *screen)
+{
+   struct nvc0_blitter *blitter = screen->blitter;
+   unsigned i, m;
+
+   for (i = 0; i < NV50_BLIT_MAX_TEXTURE_TYPES; ++i) {
+      for (m = 0; m < NV50_BLIT_MODES; ++m) {
+         struct nvc0_program *prog = blitter->fp[i][m];
+         if (prog) {
+            nvc0_program_destroy(NULL, prog);
+            FREE((void *)prog->pipe.tokens);
+            FREE(prog);
+         }
+      }
+   }
+
+   FREE(blitter);
+}
+
+/* Allocate the per-context blit state.  Returns FALSE on allocation
+ * failure.  The context's blit rasterizer always uses half-pixel centers
+ * so window coordinates line up with the scissored triangle draw.
+ */
+boolean
+nvc0_blitctx_create(struct nvc0_context *nvc0)
+{
+   nvc0->blit = CALLOC_STRUCT(nvc0_blitctx);
+   if (!nvc0->blit) {
+      NOUVEAU_ERR("failed to allocate blit context\n");
+      return FALSE;
+   }
+
+   nvc0->blit->nvc0 = nvc0;
+
+   nvc0->blit->rast.pipe.half_pixel_center = 1;
+
+   return TRUE;
+}
+
+/* Release the per-context blit state.  FREE() (a plain free() wrapper) is
+ * NULL-safe, so the previous `if (nvc0->blit)` guard was redundant; also
+ * clear the pointer to defend against accidental use-after-free.
+ */
+void
+nvc0_blitctx_destroy(struct nvc0_context *nvc0)
+{
+   FREE(nvc0->blit);
+   nvc0->blit = NULL;
+}
+
+/* Hook the surface-related entry points into the gallium context vtable. */
+void
+nvc0_init_surface_functions(struct nvc0_context *nvc0)
+{
+   struct pipe_context *pipe = &nvc0->base.pipe;
+
+   pipe->resource_copy_region = nvc0_resource_copy_region;
+   pipe->blit = nvc0_blit;
+   pipe->clear_render_target = nvc0_clear_render_target;
+   pipe->clear_depth_stencil = nvc0_clear_depth_stencil;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
new file mode 100644
index 00000000000..765cd2d2bab
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright 2008 Ben Skeggs
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+#include "nv50/nv50_texture.xml.h"
+#include "nv50/nv50_defs.xml.h"
+
+#include "util/u_format.h"
+
+#define NVE4_TIC_ENTRY_INVALID 0x000fffff
+#define NVE4_TSC_ENTRY_INVALID 0xfff00000
+
+#define NV50_TIC_0_SWIZZLE__MASK \
+ (NV50_TIC_0_MAPA__MASK | NV50_TIC_0_MAPB__MASK | \
+ NV50_TIC_0_MAPG__MASK | NV50_TIC_0_MAPR__MASK)
+
+/* Map a gallium swizzle selector to the hardware TIC channel-map code.
+ * R/G/B/A selectors extract the format's default mapping from TIC word 0
+ * (tc); ONE picks the integer or float variant depending on tex_int;
+ * ZERO and any unknown selector yield the zero mapping.
+ */
+static INLINE uint32_t
+nv50_tic_swizzle(uint32_t tc, unsigned swz, boolean tex_int)
+{
+   if (swz == PIPE_SWIZZLE_RED)
+      return (tc & NV50_TIC_0_MAPR__MASK) >> NV50_TIC_0_MAPR__SHIFT;
+   if (swz == PIPE_SWIZZLE_GREEN)
+      return (tc & NV50_TIC_0_MAPG__MASK) >> NV50_TIC_0_MAPG__SHIFT;
+   if (swz == PIPE_SWIZZLE_BLUE)
+      return (tc & NV50_TIC_0_MAPB__MASK) >> NV50_TIC_0_MAPB__SHIFT;
+   if (swz == PIPE_SWIZZLE_ALPHA)
+      return (tc & NV50_TIC_0_MAPA__MASK) >> NV50_TIC_0_MAPA__SHIFT;
+   if (swz == PIPE_SWIZZLE_ONE)
+      return tex_int ? NV50_TIC_MAP_ONE_INT : NV50_TIC_MAP_ONE_FLOAT;
+   return NV50_TIC_MAP_ZERO; /* PIPE_SWIZZLE_ZERO and anything unexpected */
+}
+
+/* pipe_context::create_sampler_view: thin wrapper over
+ * nvc0_create_texture_view that enables unnormalized (scaled) coordinates
+ * for RECT textures and buffers, as those targets require.
+ */
+struct pipe_sampler_view *
+nvc0_create_sampler_view(struct pipe_context *pipe,
+                         struct pipe_resource *res,
+                         const struct pipe_sampler_view *templ)
+{
+   uint32_t flags = 0;
+
+   if (res->target == PIPE_TEXTURE_RECT || res->target == PIPE_BUFFER)
+      flags |= NV50_TEXVIEW_SCALED_COORDS;
+
+   return nvc0_create_texture_view(pipe, res, templ, flags, res->target);
+}
+
+/* Build a TIC (texture image control) entry for 'texture' according to
+ * 'templ'.  'target' may differ from the resource's own target (the blitter
+ * uses this to reinterpret surfaces); 'flags' select unnormalized coords,
+ * resolve access and MSAA8 filtering.  Returns NULL on failure.
+ *
+ * Fix: the invalid-target path returned FALSE (i.e. integer 0) from a
+ * pointer-returning function and leaked both the view allocation and the
+ * pipe_resource reference taken on 'texture'; it now releases the
+ * reference, frees the view and returns NULL.
+ */
+struct pipe_sampler_view *
+nvc0_create_texture_view(struct pipe_context *pipe,
+                         struct pipe_resource *texture,
+                         const struct pipe_sampler_view *templ,
+                         uint32_t flags,
+                         enum pipe_texture_target target)
+{
+   const struct util_format_description *desc;
+   uint64_t address;
+   uint32_t *tic;
+   uint32_t swz[4];
+   uint32_t width, height;
+   uint32_t depth;
+   struct nv50_tic_entry *view;
+   struct nv50_miptree *mt;
+   boolean tex_int;
+
+   view = MALLOC_STRUCT(nv50_tic_entry);
+   if (!view)
+      return NULL;
+   mt = nv50_miptree(texture);
+
+   view->pipe = *templ;
+   view->pipe.reference.count = 1;
+   view->pipe.texture = NULL;
+   view->pipe.context = pipe;
+
+   view->id = -1; /* not yet uploaded to the TIC area */
+
+   pipe_resource_reference(&view->pipe.texture, texture);
+
+   tic = &view->tic[0];
+
+   desc = util_format_description(view->pipe.format);
+
+   tic[0] = nvc0_format_table[view->pipe.format].tic;
+
+   tex_int = util_format_is_pure_integer(view->pipe.format);
+
+   /* apply the view swizzle on top of the format's default channel map */
+   swz[0] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_r, tex_int);
+   swz[1] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_g, tex_int);
+   swz[2] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_b, tex_int);
+   swz[3] = nv50_tic_swizzle(tic[0], view->pipe.swizzle_a, tex_int);
+   tic[0] = (tic[0] & ~NV50_TIC_0_SWIZZLE__MASK) |
+      (swz[0] << NV50_TIC_0_MAPR__SHIFT) |
+      (swz[1] << NV50_TIC_0_MAPG__SHIFT) |
+      (swz[2] << NV50_TIC_0_MAPB__SHIFT) |
+      (swz[3] << NV50_TIC_0_MAPA__SHIFT);
+
+   address = mt->base.address;
+
+   tic[2] = 0x10001000 | NV50_TIC_2_NO_BORDER;
+
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+      tic[2] |= NV50_TIC_2_COLORSPACE_SRGB;
+
+   if (!(flags & NV50_TEXVIEW_SCALED_COORDS))
+      tic[2] |= NV50_TIC_2_NORMALIZED_COORDS;
+
+   /* check for linear storage type */
+   if (unlikely(!nouveau_bo_memtype(nv04_resource(texture)->bo))) {
+      if (texture->target == PIPE_BUFFER) {
+         assert(!(tic[2] & NV50_TIC_2_NORMALIZED_COORDS));
+         address +=
+            view->pipe.u.buf.first_element * desc->block.bits / 8;
+         tic[2] |= NV50_TIC_2_LINEAR | NV50_TIC_2_TARGET_BUFFER;
+         tic[3] = 0;
+         tic[4] = /* width */
+            view->pipe.u.buf.last_element - view->pipe.u.buf.first_element + 1;
+         tic[5] = 0;
+      } else {
+         /* must be 2D texture without mip maps */
+         tic[2] |= NV50_TIC_2_LINEAR | NV50_TIC_2_TARGET_RECT;
+         tic[3] = mt->level[0].pitch;
+         tic[4] = mt->base.base.width0;
+         tic[5] = (1 << 16) | mt->base.base.height0;
+      }
+      tic[6] =
+      tic[7] = 0;
+      tic[1] = address;
+      tic[2] |= address >> 32;
+      return &view->pipe;
+   }
+
+   tic[2] |=
+      ((mt->level[0].tile_mode & 0x0f0) << (22 - 4)) |
+      ((mt->level[0].tile_mode & 0xf00) << (25 - 8));
+
+   depth = MAX2(mt->base.base.array_size, mt->base.base.depth0);
+
+   if (mt->base.base.array_size > 1) {
+      /* there doesn't seem to be a base layer field in TIC */
+      address += view->pipe.u.tex.first_layer * mt->layer_stride;
+      depth = view->pipe.u.tex.last_layer - view->pipe.u.tex.first_layer + 1;
+   }
+   tic[1] = address;
+   tic[2] |= address >> 32;
+
+   switch (target) {
+   case PIPE_TEXTURE_1D:
+      tic[2] |= NV50_TIC_2_TARGET_1D;
+      break;
+   case PIPE_TEXTURE_2D:
+      tic[2] |= NV50_TIC_2_TARGET_2D;
+      break;
+   case PIPE_TEXTURE_RECT:
+      tic[2] |= NV50_TIC_2_TARGET_RECT;
+      break;
+   case PIPE_TEXTURE_3D:
+      tic[2] |= NV50_TIC_2_TARGET_3D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      depth /= 6; /* hardware counts cube faces, not layers */
+      tic[2] |= NV50_TIC_2_TARGET_CUBE;
+      break;
+   case PIPE_TEXTURE_1D_ARRAY:
+      tic[2] |= NV50_TIC_2_TARGET_1D_ARRAY;
+      break;
+   case PIPE_TEXTURE_2D_ARRAY:
+      tic[2] |= NV50_TIC_2_TARGET_2D_ARRAY;
+      break;
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      depth /= 6;
+      tic[2] |= NV50_TIC_2_TARGET_CUBE_ARRAY;
+      break;
+   default:
+      NOUVEAU_ERR("unexpected/invalid texture target: %d\n",
+                  mt->base.base.target);
+      pipe_resource_reference(&view->pipe.texture, NULL);
+      FREE(view);
+      return NULL;
+   }
+
+   tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
+
+   if (flags & NV50_TEXVIEW_ACCESS_RESOLVE) {
+      /* resolve views address individual samples, so use sample dimensions */
+      width = mt->base.base.width0 << mt->ms_x;
+      height = mt->base.base.height0 << mt->ms_y;
+   } else {
+      width = mt->base.base.width0;
+      height = mt->base.base.height0;
+   }
+
+   tic[4] = (1 << 31) | width;
+
+   tic[5] = height & 0xffff;
+   tic[5] |= depth << 16;
+   tic[5] |= mt->base.base.last_level << 28;
+
+   /* sampling points: (?) */
+   if (flags & NV50_TEXVIEW_ACCESS_RESOLVE)
+      tic[6] = (mt->ms_x > 1) ? 0x88000000 : 0x03000000;
+   else
+      tic[6] = 0x03000000;
+
+   tic[7] = (view->pipe.u.tex.last_level << 4) | view->pipe.u.tex.first_level;
+   tic[7] |= mt->ms_mode << 12;
+
+   return &view->pipe;
+}
+
+/* Validate the TIC (texture image) bindings of shader stage 's' on Fermi
+ * (pre-NVE4): upload new TIC entries via M2MF, flush the texture cache for
+ * entries whose resource was GPU-written, and emit BIND_TIC commands for
+ * dirty slots.  Returns TRUE if a TIC_FLUSH is needed afterwards.
+ */
+static boolean
+nvc0_validate_tic(struct nvc0_context *nvc0, int s)
+{
+   uint32_t commands[32];
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nouveau_bo *txc = nvc0->screen->txc;
+   unsigned i;
+   unsigned n = 0;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nvc0->num_textures[s]; ++i) {
+      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
+      struct nv04_resource *res;
+      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+
+      if (!tic) {
+         if (dirty)
+            commands[n++] = (i << 1) | 0; /* unbind slot i */
+         continue;
+      }
+      res = nv04_resource(tic->pipe.texture);
+
+      if (tic->id < 0) {
+         /* first use: allocate a TIC slot and upload the 32-byte entry */
+         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
+
+         PUSH_SPACE(push, 17);
+         BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+         PUSH_DATAh(push, txc->offset + (tic->id * 32));
+         PUSH_DATA (push, txc->offset + (tic->id * 32));
+         BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, 32);
+         PUSH_DATA (push, 1);
+         BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+         PUSH_DATA (push, 0x100111);
+         BEGIN_NIC0(push, NVC0_M2MF(DATA), 8);
+         PUSH_DATAp(push, &tic->tic[0], 8);
+
+         need_flush = TRUE;
+      } else
+      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+         /* the texture may have been rendered to: flush its cache lines */
+         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
+         PUSH_DATA (push, (tic->id << 4) | 1);
+         NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_cache_flush_count, 1);
+      }
+      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+      if (!dirty)
+         continue;
+      commands[n++] = (tic->id << 9) | (i << 1) | 1; /* bind tic->id to slot i */
+
+      BCTX_REFN(nvc0->bufctx_3d, TEX(s, i), res, RD);
+   }
+   /* unbind slots that were in use last validation but are now beyond count */
+   for (; i < nvc0->state.num_textures[s]; ++i)
+      commands[n++] = (i << 1) | 0;
+
+   nvc0->state.num_textures[s] = nvc0->num_textures[s];
+
+   if (n) {
+      BEGIN_NIC0(push, NVC0_3D(BIND_TIC(s)), n);
+      PUSH_DATAp(push, commands, n);
+   }
+   nvc0->textures_dirty[s] = 0;
+
+   return need_flush;
+}
+
+/* Kepler (NVE4+) variant of TIC validation: entries are uploaded with P2MF
+ * and the TIC index is merged into the per-slot bindless-style tex_handles
+ * word instead of BIND_TIC commands.  Returns TRUE if TIC_FLUSH is needed.
+ */
+static boolean
+nve4_validate_tic(struct nvc0_context *nvc0, unsigned s)
+{
+   struct nouveau_bo *txc = nvc0->screen->txc;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned i;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nvc0->num_textures[s]; ++i) {
+      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
+      struct nv04_resource *res;
+      const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+
+      if (!tic) {
+         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+         continue;
+      }
+      res = nv04_resource(tic->pipe.texture);
+
+      if (tic->id < 0) {
+         /* first use: allocate a TIC slot and upload the 32-byte entry */
+         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
+
+         PUSH_SPACE(push, 16);
+         BEGIN_NVC0(push, NVE4_P2MF(DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, txc->offset + (tic->id * 32));
+         PUSH_DATA (push, txc->offset + (tic->id * 32));
+         BEGIN_NVC0(push, NVE4_P2MF(LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, 32);
+         PUSH_DATA (push, 1);
+         BEGIN_1IC0(push, NVE4_P2MF(EXEC), 9);
+         PUSH_DATA (push, 0x1001);
+         PUSH_DATAp(push, &tic->tic[0], 8);
+
+         need_flush = TRUE;
+      } else
+      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+         /* the texture may have been rendered to: flush its cache lines */
+         BEGIN_NVC0(push, NVC0_3D(TEX_CACHE_CTL), 1);
+         PUSH_DATA (push, (tic->id << 4) | 1);
+      }
+      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+      /* store the TIC index in the low bits of the combined handle */
+      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
+      nvc0->tex_handles[s][i] |= tic->id;
+      if (dirty)
+         BCTX_REFN(nvc0->bufctx_3d, TEX(s, i), res, RD);
+   }
+   /* invalidate handles for slots bound last time but now beyond count */
+   for (; i < nvc0->state.num_textures[s]; ++i) {
+      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+      nvc0->textures_dirty[s] |= 1 << i;
+   }
+
+   nvc0->state.num_textures[s] = nvc0->num_textures[s];
+
+   return need_flush;
+}
+
+/* Validate texture bindings for the stages the driver uses (0 = VP,
+ * 3 = GP, 4 = FP — stages 1/2 appear unused here; TODO confirm mapping),
+ * choosing the Fermi or Kepler path, and emit a single TIC_FLUSH if any
+ * entry was (re)uploaded.
+ */
+void nvc0_validate_textures(struct nvc0_context *nvc0)
+{
+   boolean need_flush;
+
+   if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS) {
+      need_flush = nve4_validate_tic(nvc0, 0);
+      need_flush |= nve4_validate_tic(nvc0, 3);
+      need_flush |= nve4_validate_tic(nvc0, 4);
+   } else {
+      need_flush = nvc0_validate_tic(nvc0, 0);
+      need_flush |= nvc0_validate_tic(nvc0, 3);
+      need_flush |= nvc0_validate_tic(nvc0, 4);
+   }
+
+   if (need_flush) {
+      BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(TIC_FLUSH), 1);
+      PUSH_DATA (nvc0->base.pushbuf, 0);
+   }
+}
+
+/* Validate the TSC (sampler state) bindings of shader stage 's' on Fermi:
+ * upload new 32-byte TSC entries into the txc buffer at offset 64K and
+ * emit BIND_TSC commands for dirty slots.  Returns TRUE if a TSC_FLUSH is
+ * needed afterwards.
+ */
+static boolean
+nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
+{
+   uint32_t commands[16];
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned i;
+   unsigned n = 0;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nvc0->num_samplers[s]; ++i) {
+      struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
+
+      if (!(nvc0->samplers_dirty[s] & (1 << i)))
+         continue;
+      if (!tsc) {
+         commands[n++] = (i << 4) | 0; /* unbind slot i */
+         continue;
+      }
+      if (tsc->id < 0) {
+         /* first use: allocate a TSC slot; TSC area starts at 64K in txc */
+         tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc);
+
+         nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc,
+                               65536 + tsc->id * 32, NOUVEAU_BO_VRAM,
+                               32, tsc->tsc);
+         need_flush = TRUE;
+      }
+      nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
+
+      commands[n++] = (tsc->id << 12) | (i << 4) | 1; /* bind tsc->id to i */
+   }
+   /* unbind slots in use last validation but now beyond the sampler count */
+   for (; i < nvc0->state.num_samplers[s]; ++i)
+      commands[n++] = (i << 4) | 0;
+
+   nvc0->state.num_samplers[s] = nvc0->num_samplers[s];
+
+   if (n) {
+      BEGIN_NIC0(push, NVC0_3D(BIND_TSC(s)), n);
+      PUSH_DATAp(push, commands, n);
+   }
+   nvc0->samplers_dirty[s] = 0;
+
+   return need_flush;
+}
+
+/* Kepler (NVE4+) variant of TSC validation: entries are uploaded with P2MF
+ * and the TSC index is merged into the high bits of the per-slot
+ * tex_handles word.  Returns TRUE if a TSC_FLUSH is needed afterwards.
+ */
+boolean
+nve4_validate_tsc(struct nvc0_context *nvc0, int s)
+{
+   struct nouveau_bo *txc = nvc0->screen->txc;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned i;
+   boolean need_flush = FALSE;
+
+   for (i = 0; i < nvc0->num_samplers[s]; ++i) {
+      struct nv50_tsc_entry *tsc = nv50_tsc_entry(nvc0->samplers[s][i]);
+
+      if (!tsc) {
+         nvc0->tex_handles[s][i] |= NVE4_TSC_ENTRY_INVALID;
+         continue;
+      }
+      if (tsc->id < 0) {
+         /* first use: allocate a TSC slot; TSC area starts at 64K in txc */
+         tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc);
+
+         PUSH_SPACE(push, 16);
+         BEGIN_NVC0(push, NVE4_P2MF(DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, txc->offset + 65536 + (tsc->id * 32));
+         PUSH_DATA (push, txc->offset + 65536 + (tsc->id * 32));
+         BEGIN_NVC0(push, NVE4_P2MF(LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, 32);
+         PUSH_DATA (push, 1);
+         BEGIN_1IC0(push, NVE4_P2MF(EXEC), 9);
+         PUSH_DATA (push, 0x1001);
+         PUSH_DATAp(push, &tsc->tsc[0], 8);
+
+         need_flush = TRUE;
+      }
+      nvc0->screen->tsc.lock[tsc->id / 32] |= 1 << (tsc->id % 32);
+
+      /* TSC index lives in bits 20+ of the combined handle */
+      nvc0->tex_handles[s][i] &= ~NVE4_TSC_ENTRY_INVALID;
+      nvc0->tex_handles[s][i] |= tsc->id << 20;
+   }
+   /* invalidate handles for slots bound last time but now beyond count */
+   for (; i < nvc0->state.num_samplers[s]; ++i) {
+      nvc0->tex_handles[s][i] |= NVE4_TSC_ENTRY_INVALID;
+      nvc0->samplers_dirty[s] |= 1 << i;
+   }
+
+   nvc0->state.num_samplers[s] = nvc0->num_samplers[s];
+
+   return need_flush;
+}
+
+/* Revalidate sampler (TSC) state for shader stages 0, 3 and 4, using the
+ * generation-specific path, and emit a TSC_FLUSH if anything was uploaded.
+ */
+void nvc0_validate_samplers(struct nvc0_context *nvc0)
+{
+   const boolean is_nve4 = nvc0->screen->base.class_3d >= NVE4_3D_CLASS;
+   boolean need_flush = FALSE;
+   unsigned s;
+
+   /* Only stages 0, 3 and 4 carry sampler state here. */
+   for (s = 0; s < 5; ++s) {
+      if (s == 1 || s == 2)
+         continue;
+      if (is_nve4)
+         need_flush |= nve4_validate_tsc(nvc0, s);
+      else
+         need_flush |= nvc0_validate_tsc(nvc0, s);
+   }
+
+   if (need_flush) {
+      BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(TSC_FLUSH), 1);
+      PUSH_DATA (nvc0->base.pushbuf, 0);
+   }
+}
+
+/* Upload the "diagonal" entries for the possible texture sources ($t == $s).
+ * At some point we might want to get a list of the combinations used by a
+ * shader and fill in those entries instead of having it extract the handles.
+ */
+void
+nve4_set_tex_handles(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t address;
+   unsigned s;
+
+   /* Only NVE4+ uses texture/sampler handles in a constant buffer. */
+   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
+      return;
+   address = nvc0->screen->uniform_bo->offset + (5 << 16);
+
+   /* Each of the 5 stages owns a 512-byte slice of the uniform bo. */
+   for (s = 0; s < 5; ++s, address += (1 << 9)) {
+      uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
+      if (!dirty)
+         continue;
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, 512);
+      PUSH_DATAh(push, address);
+      PUSH_DATA (push, address);
+      /* Update only the dirty handle words via the CB_POS window. */
+      do {
+         int i = ffs(dirty) - 1;
+         dirty &= ~(1 << i);
+
+         BEGIN_NVC0(push, NVC0_3D(CB_POS), 2);
+         PUSH_DATA (push, (8 + i) * 4);
+         PUSH_DATA (push, nvc0->tex_handles[s][i]);
+      } while (dirty);
+
+      nvc0->textures_dirty[s] = 0;
+      nvc0->samplers_dirty[s] = 0;
+   }
+}
+
+
+static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT];
+static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT];
+static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT];
+
+/* Write a 16-dword surface information record for @psf directly into the
+ * pushbuf (advances push->cur by 16). The record holds address, format,
+ * dimensions, tiling info and the SULDP shader-library entry offset.
+ * A NULL or unsupported surface yields a dummy record (0xbadf0000 marker)
+ * that routes loads through the RGBA32_UINT library entry.
+ */
+void
+nve4_set_surface_info(struct nouveau_pushbuf *push,
+                      struct pipe_surface *psf,
+                      struct nvc0_screen *screen)
+{
+   struct nv50_surface *sf = nv50_surface(psf);
+   struct nv04_resource *res;
+   uint64_t address;
+   uint32_t *const info = push->cur;
+   uint8_t log2cpp;
+
+   if (psf && !nve4_su_format_map[psf->format])
+      NOUVEAU_ERR("unsupported surface format, try is_format_supported() !\n");
+
+   push->cur += 16;
+
+   if (!psf || !nve4_su_format_map[psf->format]) {
+      memset(info, 0, 16 * sizeof(*info));
+
+      info[0] = 0xbadf0000;
+      info[1] = 0x80004000;
+      info[12] = nve4_suldp_lib_offset[PIPE_FORMAT_R32G32B32A32_UINT] +
+         screen->lib_code->start;
+      return;
+   }
+   res = nv04_resource(sf->base.texture);
+
+   address = res->address + sf->offset;
+
+   info[8] = sf->width;
+   info[9] = sf->height;
+   info[10] = sf->depth;
+   /* info[11]: target/dimensionality code consumed by the library code. */
+   switch (res->base.target) {
+   case PIPE_TEXTURE_1D_ARRAY:
+      info[11] = 1;
+      break;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+      info[11] = 2;
+      break;
+   case PIPE_TEXTURE_3D:
+      info[11] = 3;
+      break;
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      info[11] = 4;
+      break;
+   default:
+      info[11] = 0;
+      break;
+   }
+   /* log2(bytes per pixel) lives in the top nibble of the aux map entry. */
+   log2cpp = (0xf000 & nve4_su_format_aux_map[sf->base.format]) >> 12;
+
+   info[12] = nve4_suldp_lib_offset[sf->base.format] + screen->lib_code->start;
+
+   /* limit in bytes for raw access */
+   info[13] = (0x06 << 22) | ((sf->width << log2cpp) - 1);
+
+   info[1] = nve4_su_format_map[sf->base.format];
+
+#if 0
+   switch (util_format_get_blocksizebits(sf->base.format)) {
+   case 16: info[1] |= 1 << 16; break;
+   case 32: info[1] |= 2 << 16; break;
+   case 64: info[1] |= 3 << 16; break;
+   case 128: info[1] |= 4 << 16; break;
+   default:
+      break;
+   }
+#else
+   info[1] |= log2cpp << 16;
+   info[1] |= 0x4000;
+   info[1] |= (0x0f00 & nve4_su_format_aux_map[sf->base.format]);
+#endif
+
+   if (res->base.target == PIPE_BUFFER) {
+      /* Buffers are linear: only address, width and format aux bits. */
+      info[0] = address >> 8;
+      info[2] = sf->width - 1;
+      info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22;
+      info[3] = 0;
+      info[4] = 0;
+      info[5] = 0;
+      info[6] = 0;
+      info[7] = 0;
+      info[14] = 0;
+      info[15] = 0;
+   } else {
+      struct nv50_miptree *mt = nv50_miptree(&res->base);
+      struct nv50_miptree_level *lvl = &mt->level[sf->base.u.tex.level];
+      const unsigned z = sf->base.u.tex.first_layer;
+
+      if (z) {
+         if (mt->layout_3d) {
+            address += nvc0_mt_zslice_offset(mt, psf->u.tex.level, z);
+            /* doesn't work if z passes z-tile boundary */
+            assert(sf->depth == 1);
+         } else {
+            address += mt->layer_stride * z;
+         }
+      }
+      info[0] = address >> 8;
+      info[2] = sf->width - 1;
+      /* NOTE: this is really important: */
+      info[2] |= (0xff & nve4_su_format_aux_map[sf->base.format]) << 22;
+      info[3] = (0x88 << 24) | (lvl->pitch / 64);
+      info[4] = sf->height - 1;
+      info[4] |= (lvl->tile_mode & 0x0f0) << 25;
+      info[4] |= NVC0_TILE_SHIFT_Y(lvl->tile_mode) << 22;
+      info[5] = mt->layer_stride >> 8;
+      info[6] = sf->depth - 1;
+      info[6] |= (lvl->tile_mode & 0xf00) << 21;
+      info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22;
+      info[7] = 0;
+      info[14] = mt->ms_x;
+      info[15] = mt->ms_y;
+   }
+}
+
+/* Bind shader-writable surfaces on Fermi (pre-NVE4). Not implemented yet. */
+static INLINE void
+nvc0_update_surface_bindings(struct nvc0_context *nvc0)
+{
+   /* TODO */
+}
+
+/* Bind shader-writable surfaces on Kepler (NVE4+). Not implemented yet. */
+static INLINE void
+nve4_update_surface_bindings(struct nvc0_context *nvc0)
+{
+   /* TODO */
+}
+
+/* Dispatch surface-binding validation to the generation-specific path. */
+void
+nvc0_validate_surfaces(struct nvc0_context *nvc0)
+{
+   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
+      nvc0_update_surface_bindings(nvc0);
+   else
+      nve4_update_surface_bindings(nvc0);
+}
+
+
+/* Map pipe formats to NVE4 surface image format codes; entries left at 0
+ * mean the format is not supported for surface access.
+ */
+static const uint8_t nve4_su_format_map[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = NVE4_IMAGE_FORMAT_RGBA32_FLOAT,
+   [PIPE_FORMAT_R32G32B32A32_SINT] = NVE4_IMAGE_FORMAT_RGBA32_SINT,
+   [PIPE_FORMAT_R32G32B32A32_UINT] = NVE4_IMAGE_FORMAT_RGBA32_UINT,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = NVE4_IMAGE_FORMAT_RGBA16_FLOAT,
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = NVE4_IMAGE_FORMAT_RGBA16_UNORM,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = NVE4_IMAGE_FORMAT_RGBA16_SNORM,
+   [PIPE_FORMAT_R16G16B16A16_SINT] = NVE4_IMAGE_FORMAT_RGBA16_SINT,
+   [PIPE_FORMAT_R16G16B16A16_UINT] = NVE4_IMAGE_FORMAT_RGBA16_UINT,
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = NVE4_IMAGE_FORMAT_RGBA8_UNORM,
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = NVE4_IMAGE_FORMAT_RGBA8_SNORM,
+   [PIPE_FORMAT_R8G8B8A8_SINT] = NVE4_IMAGE_FORMAT_RGBA8_SINT,
+   [PIPE_FORMAT_R8G8B8A8_UINT] = NVE4_IMAGE_FORMAT_RGBA8_UINT,
+   [PIPE_FORMAT_R11G11B10_FLOAT] = NVE4_IMAGE_FORMAT_R11G11B10_FLOAT,
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = NVE4_IMAGE_FORMAT_RGB10_A2_UNORM,
+/* [PIPE_FORMAT_R10G10B10A2_UINT] = NVE4_IMAGE_FORMAT_RGB10_A2_UINT, */
+   [PIPE_FORMAT_R32G32_FLOAT] = NVE4_IMAGE_FORMAT_RG32_FLOAT,
+   [PIPE_FORMAT_R32G32_SINT] = NVE4_IMAGE_FORMAT_RG32_SINT,
+   [PIPE_FORMAT_R32G32_UINT] = NVE4_IMAGE_FORMAT_RG32_UINT,
+   [PIPE_FORMAT_R16G16_FLOAT] = NVE4_IMAGE_FORMAT_RG16_FLOAT,
+   [PIPE_FORMAT_R16G16_UNORM] = NVE4_IMAGE_FORMAT_RG16_UNORM,
+   [PIPE_FORMAT_R16G16_SNORM] = NVE4_IMAGE_FORMAT_RG16_SNORM,
+   [PIPE_FORMAT_R16G16_SINT] = NVE4_IMAGE_FORMAT_RG16_SINT,
+   [PIPE_FORMAT_R16G16_UINT] = NVE4_IMAGE_FORMAT_RG16_UINT,
+   [PIPE_FORMAT_R8G8_UNORM] = NVE4_IMAGE_FORMAT_RG8_UNORM,
+   [PIPE_FORMAT_R8G8_SNORM] = NVE4_IMAGE_FORMAT_RG8_SNORM,
+   [PIPE_FORMAT_R8G8_SINT] = NVE4_IMAGE_FORMAT_RG8_SINT,
+   [PIPE_FORMAT_R8G8_UINT] = NVE4_IMAGE_FORMAT_RG8_UINT,
+   [PIPE_FORMAT_R32_FLOAT] = NVE4_IMAGE_FORMAT_R32_FLOAT,
+   [PIPE_FORMAT_R32_SINT] = NVE4_IMAGE_FORMAT_R32_SINT,
+   [PIPE_FORMAT_R32_UINT] = NVE4_IMAGE_FORMAT_R32_UINT,
+   [PIPE_FORMAT_R16_FLOAT] = NVE4_IMAGE_FORMAT_R16_FLOAT,
+   [PIPE_FORMAT_R16_UNORM] = NVE4_IMAGE_FORMAT_R16_UNORM,
+   [PIPE_FORMAT_R16_SNORM] = NVE4_IMAGE_FORMAT_R16_SNORM,
+   [PIPE_FORMAT_R16_SINT] = NVE4_IMAGE_FORMAT_R16_SINT,
+   [PIPE_FORMAT_R16_UINT] = NVE4_IMAGE_FORMAT_R16_UINT,
+   [PIPE_FORMAT_R8_UNORM] = NVE4_IMAGE_FORMAT_R8_UNORM,
+   [PIPE_FORMAT_R8_SNORM] = NVE4_IMAGE_FORMAT_R8_SNORM,
+   [PIPE_FORMAT_R8_SINT] = NVE4_IMAGE_FORMAT_R8_SINT,
+   [PIPE_FORMAT_R8_UINT] = NVE4_IMAGE_FORMAT_R8_UINT,
+};
+
+/* Auxiliary format description values for surface instructions.
+ * (log2(bytes per pixel) << 12) | (unk8 << 8) | unk22
+ */
+/* Entries left at 0 mean the format has no aux data (unsupported). */
+static const uint16_t nve4_su_format_aux_map[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x4842,
+   [PIPE_FORMAT_R32G32B32A32_SINT] = 0x4842,
+   [PIPE_FORMAT_R32G32B32A32_UINT] = 0x4842,
+
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_SINT] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_UINT] = 0x3933,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3933,
+
+   [PIPE_FORMAT_R32G32_FLOAT] = 0x3433,
+   [PIPE_FORMAT_R32G32_SINT] = 0x3433,
+   [PIPE_FORMAT_R32G32_UINT] = 0x3433,
+
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x2a24,
+/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x2a24, */
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_SINT] = 0x2a24,
+   [PIPE_FORMAT_R8G8B8A8_UINT] = 0x2a24,
+   [PIPE_FORMAT_R11G11B10_FLOAT] = 0x2a24,
+
+   [PIPE_FORMAT_R16G16_UNORM] = 0x2524,
+   [PIPE_FORMAT_R16G16_SNORM] = 0x2524,
+   [PIPE_FORMAT_R16G16_SINT] = 0x2524,
+   [PIPE_FORMAT_R16G16_UINT] = 0x2524,
+   [PIPE_FORMAT_R16G16_FLOAT] = 0x2524,
+
+   [PIPE_FORMAT_R32_SINT] = 0x2024,
+   [PIPE_FORMAT_R32_UINT] = 0x2024,
+   [PIPE_FORMAT_R32_FLOAT] = 0x2024,
+
+   [PIPE_FORMAT_R8G8_UNORM] = 0x1615,
+   [PIPE_FORMAT_R8G8_SNORM] = 0x1615,
+   [PIPE_FORMAT_R8G8_SINT] = 0x1615,
+   [PIPE_FORMAT_R8G8_UINT] = 0x1615,
+
+   [PIPE_FORMAT_R16_UNORM] = 0x1115,
+   [PIPE_FORMAT_R16_SNORM] = 0x1115,
+   [PIPE_FORMAT_R16_SINT] = 0x1115,
+   [PIPE_FORMAT_R16_UINT] = 0x1115,
+   [PIPE_FORMAT_R16_FLOAT] = 0x1115,
+
+   [PIPE_FORMAT_R8_UNORM] = 0x0206,
+   [PIPE_FORMAT_R8_SNORM] = 0x0206,
+   [PIPE_FORMAT_R8_SINT] = 0x0206,
+   [PIPE_FORMAT_R8_UINT] = 0x0206
+};
+
+/* NOTE: These are hardcoded offsets for the shader library.
+ * TODO: Automate them.
+ */
+/* Per-format entry point into the SULDP (surface load) shader library;
+ * added to screen->lib_code->start in nve4_set_surface_info().
+ */
+static const uint16_t nve4_suldp_lib_offset[PIPE_FORMAT_COUNT] =
+{
+   [PIPE_FORMAT_R32G32B32A32_FLOAT] = 0x218,
+   [PIPE_FORMAT_R32G32B32A32_SINT] = 0x218,
+   [PIPE_FORMAT_R32G32B32A32_UINT] = 0x218,
+   [PIPE_FORMAT_R16G16B16A16_UNORM] = 0x248,
+   [PIPE_FORMAT_R16G16B16A16_SNORM] = 0x2b8,
+   [PIPE_FORMAT_R16G16B16A16_SINT] = 0x330,
+   [PIPE_FORMAT_R16G16B16A16_UINT] = 0x388,
+   [PIPE_FORMAT_R16G16B16A16_FLOAT] = 0x3d8,
+   [PIPE_FORMAT_R32G32_FLOAT] = 0x428,
+   [PIPE_FORMAT_R32G32_SINT] = 0x468,
+   [PIPE_FORMAT_R32G32_UINT] = 0x468,
+   [PIPE_FORMAT_R10G10B10A2_UNORM] = 0x4a8,
+/* [PIPE_FORMAT_R10G10B10A2_UINT] = 0x530, */
+   [PIPE_FORMAT_R8G8B8A8_UNORM] = 0x588,
+   [PIPE_FORMAT_R8G8B8A8_SNORM] = 0x5f8,
+   [PIPE_FORMAT_R8G8B8A8_SINT] = 0x670,
+   [PIPE_FORMAT_R8G8B8A8_UINT] = 0x6c8,
+   [PIPE_FORMAT_B5G6R5_UNORM] = 0x718,
+   [PIPE_FORMAT_B5G5R5X1_UNORM] = 0x7a0,
+   [PIPE_FORMAT_R16G16_UNORM] = 0x828,
+   [PIPE_FORMAT_R16G16_SNORM] = 0x890,
+   [PIPE_FORMAT_R16G16_SINT] = 0x8f0,
+   [PIPE_FORMAT_R16G16_UINT] = 0x948,
+   [PIPE_FORMAT_R16G16_FLOAT] = 0x998,
+   [PIPE_FORMAT_R32_FLOAT] = 0x9e8,
+   [PIPE_FORMAT_R32_SINT] = 0xa30,
+   [PIPE_FORMAT_R32_UINT] = 0xa30,
+   [PIPE_FORMAT_R8G8_UNORM] = 0xa78,
+   [PIPE_FORMAT_R8G8_SNORM] = 0xae0,
+   [PIPE_FORMAT_R8G8_UINT] = 0xb48,
+   [PIPE_FORMAT_R8G8_SINT] = 0xb98,
+   [PIPE_FORMAT_R16_UNORM] = 0xbe8,
+   [PIPE_FORMAT_R16_SNORM] = 0xc48,
+   [PIPE_FORMAT_R16_SINT] = 0xca0,
+   [PIPE_FORMAT_R16_UINT] = 0xce8,
+   [PIPE_FORMAT_R16_FLOAT] = 0xd30,
+   [PIPE_FORMAT_R8_UNORM] = 0xd88,
+   [PIPE_FORMAT_R8_SNORM] = 0xde0,
+   [PIPE_FORMAT_R8_SINT] = 0xe38,
+   [PIPE_FORMAT_R8_UINT] = 0xe88,
+   [PIPE_FORMAT_R11G11B10_FLOAT] = 0xed0
+};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
new file mode 100644
index 00000000000..82f1ffcd329
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -0,0 +1,558 @@
+
+#include "util/u_format.h"
+
+#include "nvc0/nvc0_context.h"
+
+#include "nv50/nv50_defs.xml.h"
+
+/* Transfer object: rect[0] describes the region in the miptree, rect[1]
+ * the linear bo used as intermediate storage when the miptree cannot be
+ * mapped directly (see nvc0_miptree_transfer_map).
+ */
+struct nvc0_transfer {
+   struct pipe_transfer base;
+   struct nv50_m2mf_rect rect[2]; /* [0] = texture, [1] = intermediate bo */
+   uint32_t nblocksx;             /* width of the box in format blocks */
+   uint16_t nblocksy;             /* height of the box in format blocks */
+   uint16_t nlayers;              /* box depth (layers / z-slices) */
+};
+
+/* Copy a 2D rect of nblocksx x nblocksy blocks between two bo's with the
+ * Fermi M2MF engine; either side may be tiled (memtype) or linear.
+ * Both rects must have the same cpp.
+ */
+static void
+nvc0_m2mf_transfer_rect(struct nvc0_context *nvc0,
+                        const struct nv50_m2mf_rect *dst,
+                        const struct nv50_m2mf_rect *src,
+                        uint32_t nblocksx, uint32_t nblocksy)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nouveau_bufctx *bctx = nvc0->bufctx;
+   const int cpp = dst->cpp;
+   uint32_t src_ofst = src->base;
+   uint32_t dst_ofst = dst->base;
+   uint32_t height = nblocksy;
+   uint32_t sy = src->y;
+   uint32_t dy = dst->y;
+   uint32_t exec = (1 << 20); /* base EXEC flags, LINEAR_* or'ed in below */
+
+   assert(dst->cpp == src->cpp);
+
+   nouveau_bufctx_refn(bctx, 0, src->bo, src->domain | NOUVEAU_BO_RD);
+   nouveau_bufctx_refn(bctx, 0, dst->bo, dst->domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   /* Tiled source: describe the tiling; linear source: just the pitch. */
+   if (nouveau_bo_memtype(src->bo)) {
+      BEGIN_NVC0(push, NVC0_M2MF(TILING_MODE_IN), 5);
+      PUSH_DATA (push, src->tile_mode);
+      PUSH_DATA (push, src->width * cpp);
+      PUSH_DATA (push, src->height);
+      PUSH_DATA (push, src->depth);
+      PUSH_DATA (push, src->z);
+   } else {
+      src_ofst += src->y * src->pitch + src->x * cpp;
+
+      BEGIN_NVC0(push, NVC0_M2MF(PITCH_IN), 1);
+      PUSH_DATA (push, src->width * cpp);
+
+      exec |= NVC0_M2MF_EXEC_LINEAR_IN;
+   }
+
+   if (nouveau_bo_memtype(dst->bo)) {
+      BEGIN_NVC0(push, NVC0_M2MF(TILING_MODE_OUT), 5);
+      PUSH_DATA (push, dst->tile_mode);
+      PUSH_DATA (push, dst->width * cpp);
+      PUSH_DATA (push, dst->height);
+      PUSH_DATA (push, dst->depth);
+      PUSH_DATA (push, dst->z);
+   } else {
+      dst_ofst += dst->y * dst->pitch + dst->x * cpp;
+
+      BEGIN_NVC0(push, NVC0_M2MF(PITCH_OUT), 1);
+      PUSH_DATA (push, dst->width * cpp);
+
+      exec |= NVC0_M2MF_EXEC_LINEAR_OUT;
+   }
+
+   /* Split the copy into chunks of at most 2047 lines per EXEC. */
+   while (height) {
+      int line_count = height > 2047 ? 2047 : height;
+
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_IN_HIGH), 2);
+      PUSH_DATAh(push, src->bo->offset + src_ofst);
+      PUSH_DATA (push, src->bo->offset + src_ofst);
+
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+      PUSH_DATAh(push, dst->bo->offset + dst_ofst);
+      PUSH_DATA (push, dst->bo->offset + dst_ofst);
+
+      /* Tiled sides advance via TILING_POSITION, linear via the offset. */
+      if (!(exec & NVC0_M2MF_EXEC_LINEAR_IN)) {
+         BEGIN_NVC0(push, NVC0_M2MF(TILING_POSITION_IN_X), 2);
+         PUSH_DATA (push, src->x * cpp);
+         PUSH_DATA (push, sy);
+      } else {
+         src_ofst += line_count * src->pitch;
+      }
+      if (!(exec & NVC0_M2MF_EXEC_LINEAR_OUT)) {
+         BEGIN_NVC0(push, NVC0_M2MF(TILING_POSITION_OUT_X), 2);
+         PUSH_DATA (push, dst->x * cpp);
+         PUSH_DATA (push, dy);
+      } else {
+         dst_ofst += line_count * dst->pitch;
+      }
+
+      BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, nblocksx * cpp);
+      PUSH_DATA (push, line_count);
+      BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+      PUSH_DATA (push, exec);
+
+      height -= line_count;
+      sy += line_count;
+      dy += line_count;
+   }
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+/* Copy a 2D rect between two bo's using the dedicated NVE4 COPY engine
+ * (raw method offsets via SUBC_COPY). Either side may be tiled or linear;
+ * linear surfaces must not address a z-slice. One-shot: no line splitting.
+ */
+static void
+nve4_m2mf_transfer_rect(struct nvc0_context *nvc0,
+                        const struct nv50_m2mf_rect *dst,
+                        const struct nv50_m2mf_rect *src,
+                        uint32_t nblocksx, uint32_t nblocksy)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nouveau_bufctx *bctx = nvc0->bufctx;
+   uint32_t exec;
+   uint32_t src_base = src->base;
+   uint32_t dst_base = dst->base;
+   const int cpp = dst->cpp;
+
+   assert(dst->cpp == src->cpp);
+
+   nouveau_bufctx_refn(bctx, 0, dst->bo, dst->domain | NOUVEAU_BO_WR);
+   nouveau_bufctx_refn(bctx, 0, src->bo, src->domain | NOUVEAU_BO_RD);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   exec = 0x200 /* 2D_ENABLE */ | 0x6 /* UNK */;
+
+   /* Linear sides fold x/y into the base offset and set a LINEAR flag. */
+   if (!nouveau_bo_memtype(dst->bo)) {
+      assert(!dst->z);
+      dst_base += dst->y * dst->pitch + dst->x * cpp;
+      exec |= 0x100; /* DST_MODE_2D_LINEAR */
+   }
+   if (!nouveau_bo_memtype(src->bo)) {
+      assert(!src->z);
+      src_base += src->y * src->pitch + src->x * cpp;
+      exec |= 0x080; /* SRC_MODE_2D_LINEAR */
+   }
+
+   /* 0x070c: destination tiling layout and position. */
+   BEGIN_NVC0(push, SUBC_COPY(0x070c), 6);
+   PUSH_DATA (push, 0x1000 | dst->tile_mode);
+   PUSH_DATA (push, dst->pitch);
+   PUSH_DATA (push, dst->height);
+   PUSH_DATA (push, dst->depth);
+   PUSH_DATA (push, dst->z);
+   PUSH_DATA (push, (dst->y << 16) | (dst->x * cpp));
+
+   /* 0x0728: source tiling layout and position. */
+   BEGIN_NVC0(push, SUBC_COPY(0x0728), 6);
+   PUSH_DATA (push, 0x1000 | src->tile_mode);
+   PUSH_DATA (push, src->pitch);
+   PUSH_DATA (push, src->height);
+   PUSH_DATA (push, src->depth);
+   PUSH_DATA (push, src->z);
+   PUSH_DATA (push, (src->y << 16) | (src->x * cpp));
+
+   /* 0x0400: addresses, pitches and extents of the copy. */
+   BEGIN_NVC0(push, SUBC_COPY(0x0400), 8);
+   PUSH_DATAh(push, src->bo->offset + src_base);
+   PUSH_DATA (push, src->bo->offset + src_base);
+   PUSH_DATAh(push, dst->bo->offset + dst_base);
+   PUSH_DATA (push, dst->bo->offset + dst_base);
+   PUSH_DATA (push, src->pitch);
+   PUSH_DATA (push, dst->pitch);
+   PUSH_DATA (push, nblocksx * cpp);
+   PUSH_DATA (push, nblocksy);
+
+   /* 0x0300: kick off the copy. */
+   BEGIN_NVC0(push, SUBC_COPY(0x0300), 1);
+   PUSH_DATA (push, exec);
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+/* Upload @size bytes of @data into @dst at @offset by pushing them inline
+ * through the M2MF DATA method (Fermi path). Data is split into chunks
+ * limited by available pushbuf space and the max packet length.
+ */
+void
+nvc0_m2mf_push_linear(struct nouveau_context *nv,
+                      struct nouveau_bo *dst, unsigned offset, unsigned domain,
+                      unsigned size, const void *data)
+{
+   struct nvc0_context *nvc0 = nvc0_context(&nv->pipe);
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   uint32_t *src = (uint32_t *)data;
+   unsigned count = (size + 3) / 4; /* dwords, rounded up */
+
+   nouveau_bufctx_refn(nvc0->bufctx, 0, dst, domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+   nouveau_pushbuf_validate(push);
+
+   while (count) {
+      unsigned nr;
+
+      if (!PUSH_SPACE(push, 16))
+         break;
+      nr = PUSH_AVAIL(push);
+      assert(nr >= 16);
+      /* 9 dwords of commands (headers + setup) precede the data words. */
+      nr = MIN2(count, nr - 9);
+      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
+
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+      PUSH_DATAh(push, dst->offset + offset);
+      PUSH_DATA (push, dst->offset + offset);
+      BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, MIN2(size, nr * 4));
+      PUSH_DATA (push, 1);
+      BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+      PUSH_DATA (push, 0x100111);
+
+      /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+      BEGIN_NIC0(push, NVC0_M2MF(DATA), nr);
+      PUSH_DATAp(push, src, nr);
+
+      count -= nr;
+      src += nr;
+      offset += nr * 4;
+      size -= nr * 4;
+   }
+
+   nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+/* Upload @size bytes of @data into @dst at @offset via inline P2MF EXEC
+ * data (Kepler path). Same chunking scheme as nvc0_m2mf_push_linear, but
+ * the data dwords ride on the EXEC method itself (BEGIN_1IC0).
+ */
+void
+nve4_p2mf_push_linear(struct nouveau_context *nv,
+                      struct nouveau_bo *dst, unsigned offset, unsigned domain,
+                      unsigned size, const void *data)
+{
+   struct nvc0_context *nvc0 = nvc0_context(&nv->pipe);
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   uint32_t *src = (uint32_t *)data;
+   unsigned count = (size + 3) / 4; /* dwords, rounded up */
+
+   nouveau_bufctx_refn(nvc0->bufctx, 0, dst, domain | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, nvc0->bufctx);
+   nouveau_pushbuf_validate(push);
+
+   while (count) {
+      unsigned nr;
+
+      if (!PUSH_SPACE(push, 16))
+         break;
+      nr = PUSH_AVAIL(push);
+      assert(nr >= 16);
+      /* 8 dwords of commands precede the data; EXEC packet holds nr + 1. */
+      nr = MIN2(count, nr - 8);
+      nr = MIN2(nr, (NV04_PFIFO_MAX_PACKET_LEN - 1));
+
+      BEGIN_NVC0(push, NVE4_P2MF(DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, dst->offset + offset);
+      PUSH_DATA (push, dst->offset + offset);
+      BEGIN_NVC0(push, NVE4_P2MF(LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, MIN2(size, nr * 4));
+      PUSH_DATA (push, 1);
+      /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
+      BEGIN_1IC0(push, NVE4_P2MF(EXEC), nr + 1);
+      PUSH_DATA (push, 0x1001);
+      PUSH_DATAp(push, src, nr);
+
+      count -= nr;
+      src += nr;
+      offset += nr * 4;
+      size -= nr * 4;
+   }
+
+   nouveau_bufctx_reset(nvc0->bufctx, 0);
+}
+
+/* GPU copy of @size bytes between two linear bo ranges using M2MF,
+ * split into chunks of at most 128 KiB (1 << 17) per EXEC.
+ */
+static void
+nvc0_m2mf_copy_linear(struct nouveau_context *nv,
+                      struct nouveau_bo *dst, unsigned dstoff, unsigned dstdom,
+                      struct nouveau_bo *src, unsigned srcoff, unsigned srcdom,
+                      unsigned size)
+{
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   struct nouveau_bufctx *bctx = nvc0_context(&nv->pipe)->bufctx;
+
+   nouveau_bufctx_refn(bctx, 0, src, srcdom | NOUVEAU_BO_RD);
+   nouveau_bufctx_refn(bctx, 0, dst, dstdom | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   while (size) {
+      unsigned bytes = MIN2(size, 1 << 17);
+
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
+      PUSH_DATAh(push, dst->offset + dstoff);
+      PUSH_DATA (push, dst->offset + dstoff);
+      BEGIN_NVC0(push, NVC0_M2MF(OFFSET_IN_HIGH), 2);
+      PUSH_DATAh(push, src->offset + srcoff);
+      PUSH_DATA (push, src->offset + srcoff);
+      BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, bytes);
+      PUSH_DATA (push, 1);
+      BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
+      PUSH_DATA (push, (1 << NVC0_M2MF_EXEC_INC__SHIFT) |
+                 NVC0_M2MF_EXEC_LINEAR_IN | NVC0_M2MF_EXEC_LINEAR_OUT);
+
+      srcoff += bytes;
+      dstoff += bytes;
+      size -= bytes;
+   }
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+/* GPU copy of @size bytes between two linear bo ranges using the NVE4
+ * COPY engine (raw method offsets); a single command, no chunking.
+ */
+static void
+nve4_m2mf_copy_linear(struct nouveau_context *nv,
+                      struct nouveau_bo *dst, unsigned dstoff, unsigned dstdom,
+                      struct nouveau_bo *src, unsigned srcoff, unsigned srcdom,
+                      unsigned size)
+{
+   struct nouveau_pushbuf *push = nv->pushbuf;
+   struct nouveau_bufctx *bctx = nvc0_context(&nv->pipe)->bufctx;
+
+   nouveau_bufctx_refn(bctx, 0, src, srcdom | NOUVEAU_BO_RD);
+   nouveau_bufctx_refn(bctx, 0, dst, dstdom | NOUVEAU_BO_WR);
+   nouveau_pushbuf_bufctx(push, bctx);
+   nouveau_pushbuf_validate(push);
+
+   /* 0x0400: src/dst addresses; 0x0418: byte count; 0x0300: kick. */
+   BEGIN_NVC0(push, SUBC_COPY(0x0400), 4);
+   PUSH_DATAh(push, src->offset + srcoff);
+   PUSH_DATA (push, src->offset + srcoff);
+   PUSH_DATAh(push, dst->offset + dstoff);
+   PUSH_DATA (push, dst->offset + dstoff);
+   BEGIN_NVC0(push, SUBC_COPY(0x0418), 1);
+   PUSH_DATA (push, size);
+   BEGIN_NVC0(push, SUBC_COPY(0x0300), 1);
+   PUSH_DATA (push, 0x186);
+
+   nouveau_bufctx_reset(bctx, 0);
+}
+
+
+/* A miptree may be mapped directly only if it is a non-VRAM, linear
+ * (no memtype) STAGING resource.
+ */
+static INLINE boolean
+nvc0_mt_transfer_can_map_directly(struct nv50_miptree *mt)
+{
+   return mt->base.domain != NOUVEAU_BO_VRAM &&
+      mt->base.base.usage == PIPE_USAGE_STAGING &&
+      !nouveau_bo_memtype(mt->base.bo);
+}
+
+/* Wait until the miptree's bo is safe to access for the given transfer
+ * usage. Returns TRUE on success, FALSE if the wait failed.
+ */
+static INLINE boolean
+nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
+{
+   if (!mt->base.mm) {
+      /* Not sub-allocated: wait directly on the bo. */
+      uint32_t access = (usage & PIPE_TRANSFER_WRITE) ?
+         NOUVEAU_BO_WR : NOUVEAU_BO_RD;
+      return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client);
+   }
+   /* Writes must wait for all access, reads only for pending writes. */
+   if (usage & PIPE_TRANSFER_WRITE)
+      return !mt->base.fence || nouveau_fence_wait(mt->base.fence);
+   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr);
+}
+
+/* Map a region of a miptree for CPU access. Linear staging resources are
+ * mapped directly; everything else is copied through an intermediate
+ * linear GART bo (rect[1]), read back via m2mf_copy_rect if requested.
+ */
+void *
+nvc0_miptree_transfer_map(struct pipe_context *pctx,
+                          struct pipe_resource *res,
+                          unsigned level,
+                          unsigned usage,
+                          const struct pipe_box *box,
+                          struct pipe_transfer **ptransfer)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pctx);
+   struct nouveau_device *dev = nvc0->screen->base.device;
+   struct nv50_miptree *mt = nv50_miptree(res);
+   struct nvc0_transfer *tx;
+   uint32_t size;
+   int ret;
+   unsigned flags = 0;
+
+   /* Try the direct path first; fall back to staging unless the caller
+    * demanded MAP_DIRECTLY, in which case failure returns NULL.
+    */
+   if (nvc0_mt_transfer_can_map_directly(mt)) {
+      ret = !nvc0_mt_sync(nvc0, mt, usage);
+      if (!ret)
+         ret = nouveau_bo_map(mt->base.bo, 0, NULL);
+      if (ret &&
+          (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+         return NULL;
+      if (!ret)
+         usage |= PIPE_TRANSFER_MAP_DIRECTLY;
+   } else
+   if (usage & PIPE_TRANSFER_MAP_DIRECTLY)
+      return NULL;
+
+   tx = CALLOC_STRUCT(nvc0_transfer);
+   if (!tx)
+      return NULL;
+
+   pipe_resource_reference(&tx->base.resource, res);
+
+   tx->base.level = level;
+   tx->base.usage = usage;
+   tx->base.box = *box;
+
+   /* Multisampled plain formats scale by the sample grid; compressed
+    * formats count blocks.
+    */
+   if (util_format_is_plain(res->format)) {
+      tx->nblocksx = box->width << mt->ms_x;
+      tx->nblocksy = box->height << mt->ms_y;
+   } else {
+      tx->nblocksx = util_format_get_nblocksx(res->format, box->width);
+      tx->nblocksy = util_format_get_nblocksy(res->format, box->height);
+   }
+   tx->nlayers = box->depth;
+
+   tx->base.stride = tx->nblocksx * util_format_get_blocksize(res->format);
+   tx->base.layer_stride = tx->nblocksy * tx->base.stride;
+
+   if (usage & PIPE_TRANSFER_MAP_DIRECTLY) {
+      tx->base.stride = align(tx->base.stride, 128);
+      *ptransfer = &tx->base;
+      /* NOTE(review): layer_stride above was computed from the unaligned
+       * stride and the box offset is not applied to the returned pointer —
+       * presumably this path is only hit for whole-resource staging maps;
+       * confirm against callers.
+       */
+      return mt->base.bo->map + mt->base.offset;
+   }
+
+   /* Staging path: describe the texture region and allocate a linear bo. */
+   nv50_m2mf_rect_setup(&tx->rect[0], res, level, box->x, box->y, box->z);
+
+   size = tx->base.layer_stride;
+
+   ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0,
+                        size * tx->nlayers, NULL, &tx->rect[1].bo);
+   if (ret) {
+      pipe_resource_reference(&tx->base.resource, NULL);
+      FREE(tx);
+      return NULL;
+   }
+
+   tx->rect[1].cpp = tx->rect[0].cpp;
+   tx->rect[1].width = tx->nblocksx;
+   tx->rect[1].height = tx->nblocksy;
+   tx->rect[1].depth = 1;
+   tx->rect[1].pitch = tx->base.stride;
+   tx->rect[1].domain = NOUVEAU_BO_GART;
+
+   if (usage & PIPE_TRANSFER_READ) {
+      /* Copy each layer from the texture into the staging bo, then
+       * restore the rect state for the write-back in transfer_unmap.
+       */
+      unsigned base = tx->rect[0].base;
+      unsigned z = tx->rect[0].z;
+      unsigned i;
+      for (i = 0; i < tx->nlayers; ++i) {
+         nvc0->m2mf_copy_rect(nvc0, &tx->rect[1], &tx->rect[0],
+                              tx->nblocksx, tx->nblocksy);
+         if (mt->layout_3d)
+            tx->rect[0].z++;
+         else
+            tx->rect[0].base += mt->layer_stride;
+         tx->rect[1].base += size;
+      }
+      tx->rect[0].z = z;
+      tx->rect[0].base = base;
+      tx->rect[1].base = 0;
+   }
+
+   if (tx->rect[1].bo->map) {
+      *ptransfer = &tx->base;
+      return tx->rect[1].bo->map;
+   }
+
+   if (usage & PIPE_TRANSFER_READ)
+      flags = NOUVEAU_BO_RD;
+   if (usage & PIPE_TRANSFER_WRITE)
+      flags |= NOUVEAU_BO_WR;
+
+   ret = nouveau_bo_map(tx->rect[1].bo, flags, nvc0->screen->base.client);
+   if (ret) {
+      pipe_resource_reference(&tx->base.resource, NULL);
+      nouveau_bo_ref(NULL, &tx->rect[1].bo);
+      FREE(tx);
+      return NULL;
+   }
+
+   *ptransfer = &tx->base;
+   return tx->rect[1].bo->map;
+}
+
+/* Finish a miptree transfer: write staged data back to the texture for
+ * write transfers, then release the staging bo and the transfer object.
+ */
+void
+nvc0_miptree_transfer_unmap(struct pipe_context *pctx,
+                            struct pipe_transfer *transfer)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pctx);
+   struct nvc0_transfer *tx = (struct nvc0_transfer *)transfer;
+   struct nv50_miptree *mt = nv50_miptree(tx->base.resource);
+   unsigned i;
+
+   /* Direct maps have no staging bo to flush or free. */
+   if (tx->base.usage & PIPE_TRANSFER_MAP_DIRECTLY) {
+      pipe_resource_reference(&transfer->resource, NULL);
+
+      FREE(tx);
+      return;
+   }
+
+   if (tx->base.usage & PIPE_TRANSFER_WRITE) {
+      /* Copy each layer from the staging bo back into the texture. */
+      for (i = 0; i < tx->nlayers; ++i) {
+         nvc0->m2mf_copy_rect(nvc0, &tx->rect[0], &tx->rect[1],
+                              tx->nblocksx, tx->nblocksy);
+         if (mt->layout_3d)
+            tx->rect[0].z++;
+         else
+            tx->rect[0].base += mt->layer_stride;
+         tx->rect[1].base += tx->nblocksy * tx->base.stride;
+      }
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_transfers_wr, 1);
+   }
+   if (tx->base.usage & PIPE_TRANSFER_READ)
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, tex_transfers_rd, 1);
+
+   nouveau_bo_ref(NULL, &tx->rect[1].bo);
+   pipe_resource_reference(&transfer->resource, NULL);
+
+   FREE(tx);
+}
+
+/* This happens rather often with DTD9/st. */
+void
+nvc0_cb_push(struct nouveau_context *nv,
+ struct nouveau_bo *bo, unsigned domain,
+ unsigned base, unsigned size,
+ unsigned offset, unsigned words, const uint32_t *data)
+{
+ struct nouveau_pushbuf *push = nv->pushbuf;
+
+ NOUVEAU_DRV_STAT(nv->screen, constbuf_upload_count, 1);
+ NOUVEAU_DRV_STAT(nv->screen, constbuf_upload_bytes, words * 4);
+
+ assert(!(offset & 3));
+ size = align(size, 0x100);
+
+ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+ PUSH_DATA (push, size);
+ PUSH_DATAh(push, bo->offset + base);
+ PUSH_DATA (push, bo->offset + base);
+
+ while (words) {
+ unsigned nr = PUSH_AVAIL(push);
+ nr = MIN2(nr, words);
+ nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);
+
+ PUSH_SPACE(push, nr + 2);
+ PUSH_REFN (push, bo, NOUVEAU_BO_WR | domain);
+ BEGIN_1IC0(push, NVC0_3D(CB_POS), nr + 1);
+ PUSH_DATA (push, offset);
+ PUSH_DATAp(push, data, nr);
+
+ words -= nr;
+ data += nr;
+ offset += nr * 4;
+ }
+}
+
+/* Hook up the generation-specific rect-copy, linear-copy, data-push and
+ * constbuf-push entry points on the context.
+ */
+void
+nvc0_init_transfer_functions(struct nvc0_context *nvc0)
+{
+   const boolean is_nve4 = nvc0->screen->base.class_3d >= NVE4_3D_CLASS;
+
+   nvc0->m2mf_copy_rect =
+      is_nve4 ? nve4_m2mf_transfer_rect : nvc0_m2mf_transfer_rect;
+   nvc0->base.copy_data =
+      is_nve4 ? nve4_m2mf_copy_linear : nvc0_m2mf_copy_linear;
+   nvc0->base.push_data =
+      is_nve4 ? nve4_p2mf_push_linear : nvc0_m2mf_push_linear;
+   nvc0->base.push_cb = nvc0_cb_push;
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
new file mode 100644
index 00000000000..c4bc7dc693b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -0,0 +1,891 @@
+/*
+ * Copyright 2010 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NVC0_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+
+#include "nvc0/nvc0_3d.xml.h"
+
+/* pipe_context::delete_vertex_elements_state: release the translate object
+ * (if one was created) and free the CSO itself. */
+void
+nvc0_vertex_state_delete(struct pipe_context *pipe,
+                         void *hwcso)
+{
+   struct nvc0_vertex_stateobj *so = hwcso;
+
+   if (so->translate)
+      so->translate->release(so->translate);
+   FREE(hwcso);
+}
+
+/* pipe_context::create_vertex_elements_state.
+ *
+ * Builds a CSO holding both the hardware VERTEX_ATTRIB_FORMAT words for each
+ * element and a translate_key so the same layout can be used for the
+ * CPU-side "push" (translate) vertex submission path. Also records, per
+ * vertex buffer, the byte range accessed and the minimum instance divisor.
+ */
+void *
+nvc0_vertex_state_create(struct pipe_context *pipe,
+                         unsigned num_elements,
+                         const struct pipe_vertex_element *elements)
+{
+   struct nvc0_vertex_stateobj *so;
+   struct translate_key transkey;
+   unsigned i;
+   unsigned src_offset_max = 0;
+
+   /* element[] is a flexible-style trailing array sized here */
+   so = MALLOC(sizeof(*so) +
+               num_elements * sizeof(struct nvc0_vertex_element));
+   if (!so)
+      return NULL;
+   so->num_elements = num_elements;
+   so->instance_elts = 0;
+   so->instance_bufs = 0;
+   so->shared_slots = FALSE;
+   so->need_conversion = FALSE;
+
+   memset(so->vb_access_size, 0, sizeof(so->vb_access_size));
+
+   for (i = 0; i < PIPE_MAX_ATTRIBS; ++i)
+      so->min_instance_div[i] = 0xffffffff;
+
+   transkey.nr_elements = 0;
+   transkey.output_stride = 0;
+
+   for (i = 0; i < num_elements; ++i) {
+      const struct pipe_vertex_element *ve = &elements[i];
+      const unsigned vbi = ve->vertex_buffer_index;
+      unsigned size;
+      enum pipe_format fmt = ve->src_format;
+
+      so->element[i].pipe = elements[i];
+      so->element[i].state = nvc0_format_table[fmt].vtx;
+
+      /* format not directly supported by the vertex fetcher: fall back to
+       * an R32*_FLOAT format with the same component count; the translate
+       * path will do the conversion (need_conversion forces push mode) */
+      if (!so->element[i].state) {
+         switch (util_format_get_nr_components(fmt)) {
+         case 1: fmt = PIPE_FORMAT_R32_FLOAT; break;
+         case 2: fmt = PIPE_FORMAT_R32G32_FLOAT; break;
+         case 3: fmt = PIPE_FORMAT_R32G32B32_FLOAT; break;
+         case 4: fmt = PIPE_FORMAT_R32G32B32A32_FLOAT; break;
+         default:
+            assert(0);
+            FREE(so);
+            return NULL;
+         }
+         so->element[i].state = nvc0_format_table[fmt].vtx;
+         so->need_conversion = TRUE;
+      }
+      size = util_format_get_blocksize(fmt);
+
+      src_offset_max = MAX2(src_offset_max, ve->src_offset);
+
+      /* track how far into each vertex buffer any element reaches */
+      if (so->vb_access_size[vbi] < (ve->src_offset + size))
+         so->vb_access_size[vbi] = ve->src_offset + size;
+
+      if (unlikely(ve->instance_divisor)) {
+         so->instance_elts |= 1 << i;
+         so->instance_bufs |= 1 << vbi;
+         if (ve->instance_divisor < so->min_instance_div[vbi])
+            so->min_instance_div[vbi] = ve->instance_divisor;
+      }
+
+      if (1) {
+         unsigned ca;
+         unsigned j = transkey.nr_elements++;
+
+         /* align each translated element to its channel size (capped at 4) */
+         ca = util_format_description(fmt)->channel[0].size / 8;
+         if (ca != 1 && ca != 2)
+            ca = 4;
+
+         transkey.element[j].type = TRANSLATE_ELEMENT_NORMAL;
+         transkey.element[j].input_format = ve->src_format;
+         transkey.element[j].input_buffer = vbi;
+         transkey.element[j].input_offset = ve->src_offset;
+         transkey.element[j].instance_divisor = ve->instance_divisor;
+
+         transkey.output_stride = align(transkey.output_stride, ca);
+         transkey.element[j].output_format = fmt;
+         transkey.element[j].output_offset = transkey.output_stride;
+         transkey.output_stride += size;
+
+         /* alternate attrib state used in push (translate) mode: same
+          * format, but offset within the interleaved translate output */
+         so->element[i].state_alt = so->element[i].state;
+         so->element[i].state_alt |= transkey.element[j].output_offset << 7;
+      }
+
+      /* default (non-shared) mode: one hw vertex array per element */
+      so->element[i].state |= i << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT;
+   }
+   transkey.output_stride = align(transkey.output_stride, 4);
+
+   so->size = transkey.output_stride;
+   so->translate = translate_create(&transkey);
+
+   /* shared-slots optimization: if there is no instancing and all source
+    * offsets fit the attrib format's 14-bit offset field, attribs can
+    * reference the pipe vertex buffer slots directly */
+   if (so->instance_elts || src_offset_max >= (1 << 14))
+      return so;
+   so->shared_slots = TRUE;
+
+   for (i = 0; i < num_elements; ++i) {
+      const unsigned b = elements[i].vertex_buffer_index;
+      const unsigned s = elements[i].src_offset;
+      so->element[i].state &= ~NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__MASK;
+      so->element[i].state |= b << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT;
+      so->element[i].state |= s << NVC0_3D_VERTEX_ATTRIB_FORMAT_OFFSET__SHIFT;
+   }
+   return so;
+}
+
+/* Attrib format word for an unused attribute slot (constant float). */
+#define NVC0_3D_VERTEX_ATTRIB_INACTIVE \
+   NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT | \
+   NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 | NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST
+
+/* Build a VTX_ATTR_DEFINE word: attribute index a, c components,
+ * component type t (FLOAT/SINT/UINT) and per-component size s in bits. */
+#define VTX_ATTR(a, c, t, s)                            \
+   ((NVC0_3D_VTX_ATTR_DEFINE_TYPE_##t) |                \
+    (NVC0_3D_VTX_ATTR_DEFINE_SIZE_##s) |                \
+    ((a) << NVC0_3D_VTX_ATTR_DEFINE_ATTR__SHIFT) |      \
+    ((c) << NVC0_3D_VTX_ATTR_DEFINE_COMP__SHIFT))
+
+/* Program attribute a as a constant: unpack one vertex worth of data from
+ * the user buffer into the pushbuf as four 32-bit components via
+ * VTX_ATTR_DEFINE. Only valid for user (non-bo) vertex buffers. */
+static void
+nvc0_set_constant_vertex_attrib(struct nvc0_context *nvc0, const unsigned a)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct pipe_vertex_element *ve = &nvc0->vertex->element[a].pipe;
+   struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[ve->vertex_buffer_index];
+   uint32_t mode;
+   const struct util_format_description *desc;
+   void *dst;
+   const void *src = (const uint8_t *)vb->user_buffer + ve->src_offset;
+   assert(!vb->buffer);
+
+   desc = util_format_description(ve->src_format);
+
+   PUSH_SPACE(push, 6);
+   BEGIN_NVC0(push, NVC0_3D(VTX_ATTR_DEFINE), 5);
+   /* unpack directly into the pushbuf, after the mode word */
+   dst = &push->cur[1];
+   if (desc->channel[0].pure_integer) {
+      if (desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
+         mode = VTX_ATTR(a, 4, SINT, 32);
+         desc->unpack_rgba_sint(dst, 0, src, 0, 1, 1);
+      } else {
+         mode = VTX_ATTR(a, 4, UINT, 32);
+         desc->unpack_rgba_uint(dst, 0, src, 0, 1, 1);
+      }
+   } else {
+      mode = VTX_ATTR(a, 4, FLOAT, 32);
+      desc->unpack_rgba_float(dst, 0, src, 0, 1, 1);
+   }
+   /* mode word goes first, then advance past mode + 4 components */
+   push->cur[0] = mode;
+   push->cur += 5;
+}
+
+/* Compute the byte range [*base, *base + *size) of user vertex buffer vbi
+ * that the current draw will access, from either instance bounds (for
+ * instanced buffers) or element index bounds. */
+static INLINE void
+nvc0_user_vbuf_range(struct nvc0_context *nvc0, int vbi,
+                     uint32_t *base, uint32_t *size)
+{
+   if (unlikely(nvc0->vertex->instance_bufs & (1 << vbi))) {
+      /* instanced buffer: advanced once per div instances */
+      const uint32_t div = nvc0->vertex->min_instance_div[vbi];
+      *base = nvc0->instance_off * nvc0->vtxbuf[vbi].stride;
+      *size = (nvc0->instance_max / div) * nvc0->vtxbuf[vbi].stride +
+         nvc0->vertex->vb_access_size[vbi];
+   } else {
+      /* NOTE: if there are user buffers, we *must* have index bounds */
+      assert(nvc0->vb_elt_limit != ~0);
+      *base = nvc0->vb_elt_first * nvc0->vtxbuf[vbi].stride;
+      *size = nvc0->vb_elt_limit * nvc0->vtxbuf[vbi].stride +
+         nvc0->vertex->vb_access_size[vbi];
+   }
+}
+
+/* Drop references to temporary scratch copies of user vertex buffers and
+ * let the scratch allocator recycle the memory. */
+static INLINE void
+nvc0_release_user_vbufs(struct nvc0_context *nvc0)
+{
+   if (nvc0->vbo_user) {
+      nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
+      nouveau_scratch_done(&nvc0->base);
+   }
+}
+
+/* Non-shared-slots path: upload the accessed range of each user vertex
+ * buffer to scratch memory and program per-element start/limit addresses
+ * through the VERTEX_ARRAY_SELECT macro. Zero-stride buffers become
+ * constant attributes instead. */
+static void
+nvc0_update_user_vbufs(struct nvc0_context *nvc0)
+{
+   uint64_t address[PIPE_MAX_ATTRIBS];
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   int i;
+   uint32_t written = 0;   /* bitmask of buffers already uploaded */
+
+   PUSH_SPACE(push, nvc0->vertex->num_elements * 8);
+   for (i = 0; i < nvc0->vertex->num_elements; ++i) {
+      struct pipe_vertex_element *ve = &nvc0->vertex->element[i].pipe;
+      const unsigned b = ve->vertex_buffer_index;
+      struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[b];
+      uint32_t base, size;
+
+      if (!(nvc0->vbo_user & (1 << b)))
+         continue;
+      if (!vb->stride) {
+         /* zero stride: every vertex reads the same value */
+         nvc0_set_constant_vertex_attrib(nvc0, i);
+         continue;
+      }
+      nvc0_user_vbuf_range(nvc0, b, &base, &size);
+
+      /* upload each user buffer only once, even if referenced by
+       * multiple elements */
+      if (!(written & (1 << b))) {
+         struct nouveau_bo *bo;
+         const uint32_t bo_flags = NOUVEAU_BO_RD | NOUVEAU_BO_GART;
+         written |= 1 << b;
+         address[b] = nouveau_scratch_data(&nvc0->base, vb->user_buffer,
+                                          base, size, &bo);
+         if (bo)
+            BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, bo_flags, bo);
+
+         NOUVEAU_DRV_STAT(&nvc0->screen->base, user_buffer_upload_bytes, size);
+      }
+
+      /* macro args: element index, limit (hi/lo), start (hi/lo) */
+      BEGIN_1IC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_SELECT), 5);
+      PUSH_DATA (push, i);
+      PUSH_DATAh(push, address[b] + base + size - 1);
+      PUSH_DATA (push, address[b] + base + size - 1);
+      PUSH_DATAh(push, address[b] + ve->src_offset);
+      PUSH_DATA (push, address[b] + ve->src_offset);
+   }
+   nvc0->base.vbo_dirty = TRUE;
+}
+
+/* Shared-slots path: upload each non-constant user vertex buffer to
+ * scratch and program its array address per buffer slot (not per element),
+ * then re-emit constant attributes. */
+static void
+nvc0_update_user_vbufs_shared(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint32_t mask = nvc0->vbo_user & ~nvc0->constant_vbos;
+
+   PUSH_SPACE(push, nvc0->num_vtxbufs * 8);
+   while (mask) {
+      struct nouveau_bo *bo;
+      const uint32_t bo_flags = NOUVEAU_BO_RD | NOUVEAU_BO_GART;
+      uint64_t address;
+      uint32_t base, size;
+      const int b = ffs(mask) - 1;
+      mask &= ~(1 << b);
+
+      nvc0_user_vbuf_range(nvc0, b, &base, &size);
+
+      address = nouveau_scratch_data(&nvc0->base, nvc0->vtxbuf[b].user_buffer,
+                                    base, size, &bo);
+      if (bo)
+         BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, bo_flags, bo);
+
+      /* macro args: buffer slot, limit (hi/lo), start (hi/lo) */
+      BEGIN_1IC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_SELECT), 5);
+      PUSH_DATA (push, b);
+      PUSH_DATAh(push, address + base + size - 1);
+      PUSH_DATA (push, address + base + size - 1);
+      PUSH_DATAh(push, address);
+      PUSH_DATA (push, address);
+
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, user_buffer_upload_bytes, size);
+   }
+
+   /* constant (zero-stride) elements get their values pushed inline */
+   mask = nvc0->state.constant_elts;
+   while (mask) {
+      int i = ffs(mask) - 1;
+      mask &= ~(1 << i);
+      nvc0_set_constant_vertex_attrib(nvc0, i);
+   }
+}
+
+/* Non-shared-slots validation: program one hardware vertex array per
+ * vertex element (stride, divisor, start address, limit) and reference the
+ * backing resources. User-buffer addresses are filled in later by
+ * nvc0_update_user_vbufs. */
+static void
+nvc0_validate_vertex_buffers(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const struct nvc0_vertex_stateobj *vertex = nvc0->vertex;
+   uint32_t refd = 0;   /* buffers already added to the bufctx */
+   unsigned i;
+
+   PUSH_SPACE(push, vertex->num_elements * 8);
+   for (i = 0; i < vertex->num_elements; ++i) {
+      const struct nvc0_vertex_element *ve;
+      const struct pipe_vertex_buffer *vb;
+      struct nv04_resource *res;
+      unsigned b;
+      unsigned limit, offset;
+
+      /* constant attribs don't fetch from an array */
+      if (nvc0->state.constant_elts & (1 << i))
+         continue;
+      ve = &vertex->element[i];
+      b = ve->pipe.vertex_buffer_index;
+      vb = &nvc0->vtxbuf[b];
+
+      if (!vb->buffer) {
+         /* user buffer: only stride/divisor can be set here */
+         if (vb->stride) {
+            if (ve->pipe.instance_divisor) {
+               BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_DIVISOR(i)), 1);
+               PUSH_DATA (push, ve->pipe.instance_divisor);
+            }
+            BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 1);
+            PUSH_DATA (push, (1 << 12) | vb->stride);
+         }
+         /* address/value set in nvc0_update_user_vbufs */
+         continue;
+      }
+      res = nv04_resource(vb->buffer);
+      offset = ve->pipe.src_offset + vb->buffer_offset;
+      limit = vb->buffer->width0 - 1;
+
+      if (unlikely(ve->pipe.instance_divisor)) {
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 4);
+         PUSH_DATA (push, (1 << 12) | vb->stride);
+         PUSH_DATAh(push, res->address + offset);
+         PUSH_DATA (push, res->address + offset);
+         PUSH_DATA (push, ve->pipe.instance_divisor);
+      } else {
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 3);
+         PUSH_DATA (push, (1 << 12) | vb->stride);
+         PUSH_DATAh(push, res->address + offset);
+         PUSH_DATA (push, res->address + offset);
+      }
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
+      PUSH_DATAh(push, res->address + limit);
+      PUSH_DATA (push, res->address + limit);
+
+      /* reference each buffer only once */
+      if (!(refd & (1 << b))) {
+         refd |= 1 << b;
+         BCTX_REFN(nvc0->bufctx_3d, VTX, res, RD);
+      }
+   }
+   if (nvc0->vbo_user)
+      nvc0_update_user_vbufs(nvc0);
+}
+
+/* Shared-slots validation: program one hardware vertex array per pipe
+ * vertex buffer slot (attribs carry their own offsets). User-buffer
+ * addresses are deferred to nvc0_update_user_vbufs_shared. */
+static void
+nvc0_validate_vertex_buffers_shared(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned b;
+   const uint32_t mask = nvc0->vbo_user;
+
+   PUSH_SPACE(push, nvc0->num_vtxbufs * 8);
+   for (b = 0; b < nvc0->num_vtxbufs; ++b) {
+      struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[b];
+      struct nv04_resource *buf;
+      uint32_t offset, limit;
+
+      if (mask & (1 << b)) {
+         /* user buffer: enable fetch with stride only */
+         if (vb->stride) {
+            BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 1);
+            PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
+         }
+         /* address/value set in nvc0_update_user_vbufs_shared */
+         continue;
+      }
+      buf = nv04_resource(vb->buffer);
+      offset = vb->buffer_offset;
+      limit = buf->base.width0 - 1;
+
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(b)), 3);
+      PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | vb->stride);
+      PUSH_DATAh(push, buf->address + offset);
+      PUSH_DATA (push, buf->address + offset);
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(b)), 2);
+      PUSH_DATAh(push, buf->address + limit);
+      PUSH_DATA (push, buf->address + limit);
+
+      BCTX_REFN(nvc0->bufctx_3d, VTX, buf, RD);
+   }
+   if (nvc0->vbo_user)
+      nvc0_update_user_vbufs_shared(nvc0);
+}
+
+/* Validate all vertex-fetch state for the next draw.
+ *
+ * Selects a vbo_mode: 0 = normal hardware fetch, 1 = CPU push (translate)
+ * chosen by heuristic for user buffers, 3 = push forced by format
+ * conversion or shader-consumed edge flags. Re-emits attrib formats and
+ * per-array state when the vertex CSO, constant-buffer set or mode
+ * changed, then programs the vertex arrays (unless in push mode, where
+ * nvc0_push_vbo handles submission). */
+void
+nvc0_vertex_arrays_validate(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_vertex_stateobj *vertex = nvc0->vertex;
+   struct nvc0_vertex_element *ve;
+   uint32_t const_vbos;
+   unsigned i;
+   uint8_t vbo_mode;
+   boolean update_vertex;
+
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX);
+
+   if (unlikely(vertex->need_conversion) ||
+       unlikely(nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS)) {
+      /* formats the fetcher can't handle, or edge flags: must push */
+      vbo_mode = 3;
+   } else {
+      vbo_mode = (nvc0->vbo_user && nvc0->vbo_push_hint) ? 1 : 0;
+   }
+   /* in push mode, zero-stride buffers are baked into the vertex stream */
+   const_vbos = vbo_mode ? 0 : nvc0->constant_vbos;
+
+   update_vertex = (nvc0->dirty & NVC0_NEW_VERTEX) ||
+      (const_vbos != nvc0->state.constant_vbos) ||
+      (vbo_mode != nvc0->state.vbo_mode);
+
+   if (update_vertex) {
+      /* also deactivate attribs left over from a larger previous CSO */
+      const unsigned n = MAX2(vertex->num_elements, nvc0->state.num_vtxelts);
+
+      nvc0->state.constant_vbos = const_vbos;
+      nvc0->state.constant_elts = 0;
+      nvc0->state.num_vtxelts = vertex->num_elements;
+      nvc0->state.vbo_mode = vbo_mode;
+
+      if (unlikely(vbo_mode)) {
+         if (unlikely(nvc0->state.instance_elts & 3)) {
+            /* translate mode uses only 2 vertex buffers */
+            nvc0->state.instance_elts &= ~3;
+            PUSH_SPACE(push, 3);
+            BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_PER_INSTANCE(0)), 2);
+            PUSH_DATA (push, 0);
+            PUSH_DATA (push, 0);
+         }
+
+         PUSH_SPACE(push, n * 2 + 4);
+
+         /* push mode uses the alternate (interleaved-offset) formats */
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(0)), n);
+         for (i = 0; i < vertex->num_elements; ++i)
+            PUSH_DATA(push, vertex->element[i].state_alt);
+         for (; i < n; ++i)
+            PUSH_DATA(push, NVC0_3D_VERTEX_ATTRIB_INACTIVE);
+
+         /* single interleaved array 0; disable the rest */
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(0)), 1);
+         PUSH_DATA (push, (1 << 12) | vertex->size);
+         for (i = 1; i < n; ++i)
+            IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
+      } else {
+         uint32_t *restrict data;
+
+         if (unlikely(vertex->instance_elts != nvc0->state.instance_elts)) {
+            nvc0->state.instance_elts = vertex->instance_elts;
+            assert(n); /* if (n == 0), both masks should be 0 */
+            PUSH_SPACE(push, 3);
+            BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
+            PUSH_DATA (push, n);
+            PUSH_DATA (push, vertex->instance_elts);
+         }
+
+         PUSH_SPACE(push, n * 2 + 1);
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(0)), n);
+         /* write attrib words directly into the pushbuf */
+         data = push->cur;
+         push->cur += n;
+         for (i = 0; i < vertex->num_elements; ++i) {
+            ve = &vertex->element[i];
+            data[i] = ve->state;
+            if (unlikely(const_vbos & (1 << ve->pipe.vertex_buffer_index))) {
+               nvc0->state.constant_elts |= 1 << i;
+               data[i] |= NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST;
+               IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
+            }
+         }
+         for (; i < n; ++i) {
+            data[i] = NVC0_3D_VERTEX_ATTRIB_INACTIVE;
+            IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
+         }
+      }
+   }
+   if (nvc0->state.vbo_mode) /* using translate, don't set up arrays here */
+      return;
+
+   if (vertex->shared_slots)
+      nvc0_validate_vertex_buffers_shared(nvc0);
+   else
+      nvc0_validate_vertex_buffers(nvc0);
+}
+
+/* Program the hardware index buffer: start/end addresses and the log2 of
+ * the index size. Requires a GPU-mapped (non-user) buffer. */
+void
+nvc0_idxbuf_validate(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv04_resource *buf = nv04_resource(nvc0->idxbuf.buffer);
+
+   assert(buf);
+   assert(nouveau_resource_mapped_by_gpu(&buf->base));
+
+   PUSH_SPACE(push, 6);
+   BEGIN_NVC0(push, NVC0_3D(INDEX_ARRAY_START_HIGH), 5);
+   PUSH_DATAh(push, buf->address + nvc0->idxbuf.offset);
+   PUSH_DATA (push, buf->address + nvc0->idxbuf.offset);
+   PUSH_DATAh(push, buf->address + buf->base.width0 - 1);
+   PUSH_DATA (push, buf->address + buf->base.width0 - 1);
+   /* 1 -> 0, 2 -> 1, 4 -> 2: index size encoded as log2 */
+   PUSH_DATA (push, nvc0->idxbuf.index_size >> 1);
+
+   BCTX_REFN(nvc0->bufctx_3d, IDX, buf, RD);
+}
+
+/* Map a PIPE_PRIM_* enum value to the matching VERTEX_BEGIN_GL primitive. */
+#define NVC0_PRIM_GL_CASE(n) \
+   case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
+
+static INLINE unsigned
+nvc0_prim_gl(unsigned prim)
+{
+   switch (prim) {
+   NVC0_PRIM_GL_CASE(POINTS);
+   NVC0_PRIM_GL_CASE(LINES);
+   NVC0_PRIM_GL_CASE(LINE_LOOP);
+   NVC0_PRIM_GL_CASE(LINE_STRIP);
+   NVC0_PRIM_GL_CASE(TRIANGLES);
+   NVC0_PRIM_GL_CASE(TRIANGLE_STRIP);
+   NVC0_PRIM_GL_CASE(TRIANGLE_FAN);
+   NVC0_PRIM_GL_CASE(QUADS);
+   NVC0_PRIM_GL_CASE(QUAD_STRIP);
+   NVC0_PRIM_GL_CASE(POLYGON);
+   NVC0_PRIM_GL_CASE(LINES_ADJACENCY);
+   NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
+   NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
+   NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
+   /*
+   NVC0_PRIM_GL_CASE(PATCHES); */
+   default:
+      /* unknown primitive type: fall back to points */
+      return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
+   }
+}
+
+/* Pushbuf flush callback installed during draws: update fence state and
+ * count the submission for driver statistics. */
+static void
+nvc0_draw_vbo_kick_notify(struct nouveau_pushbuf *push)
+{
+   struct nvc0_screen *screen = push->user_priv;
+
+   nouveau_fence_update(&screen->base, TRUE);
+
+   NOUVEAU_DRV_STAT(&screen->base, pushbuf_count, 1);
+}
+
+/* Emit a non-indexed draw: one BEGIN/FIRST+COUNT/END sequence per
+ * instance, with INSTANCE_NEXT set on all but the first iteration. */
+static void
+nvc0_draw_arrays(struct nvc0_context *nvc0,
+                 unsigned mode, unsigned start, unsigned count,
+                 unsigned instance_count)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned prim;
+
+   /* non-indexed draws ignore index bias; reset it if set */
+   if (nvc0->state.index_bias) {
+      PUSH_SPACE(push, 1);
+      IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+      nvc0->state.index_bias = 0;
+   }
+
+   prim = nvc0_prim_gl(mode);
+
+   while (instance_count--) {
+      PUSH_SPACE(push, 6);
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, prim);
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
+      PUSH_DATA (push, start);
+      PUSH_DATA (push, count);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
+
+      /* subsequent iterations advance the instance counter */
+      prim |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+   }
+   NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_array, 1);
+}
+
+/* Push 8-bit indices inline: leftover (count % 4) indices go out one per
+ * word via VB_ELEMENT_U32, then the rest packed 4-per-word via
+ * VB_ELEMENT_U8. */
+static void
+nvc0_draw_elements_inline_u08(struct nouveau_pushbuf *push, const uint8_t *map,
+                              unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 3) {
+      unsigned i;
+      PUSH_SPACE(push, 4);
+      BEGIN_NIC0(push, NVC0_3D(VB_ELEMENT_U32), count & 3);
+      for (i = 0; i < (count & 3); ++i)
+         PUSH_DATA(push, *map++);
+      count &= ~3;
+   }
+   while (count) {
+      /* nr = number of packed dwords this packet (4 indices each) */
+      unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 4) / 4;
+
+      PUSH_SPACE(push, nr + 1);
+      BEGIN_NIC0(push, NVC0_3D(VB_ELEMENT_U8), nr);
+      for (i = 0; i < nr; ++i) {
+         PUSH_DATA(push,
+                   (map[3] << 24) | (map[2] << 16) | (map[1] << 8) | map[0]);
+         map += 4;
+      }
+      count -= nr * 4;
+   }
+}
+
+/* Push 16-bit indices inline: an odd leading index goes out via
+ * VB_ELEMENT_U32, then pairs packed into dwords via VB_ELEMENT_U16. */
+static void
+nvc0_draw_elements_inline_u16(struct nouveau_pushbuf *push, const uint16_t *map,
+                              unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 1) {
+      count &= ~1;
+      PUSH_SPACE(push, 2);
+      BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *map++);
+   }
+   while (count) {
+      /* nr = number of packed dwords this packet (2 indices each) */
+      unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 2) / 2;
+
+      PUSH_SPACE(push, nr + 1);
+      BEGIN_NIC0(push, NVC0_3D(VB_ELEMENT_U16), nr);
+      for (i = 0; i < nr; ++i) {
+         PUSH_DATA(push, (map[1] << 16) | map[0]);
+         map += 2;
+      }
+      count -= nr * 2;
+   }
+}
+
+/* Push 32-bit indices inline, one per word, in maximum-size packets. */
+static void
+nvc0_draw_elements_inline_u32(struct nouveau_pushbuf *push, const uint32_t *map,
+                              unsigned start, unsigned count)
+{
+   map += start;
+
+   while (count) {
+      const unsigned nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN);
+
+      PUSH_SPACE(push, nr + 1);
+      BEGIN_NIC0(push, NVC0_3D(VB_ELEMENT_U32), nr);
+      PUSH_DATAp(push, map, nr);
+
+      map += nr;
+      count -= nr;
+   }
+}
+
+/* Push 32-bit indices whose values all fit in 16 bits: an odd leading
+ * index goes out via VB_ELEMENT_U32, then pairs are packed into dwords
+ * via VB_ELEMENT_U16 (halving the pushbuf traffic vs. u32). */
+static void
+nvc0_draw_elements_inline_u32_short(struct nouveau_pushbuf *push,
+                                    const uint32_t *map,
+                                    unsigned start, unsigned count)
+{
+   map += start;
+
+   if (count & 1) {
+      count--;
+      /* BUGFIX: reserve 2 words (method header + 1 data word), not 1 --
+       * matches the equivalent prologue in the u16 variant */
+      PUSH_SPACE(push, 2);
+      BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+      PUSH_DATA (push, *map++);
+   }
+   while (count) {
+      /* nr = number of packed dwords this packet (2 indices each) */
+      unsigned i, nr = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN * 2) / 2;
+
+      PUSH_SPACE(push, nr + 1);
+      BEGIN_NIC0(push, NVC0_3D(VB_ELEMENT_U16), nr);
+      for (i = 0; i < nr; ++i) {
+         PUSH_DATA(push, (map[1] << 16) | map[0]);
+         map += 2;
+      }
+      count -= nr * 2;
+   }
+}
+
+/* Emit an indexed draw. With a GPU index buffer, use INDEX_BATCH; with a
+ * user index buffer, push the indices inline (optionally packing 32-bit
+ * indices as 16-bit when 'shorten' says all values fit). */
+static void
+nvc0_draw_elements(struct nvc0_context *nvc0, boolean shorten,
+                   unsigned mode, unsigned start, unsigned count,
+                   unsigned instance_count, int32_t index_bias)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   unsigned prim;
+   const unsigned index_size = nvc0->idxbuf.index_size;
+
+   prim = nvc0_prim_gl(mode);
+
+   if (index_bias != nvc0->state.index_bias) {
+      PUSH_SPACE(push, 2);
+      BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 1);
+      PUSH_DATA (push, index_bias);
+      nvc0->state.index_bias = index_bias;
+   }
+
+   if (nvc0->idxbuf.buffer) {
+      /* hardware index fetch: per instance, restart the primitive with
+       * INSTANCE_NEXT by pairing END(0)/BEGIN between batches */
+      PUSH_SPACE(push, 1);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), prim);
+      do {
+         PUSH_SPACE(push, 7);
+         BEGIN_NVC0(push, NVC0_3D(INDEX_BATCH_FIRST), 2);
+         PUSH_DATA (push, start);
+         PUSH_DATA (push, count);
+         if (--instance_count) {
+            BEGIN_NVC0(push, NVC0_3D(VERTEX_END_GL), 2);
+            PUSH_DATA (push, 0);
+            PUSH_DATA (push, prim | NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT);
+         }
+      } while (instance_count);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
+   } else {
+      /* user index buffer: push indices inline per instance */
+      const void *data = nvc0->idxbuf.user_buffer;
+
+      while (instance_count--) {
+         PUSH_SPACE(push, 2);
+         BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
+         PUSH_DATA (push, prim);
+         switch (index_size) {
+         case 1:
+            nvc0_draw_elements_inline_u08(push, data, start, count);
+            break;
+         case 2:
+            nvc0_draw_elements_inline_u16(push, data, start, count);
+            break;
+         case 4:
+            if (shorten)
+               nvc0_draw_elements_inline_u32_short(push, data, start, count);
+            else
+               nvc0_draw_elements_inline_u32(push, data, start, count);
+            break;
+         default:
+            assert(0);
+            return;
+         }
+         PUSH_SPACE(push, 1);
+         IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
+
+         prim |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+      }
+   }
+   NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_indexed, 1);
+}
+
+/* Draw using the vertex count captured by a stream-output (transform
+ * feedback) target; the count is read from the target's query buffer on
+ * the GPU via DRAW_TFB_BYTES. */
+static void
+nvc0_draw_stream_output(struct nvc0_context *nvc0,
+                        const struct pipe_draw_info *info)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_so_target *so = nvc0_so_target(info->count_from_stream_output);
+   struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+   unsigned mode = nvc0_prim_gl(info->mode);
+   unsigned num_instances = info->instance_count;
+
+   /* serialize against the pass that wrote the buffer before reading it */
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      PUSH_SPACE(push, 2);
+      IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
+      nvc0_query_fifo_wait(push, so->pq);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
+
+      NOUVEAU_DRV_STAT(&nvc0->screen->base, gpu_serialize_count, 1);
+   }
+
+   while (num_instances--) {
+      PUSH_SPACE(push, 8);
+      BEGIN_NVC0(push, NVC0_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, mode);
+      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BASE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_STRIDE), 1);
+      PUSH_DATA (push, so->stride);
+      /* byte count comes from the stream-output query result */
+      BEGIN_NVC0(push, NVC0_3D(DRAW_TFB_BYTES), 1);
+      nvc0_query_pushbuf_submit(push, so->pq, 0x4);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_END_GL), 0);
+
+      mode |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+   }
+}
+
+/* pipe_context::draw_vbo: top-level draw entry point.
+ *
+ * Records index/instance bounds, decides between hardware vertex fetch and
+ * the CPU push path, validates state, then dispatches to the appropriate
+ * draw emitter (arrays / elements / stream-output). */
+void
+nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
+   nvc0->vb_elt_first = info->min_index + info->index_bias;
+   nvc0->vb_elt_limit = info->max_index - info->min_index;
+   nvc0->instance_off = info->start_instance;
+   nvc0->instance_max = info->instance_count - 1;
+
+   /* For picking only a few vertices from a large user buffer, push is better,
+    * if index count is larger and we expect repeated vertices, suggest upload.
+    */
+   nvc0->vbo_push_hint =
+      info->indexed && (nvc0->vb_elt_limit >= (info->count * 2));
+
+   /* Check whether we want to switch vertex-submission mode. */
+   if (nvc0->vbo_user && !(nvc0->dirty & (NVC0_NEW_ARRAYS | NVC0_NEW_VERTEX))) {
+      if (nvc0->vbo_push_hint != !!nvc0->state.vbo_mode)
+         if (nvc0->state.vbo_mode != 3) /* mode 3 is forced, don't flip */
+            nvc0->dirty |= NVC0_NEW_ARRAYS;
+
+      /* staying in hardware-fetch mode: just refresh user-buffer uploads */
+      if (!(nvc0->dirty & NVC0_NEW_ARRAYS) && nvc0->state.vbo_mode == 0) {
+         if (nvc0->vertex->shared_slots)
+            nvc0_update_user_vbufs_shared(nvc0);
+         else
+            nvc0_update_user_vbufs(nvc0);
+      }
+   }
+
+   /* 8 as minimum to avoid immediate double validation of new buffers */
+   nvc0_state_validate(nvc0, ~0, 8);
+
+   push->kick_notify = nvc0_draw_vbo_kick_notify;
+
+   if (nvc0->state.vbo_mode) {
+      /* CPU push path handles the whole draw */
+      nvc0_push_vbo(nvc0, info);
+      push->kick_notify = nvc0_default_kick_notify;
+      return;
+   }
+
+   /* space for base instance, flush, and prim restart */
+   PUSH_SPACE(push, 8);
+
+   if (nvc0->state.instance_base != info->start_instance) {
+      nvc0->state.instance_base = info->start_instance;
+      /* NOTE: this does not affect the shader input, should it ? */
+      BEGIN_NVC0(push, NVC0_3D(VB_INSTANCE_BASE), 1);
+      PUSH_DATA (push, info->start_instance);
+   }
+
+   if (nvc0->base.vbo_dirty) {
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
+      nvc0->base.vbo_dirty = FALSE;
+   }
+
+   if (info->indexed) {
+      /* 32-bit indices can be packed as 16-bit if all values fit */
+      boolean shorten = info->max_index <= 65535;
+
+      if (info->primitive_restart != nvc0->state.prim_restart) {
+         if (info->primitive_restart) {
+            BEGIN_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 2);
+            PUSH_DATA (push, 1);
+            PUSH_DATA (push, info->restart_index);
+
+            /* restart index wouldn't survive 16-bit packing */
+            if (info->restart_index > 65535)
+               shorten = FALSE;
+         } else {
+            IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+         }
+         nvc0->state.prim_restart = info->primitive_restart;
+      } else
+      if (info->primitive_restart) {
+         BEGIN_NVC0(push, NVC0_3D(PRIM_RESTART_INDEX), 1);
+         PUSH_DATA (push, info->restart_index);
+
+         if (info->restart_index > 65535)
+            shorten = FALSE;
+      }
+
+      nvc0_draw_elements(nvc0, shorten,
+                         info->mode, info->start, info->count,
+                         info->instance_count, info->index_bias);
+   } else
+   if (unlikely(info->count_from_stream_output)) {
+      nvc0_draw_stream_output(nvc0, info);
+   } else {
+      nvc0_draw_arrays(nvc0,
+                       info->mode, info->start, info->count,
+                       info->instance_count);
+   }
+   push->kick_notify = nvc0_default_kick_notify;
+
+   nvc0_release_user_vbufs(nvc0);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
new file mode 100644
index 00000000000..51e751cfa57
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -0,0 +1,649 @@
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "translate/translate.h"
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_resource.h"
+
+#include "nvc0/nvc0_3d.xml.h"
+
+/* State for the CPU-side vertex push fallback: vertices are converted with
+ * the translate module into a scratch buffer and dispatched with inline
+ * VERTEX_BUFFER_FIRST / VB_ELEMENT_U32 commands. */
+struct push_context {
+ struct nouveau_pushbuf *push;
+
+ struct translate *translate; /* vertex format conversion engine */
+ void *dest; /* write cursor into the scratch vertex buffer */
+ const void *idxbuf; /* CPU-visible index data (mapped bo or user ptr) */
+
+ uint32_t vertex_size; /* size in bytes of one translated vertex */
+ uint32_t restart_index;
+ uint32_t instance_id;
+
+ boolean prim_restart; /* primitive restart requested for this draw */
+ boolean need_vertex_id; /* upload gl_VertexID values as an extra attrib */
+
+ /* Edge flags must be replayed on the CPU in this path: we scan the
+ * source attribute and emit an EDGEFLAG method whenever it toggles. */
+ struct {
+ boolean enabled;
+ boolean value; /* current hardware edge flag state */
+ unsigned stride;
+ const uint8_t *data; /* mapped edgeflag attribute stream */
+ } edgeflag;
+};
+
+static void nvc0_push_upload_vertex_ids(struct push_context *,
+ struct nvc0_context *,
+ const struct pipe_draw_info *);
+
+/* Initialize a push_context from the current nvc0 state.
+ * Edge flag and vertex-id handling are only enabled when the bound vertex
+ * program actually consumes them (and, for vertex ids, when an attribute
+ * slot is still free). */
+static void
+nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
+{
+ ctx->push = nvc0->base.pushbuf;
+
+ ctx->translate = nvc0->vertex->translate;
+ ctx->vertex_size = nvc0->vertex->size;
+
+ ctx->need_vertex_id =
+ nvc0->vertprog->vp.need_vertex_id && (nvc0->vertex->num_elements < 32);
+
+ ctx->edgeflag.value = TRUE;
+ ctx->edgeflag.enabled = nvc0->vertprog->vp.edgeflag < PIPE_MAX_ATTRIBS;
+
+ /* silence warnings */
+ ctx->edgeflag.data = NULL;
+ ctx->edgeflag.stride = 0;
+}
+
+/* Point the translate object at CPU-visible copies of all bound vertex
+ * buffers. User buffers are used directly; bo-backed buffers are mapped.
+ * The index bias is applied up front by offsetting the base pointer of
+ * per-vertex (i.e. not per-instance) buffers. */
+static INLINE void
+nvc0_vertex_configure_translate(struct nvc0_context *nvc0, int32_t index_bias)
+{
+ struct translate *translate = nvc0->vertex->translate;
+ unsigned i;
+
+ for (i = 0; i < nvc0->num_vtxbufs; ++i) {
+ const uint8_t *map;
+ const struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[i];
+
+ if (likely(!vb->buffer))
+ map = (const uint8_t *)vb->user_buffer;
+ else
+ map = nouveau_resource_map_offset(&nvc0->base,
+ nv04_resource(vb->buffer), vb->buffer_offset, NOUVEAU_BO_RD);
+
+ /* instance buffers are advanced by instance id, not by index */
+ if (index_bias && !unlikely(nvc0->vertex->instance_bufs & (1 << i)))
+ map += (intptr_t)index_bias * vb->stride;
+
+ translate->set_buffer(translate, i, map, vb->stride, ~0);
+ }
+}
+
+/* Make the index buffer readable by the CPU: map the bo at the bound
+ * offset, or use the user pointer directly. */
+static INLINE void
+nvc0_push_map_idxbuf(struct push_context *ctx, struct nvc0_context *nvc0)
+{
+ if (nvc0->idxbuf.buffer) {
+ struct nv04_resource *buf = nv04_resource(nvc0->idxbuf.buffer);
+ ctx->idxbuf = nouveau_resource_map_offset(&nvc0->base,
+ buf, nvc0->idxbuf.offset, NOUVEAU_BO_RD);
+ } else {
+ ctx->idxbuf = nvc0->idxbuf.user_buffer;
+ }
+}
+
+/* Map the vertex buffer region that holds the edge flag attribute so the
+ * draw loops can read per-vertex edge flags on the CPU. Index bias is
+ * folded into the base pointer, mirroring the vertex buffer setup.
+ * NOTE(review): assumes the edgeflag attribute is bo-backed (vb->buffer
+ * non-NULL); a user buffer here would pass NULL to nv04_resource(). */
+static INLINE void
+nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
+ int32_t index_bias)
+{
+ unsigned attr = nvc0->vertprog->vp.edgeflag;
+ struct pipe_vertex_element *ve = &nvc0->vertex->element[attr].pipe;
+ struct pipe_vertex_buffer *vb = &nvc0->vtxbuf[ve->vertex_buffer_index];
+ struct nv04_resource *buf = nv04_resource(vb->buffer);
+ unsigned offset = vb->buffer_offset + ve->src_offset;
+
+ ctx->edgeflag.stride = vb->stride;
+ ctx->edgeflag.data = nouveau_resource_map_offset(&nvc0->base,
+ buf, offset, NOUVEAU_BO_RD);
+ if (index_bias)
+ ctx->edgeflag.data += (intptr_t)index_bias * vb->stride;
+}
+
+/* Return the number of indices before the first occurrence of the restart
+ * index (or 'push' if it does not occur) — 8-bit index variant. */
+static INLINE unsigned
+prim_restart_search_i08(const uint8_t *elts, unsigned push, uint8_t index)
+{
+ unsigned i;
+ for (i = 0; i < push && elts[i] != index; ++i);
+ return i;
+}
+
+/* As above, for 16-bit indices. */
+static INLINE unsigned
+prim_restart_search_i16(const uint16_t *elts, unsigned push, uint16_t index)
+{
+ unsigned i;
+ for (i = 0; i < push && elts[i] != index; ++i);
+ return i;
+}
+
+/* As above, for 32-bit indices. */
+static INLINE unsigned
+prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
+{
+ unsigned i;
+ for (i = 0; i < push && elts[i] != index; ++i);
+ return i;
+}
+
+/* Read the edge flag of the given vertex: the attribute is stored as a
+ * float, any non-zero value counts as TRUE. */
+static INLINE boolean
+ef_value(const struct push_context *ctx, uint32_t index)
+{
+ float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
+ return *pf ? TRUE : FALSE;
+}
+
+/* Flip the cached hardware edge flag state and return the new value. */
+static INLINE boolean
+ef_toggle(struct push_context *ctx)
+{
+ ctx->edgeflag.value = !ctx->edgeflag.value;
+ return ctx->edgeflag.value;
+}
+
+/* Count how many leading indices share the current edge flag value, i.e.
+ * how many vertices can be drawn before EDGEFLAG must be toggled —
+ * 8-bit index variant. */
+static INLINE unsigned
+ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+ return i;
+}
+
+/* As above, for 16-bit indices. */
+static INLINE unsigned
+ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+ return i;
+}
+
+/* As above, for 32-bit indices. */
+static INLINE unsigned
+ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+ return i;
+}
+
+/* As above, for a sequential (non-indexed) vertex range starting at
+ * 'start'. */
+static INLINE unsigned
+ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n && ef_value(ctx, start++) == ctx->edgeflag.value; ++i);
+ return i;
+}
+
+/* Allocate scratch memory for 'count' translated vertices and bind it as
+ * vertex array 0 (START/LIMIT). The scratch bo is referenced in the 3D
+ * buffer context and the pushbuf revalidated. Returns the CPU pointer to
+ * write vertices into, or NULL on scratch allocation failure. */
+static INLINE void *
+nvc0_push_setup_vertex_array(struct nvc0_context *nvc0, const unsigned count)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nouveau_bo *bo;
+ uint64_t va;
+ const unsigned size = count * nvc0->vertex->size;
+
+ void *const dest = nouveau_scratch_get(&nvc0->base, size, &va, &bo);
+
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_START_HIGH(0)), 2);
+ PUSH_DATAh(push, va);
+ PUSH_DATA (push, va);
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(0)), 2);
+ PUSH_DATAh(push, va + size - 1);
+ PUSH_DATA (push, va + size - 1);
+
+ BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
+ bo);
+ nouveau_pushbuf_validate(push);
+
+ return dest;
+}
+
+/* Translate and dispatch 'count' vertices addressed by 8-bit indices.
+ * Outer loop: split the index stream at primitive restart indices and run
+ * the translate engine on each run (restart indices themselves are not
+ * translated). Inner loop: split each run further where the edge flag
+ * toggles, emitting VERTEX_BUFFER_FIRST for multi-vertex spans,
+ * VB_ELEMENT_U32 for single vertices, and EDGEFLAG on each toggle.
+ * 'pos' tracks the position in the scratch vertex array. */
+static void
+disp_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
+{
+ struct nouveau_pushbuf *push = ctx->push;
+ struct translate *translate = ctx->translate;
+ const uint8_t *restrict elts = (uint8_t *)ctx->idxbuf + start;
+ unsigned pos = 0;
+
+ do {
+ unsigned nR = count;
+
+ if (unlikely(ctx->prim_restart))
+ nR = prim_restart_search_i08(elts, nR, ctx->restart_index);
+
+ translate->run_elts8(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
+ count -= nR;
+ ctx->dest += nR * ctx->vertex_size;
+
+ while (nR) {
+ unsigned nE = nR;
+
+ if (unlikely(ctx->edgeflag.enabled))
+ nE = ef_toggle_search_i08(ctx, elts, nR);
+
+ PUSH_SPACE(push, 4);
+ if (likely(nE >= 2)) {
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
+ PUSH_DATA (push, pos);
+ PUSH_DATA (push, nE);
+ } else
+ if (nE) {
+ /* single vertex: immediate form saves a data word if pos fits */
+ if (pos <= 0xff) {
+ IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_U32), pos);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, pos);
+ }
+ }
+ if (unlikely(nE != nR))
+ IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
+
+ pos += nE;
+ elts += nE;
+ nR -= nE;
+ }
+ if (count) {
+ /* we stopped at a restart index: emit it and skip over it */
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, ctx->restart_index);
+ ++elts;
+ ctx->dest += ctx->vertex_size;
+ ++pos;
+ --count;
+ }
+ } while (count);
+}
+
+/* Same as disp_vertices_i08, for 16-bit indices. */
+static void
+disp_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
+{
+ struct nouveau_pushbuf *push = ctx->push;
+ struct translate *translate = ctx->translate;
+ const uint16_t *restrict elts = (uint16_t *)ctx->idxbuf + start;
+ unsigned pos = 0;
+
+ do {
+ unsigned nR = count;
+
+ if (unlikely(ctx->prim_restart))
+ nR = prim_restart_search_i16(elts, nR, ctx->restart_index);
+
+ translate->run_elts16(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
+ count -= nR;
+ ctx->dest += nR * ctx->vertex_size;
+
+ while (nR) {
+ unsigned nE = nR;
+
+ if (unlikely(ctx->edgeflag.enabled))
+ nE = ef_toggle_search_i16(ctx, elts, nR);
+
+ PUSH_SPACE(push, 4);
+ if (likely(nE >= 2)) {
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
+ PUSH_DATA (push, pos);
+ PUSH_DATA (push, nE);
+ } else
+ if (nE) {
+ /* single vertex: immediate form saves a data word if pos fits */
+ if (pos <= 0xff) {
+ IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_U32), pos);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, pos);
+ }
+ }
+ if (unlikely(nE != nR))
+ IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
+
+ pos += nE;
+ elts += nE;
+ nR -= nE;
+ }
+ if (count) {
+ /* we stopped at a restart index: emit it and skip over it */
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, ctx->restart_index);
+ ++elts;
+ ctx->dest += ctx->vertex_size;
+ ++pos;
+ --count;
+ }
+ } while (count);
+}
+
+/* Same as disp_vertices_i08, for 32-bit indices. */
+static void
+disp_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
+{
+ struct nouveau_pushbuf *push = ctx->push;
+ struct translate *translate = ctx->translate;
+ const uint32_t *restrict elts = (uint32_t *)ctx->idxbuf + start;
+ unsigned pos = 0;
+
+ do {
+ unsigned nR = count;
+
+ if (unlikely(ctx->prim_restart))
+ nR = prim_restart_search_i32(elts, nR, ctx->restart_index);
+
+ translate->run_elts(translate, elts, nR, 0, ctx->instance_id, ctx->dest);
+ count -= nR;
+ ctx->dest += nR * ctx->vertex_size;
+
+ while (nR) {
+ unsigned nE = nR;
+
+ if (unlikely(ctx->edgeflag.enabled))
+ nE = ef_toggle_search_i32(ctx, elts, nR);
+
+ PUSH_SPACE(push, 4);
+ if (likely(nE >= 2)) {
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
+ PUSH_DATA (push, pos);
+ PUSH_DATA (push, nE);
+ } else
+ if (nE) {
+ /* single vertex: immediate form saves a data word if pos fits */
+ if (pos <= 0xff) {
+ IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_U32), pos);
+ } else {
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, pos);
+ }
+ }
+ if (unlikely(nE != nR))
+ IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
+
+ pos += nE;
+ elts += nE;
+ nR -= nE;
+ }
+ if (count) {
+ /* we stopped at a restart index: emit it and skip over it */
+ BEGIN_NVC0(push, NVC0_3D(VB_ELEMENT_U32), 1);
+ PUSH_DATA (push, ctx->restart_index);
+ ++elts;
+ ctx->dest += ctx->vertex_size;
+ ++pos;
+ --count;
+ }
+ } while (count);
+}
+
+/* Translate and dispatch a sequential (non-indexed) vertex range.
+ * All vertices are translated in one go; the emit loop only has to split
+ * the range where the edge flag toggles. No primitive restart here. */
+static void
+disp_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
+{
+ struct nouveau_pushbuf *push = ctx->push;
+ struct translate *translate = ctx->translate;
+ unsigned pos = 0;
+
+ translate->run(translate, start, count, 0, ctx->instance_id, ctx->dest);
+ do {
+ unsigned nr = count;
+
+ if (unlikely(ctx->edgeflag.enabled))
+ nr = ef_toggle_search_seq(ctx, start + pos, nr);
+
+ PUSH_SPACE(push, 4);
+ if (likely(nr)) {
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_BUFFER_FIRST), 2);
+ PUSH_DATA (push, pos);
+ PUSH_DATA (push, nr);
+ }
+ if (unlikely(nr != count))
+ IMMED_NVC0(push, NVC0_3D(EDGEFLAG), ef_toggle(ctx));
+
+ pos += nr;
+ count -= nr;
+ } while (count);
+}
+
+
+/* Map a PIPE_PRIM_* enum to the hardware VERTEX_BEGIN_GL primitive value.
+ * Unknown primitives (e.g. PATCHES, still commented out) fall back to
+ * POINTS. */
+#define NVC0_PRIM_GL_CASE(n) \
+ case PIPE_PRIM_##n: return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_##n
+
+static INLINE unsigned
+nvc0_prim_gl(unsigned prim)
+{
+ switch (prim) {
+ NVC0_PRIM_GL_CASE(POINTS);
+ NVC0_PRIM_GL_CASE(LINES);
+ NVC0_PRIM_GL_CASE(LINE_LOOP);
+ NVC0_PRIM_GL_CASE(LINE_STRIP);
+ NVC0_PRIM_GL_CASE(TRIANGLES);
+ NVC0_PRIM_GL_CASE(TRIANGLE_STRIP);
+ NVC0_PRIM_GL_CASE(TRIANGLE_FAN);
+ NVC0_PRIM_GL_CASE(QUADS);
+ NVC0_PRIM_GL_CASE(QUAD_STRIP);
+ NVC0_PRIM_GL_CASE(POLYGON);
+ NVC0_PRIM_GL_CASE(LINES_ADJACENCY);
+ NVC0_PRIM_GL_CASE(LINE_STRIP_ADJACENCY);
+ NVC0_PRIM_GL_CASE(TRIANGLES_ADJACENCY);
+ NVC0_PRIM_GL_CASE(TRIANGLE_STRIP_ADJACENCY);
+ /*
+ NVC0_PRIM_GL_CASE(PATCHES); */
+ default:
+ return NVC0_3D_VERTEX_BEGIN_GL_PRIMITIVE_POINTS;
+ }
+}
+
+/* CPU vertex-push fallback draw path: convert vertices with the translate
+ * module into scratch memory and submit them with inline commands.
+ * Instancing is handled by replaying the whole draw per instance with the
+ * INSTANCE_NEXT bit set on subsequent VERTEX_BEGIN_GL. */
+void
+nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
+{
+ struct push_context ctx;
+ unsigned i, index_size;
+ unsigned inst_count = info->instance_count;
+ unsigned vert_count = info->count;
+ unsigned prim;
+
+ nvc0_push_context_init(nvc0, &ctx);
+
+ nvc0_vertex_configure_translate(nvc0, info->index_bias);
+
+ if (unlikely(ctx.edgeflag.enabled))
+ nvc0_push_map_edgeflag(&ctx, nvc0, info->index_bias);
+
+ ctx.prim_restart = info->primitive_restart;
+ ctx.restart_index = info->restart_index;
+
+ if (info->indexed) {
+ nvc0_push_map_idxbuf(&ctx, nvc0);
+ index_size = nvc0->idxbuf.index_size;
+
+ /* program hardware primitive restart to match the draw info */
+ if (info->primitive_restart) {
+ BEGIN_NVC0(ctx.push, NVC0_3D(PRIM_RESTART_ENABLE), 2);
+ PUSH_DATA (ctx.push, 1);
+ PUSH_DATA (ctx.push, info->restart_index);
+ } else
+ if (nvc0->state.prim_restart) {
+ IMMED_NVC0(ctx.push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+ }
+ nvc0->state.prim_restart = info->primitive_restart;
+ } else {
+ /* transform feedback draw: derive the vertex count from the
+ * stream output target's query result */
+ if (unlikely(info->count_from_stream_output)) {
+ struct pipe_context *pipe = &nvc0->base.pipe;
+ struct nvc0_so_target *targ;
+ targ = nvc0_so_target(info->count_from_stream_output);
+ pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+ vert_count /= targ->stride;
+ }
+ ctx.idxbuf = NULL; /* shut up warnings */
+ index_size = 0;
+ }
+
+ ctx.instance_id = info->start_instance;
+
+ prim = nvc0_prim_gl(info->mode);
+ do {
+ PUSH_SPACE(ctx.push, 9);
+
+ /* fresh scratch vertex array for each instance */
+ ctx.dest = nvc0_push_setup_vertex_array(nvc0, vert_count);
+ if (unlikely(!ctx.dest))
+ break;
+
+ if (unlikely(ctx.need_vertex_id))
+ nvc0_push_upload_vertex_ids(&ctx, nvc0, info);
+
+ IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_ARRAY_FLUSH), 0);
+ BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_BEGIN_GL), 1);
+ PUSH_DATA (ctx.push, prim);
+ switch (index_size) {
+ case 1:
+ disp_vertices_i08(&ctx, info->start, vert_count);
+ break;
+ case 2:
+ disp_vertices_i16(&ctx, info->start, vert_count);
+ break;
+ case 4:
+ disp_vertices_i32(&ctx, info->start, vert_count);
+ break;
+ default:
+ assert(index_size == 0);
+ disp_vertices_seq(&ctx, info->start, vert_count);
+ break;
+ }
+ PUSH_SPACE(ctx.push, 1);
+ IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_END_GL), 0);
+
+ if (--inst_count) {
+ prim |= NVC0_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
+ ++ctx.instance_id;
+ }
+ nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
+ nouveau_scratch_done(&nvc0->base);
+ } while (inst_count);
+
+
+ /* reset state and unmap buffers (no-op) */
+
+ if (unlikely(!ctx.edgeflag.value)) {
+ PUSH_SPACE(ctx.push, 1);
+ IMMED_NVC0(ctx.push, NVC0_3D(EDGEFLAG), 1);
+ }
+
+ /* undo the vertex-id attribute override set up during the draw */
+ if (unlikely(ctx.need_vertex_id)) {
+ PUSH_SPACE(ctx.push, 4);
+ IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_ID_REPLACE), 0);
+ BEGIN_NVC0(ctx.push, NVC0_3D(VERTEX_ATTRIB_FORMAT(1)), 1);
+ PUSH_DATA (ctx.push,
+ NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST |
+ NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_FLOAT |
+ NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32);
+ IMMED_NVC0(ctx.push, NVC0_3D(VERTEX_ARRAY_FETCH(1)), 0);
+ }
+
+ if (info->indexed)
+ nouveau_resource_unmap(nv04_resource(nvc0->idxbuf.buffer));
+ for (i = 0; i < nvc0->num_vtxbufs; ++i)
+ nouveau_resource_unmap(nv04_resource(nvc0->vtxbuf[i].buffer));
+
+ NOUVEAU_DRV_STAT(&nvc0->screen->base, draw_calls_fallback_count, 1);
+}
+
+/* Widen 8-bit indices to 32 bits while applying the index bias. */
+static INLINE void
+copy_indices_u8(uint32_t *dst, const uint8_t *elts, uint32_t bias, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n; ++i)
+ dst[i] = elts[i] + bias;
+}
+
+/* Widen 16-bit indices to 32 bits while applying the index bias. */
+static INLINE void
+copy_indices_u16(uint32_t *dst, const uint16_t *elts, uint32_t bias, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n; ++i)
+ dst[i] = elts[i] + bias;
+}
+
+/* Copy 32-bit indices while applying the index bias. */
+static INLINE void
+copy_indices_u32(uint32_t *dst, const uint32_t *elts, uint32_t bias, unsigned n)
+{
+ unsigned i;
+ for (i = 0; i < n; ++i)
+ dst[i] = elts[i] + bias;
+}
+
+/* Upload per-vertex gl_VertexID values into scratch memory and bind them
+ * as an extra vertex attribute (slot 'a' = num_elements) fetched through
+ * vertex array 1, then enable VERTEX_ID_REPLACE so the shader reads the
+ * id from that attribute. The caller resets this state after the draw. */
+static void
+nvc0_push_upload_vertex_ids(struct push_context *ctx,
+ struct nvc0_context *nvc0,
+ const struct pipe_draw_info *info)
+
+{
+ struct nouveau_pushbuf *push = ctx->push;
+ struct nouveau_bo *bo;
+ uint64_t va;
+ uint32_t *data;
+ uint32_t format;
+ unsigned index_size = nvc0->idxbuf.index_size;
+ unsigned i;
+ unsigned a = nvc0->vertex->num_elements;
+
+ /* non-indexed draws and biased indices need full 32-bit ids */
+ if (!index_size || info->index_bias)
+ index_size = 4;
+ data = (uint32_t *)nouveau_scratch_get(&nvc0->base,
+ info->count * index_size, &va, &bo);
+
+ BCTX_REFN_bo(nvc0->bufctx_3d, VTX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
+ bo);
+ nouveau_pushbuf_validate(push);
+
+ if (info->indexed) {
+ if (!info->index_bias) {
+ /* ids equal the indices: reuse them directly */
+ memcpy(data, ctx->idxbuf, info->count * index_size);
+ } else {
+ switch (nvc0->idxbuf.index_size) {
+ case 1:
+ copy_indices_u8(data, ctx->idxbuf, info->index_bias, info->count);
+ break;
+ case 2:
+ copy_indices_u16(data, ctx->idxbuf, info->index_bias, info->count);
+ break;
+ default:
+ copy_indices_u32(data, ctx->idxbuf, info->index_bias, info->count);
+ break;
+ }
+ }
+ } else {
+ for (i = 0; i < info->count; ++i)
+ data[i] = i + (info->start + info->index_bias);
+ }
+
+ format = (1 << NVC0_3D_VERTEX_ATTRIB_FORMAT_BUFFER__SHIFT) |
+ NVC0_3D_VERTEX_ATTRIB_FORMAT_TYPE_UINT;
+
+ switch (index_size) {
+ case 1:
+ format |= NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_8;
+ break;
+ case 2:
+ format |= NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_16;
+ break;
+ default:
+ format |= NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32;
+ break;
+ }
+
+ PUSH_SPACE(push, 12);
+
+ /* vertex ids are per-vertex, make sure array 1 is not per-instance */
+ if (unlikely(nvc0->state.instance_elts & 2)) {
+ nvc0->state.instance_elts &= ~2;
+ IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_PER_INSTANCE(1)), 0);
+ }
+
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ATTRIB_FORMAT(a)), 1);
+ PUSH_DATA (push, format);
+
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(1)), 3);
+ PUSH_DATA (push, NVC0_3D_VERTEX_ARRAY_FETCH_ENABLE | index_size);
+ PUSH_DATAh(push, va);
+ PUSH_DATA (push, va);
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ARRAY_LIMIT_HIGH(1)), 2);
+ PUSH_DATAh(push, va + info->count * index_size - 1);
+ PUSH_DATA (push, va + info->count * index_size - 1);
+
+#define NVC0_3D_VERTEX_ID_REPLACE_SOURCE_ATTR_X(a) \
+ (((0x80 + (a) * 0x10) / 4) << NVC0_3D_VERTEX_ID_REPLACE_SOURCE__SHIFT)
+
+ BEGIN_NVC0(push, NVC0_3D(VERTEX_ID_REPLACE), 1);
+ PUSH_DATA (push, NVC0_3D_VERTEX_ID_REPLACE_SOURCE_ATTR_X(a) | 1);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
new file mode 100644
index 00000000000..5871f590e0e
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_video.h"
+
+#include "util/u_sampler.h"
+#include "util/u_format.h"
+
+/* Decode one frame's bitstream: run the three hardware stages in order —
+ * BSP (bitstream parsing), VP (video processor) and PPP (post-processing)
+ * — all tagged with the same fence sequence number. */
+static void
+nvc0_decoder_decode_bitstream(struct pipe_video_codec *decoder,
+ struct pipe_video_buffer *video_target,
+ struct pipe_picture_desc *picture,
+ unsigned num_buffers,
+ const void *const *data,
+ const unsigned *num_bytes)
+{
+ struct nouveau_vp3_decoder *dec = (struct nouveau_vp3_decoder *)decoder;
+ struct nouveau_vp3_video_buffer *target = (struct nouveau_vp3_video_buffer *)video_target;
+ uint32_t comm_seq = ++dec->fence_seq;
+ union pipe_desc desc;
+
+ unsigned vp_caps, is_ref, ret;
+ struct nouveau_vp3_video_buffer *refs[16] = {};
+
+ desc.base = picture;
+
+ assert(target->base.buffer_format == PIPE_FORMAT_NV12);
+
+ ret = nvc0_decoder_bsp(dec, desc, target, comm_seq,
+ num_buffers, data, num_bytes,
+ &vp_caps, &is_ref, refs);
+
+ /* did we decode bitstream correctly? */
+ assert(ret == 2);
+
+ nvc0_decoder_vp(dec, desc, target, comm_seq, vp_caps, is_ref, refs);
+ nvc0_decoder_ppp(dec, desc, target, comm_seq);
+}
+
+/* Create a VP3+ hardware video decoder.
+ * Sets up the BSP/VP/PPP channels and engine objects (one shared channel
+ * on Fermi, one per engine on Kepler), allocates the bitstream, inter,
+ * bitplane and reference buffers, loads firmware on pre-0xd0 chips, and
+ * optionally (NOUVEAU_VP3_DEBUG_FENCE) verifies the fence mechanism.
+ * Returns NULL on any failure (error paths destroy the partial decoder). */
+struct pipe_video_codec *
+nvc0_create_decoder(struct pipe_context *context,
+ const struct pipe_video_codec *templ)
+{
+ struct nouveau_screen *screen = &((struct nvc0_context *)context)->screen->base;
+ struct nouveau_vp3_decoder *dec;
+ struct nouveau_pushbuf **push;
+ union nouveau_bo_config cfg;
+ bool kepler = screen->device->chipset >= 0xe0;
+
+ cfg.nvc0.tile_mode = 0x10;
+ cfg.nvc0.memtype = 0xfe;
+
+ int ret, i;
+ uint32_t codec = 1, ppp_codec = 3;
+ uint32_t timeout;
+ u32 tmp_size = 0;
+
+ /* environment override: fall back to the shader-based vl decoder */
+ if (getenv("XVMC_VL"))
+ return vl_create_decoder(context, templ);
+
+ if (templ->entrypoint != PIPE_VIDEO_ENTRYPOINT_BITSTREAM) {
+ debug_printf("%x\n", templ->entrypoint);
+ return NULL;
+ }
+
+ dec = CALLOC_STRUCT(nouveau_vp3_decoder);
+ if (!dec)
+ return NULL;
+ dec->client = screen->client;
+ dec->base = *templ;
+ nouveau_vp3_decoder_init_common(&dec->base);
+
+ /* falcon engine indices differ between Fermi and Kepler */
+ if (!kepler) {
+ dec->bsp_idx = 5;
+ dec->vp_idx = 6;
+ dec->ppp_idx = 7;
+ } else {
+ dec->bsp_idx = 2;
+ dec->vp_idx = 2;
+ dec->ppp_idx = 2;
+ }
+
+ /* Fermi: one channel shared by all three engines; Kepler: one channel
+ * per engine, selected via the nve0 fifo 'engine' argument. */
+ for (i = 0; i < 3; ++i)
+ if (i && !kepler) {
+ dec->channel[i] = dec->channel[0];
+ dec->pushbuf[i] = dec->pushbuf[0];
+ } else {
+ void *data;
+ u32 size;
+ struct nvc0_fifo nvc0_args = {};
+ struct nve0_fifo nve0_args = {};
+
+ if (!kepler) {
+ size = sizeof(nvc0_args);
+ data = &nvc0_args;
+ } else {
+ unsigned engine[] = {
+ NVE0_FIFO_ENGINE_BSP,
+ NVE0_FIFO_ENGINE_VP,
+ NVE0_FIFO_ENGINE_PPP
+ };
+
+ nve0_args.engine = engine[i];
+ size = sizeof(nve0_args);
+ data = &nve0_args;
+ }
+
+ ret = nouveau_object_new(&screen->device->object, 0,
+ NOUVEAU_FIFO_CHANNEL_CLASS,
+ data, size, &dec->channel[i]);
+
+ if (!ret)
+ ret = nouveau_pushbuf_new(screen->client, dec->channel[i], 4,
+ 32 * 1024, true, &dec->pushbuf[i]);
+ if (ret)
+ break;
+ }
+ push = dec->pushbuf;
+
+ /* create the per-engine class objects */
+ if (!kepler) {
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x90b1, NULL, 0, &dec->bsp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x90b2, NULL, 0, &dec->vp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x90b3, NULL, 0, &dec->ppp);
+ } else {
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[0], 0x95b1, 0x95b1, NULL, 0, &dec->bsp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[1], 0x95b2, 0x95b2, NULL, 0, &dec->vp);
+ if (!ret)
+ ret = nouveau_object_new(dec->channel[2], 0x90b3, 0x90b3, NULL, 0, &dec->ppp);
+ }
+ if (ret)
+ goto fail;
+
+ /* bind the objects to their subchannels */
+ BEGIN_NVC0(push[0], SUBC_BSP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[0], dec->bsp->handle);
+
+ BEGIN_NVC0(push[1], SUBC_VP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[1], dec->vp->handle);
+
+ BEGIN_NVC0(push[2], SUBC_PPP(NV01_SUBCHAN_OBJECT), 1);
+ PUSH_DATA (push[2], dec->ppp->handle);
+
+ dec->base.context = context;
+ dec->base.decode_bitstream = nvc0_decoder_decode_bitstream;
+
+ /* bitstream ring buffers (one per queue slot) and inter buffers */
+ for (i = 0; i < NOUVEAU_VP3_VIDEO_QDEPTH && !ret; ++i)
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0, 1 << 20, &cfg, &dec->bsp_bo[i]);
+ if (!ret)
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0x100, 4 << 20, &cfg, &dec->inter_bo[0]);
+ if (!ret) {
+ if (!kepler)
+ nouveau_bo_ref(dec->inter_bo[0], &dec->inter_bo[1]);
+ else
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM,
+ 0x100, dec->inter_bo[0]->size, &cfg,
+ &dec->inter_bo[1]);
+ }
+ if (ret)
+ goto fail;
+
+ /* codec ids and scratch sizes per profile */
+ switch (u_reduce_video_profile(templ->profile)) {
+ case PIPE_VIDEO_FORMAT_MPEG12: {
+ codec = 1;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_MPEG4: {
+ codec = 4;
+ tmp_size = mb(templ->height)*16 * mb(templ->width)*16;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_VC1: {
+ ppp_codec = codec = 2;
+ tmp_size = mb(templ->height)*16 * mb(templ->width)*16;
+ assert(templ->max_references <= 2);
+ break;
+ }
+ case PIPE_VIDEO_FORMAT_MPEG4_AVC: {
+ codec = 3;
+ dec->tmp_stride = 16 * mb_half(templ->width) * nouveau_vp3_video_align(templ->height) * 3 / 2;
+ tmp_size = dec->tmp_stride * (templ->max_references + 1);
+ assert(templ->max_references <= 16);
+ break;
+ }
+ default:
+ fprintf(stderr, "invalid codec\n");
+ goto fail;
+ }
+
+ /* pre-0xd0 chips need externally loaded falcon firmware */
+ if (screen->device->chipset < 0xd0) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ 0x4000, &cfg, &dec->fw_bo);
+ if (ret)
+ goto fail;
+
+ ret = nouveau_vp3_load_firmware(dec, templ->profile, screen->device->chipset);
+ if (ret)
+ goto fw_fail;
+ }
+
+ /* all codecs except h264 use a bitplane buffer */
+ if (codec != 3) {
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ 0x400, &cfg, &dec->bitplane_bo);
+ if (ret)
+ goto fail;
+ }
+
+ dec->ref_stride = mb(templ->width)*16 * (mb_half(templ->height)*32 + nouveau_vp3_video_align(templ->height)/2);
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_VRAM, 0,
+ dec->ref_stride * (templ->max_references+2) + tmp_size,
+ &cfg, &dec->ref_bo);
+ if (ret)
+ goto fail;
+
+ timeout = 0;
+
+ /* configure codec id and timeout on each engine */
+ BEGIN_NVC0(push[0], SUBC_BSP(0x200), 2);
+ PUSH_DATA (push[0], codec);
+ PUSH_DATA (push[0], timeout);
+
+ BEGIN_NVC0(push[1], SUBC_VP(0x200), 2);
+ PUSH_DATA (push[1], codec);
+ PUSH_DATA (push[1], timeout);
+
+ BEGIN_NVC0(push[2], SUBC_PPP(0x200), 2);
+ PUSH_DATA (push[2], ppp_codec);
+ PUSH_DATA (push[2], timeout);
+
+ ++dec->fence_seq;
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+ ret = nouveau_bo_new(screen->device, NOUVEAU_BO_GART|NOUVEAU_BO_MAP,
+ 0, 0x1000, NULL, &dec->fence_bo);
+ if (ret)
+ goto fail;
+
+ nouveau_bo_map(dec->fence_bo, NOUVEAU_BO_RDWR, screen->client);
+ dec->fence_map = dec->fence_bo->map;
+ dec->fence_map[0] = dec->fence_map[4] = dec->fence_map[8] = 0;
+ dec->comm = (struct comm *)(dec->fence_map + (COMM_OFFSET/sizeof(*dec->fence_map)));
+
+ /* So let's test whether the fence is working: have each engine write
+ * its sequence number and busy-wait until all three arrive. */
+ nouveau_pushbuf_space(push[0], 6, 1, 0);
+ PUSH_REFN (push[0], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NVC0(push[0], SUBC_BSP(0x240), 3);
+ PUSH_DATAh(push[0], dec->fence_bo->offset);
+ PUSH_DATA (push[0], dec->fence_bo->offset);
+ PUSH_DATA (push[0], dec->fence_seq);
+
+ BEGIN_NVC0(push[0], SUBC_BSP(0x304), 1);
+ PUSH_DATA (push[0], 0);
+ PUSH_KICK (push[0]);
+
+ nouveau_pushbuf_space(push[1], 6, 1, 0);
+ PUSH_REFN (push[1], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NVC0(push[1], SUBC_VP(0x240), 3);
+ PUSH_DATAh(push[1], (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push[1], (dec->fence_bo->offset + 0x10));
+ PUSH_DATA (push[1], dec->fence_seq);
+
+ BEGIN_NVC0(push[1], SUBC_VP(0x304), 1);
+ PUSH_DATA (push[1], 0);
+ PUSH_KICK (push[1]);
+
+ nouveau_pushbuf_space(push[2], 6, 1, 0);
+ PUSH_REFN (push[2], dec->fence_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR);
+ BEGIN_NVC0(push[2], SUBC_PPP(0x240), 3);
+ PUSH_DATAh(push[2], (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push[2], (dec->fence_bo->offset + 0x20));
+ PUSH_DATA (push[2], dec->fence_seq);
+
+ BEGIN_NVC0(push[2], SUBC_PPP(0x304), 1);
+ PUSH_DATA (push[2], 0);
+ PUSH_KICK (push[2]);
+
+ usleep(100);
+ while (dec->fence_seq > dec->fence_map[0] ||
+ dec->fence_seq > dec->fence_map[4] ||
+ dec->fence_seq > dec->fence_map[8]) {
+ debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+ usleep(100);
+ }
+ debug_printf("%u: %u %u %u\n", dec->fence_seq, dec->fence_map[0], dec->fence_map[4], dec->fence_map[8]);
+#endif
+
+ return &dec->base;
+
+fw_fail:
+ debug_printf("Cannot create decoder without firmware..\n");
+ dec->base.destroy(&dec->base);
+ return NULL;
+
+fail:
+ debug_printf("Creation failed: %s (%i)\n", strerror(-ret), ret);
+ dec->base.destroy(&dec->base);
+ return NULL;
+}
+
+/* Create a video buffer; the common vp3 implementation does the work,
+ * with the nvc0-specific resource flag for video surfaces. */
+struct pipe_video_buffer *
+nvc0_video_buffer_create(struct pipe_context *pipe,
+ const struct pipe_video_buffer *templat)
+{
+ return nouveau_vp3_video_buffer_create(
+ pipe, templat, NVC0_RESOURCE_FLAG_VIDEO);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video.h b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
new file mode 100644
index 00000000000..9ee0280f8ea
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_screen.h"
+#include "nouveau_vp3_video.h"
+
+#include "vl/vl_decoder.h"
+#include "vl/vl_types.h"
+
+#include "util/u_video.h"
+
+/* Stage 1: bitstream parsing (BSP engine). Fills in vp_caps, is_ref and
+ * the reference buffer list for the following VP stage. */
+extern unsigned
+nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target,
+ unsigned comm_seq, unsigned num_buffers,
+ const void *const *data, const unsigned *num_bytes,
+ unsigned *vp_caps, unsigned *is_ref,
+ struct nouveau_vp3_video_buffer *refs[16]);
+
+/* Stage 2: video processor (VP engine) decode into the target buffer. */
+extern void
+nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
+ unsigned caps, unsigned is_ref,
+ struct nouveau_vp3_video_buffer *refs[16]);
+
+/* Stage 3: post-processing (PPP engine). */
+extern void
+nvc0_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
+ struct nouveau_vp3_video_buffer *target, unsigned comm_seq);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
new file mode 100644
index 00000000000..40696fa779f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_bsp.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_video.h"
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+static void dump_comm_bsp(struct comm *comm)
+{
+ unsigned idx = comm->bsp_cur_index & 0xf;
+ debug_printf("Cur seq: %x, bsp byte ofs: %x\n", comm->bsp_cur_index, comm->byte_ofs);
+ debug_printf("Status: %08x, pos: %08x\n", comm->status[idx], comm->pos[idx]);
+}
+#endif
+
/* Run the BSP (bitstream parser) stage of the VP3 decode pipeline.
 *
 * Uploads the bitstream into the per-sequence bsp_bo, then submits a
 * parse command on pushbuf[0].  Outputs the VP-stage capabilities and
 * reference info through vp_caps/is_ref/refs for the subsequent VP stage.
 *
 * Returns the BSP status word (2 in the non-debug path, where completion
 * is not polled here).  NOTE(review): on map failure this returns -1 from
 * an unsigned function, i.e. ~0u — callers must treat large values as
 * errors; confirm against callers.
 */
unsigned
nvc0_decoder_bsp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
                 struct nouveau_vp3_video_buffer *target,
                 unsigned comm_seq, unsigned num_buffers,
                 const void *const *data, const unsigned *num_bytes,
                 unsigned *vp_caps, unsigned *is_ref,
                 struct nouveau_vp3_video_buffer *refs[16])
{
   struct nouveau_pushbuf *push = dec->pushbuf[0];
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   uint32_t bsp_addr, comm_addr, inter_addr;
   uint32_t slice_size, bucket_size, ring_size;
   uint32_t caps;
   int ret;
   /* Bitstream buffers rotate through the queue; intermediate buffers
    * double-buffer on the sequence parity. */
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
   unsigned fence_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
#if NOUVEAU_VP3_DEBUG_FENCE
      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
#endif
      /* Must stay the LAST entry: it is dropped below when absent. */
      { dec->bitplane_bo, NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM },
   };
   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs);

   if (!dec->bitplane_bo)
      num_refs--;

#if NOUVEAU_VP3_DEBUG_FENCE
   fence_extra = 4;
#endif

   ret = nouveau_bo_map(bsp_bo, NOUVEAU_BO_WR, dec->client);
   if (ret) {
      debug_printf("map failed: %i %s\n", ret, strerror(-ret));
      return -1;
   }

   /* Copy the bitstream into bsp_bo and compute the BSP capability word. */
   caps = nouveau_vp3_bsp(dec, desc, target, comm_seq,
                          num_buffers, data, num_bytes);

   nouveau_vp3_vp_caps(dec, desc, target, comm_seq, vp_caps, is_ref, refs);

   /* Reserve exactly the number of dwords pushed below (per codec path). */
   nouveau_pushbuf_space(push, 6 + (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC ? 9 : 7) + fence_extra + 2, num_refs, 0);
   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   /* Engine addresses are in units of 256 bytes. */
   bsp_addr = bsp_bo->offset >> 8;
   inter_addr = inter_bo->offset >> 8;

#if NOUVEAU_VP3_DEBUG_FENCE
   memset(dec->comm, 0, 0x200);
   comm_addr = (dec->fence_bo->offset + COMM_OFFSET) >> 8;
#else
   comm_addr = bsp_addr + (COMM_OFFSET>>8);
#endif

   BEGIN_NVC0(push, SUBC_BSP(0x700), 5);
   PUSH_DATA (push, caps); // 700 cmd
   PUSH_DATA (push, bsp_addr + 1); // 704 strparm_bsp
   PUSH_DATA (push, bsp_addr + 7); // 708 str addr
   PUSH_DATA (push, comm_addr); // 70c comm
   PUSH_DATA (push, comm_seq); // 710 seq

   if (codec != PIPE_VIDEO_FORMAT_MPEG4_AVC) {
      u32 bitplane_addr;

      bitplane_addr = dec->bitplane_bo->offset >> 8;

      /* Non-H264: a single slice; bitplane data is only used here. */
      nouveau_vp3_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);
      BEGIN_NVC0(push, SUBC_BSP(0x400), 6);
      PUSH_DATA (push, bsp_addr); // 400 picparm addr
      PUSH_DATA (push, inter_addr); // 404 interparm addr
      PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 408 interdata addr
      PUSH_DATA (push, ring_size << 8); // 40c interdata_size
      PUSH_DATA (push, bitplane_addr); // 410 BITPLANE_DATA
      PUSH_DATA (push, 0x400); // 414 BITPLANE_DATA_SIZE
   } else {
      /* H264: sizes depend on the slice count of this picture. */
      nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
      BEGIN_NVC0(push, SUBC_BSP(0x400), 8);
      PUSH_DATA (push, bsp_addr); // 400 picparm addr
      PUSH_DATA (push, inter_addr); // 404 interparm addr
      PUSH_DATA (push, slice_size << 8); // 408 interparm size?
      PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 40c interdata addr
      PUSH_DATA (push, ring_size << 8); // 410 interdata size
      PUSH_DATA (push, inter_addr + slice_size); // 414 bucket?
      PUSH_DATA (push, bucket_size << 8); // 418 bucket size? unshifted..
      PUSH_DATA (push, 0); // 41c targets
      // TODO: Double check 414 / 418 with nvidia trace
   }

#if NOUVEAU_VP3_DEBUG_FENCE
   /* Debug path: fence write + busy-wait so the status can be dumped. */
   BEGIN_NVC0(push, SUBC_BSP(0x240), 3);
   PUSH_DATAh(push, dec->fence_bo->offset);
   PUSH_DATA (push, dec->fence_bo->offset);
   PUSH_DATA (push, dec->fence_seq);

   BEGIN_NVC0(push, SUBC_BSP(0x300), 1);
   PUSH_DATA (push, 1);
   PUSH_KICK (push);

   {
      unsigned spin = 0;
      do {
         usleep(100);
         if ((spin++ & 0xff) == 0xff) {
            debug_printf("b%u: %u\n", dec->fence_seq, dec->fence_map[0]);
            dump_comm_bsp(dec->comm);
         }
      } while (dec->fence_seq > dec->fence_map[0]);
   }

   dump_comm_bsp(dec->comm);
   return dec->comm->status[comm_seq & 0xf];
#else
   BEGIN_NVC0(push, SUBC_BSP(0x300), 1);
   PUSH_DATA (push, 0);
   PUSH_KICK (push);
   return 2;
#endif
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c
new file mode 100644
index 00000000000..4ceec4fbffc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_ppp.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_video.h"
+
/* Program the common PPP (post-processing) parameters on pushbuf[2]:
 * input/output strides and the luma/chroma plane addresses of the
 * decoded picture and the destination NV12 surfaces.  low700 selects
 * the codec-specific low bits of method 0x700.
 */
static void
nvc0_decoder_setup_ppp(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target, uint32_t low700) {
   struct nouveau_pushbuf *push = dec->pushbuf[2];

   /* All dimensions are in macroblock (16-pixel) units. */
   uint32_t stride_in = mb(dec->base.width);
   uint32_t stride_out = mb(target->resources[0]->width0);
   uint32_t dec_h = mb(dec->base.height);
   uint32_t dec_w = mb(dec->base.width);
   uint64_t in_addr;
   uint32_t y2, cbcr, cbcr2, i;
   struct nouveau_pushbuf_refn bo_refs[] = {
      /* First two slots are filled with the target's plane bos below. */
      { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { NULL, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { dec->ref_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
#if NOUVEAU_VP3_DEBUG_FENCE
      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
#endif
   };
   unsigned num_refs = sizeof(bo_refs)/sizeof(*bo_refs);

   for (i = 0; i < 2; ++i) {
      struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];
      bo_refs[i].bo = mt->base.bo;
   }

   nouveau_pushbuf_refn(push, bo_refs, num_refs);
   /* Offsets of the second field's luma and the chroma planes in ref_bo. */
   nouveau_vp3_ycbcr_offsets(dec, &y2, &cbcr, &cbcr2);

   BEGIN_NVC0(push, SUBC_PPP(0x700), 10);
   in_addr = nouveau_vp3_video_addr(dec, target) >> 8;

   PUSH_DATA (push, (stride_out << 24) | (stride_out << 16) | low700); // 700
   PUSH_DATA (push, (stride_in << 24) | (stride_in << 16) | (dec_h << 8) | dec_w); // 704
   assert(dec_w == stride_in);

   /* Input: */
   PUSH_DATA (push, in_addr); // 708
   PUSH_DATA (push, in_addr + y2); // 70c
   PUSH_DATA (push, in_addr + cbcr); // 710
   PUSH_DATA (push, in_addr + cbcr2); // 714

   /* Output: one luma + one chroma address per plane resource. */
   for (i = 0; i < 2; ++i) {
      struct nv50_miptree *mt = (struct nv50_miptree *)target->resources[i];

      PUSH_DATA (push, mt->base.address >> 8);
      PUSH_DATA (push, (mt->base.address + mt->total_size/2) >> 8);
      /* The PPP engine writes these surfaces; mark them for sync. */
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   }
}
+
/* VC-1 specific PPP setup: programs the base parameters plus the picture
 * quantizer, and returns the caps word for the 0x734 packet (0x10 here;
 * in-loop deblocking is not supported, hence the asserts).
 */
static uint32_t
nvc0_decoder_vc1_ppp(struct nouveau_vp3_decoder *dec, struct pipe_vc1_picture_desc *desc, struct nouveau_vp3_video_buffer *target) {
   struct nouveau_pushbuf *push = dec->pushbuf[2];

   nvc0_decoder_setup_ppp(dec, target, 0x1412);
   assert(!desc->deblockEnable);
   /* Dimensions must be macroblock (16px) aligned. */
   assert(!(dec->base.width & 0xf));
   assert(!(dec->base.height & 0xf));

   BEGIN_NVC0(push, SUBC_PPP(0x400), 1);
   PUSH_DATA (push, desc->pquant << 11);

   // 728 = wtf?
   return 0x10;
}
+
/* Run the PPP (post-processing) stage: dispatch the codec-specific setup,
 * then submit the sequence/caps packet on pushbuf[2].  In the debug-fence
 * build this also busy-waits for completion via fence_map[8].
 */
void
nvc0_decoder_ppp(struct nouveau_vp3_decoder *dec, union pipe_desc desc, struct nouveau_vp3_video_buffer *target, unsigned comm_seq) {
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_pushbuf *push = dec->pushbuf[2];
   unsigned ppp_caps = 0x10;
   unsigned fence_extra = 0;

#if NOUVEAU_VP3_DEBUG_FENCE
   fence_extra = 4;
#endif

   /* Reserve the worst-case dword count for the pushes below. */
   nouveau_pushbuf_space(push, 11 + (codec == PIPE_VIDEO_FORMAT_VC1 ? 2 : 0) + 3 + fence_extra + 2, 4, 0);

   switch (codec) {
   case PIPE_VIDEO_FORMAT_MPEG12: {
      /* Low bit of the 0x700 parameter distinguishes MPEG2 from MPEG1. */
      unsigned mpeg2 = dec->base.profile != PIPE_VIDEO_PROFILE_MPEG1;
      nvc0_decoder_setup_ppp(dec, target, 0x1410 | mpeg2);
      break;
   }
   case PIPE_VIDEO_FORMAT_MPEG4: nvc0_decoder_setup_ppp(dec, target, 0x1414); break;
   case PIPE_VIDEO_FORMAT_VC1: ppp_caps = nvc0_decoder_vc1_ppp(dec, desc.vc1, target); break;
   case PIPE_VIDEO_FORMAT_MPEG4_AVC: nvc0_decoder_setup_ppp(dec, target, 0x1413); break;
   default: assert(0);
   }
   BEGIN_NVC0(push, SUBC_PPP(0x734), 2);
   PUSH_DATA (push, comm_seq);
   PUSH_DATA (push, ppp_caps);

#if NOUVEAU_VP3_DEBUG_FENCE
   /* PPP fence lives at offset 0x20 in fence_bo (word 8 of fence_map). */
   BEGIN_NVC0(push, SUBC_PPP(0x240), 3);
   PUSH_DATAh(push, (dec->fence_bo->offset + 0x20));
   PUSH_DATA (push, (dec->fence_bo->offset + 0x20));
   PUSH_DATA (push, dec->fence_seq);

   BEGIN_NVC0(push, SUBC_PPP(0x300), 1);
   PUSH_DATA (push, 1);
   PUSH_KICK (push);

   {
      unsigned spin = 0;

      do {
         usleep(100);
         if ((spin++ & 0xff) == 0xff)
            debug_printf("p%u: %u\n", dec->fence_seq, dec->fence_map[8]);
      } while (dec->fence_seq > dec->fence_map[8]);
   }
#else
   BEGIN_NVC0(push, SUBC_PPP(0x300), 1);
   PUSH_DATA (push, 0);
   PUSH_KICK (push);
#endif
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
new file mode 100644
index 00000000000..0d152b9624f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright 2011-2013 Maarten Lankhorst
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nvc0/nvc0_video.h"
+#include <sys/mman.h>
+
+#if NOUVEAU_VP3_DEBUG_FENCE
+static void dump_comm_vp(struct nouveau_vp3_decoder *dec, struct comm *comm, u32 comm_seq,
+ struct nouveau_bo *inter_bo, unsigned slice_size)
+{
+ unsigned i, idx = comm->pvp_cur_index & 0xf;
+ debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
+#if 0
+ debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
+ debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
+
+ for (i = 0; i != comm->irq_index; ++i)
+ debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
+ for (i = 0; i != comm->parse_endpos_index; ++i)
+ debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
+#endif
+ debug_printf("mb_y = %u\n", comm->mb_y[idx]);
+ if (comm->status_vp[idx] == 1)
+ return;
+
+ if ((comm->pvp_stage & 0xff) != 0xff) {
+ unsigned *map;
+ assert(nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client) >= 0);
+ map = inter_bo->map;
+ for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
+ debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
+ }
+ munmap(inter_bo->map, inter_bo->size);
+ inter_bo->map = NULL;
+ }
+ assert((comm->pvp_stage & 0xff) == 0xff);
+}
+#endif
+
+static void
+nvc0_decoder_kick_ref(struct nouveau_vp3_decoder *dec, struct nouveau_vp3_video_buffer *target)
+{
+ dec->refs[target->valid_ref].vidbuf = NULL;
+ dec->refs[target->valid_ref].last_used = 0;
+// debug_printf("Unreffed %p\n", target);
+}
+
/* Run the VP (video processor / motion compensation) stage on pushbuf[1],
 * consuming the picparm/inter data the BSP stage produced and writing the
 * decoded picture (and reference bookkeeping) into ref_bo.
 */
void
nvc0_decoder_vp(struct nouveau_vp3_decoder *dec, union pipe_desc desc,
                struct nouveau_vp3_video_buffer *target, unsigned comm_seq,
                unsigned caps, unsigned is_ref,
                struct nouveau_vp3_video_buffer *refs[16])
{
   struct nouveau_pushbuf *push = dec->pushbuf[1];
   /* pic_addr[0..15] are reference pictures, pic_addr[16] is the target. */
   uint32_t bsp_addr, comm_addr, inter_addr, ucode_addr, pic_addr[17], last_addr, null_addr;
   uint32_t slice_size, bucket_size, ring_size, i;
   enum pipe_video_format codec = u_reduce_video_profile(dec->base.profile);
   struct nouveau_bo *bsp_bo = dec->bsp_bo[comm_seq % NOUVEAU_VP3_VIDEO_QDEPTH];
   struct nouveau_bo *inter_bo = dec->inter_bo[comm_seq & 1];
   u32 fence_extra = 0, codec_extra = 0;
   struct nouveau_pushbuf_refn bo_refs[] = {
      { inter_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { dec->ref_bo, NOUVEAU_BO_WR | NOUVEAU_BO_VRAM },
      { bsp_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
#if NOUVEAU_VP3_DEBUG_FENCE
      { dec->fence_bo, NOUVEAU_BO_WR | NOUVEAU_BO_GART },
#endif
      /* Must stay LAST: dropped below when no firmware bo is present. */
      { dec->fw_bo, NOUVEAU_BO_RD | NOUVEAU_BO_VRAM },
   };
   int num_refs = sizeof(bo_refs)/sizeof(*bo_refs) - !dec->fw_bo;

#if NOUVEAU_VP3_DEBUG_FENCE
   fence_extra = 4;
#endif

   if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
      nouveau_vp3_inter_sizes(dec, desc.h264->slice_count, &slice_size, &bucket_size, &ring_size);
      codec_extra += 2;
   } else
      nouveau_vp3_inter_sizes(dec, 1, &slice_size, &bucket_size, &ring_size);

   if (dec->base.max_references > 2)
      codec_extra += 1 + (dec->base.max_references - 2);

   pic_addr[16] = nouveau_vp3_video_addr(dec, target) >> 8;
   last_addr = null_addr = nouveau_vp3_video_addr(dec, NULL) >> 8;

   /* Resolve each reference slot; a ref that got evicted from the decoder's
    * bookkeeping falls back to the null picture. */
   for (i = 0; i < dec->base.max_references; ++i) {
      if (!refs[i])
         pic_addr[i] = last_addr;
      else if (dec->refs[refs[i]->valid_ref].vidbuf == refs[i])
         last_addr = pic_addr[i] = nouveau_vp3_video_addr(dec, refs[i]) >> 8;
      else
         pic_addr[i] = null_addr;
   }
   if (!is_ref)
      nvc0_decoder_kick_ref(dec, target);

   nouveau_pushbuf_space(push, 8 + 3 * (codec != PIPE_VIDEO_FORMAT_MPEG12) +
                         6 + codec_extra + fence_extra + 2, num_refs, 0);

   nouveau_pushbuf_refn(push, bo_refs, num_refs);

   /* Engine addresses are in units of 256 bytes. */
   bsp_addr = bsp_bo->offset >> 8;
#if NOUVEAU_VP3_DEBUG_FENCE
   comm_addr = (dec->fence_bo->offset + COMM_OFFSET)>>8;
#else
   comm_addr = bsp_addr + (COMM_OFFSET>>8);
#endif
   inter_addr = inter_bo->offset >> 8;
   if (dec->fw_bo)
      ucode_addr = dec->fw_bo->offset >> 8;
   else
      ucode_addr = 0;

   BEGIN_NVC0(push, SUBC_VP(0x700), 7);
   PUSH_DATA (push, caps); // 700
   PUSH_DATA (push, comm_seq); // 704
   PUSH_DATA (push, 0); // 708 fuc targets, ignored for nvc0
   PUSH_DATA (push, dec->fw_sizes); // 70c
   PUSH_DATA (push, bsp_addr+(VP_OFFSET>>8)); // 710 picparm_addr
   PUSH_DATA (push, inter_addr); // 714 inter_parm
   PUSH_DATA (push, inter_addr + slice_size + bucket_size); // 718 inter_data_ofs

   if (bucket_size) {
      uint64_t tmpimg_addr = dec->ref_bo->offset + dec->ref_stride * (dec->base.max_references+2);

      BEGIN_NVC0(push, SUBC_VP(0x71c), 2);
      PUSH_DATA (push, tmpimg_addr >> 8); // 71c
      PUSH_DATA (push, inter_addr + slice_size); // 720 bucket_ofs
   }

   /* Methods auto-increment, so these five land at 724,728,72c,730,734.
    * (The original offset comments were scrambled as 734/72c/730.) */
   BEGIN_NVC0(push, SUBC_VP(0x724), 5);
   PUSH_DATA (push, comm_addr); // 724
   PUSH_DATA (push, ucode_addr); // 728
   PUSH_DATA (push, pic_addr[16]); // 72c target picture
   PUSH_DATA (push, pic_addr[0]); // 730
   PUSH_DATA (push, pic_addr[1]); // 734

   if (dec->base.max_references > 2) {
      int i;

      /* Additional reference pictures go to 0x400.. (max 14 more). */
      BEGIN_NVC0(push, SUBC_VP(0x400), dec->base.max_references - 2);
      for (i = 2; i < dec->base.max_references; ++i) {
         assert(0x400 + (i - 2) * 4 < 0x438);
         PUSH_DATA (push, pic_addr[i]);
      }
   }

   if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
      BEGIN_NVC0(push, SUBC_VP(0x438), 1);
      PUSH_DATA (push, desc.h264->slice_count);
   }

   //debug_printf("Decoding %08lx with %08lx and %08lx\n", pic_addr[16], pic_addr[0], pic_addr[1]);

#if NOUVEAU_VP3_DEBUG_FENCE
   /* VP fence lives at offset 0x10 in fence_bo (word 4 of fence_map). */
   BEGIN_NVC0(push, SUBC_VP(0x240), 3);
   PUSH_DATAh(push, (dec->fence_bo->offset + 0x10));
   PUSH_DATA (push, (dec->fence_bo->offset + 0x10));
   PUSH_DATA (push, dec->fence_seq);

   BEGIN_NVC0(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 1);
   PUSH_KICK(push);

   {
      unsigned spin = 0;
      do {
         usleep(100);
         if ((spin++ & 0xff) == 0xff) {
            debug_printf("v%u: %u\n", dec->fence_seq, dec->fence_map[4]);
            dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
         }
      } while (dec->fence_seq > dec->fence_map[4]);
   }
   dump_comm_vp(dec, dec->comm, comm_seq, inter_bo, slice_size << 8);
#else
   BEGIN_NVC0(push, SUBC_VP(0x300), 1);
   PUSH_DATA (push, 0);
   PUSH_KICK (push);
#endif
}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
new file mode 100644
index 00000000000..3514d9dc3d0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_winsys.h
@@ -0,0 +1,144 @@
+
+#ifndef __NVC0_WINSYS_H__
+#define __NVC0_WINSYS_H__
+
+#include <stdint.h>
+#include <unistd.h>
+
+#include "pipe/p_defines.h"
+
+#include "nouveau_winsys.h"
+#include "nouveau_buffer.h"
+
+#ifndef NV04_PFIFO_MAX_PACKET_LEN
+#define NV04_PFIFO_MAX_PACKET_LEN 2047
+#endif
+
+
+static INLINE void
+nv50_add_bufctx_resident_bo(struct nouveau_bufctx *bufctx, int bin,
+ unsigned flags, struct nouveau_bo *bo)
+{
+ nouveau_bufctx_refn(bufctx, bin, bo, flags)->priv = NULL;
+}
+
+static INLINE void
+nvc0_add_resident(struct nouveau_bufctx *bufctx, int bin,
+ struct nv04_resource *res, unsigned flags)
+{
+ struct nouveau_bufref *ref =
+ nouveau_bufctx_refn(bufctx, bin, res->bo, flags | res->domain);
+ ref->priv = res;
+ ref->priv_data = flags;
+}
+
/* Make a raw bo resident in the given binding point. */
#define BCTX_REFN_bo(ctx, bin, fl, bo) \
   nv50_add_bufctx_resident_bo(ctx, NVC0_BIND_##bin, fl, bo);

/* Make an nv04_resource resident; 'acc' is RD/WR/RDWR (NOUVEAU_BO_*). */
#define BCTX_REFN(bctx, bin, res, acc) \
   nvc0_add_resident(bctx, NVC0_BIND_##bin, res, NOUVEAU_BO_##acc)
+
+static INLINE void
+PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
+{
+ struct nouveau_pushbuf_refn ref = { bo, flags };
+ nouveau_pushbuf_refn(push, &ref, 1);
+}
+
+
/* Subchannel assignments: each SUBC_* macro expands to the pair
 * "subchannel-index, method-offset" consumed by BEGIN_NVC0 & friends. */
#define SUBC_3D(m) 0, (m)
#define NVC0_3D(n) SUBC_3D(NVC0_3D_##n)
#define NVE4_3D(n) SUBC_3D(NVE4_3D_##n)

#define SUBC_COMPUTE(m) 1, (m)
#define NVC0_COMPUTE(n) SUBC_COMPUTE(NVC0_COMPUTE_##n)
#define NVE4_COMPUTE(n) SUBC_COMPUTE(NVE4_COMPUTE_##n)

#define SUBC_M2MF(m) 2, (m)
#define SUBC_P2MF(m) 2, (m)
#define NVC0_M2MF(n) SUBC_M2MF(NVC0_M2MF_##n)
#define NVE4_P2MF(n) SUBC_P2MF(NVE4_P2MF_##n)

#define SUBC_2D(m) 3, (m)
#define NVC0_2D(n) SUBC_2D(NVC0_2D_##n)

#define SUBC_COPY(m) 4, (m)
/* Fixed: the parameter was declared as 'm' while the expansion pasted
 * '##n', so NVE4_COPY(FOO) could never expand its argument. */
#define NVE4_COPY(n) SUBC_COPY(NVE4_COPY_##n)

#define SUBC_SW(m) 7, (m)
+
/* Build a FIFO packet header for an auto-incrementing method sequence
 * (opcode 001): 'size' data words follow, methods step by 4. */
static INLINE uint32_t
NVC0_FIFO_PKHDR_SQ(int subc, int mthd, unsigned size)
{
   uint32_t hdr = (uint32_t)size << 16;

   hdr |= (uint32_t)subc << 13;
   hdr |= (uint32_t)mthd >> 2;
   return hdr | 0x20000000;
}
+
/* Build a FIFO packet header for a non-incrementing method (opcode 011):
 * all 'size' data words are written to the same method. */
static INLINE uint32_t
NVC0_FIFO_PKHDR_NI(int subc, int mthd, unsigned size)
{
   uint32_t hdr = (uint32_t)size << 16;

   hdr |= (uint32_t)subc << 13;
   hdr |= (uint32_t)mthd >> 2;
   return hdr | 0x60000000;
}
+
/* Build an immediate-data packet header (opcode 100): the 13-bit 'data'
 * payload is carried in the header itself, no data words follow. */
static INLINE uint32_t
NVC0_FIFO_PKHDR_IL(int subc, int mthd, uint8_t data)
{
   uint32_t hdr = (uint32_t)data << 16;

   hdr |= (uint32_t)subc << 13;
   hdr |= (uint32_t)mthd >> 2;
   return hdr | 0x80000000;
}
+
/* Build a one-increment packet header (opcode 101): the method increments
 * once after the first data word, then stays fixed. */
static INLINE uint32_t
NVC0_FIFO_PKHDR_1I(int subc, int mthd, unsigned size)
{
   uint32_t hdr = (uint32_t)size << 16;

   hdr |= (uint32_t)subc << 13;
   hdr |= (uint32_t)mthd >> 2;
   return hdr | 0xa0000000;
}
+
+
/* Tiling/format memtype of a bo from its nvc0 allocation config
 * (0 means linear). */
static INLINE uint8_t
nouveau_bo_memtype(const struct nouveau_bo *bo)
{
   return bo->config.nvc0.memtype;
}
+
+
+static INLINE void
+PUSH_DATAh(struct nouveau_pushbuf *push, uint64_t data)
+{
+ *push->cur++ = (uint32_t)(data >> 32);
+}
+
/* Open an auto-incrementing method packet: 'size' data words follow. */
static INLINE void
BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
{
#ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
   PUSH_SPACE(push, size + 1);
#endif
   PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(subc, mthd, size));
}
+
/* Open a non-incrementing method packet: all data goes to one method. */
static INLINE void
BEGIN_NIC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
{
#ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
   PUSH_SPACE(push, size + 1);
#endif
   PUSH_DATA (push, NVC0_FIFO_PKHDR_NI(subc, mthd, size));
}
+
/* Open a one-increment method packet: the method advances once after the
 * first data word, then stays fixed. */
static INLINE void
BEGIN_1IC0(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
{
#ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
   PUSH_SPACE(push, size + 1);
#endif
   PUSH_DATA (push, NVC0_FIFO_PKHDR_1I(subc, mthd, size));
}
+
/* Emit a method with an immediate (13-bit) payload in a single word. */
static INLINE void
IMMED_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, uint8_t data)
{
#ifndef NVC0_PUSH_EXPLICIT_SPACE_CHECKING
   PUSH_SPACE(push, 1);
#endif
   PUSH_DATA (push, NVC0_FIFO_PKHDR_IL(subc, mthd, data));
}
+
+#endif /* __NVC0_WINSYS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
new file mode 100644
index 00000000000..06c914fb5e6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright 2012 Nouveau Project
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Christoph Bumiller
+ */
+
+#include "nvc0/nvc0_context.h"
+#include "nvc0/nvc0_compute.h"
+#include "nvc0/nve4_compute.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+#ifdef DEBUG
+static void nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *);
+#endif
+
+
/* One-time setup of the GK104/GK110 COMPUTE object on screen creation:
 * allocates the compute object and the parameter buffer, programs TEMP,
 * code/texture base addresses, and uploads the MS sample-offset table
 * (plus trap info in debug builds).
 *
 * Returns 0 on success, a negative errno / -1 on failure.
 */
int
nve4_screen_compute_setup(struct nvc0_screen *screen,
                          struct nouveau_pushbuf *push)
{
   struct nouveau_device *dev = screen->base.device;
   struct nouveau_object *chan = screen->base.channel;
   unsigned i;
   int ret;
   uint32_t obj_class;

   switch (dev->chipset & 0xf0) {
   case 0xf0:
      obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
      break;
   case 0xe0:
      obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
      break;
   default:
      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
      return -1;
   }

   ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
                            &screen->compute);
   if (ret) {
      NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
      return ret;
   }

   /* Constant/parameter staging buffer for compute input uploads. */
   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL,
                        &screen->parm);
   if (ret)
      return ret;

   BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->oclass);

   BEGIN_NVC0(push, NVE4_COMPUTE(TEMP_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->offset);
   /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
   BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(0)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);
   BEGIN_NVC0(push, NVE4_COMPUTE(MP_TEMP_SIZE_HIGH(1)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);

   /* Unified address space ? Who needs that ? Certainly not OpenCL.
    *
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    *  accessible. We cannot prevent that at the moment, so expect failure.
    */
   BEGIN_NVC0(push, NVE4_COMPUTE(LOCAL_BASE), 1);
   PUSH_DATA (push, 1 << 24);
   BEGIN_NVC0(push, NVE4_COMPUTE(SHARED_BASE), 1);
   PUSH_DATA (push, 2 << 24);

   BEGIN_NVC0(push, NVE4_COMPUTE(CODE_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->text->offset);
   PUSH_DATA (push, screen->text->offset);

   BEGIN_NVC0(push, SUBC_COMPUTE(0x0310), 1);
   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);

   /* NOTE: these do not affect the state used by the 3D object */
   BEGIN_NVC0(push, NVE4_COMPUTE(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
   BEGIN_NVC0(push, NVE4_COMPUTE(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);

   if (obj_class >= NVF0_COMPUTE_CLASS) {
      /* GK110-only magic init sequence; method 0x0248 semantics are not
       * documented here — derived from traces, presumably. */
      BEGIN_NVC0(push, SUBC_COMPUTE(0x0248), 1);
      PUSH_DATA (push, 0x100);
      BEGIN_NIC0(push, SUBC_COMPUTE(0x0248), 63);
      for (i = 63; i >= 1; --i)
         PUSH_DATA(push, 0x38000 | i);
      IMMED_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 0);
      IMMED_NVC0(push, SUBC_COMPUTE(0x518), 0);
   }

   BEGIN_NVC0(push, NVE4_COMPUTE(TEX_CB_INDEX), 1);
   PUSH_DATA (push, 0); /* does not interfere with 3D */

   if (obj_class >= NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_COMPUTE(0x02c4), 1);

   /* MS sample coordinate offsets: these do not work with _ALT modes ! */
   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
   /* 16 (x, y) pairs covering the 8x MS patterns. */
   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, 0); /* 0 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 1); /* 1 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0); /* 2 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 1); /* 3 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 2); /* 4 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 3); /* 5 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 2); /* 6 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

#ifdef DEBUG
   /* Upload trap-handler info so shader traps can be inspected. */
   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 28);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 8);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATA (push, screen->tls->offset);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
   PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
   PUSH_DATA (push, 0); /* warp cfstack size */
#endif

   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   return 0;
}
+
+
/* Upload surface descriptors for the compute stage (stage t == 1).
 * NVE4 surface load/store instructions take all surface information as
 * immediate data, so each dirty slot's info is written into the parameter
 * buffer instead of a binding point.  Non-dirty but valid surfaces are
 * re-referenced so they stay resident across the validation.
 */
static void
nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nv50_surface *sf;
   struct nv04_resource *res;
   uint32_t mask;
   unsigned i;
   const unsigned t = 1;  /* compute surface stage index */

   mask = nvc0->surfaces_dirty[t];
   while (mask) {
      i = ffs(mask) - 1;
      mask &= ~(1 << i);

      /*
       * NVE4's surface load/store instructions receive all the information
       * directly instead of via binding points, so we have to supply them.
       */
      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
      BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 64);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 17);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

      /* Writes the 16 info words, including for a NULL surface. */
      nve4_set_surface_info(push, nvc0->surfaces[t][i], screen);

      sf = nv50_surface(nvc0->surfaces[t][i]);
      if (sf) {
         res = nv04_resource(sf->base.texture);

         if (sf->base.writable)
            BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
         else
            BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
      }
   }
   if (nvc0->surfaces_dirty[t]) {
      /* Make the uploaded descriptors visible to subsequent launches. */
      BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
      PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
   }

   /* re-reference non-dirty surfaces */
   mask = nvc0->surfaces_valid[t] & ~nvc0->surfaces_dirty[t];
   while (mask) {
      i = ffs(mask) - 1;
      mask &= ~(1 << i);

      sf = nv50_surface(nvc0->surfaces[t][i]);
      res = nv04_resource(sf->base.texture);

      if (sf->base.writable)
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);
      else
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);
   }

   nvc0->surfaces_dirty[t] = 0;
}
+
+
+/* Thankfully, textures with samplers follow the normal rules. */
+static void
+nve4_compute_validate_samplers(struct nvc0_context *nvc0)
+{
+ boolean need_flush = nve4_validate_tsc(nvc0, 5);
+ if (need_flush) {
+ BEGIN_NVC0(nvc0->base.pushbuf, NVE4_COMPUTE(TSC_FLUSH), 1);
+ PUSH_DATA (nvc0->base.pushbuf, 0);
+ }
+}
+/* (Code duplicated at bottom for various non-convincing reasons.
+ * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
+ * entries to avoid a subchannel switch.
+ * Same for texture cache flushes.
+ * Also, the bufctx differs, and more IFs in the 3D version looks ugly.)
+ */
+static void nve4_compute_validate_textures(struct nvc0_context *);
+
/* Upload the combined texture/sampler handles for the compute stage into
 * the parameter buffer.  Uploads one contiguous range spanning the lowest
 * through the highest dirty slot (clean slots in between are re-uploaded
 * unchanged), then flushes the constant cache.
 */
static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
   uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];

   if (!dirty)
      return;
   /* [i, i+n) covers the lowest..highest set bit of 'dirty'. */
   i = ffs(dirty) - 1;
   n = util_logbase2(dirty) + 1 - i;
   assert(n);

   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);

   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address);
   PUSH_DATA (push, address);
   BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + n);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);

   BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   nvc0->textures_dirty[s] = 0;
   nvc0->samplers_dirty[s] = 0;
}
+
+
+static boolean
+nve4_compute_state_validate(struct nvc0_context *nvc0)
+{
+ if (!nvc0_compute_validate_program(nvc0))
+ return FALSE;
+ if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES)
+ nve4_compute_validate_textures(nvc0);
+ if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS)
+ nve4_compute_validate_samplers(nvc0);
+ if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS))
+ nve4_compute_set_tex_handles(nvc0);
+ if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES)
+ nve4_compute_validate_surfaces(nvc0);
+ if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS)
+ nvc0_validate_global_residents(nvc0,
+ nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL);
+
+ nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, FALSE);
+
+ nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp);
+ if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf)))
+ return FALSE;
+ if (unlikely(nvc0->state.flushed))
+ nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, TRUE);
+
+ return TRUE;
+}
+
+
+static void
+nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
+ const uint *block_layout,
+ const uint *grid_layout)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_program *cp = nvc0->compprog;
+
+ if (cp->parm_size) {
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->parm->offset);
+ PUSH_DATA (push, screen->parm->offset);
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, cp->parm_size);
+ PUSH_DATA (push, 0x1);
+ BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, input, cp->parm_size / 4);
+ }
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+ PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 7 * 4);
+ PUSH_DATA (push, 0x1);
+ BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + 7);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, block_layout, 3);
+ PUSH_DATAp(push, grid_layout, 3);
+ PUSH_DATA (push, 0);
+
+ BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+ PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
+static INLINE uint8_t
+nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
+{
+ if (shared_size > (32 << 10))
+ return NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1;
+ if (shared_size > (16 << 10))
+ return NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1;
+ return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
+}
+
+static void
+nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
+ struct nve4_cp_launch_desc *desc,
+ uint32_t label,
+ const uint *block_layout,
+ const uint *grid_layout)
+{
+ const struct nvc0_screen *screen = nvc0->screen;
+ const struct nvc0_program *cp = nvc0->compprog;
+ unsigned i;
+
+ nve4_cp_launch_desc_init_default(desc);
+
+ desc->entry = nvc0_program_symbol_offset(cp, label);
+
+ desc->griddim_x = grid_layout[0];
+ desc->griddim_y = grid_layout[1];
+ desc->griddim_z = grid_layout[2];
+ desc->blockdim_x = block_layout[0];
+ desc->blockdim_y = block_layout[1];
+ desc->blockdim_z = block_layout[2];
+
+ desc->shared_size = align(cp->cp.smem_size, 0x100);
+ desc->local_size_p = align(cp->cp.lmem_size, 0x10);
+ desc->local_size_n = 0;
+ desc->cstack_size = 0x800;
+ desc->cache_split = nve4_compute_derive_cache_split(nvc0, cp->cp.smem_size);
+
+ desc->gpr_alloc = cp->num_gprs;
+ desc->bar_alloc = cp->num_barriers;
+
+ for (i = 0; i < 7; ++i) {
+ const unsigned s = 5;
+ if (nvc0->constbuf[s][i].u.buf)
+ nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+ }
+ nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+}
+
+static INLINE struct nve4_cp_launch_desc *
+nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
+ struct nouveau_bo **pbo, uint64_t *pgpuaddr)
+{
+ uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
+ if (!ptr)
+ return NULL;
+ if (*pgpuaddr & 255) {
+ unsigned adj = 256 - (*pgpuaddr & 255);
+ ptr += adj;
+ *pgpuaddr += adj;
+ }
+ return (struct nve4_cp_launch_desc *)ptr;
+}
+
+void
+nve4_launch_grid(struct pipe_context *pipe,
+ const uint *block_layout, const uint *grid_layout,
+ uint32_t label,
+ const void *input)
+{
+ struct nvc0_context *nvc0 = nvc0_context(pipe);
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nve4_cp_launch_desc *desc;
+ uint64_t desc_gpuaddr;
+ struct nouveau_bo *desc_bo;
+ int ret;
+
+ desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
+ if (!desc) {
+ ret = -1;
+ goto out;
+ }
+ BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
+ desc_bo);
+
+ ret = !nve4_compute_state_validate(nvc0);
+ if (ret)
+ goto out;
+
+ nve4_compute_setup_launch_desc(nvc0, desc, label, block_layout, grid_layout);
+#ifdef DEBUG
+ if (debug_get_num_option("NV50_PROG_DEBUG", 0))
+ nve4_compute_dump_launch_desc(desc);
+#endif
+
+ nve4_compute_upload_input(nvc0, input, block_layout, grid_layout);
+
+ /* upload descriptor and flush */
+#if 0
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr);
+ PUSH_DATA (push, desc_gpuaddr);
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 256);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 1 + (256 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+ BEGIN_NVC0(push, NVE4_COMPUTE(FLUSH), 1);
+ PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
+#endif
+ BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH_DESC_ADDRESS), 1);
+ PUSH_DATA (push, desc_gpuaddr >> 8);
+ BEGIN_NVC0(push, NVE4_COMPUTE(LAUNCH), 1);
+ PUSH_DATA (push, 0x3);
+ BEGIN_NVC0(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+ PUSH_DATA (push, 0);
+
+out:
+ if (ret)
+ NOUVEAU_ERR("Failed to launch grid !\n");
+ nouveau_scratch_done(&nvc0->base);
+ nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
+}
+
+
+#define NVE4_TIC_ENTRY_INVALID 0x000fffff
+
+static void
+nve4_compute_validate_textures(struct nvc0_context *nvc0)
+{
+ struct nouveau_bo *txc = nvc0->screen->txc;
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ const unsigned s = 5;
+ unsigned i;
+ uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+ unsigned n[2] = { 0, 0 };
+
+ for (i = 0; i < nvc0->num_textures[s]; ++i) {
+ struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
+ struct nv04_resource *res;
+ const boolean dirty = !!(nvc0->textures_dirty[s] & (1 << i));
+
+ if (!tic) {
+ nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+ continue;
+ }
+ res = nv04_resource(tic->pipe.texture);
+
+ if (tic->id < 0) {
+ tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);
+
+ PUSH_SPACE(push, 16);
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, txc->offset + (tic->id * 32));
+ PUSH_DATA (push, txc->offset + (tic->id * 32));
+ BEGIN_NVC0(push, NVE4_COMPUTE(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 32);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_COMPUTE(UPLOAD_EXEC), 9);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, &tic->tic[0], 8);
+
+ commands[0][n[0]++] = (tic->id << 4) | 1;
+ } else
+ if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
+ commands[1][n[1]++] = (tic->id << 4) | 1;
+ }
+ nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);
+
+ res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+ res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
+
+ nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
+ nvc0->tex_handles[s][i] |= tic->id;
+ if (dirty)
+ BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
+ }
+ for (; i < nvc0->state.num_textures[s]; ++i)
+ nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
+
+ if (n[0]) {
+ BEGIN_NIC0(push, NVE4_COMPUTE(TIC_FLUSH), n[0]);
+ PUSH_DATAp(push, commands[0], n[0]);
+ }
+ if (n[1]) {
+ BEGIN_NIC0(push, NVE4_COMPUTE(TEX_CACHE_CTL), n[1]);
+ PUSH_DATAp(push, commands[1], n[1]);
+ }
+
+ nvc0->state.num_textures[s] = nvc0->num_textures[s];
+}
+
+
+#ifdef DEBUG
+static const char *nve4_cache_split_name(unsigned value)
+{
+ switch (value) {
+ case NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1: return "16K_SHARED_48K_L1";
+ case NVE4_3D_CACHE_SPLIT_32K_SHARED_32K_L1: return "32K_SHARED_32K_L1";
+ case NVC0_3D_CACHE_SPLIT_48K_SHARED_16K_L1: return "48K_SHARED_16K_L1";
+ default:
+ return "(invalid)";
+ }
+}
+
+static void
+nve4_compute_dump_launch_desc(const struct nve4_cp_launch_desc *desc)
+{
+ const uint32_t *data = (const uint32_t *)desc;
+ unsigned i;
+ boolean zero = FALSE;
+
+ debug_printf("COMPUTE LAUNCH DESCRIPTOR:\n");
+
+ for (i = 0; i < sizeof(*desc); i += 4) {
+ if (data[i / 4]) {
+ debug_printf("[%x]: 0x%08x\n", i, data[i / 4]);
+ zero = FALSE;
+ } else
+ if (!zero) {
+ debug_printf("...\n");
+ zero = TRUE;
+ }
+ }
+
+ debug_printf("entry = 0x%x\n", desc->entry);
+ debug_printf("grid dimensions = %ux%ux%u\n",
+ desc->griddim_x, desc->griddim_y, desc->griddim_z);
+ debug_printf("block dimensions = %ux%ux%u\n",
+ desc->blockdim_x, desc->blockdim_y, desc->blockdim_z);
+ debug_printf("s[] size: 0x%x\n", desc->shared_size);
+ debug_printf("l[] size: -0x%x / +0x%x\n",
+ desc->local_size_n, desc->local_size_p);
+ debug_printf("stack size: 0x%x\n", desc->cstack_size);
+ debug_printf("barrier count: %u\n", desc->bar_alloc);
+ debug_printf("$r count: %u\n", desc->gpr_alloc);
+ debug_printf("cache split: %s\n", nve4_cache_split_name(desc->cache_split));
+
+ for (i = 0; i < 8; ++i) {
+ uint64_t address;
+ uint32_t size = desc->cb[i].size;
+ boolean valid = !!(desc->cb_mask & (1 << i));
+
+ address = ((uint64_t)desc->cb[i].address_h << 32) | desc->cb[i].address_l;
+
+ if (!valid && !address && !size)
+ continue;
+ debug_printf("CB[%u]: address = 0x%"PRIx64", size 0x%x%s\n",
+ i, address, size, valid ? "" : " (invalid)");
+ }
+}
+#endif
+
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
+static void
+nve4_compute_trap_info(struct nvc0_context *nvc0)
+{
+ struct nvc0_screen *screen = nvc0->screen;
+ struct nouveau_bo *bo = screen->parm;
+ int ret, i;
+ volatile struct nve4_mp_trap_info *info;
+ uint8_t *map;
+
+ ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client);
+ if (ret)
+ return;
+ map = (uint8_t *)bo->map;
+ info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);
+
+ if (info->lock) {
+ debug_printf("trapstat = %08x\n", info->trapstat);
+ debug_printf("warperr = %08x\n", info->warperr);
+ debug_printf("PC = %x\n", info->pc);
+ debug_printf("tid = %u %u %u\n",
+ info->tid[0], info->tid[1], info->tid[2]);
+ debug_printf("ctaid = %u %u %u\n",
+ info->ctaid[0], info->ctaid[1], info->ctaid[2]);
+ for (i = 0; i <= 63; ++i)
+ debug_printf("$r%i = %08x\n", i, info->r[i]);
+ for (i = 0; i <= 6; ++i)
+ debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
+ debug_printf("$c = %x\n", info->flags >> 12);
+ }
+ info->lock = 0;
+}
+#endif
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
new file mode 100644
index 00000000000..79862b7dcd8
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -0,0 +1,131 @@
+
+#ifndef NVE4_COMPUTE_H
+#define NVE4_COMPUTE_H
+
+#include "nv50/nv50_defs.xml.h"
+#include "nvc0/nve4_compute.xml.h"
+
+/* Input space is implemented as c0[], to which we bind the screen->parm bo.
+ */
+#define NVE4_CP_INPUT_USER 0x0000
+#define NVE4_CP_INPUT_USER_LIMIT 0x1000
+#define NVE4_CP_INPUT_GRID_INFO(i) (0x1000 + (i) * 4)
+#define NVE4_CP_INPUT_NTID(i) (0x1000 + (i) * 4)
+#define NVE4_CP_INPUT_NCTAID(i) (0x100c + (i) * 4)
+#define NVE4_CP_INPUT_GRIDID 0x1018
+#define NVE4_CP_INPUT_TEX(i) (0x1040 + (i) * 4)
+#define NVE4_CP_INPUT_TEX_STRIDE 4
+#define NVE4_CP_INPUT_TEX_MAX 32
+#define NVE4_CP_INPUT_MS_OFFSETS 0x10c0
+#define NVE4_CP_INPUT_SUF_STRIDE 64
+#define NVE4_CP_INPUT_SUF(i) (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
+#define NVE4_CP_INPUT_SUF_MAX 32
+#define NVE4_CP_INPUT_TRAP_INFO_PTR 0x1900
+#define NVE4_CP_INPUT_TEMP_PTR 0x1908
+#define NVE4_CP_INPUT_MP_TEMP_SIZE 0x1910
+#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
+#define NVE4_CP_INPUT_CSTACK_SIZE 0x1918
+#define NVE4_CP_INPUT_SIZE 0x1a00
+#define NVE4_CP_PARAM_TRAP_INFO 0x2000
+#define NVE4_CP_PARAM_TRAP_INFO_SZ (1 << 16)
+#define NVE4_CP_PARAM_SIZE (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
+
+struct nve4_cp_launch_desc
+{
+ u32 unk0[8];
+ u32 entry;
+ u32 unk9[3];
+ u32 griddim_x : 31;
+ u32 unk12 : 1;
+ u16 griddim_y;
+ u16 griddim_z;
+ u32 unk14[3];
+ u16 shared_size; /* must be aligned to 0x100 */
+ u16 unk15;
+ u16 unk16;
+ u16 blockdim_x;
+ u16 blockdim_y;
+ u16 blockdim_z;
+ u32 cb_mask : 8;
+ u32 unk20_8 : 21;
+ u32 cache_split : 2;
+ u32 unk20_31 : 1;
+ u32 unk21[8];
+ struct {
+ u32 address_l;
+ u32 address_h : 8;
+ u32 reserved : 7;
+ u32 size : 17;
+ } cb[8];
+ u32 local_size_p : 20;
+ u32 unk45_20 : 7;
+ u32 bar_alloc : 5;
+ u32 local_size_n : 20;
+ u32 unk46_20 : 4;
+ u32 gpr_alloc : 8;
+ u32 cstack_size : 20;
+ u32 unk47_20 : 12;
+ u32 unk48[16];
+};
+
+static INLINE void
+nve4_cp_launch_desc_init_default(struct nve4_cp_launch_desc *desc)
+{
+ memset(desc, 0, sizeof(*desc));
+
+ desc->unk0[7] = 0xbc000000;
+ desc->unk9[2] = 0x44014000;
+ desc->unk47_20 = 0x300;
+}
+
+static INLINE void
+nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
+ unsigned index,
+ struct nouveau_bo *bo,
+ uint32_t base, uint16_t size)
+{
+ uint64_t address = bo->offset + base;
+
+ assert(index < 8);
+ assert(!(base & 0xff));
+ assert(size <= 65536);
+
+ desc->cb[index].address_l = address;
+ desc->cb[index].address_h = address >> 32;
+ desc->cb[index].size = size;
+
+ desc->cb_mask |= 1 << index;
+}
+
+static INLINE void
+nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
+ unsigned index,
+ const struct nvc0_constbuf *cb)
+{
+ assert(index < 8);
+
+ if (!cb->u.buf) {
+ desc->cb_mask &= ~(1 << index);
+ } else {
+ const struct nv04_resource *buf = nv04_resource(cb->u.buf);
+ assert(!cb->user);
+ nve4_cp_launch_desc_set_cb(desc, index,
+ buf->bo, buf->offset + cb->offset, cb->size);
+ }
+}
+
+struct nve4_mp_trap_info {
+ u32 lock;
+ u32 pc;
+ u32 trapstat;
+ u32 warperr;
+ u32 tid[3];
+ u32 ctaid[3];
+ u32 pad028[2];
+ u32 r[64];
+ u32 flags;
+ u32 pad134[3];
+ u32 s[0x3000];
+};
+
+#endif /* NVE4_COMPUTE_H */
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
new file mode 100644
index 00000000000..e971fc1ac6b
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
@@ -0,0 +1,429 @@
+#ifndef NVE4_COMPUTE_XML
+#define NVE4_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- nve4_compute.xml ( 10168 bytes, from 2013-06-04 13:57:02)
+- copyright.xml ( 6452 bytes, from 2012-04-16 22:51:01)
+- nvchipsets.xml ( 3954 bytes, from 2013-06-04 13:57:02)
+- nv_object.xml ( 14395 bytes, from 2013-06-04 13:57:02)
+- nv_defs.xml ( 4437 bytes, from 2012-04-16 22:51:01)
+- nv50_defs.xml ( 16877 bytes, from 2013-07-17 09:10:01)
+- nve4_p2mf.xml ( 2373 bytes, from 2013-06-04 13:57:02)
+
+Copyright (C) 2006-2013 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+
+#define NVE4_COMPUTE_UNK0144 0x00000144
+
+#define NVE4_COMPUTE_UPLOAD 0x00000000
+
+#define NVE4_COMPUTE_UPLOAD_LINE_LENGTH_IN 0x00000180
+
+#define NVE4_COMPUTE_UPLOAD_LINE_COUNT 0x00000184
+
+#define NVE4_COMPUTE_UPLOAD_DST_ADDRESS_HIGH 0x00000188
+
+#define NVE4_COMPUTE_UPLOAD_DST_ADDRESS_LOW 0x0000018c
+
+#define NVE4_COMPUTE_UPLOAD_DST_PITCH 0x00000190
+
+#define NVE4_COMPUTE_UPLOAD_DST_TILE_MODE 0x00000194
+
+#define NVE4_COMPUTE_UPLOAD_DST_WIDTH 0x00000198
+
+#define NVE4_COMPUTE_UPLOAD_DST_HEIGHT 0x0000019c
+
+#define NVE4_COMPUTE_UPLOAD_DST_DEPTH 0x000001a0
+
+#define NVE4_COMPUTE_UPLOAD_DST_Z 0x000001a4
+
+#define NVE4_COMPUTE_UPLOAD_DST_X 0x000001a8
+
+#define NVE4_COMPUTE_UPLOAD_DST_Y 0x000001ac
+
+#define NVE4_COMPUTE_UPLOAD_EXEC 0x000001b0
+#define NVE4_COMPUTE_UPLOAD_EXEC_LINEAR 0x00000001
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNK1__MASK 0x0000007e
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNK1__SHIFT 1
+#define NVE4_COMPUTE_UPLOAD_EXEC_BUF_NOTIFY 0x00000300
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNK12__MASK 0x0000f000
+#define NVE4_COMPUTE_UPLOAD_EXEC_UNK12__SHIFT 12
+
+#define NVE4_COMPUTE_UPLOAD_DATA 0x000001b4
+
+#define NVE4_COMPUTE_UPLOAD_QUERY_ADDRESS_HIGH 0x000001dc
+
+#define NVE4_COMPUTE_UPLOAD_QUERY_ADDRESS_LOW 0x000001e0
+
+#define NVE4_COMPUTE_UPLOAD_QUERY_SEQUENCE 0x000001e4
+
+#define NVE4_COMPUTE_UPLOAD_UNK01F0 0x000001f0
+
+#define NVE4_COMPUTE_UPLOAD_UNK01F4 0x000001f4
+
+#define NVE4_COMPUTE_UPLOAD_UNK01F8 0x000001f8
+
+#define NVE4_COMPUTE_UPLOAD_UNK01FC 0x000001fc
+
+#define NVE4_COMPUTE_SHARED_BASE 0x00000214
+
+#define NVE4_COMPUTE_MEM_BARRIER 0x0000021c
+#define NVE4_COMPUTE_MEM_BARRIER_UNK0__MASK 0x00000007
+#define NVE4_COMPUTE_MEM_BARRIER_UNK0__SHIFT 0
+#define NVE4_COMPUTE_MEM_BARRIER_UNK4 0x00000010
+#define NVE4_COMPUTE_MEM_BARRIER_UNK12 0x00001000
+
+#define NVE4_COMPUTE_UNK0240 0x00000240
+
+#define NVE4_COMPUTE_UNK244_TIC_FLUSH 0x00000244
+
+#define NVE4_COMPUTE_UNK0248 0x00000248
+#define NVE4_COMPUTE_UNK0248_UNK0__MASK 0x0000003f
+#define NVE4_COMPUTE_UNK0248_UNK0__SHIFT 0
+#define NVE4_COMPUTE_UNK0248_UNK8__MASK 0x00ffff00
+#define NVE4_COMPUTE_UNK0248_UNK8__SHIFT 8
+
+#define NVE4_COMPUTE_UNK0274 0x00000274
+
+#define NVE4_COMPUTE_UNK0278 0x00000278
+
+#define NVE4_COMPUTE_UNK027C 0x0000027c
+
+#define NVE4_COMPUTE_UNK0280 0x00000280
+
+#define NVE4_COMPUTE_UNK0284 0x00000284
+
+#define NVE4_COMPUTE_UNK0288 0x00000288
+
+#define NVE4_COMPUTE_UNK0290 0x00000290
+
+#define NVE4_COMPUTE_UNK02B0 0x000002b0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS 0x000002b4
+#define NVE4_COMPUTE_LAUNCH_DESC_ADDRESS__SHR 8
+
+#define NVE4_COMPUTE_UNK02B8 0x000002b8
+
+#define NVE4_COMPUTE_LAUNCH 0x000002bc
+
+#define NVE4_COMPUTE_MP_TEMP_SIZE(i0) (0x000002e4 + 0xc*(i0))
+#define NVE4_COMPUTE_MP_TEMP_SIZE__ESIZE 0x0000000c
+#define NVE4_COMPUTE_MP_TEMP_SIZE__LEN 0x00000002
+
+#define NVE4_COMPUTE_MP_TEMP_SIZE_HIGH(i0) (0x000002e4 + 0xc*(i0))
+
+#define NVE4_COMPUTE_MP_TEMP_SIZE_LOW(i0) (0x000002e8 + 0xc*(i0))
+
+#define NVE4_COMPUTE_MP_TEMP_SIZE_MASK(i0) (0x000002ec + 0xc*(i0))
+
+#define NVE4_COMPUTE_UNK0310 0x00000310
+
+#define NVE4_COMPUTE_FIRMWARE(i0) (0x00000500 + 0x4*(i0))
+#define NVE4_COMPUTE_FIRMWARE__ESIZE 0x00000004
+#define NVE4_COMPUTE_FIRMWARE__LEN 0x00000020
+
+#define NVE4_COMPUTE_LOCAL_BASE 0x0000077c
+
+#define NVE4_COMPUTE_TEMP_ADDRESS_HIGH 0x00000790
+
+#define NVE4_COMPUTE_TEMP_ADDRESS_LOW 0x00000794
+
+#define NVE4_COMPUTE_UNK0D94 0x00000d94
+
+#define NVE4_COMPUTE_WATCHDOG_TIMER 0x00000de4
+
+#define NVE4_COMPUTE_UNK0F44(i0) (0x00000f44 + 0x4*(i0))
+#define NVE4_COMPUTE_UNK0F44__ESIZE 0x00000004
+#define NVE4_COMPUTE_UNK0F44__LEN 0x00000004
+
+#define NVE4_COMPUTE_UNK1040(i0) (0x00001040 + 0x4*(i0))
+#define NVE4_COMPUTE_UNK1040__ESIZE 0x00000004
+#define NVE4_COMPUTE_UNK1040__LEN 0x0000000c
+
+#define NVE4_COMPUTE_UNK1288_TIC_FLUSH 0x00001288
+
+#define NVE4_COMPUTE_TSC_FLUSH 0x00001330
+#define NVE4_COMPUTE_TSC_FLUSH_SPECIFIC 0x00000001
+#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVE4_COMPUTE_TSC_FLUSH_ENTRY__SHIFT 4
+
+#define NVE4_COMPUTE_TIC_FLUSH 0x00001334
+#define NVE4_COMPUTE_TIC_FLUSH_SPECIFIC 0x00000001
+#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__MASK 0x03fffff0
+#define NVE4_COMPUTE_TIC_FLUSH_ENTRY__SHIFT 4
+
+#define NVE4_COMPUTE_TEX_CACHE_CTL 0x00001338
+#define NVE4_COMPUTE_TEX_CACHE_CTL_UNK0 0x00000001
+#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__MASK 0x03fffff0
+#define NVE4_COMPUTE_TEX_CACHE_CTL_ENTRY__SHIFT 4
+
+#define NVE4_COMPUTE_UNK1424_TSC_FLUSH 0x00001424
+
+#define NVE4_COMPUTE_COND_ADDRESS_HIGH 0x00001550
+
+#define NVE4_COMPUTE_COND_ADDRESS_LOW 0x00001554
+
+#define NVE4_COMPUTE_COND_MODE 0x00001558
+#define NVE4_COMPUTE_COND_MODE_NEVER 0x00000000
+#define NVE4_COMPUTE_COND_MODE_ALWAYS 0x00000001
+#define NVE4_COMPUTE_COND_MODE_RES_NON_ZERO 0x00000002
+#define NVE4_COMPUTE_COND_MODE_EQUAL 0x00000003
+#define NVE4_COMPUTE_COND_MODE_NOT_EQUAL 0x00000004
+
+#define NVE4_COMPUTE_TSC_ADDRESS_HIGH 0x0000155c
+
+#define NVE4_COMPUTE_TSC_ADDRESS_LOW 0x00001560
+
+#define NVE4_COMPUTE_TSC_LIMIT 0x00001564
+
+#define NVE4_COMPUTE_TIC_ADDRESS_HIGH 0x00001574
+
+#define NVE4_COMPUTE_TIC_ADDRESS_LOW 0x00001578
+
+#define NVE4_COMPUTE_TIC_LIMIT 0x0000157c
+
+#define NVE4_COMPUTE_CODE_ADDRESS_HIGH 0x00001608
+
+#define NVE4_COMPUTE_CODE_ADDRESS_LOW 0x0000160c
+
+#define NVE4_COMPUTE_UNK1690 0x00001690
+
+#define NVE4_COMPUTE_FLUSH 0x00001698
+#define NVE4_COMPUTE_FLUSH_CODE 0x00000001
+#define NVE4_COMPUTE_FLUSH_GLOBAL 0x00000010
+#define NVE4_COMPUTE_FLUSH_CB 0x00001000
+
+#define NVE4_COMPUTE_UNK1944 0x00001944
+
+#define NVE4_COMPUTE_DELAY 0x00001a24
+
+#define NVE4_COMPUTE_UNK1A2C(i0) (0x00001a2c + 0x4*(i0))
+#define NVE4_COMPUTE_UNK1A2C__ESIZE 0x00000004
+#define NVE4_COMPUTE_UNK1A2C__LEN 0x00000005
+
+#define NVE4_COMPUTE_QUERY_ADDRESS_HIGH 0x00001b00
+
+#define NVE4_COMPUTE_QUERY_ADDRESS_LOW 0x00001b04
+
+#define NVE4_COMPUTE_QUERY_SEQUENCE 0x00001b08
+
+#define NVE4_COMPUTE_QUERY_GET 0x00001b0c
+#define NVE4_COMPUTE_QUERY_GET_MODE__MASK 0x00000003
+#define NVE4_COMPUTE_QUERY_GET_MODE__SHIFT 0
+#define NVE4_COMPUTE_QUERY_GET_MODE_WRITE 0x00000000
+#define NVE4_COMPUTE_QUERY_GET_MODE_WRITE_INTR_NRHOST 0x00000003
+#define NVE4_COMPUTE_QUERY_GET_INTR 0x00100000
+#define NVE4_COMPUTE_QUERY_GET_SHORT 0x10000000
+
+#define NVE4_COMPUTE_TEX_CB_INDEX 0x00002608
+
+#define NVE4_COMPUTE_UNK260C 0x0000260c
+
+#define NVE4_COMPUTE_MP_PM_SET(i0) (0x0000335c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SET__ESIZE 0x00000004
+#define NVE4_COMPUTE_MP_PM_SET__LEN 0x00000008
+
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL(i0) (0x0000337c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__ESIZE 0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL__LEN 0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_NONE 0x00000000
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_USER 0x00000001
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH 0x00000003
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC 0x00000004
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE 0x00000005
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST 0x0000001b
+#define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH 0x0000001c
+
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL(i0) (0x0000338c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__ESIZE 0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL__LEN 0x00000004
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_NONE 0x00000000
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_WARP 0x00000002
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_REPLAY 0x00000008
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_TRANSACTION 0x0000000e
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1 0x00000010
+#define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM 0x00000011
+
+#define NVE4_COMPUTE_MP_PM_SRCSEL(i0) (0x0000339c + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE 0x00000004
+#define NVE4_COMPUTE_MP_PM_SRCSEL__LEN 0x00000008
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__MASK 0x00000003
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP0__SHIFT 0
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__MASK 0x0000001c
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG0__SHIFT 2
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__MASK 0x00000060
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP1__SHIFT 5
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__MASK 0x00000380
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG1__SHIFT 7
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__MASK 0x00000c00
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP2__SHIFT 10
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__MASK 0x00007000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG2__SHIFT 12
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__MASK 0x00018000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP3__SHIFT 15
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__MASK 0x000e0000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG3__SHIFT 17
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__MASK 0x00300000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP4__SHIFT 20
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__MASK 0x01c00000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG4__SHIFT 22
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__MASK 0x06000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_GRP5__SHIFT 25
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__MASK 0x38000000
+#define NVE4_COMPUTE_MP_PM_SRCSEL_SIG5__SHIFT 27
+
+#define NVE4_COMPUTE_MP_PM_FUNC(i0) (0x000033bc + 0x4*(i0))
+#define NVE4_COMPUTE_MP_PM_FUNC__ESIZE 0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC__LEN 0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__MASK 0x0000000f
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE__SHIFT 0
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP 0x00000000
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_PULSE 0x00000001
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_B6 0x00000002
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK3 0x00000003
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6 0x00000004
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_LOGOP_B6_PULSE 0x00000005
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK6 0x00000006
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK7 0x00000007
+#define NVE4_COMPUTE_MP_PM_FUNC_MODE_UNK8 0x00000008
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__MASK 0x000ffff0
+#define NVE4_COMPUTE_MP_PM_FUNC_FUNC__SHIFT 4
+
+#define NVE4_COMPUTE_MP_PM_UNK33DC 0x000033dc
+
+#define NVE4_COMPUTE_LAUNCH_DESC__SIZE 0x00000100
+#define NVE4_COMPUTE_LAUNCH_DESC_6 0x00000018
+#define NVE4_COMPUTE_LAUNCH_DESC_6_NOTIFY__MASK 0x00000c00
+#define NVE4_COMPUTE_LAUNCH_DESC_6_NOTIFY__SHIFT 10
+
+#define NVE4_COMPUTE_LAUNCH_DESC_PROG_START 0x00000020
+
+#define NVE4_COMPUTE_LAUNCH_DESC_12 0x00000030
+#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__MASK 0x7fffffff
+#define NVE4_COMPUTE_LAUNCH_DESC_12_GRIDDIM_X__SHIFT 0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ 0x00000034
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__MASK 0x0000ffff
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Y__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__MASK 0xffff0000
+#define NVE4_COMPUTE_LAUNCH_DESC_GRIDDIM_YZ_Z__SHIFT 16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_17 0x00000044
+#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__MASK 0x0000ffff
+#define NVE4_COMPUTE_LAUNCH_DESC_17_SHARED_ALLOC__SHIFT 0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_18 0x00000048
+#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__MASK 0xffff0000
+#define NVE4_COMPUTE_LAUNCH_DESC_18_BLOCKDIM_X__SHIFT 16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ 0x0000004c
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__MASK 0x0000ffff
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Y__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__MASK 0xffff0000
+#define NVE4_COMPUTE_LAUNCH_DESC_BLOCKDIM_YZ_Z__SHIFT 16
+
+#define NVE4_COMPUTE_LAUNCH_DESC_20 0x00000050
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__MASK 0x000000ff
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CB_VALID__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__MASK 0x60000000
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT__SHIFT 29
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_16K_SHARED_48K_L1 0x20000000
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_32K_SHARED_32K_L1 0x40000000
+#define NVE4_COMPUTE_LAUNCH_DESC_20_CACHE_SPLIT_48K_SHARED_16K_L1 0x60000000
+
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0(i0) (0x00000074 + 0x8*(i0))
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__ESIZE 0x00000008
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0__LEN 0x00000008
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__MASK 0xffffffff
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_0_ADDRESS_LOW__SHIFT 0
+
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1(i0) (0x00000078 + 0x8*(i0))
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__ESIZE 0x00000008
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1__LEN 0x00000008
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__MASK 0x000000ff
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_ADDRESS_HIGH__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__MASK 0xffff8000
+#define NVE4_COMPUTE_LAUNCH_DESC_CB_CONFIG_1_SIZE__SHIFT 15
+
+#define NVE4_COMPUTE_LAUNCH_DESC_45 0x000000b4
+#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__MASK 0x000fffff
+#define NVE4_COMPUTE_LAUNCH_DESC_45_LOCAL_POS_ALLOC__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__MASK 0xf8000000
+#define NVE4_COMPUTE_LAUNCH_DESC_45_BARRIER_ALLOC__SHIFT 27
+
+#define NVE4_COMPUTE_LAUNCH_DESC_46 0x000000b8
+#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__MASK 0x000fffff
+#define NVE4_COMPUTE_LAUNCH_DESC_46_LOCAL_NEG_ALLOC__SHIFT 0
+#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__MASK 0x3f000000
+#define NVE4_COMPUTE_LAUNCH_DESC_46_GPR_ALLOC__SHIFT 24
+
+#define NVE4_COMPUTE_LAUNCH_DESC_47 0x000000bc
+#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__MASK 0x000fffff
+#define NVE4_COMPUTE_LAUNCH_DESC_47_WARP_CSTACK_SIZE__SHIFT 0
+
+
+#endif /* NVE4_COMPUTE_XML */
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h b/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
new file mode 100644
index 00000000000..68a742fadfe
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
@@ -0,0 +1,107 @@
+#ifndef RNNDB_NVE4_P2MF_XML
+#define RNNDB_NVE4_P2MF_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://0x04.net/cgit/index.cgi/rules-ng-ng
+git clone git://0x04.net/rules-ng-ng
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/nve4_p2mf.xml ( 1400 bytes, from 2012-04-14 21:29:11)
+- ./rnndb/copyright.xml ( 6452 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv_object.xml ( 12736 bytes, from 2012-04-14 21:30:24)
+- ./rnndb/nvchipsets.xml ( 3701 bytes, from 2012-03-22 20:40:59)
+- ./rnndb/nv_defs.xml ( 4437 bytes, from 2011-08-11 18:25:12)
+- ./rnndb/nv50_defs.xml ( 5468 bytes, from 2011-08-11 18:25:12)
+
+Copyright (C) 2006-2012 by the following authors:
+- Artur Huillet <arthur.huillet@free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br@users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn@users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955@student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik@users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag@users.sf.net> (lumag)
+- EdB <edb_@users.sf.net> (edb_)
+- Erik Waling <erikwailing@users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez@riseup.net> (curro)
+- imirkin <imirkin@users.sf.net> (imirkin)
+- jb17bsome <jb17bsome@bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy@users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym@gmail.com> (lordheavy)
+- Luca Barbieri <luca@luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003@gmail.com> (stillunknown)
+- Marcin Koƛcielnicki <koriakin@0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey@gmail.com> (careym)
+- Matthieu Castet <matthieu.castet@parrot.com> (mat-c)
+- nvidiaman <nvidiaman@users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin@gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq@iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter@users.sf.net> (ironpeter)
+- Richard Hughes <hughsient@users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar@users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier@users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin@gmail.com> (marcheu)
+- sturmflut <sturmflut@users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt@246tNt.com>
+- Victor Stinner <victor.stinner@haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj@gmail.com> (miathan6)
+- Younes Manton <younes.m@gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NVE4_P2MF_LINE_LENGTH_IN 0x00000180
+
+#define NVE4_P2MF_LINE_COUNT 0x00000184
+
+#define NVE4_P2MF_DST_ADDRESS_HIGH 0x00000188
+
+#define NVE4_P2MF_DST_ADDRESS_LOW 0x0000018c
+
+#define NVE4_P2MF_DST_TILE_MODE 0x00000194
+
+#define NVE4_P2MF_DST_PITCH 0x00000198
+
+#define NVE4_P2MF_DST_HEIGHT 0x0000019c
+
+#define NVE4_P2MF_DST_DEPTH 0x000001a0
+
+#define NVE4_P2MF_DST_Z 0x000001a4
+
+#define NVE4_P2MF_DST_X 0x000001a8
+
+#define NVE4_P2MF_DST_Y 0x000001ac
+
+#define NVE4_P2MF_EXEC 0x000001b0
+#define NVE4_P2MF_EXEC_LINEAR 0x00000001
+#define NVE4_P2MF_EXEC_UNK12 0x00001000
+
+#define NVE4_P2MF_DATA 0x000001b4
+
+
+#endif /* RNNDB_NVE4_P2MF_XML */