25 files changed, 6222 insertions, 0 deletions
diff --git a/src/gallium/drivers/nv40/Makefile b/src/gallium/drivers/nv40/Makefile
new file mode 100644
index 00000000000..9c8eadf7e44
--- /dev/null
+++ b/src/gallium/drivers/nv40/Makefile
@@ -0,0 +1,37 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nv40
+
+DRIVER_SOURCES = \
+	nv40_clear.c \
+	nv40_context.c \
+	nv40_draw.c \
+	nv40_fragprog.c \
+	nv40_fragtex.c \
+	nv40_miptree.c \
+	nv40_query.c \
+	nv40_screen.c \
+	nv40_state.c \
+	nv40_state_blend.c \
+	nv40_state_emit.c \
+	nv40_state_fb.c \
+	nv40_state_rasterizer.c \
+	nv40_state_scissor.c \
+	nv40_state_stipple.c \
+	nv40_state_viewport.c \
+	nv40_state_zsa.c \
+	nv40_surface.c \
+	nv40_vbo.c \
+	nv40_vertprog.c
+
+C_SOURCES = \
+	$(COMMON_SOURCES) \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/drivers/nv40/nv40_clear.c b/src/gallium/drivers/nv40/nv40_clear.c
new file mode 100644
index 00000000000..59efd620e32
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_clear.c
@@ -0,0 +1,13 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "nv40_context.h"
+
+void
+nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+	   unsigned clearValue)
+{	/* Clear by filling the surface's full width x height with clearValue. */
+	pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue);
+	ps->status = PIPE_SURFACE_STATUS_CLEAR;	/* record the surface as cleared */
+}
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
new file mode 100644
index 00000000000..cc63dd734bc
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -0,0 +1,72 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+
+#include "nv40_context.h"
+#include "nv40_screen.h"
+
+static void
+nv40_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{	/* pipe->flush hook: optional texture-cache flush, then fire the ring. */
+	struct nv40_context *nv40 = nv40_context(pipe);
+	
+	if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
+		BEGIN_RING(curie, 0x1fd8, 1);	/* NOTE(review): method 0x1fd8 has no symbolic name here; written with 2, then 1 - confirm meaning */
+		OUT_RING  (2);
+		BEGIN_RING(curie, 0x1fd8, 1);
+		OUT_RING  (1);
+	}
+
+	FIRE_RING(fence);	/* the caller's fence pointer is passed straight through */
+}
+
+static void
+nv40_destroy(struct pipe_context *pipe)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	struct draw_context *swtnl = ctx->draw;
+	/* Tear down the swtnl draw module (if any), then the context itself. */
+	if (swtnl != NULL) draw_destroy(swtnl);
+	FREE(ctx);
+}
+
+struct pipe_context *
+nv40_create(struct pipe_screen *pscreen, unsigned pctx_id)
+{	/* Build an nv40 pipe context for pscreen; returns NULL when an allocation fails. */
+	struct nv40_screen *screen = nv40_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv40_context *nv40;
+	struct nouveau_winsys *nvws = screen->nvws;
+	nv40 = CALLOC(1, sizeof(struct nv40_context));
+	if (!nv40)
+		return NULL;
+	nv40->screen = screen;
+	nv40->pctx_id = pctx_id;
+	nv40->nvws = nvws;
+	nv40->pipe.winsys = ws;
+	nv40->pipe.screen = pscreen;
+	nv40->pipe.destroy = nv40_destroy;
+	nv40->pipe.draw_arrays = nv40_draw_arrays;
+	nv40->pipe.draw_elements = nv40_draw_elements;
+	nv40->pipe.clear = nv40_clear;
+	nv40->pipe.flush = nv40_flush;
+	nv40_init_query_functions(nv40);
+	nv40_init_surface_functions(nv40);
+	nv40_init_state_functions(nv40);
+
+	/* Create, configure, and install fallback swtnl path */
+	nv40->draw = draw_create();
+	if (!nv40->draw) {	/* every draw_* call below is handed this pointer */
+		FREE(nv40);	/* NOTE(review): assumes the init_* hooks above allocate nothing - confirm */
+		return NULL;
+	}
+	draw_wide_point_threshold(nv40->draw, 9999999.0);
+	draw_wide_line_threshold(nv40->draw, 9999999.0);
+	draw_enable_line_stipple(nv40->draw, FALSE);
+	draw_enable_point_sprites(nv40->draw, FALSE);
+	draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40));
+
+	return &nv40->pipe;
+}
+	
diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h
new file mode 100644
index 00000000000..adcfbdd85a8
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_context.h
@@ -0,0 +1,233 @@
+#ifndef __NV40_CONTEXT_H__
+#define __NV40_CONTEXT_H__
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+
+#define NOUVEAU_PUSH_CONTEXT(ctx)                                              \
+	struct nv40_screen *ctx = nv40->screen
+#include "nouveau/nouveau_push.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#include "nv40_state.h"
+
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+enum nv40_state_index {
+	NV40_STATE_FB = 0,
+	NV40_STATE_VIEWPORT = 1,
+	NV40_STATE_BLEND = 2,
+	NV40_STATE_RAST = 3,
+	NV40_STATE_ZSA = 4,
+	NV40_STATE_BCOL = 5,
+	NV40_STATE_CLIP = 6,
+	NV40_STATE_SCISSOR = 7,
+	NV40_STATE_STIPPLE = 8,
+	NV40_STATE_FRAGPROG = 9,
+	NV40_STATE_VERTPROG = 10,
+	NV40_STATE_FRAGTEX0 = 11,
+	NV40_STATE_FRAGTEX1 = 12,
+	NV40_STATE_FRAGTEX2 = 13,
+	NV40_STATE_FRAGTEX3 = 14,
+	NV40_STATE_FRAGTEX4 = 15,
+	NV40_STATE_FRAGTEX5 = 16,
+	NV40_STATE_FRAGTEX6 = 17,
+	NV40_STATE_FRAGTEX7 = 18,
+	NV40_STATE_FRAGTEX8 = 19,
+	NV40_STATE_FRAGTEX9 = 20,
+	NV40_STATE_FRAGTEX10 = 21,
+	NV40_STATE_FRAGTEX11 = 22,
+	NV40_STATE_FRAGTEX12 = 23,
+	NV40_STATE_FRAGTEX13 = 24,
+	NV40_STATE_FRAGTEX14 = 25,
+	NV40_STATE_FRAGTEX15 = 26,
+	NV40_STATE_VERTTEX0 = 27,
+	NV40_STATE_VERTTEX1 = 28,
+	NV40_STATE_VERTTEX2 = 29,
+	NV40_STATE_VERTTEX3 = 30,
+	NV40_STATE_VTXBUF = 31,
+	NV40_STATE_VTXFMT = 32,
+	NV40_STATE_VTXATTR = 33,
+	NV40_STATE_MAX = 34
+};
+
+#include "nv40_screen.h"
+
+#define NV40_NEW_BLEND		(1 <<  0)
+#define NV40_NEW_RAST		(1 <<  1)
+#define NV40_NEW_ZSA		(1 <<  2)
+#define NV40_NEW_SAMPLER	(1 <<  3)
+#define NV40_NEW_FB		(1 <<  4)
+#define NV40_NEW_STIPPLE	(1 <<  5)
+#define NV40_NEW_SCISSOR	(1 <<  6)
+#define NV40_NEW_VIEWPORT	(1 <<  7)
+#define NV40_NEW_BCOL		(1 <<  8)
+#define NV40_NEW_VERTPROG	(1 <<  9)
+#define NV40_NEW_FRAGPROG	(1 << 10)
+#define NV40_NEW_ARRAYS		(1 << 11)
+#define NV40_NEW_UCP		(1 << 12)
+
+struct nv40_rasterizer_state {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+struct nv40_blend_state {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+
+struct nv40_state {
+	unsigned scissor_enabled;
+	unsigned stipple_enabled;
+	unsigned viewport_bypass;
+	unsigned fp_samplers;
+
+	uint64_t dirty;
+	struct nouveau_stateobj *hw[NV40_STATE_MAX];
+};
+
+struct nv40_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nv40_screen *screen;
+	unsigned pctx_id;
+
+	struct draw_context *draw;
+
+	/* HW state derived from pipe states */
+	struct nv40_state state;
+	struct {
+		struct nv40_vertex_program *vertprog;
+
+		unsigned nr_attribs;
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+	unsigned fallback_swrast;
+
+	/* Context state */
+	unsigned dirty, draw_dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct pipe_clip_state clip;
+	struct nv40_vertex_program *vertprog;
+	struct nv40_fragment_program *fragprog;
+	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nv40_rasterizer_state *rasterizer;
+	struct nv40_zsa_state *zsa;
+	struct nv40_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct pipe_buffer *idxbuf;
+	unsigned idxbuf_format;
+	struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nv40_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS];
+	unsigned vtxelt_nr;
+	const unsigned *edgeflags;
+};
+
+static INLINE struct nv40_context *
+nv40_context(struct pipe_context *pipe)
+{	/* Downcast: struct nv40_context holds the pipe_context as its first member. */
+	return (struct nv40_context *)pipe;
+}
+
+struct nv40_state_entry {
+	boolean (*validate)(struct nv40_context *nv40);
+	struct {
+		unsigned pipe;
+		unsigned hw;
+	} dirty;
+};
+
+extern void nv40_init_state_functions(struct nv40_context *nv40);
+extern void nv40_init_surface_functions(struct nv40_context *nv40);
+extern void nv40_init_query_functions(struct nv40_context *nv40);
+
+extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen);
+
+/* nv40_draw.c */
+extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40);
+extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe,
+					struct pipe_buffer *idxbuf,
+					unsigned ib_size, unsigned mode,
+					unsigned start, unsigned count);
+
+/* nv40_vertprog.c */
+extern void nv40_vertprog_destroy(struct nv40_context *,
+				  struct nv40_vertex_program *);
+
+/* nv40_fragprog.c */
+extern void nv40_fragprog_destroy(struct nv40_context *,
+				  struct nv40_fragment_program *);
+
+/* nv40_fragtex.c */
+extern void nv40_fragtex_bind(struct nv40_context *);
+
+/* nv40_state.c and friends */
+extern boolean nv40_state_validate(struct nv40_context *nv40);
+extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40);
+extern void nv40_state_emit(struct nv40_context *nv40);
+extern struct nv40_state_entry nv40_state_rasterizer;
+extern struct nv40_state_entry nv40_state_scissor;
+extern struct nv40_state_entry nv40_state_stipple;
+extern struct nv40_state_entry nv40_state_fragprog;
+extern struct nv40_state_entry nv40_state_vertprog;
+extern struct nv40_state_entry nv40_state_blend;
+extern struct nv40_state_entry nv40_state_blend_colour;
+extern struct nv40_state_entry nv40_state_zsa;
+extern struct nv40_state_entry nv40_state_viewport;
+extern struct nv40_state_entry nv40_state_framebuffer;
+extern struct nv40_state_entry nv40_state_fragtex;
+extern struct nv40_state_entry nv40_state_vbo;
+extern struct nv40_state_entry nv40_state_vtxfmt;
+
+/* nv40_vbo.c */
+extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern boolean nv40_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nv40_clear.c */
+extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps,
+		       unsigned clearValue);
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
new file mode 100644
index 00000000000..8e56cdc2fe0
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -0,0 +1,349 @@
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_pack_color.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+
+#include "nv40_context.h"
+#define NV40_SHADER_NO_FUCKEDNESS
+#include "nv40_shader.h"
+
+/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
+
+struct nv40_render_stage {
+	struct draw_stage stage;
+	struct nv40_context *nv40;
+	unsigned prim;
+};
+
+static INLINE struct nv40_render_stage *
+nv40_render_stage(struct draw_stage *stage)
+{	/* Downcast: struct nv40_render_stage holds the draw_stage as its first member. */
+	return (struct nv40_render_stage *)stage;
+}
+
+static INLINE void
+nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v)
+{	/* Push every configured swtnl attribute of vertex v into the ring. */
+	unsigned i;
+
+	for (i = 0; i < nv40->swtnl.nr_attribs; i++) {
+		unsigned idx = nv40->swtnl.draw[i];	/* index into v->data[] */
+		unsigned hw = nv40->swtnl.hw[i];	/* attribute number passed to the NV40TCL_VTX_ATTR_* methods */
+
+		switch (nv40->swtnl.emit[i]) {	/* per-attribute emit format */
+		case EMIT_OMIT:
+			break;
+		case EMIT_1F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (fui(v->data[idx][0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			OUT_RING  (fui(v->data[idx][2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (fui(v->data[idx][0]));
+			OUT_RING  (fui(v->data[idx][1]));
+			OUT_RING  (fui(v->data[idx][2]));
+			OUT_RING  (fui(v->data[idx][3]));
+			break;
+		case EMIT_4UB:	/* four components converted to ubytes and packed into one dword */
+			BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (pack_ub4(float_to_ubyte(v->data[idx][0]),
+					    float_to_ubyte(v->data[idx][1]),
+					    float_to_ubyte(v->data[idx][2]),
+					    float_to_ubyte(v->data[idx][3])));
+			break;
+		default:	/* unknown emit format: nothing is emitted for this attribute */
+			assert(0);
+			break;
+		}
+	}
+}
+
+static INLINE void
+nv40_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{	/* Emit one primitive of `count` vertices in begin/end mode `mode`. */
+	struct nv40_render_stage *rs = nv40_render_stage(stage);
+	struct nv40_context *nv40 = rs->nv40;
+	struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf;
+	unsigned i;
+
+	/* Ensure there's room for 4xfloat32 + potentially 3 begin/end */
+	if (pb->remaining < ((count * 20) + 6)) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {	/* a primitive is still open */
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING(NULL);
+		nv40_state_emit(nv40);	/* state is emitted again after firing the ring */
+	}
+
+	/* Switch primitive modes if necessary */
+	if (rs->prim != mode) {
+		if (rs->prim != NV40TCL_BEGIN_END_STOP) {	/* close the currently open primitive first */
+			BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+			OUT_RING  (NV40TCL_BEGIN_END_STOP);	
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (mode);
+		rs->prim = mode;	/* remember which primitive type is open */
+	}
+
+	/* Emit vertex data */
+	for (i = 0; i < count; i++)
+		nv40_render_vertex(nv40, prim->v[i]);
+
+	/* If it's likely we'll need to empty the push buffer soon, finish
+	 * off the primitive now.
+	 */
+	if (pb->remaining < ((count * 20) + 6)) {	/* same threshold as the entry check above */
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (NV40TCL_BEGIN_END_STOP);
+		rs->prim = NV40TCL_BEGIN_END_STOP;
+	}
+}
+
+static void
+nv40_render_point(struct draw_stage *draw, struct prim_header *prim)
+{	/* draw_stage point callback: one vertex in POINTS mode. */
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1);
+}
+
+static void
+nv40_render_line(struct draw_stage *draw, struct prim_header *prim)
+{	/* draw_stage line callback: two vertices in LINES mode. */
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2);
+}
+
+static void
+nv40_render_tri(struct draw_stage *draw, struct prim_header *prim)
+{	/* draw_stage triangle callback: three vertices in TRIANGLES mode. */
+	nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3);
+}
+
+static void
+nv40_render_flush(struct draw_stage *draw, unsigned flags)
+{
+	struct nv40_render_stage *rs = nv40_render_stage(draw);
+	struct nv40_context *nv40 = rs->nv40;
+	/* Only an open primitive needs closing; otherwise there is nothing to do. */
+	if (rs->prim == NV40TCL_BEGIN_END_STOP)
+		return;
+	BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+	OUT_RING  (NV40TCL_BEGIN_END_STOP);
+	rs->prim = NV40TCL_BEGIN_END_STOP;
+}
+
+static void
+nv40_render_reset_stipple_counter(struct draw_stage *draw)
+{	/* No-op draw_stage callback: the body is intentionally empty. */
+}
+
+static void
+nv40_render_destroy(struct draw_stage *draw)
+{	/* draw_stage destroy callback: frees the stage object itself. */
+	FREE(draw);
+}
+
+static INLINE void
+emit_mov(struct nv40_vertex_program *vp,
+	 unsigned dst, unsigned src, unsigned vor, unsigned mask)
+{	/* Append one 4-dword instruction to vp built from dst, src and mask. */
+	struct nv40_vertex_program_exec *inst;
+
+	vp->insns = realloc(vp->insns,	/* NOTE(review): result is unchecked; on failure the old array is lost */
+			    sizeof(struct nv40_vertex_program_exec) *
+			    ++vp->nr_insns);
+	inst = &vp->insns[vp->nr_insns - 1];	/* the slot just added */
+
+	inst->data[0] = 0x401f9c6c;	/* NOTE(review): raw instruction words; presumably a MOV encoding - confirm */
+	inst->data[1] = 0x0040000d | (src << 8);
+	inst->data[2] = 0x8106c083;
+	inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13);
+	inst->const_index = -1;
+	inst->has_branch_offset = FALSE;
+
+	vp->ir |= (1 << src);	/* record src in the program's ir bitmask */
+	if (vor != ~0)	/* callers pass ~0 when no bit should be set in vp->or */
+		vp->or |= (1 << vor);
+}
+
+static struct nv40_vertex_program *
+create_drawvp(struct nv40_context *nv40)
+{	/* Build the fixed vertex program used by the swtnl path; returns NULL on allocation failure. */
+	struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program);
+	unsigned i;
+	if (!vp)
+		return NULL;	/* emit_mov() dereferences vp unconditionally */
+	emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf);
+	emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8);
+	for (i = 0; i < 8; i++)
+		emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf);
+	vp->insns[vp->nr_insns - 1].data[3] |= 1;	/* NOTE(review): bit 0 of the final instruction, presumably the "last instruction" flag - confirm */
+	vp->translated = TRUE;
+	return vp;
+}
+
+struct draw_stage *
+nv40_draw_render_stage(struct nv40_context *nv40)
+{	/* Allocate the swtnl rasterize stage bound to nv40; returns NULL on allocation failure. */
+	struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage);
+	if (!render)
+		return NULL;	/* previously dereferenced without a check */
+	if (!nv40->swtnl.vertprog)
+		nv40->swtnl.vertprog = create_drawvp(nv40);
+
+	render->nv40 = nv40;
+	render->prim = NV40TCL_BEGIN_END_STOP;	/* explicit: no primitive is open yet */
+	render->stage.draw = nv40->draw;
+	render->stage.point = nv40_render_point;
+	render->stage.line = nv40_render_line;
+	render->stage.tri = nv40_render_tri;
+	render->stage.flush = nv40_render_flush;
+	render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter;
+	render->stage.destroy = nv40_render_destroy;
+	return &render->stage;
+}
+
+boolean
+nv40_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
+			 unsigned mode, unsigned start, unsigned count)
+{	/* Draw through the draw module; returns FALSE if swtnl state validation fails. */
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	unsigned i;
+	void *map;
+
+	if (!nv40_state_validate_swtnl(nv40))
+		return FALSE;
+	nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF);	/* drop the VTXBUF dirty bit before emitting state */
+	nv40_state_emit(nv40);
+
+	for (i = 0; i < nv40->vtxbuf_nr; i++) {	/* map every bound vertex buffer for CPU reads */
+		map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer,
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_vertex_buffer(nv40->draw, i, map);
+	}
+
+	if (idxbuf) {
+		map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map);
+	} else {	/* non-indexed draw: clear any element buffer */
+		draw_set_mapped_element_buffer(nv40->draw, 0, NULL);
+	}
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX]) {
+		const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX];
+
+		map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX],
+				     PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_constant_buffer(nv40->draw, map, nr);
+	}
+
+	draw_arrays(nv40->draw, mode, start, count);	/* NOTE(review): also used when idxbuf is set; presumably picks up the mapped element buffer - confirm */
+
+	for (i = 0; i < nv40->vtxbuf_nr; i++)	/* unmap everything mapped above */
+		ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer);
+
+	if (idxbuf)
+		ws->buffer_unmap(ws, idxbuf);
+
+	if (nv40->constbuf[PIPE_SHADER_VERTEX])
+		ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]);
+
+	draw_flush(nv40->draw);
+	pipe->flush(pipe, 0, NULL);	/* flags 0, no fence requested */
+
+	return TRUE;
+}
+
+static INLINE void
+emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	/* Append one entry to the parallel swtnl.{draw,emit,hw} tables. */
+	const unsigned slot = nv40->swtnl.nr_attribs++;
+
+	nv40->swtnl.draw[slot] = draw_find_vs_output(nv40->draw, semantic, index);
+	nv40->swtnl.emit[slot] = emit;
+	nv40->swtnl.hw[slot] = hw;
+}
+
+static boolean
+nv40_state_vtxfmt_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	unsigned colour = 0, texcoords = 0, fog = 0, i;
+
+	/* Collect which colour / generic / fog inputs the fragprog reads */
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		switch (fp->info.input_semantic_name[i]) {
+		case TGSI_SEMANTIC_POSITION:
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			colour |= (1 << fp->info.input_semantic_index[i]);
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+			texcoords |= (1 << fp->info.input_semantic_index[i]);
+			break;
+		case TGSI_SEMANTIC_FOG:
+			fog = 1;
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	/* Rebuild the swtnl attribute table from scratch: colours first
+	 * (packed as 4 ubytes), then generics (4 floats), then fog (1 float)
+	 * and finally the position (3 floats).
+	 */
+	nv40->swtnl.nr_attribs = 0;
+	for (i = 0; i < 2; i++) {
+		if (colour & (1 << i))
+			emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i);
+	}
+
+	for (i = 0; i < 8; i++) {
+		if (texcoords & (1 << i))
+			emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i);
+	}
+
+	if (fog)
+		emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+
+	emit_attrib(nv40, 0, EMIT_3F, TGSI_SEMANTIC_POSITION, 0);
+
+	/* This entry always reports FALSE to its caller. */
+	return FALSE;
+}
+
+struct nv40_state_entry nv40_state_vtxfmt = {	/* state-table entry for the swtnl vertex format */
+	.validate = nv40_state_vtxfmt_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG,	/* pipe dirty bits tied to this entry */
+		.hw = 0
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
new file mode 100644
index 00000000000..91dcbebda0d
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -0,0 +1,991 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv40_context.h"
+
+#define SWZ_X 0
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 1
+#define MASK_Y 2
+#define MASK_Z 4
+#define MASK_W 8
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X
+#define DEF_CTEST NV40_FP_OP_COND_TR
+#include "nv40_shader.h"
+
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))
+#define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v)
+
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+struct nv40_fpc {
+	struct nv40_fragment_program *fp;
+
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	unsigned r_temps;
+	unsigned r_temps_discard;
+	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	struct nv40_sreg *r_temp;
+
+	int num_regs;
+
+	unsigned inst_offset;
+	unsigned have_const;
+
+	struct {
+		int pipe;
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	struct nv40_sreg imm[MAX_IMM];
+	unsigned nr_imm;
+};
+
+static INLINE struct nv40_sreg
+temp(struct nv40_fpc *fpc)
+{	/* Hand out the lowest free temp; it is freed again by the next release_temps(). */
+	int idx = ffs(~fpc->r_temps) - 1;	/* -1 when every bit of r_temps is set */
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nv40_sr(NV40SR_TEMP, 0);
+	}
+
+	fpc->r_temps |= (1u << idx);	/* unsigned constant: 1 << 31 would overflow a signed int */
+	fpc->r_temps_discard |= (1u << idx);
+	return nv40_sr(NV40SR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nv40_fpc *fpc)
+{	/* Free every temp handed out by temp() since the previous release. */
+	fpc->r_temps &= ~fpc->r_temps_discard;
+	fpc->r_temps_discard = 0;
+}
+
+static INLINE struct nv40_sreg
+constant(struct nv40_fpc *fpc, int pipe, float vals[4])
+{	/* Claim the next consts[] slot: pipe == -1 copies vals inline, otherwise pipe is recorded as-is. */
+	int idx = fpc->nr_consts;
+	if (idx >= MAX_CONSTS) {	/* table full: never write past consts[], even with asserts compiled out */
+		NOUVEAU_ERR("out of consts!!\n");
+		assert(0);
+		return nv40_sr(NV40SR_CONST, 0);	/* same fallback shape as temp() */
+	}
+	fpc->consts[fpc->nr_consts++].pipe = pipe;	/* fills slot idx and advances the count */
+	if (pipe == -1)
+		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
+	return nv40_sr(NV40SR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+static void
+grow_insns(struct nv40_fpc *fpc, int size)
+{	/* Extend the program's instruction buffer by `size` 32-bit words. */
+	struct nv40_fragment_program *fp = fpc->fp;
+
+	fp->insn_len += size;
+	fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);	/* NOTE(review): unchecked; the buffer may move, so callers must re-derive pointers into it */
+}
+
+static void
+emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src)
+{	/* Encode source operand number `pos` of the current instruction into word pos + 1. */
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NV40SR_INPUT:
+		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
+		hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT);	/* the input index lives in word 0 */
+		break;
+	case NV40SR_OUTPUT:
+		sr |= NV40_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NV40SR_TEMP:
+		sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NV40_FP_REG_SRC_SHIFT);
+		break;
+	case NV40SR_CONST:
+		if (!fpc->have_const) {	/* first constant of this instruction: reserve 4 words after it */
+			grow_insns(fpc, 4);
+			fpc->have_const = 1;
+		}
+
+		hw = &fp->insn[fpc->inst_offset];	/* re-derive: grow_insns() may have moved the buffer */
+		if (fpc->consts[src.index].pipe >= 0) {	/* constbuf value: record a patch location and zero the slot */
+			struct nv40_fragment_program_data *fpd;
+
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *	/* NOTE(review): result is unchecked */
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {	/* inline value: copy the four floats into the slot */
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);	
+		break;
+	case NV40SR_NONE:
+		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV40_FP_REG_NEGATE;
+
+	if (src.abs)
+		hw[1] |= (1u << (29 + pos));	/* unsigned constant: pos == 2 selects bit 31 */
+
+	sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+static void
+emit_dst(struct nv40_fpc *fpc, struct nv40_sreg dst)
+{	/* Encode the destination register into word 0 of the current instruction. */
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+
+	switch (dst.type) {
+	case NV40SR_TEMP:
+		if (fpc->num_regs < (dst.index + 1))	/* track the highest temp index written, plus one */
+			fpc->num_regs = dst.index + 1;
+		break;
+	case NV40SR_OUTPUT:
+		if (dst.index == 1) {
+			fp->fp_control |= 0xe;	/* NOTE(review): unnamed fp_control bits set for output 1 - confirm meaning */
+		} else {
+			hw[0] |= NV40_FP_OP_OUT_REG_HALF;
+		}
+		break;
+	case NV40SR_NONE:
+		hw[0] |= (1 << 30);	/* NOTE(review): unnamed bit 30; presumably "no destination" - confirm */
+		break;
+	default:
+		assert(0);
+	}
+
+	hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT);
+}
+
+static void
+nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{	/* Append one 4-word instruction: opcode, mask, condition fields, dst and three sources. */
+	struct nv40_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;	/* the new instruction starts at the current end */
+	fpc->have_const = 0;	/* no constant slot reserved for this instruction yet */
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];	/* taken after grow_insns(), which may move the buffer */
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NV40_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV40TCL_FP_CONTROL_KIL;
+	hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NV40_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT));
+
+	emit_dst(fpc, dst);	/* emit_dst/emit_src re-derive their own pointer from inst_offset */
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+static void
+nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit,
+	    struct nv40_sreg dst, int mask,
+	    struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2)
+{	/* Emit via nv40_fp_arith(), then tag the instruction with the texture unit. */
+	struct nv40_fragment_program *fp = fpc->fp;
+
+	nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT);	/* indexed via fp->insn: the buffer may have moved */
+	fp->samplers |= (1 << unit);	/* record the unit in the program's sampler bitmask */
+}
+
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{	/* Translate a TGSI source operand (file, index, abs/negate, swizzle) into an nv40_sreg. */
+	struct nv40_sreg src;
+
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv40_sr(NV40SR_INPUT,
+			      fpc->attrib_map[fsrc->SrcRegister.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(fpc, fsrc->SrcRegister.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->SrcRegister.Index < fpc->nr_imm);
+		src = fpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = fpc->r_temp[fsrc->SrcRegister.Index];
+		break;
+	/* NV40 fragprog result regs are just temps, so this is simple */
+	case TGSI_FILE_OUTPUT:
+		src = fpc->r_result[fsrc->SrcRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		src = nv40_sr(NV40SR_NONE, 0);	/* defined fallback (same as tgsi_dst) instead of an uninitialised struct */
+		break;
+	}
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
+	/* Outputs and temporaries map onto their pre-assigned registers;
+	 * every other file yields the NONE register. */
+	if (fdst->DstRegister.File == TGSI_FILE_OUTPUT)
+		return fpc->r_result[fdst->DstRegister.Index];
+	if (fdst->DstRegister.File == TGSI_FILE_TEMPORARY)
+		return fpc->r_temp[fdst->DstRegister.Index];
+	/* Anything other than the NULL file is unexpected and is reported. */
+	if (fdst->DstRegister.File != TGSI_FILE_NULL)
+		NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File);
+
+	return nv40_sr(NV40SR_NONE, 0);
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	/* Translate a TGSI write mask into the driver's MASK_* bits,
+	 * one channel at a time.
+	 */
+	return ((tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0);
+}
+
+static boolean
+src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv40_sreg *src)
+{	/* TRUE: fsrc needs no fix-up (*src untouched). FALSE: *src is a temp holding the swizzled value. */
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg tgsi = tgsi_src(fpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,	/* array shares its name with the neg() macro used further down */
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+
+	for (c = 0; c < 4; c++) {	/* sort each component into plain / zero / one / negate sets */
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= (1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;	/* placeholder swizzle for a component that is overwritten below */
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= (1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+
+		if (!tgsi.negate && neg[c])	/* per-component negate counts only when the whole-register negate is off */
+			neg_mask |= (1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)	/* ordinary swizzle, nothing to build */
+		return TRUE;
+
+	*src = temp(fpc);
+
+	if (mask)
+		arith(fpc, 0, MOV, *src, mask, tgsi, none, none);
+
+	if (zero_mask)
+		arith(fpc, 0, SFL, *src, zero_mask, *src, none, none);	/* NOTE(review): SFL presumably writes 0.0 - confirm */
+
+	if (one_mask)
+		arith(fpc, 0, STR, *src, one_mask, *src, none, none);	/* NOTE(review): STR presumably writes 1.0 - confirm */
+
+	if (neg_mask) {	/* multiply the selected components by a negated `one` temp */
+		struct nv40_sreg one = temp(fpc);
+		arith(fpc, 0, STR, one, neg_mask, one, none, none);
+		arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg src[3], dst, tmp;
+	int mask, sat, unit;
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(fpc, fsrc, &src[i]))
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->SrcRegister.Index) {
+				ii = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->SrcRegister.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		tmp = temp(fpc);
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV40_VP_INST_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DDX:
+		if (mask & (MASK_Z | MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		if (mask & (MASK_Z | MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		dst = nv40_sr(NV40SR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		tmp = temp(fpc);
+		arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_NOISE1:
+	case TGSI_OPCODE_NOISE2:
+	case TGSI_OPCODE_NOISE3:
+	case TGSI_OPCODE_NOISE4:
+		arith(fpc, sat, SFL, dst, mask, none, none, none);
+		break;
+	case TGSI_OPCODE_POW:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, tmp, MASK_X,
+		      swz(src[0], X, X, X, X), none, none);
+		arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(fpc, sat, EX2, dst, mask,
+		      swz(tmp, X, X, X, X), none, none);
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none);
+		arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none);
+		arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z,
+		      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+		arith(fpc, sat, MAD, dst, mask,
+		      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		break;
+	case TGSI_OPCODE_RSQ:
+		tmp = temp(fpc);
+		arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X,
+		      abs(swz(src[0], X, X, X, X)), none, none);
+		arith(fpc, sat, EX2, dst, mask,
+		      neg(swz(tmp, X, X, X, X)), none, none);
+		break;
+	case TGSI_OPCODE_SCS:
+		if (mask & MASK_X) {
+			arith(fpc, sat, COS, dst, MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		if (mask & MASK_Y) {
+			arith(fpc, sat, SIN, dst, MASK_Y,
+			      swz(src[0], X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(fpc);
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	/* Translate the declared TGSI input semantic into the matching
+	 * NV40_FP_OP_INPUT_SRC_* value; unsupported semantics fail. */
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV40_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		switch (fdec->Semantic.SemanticIndex) {
+		case 0: hw = NV40_FP_OP_INPUT_SRC_COL0; break;
+		case 1: hw = NV40_FP_OP_INPUT_SRC_COL1; break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV40_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generic indices above 7 are rejected. */
+		if (fdec->Semantic.SemanticIndex > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic.SemanticIndex);
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	/* Remember the mapping under the declaration's first register. */
+	fpc->attrib_map[fdec->DeclarationRange.First] = hw;
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	/* Hardware output register index for colour outputs 0..3. */
+	static const unsigned rcol_hw[4] = { 0, 2, 3, 4 };
+	const unsigned sem_idx = fdec->Semantic.SemanticIndex;
+	unsigned hw_reg;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw_reg = 1;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (sem_idx >= 4) {
+			NOUVEAU_ERR("bad rcol index\n");
+			return FALSE;
+		}
+		hw_reg = rcol_hw[sem_idx];
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	/* Bind the TGSI output to that register and set its r_temps bit. */
+	fpc->r_result[fdec->DeclarationRange.First] =
+		nv40_sr(NV40SR_OUTPUT, hw_reg);
+	fpc->r_temps |= (1 << hw_reg);
+	return TRUE;
+}
+
+static boolean
+nv40_fragprog_prepare(struct nv40_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, i;
+
+	/* Pre-pass over the TGSI tokens: handle input/output declarations,
+	 * collect immediates and find the highest declared temporary. */
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nv40_fragprog_parse_decl_attrib(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_fragprog_parse_decl_output(fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp)
+					high_temp = fdec->DeclarationRange.Last;
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4];
+
+			imm = &p.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(fpc->nr_imm < MAX_IMM);
+
+			for (i = 0; i < 4; i++)
+				vals[i] = imm->u.ImmediateFloat32[i].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
+		if (!fpc->r_temp)
+			return FALSE;	/* parser was already freed above */
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		fpc->r_temps_discard = 0;
+	}
+
+	return TRUE;
+
+out_err:
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+static void
+nv40_fragprog_translate(struct nv40_context *nv40,
+			struct nv40_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nv40_fpc *fpc = NULL;
+	/* Translate fp's TGSI tokens; fp->translated is set only on success. */
+	fpc = CALLOC(1, sizeof(struct nv40_fpc));
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->num_regs = 2;
+	/* First pass: declarations and immediates. */
+	if (!nv40_fragprog_prepare(fpc)) {
+		FREE(fpc);
+		return;
+	}
+	/* Second pass: walk the tokens again, handling instructions only. */
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_fragprog_parse_instruction(fpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+
+	/* Terminate final instruction */
+	fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	fp->insn[fpc->inst_offset + 0] = 0x00000001;
+	fp->insn[fpc->inst_offset + 1] = 0x00000000;
+	fp->insn[fpc->inst_offset + 2] = 0x00000000;
+	fp->insn[fpc->inst_offset + 3] = 0x00000000;
+	/* The success path falls through into the shared cleanup below. */
+	fp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	FREE(fpc);
+}
+
+static void
+nv40_fragprog_upload(struct nv40_context *nv40,
+		     struct nv40_fragment_program *fp)
+{
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	const uint32_t le = 1;
+	uint32_t *map;
+	int i;
+	/* Copy fp->insn (fp->insn_len words) into the mapped fp->buffer. */
+	map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+	/* NOTE(review): map is written below without a NULL check. */
+#if 0
+	for (i = 0; i < fp->insn_len; i++) {
+		fflush(stdout); fflush(stderr);
+		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
+		fflush(stdout); fflush(stderr);
+	}
+#endif
+
+	if ((*(const uint8_t *)&le)) {	/* first byte of 1 is set: LE host */
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = fp->insn[i];
+		}
+	} else {
+		/* Weird swapping for big-endian chips: 16-bit halves swap */
+		for (i = 0; i < fp->insn_len; i++) {
+			map[i] = ((fp->insn[i] & 0xffff) << 16) |
+				  ((fp->insn[i] >> 16) & 0xffff);
+		}
+	}
+
+	ws->buffer_unmap(ws, fp->buffer);
+}
+
+static boolean
+nv40_fragprog_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	struct pipe_buffer *constbuf =
+		nv40->constbuf[PIPE_SHADER_FRAGMENT];
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_stateobj *so;
+	boolean new_consts = FALSE;
+	int i;
+	/* Returns TRUE when fp->so is (re)referenced into the hw state slot. */
+	if (fp->translated)
+		goto update_constants;
+	/* First use: translate; failure sets the bit in fallback_swrast. */
+	nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG;
+	nv40_fragprog_translate(nv40, fp);
+	if (!fp->translated) {
+		nv40->fallback_swrast |= NV40_NEW_FRAGPROG;
+		return FALSE;
+	}
+	/* NOTE(review): the buffer_create() result is used unchecked. */
+	fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4);
+	nv40_fragprog_upload(nv40, fp);
+	/* State object: FP_ADDRESS relocation plus the FP_CONTROL word. */
+	so = so_new(4, 1);
+	so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1);
+	so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART |
+		  NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+		  NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1);
+	so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1);
+	so_data  (so, fp->fp_control);
+	so_ref(so, &fp->so);
+
+update_constants:
+	if (fp->nr_consts) {
+		float *map;
+		/* Refresh the vec4 constants stored inside fp->insn. */
+		map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ);
+		for (i = 0; i < fp->nr_consts; i++) {
+			struct nv40_fragment_program_data *fpd = &fp->consts[i];
+			uint32_t *p = &fp->insn[fpd->offset];
+			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
+
+			if (!memcmp(p, cb, 4 * sizeof(float)))
+				continue;	/* value unchanged */
+			memcpy(p, cb, 4 * sizeof(float));
+			new_consts = TRUE;
+		}
+		ws->buffer_unmap(ws, constbuf);
+
+		if (new_consts)	/* re-upload only when a constant changed */
+			nv40_fragprog_upload(nv40, fp);
+	}
+
+	if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) {
+		so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv40_fragprog_destroy(struct nv40_context *nv40,
+		      struct nv40_fragment_program *fp)
+{	/* Frees fp->insn only. NOTE(review): fp->buffer/fp->so untouched. */
+	if (fp->insn_len)
+		FREE(fp->insn);
+}
+
+struct nv40_state_entry nv40_state_fragprog = {
+	.validate = nv40_fragprog_validate,
+	.dirty = {
+		.pipe = NV40_NEW_FRAGPROG, /* presumably the trigger flag; confirm */
+		.hw = NV40_STATE_FRAGPROG  /* state.hw[] slot the validator fills */
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c
new file mode 100644
index 00000000000..0227d22620d
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_fragtex.c
@@ -0,0 +1,168 @@
+#include "nv40_context.h"
+
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
+{ /* One struct nv40_texture_format initializer. */                            \
+  TRUE, /* defined */                                                          \
+  PIPE_FORMAT_##m, /* pipe */                                                  \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf, /* format */                                 \
+  (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x | NV40TCL_TEX_SWIZZLE_S0_Y_##ts0y |         \
+   NV40TCL_TEX_SWIZZLE_S0_Z_##ts0z | NV40TCL_TEX_SWIZZLE_S0_W_##ts0w |         \
+   NV40TCL_TEX_SWIZZLE_S1_X_##ts1x | NV40TCL_TEX_SWIZZLE_S1_Y_##ts1y |         \
+   NV40TCL_TEX_SWIZZLE_S1_Z_##ts1z | NV40TCL_TEX_SWIZZLE_S1_W_##ts1w),         \
+  ((NV40TCL_TEX_FILTER_SIGNED_RED*sx) | (NV40TCL_TEX_FILTER_SIGNED_GREEN*sy) |       \
+   (NV40TCL_TEX_FILTER_SIGNED_BLUE*sz) | (NV40TCL_TEX_FILTER_SIGNED_ALPHA*sw))       \
+}
+
+struct nv40_texture_format {
+	boolean defined;	/* TRUE for real entries; zero in the {} terminator */
+	uint	pipe;		/* PIPE_FORMAT_* value this entry matches */
+	int     format;		/* NV40TCL_TEX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* OR of NV40TCL_TEX_SWIZZLE_S0_*/S1_* values */
+	int     sign;		/* OR of NV40TCL_TEX_FILTER_SIGNED_* bits */
+};
+
+static struct nv40_texture_format
+nv40_texture_formats[] = { /* pipe fmt, hw fmt, S0 xyzw, S1 xyzw, signed rgba */
+	_(A8R8G8B8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(A1R5G5B5_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(A4R4G4B4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(R5G6B5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(A8L8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
+	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(Z24S8_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	{},	/* zeroed terminator: defined reads as 0 and ends the lookup */
+};
+
+static struct nv40_texture_format *
+nv40_fragtex_format(uint pipe_format)
+{
+	struct nv40_texture_format *entry;
+
+	/* Linear scan; the table ends at the first entry not marked defined. */
+	for (entry = nv40_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format));
+	return NULL;
+}
+
+
+static struct nouveau_stateobj *
+nv40_fragtex_build(struct nv40_context *nv40, int unit)
+{
+	struct nv40_sampler_state *ps = nv40->tex_sampler[unit];
+	struct nv40_miptree *nv40mt = nv40->tex_miptree[unit];
+	struct pipe_texture *pt = &nv40mt->base;
+	struct nv40_texture_format *tf;
+	struct nouveau_stateobj *so;
+	uint32_t txf, txs, txp;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	/* Build the state object for texture unit `unit`; NULL on failure. */
+	tf = nv40_fragtex_format(pt->format);
+	if (!tf) {
+		assert(0);
+		return NULL;	/* never dereference tf when asserts are off */
+	}
+
+	txf  = ps->fmt | tf->format | 0x8000;
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+	txf |= NV40TCL_TEX_FORMAT_NO_BORDER; /* XXX: always set for now */
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV40TCL_TEX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV40TCL_TEX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return NULL;
+	}
+
+	if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+		txp = 0;
+	} else {
+		txp  = nv40mt->level[0].pitch;	/* linear: level-0 pitch */
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	txs = tf->swizzle;
+
+	so = so_new(16, 2);
+	so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8);
+	so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR,
+		  NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | tf->sign | 0x2000 /*voodoo*/);
+	so_data  (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) |
+		       pt->height[0]);
+	so_data  (so, ps->bcol);
+	so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1);
+	so_data  (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	return so;
+}
+
+static boolean
+nv40_fragtex_validate(struct nv40_context *nv40)
+{
+	struct nv40_fragment_program *fp = nv40->fragprog;
+	struct nv40_state *state = &nv40->state;
+	struct nouveau_stateobj *so;
+	unsigned samplers, unit;
+	/* Units in state->fp_samplers but not fp->samplers: TEX_ENABLE = 0. */
+	samplers = state->fp_samplers & ~fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;	/* lowest set bit */
+		samplers &= ~(1 << unit);
+
+		so = so_new(2, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1);
+		so_data  (so, 0);
+		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit));
+	}
+	/* Dirty samplers that fp uses get a freshly built state object. */
+	samplers = nv40->dirty_samplers & fp->samplers;
+	while (samplers) {
+		unit = ffs(samplers) - 1;
+		samplers &= ~(1 << unit);
+
+		so = nv40_fragtex_build(nv40, unit);	/* may be NULL */
+		so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]);
+		state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit));
+	}
+	/* Always FALSE: the per-unit dirty bits were already set above. */
+	nv40->state.fp_samplers = fp->samplers;
+	return FALSE;
+}
+
+struct nv40_state_entry nv40_state_fragtex = {
+	.validate = nv40_fragtex_validate,
+	.dirty = {
+		.pipe = NV40_NEW_SAMPLER | NV40_NEW_FRAGPROG,
+		.hw = 0	/* the validator sets per-unit state->dirty bits itself */
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c
new file mode 100644
index 00000000000..b68967c07fd
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -0,0 +1,194 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "nv40_context.h"
+
+static void
+nv40_miptree_layout(struct nv40_miptree *mt)
+{
+	struct pipe_texture *pt = &mt->base;
+	uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0];
+	uint offset = 0;
+	int nr_faces, l, f, pitch;
+	/* Fill per-level sizes, pitches and image offsets; set total_size. */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth[0];	/* one image per level-0 slice */
+	} else {
+		nr_faces = 1;
+	}
+	/* First pass: dimensions and pitch of every mip level. */
+	pitch = pt->width[0];
+	for (l = 0; l <= pt->last_level; l++) {
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->depth[l] = depth;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+		/* Linear textures keep the aligned level-0 pitch on each level. */
+		if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR))
+			pitch = pt->nblocksx[l];
+		pitch = align(pitch, 64);
+
+		mt->level[l].pitch = pitch * pt->block.size;
+		mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned)); /* NOTE(review): unchecked */
+
+		width  = MAX2(1, width  >> 1);
+		height = MAX2(1, height >> 1);
+		depth  = MAX2(1, depth  >> 1);
+	}
+	/* Second pass: images are packed face-major, levels within a face. */
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l <= pt->last_level; l++) {
+			mt->level[l].image_offset[f] = offset;
+			offset += mt->level[l].pitch * pt->height[l];
+		}
+	}
+
+	mt->total_size = offset;
+}
+
+static struct pipe_texture *
+nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+{
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nv40_miptree *mt;
+	int l;
+	/* Create a miptree from template pt; returns NULL when out of memory. */
+	mt = MALLOC(sizeof(struct nv40_miptree));
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	mt->base.refcount = 1;
+	mt->base.screen = pscreen;
+	mt->shadow_tex = NULL;
+	mt->shadow_surface = NULL;
+
+	/* Swizzled textures must be POT */
+	if (pt->width[0] & (pt->width[0] - 1) ||
+	    pt->height[0] & (pt->height[0] - 1))
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+	else if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY |
+	                          PIPE_TEXTURE_USAGE_DISPLAY_TARGET))
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+	else {
+		switch (pt->format) {
+		/* TODO: Figure out which formats can be swizzled */
+		case PIPE_FORMAT_A8R8G8B8_UNORM:
+		case PIPE_FORMAT_X8R8G8B8_UNORM:
+		/* XXX: Re-enable when SIFM size limits are fixed */
+		/*case PIPE_FORMAT_R16_SNORM:*/
+			break;
+		default:
+			mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+		}
+	}
+
+	nv40_miptree_layout(mt);
+	mt->buffer = ws->buffer_create(ws, 256,
+				       PIPE_BUFFER_USAGE_PIXEL |
+				       NOUVEAU_BUFFER_USAGE_TEXTURE,
+				       mt->total_size);
+	if (!mt->buffer) {
+		for (l = 0; l <= mt->base.last_level; l++) /* undo layout */
+			FREE(mt->level[l].image_offset);
+		FREE(mt);
+		return NULL;
+	}
+	return &mt->base;
+}
+
+static void
+nv40_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt)
+{
+	struct pipe_texture *pt = *ppt;
+	struct nv40_miptree *mt = (struct nv40_miptree *)pt;
+	int l;
+	/* Drop the caller's reference; destroy the miptree at refcount zero. */
+	*ppt = NULL;
+	if (--pt->refcount)
+		return;
+
+	pipe_buffer_reference(pscreen, &mt->buffer, NULL);
+	for (l = 0; l <= pt->last_level; l++) {
+		if (mt->level[l].image_offset)
+			FREE(mt->level[l].image_offset);
+	}
+	/* Also release the shadow texture and its surface, if present. */
+	if (mt->shadow_tex) {
+		assert(mt->shadow_surface);
+		pscreen->tex_surface_release(pscreen, &mt->shadow_surface);
+		nv40_miptree_release(pscreen, &mt->shadow_tex);
+	}
+
+	FREE(mt);
+}
+
+static struct pipe_surface *
+nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nv40_miptree *mt = (struct nv40_miptree *)pt;
+	struct pipe_surface *surf;
+	unsigned image = 0;
+
+	/* Cube maps index their images by face, 3D textures by slice;
+	 * every other target only uses image 0. */
+	if (pt->target == PIPE_TEXTURE_CUBE)
+		image = face;
+	else if (pt->target == PIPE_TEXTURE_3D)
+		image = zslice;
+
+	surf = CALLOC_STRUCT(pipe_surface);
+	if (!surf)
+		return NULL;
+	pipe_texture_reference(&surf->texture, pt);
+	pipe_buffer_reference(pscreen, &surf->buffer, mt->buffer);
+	surf->format = pt->format;
+	surf->width = pt->width[level];
+	surf->height = pt->height[level];
+	surf->block = pt->block;
+	surf->nblocksx = pt->nblocksx[level];
+	surf->nblocksy = pt->nblocksy[level];
+	surf->stride = mt->level[level].pitch;
+	surf->offset = mt->level[level].image_offset[image];
+	surf->usage = flags;
+	surf->status = PIPE_SURFACE_STATUS_DEFINED;
+	surf->refcount = 1;
+	surf->winsys = pscreen->winsys;
+	surf->face = face;
+	surf->level = level;
+	surf->zslice = zslice;
+
+	return surf;
+}
+
+static void
+nv40_miptree_surface_del(struct pipe_screen *pscreen,
+			 struct pipe_surface **psurface)
+{
+	struct pipe_surface *surf = *psurface;
+
+	/* The caller's handle is cleared whether or not the surface dies. */
+	*psurface = NULL;
+	if (!(--surf->refcount > 0)) {
+		pipe_texture_reference(&surf->texture, NULL);
+		pipe_buffer_reference(pscreen, &surf->buffer, NULL);
+		FREE(surf);
+	}
+}
+
+void
+nv40_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{	/* Install this file's texture and surface hooks on the screen. */
+	pscreen->texture_create = nv40_miptree_create;
+	pscreen->texture_release = nv40_miptree_release;
+	pscreen->get_tex_surface = nv40_miptree_surface_new;
+	pscreen->tex_surface_release = nv40_miptree_surface_del;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c
new file mode 100644
index 00000000000..57f39cfab0c
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_query.c
@@ -0,0 +1,122 @@
+#include "pipe/p_context.h"
+
+#include "nv40_context.h"
+
+struct nv40_query {
+	struct nouveau_resource *object; /* query_heap allocation; ->start picks the notifier */
+	unsigned type;		/* query type passed to create_query */
+	boolean ready;		/* TRUE once result has been read back */
+	uint64_t result;	/* cached notifier_retval() value */
+};
+
+static INLINE struct nv40_query *
+nv40_query(struct pipe_query *pipe)
+{	/* Downcast the opaque pipe_query handle to the driver's struct. */
+	return (struct nv40_query *)pipe;
+}
+
+static struct pipe_query *
+nv40_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nv40_query *q = CALLOC(1, sizeof(struct nv40_query));
+
+	/* Allocation can fail: hand NULL back instead of writing through it. */
+	if (q)
+		q->type = query_type;
+	return (struct pipe_query *)q;
+}
+
+static void
+nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_query *query = nv40_query(pq);
+
+	/* Give back the query_heap allocation if one is still held. */
+	if (query->object != NULL)
+		nv40_context(pipe)->nvws->res_free(&query->object);
+	FREE(query);
+}
+
+static void
+nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+	/* Start a query: take a query_heap slot and reset its notifier. */
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object) {
+		uint64 tmp;
+		pipe->get_query_result(pipe, pq, 1, &tmp);
+	}
+
+	if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object))
+		assert(0);	/* NOTE(review): no runtime fallback on failure */
+	nv40->nvws->notifier_reset(nv40->screen->query, q->object->start);
+
+	BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1);
+	OUT_RING  (1);
+	BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (1);
+
+	q->ready = FALSE;
+}
+
+static void
+nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+	/* Emit QUERY_GET for this query's slot (start * 32), then fire. */
+	BEGIN_RING(curie, NV40TCL_QUERY_GET, 1);
+	OUT_RING  ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING(NULL);
+}
+
+static boolean
+nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64 *result)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_query *q = nv40_query(pq);
+	struct nouveau_winsys *nvws = nv40->nvws;
+
+	/* Returns FALSE only when the result is pending and wait is FALSE. */
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+	if (!q->ready) {
+		unsigned status;
+		assert(q->object);	/* only needed while still pending */
+		status = nvws->notifier_status(nv40->screen->query,
+					       q->object->start);
+		if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+			if (wait == FALSE)
+				return FALSE;
+			nvws->notifier_wait(nv40->screen->query, q->object->start,
+					    NV_NOTIFY_STATE_STATUS_COMPLETED,
+					    0);
+		}
+
+		q->result = nvws->notifier_retval(nv40->screen->query,
+						  q->object->start);
+		q->ready = TRUE;
+		nvws->res_free(&q->object);	/* cached result stays readable */
+	}
+
+	*result = q->result;
+	return TRUE;
+}
+
+void
+nv40_init_query_functions(struct nv40_context *nv40)
+{	/* Install this file's query hooks on the context's pipe. */
+	nv40->pipe.create_query = nv40_query_create;
+	nv40->pipe.destroy_query = nv40_query_destroy;
+	nv40->pipe.begin_query = nv40_query_begin;
+	nv40->pipe.end_query = nv40_query_end;
+	nv40->pipe.get_query_result = nv40_query_result;
+}
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
new file mode 100644
index 00000000000..25c78682961
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -0,0 +1,365 @@
+#include "pipe/p_screen.h"
+
+#include "nv40_context.h"
+#include "nv40_screen.h"
+
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf /* bit n: chipset 0x4n -> NV40TCL */
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450 /* bit n: chipset 0x4n -> NV44TCL */
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088 /* bit n: chipset 0x6n -> NV44TCL */
+
+/* Returns "NV" followed by the chipset id in hex.  The string lives in a
+ * static buffer that is rewritten on every call. */
+static const char *nv40_screen_get_name(struct pipe_screen *pscreen)
+{
+	static char name[128];
+	struct nouveau_winsys *nvws = nv40_screen(pscreen)->nvws;
+
+	snprintf(name, sizeof(name), "NV%02X", nvws->channel->device->chipset);
+	return name;
+}
+
+/* Vendor string reported for every nv40 screen; pscreen is not consulted. */
+static const char *nv40_screen_get_vendor(struct pipe_screen *pscreen)
+{
+	return "nouveau";
+}
+
+/* Integer-valued capability query.  Feature caps answer 1 (supported) or 0;
+ * the MAX_* caps return limits.  Hardware index buffers are advertised only
+ * when the screen's 3D object is of class NV40TCL.  Caps not listed here are
+ * logged through NOUVEAU_ERR and reported as 0. */
+static int
+nv40_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	switch (param) {
+	/* features that are always reported as present */
+	case PIPE_CAP_NPOT_TEXTURES:
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+	case PIPE_CAP_S3TC:
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+	case PIPE_CAP_POINT_SPRITE:
+	case PIPE_CAP_OCCLUSION_QUERY:
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+	case NOUVEAU_CAP_HW_VTXBUF:
+		return 1;
+
+	case PIPE_CAP_GLSL:
+		return 0;
+
+	/* numeric limits */
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		return 16;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return 4;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+
+	/* depends on which 3D class was created for this chipset */
+	case NOUVEAU_CAP_HW_IDXBUF:
+		if (nv40_screen(pscreen)->curie->grclass == NV40TCL)
+			return 1;
+		return 0;
+
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+/* Float-valued capability query: maximum line/point widths, anisotropy and
+ * LOD bias.  Unknown caps are logged and reported as 0.0. */
+static float nv40_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	if (param == PIPE_CAP_MAX_LINE_WIDTH ||
+	    param == PIPE_CAP_MAX_LINE_WIDTH_AA)
+		return 10.0;
+
+	if (param == PIPE_CAP_MAX_POINT_WIDTH ||
+	    param == PIPE_CAP_MAX_POINT_WIDTH_AA)
+		return 64.0;
+
+	if (param == PIPE_CAP_MAX_TEXTURE_ANISOTROPY ||
+	    param == PIPE_CAP_MAX_TEXTURE_LOD_BIAS)
+		return 16.0;
+
+	NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+	return 0.0;
+}
+
+/* Report whether `format` can be used with the requested usage.  Render
+ * targets (PIPE_TEXTURE_USAGE_RENDER_TARGET set) are limited to four
+ * colour/depth formats; any other usage additionally accepts the
+ * texture-only formats.  `target` and `geom_flags` are not consulted. */
+static boolean
+nv40_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	const boolean render_target =
+		(tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) != 0;
+
+	switch (format) {
+	/* accepted for render targets and for textures */
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case PIPE_FORMAT_R5G6B5_UNORM:
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+		return TRUE;
+
+	/* accepted only when not used as a render target */
+	case PIPE_FORMAT_A1R5G5B5_UNORM:
+	case PIPE_FORMAT_A4R4G4B4_UNORM:
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_A8L8_UNORM:
+	case PIPE_FORMAT_DXT1_RGB:
+	case PIPE_FORMAT_DXT1_RGBA:
+	case PIPE_FORMAT_DXT3_RGBA:
+	case PIPE_FORMAT_DXT5_RGBA:
+		return render_target ? FALSE : TRUE;
+
+	default:
+		break;
+	}
+
+	return FALSE;
+}
+
+/* Map a surface for CPU access.  Surfaces whose texture lacks
+ * NOUVEAU_TEXTURE_USAGE_LINEAR are not mapped directly: a linear shadow
+ * texture/surface is created on first use and mapped instead, and
+ * nv40_surface_unmap copies it to the real surface.  NULL on failure. */
+static void *
+nv40_surface_map(struct pipe_screen *screen, struct pipe_surface *surface,
+		 unsigned flags )
+{
+	struct pipe_winsys	*ws = screen->winsys;
+	struct pipe_surface	*surface_to_map = surface;
+	char			*map;	/* byte pointer: offset is added below */
+
+	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+		struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture;
+
+		if (!mt->shadow_tex) {
+			/* the real texture serves as the creation template,
+			 * with its usage temporarily replaced by LINEAR */
+			unsigned old_tex_usage = surface->texture->tex_usage;
+			surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR;
+			mt->shadow_tex = screen->texture_create(screen, surface->texture);
+			surface->texture->tex_usage = old_tex_usage;
+			if (!mt->shadow_tex)
+				return NULL;
+
+			assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR);
+			mt->shadow_surface = screen->get_tex_surface(screen,
+				mt->shadow_tex, surface->face, surface->level,
+				surface->zslice, surface->usage);
+		}
+
+		surface_to_map = mt->shadow_surface;
+	}
+	if (!surface_to_map)
+		return NULL;
+
+	map = ws->buffer_map(ws, surface_to_map->buffer, flags);
+	return map ? map + surface_to_map->offset : NULL;
+}
+
+/* Counterpart of nv40_surface_map().  Surfaces of non-linear textures were
+ * mapped through their linear shadow surface, so after unmapping the shadow
+ * buffer its contents are copied onto the real surface. */
+static void
+nv40_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface)
+{
+	struct pipe_winsys *ws = screen->winsys;
+	struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture;
+	struct pipe_surface *mapped = surface;
+	struct nouveau_winsys *nvws;
+
+	if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+		assert(mt->shadow_tex);
+		mapped = mt->shadow_surface;
+	}
+	assert(mapped);
+
+	ws->buffer_unmap(ws, mapped->buffer);
+
+	/* a directly mapped (linear) surface needs no copy-back */
+	if (mapped == surface)
+		return;
+
+	/* TODO: Copy from shadow just before push buffer is flushed instead.
+	         There are probably some programs that map/unmap excessively
+	         before rendering. */
+	nvws = nv40_screen(screen)->nvws;
+	nvws->surface_copy(nvws,
+	                   surface, 0, 0,
+	                   mapped, 0, 0,
+	                   surface->width, surface->height);
+}
+
+/* Tear down the screen: free the resource heaps, notifiers and 3D object
+ * obtained from the nouveau winsys, then the screen structure itself. */
+static void nv40_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nv40_screen *screen = nv40_screen(pscreen);
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvws->res_free(&screen->vp_exec_heap);
+	nvws->res_free(&screen->vp_data_heap);
+	nvws->res_free(&screen->query_heap);
+	nvws->notifier_free(&screen->query);
+	nvws->notifier_free(&screen->sync);
+	nvws->grobj_free(&screen->curie);
+	FREE(pscreen);
+}
+
+/* Create the nv40 pipe_screen: pick the 3D object class for the chipset,
+ * allocate the notifiers and resource heaps, emit the static 3D setup and
+ * fill in the screen vtable.  Returns NULL on any failure. */
+struct pipe_screen *
+nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)
+{
+	struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen);
+	struct nouveau_stateobj *so;
+	unsigned curie_class = 0;	/* stays 0 if the chipset is not matched */
+	unsigned chipset = nvws->channel->device->chipset;
+	int ret;
+
+	if (!screen)
+		return NULL;
+	screen->nvws = nvws;
+
+	/* 3D object */
+	switch (chipset & 0xf0) {
+	case 0x40:
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV40TCL;
+		else if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV44TCL;
+		break;
+	case 0x60:
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f)))
+			curie_class = NV44TCL;
+		break;
+	default:
+		break;
+	}
+
+	if (!curie_class) {
+		NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset);
+		FREE(screen);	/* nothing else has been allocated yet */
+		return NULL;
+	}
+
+	ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		FREE(screen);	/* nothing else has been allocated yet */
+		return NULL;
+	}
+
+	/* Notifier for sync purposes */
+	ret = nvws->notifier_alloc(nvws, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Query objects */
+	ret = nvws->notifier_alloc(nvws, 32, &screen->query);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	ret = nvws->res_init(&screen->query_heap, 0, 32);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Vtxprog resources */
+	if (nvws->res_init(&screen->vp_exec_heap, 0, 512) ||
+	    nvws->res_init(&screen->vp_data_heap, 0, 256)) {
+		nv40_screen_destroy(&screen->pipe);
+		return NULL;
+	}
+
+	/* Static curie initialisation */
+	so = so_new(128, 0);
+	so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1);
+	so_data  (so, screen->sync->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR1, 1);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->gart->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2);
+	so_data  (so, 0);
+	so_data  (so, screen->query->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+	so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2);
+	so_data  (so, nvws->channel->vram->handle);
+	so_data  (so, nvws->channel->vram->handle);
+
+	so_method(so, screen->curie, 0x1ea4, 3);
+	so_data  (so, 0x00000010);
+	so_data  (so, 0x01000100);
+	so_data  (so, 0xff800006);
+
+	/* vtxprog output routing */
+	so_method(so, screen->curie, 0x1fc4, 1);
+	so_data  (so, 0x06144321);
+	so_method(so, screen->curie, 0x1fc8, 2);
+	so_data  (so, 0xedcba987);
+	so_data  (so, 0x00000021);
+	so_method(so, screen->curie, 0x1fd0, 1);
+	so_data  (so, 0x00171615);
+	so_method(so, screen->curie, 0x1fd4, 1);
+	so_data  (so, 0x001b1a19);
+
+	so_method(so, screen->curie, 0x1ef8, 1);
+	so_data  (so, 0x0020ffff);
+	so_method(so, screen->curie, 0x1d64, 1);
+	so_data  (so, 0x00d30000);
+	so_method(so, screen->curie, 0x1e94, 1);
+	so_data  (so, 0x00000001);
+
+	so_emit(nvws, so);
+	so_ref(NULL, &so);
+	nvws->push_flush(nvws, 0, NULL);
+
+	screen->pipe.winsys = ws;
+	screen->pipe.destroy = nv40_screen_destroy;
+	screen->pipe.get_name = nv40_screen_get_name;
+	screen->pipe.get_vendor = nv40_screen_get_vendor;
+	screen->pipe.get_param = nv40_screen_get_param;
+	screen->pipe.get_paramf = nv40_screen_get_paramf;
+	screen->pipe.is_format_supported = nv40_screen_surface_format_supported;
+	screen->pipe.surface_map = nv40_surface_map;
+	screen->pipe.surface_unmap = nv40_surface_unmap;
+	nv40_screen_init_miptree_functions(&screen->pipe);
+
+	return &screen->pipe;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h
new file mode 100644
index 00000000000..c04a1275a00
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_screen.h
@@ -0,0 +1,35 @@
+#ifndef __NV40_SCREEN_H__
+#define __NV40_SCREEN_H__
+
+#include "pipe/p_screen.h"
+
+struct nv40_screen {
+	struct pipe_screen pipe;	/* must stay first: nv40_screen() casts to it */
+
+	struct nouveau_winsys *nvws;
+
+	unsigned cur_pctx;
+
+	/* HW graphics objects */
+	struct nouveau_grobj *curie;	/* 3D object (NV40TCL or NV44TCL class) */
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;		/* notifier_alloc'd with 32 */
+	struct nouveau_resource *query_heap;	/* res_init'd with (0, 32) */
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;	/* res_init'd with (0, 512) */
+	struct nouveau_resource *vp_data_heap;	/* res_init'd with (0, 256) */
+
+	/* Current 3D state of channel */
+	struct nouveau_stateobj *state[NV40_STATE_MAX];
+};
+
+/* Downcast a pipe_screen to the nv40_screen that embeds it; this relies on
+ * `pipe` being the first member of struct nv40_screen. */
+static INLINE struct nv40_screen *nv40_screen(struct pipe_screen *screen)
+{
+	return (struct nv40_screen *)screen;
+}
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h
new file mode 100644
index 00000000000..854dccf5486
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_shader.h
@@ -0,0 +1,556 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40; its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. (currently defined as bits 14 and 29 together) */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#    define NV40_VP_INST_COND_FL                                               0
+#    define NV40_VP_INST_COND_LT                                               1
+#    define NV40_VP_INST_COND_EQ                                               2
+#    define NV40_VP_INST_COND_LE                                               3
+#    define NV40_VP_INST_COND_GT                                               4
+#    define NV40_VP_INST_COND_NE                                               5
+#    define NV40_VP_INST_COND_GE                                               6
+#    define NV40_VP_INST_COND_TR                                               7
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1; bit-31 masks are unsigned ---- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_MUL                                             0x02
+#    define NV40_VP_INST_OP_ADD                                             0x03
+#    define NV40_VP_INST_OP_MAD                                             0x04
+#    define NV40_VP_INST_OP_DP3                                             0x05
+#    define NV40_VP_INST_OP_DPH                                             0x06
+#    define NV40_VP_INST_OP_DP4                                             0x07
+#    define NV40_VP_INST_OP_DST                                             0x08
+#    define NV40_VP_INST_OP_MIN                                             0x09
+#    define NV40_VP_INST_OP_MAX                                             0x0A
+#    define NV40_VP_INST_OP_SLT                                             0x0B
+#    define NV40_VP_INST_OP_SGE                                             0x0C
+#    define NV40_VP_INST_OP_ARL                                             0x0D
+#    define NV40_VP_INST_OP_FRC                                             0x0E
+#    define NV40_VP_INST_OP_FLR                                             0x0F
+#    define NV40_VP_INST_OP_SEQ                                             0x10
+#    define NV40_VP_INST_OP_SFL                                             0x11
+#    define NV40_VP_INST_OP_SGT                                             0x12
+#    define NV40_VP_INST_OP_SLE                                             0x13
+#    define NV40_VP_INST_OP_SNE                                             0x14
+#    define NV40_VP_INST_OP_STR                                             0x15
+#    define NV40_VP_INST_OP_SSG                                             0x16
+#    define NV40_VP_INST_OP_ARR                                             0x17
+#    define NV40_VP_INST_OP_ARA                                             0x18
+#    define NV40_VP_INST_OP_TXL                                             0x19
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                               (0x1FU << 27)
+#    define NV40_VP_INST_OP_NOP                                             0x00
+#    define NV40_VP_INST_OP_MOV                                             0x01
+#    define NV40_VP_INST_OP_RCP                                             0x02
+#    define NV40_VP_INST_OP_RCC                                             0x03
+#    define NV40_VP_INST_OP_RSQ                                             0x04
+#    define NV40_VP_INST_OP_EXP                                             0x05
+#    define NV40_VP_INST_OP_LOG                                             0x06
+#    define NV40_VP_INST_OP_LIT                                             0x07
+#    define NV40_VP_INST_OP_BRA                                             0x09
+#    define NV40_VP_INST_OP_CAL                                             0x0B
+#    define NV40_VP_INST_OP_RET                                             0x0C
+#    define NV40_VP_INST_OP_LG2                                             0x0D
+#    define NV40_VP_INST_OP_EX2                                             0x0E
+#    define NV40_VP_INST_OP_SIN                                             0x0F
+#    define NV40_VP_INST_OP_COS                                             0x10
+#    define NV40_VP_INST_OP_PUSHA                                           0x13
+#    define NV40_VP_INST_OP_POPA                                            0x14
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#    define NV40_VP_INST_IN_POS                                                0
+#    define NV40_VP_INST_IN_WEIGHT                                             1
+#    define NV40_VP_INST_IN_NORMAL                                             2
+#    define NV40_VP_INST_IN_COL0                                               3
+#    define NV40_VP_INST_IN_COL1                                               4
+#    define NV40_VP_INST_IN_FOGC                                               5
+#    define NV40_VP_INST_IN_TC0                                                8
+#    define NV40_VP_INST_IN_TC(n)                                        (8+(n))
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2; bit-31 masks are unsigned ---- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                   (0x1FFU << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3; bit-31 masks are unsigned ---- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                      (7U << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                   (0x7FFU << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n))
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST_LAST                                               (1 << 0)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the 17-bit source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *         0 - Opcode, output reg/mask, ATTRIB source
+ *         1 - Source 0
+ *         2 - Source 1
+ *         3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *                 result.color == R0.xyzw
+ *                 result.depth == R1.z
+ * When the fragprog contains instructions to write depth,
+ * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ * 
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ * 
+ *                 ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and
+ * SWIZZLE_ONE.
+ *
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as
+ * SWIZZLE_ZERO is implemented simply by not writing to the relevant components
+ * of the destination.
+ *
+ * Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ * Conditional execution
+ *   TODO
+ * 
+ * Non-native instructions:
+ *         LIT
+ *         LRP - MAD+MAD
+ *         SUB - ADD, negate second source
+ *         RSQ - LG2 + EX2
+ *         POW - LG2 + MUL + EX2
+ *         SCS - COS + SIN
+ *         XPD
+ *         DP2 - MUL + ADD
+ *         NRM
+ */
+
+//== Opcode / Destination selection ==
+#define NV40_FP_OP_PROGRAM_END                                          (1 << 0)
+#define NV40_FP_OP_OUT_REG_SHIFT                                               1
+#define NV40_FP_OP_OUT_REG_MASK                                        (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NV40_FP_OP_OUT_REG_HALF                                         (1 << 7)
+#define NV40_FP_OP_COND_WRITE_ENABLE                                    (1 << 8)
+#define NV40_FP_OP_OUTMASK_SHIFT                                               9
+#define NV40_FP_OP_OUTMASK_MASK                                       (0xF << 9)
+#    define NV40_FP_OP_OUT_X                                            (1 << 9)
+#    define NV40_FP_OP_OUT_Y                                            (1 <<10)
+#    define NV40_FP_OP_OUT_Z                                            (1 <<11)
+#    define NV40_FP_OP_OUT_W                                            (1 <<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NV40_FP_OP_INPUT_SRC_SHIFT                                            13
+#define NV40_FP_OP_INPUT_SRC_MASK                                     (15 << 13)
+#    define NV40_FP_OP_INPUT_SRC_POSITION                                    0x0
+#    define NV40_FP_OP_INPUT_SRC_COL0                                        0x1
+#    define NV40_FP_OP_INPUT_SRC_COL1                                        0x2
+#    define NV40_FP_OP_INPUT_SRC_FOGC                                        0x3
+#    define NV40_FP_OP_INPUT_SRC_TC0                                         0x4
+#    define NV40_FP_OP_INPUT_SRC_TC(n)                               (0x4 + (n))
+#    define NV40_FP_OP_INPUT_SRC_FACING                                      0xE
+#define NV40_FP_OP_TEX_UNIT_SHIFT                                             17
+#define NV40_FP_OP_TEX_UNIT_MASK                                     (0xF << 17)
+#define NV40_FP_OP_PRECISION_SHIFT                                            22
+#define NV40_FP_OP_PRECISION_MASK                                      (3 << 22)
+#   define NV40_FP_PRECISION_FP32                                              0
+#   define NV40_FP_PRECISION_FP16                                              1
+#   define NV40_FP_PRECISION_FX12                                              2
+#define NV40_FP_OP_OPCODE_SHIFT                                               24
+#define NV40_FP_OP_OPCODE_MASK                                      (0x3F << 24)
+#        define NV40_FP_OP_OPCODE_NOP                                       0x00
+#        define NV40_FP_OP_OPCODE_MOV                                       0x01
+#        define NV40_FP_OP_OPCODE_MUL                                       0x02
+#        define NV40_FP_OP_OPCODE_ADD                                       0x03
+#        define NV40_FP_OP_OPCODE_MAD                                       0x04
+#        define NV40_FP_OP_OPCODE_DP3                                       0x05
+#        define NV40_FP_OP_OPCODE_DP4                                       0x06
+#        define NV40_FP_OP_OPCODE_DST                                       0x07
+#        define NV40_FP_OP_OPCODE_MIN                                       0x08
+#        define NV40_FP_OP_OPCODE_MAX                                       0x09
+#        define NV40_FP_OP_OPCODE_SLT                                       0x0A
+#        define NV40_FP_OP_OPCODE_SGE                                       0x0B
+#        define NV40_FP_OP_OPCODE_SLE                                       0x0C
+#        define NV40_FP_OP_OPCODE_SGT                                       0x0D
+#        define NV40_FP_OP_OPCODE_SNE                                       0x0E
+#        define NV40_FP_OP_OPCODE_SEQ                                       0x0F
+#        define NV40_FP_OP_OPCODE_FRC                                       0x10
+#        define NV40_FP_OP_OPCODE_FLR                                       0x11
+#        define NV40_FP_OP_OPCODE_KIL                                       0x12
+#        define NV40_FP_OP_OPCODE_PK4B                                      0x13
+#        define NV40_FP_OP_OPCODE_UP4B                                      0x14
+/* DDX/DDY can only write to XY */
+#        define NV40_FP_OP_OPCODE_DDX                                       0x15
+#        define NV40_FP_OP_OPCODE_DDY                                       0x16
+#        define NV40_FP_OP_OPCODE_TEX                                       0x17
+#        define NV40_FP_OP_OPCODE_TXP                                       0x18
+#        define NV40_FP_OP_OPCODE_TXD                                       0x19
+#        define NV40_FP_OP_OPCODE_RCP                                       0x1A
+#        define NV40_FP_OP_OPCODE_EX2                                       0x1C
+#        define NV40_FP_OP_OPCODE_LG2                                       0x1D
+#        define NV40_FP_OP_OPCODE_STR                                       0x20
+#        define NV40_FP_OP_OPCODE_SFL                                       0x21
+#        define NV40_FP_OP_OPCODE_COS                                       0x22
+#        define NV40_FP_OP_OPCODE_SIN                                       0x23
+#        define NV40_FP_OP_OPCODE_PK2H                                      0x24
+#        define NV40_FP_OP_OPCODE_UP2H                                      0x25
+#        define NV40_FP_OP_OPCODE_PK4UB                                     0x27
+#        define NV40_FP_OP_OPCODE_UP4UB                                     0x28
+#        define NV40_FP_OP_OPCODE_PK2US                                     0x29
+#        define NV40_FP_OP_OPCODE_UP2US                                     0x2A
+#        define NV40_FP_OP_OPCODE_DP2A                                      0x2E
+#        define NV40_FP_OP_OPCODE_TXL                                       0x2F
+#        define NV40_FP_OP_OPCODE_TXB                                       0x31
+#        define NV40_FP_OP_OPCODE_DIV                                       0x3A
+#        define NV40_FP_OP_OPCODE_UNK_LIT                                   0x3C
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2. */
+#        define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#        define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#        define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#        define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#        define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#        define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+#define NV40_FP_OP_OUT_SAT                                            (1U << 31)
+
+/* high order bits of SRC0 */
+#define NV40_FP_OP_OUT_ABS                                             (1 << 29)
+#define NV40_FP_OP_COND_SWZ_W_SHIFT                                           27
+#define NV40_FP_OP_COND_SWZ_W_MASK                                     (3 << 27)
+#define NV40_FP_OP_COND_SWZ_Z_SHIFT                                           25
+#define NV40_FP_OP_COND_SWZ_Z_MASK                                     (3 << 25)
+#define NV40_FP_OP_COND_SWZ_Y_SHIFT                                           23
+#define NV40_FP_OP_COND_SWZ_Y_MASK                                     (3 << 23)
+#define NV40_FP_OP_COND_SWZ_X_SHIFT                                           21
+#define NV40_FP_OP_COND_SWZ_X_MASK                                     (3 << 21)
+#define NV40_FP_OP_COND_SWZ_ALL_SHIFT                                         21
+#define NV40_FP_OP_COND_SWZ_ALL_MASK                                (0xFF << 21)
+#define NV40_FP_OP_COND_SHIFT                                                 18
+#define NV40_FP_OP_COND_MASK                                        (0x07 << 18)
+#        define NV40_FP_OP_COND_FL                                             0
+#        define NV40_FP_OP_COND_LT                                             1
+#        define NV40_FP_OP_COND_EQ                                             2
+#        define NV40_FP_OP_COND_LE                                             3
+#        define NV40_FP_OP_COND_GT                                             4
+#        define NV40_FP_OP_COND_NE                                             5
+#        define NV40_FP_OP_COND_GE                                             6
+#        define NV40_FP_OP_COND_TR                                             7
+
+/* high order bits of SRC1 */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                   (1U << 31)
+#define NV40_FP_OP_DST_SCALE_SHIFT                                            28
+#define NV40_FP_OP_DST_SCALE_MASK                                      (3 << 28)
+#define NV40_FP_OP_DST_SCALE_1X                                                0
+#define NV40_FP_OP_DST_SCALE_2X                                                1
+#define NV40_FP_OP_DST_SCALE_4X                                                2
+#define NV40_FP_OP_DST_SCALE_8X                                                3
+#define NV40_FP_OP_DST_SCALE_INV_2X                                            5
+#define NV40_FP_OP_DST_SCALE_INV_4X                                            6
+#define NV40_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   It is not known why there are three count values here, but
+ *   in every test so far all three have been filled with the
+ *   same value.
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+/* SRC2 high-order */
+#define NV40_FP_OP_INDEX_INPUT                                         (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT                                           19
+#define NV40_FP_OP_ADDR_INDEX_MASK                                   (0xF << 19)
+
+/* == Register selection == */
+#define NV40_FP_REG_TYPE_SHIFT                                                 0
+#define NV40_FP_REG_TYPE_MASK                                           (3 << 0)
+#        define NV40_FP_REG_TYPE_TEMP                                          0
+#        define NV40_FP_REG_TYPE_INPUT                                         1
+#        define NV40_FP_REG_TYPE_CONST                                         2
+#define NV40_FP_REG_SRC_SHIFT                                                  2
+#define NV40_FP_REG_SRC_MASK                                           (63 << 2)
+#define NV40_FP_REG_SRC_HALF                                            (1 << 8)
+#define NV40_FP_REG_SWZ_ALL_SHIFT                                              9
+#define NV40_FP_REG_SWZ_ALL_MASK                                      (255 << 9)
+#define NV40_FP_REG_SWZ_X_SHIFT                                                9
+#define NV40_FP_REG_SWZ_X_MASK                                          (3 << 9)
+#define NV40_FP_REG_SWZ_Y_SHIFT                                               11
+#define NV40_FP_REG_SWZ_Y_MASK                                         (3 << 11)
+#define NV40_FP_REG_SWZ_Z_SHIFT                                               13
+#define NV40_FP_REG_SWZ_Z_MASK                                         (3 << 13)
+#define NV40_FP_REG_SWZ_W_SHIFT                                               15
+#define NV40_FP_REG_SWZ_W_MASK                                         (3 << 15)
+#        define NV40_FP_SWIZZLE_X                                              0
+#        define NV40_FP_SWIZZLE_Y                                              1
+#        define NV40_FP_SWIZZLE_Z                                              2
+#        define NV40_FP_SWIZZLE_W                                              3
+#define NV40_FP_REG_NEGATE                                             (1 << 17)
+
+#ifndef NV40_SHADER_NO_FUCKEDNESS
+#define NV40SR_NONE	0
+#define NV40SR_OUTPUT	1
+#define NV40SR_INPUT	2
+#define NV40SR_TEMP	3
+#define NV40SR_CONST	4
+
+struct nv40_sreg {	/* shader register operand; see nv40_sr() for defaults */
+	int type;	/* register file; presumably one of NV40SR_* - confirm */
+	int index;	/* presumably the register number within that file */
+
+	int dst_scale;	/* nv40_sr() default is DEF_SCALE */
+
+	int negate;	/* toggled by nv40_sr_neg() */
+	int abs;	/* set to 1 by nv40_sr_abs() */
+	int swz[4];	/* source component selects; nv40_sr() sets {0,1,2,3} */
+
+	int cc_update;	/* nv40_sr() default 0 */
+	int cc_update_reg;	/* nv40_sr() default 0 */
+	int cc_test;	/* nv40_sr() default is DEF_CTEST */
+	int cc_test_reg;	/* nv40_sr() default 0 */
+	int cc_swz[4];	/* nv40_sr() sets {0,1,2,3} */
+};
+
+/* Build a register operand of the given type/index with default modifiers. */
+static INLINE struct nv40_sreg
+nv40_sr(int type, int index)
+{
+	struct nv40_sreg reg;
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+	reg.dst_scale = DEF_SCALE;
+	reg.negate = reg.abs = 0;
+	reg.cc_update = reg.cc_update_reg = 0;
+	reg.cc_test = DEF_CTEST;
+	reg.cc_test_reg = 0;
+	for (c = 0; c < 4; c++)	/* identity swizzles: 0, 1, 2, 3 */
+		reg.swz[c] = reg.cc_swz[c] = c;
+	return reg;
+}
+
+/* Apply swizzle (x,y,z,w) on top of whatever swizzle src already carries. */
+static INLINE struct nv40_sreg
+nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w)
+{
+	struct nv40_sreg out = src;
+	out.swz[SWZ_X] = src.swz[x];
+	out.swz[SWZ_Y] = src.swz[y];
+	out.swz[SWZ_Z] = src.swz[z];
+	out.swz[SWZ_W] = src.swz[w];
+	return out;
+}
+
+/* Return a copy of src with its negate flag inverted. */
+static INLINE struct nv40_sreg nv40_sr_neg(struct nv40_sreg src)
+{
+	src.negate = (src.negate == 0);
+	return src;
+}
+
+/* Return a copy of src with the abs flag set to 1. */
+static INLINE struct nv40_sreg nv40_sr_abs(struct nv40_sreg src)
+{
+	src.abs = 1;
+	return src;
+}
+
+/* Return a copy of src with dst_scale replaced by scale. */
+static INLINE struct nv40_sreg nv40_sr_scale(struct nv40_sreg src, int scale)
+{
+	src.dst_scale = scale;
+	return src;
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c
new file mode 100644
index 00000000000..255c4b294d1
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state.c
@@ -0,0 +1,740 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+static void *
+nv40_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso));
+	struct nouveau_stateobj *so = so_new(16, 0); /* filled from *cso below */
+
+	if (cso->blend_enable) {
+		/* enable word, then src factors, then dst factors */
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 3);
+		so_data  (so, 1);
+		so_data  (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rgb_src_factor));
+		so_data  (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rgb_dst_factor));
+		so_method(so, curie, NV40TCL_BLEND_EQUATION, 1);
+		so_data  (so, nvgl_blend_eqn(cso->alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rgb_func));
+	} else {
+		so_method(so, curie, NV40TCL_BLEND_ENABLE, 1);
+		so_data  (so, 0);
+	}
+	/* Colour mask: A enables bit 24, R bit 16, G bit 8, B bit 0. */
+	so_method(so, curie, NV40TCL_COLOR_MASK, 1);
+	so_data  (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+		       ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+		       ((cso->colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+		       ((cso->colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
+
+	if (cso->logicop_enable) {
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, curie, NV40TCL_DITHER_ENABLE, 1);
+	so_data  (so, cso->dither ? 1 : 0);
+	/* Attach the state object through so_ref() and keep a copy of *cso. */
+	so_ref(so, &bso->so);
+	bso->pipe = *cso;
+	return (void *)bso;
+}
+
+/* Select hwcso as the current blend CSO and set the NV40_NEW_BLEND bit. */
+static void
+nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_BLEND;
+	ctx->blend = hwcso;
+}
+
+/* Pass NULL to so_ref() for the CSO's state object, then FREE the CSO. */
+static void
+nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_blend_state *blend = (struct nv40_blend_state *)hwcso;
+	so_ref(NULL, &blend->so);
+	FREE(blend);
+}
+
+
+/*
+ * Map a PIPE_TEX_WRAP_* mode to the matching hardware wrap value.
+ *
+ * The lookup uses the NV40TCL_TEX_WRAP_S_* constants and shifts the
+ * result down by NV40TCL_TEX_WRAP_S_SHIFT, so the caller can shift it
+ * back up into the field of whichever coordinate it is encoding.
+ *
+ * An unrecognised mode is reported via NOUVEAU_ERR and handled as REPEAT.
+ */
+static INLINE unsigned
+wrap_mode(unsigned wrap)
+{
+	unsigned hw;
+
+	if (wrap == PIPE_TEX_WRAP_REPEAT) {
+		hw = NV40TCL_TEX_WRAP_S_REPEAT;
+	} else if (wrap == PIPE_TEX_WRAP_MIRROR_REPEAT) {
+		hw = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT;
+	} else if (wrap == PIPE_TEX_WRAP_CLAMP_TO_EDGE) {
+		hw = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE;
+	} else if (wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER) {
+		hw = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER;
+	} else if (wrap == PIPE_TEX_WRAP_CLAMP) {
+		hw = NV40TCL_TEX_WRAP_S_CLAMP;
+	} else if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE) {
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+	} else if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER) {
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+	} else if (wrap == PIPE_TEX_WRAP_MIRROR_CLAMP) {
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+	} else {
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		hw = NV40TCL_TEX_WRAP_S_REPEAT;
+	}
+
+	return hw >> NV40TCL_TEX_WRAP_S_SHIFT;
+}
+
+/*
+ * Create a sampler CSO: pre-compute the texture format/wrap/enable/filter
+ * and border-colour words from *cso.  Returns NULL if allocation fails.
+ */
+static void *
+nv40_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nv40_sampler_state *ps;
+	uint32_t filter = 0;
+
+	ps = MALLOC(sizeof(struct nv40_sampler_state));
+	if (!ps)
+		return NULL;
+
+	ps->fmt = 0;
+	if (!cso->normalized_coords)
+		ps->fmt |= NV40TCL_TEX_FORMAT_RECT;
+
+	ps->wrap = ((wrap_mode(cso->wrap_s) << NV40TCL_TEX_WRAP_S_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV40TCL_TEX_WRAP_T_SHIFT) |
+		    (wrap_mode(cso->wrap_r) << NV40TCL_TEX_WRAP_R_SHIFT));
+
+	ps->en = 0;
+	if (cso->max_anisotropy >= 2.0) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+		if (cso->max_anisotropy >= 16.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
+		} else if (cso->max_anisotropy >= 12.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
+		} else if (cso->max_anisotropy >= 10.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
+		} else if (cso->max_anisotropy >= 8.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
+		} else if (cso->max_anisotropy >= 6.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
+		} else if (cso->max_anisotropy >= 4.0) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
+		}
+	}
+
+	switch (cso->mag_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		filter |= NV40TCL_TEX_FILTER_MAG_LINEAR;
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		filter |= NV40TCL_TEX_FILTER_MAG_NEAREST;
+		break;
+	}
+
+	switch (cso->min_img_filter) {
+	case PIPE_TEX_FILTER_LINEAR:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV40TCL_TEX_FILTER_MIN_LINEAR;
+			break;
+		}
+		break;
+	case PIPE_TEX_FILTER_NEAREST:
+	default:
+		switch (cso->min_mip_filter) {
+		case PIPE_TEX_MIPFILTER_NEAREST:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST;
+			break;
+		case PIPE_TEX_MIPFILTER_LINEAR:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR;
+			break;
+		case PIPE_TEX_MIPFILTER_NONE:
+		default:
+			filter |= NV40TCL_TEX_FILTER_MIN_NEAREST;
+			break;
+		}
+		break;
+	}
+
+	ps->filt = filter;
+
+	{
+		float limit;
+		/* LOD values are encoded as fixed point, 8 fractional bits. */
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 7;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 19;
+	}
+
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		switch (cso->compare_func) {
+		case PIPE_FUNC_NEVER:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NEVER;
+			break;
+		case PIPE_FUNC_GREATER:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GREATER;
+			break;
+		case PIPE_FUNC_EQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_EQUAL;
+			break;
+		case PIPE_FUNC_GEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GEQUAL;
+			break;
+		case PIPE_FUNC_LESS:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LESS;
+			break;
+		case PIPE_FUNC_NOTEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL;
+			break;
+		case PIPE_FUNC_LEQUAL:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LEQUAL;
+			break;
+		case PIPE_FUNC_ALWAYS:
+			ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_ALWAYS;
+			break;
+		default:
+			break;
+		}
+	}
+
+	ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) |
+		    (float_to_ubyte(cso->border_color[0]) << 16) |
+		    (float_to_ubyte(cso->border_color[1]) <<  8) |
+		    (float_to_ubyte(cso->border_color[2]) <<  0));
+
+	return (void *)ps;
+}
+
+/*
+ * Bind `nr` sampler CSOs to units 0..nr-1 and set to NULL any unit above
+ * that which was in use before; each touched unit is flagged dirty.
+ */
+static void
+nv40_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	unsigned prev = ctx->nr_samplers;
+	unsigned last = (nr > prev) ? nr : prev;
+	unsigned i;
+
+	for (i = 0; i < last; i++) {
+		ctx->tex_sampler[i] = (i < nr) ? sampler[i] : NULL;
+		ctx->dirty_samplers |= (1 << i);
+	}
+	ctx->nr_samplers = nr;
+	ctx->dirty |= NV40_NEW_SAMPLER;
+}
+
+static void
+nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	FREE(hwcso);	/* the CSO is the single MALLOC made at create time */
+}
+
+/*
+ * Reference miptree[0..nr-1] into the matching texture units and pass NULL
+ * for previously used units >= nr; each touched unit is flagged dirty.
+ */
+static void
+nv40_set_sampler_texture(struct pipe_context *pipe, unsigned nr,
+			 struct pipe_texture **miptree)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	unsigned prev = ctx->nr_textures;
+	unsigned last = (nr > prev) ? nr : prev;
+	unsigned i;
+
+	for (i = 0; i < last; i++) {
+		struct pipe_texture *tex = (i < nr) ? miptree[i] : NULL;
+		pipe_texture_reference((struct pipe_texture **)
+				       &ctx->tex_miptree[i], tex);
+		ctx->dirty_samplers |= (1 << i);
+	}
+	ctx->nr_textures = nr;
+	ctx->dirty |= NV40_NEW_SAMPLER;
+}
+
+static void *
+nv40_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+
+	/* Builds the rasterizer state object from *cso.  XXX: ignored:
+	 * 	light_twoside
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+
+	so_method(so, curie, NV40TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT :
+				       NV40TCL_SHADE_MODEL_SMOOTH);
+
+	so_method(so, curie, NV40TCL_LINE_WIDTH, 2); /* width * 8, smooth */
+	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	so_method(so, curie, NV40TCL_LINE_STIPPLE_ENABLE, 2);
+	so_data  (so, cso->line_stipple_enable ? 1 : 0);
+	so_data  (so, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	so_method(so, curie, NV40TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+	/* 6 words: 2 fill modes, cull face, front face, smooth, cull enable */
+	so_method(so, curie, NV40TCL_POLYGON_MODE_FRONT, 6);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CCW);
+	} else {
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV40TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV40TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV40TCL_FRONT_FACE_CW);
+	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
+
+	so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+	/* One enable word each for point, line and fill polygon offset. */
+	so_method(so, curie, NV40TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if (cso->offset_cw || cso->offset_ccw) {
+		so_method(so, curie, NV40TCL_POLYGON_OFFSET_FACTOR, 2);
+		so_data  (so, fui(cso->offset_scale));
+		so_data  (so, fui(cso->offset_units * 2));
+	}
+	/* Bit 0 enables sprites; bit 8+i is set when coord i has a mode. */
+	so_method(so, curie, NV40TCL_POINT_SPRITE, 1);
+	if (cso->point_sprite) {
+		unsigned psctl = (1 << 0), i;
+
+		for (i = 0; i < 8; i++) {
+			if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE)
+				psctl |= (1 << (8 + i));
+		}
+
+		so_data(so, psctl);
+	} else {
+		so_data(so, 0);
+	}
+
+	so_ref(so, &rsso->so);
+	rsso->pipe = *cso;
+	return (void *)rsso;
+}
+
+/* Select hwcso as the current rasterizer CSO; flags both dirty masks. */
+static void
+nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->draw_dirty |= NV40_NEW_RAST;
+	ctx->dirty |= NV40_NEW_RAST;
+	ctx->rasterizer = hwcso;
+}
+
+/* Pass NULL to so_ref() for the CSO's state object, then FREE the CSO. */
+static void
+nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_rasterizer_state *rast = (struct nv40_rasterizer_state *)hwcso;
+	so_ref(NULL, &rast->so);
+	FREE(rast);
+}
+
+static void *
+nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
+	struct nouveau_stateobj *so = so_new(32, 0);
+	struct nouveau_grobj *curie = nv40->screen->curie;
+
+	so_method(so, curie, NV40TCL_DEPTH_FUNC, 3); /* func, write, enable */
+	so_data  (so, nvgl_comparison_op(cso->depth.func));
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+
+	so_method(so, curie, NV40TCL_ALPHA_TEST_ENABLE, 3); /* en, func, ref */
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	so_data  (so, float_to_ubyte(cso->alpha.ref));
+	/* Front-face stencil (stencil[0]): 8 words when enabled, else 1. */
+	if (cso->stencil[0].enabled) {
+		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 8);
+		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[0].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_data  (so, cso->stencil[0].ref_value);
+		so_data  (so, cso->stencil[0].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+	/* Back-face stencil (stencil[1]): same word order as the front. */
+	if (cso->stencil[1].enabled) {
+		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 8);
+		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[1].write_mask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_data  (so, cso->stencil[1].ref_value);
+		so_data  (so, cso->stencil[1].value_mask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &zsaso->so);
+	zsaso->pipe = *cso;
+	return (void *)zsaso;
+}
+
+/* Select hwcso as the current depth/stencil/alpha CSO; sets NV40_NEW_ZSA. */
+static void
+nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_ZSA;
+	ctx->zsa = hwcso;
+}
+
+/* Pass NULL to so_ref() for the CSO's state object, then FREE the CSO. */
+static void
+nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_zsa_state *zsa = (struct nv40_zsa_state *)hwcso;
+	so_ref(NULL, &zsa->so);
+	FREE(zsa);
+}
+
+/* Create a vertex program CSO: duplicated tokens plus a draw-module shader. */
+static void *
+nv40_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	struct nv40_vertex_program *prog = CALLOC(1, sizeof(*prog));
+
+	prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	prog->draw = draw_create_vertex_shader(ctx->draw, &prog->pipe);
+
+	return (void *)prog;
+}
+
+/* Select hwcso as the current vertex program; flags both dirty masks. */
+static void
+nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->draw_dirty |= NV40_NEW_VERTPROG;
+	ctx->dirty |= NV40_NEW_VERTPROG;
+	ctx->vertprog = hwcso;
+}
+
+/* Tear down a vertex program CSO: draw shader, hw program, tokens, struct. */
+static void
+nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_vertex_program *prog = (struct nv40_vertex_program *)hwcso;
+	struct nv40_context *ctx = nv40_context(pipe);
+	draw_delete_vertex_shader(ctx->draw, prog->draw);
+	nv40_vertprog_destroy(ctx, prog);
+	FREE((void*)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/* Create a fragment program CSO from a duplicate of the TGSI tokens. */
+static void *
+nv40_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nv40_fragment_program *prog = CALLOC(1, sizeof(*prog));
+
+	prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	/* scan the duplicated tokens, not the caller's, into prog->info */
+	tgsi_scan_shader(prog->pipe.tokens, &prog->info);
+
+	return (void *)prog;
+}
+
+/* Select hwcso as the current fragment program; sets NV40_NEW_FRAGPROG. */
+static void
+nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_FRAGPROG;
+	ctx->fragprog = hwcso;
+}
+
+/* Tear down a fragment program CSO: hw program, tokens, then the struct. */
+static void
+nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nv40_fragment_program *prog = (struct nv40_fragment_program *)hwcso;
+	struct nv40_context *ctx = nv40_context(pipe);
+	nv40_fragprog_destroy(ctx, prog);
+	FREE((void*)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/* Store a copy of the blend colour and set the NV40_NEW_BCOL dirty bit. */
+static void
+nv40_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_BCOL;
+	ctx->blend_colour = *bcol;
+}
+
+/* Copy the clip state; sets NV40_NEW_UCP in both dirty and draw_dirty. */
+static void
+nv40_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->draw_dirty |= NV40_NEW_UCP;
+	ctx->dirty |= NV40_NEW_UCP;
+	ctx->clip = *clip;
+}
+
+/* Record the constant buffer for `shader` (its size counted in 4-float
+ * units) and mark the matching program dirty.  `index` is not used. */
+static void
+nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 const struct pipe_constant_buffer *buf )
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+
+	ctx->constbuf_nr[shader] = buf->size / (4 * sizeof(float));
+	ctx->constbuf[shader] = buf->buffer;
+
+	if (shader == PIPE_SHADER_FRAGMENT)
+		ctx->dirty |= NV40_NEW_FRAGPROG;
+	else if (shader == PIPE_SHADER_VERTEX)
+		ctx->dirty |= NV40_NEW_VERTPROG;
+}
+
+/* Copy the framebuffer state and set the NV40_NEW_FB dirty bit. */
+static void
+nv40_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_FB;
+	ctx->framebuffer = *fb;
+}
+
+/* Copy 4 * 32 bytes of stipple pattern; sets the NV40_NEW_STIPPLE bit. */
+static void
+nv40_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_STIPPLE;
+	memcpy(ctx->stipple, stipple->stipple, 4 * 32);
+}
+
+/* Copy the scissor rectangle and set the NV40_NEW_SCISSOR dirty bit. */
+static void
+nv40_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->dirty |= NV40_NEW_SCISSOR;
+	ctx->scissor = *s;
+}
+
+/* Copy the viewport; sets NV40_NEW_VIEWPORT in dirty and draw_dirty. */
+static void
+nv40_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->draw_dirty |= NV40_NEW_VIEWPORT;
+	ctx->dirty |= NV40_NEW_VIEWPORT;
+	ctx->viewport = *vpt;
+}
+
+/* Copy `count` vertex buffer descriptors; flags NV40_NEW_ARRAYS twice. */
+static void
+nv40_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+
+	ctx->vtxbuf_nr = count;
+	memcpy(ctx->vtxbuf, vb, sizeof(*vb) * count);
+	ctx->draw_dirty |= NV40_NEW_ARRAYS;
+	ctx->dirty |= NV40_NEW_ARRAYS;
+}
+
+/* Copy `count` vertex element descriptors; flags NV40_NEW_ARRAYS twice. */
+static void
+nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count,
+			 const struct pipe_vertex_element *ve)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+
+	ctx->vtxelt_nr = count;
+	memcpy(ctx->vtxelt, ve, sizeof(*ve) * count);
+	ctx->draw_dirty |= NV40_NEW_ARRAYS;
+	ctx->dirty |= NV40_NEW_ARRAYS;
+}
+
+/* Store the edgeflag pointer itself (no copy); flags NV40_NEW_ARRAYS. */
+static void
+nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield)
+{
+	struct nv40_context *ctx = nv40_context(pipe);
+	ctx->draw_dirty |= NV40_NEW_ARRAYS;
+	ctx->dirty |= NV40_NEW_ARRAYS;
+	ctx->edgeflags = bitfield;
+}
+
+/* Hook the CSO create/bind/delete and set_* callbacks into nv40->pipe. */
+void
+nv40_init_state_functions(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = &nv40->pipe;
+
+	pipe->create_blend_state = nv40_blend_state_create;
+	pipe->bind_blend_state = nv40_blend_state_bind;
+	pipe->delete_blend_state = nv40_blend_state_delete;
+	pipe->create_sampler_state = nv40_sampler_state_create;
+	pipe->bind_sampler_states = nv40_sampler_state_bind;
+	pipe->delete_sampler_state = nv40_sampler_state_delete;
+	pipe->set_sampler_textures = nv40_set_sampler_texture;
+
+	pipe->create_rasterizer_state = nv40_rasterizer_state_create;
+	pipe->bind_rasterizer_state = nv40_rasterizer_state_bind;
+	pipe->delete_rasterizer_state = nv40_rasterizer_state_delete;
+
+	pipe->create_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_create;
+	pipe->bind_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_bind;
+	pipe->delete_depth_stencil_alpha_state =
+		nv40_depth_stencil_alpha_state_delete;
+
+	pipe->create_vs_state = nv40_vp_state_create;
+	pipe->bind_vs_state = nv40_vp_state_bind;
+	pipe->delete_vs_state = nv40_vp_state_delete;
+	pipe->create_fs_state = nv40_fp_state_create;
+	pipe->bind_fs_state = nv40_fp_state_bind;
+	pipe->delete_fs_state = nv40_fp_state_delete;
+
+	pipe->set_blend_color = nv40_set_blend_color;
+	pipe->set_clip_state = nv40_set_clip_state;
+	pipe->set_constant_buffer = nv40_set_constant_buffer;
+	pipe->set_framebuffer_state = nv40_set_framebuffer_state;
+	pipe->set_polygon_stipple = nv40_set_polygon_stipple;
+	pipe->set_scissor_state = nv40_set_scissor_state;
+	pipe->set_viewport_state = nv40_set_viewport_state;
+	pipe->set_edgeflags = nv40_set_edgeflags;
+	pipe->set_vertex_buffers = nv40_set_vertex_buffers;
+	pipe->set_vertex_elements = nv40_set_vertex_elements;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h
new file mode 100644
index 00000000000..9c55903ae30
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state.h
@@ -0,0 +1,91 @@
+#ifndef __NV40_STATE_H__
+#define __NV40_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+/* Hardware sampler state kept as five raw 32-bit words.
+ * NOTE(review): the names suggest format, wrap, enable, filter and border
+ * colour register values -- confirm against the code that fills them in.
+ */
+struct nv40_sampler_state {
+	uint32_t fmt;
+	uint32_t wrap;
+	uint32_t en;
+	uint32_t filt;
+	uint32_t bcol;
+};
+
+/* One element of nv40_vertex_program::insns: four 32-bit words plus
+ * per-element bookkeeping.
+ */
+struct nv40_vertex_program_exec {
+	uint32_t data[4];
+	/* NOTE(review): presumably marks an entry whose words hold a branch
+	 * target needing relocation -- confirm in nv40_vertprog.c.
+	 */
+	boolean has_branch_offset;
+	/* NOTE(review): presumably the constant this entry reads, negative
+	 * when it reads none -- confirm in nv40_vertprog.c.
+	 */
+	int const_index;
+};
+
+/* One element of nv40_vertex_program::consts: an index and a four-float
+ * value.  An index of -1 marks an immediate.
+ */
+struct nv40_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4];
+};
+
+/* Driver-side vertex shader object.  Embeds the gallium shader state it was
+ * created from and carries the draw-module shader used by the software TNL
+ * path (bound in nv40_state_validate_swtnl).
+ */
+struct nv40_vertex_program {
+	struct pipe_shader_state pipe;
+
+	/* Shader handed to draw_bind_vertex_shader() on the swtnl path. */
+	struct draw_vertex_shader *draw;
+
+	/* NOTE(review): presumably TRUE once insns/consts have been
+	 * generated from the TGSI tokens -- confirm in nv40_vertprog.c.
+	 */
+	boolean translated;
+
+	struct pipe_clip_state ucp;
+
+	/* Instruction and constant lists with their element counts. */
+	struct nv40_vertex_program_exec *insns;
+	unsigned nr_insns;
+	struct nv40_vertex_program_data *consts;
+	unsigned nr_consts;
+
+	/* NOTE(review): exec/data look like allocations in the hardware
+	 * instruction and constant spaces, with *_start their base slots --
+	 * confirm in nv40_vertprog.c.
+	 */
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	/* NOTE(review): presumably input/output register masks and the clip
+	 * control word -- confirm in nv40_vertprog.c.
+	 */
+	uint32_t ir;
+	uint32_t or;
+	uint32_t clip_ctrl;
+	struct nouveau_stateobj *so;
+};
+
+/* One element of nv40_fragment_program::consts.
+ * NOTE(review): presumably `offset` locates the constant inside the program
+ * words and `index` names the constant-buffer entry to copy there -- confirm
+ * in nv40_fragprog.c.
+ */
+struct nv40_fragment_program_data {
+	unsigned offset;
+	unsigned index;
+};
+
+/* Driver-side fragment shader object: the gallium shader state and its TGSI
+ * scan info, plus the generated program words and constant list.
+ */
+struct nv40_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	/* NOTE(review): presumably TRUE once insn[] has been generated --
+	 * confirm in nv40_fragprog.c.
+	 */
+	boolean translated;
+	/* NOTE(review): presumably a bitmask of sampler units the program
+	 * reads -- confirm in nv40_fragprog.c.
+	 */
+	unsigned samplers;
+
+	/* Program words and their count. */
+	uint32_t *insn;
+	int       insn_len;
+
+	struct nv40_fragment_program_data *consts;
+	unsigned nr_consts;
+
+	/* NOTE(review): presumably the buffer the program is uploaded to. */
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	struct nouveau_stateobj *so;
+};
+
+/* Driver-side texture: the gallium texture (first member, so the two can be
+ * cast to each other), its backing buffer and per-level layout data.
+ */
+struct nv40_miptree {
+	struct pipe_texture base;
+
+	struct pipe_buffer *buffer;
+	uint total_size;
+
+	/* NOTE(review): purpose of the shadow texture/surface is not visible
+	 * here -- confirm in nv40_miptree.c.
+	 */
+	struct pipe_texture *shadow_tex;
+	struct pipe_surface *shadow_surface;
+
+	/* One entry per mip level: a pitch and a pointer to image offsets. */
+	struct {
+		uint pitch;
+		uint *image_offset;
+	} level[PIPE_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c
new file mode 100644
index 00000000000..95e6d7394f4
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_blend.c
@@ -0,0 +1,40 @@
+#include "nv40_context.h"
+
+/* Reference the bound blend object's state object into the NV40_STATE_BLEND
+ * hardware slot.  Always reports the slot as changed.
+ */
+static boolean
+nv40_state_blend_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj **slot = &nv40->state.hw[NV40_STATE_BLEND];
+
+	so_ref(nv40->blend->so, slot);
+
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_blend_validate runs when NV40_NEW_BLEND is
+ * dirty; a TRUE result flags hardware slot NV40_STATE_BLEND for emission.
+ */
+struct nv40_state_entry nv40_state_blend = {
+	.validate = nv40_state_blend_validate,
+	.dirty = {
+		.pipe = NV40_NEW_BLEND,
+		.hw = NV40_STATE_BLEND
+	}
+};
+
+/* Build a one-method state object holding the blend colour packed as
+ * A8R8G8B8 and reference it into the NV40_STATE_BCOL hardware slot.
+ * Always reports the slot as changed.
+ */
+static boolean
+nv40_state_blend_colour_validate(struct nv40_context *nv40)
+{
+	struct pipe_blend_color *bcol = &nv40->blend_colour;
+	struct nouveau_stateobj *so;
+	unsigned a, r, g, b;
+
+	/* color[] is RGBA; the hardware word wants alpha in the top byte. */
+	a = float_to_ubyte(bcol->color[3]);
+	r = float_to_ubyte(bcol->color[0]);
+	g = float_to_ubyte(bcol->color[1]);
+	b = float_to_ubyte(bcol->color[2]);
+
+	so = so_new(2, 0);
+	so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1);
+	so_data  (so, (a << 24) | (r << 16) | (g << 8) | (b << 0));
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_BCOL]);
+
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_blend_colour_validate runs when
+ * NV40_NEW_BCOL is dirty; a TRUE result flags hardware slot NV40_STATE_BCOL
+ * for emission.
+ */
+struct nv40_state_entry nv40_state_blend_colour = {
+	.validate = nv40_state_blend_colour_validate,
+	.dirty = {
+		.pipe = NV40_NEW_BCOL,
+		.hw = NV40_STATE_BCOL
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c
new file mode 100644
index 00000000000..ab88dc416e5
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_emit.c
@@ -0,0 +1,184 @@
+#include "nv40_context.h"
+#include "nv40_state.h"
+#include "draw/draw_context.h"
+
+/* NULL-terminated list of state entries walked, in this order, by
+ * nv40_state_validate() for the hardware TNL path.  It ends with the vertex
+ * buffer entry (nv40_state_vbo).
+ */
+static struct nv40_state_entry *render_states[] = {
+	&nv40_state_framebuffer,
+	&nv40_state_rasterizer,
+	&nv40_state_scissor,
+	&nv40_state_stipple,
+	&nv40_state_fragprog,
+	&nv40_state_fragtex,
+	&nv40_state_vertprog,
+	&nv40_state_blend,
+	&nv40_state_blend_colour,
+	&nv40_state_zsa,
+	&nv40_state_viewport,
+	&nv40_state_vbo,
+	NULL
+};
+
+/* NULL-terminated list of state entries walked, in this order, by
+ * nv40_state_validate_swtnl() for the software TNL path.  Same as
+ * render_states except that the last entry is nv40_state_vtxfmt instead of
+ * nv40_state_vbo.
+ */
+static struct nv40_state_entry *swtnl_states[] = {
+	&nv40_state_framebuffer,
+	&nv40_state_rasterizer,
+	&nv40_state_scissor,
+	&nv40_state_stipple,
+	&nv40_state_fragprog,
+	&nv40_state_fragtex,
+	&nv40_state_vertprog,
+	&nv40_state_blend,
+	&nv40_state_blend_colour,
+	&nv40_state_zsa,
+	&nv40_state_viewport,
+	&nv40_state_vtxfmt,
+	NULL
+};
+
+/* Walk a NULL-terminated state list and run the validate hook of every
+ * entry whose pipe-level dirty bits intersect nv40->dirty.  Entries that
+ * report TRUE get their hardware slot bit set in nv40->state.dirty.  All
+ * bound framebuffer surfaces are marked DEFINED first, and nv40->dirty is
+ * cleared at the end.
+ */
+static void
+nv40_state_do_validate(struct nv40_context *nv40,
+		       struct nv40_state_entry **states)
+{
+	const struct pipe_framebuffer_state *fb = &nv40->framebuffer;
+	struct nv40_state_entry **iter;
+	unsigned cb;
+
+	for (cb = 0; cb < fb->num_cbufs; cb++)
+		fb->cbufs[cb]->status = PIPE_SURFACE_STATUS_DEFINED;
+	if (fb->zsbuf)
+		fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED;
+
+	for (iter = states; *iter; iter++) {
+		struct nv40_state_entry *entry = *iter;
+
+		if (!(nv40->dirty & entry->dirty.pipe))
+			continue;
+
+		if (entry->validate(nv40))
+			nv40->state.dirty |= (1ULL << entry->dirty.hw);
+	}
+
+	nv40->dirty = 0;
+}
+
+/* Push this context's pending hardware state to the channel.
+ *
+ * Every slot flagged in state->dirty has its state object referenced into
+ * the screen-wide slot table and, when non-NULL, emitted.  The dirty mask is
+ * then cleared and so_emit_reloc_markers() is called for the framebuffer,
+ * each fragment sampler flagged in fp_samplers, the fragment program and
+ * (in HW render mode only) the vertex buffers.
+ */
+void
+nv40_state_emit(struct nv40_context *nv40)
+{
+	struct nv40_state *state = &nv40->state;
+	struct nv40_screen *screen = nv40->screen;
+	unsigned i, samplers;
+	uint64 states;
+
+	/* Another context emitted last on this screen: flag every slot where
+	 * our state object differs from what the screen currently holds.
+	 */
+	if (nv40->pctx_id != screen->cur_pctx) {
+		for (i = 0; i < NV40_STATE_MAX; i++) {
+			if (state->hw[i] && screen->state[i] != state->hw[i])
+				state->dirty |= (1ULL << i);
+		}
+
+		screen->cur_pctx = nv40->pctx_id;
+	}
+
+	/* Emit dirty slots in ascending index order; `states` is a working
+	 * copy so the loop stops once the last dirty bit has been handled.
+	 */
+	for (i = 0, states = state->dirty; states; i++) {
+		if (!(states & (1ULL << i)))
+			continue;
+		so_ref (state->hw[i], &nv40->screen->state[i]);
+		if (state->hw[i])
+			so_emit(nv40->nvws, nv40->screen->state[i]);
+		states &= ~(1ULL << i);
+	}
+
+	/* NOTE(review): writing 2 then 1 to TEX_CACHE_CTL presumably flushes
+	 * the texture cache.  Only FRAGTEX0 is tested here, not the other
+	 * FRAGTEX slots -- confirm whether that is intended.
+	 */
+	if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) |
+			    (1ULL << NV40_STATE_FRAGTEX0))) {
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (2);
+		BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (1);
+	}
+
+	state->dirty = 0;
+
+	/* Reloc markers are emitted on every call, dirty or not. */
+	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]);
+	for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) {
+		if (!(samplers & (1 << i)))
+			continue;
+		so_emit_reloc_markers(nv40->nvws,
+				      state->hw[NV40_STATE_FRAGTEX0+i]);
+		samplers &= ~(1ULL << i);
+	}
+	so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]);
+	if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW)
+		so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]);
+}
+
+/* Validate state for the hardware TNL path.
+ *
+ * Returns TRUE when rendering can proceed in hardware, FALSE when a
+ * software fallback (swtnl or swrast) is still required.
+ */
+boolean
+nv40_state_validate(struct nv40_context *nv40)
+{
+	const boolean was_sw = (nv40->fallback_swtnl != 0);
+
+	if (nv40->render_mode != HW) {
+		/* Staying in software is pointless to re-evaluate unless
+		 * every state that forced swtnl has been touched since.
+		 */
+		if ((nv40->fallback_swtnl & nv40->dirty) !=
+		    nv40->fallback_swtnl) {
+			return FALSE;
+		}
+
+		/* Switch back: flush, then force re-validation of the
+		 * states that differ between the two render modes.
+		 */
+		nv40->pipe.flush(&nv40->pipe, 0, NULL);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS);
+		nv40->render_mode = HW;
+	}
+
+	nv40_state_do_validate(nv40, render_states);
+
+	if (nv40->fallback_swtnl || nv40->fallback_swrast) {
+		return FALSE;
+	}
+
+	if (was_sw) {
+		NOUVEAU_ERR("swtnl->hw\n");
+	}
+
+	return TRUE;
+}
+
+/* Validate state for the software TNL path.
+ *
+ * Forwards every draw-relevant dirty state to the draw module, then runs
+ * the swtnl state list.  Returns FALSE (leaving draw_dirty untouched) when
+ * a software-rasterisation fallback is required, TRUE otherwise.
+ */
+boolean
+nv40_state_validate_swtnl(struct nv40_context *nv40)
+{
+	struct draw_context *draw = nv40->draw;
+
+	/* Coming from hardware TNL: flush and force re-validation of the
+	 * states that differ between the two render modes.
+	 */
+	if (nv40->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl);
+		nv40->pipe.flush(&nv40->pipe, 0, NULL);
+		nv40->dirty |= (NV40_NEW_VIEWPORT |
+				NV40_NEW_VERTPROG |
+				NV40_NEW_ARRAYS);
+		nv40->render_mode = SWTNL;
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_VERTPROG) {
+		draw_bind_vertex_shader(draw, nv40->vertprog->draw);
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_RAST) {
+		draw_set_rasterizer_state(draw, &nv40->rasterizer->pipe);
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_UCP) {
+		draw_set_clip_state(draw, &nv40->clip);
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_VIEWPORT) {
+		draw_set_viewport_state(draw, &nv40->viewport);
+	}
+
+	if (nv40->draw_dirty & NV40_NEW_ARRAYS) {
+		draw_set_edgeflags(draw, nv40->edgeflags);
+		draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf);
+		draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt);
+	}
+
+	nv40_state_do_validate(nv40, swtnl_states);
+
+	if (nv40->fallback_swrast) {
+		NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast);
+		return FALSE;
+	}
+
+	nv40->draw_dirty = 0;
+	return TRUE;
+}
+
diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c
new file mode 100644
index 00000000000..28592d71c37
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_fb.c
@@ -0,0 +1,155 @@
+#include "nv40_context.h"
+#include "nouveau/nouveau_util.h"
+
+/* Build the framebuffer state object: render-target enables, DMA objects,
+ * offsets and pitches for up to four colour buffers and the depth/stencil
+ * buffer, the surface format word and the window/clip rectangles.
+ *
+ * Every bound colour buffer is enabled (all must share one format), so MRT
+ * is switched on whenever more than one is bound.  With no colour buffer
+ * bound the surface layout is taken from the depth/stencil surface, or is
+ * linear when nothing is bound at all.
+ *
+ * Always returns TRUE so the caller flags NV40_STATE_FB for emission.
+ */
+static boolean
+nv40_state_framebuffer_validate(struct nv40_context *nv40)
+{
+	struct pipe_framebuffer_state *fb = &nv40->framebuffer;
+	struct pipe_surface *rt[4] = { NULL, NULL, NULL, NULL };
+	struct pipe_surface *zeta = NULL, *layout;
+	uint32_t rt_enable, rt_format;
+	unsigned i, nr_cbufs = fb->num_cbufs;
+	int colour_format = 0, zeta_format = 0;
+	struct nouveau_stateobj *so = so_new(64, 10);
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+
+	/* The hardware (and rt[]) only has room for four colour targets. */
+	assert(nr_cbufs <= 4);
+	if (nr_cbufs > 4)
+		nr_cbufs = 4;
+
+	rt_enable = 0;
+	for (i = 0; i < nr_cbufs; i++) {
+		if (colour_format) {
+			assert(colour_format == fb->cbufs[i]->format);
+		} else {
+			colour_format = fb->cbufs[i]->format;
+		}
+
+		/* Enable and record every bound target, not just the first */
+		rt_enable |= (NV40TCL_RT_ENABLE_COLOR0 << i);
+		rt[i] = fb->cbufs[i];
+	}
+
+	if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 |
+			 NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV40TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		zeta = fb->zsbuf;
+	}
+
+	/* Surface whose texture decides between swizzled and linear. */
+	layout = rt[0] ? rt[0] : zeta;
+
+	if (layout &&
+	    !(layout->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+		/* Swizzled targets must be power-of-two sized, and all colour
+		 * targets must agree on the layout.
+		 */
+		assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+		for (i = 1; i < nr_cbufs; i++)
+			assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR));
+
+		rt_format = NV40TCL_RT_FORMAT_TYPE_SWIZZLED |
+		            (log2i(fb->width) << NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+		            (log2i(fb->height) << NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+	}
+	else
+		rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR;
+
+	/* A format of 0 means "nothing bound"; a default is still encoded. */
+	switch (colour_format) {
+	case PIPE_FORMAT_A8R8G8B8_UNORM:
+	case 0:
+		rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_R5G6B5_UNORM:
+		rt_format |= NV40TCL_RT_FORMAT_COLOR_R5G6B5;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z16;
+		break;
+	case PIPE_FORMAT_Z24S8_UNORM:
+	case 0:
+		rt_format |= NV40TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	/* COLOR0 is programmed pitch-then-offset, COLOR1 offset-then-pitch,
+	 * COLOR2/3 through separate offset and pitch methods.
+	 */
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1);
+		so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2);
+		so_data  (so, rt[0]->stride);
+		so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1);
+		so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_data  (so, rt[1]->stride);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1);
+		so_reloc (so, rt[2]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1);
+		so_reloc (so, rt[2]->buffer, rt[2]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1);
+		so_data  (so, rt[2]->stride);
+	}
+
+	if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1);
+		so_reloc (so, rt[3]->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1);
+		so_reloc (so, rt[3]->buffer, rt[3]->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1);
+		so_data  (so, rt[3]->stride);
+	}
+
+	if (zeta_format) {
+		so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1);
+		so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR,
+			  nv40->nvws->channel->vram->handle,
+			  nv40->nvws->channel->gart->handle);
+		so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1);
+		so_reloc (so, zeta->buffer, zeta->offset, rt_flags |
+			  NOUVEAU_BO_LOW, 0, 0);
+		so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1);
+		so_data  (so, zeta->stride);
+	}
+
+	so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1);
+	so_data  (so, rt_enable);
+	so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_data  (so, rt_format);
+	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_data  (so, ((w - 1) << 16) | 0);
+	so_data  (so, ((h - 1) << 16) | 0);
+	/* NOTE(review): method 0x1d88 is undocumented here; it is given the
+	 * framebuffer height with bit 12 set.
+	 */
+	so_method(so, nv40->screen->curie, 0x1d88, 1);
+	so_data  (so, (1 << 12) | h);
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_FB]);
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_framebuffer_validate runs when NV40_NEW_FB
+ * is dirty; a TRUE result flags hardware slot NV40_STATE_FB for emission.
+ */
+struct nv40_state_entry nv40_state_framebuffer = {
+	.validate = nv40_state_framebuffer_validate,
+	.dirty = {
+		.pipe = NV40_NEW_FB,
+		.hw = NV40_STATE_FB
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_rasterizer.c b/src/gallium/drivers/nv40/nv40_state_rasterizer.c
new file mode 100644
index 00000000000..9ecda5990f0
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_rasterizer.c
@@ -0,0 +1,17 @@
+#include "nv40_context.h"
+
+/* Reference the bound rasterizer object's state object into the
+ * NV40_STATE_RAST hardware slot.  Always reports the slot as changed.
+ */
+static boolean
+nv40_state_rasterizer_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj **slot = &nv40->state.hw[NV40_STATE_RAST];
+
+	so_ref(nv40->rasterizer->so, slot);
+
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_rasterizer_validate runs when NV40_NEW_RAST
+ * is dirty; a TRUE result flags hardware slot NV40_STATE_RAST for emission.
+ */
+struct nv40_state_entry nv40_state_rasterizer = {
+	.validate = nv40_state_rasterizer_validate,
+	.dirty = {
+		.pipe = NV40_NEW_RAST,
+		.hw = NV40_STATE_RAST
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c
new file mode 100644
index 00000000000..285239ef419
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_scissor.c
@@ -0,0 +1,35 @@
+#include "nv40_context.h"
+
+/* Build the scissor rectangle state object.
+ *
+ * With scissoring enabled in the rasterizer state the bound scissor
+ * rectangle is programmed; otherwise a 4096x4096 rectangle at the origin is
+ * used.  Returns FALSE without touching anything when a scissor object
+ * already exists and scissoring was, and remains, disabled.
+ */
+static boolean
+nv40_state_scissor_validate(struct nv40_context *nv40)
+{
+	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe;
+	struct pipe_scissor_state *s = &nv40->scissor;
+	struct nouveau_stateobj *so;
+	uint32_t horiz, vert;
+
+	if (nv40->state.hw[NV40_STATE_SCISSOR] &&
+	    !rast->scissor && !nv40->state.scissor_enabled)
+		return FALSE;
+
+	nv40->state.scissor_enabled = rast->scissor;
+
+	/* Each word is (extent << 16) | origin. */
+	if (nv40->state.scissor_enabled) {
+		horiz = ((s->maxx - s->minx) << 16) | s->minx;
+		vert  = ((s->maxy - s->miny) << 16) | s->miny;
+	} else {
+		horiz = 4096 << 16;
+		vert  = 4096 << 16;
+	}
+
+	so = so_new(3, 0);
+	so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2);
+	so_data  (so, horiz);
+	so_data  (so, vert);
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_SCISSOR]);
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_scissor_validate runs when either
+ * NV40_NEW_SCISSOR or NV40_NEW_RAST is dirty (the enable bit lives in the
+ * rasterizer state); a TRUE result flags hardware slot NV40_STATE_SCISSOR.
+ */
+struct nv40_state_entry nv40_state_scissor = {
+	.validate = nv40_state_scissor_validate,
+	.dirty = {
+		.pipe = NV40_NEW_SCISSOR | NV40_NEW_RAST,
+		.hw = NV40_STATE_SCISSOR
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c
new file mode 100644
index 00000000000..b51024ad9b2
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_stipple.c
@@ -0,0 +1,39 @@
+#include "nv40_context.h"
+
+/* Build the polygon stipple state object.
+ *
+ * With stippling enabled in the rasterizer state the enable flag and the 32
+ * pattern words are programmed; otherwise only the enable flag is cleared.
+ * Returns FALSE without touching anything when a stipple object already
+ * exists and stippling was, and remains, disabled.
+ */
+static boolean
+nv40_state_stipple_validate(struct nv40_context *nv40)
+{
+	struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nouveau_stateobj *so;
+
+	if (nv40->state.hw[NV40_STATE_STIPPLE] &&
+	   (rast->poly_stipple_enable == 0 && nv40->state.stipple_enabled == 0))
+		return FALSE;
+
+	/* Remember what is being programmed, as the scissor validator does
+	 * for its enable.  Without this the early-out above always sees
+	 * "was disabled" and skips the enabled -> disabled transition,
+	 * leaving stippling switched on in hardware.
+	 */
+	nv40->state.stipple_enabled = rast->poly_stipple_enable;
+
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+
+		/* 2 words for the enable + 1 method header + 32 pattern words */
+		so = so_new(35, 0);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+		for (i = 0; i < 32; i++)
+			so_data(so, nv40->stipple[i]);
+	} else {
+		so = so_new(2, 0);
+		so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_STIPPLE]);
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_stipple_validate runs when either
+ * NV40_NEW_STIPPLE or NV40_NEW_RAST is dirty (the enable bit lives in the
+ * rasterizer state); a TRUE result flags hardware slot NV40_STATE_STIPPLE.
+ */
+struct nv40_state_entry nv40_state_stipple = {
+	.validate = nv40_state_stipple_validate,
+	.dirty = {
+		.pipe = NV40_NEW_STIPPLE | NV40_NEW_RAST,
+		.hw = NV40_STATE_STIPPLE,
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c
new file mode 100644
index 00000000000..869a55b4053
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_viewport.c
@@ -0,0 +1,67 @@
+#include "nv40_context.h"
+
+/* Build the viewport transform state object.
+ *
+ * In hardware render mode with clipping not bypassed, the bound viewport's
+ * translate/scale are programmed.  Otherwise ("bypass") an identity
+ * transform is programmed.  Returns FALSE when an object already exists,
+ * the bypass mode is unchanged, and either bypass is on or the viewport
+ * itself is not dirty.
+ */
+static boolean
+nv40_state_viewport_validate(struct nv40_context *nv40)
+{
+	struct pipe_viewport_state *vpt = &nv40->viewport;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nouveau_stateobj *so;
+	unsigned bypass, ctl, c;
+
+	bypass = !(nv40->render_mode == HW &&
+		   !nv40->rasterizer->pipe.bypass_clipping);
+
+	if (nv40->state.hw[NV40_STATE_VIEWPORT] &&
+	    (bypass || !(nv40->dirty & NV40_NEW_VIEWPORT)) &&
+	    nv40->state.viewport_bypass == bypass)
+		return FALSE;
+	nv40->state.viewport_bypass = bypass;
+
+	so = so_new(11, 0);
+
+	/* Four translate words followed by four scale words. */
+	so_method(so, curie, NV40TCL_VIEWPORT_TRANSLATE_X, 8);
+	if (bypass) {
+		for (c = 0; c < 4; c++)
+			so_data(so, fui(0.0));
+		for (c = 0; c < 3; c++)
+			so_data(so, fui(1.0));
+		so_data(so, fui(0.0));
+
+		/* The meaning of this value is not fully understood.  The
+		 * DDX uses it too, because it fixes rendering when
+		 * pre-transformed vertices are passed to the GPU.  The best
+		 * guess is that it bypasses some culling/clipping stage.
+		 * Points and lines are unaffected by whatever it fixes; only
+		 * filled polygons are affected.
+		 */
+		ctl = 0x110;
+	} else {
+		for (c = 0; c < 4; c++)
+			so_data(so, fui(vpt->translate[c]));
+		for (c = 0; c < 4; c++)
+			so_data(so, fui(vpt->scale[c]));
+
+		ctl = 1;
+	}
+
+	so_method(so, curie, 0x1d78, 1);
+	so_data  (so, ctl);
+
+	so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]);
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_viewport_validate runs when either
+ * NV40_NEW_VIEWPORT or NV40_NEW_RAST is dirty (bypass_clipping lives in the
+ * rasterizer state); a TRUE result flags hardware slot NV40_STATE_VIEWPORT.
+ */
+struct nv40_state_entry nv40_state_viewport = {
+	.validate = nv40_state_viewport_validate,
+	.dirty = {
+		.pipe = NV40_NEW_VIEWPORT | NV40_NEW_RAST,
+		.hw = NV40_STATE_VIEWPORT
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_state_zsa.c b/src/gallium/drivers/nv40/nv40_state_zsa.c
new file mode 100644
index 00000000000..fb760677c88
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_state_zsa.c
@@ -0,0 +1,17 @@
+#include "nv40_context.h"
+
+/* Reference the bound depth/stencil/alpha object's state object into the
+ * NV40_STATE_ZSA hardware slot.  Always reports the slot as changed.
+ */
+static boolean
+nv40_state_zsa_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj **slot = &nv40->state.hw[NV40_STATE_ZSA];
+
+	so_ref(nv40->zsa->so, slot);
+
+	return TRUE;
+}
+
+/* State-table entry: nv40_state_zsa_validate runs when NV40_NEW_ZSA is
+ * dirty; a TRUE result flags hardware slot NV40_STATE_ZSA for emission.
+ */
+struct nv40_state_entry nv40_state_zsa = {
+	.validate = nv40_state_zsa_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ZSA,
+		.hw = NV40_STATE_ZSA
+	}
+};
diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c
new file mode 100644
index 00000000000..576af7c59ee
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_surface.c
@@ -0,0 +1,77 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "nv40_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_tile.h"
+
+/* Copy a width x height rectangle between surfaces through the winsys.
+ *
+ * When do_flip is set the rectangle is copied one row at a time, source
+ * rows ascending from srcy and destination rows descending from
+ * desty + height.
+ *
+ * XXX: the row-by-row flip is a stopgap for correctness.  Investigate
+ *      whether the 2D engine can flip (perhaps SIFM?); if not, use the 3D
+ *      engine.
+ * NOTE(review): the first flipped row lands on desty + height rather than
+ *      desty + height - 1 -- confirm this is the intended destination.
+ */
+static void
+nv40_surface_copy(struct pipe_context *pipe, boolean do_flip,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	struct nouveau_winsys *nvws = nv40_context(pipe)->nvws;
+	unsigned row;
+
+	if (!do_flip) {
+		nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy,
+				   width, height);
+		return;
+	}
+
+	for (row = 0; row < height; row++) {
+		nvws->surface_copy(nvws, dest, destx, desty + height - row,
+				   src, srcx, srcy + row, width, 1);
+	}
+}
+
+/* Fill a width x height rectangle of a surface with a raw value by
+ * forwarding the request unchanged to the winsys.
+ */
+static void
+nv40_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	struct nouveau_winsys *nvws = nv40_context(pipe)->nvws;
+
+	nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
+}
+
+/* Install the surface copy/fill hooks on the context's pipe interface. */
+void
+nv40_init_surface_functions(struct nv40_context *nv40)
+{
+	struct pipe_context *pipe = &nv40->pipe;
+
+	pipe->surface_copy = nv40_surface_copy;
+	pipe->surface_fill = nv40_surface_fill;
+}
diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c
new file mode 100644
index 00000000000..09f6e79d32a
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_vbo.c
@@ -0,0 +1,555 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+#define FORCE_SWTNL 0
+
+/* Translate a gallium vertex format into the hardware component type and
+ * component count.
+ *
+ * Returns 0 and fills *fmt / *ncomp on success.  For an unsupported format
+ * an error is logged, 1 is returned and neither output is written.
+ */
+static INLINE int
+nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	unsigned type, comps;
+
+	switch (pipe) {
+	case PIPE_FORMAT_R32_FLOAT:
+		type = NV40TCL_VTXFMT_TYPE_FLOAT;
+		comps = 1;
+		break;
+	case PIPE_FORMAT_R32G32_FLOAT:
+		type = NV40TCL_VTXFMT_TYPE_FLOAT;
+		comps = 2;
+		break;
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+		type = NV40TCL_VTXFMT_TYPE_FLOAT;
+		comps = 3;
+		break;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+		type = NV40TCL_VTXFMT_TYPE_FLOAT;
+		comps = 4;
+		break;
+	case PIPE_FORMAT_R8_UNORM:
+		type = NV40TCL_VTXFMT_TYPE_UBYTE;
+		comps = 1;
+		break;
+	case PIPE_FORMAT_R8G8_UNORM:
+		type = NV40TCL_VTXFMT_TYPE_UBYTE;
+		comps = 2;
+		break;
+	case PIPE_FORMAT_R8G8B8_UNORM:
+		type = NV40TCL_VTXFMT_TYPE_UBYTE;
+		comps = 3;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		type = NV40TCL_VTXFMT_TYPE_UBYTE;
+		comps = 4;
+		break;
+	case PIPE_FORMAT_R16_SSCALED:
+		type = NV40TCL_VTXFMT_TYPE_USHORT;
+		comps = 1;
+		break;
+	case PIPE_FORMAT_R16G16_SSCALED:
+		type = NV40TCL_VTXFMT_TYPE_USHORT;
+		comps = 2;
+		break;
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+		type = NV40TCL_VTXFMT_TYPE_USHORT;
+		comps = 3;
+		break;
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+		type = NV40TCL_VTXFMT_TYPE_USHORT;
+		comps = 4;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe));
+		return 1;
+	}
+
+	*fmt = type;
+	*ncomp = comps;
+	return 0;
+}
+
+/* Record the index buffer to be used by the hardware index path.
+ *
+ * Returns TRUE when the hardware can fetch indices from `ib` directly
+ * (the screen reports NOUVEAU_CAP_HW_IDXBUF and indices are 2 or 4 bytes);
+ * NV40_NEW_ARRAYS is flagged when the buffer or its format changed.
+ * Returns FALSE otherwise; a NULL `ib` additionally clears the recorded
+ * buffer and poisons the recorded format.
+ */
+static boolean
+nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib,
+		    unsigned ib_size)
+{
+	struct pipe_screen *pscreen = &nv40->screen->pipe;
+	unsigned type;
+
+	if (!ib) {
+		nv40->idxbuf = NULL;
+		nv40->idxbuf_format = 0xdeadbeef;
+		return FALSE;
+	}
+
+	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF))
+		return FALSE;
+
+	/* Byte-sized (and any other) index sizes are not handled here. */
+	if (ib_size == 2)
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U16;
+	else if (ib_size == 4)
+		type = NV40TCL_IDXBUF_FORMAT_TYPE_U32;
+	else
+		return FALSE;
+
+	if (ib == nv40->idxbuf && type == nv40->idxbuf_format)
+		return TRUE;
+
+	nv40->dirty |= NV40_NEW_ARRAYS;
+	nv40->idxbuf = ib;
+	nv40->idxbuf_format = type;
+	return TRUE;
+}
+
+/* Emit a constant (zero-pitch) vertex attribute as an immediate value.
+ *
+ * Reads one element from the vertex buffer and appends the matching
+ * VTX_ATTR_nF method to `so`.  Only float formats with 1-4 components are
+ * handled.
+ *
+ * Returns TRUE when the attribute was emitted.  Returns FALSE -- with
+ * nothing appended to `so` -- for unsupported formats or when the buffer
+ * cannot be mapped.
+ */
+static boolean
+nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+{
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	unsigned type, ncomp;
+	boolean emitted = TRUE;
+	uint8_t *map;
+	float *v;
+
+	if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp))
+		return FALSE;
+
+	/* Reject non-float data before paying for a buffer mapping. */
+	if (type != NV40TCL_VTXFMT_TYPE_FLOAT)
+		return FALSE;
+
+	map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map)
+		return FALSE;
+
+	/* Byte-pointer arithmetic: offsets are in bytes and arithmetic on
+	 * void pointers is a compiler extension.
+	 */
+	v = (float *)(map + vb->buffer_offset + ve->src_offset);
+
+	switch (ncomp) {
+	case 4:
+		so_method(so, curie, NV40TCL_VTX_ATTR_4F_X(attrib), 4);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		so_data  (so, fui(v[3]));
+		break;
+	case 3:
+		so_method(so, curie, NV40TCL_VTX_ATTR_3F_X(attrib), 3);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		so_data  (so, fui(v[2]));
+		break;
+	case 2:
+		so_method(so, curie, NV40TCL_VTX_ATTR_2F_X(attrib), 2);
+		so_data  (so, fui(v[0]));
+		so_data  (so, fui(v[1]));
+		break;
+	case 1:
+		so_method(so, curie, NV40TCL_VTX_ATTR_1F(attrib), 1);
+		so_data  (so, fui(v[0]));
+		break;
+	default:
+		emitted = FALSE;
+		break;
+	}
+
+	ws->buffer_unmap(ws, vb->buffer);
+
+	return emitted;
+}
+
+/* Draw `count` non-indexed vertices starting at `start`.
+ *
+ * Falls back to nv40_draw_elements_swtnl() when FORCE_SWTNL is set or
+ * hardware state validation fails.  Otherwise the range is split into
+ * pieces by nouveau_vbuf_split() and each piece is emitted as its own
+ * BEGIN_END pair; the pipe is flushed at the end.
+ */
+boolean
+nv40_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_channel *chan = nv40->nvws->channel;
+	unsigned restart;
+
+	/* Non-indexed draw: drop any recorded index buffer. */
+	nv40_vbo_set_idxbuf(nv40, NULL, 0);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		return nv40_draw_elements_swtnl(pipe, NULL, 0,
+						mode, start, count);
+	}
+
+	while (count) {
+		unsigned vc, nr;
+
+		nv40_state_emit(nv40);
+
+		/* vc is the number of vertices handled in this pass; 0 means
+		 * nothing fits, so fire the ring and retry.
+		 */
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		/* Leading partial batch: vc modulo 256 vertices. */
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		/* Remaining full batches of 256 vertices, at most 2047 batch
+		 * words per packet.
+		 */
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+/* Draw with 8-bit indices read from the mapped index buffer `ib`, pushing
+ * the index values inline through the ring.  The range is split into pieces
+ * by nouveau_vbuf_split(); each piece is its own BEGIN_END pair.
+ */
+static INLINE void
+nv40_draw_elements_u08(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		/* vc == 0 means nothing fits: fire the ring and retry. */
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		/* Odd count: send the first index alone so the remainder can
+		 * be packed two per word.
+		 */
+		if (vc & 1) {
+			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		/* Two indices per word, at most 2047 words per packet. */
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING((elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+/* Draw with 16-bit indices read from the mapped index buffer `ib`, pushing
+ * the index values inline through the ring.  The range is split into pieces
+ * by nouveau_vbuf_split(); each piece is its own BEGIN_END pair.
+ */
+static INLINE void
+nv40_draw_elements_u16(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		/* vc == 0 means nothing fits: fire the ring and retry. */
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		/* Odd count: send the first index alone so the remainder can
+		 * be packed two per word.
+		 */
+		if (vc & 1) {
+			BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (elts[0]);
+			elts++; vc--;
+		}
+
+		/* Two indices per word, at most 2047 words per packet. */
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1);
+			/* Widen before shifting: a uint16_t promotes to int,
+			 * and shifting an index >= 0x8000 left by 16 would
+			 * overflow that signed int.
+			 */
+			for (i = 0; i < push; i+=2)
+				OUT_RING(((uint32_t)elts[i+1] << 16) | elts[i]);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+/* Draw with 32-bit indices read from the mapped index buffer `ib`, pushing
+ * the index values inline through the ring.  The range is split into pieces
+ * by nouveau_vbuf_split(); each piece is its own BEGIN_END pair.
+ */
+static INLINE void
+nv40_draw_elements_u32(struct nv40_context *nv40, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nouveau_channel *chan = nv40->nvws->channel;
+
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start;
+		unsigned vc, push, restart;
+
+		nv40_state_emit(nv40);
+
+		/* vc == 0 means nothing fits: fire the ring and retry. */
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		/* One index per word, at most 2047 words per packet. */
+		while (vc) {
+			push = MIN2(vc, 2047);
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (elts, push);
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		start = restart;
+	}
+}
+
+/* Indexed draw that pushes the index values inline through the ring.
+ *
+ * Maps the index buffer for CPU reads and dispatches on the index size
+ * (1, 2 or 4 bytes).  An unsupported index size is logged and nothing is
+ * drawn.
+ *
+ * Returns FALSE when the index buffer cannot be mapped, TRUE otherwise.
+ */
+static boolean
+nv40_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct pipe_winsys *ws = pipe->winsys;
+	void *map;
+
+	map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	/* Test the mapping itself, not the buffer handle that was passed in:
+	 * a failed map must not reach the draw helpers below.
+	 */
+	if (!map) {
+		NOUVEAU_ERR("failed mapping ib\n");
+		return FALSE;
+	}
+
+	switch (ib_size) {
+	case 1:
+		nv40_draw_elements_u08(nv40, map, mode, start, count);
+		break;
+	case 2:
+		nv40_draw_elements_u16(nv40, map, mode, start, count);
+		break;
+	case 4:
+		nv40_draw_elements_u32(nv40, map, mode, start, count);
+		break;
+	default:
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	ws->buffer_unmap(ws, ib);
+	return TRUE;
+}
+
+/* Indexed draw where the hardware fetches indices from the index buffer
+ * recorded by nv40_vbo_set_idxbuf().  `start`/`count` are index positions.
+ * The range is split into pieces by nouveau_vbuf_split(); each piece is its
+ * own BEGIN_END pair.  Always returns TRUE.
+ */
+static boolean
+nv40_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	struct nouveau_channel *chan = nv40->nvws->channel;
+	unsigned restart;
+
+	while (count) {
+		unsigned nr, vc;
+
+		nv40_state_emit(nv40);
+
+		/* vc == 0 means nothing fits: fire the ring and retry. */
+		vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256,
+					mode, start, count, &restart);
+		if (!vc) {
+			FIRE_RING(NULL);
+			continue;
+		}
+		
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (nvgl_primitive(mode));
+
+		/* Leading partial batch: vc modulo 256 indices. */
+		nr = (vc & 0xff);
+		if (nr) {
+			BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (((nr - 1) << 24) | start);
+			start += nr;
+		}
+
+		/* Remaining full batches of 256 indices, at most 2047 batch
+		 * words per packet.
+		 */
+		nr = vc >> 8;
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr;
+
+			nr -= push;
+
+			BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push);
+			while (push--) {
+				OUT_RING(((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(curie, NV40TCL_BEGIN_END, 1);
+		OUT_RING  (0);
+
+		count -= vc;
+		start = restart;
+	}
+
+	return TRUE;
+}
+
+/* Draw `count` indexed vertices.
+ *
+ * Falls back to nv40_draw_elements_swtnl() when FORCE_SWTNL is set or
+ * hardware state validation fails.  Otherwise the indices are fetched by
+ * the hardware when nv40_vbo_set_idxbuf() accepted the buffer, or pushed
+ * inline through the ring when it did not; the pipe is flushed at the end.
+ */
+boolean
+nv40_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nv40_context *nv40 = nv40_context(pipe);
+	boolean idxbuf;
+
+	idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize);
+	if (FORCE_SWTNL || !nv40_state_validate(nv40)) {
+		/* Hand the index buffer to the fallback: passing NULL here
+		 * would turn an indexed draw into a non-indexed one.
+		 */
+		return nv40_draw_elements_swtnl(pipe, indexBuffer, indexSize,
+						mode, start, count);
+	}
+
+	if (idxbuf) {
+		nv40_draw_elements_vbo(pipe, mode, start, count);
+	} else {
+		nv40_draw_elements_inline(pipe, indexBuffer, indexSize,
+					  mode, start, count);
+	}
+
+	pipe->flush(pipe, 0, NULL);
+	return TRUE;
+}
+
+/* Build the vertex buffer, vertex format and static attribute state
+ * objects for the hardware TNL path.
+ *
+ * Requests a swtnl fallback (NV40_NEW_ARRAYS in fallback_swtnl) when edge
+ * flags are in use or a vertex element has an unsupported format.
+ *
+ * Always returns FALSE: this entry owns three hardware slots (VTXBUF,
+ * VTXFMT, VTXATTR) and flags them dirty itself instead of relying on the
+ * single slot a state-table entry can name.
+ */
+static boolean
+nv40_vbo_validate(struct nv40_context *nv40)
+{
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct pipe_buffer *ib = nv40->idxbuf;
+	unsigned ib_format = nv40->idxbuf_format;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	int hw;
+
+	if (nv40->edgeflags) {
+		nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
+		return FALSE;
+	}
+
+	/* NOTE(review): with 16 vertex elements plus an index buffer this
+	 * object appears to receive 22 words (1 + 16 + 3 + 2) against the 20
+	 * requested here -- confirm against so_new()/so_reloc() accounting.
+	 */
+	vtxbuf = so_new(20, 18);
+	so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr);
+	vtxfmt = so_new(17, 0);
+	so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr);
+
+	for (hw = 0; hw < nv40->vtxelt_nr; hw++) {
+		struct pipe_vertex_element *ve;
+		struct pipe_vertex_buffer *vb;
+		unsigned type, ncomp;
+
+		ve = &nv40->vtxelt[hw];
+		vb = &nv40->vtxbuf[ve->vertex_buffer_index];
+
+		/* Zero pitch means a constant attribute: try to emit it as
+		 * an immediate and leave the array slot empty.
+		 */
+		if (!vb->pitch) {
+			if (!sattr)
+				sattr = so_new(16 * 5, 0);
+
+			if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) {
+				so_data(vtxbuf, 0);
+				so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT);
+				continue;
+			}
+		}
+
+		if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
+			nv40->fallback_swtnl |= NV40_NEW_ARRAYS;
+			so_ref(NULL, &vtxbuf);
+			so_ref(NULL, &vtxfmt);
+			/* Also drop the static attribute object, which may
+			 * have been created for an earlier element.
+			 */
+			so_ref(NULL, &sattr);
+			return FALSE;
+		}
+
+		so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset,
+			 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+			 0, NV40TCL_VTXBUF_ADDRESS_DMA1);
+		so_data (vtxfmt, ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) |
+				  (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type));
+	}
+
+	if (ib) {
+		so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2);
+		so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR,
+			  0, NV40TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	/* NOTE(review): method 0x1710 is undocumented here. */
+	so_method(vtxbuf, curie, 0x1710, 1);
+	so_data  (vtxbuf, 0);
+
+	so_ref(vtxbuf, &nv40->state.hw[NV40_STATE_VTXBUF]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXBUF);
+	so_ref(vtxfmt, &nv40->state.hw[NV40_STATE_VTXFMT]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXFMT);
+	so_ref(sattr, &nv40->state.hw[NV40_STATE_VTXATTR]);
+	nv40->state.dirty |= (1ULL << NV40_STATE_VTXATTR);
+	return FALSE;
+}
+
+/* State-table entry: nv40_vbo_validate runs when NV40_NEW_ARRAYS is dirty.
+ * The validator always returns FALSE and flags its hardware slots itself,
+ * so the `hw` value here is never used to set a dirty bit.
+ */
+struct nv40_state_entry nv40_state_vbo = {
+	.validate = nv40_vbo_validate,
+	.dirty = {
+		.pipe = NV40_NEW_ARRAYS,
+		.hw = 0,
+	}
+};
+
diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c
new file mode 100644
index 00000000000..ff988e6a5f4
--- /dev/null
+++ b/src/gallium/drivers/nv40/nv40_vertprog.c
@@ -0,0 +1,1070 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nv40_context.h"
+#include "nv40_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#define SWZ_X 0	/* source swizzle selectors, stored in nv40_sreg.swz[] */
+#define SWZ_Y 1
+#define SWZ_Z 2
+#define SWZ_W 3
+#define MASK_X 8	/* destination write-mask bits: X is the high bit, W the low bit */
+#define MASK_Y 4
+#define MASK_Z 2
+#define MASK_W 1
+#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W)
+#define DEF_SCALE 0	/* NOTE(review): presumably consumed by nv40_shader.h below - confirm */
+#define DEF_CTEST 0
+#include "nv40_shader.h"
+
+#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w)	/* e.g. swz(r, X, X, X, X) */
+#define neg(s) nv40_sr_neg((s))
+#define abs(s) nv40_sr_abs((s))	/* NOTE: also replaces any libc abs() call made after this point */
+
+#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))	/* pseudo output indices -7..-2 for n = 0..5; emit_dst() remaps them */
+
+struct nv40_vpc {	/* scratch state for one translation; allocated and freed in nv40_vertprog_translate() */
+	struct nv40_vertex_program *vp;	/* program being translated */
+
+	struct nv40_vertex_program_exec *vpi;	/* instruction most recently started by nv40_vp_arith() */
+
+	unsigned r_temps;	/* bitmask of temp registers currently in use */
+	unsigned r_temps_discard;	/* temps that the next release_temps() will free */
+	struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];	/* TGSI output index -> destination register */
+	struct nv40_sreg *r_address;	/* TGSI address register index -> temp */
+	struct nv40_sreg *r_temp;	/* TGSI temporary index -> temp */
+
+	struct nv40_sreg *imm;	/* TGSI immediate index -> constant register */
+	unsigned nr_imm;	/* number of imm[] entries filled so far */
+
+	unsigned hpos_idx;	/* TGSI output index declared with the POSITION semantic */
+};
+
+static struct nv40_sreg
+temp(struct nv40_vpc *vpc)	/* allocate the lowest free temp; it is freed again by release_temps() */
+{
+	int idx = ffs(~vpc->r_temps) - 1;	/* ffs() is 1-based and returns 0 when no bit is free */
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nv40_sr(NV40SR_TEMP, 0);	/* non-debug fallback: aliases temp 0 */
+	}
+
+	vpc->r_temps |= (1u << idx);	/* unsigned shift: idx can be 31 */
+	vpc->r_temps_discard |= (1u << idx);
+	return nv40_sr(NV40SR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nv40_vpc *vpc)	/* free every temp recorded in r_temps_discard */
+{
+	vpc->r_temps &= ~vpc->r_temps_discard;
+	vpc->r_temps_discard = 0;	/* temps whose discard bit was cleared earlier stay allocated */
+}
+
+static struct nv40_sreg	/* find or add an entry in vp->consts[] and return it as a CONST register */
+constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w)
+{
+	struct nv40_vertex_program *vp = vpc->vp;
+	struct nv40_vertex_program_data *vpd;
+	int idx;
+	/* pipe >= 0 names a constant-buffer slot: reuse its entry if one exists. */
+	if (pipe >= 0) {
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nv40_sr(NV40SR_CONST, idx);
+		}
+	}
+	/* Otherwise append an entry; callers in this file pass pipe == -1 for inline values. */
+	idx = vp->nr_consts++;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts);	/* NOTE(review): result is not checked */
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nv40_sr(NV40SR_CONST, idx);	/* index into vp->consts[]; the hw slot offset is added in nv40_vertprog_validate() */
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2))	/* cc: nv40_vpc *, s: slot, o: opcode suffix such as OP_MOV */
+
+static void
+emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src)
+{	/* Encode source operand 'src' as operand number 'pos' (0..2) of instruction words hw[0..3]. */
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0;
+	/* Select the register file; 'sr' collects the per-operand bits. */
+	switch (src.type) {
+	case NV40SR_TEMP:
+		sr |= (NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT);
+		sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT);
+		break;
+	case NV40SR_INPUT:
+		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		vp->ir |= (1 << src.index);	/* remember which inputs the program reads */
+		hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT);	/* input index goes into hw[1], not into 'sr' */
+		break;
+	case NV40SR_CONST:
+		sr |= (NV40_VP_SRC_REG_TYPE_CONST <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index);	/* at most one distinct constant per instruction */
+		vpc->vpi->const_index = src.index;	/* written into data[1] later by nv40_vertprog_validate() */
+		break;
+	case NV40SR_NONE:	/* unused operand: encoded with the INPUT type and no index */
+		sr |= (NV40_VP_SRC_REG_TYPE_INPUT <<
+		       NV40_VP_SRC_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NV40_VP_SRC_NEGATE;
+
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos));	/* abs flags are bits 21..23 of hw[0], one per operand */
+
+	sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) |
+	       (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT));
+	/* Scatter 'sr' into the instruction; operands 0 and 2 straddle two words. */
+	switch (pos) {
+	case 0:
+		hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >>
+			  NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT;
+		hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) <<
+			  NV40_VP_INST_SRC0L_SHIFT;
+		break;
+	case 1:
+		hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT;
+		break;
+	case 2:
+		hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >>
+			  NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT;
+		hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) <<
+			  NV40_VP_INST_SRC2L_SHIFT;
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst)
+{	/* Encode destination 'dst' into hw[0..3]; slot 0 uses the VEC fields, any other slot the SCA fields. */
+	struct nv40_vertex_program *vp = vpc->vp;
+	/* Only TEMP and OUTPUT destinations are accepted. */
+	switch (dst.type) {
+	case NV40SR_TEMP:
+		hw[3] |= NV40_VP_INST_DEST_MASK;	/* NOTE(review): presumably marks "no output register" - confirm */
+		if (slot == 0) {
+			hw[0] |= (dst.index <<
+				  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+		} else {
+			hw[3] |= (dst.index << 
+				  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+		}
+		break;
+	case NV40SR_OUTPUT:
+		switch (dst.index) {	/* record the written output in vp->or */
+		case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+		case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+		case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+		case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+		case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break;
+		case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+		case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+		case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+		case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+		case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+		case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+		case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+		case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+		case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+		case NV40_VP_INST_DEST_CLIP(0):	/* clip planes 0-2 are redirected to the FOGC output register */
+			vp->or |= (1 << 6);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(1):
+			vp->or |= (1 << 7);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(2):
+			vp->or |= (1 << 8);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2;
+			dst.index = NV40_VP_INST_DEST_FOGC;
+			break;
+		case NV40_VP_INST_DEST_CLIP(3):	/* clip planes 3-5 are redirected to the PSZ output register */
+			vp->or |= (1 << 9);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		case NV40_VP_INST_DEST_CLIP(4):
+			vp->or |= (1 << 10);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		case NV40_VP_INST_DEST_CLIP(5):
+			vp->or |= (1 << 11);
+			vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE5;
+			dst.index = NV40_VP_INST_DEST_PSZ;
+			break;
+		default:	/* other outputs (e.g. position) set no vp->or bit */
+			break;
+		}
+		/* dst.index is now a real hw output index. */
+		hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+		if (slot == 0) {
+			hw[0] |= NV40_VP_INST_VEC_RESULT;
+			hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);	/* NOTE(review): meaning of bit 20 is not visible here */
+		} else {
+			hw[3] |= NV40_VP_INST_SCA_RESULT;
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		}
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op,
+	      struct nv40_sreg dst, int mask,
+	      struct nv40_sreg s0, struct nv40_sreg s1,
+	      struct nv40_sreg s2)
+{	/* Append one instruction to vp->insns; slot 0 fills the VEC opcode/mask fields, any other slot the SCA ones. */
+	struct nv40_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi));	/* NOTE(review): result is not checked */
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1;	/* -1: no constant referenced yet, see emit_src() */
+
+	hw = vpc->vpi->data;
+	/* Condition COND_TR with an x,y,z,w swizzle (presumably "always execute" - confirm). */
+	hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT);
+	hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) |
+		  (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) |
+		  (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) |
+		  (3 << NV40_VP_INST_COND_SWZ_W_SHIFT));
+	/* The fields of the slot not in use get their dest-temp bits set to all ones. */
+	if (slot == 0) {
+		hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+		hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+		hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+	} else {
+		hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+		hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+		hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+	}
+
+	emit_dst(vpc, hw, slot, dst);
+	emit_src(vpc, hw, 0, s0);
+	emit_src(vpc, hw, 1, s1);
+	emit_src(vpc, hw, 2, s2);
+}
+
+static INLINE struct nv40_sreg
+tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nv40_sreg src = nv40_sr(NV40SR_NONE, 0);	/* defined fallback if the file is unsupported */
+	/* Map a TGSI source operand to an nv40_sreg, carrying over abs/negate/swizzle. */
+	switch (fsrc->SrcRegister.File) {
+	case TGSI_FILE_INPUT:
+		src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0);	/* values are filled in from the constant buffer at validate time */
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->SrcRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = vpc->r_temp[fsrc->SrcRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");	/* src stays NV40SR_NONE instead of being uninitialised */
+		break;
+	}
+
+	src.abs = fsrc->SrcRegisterExtMod.Absolute;
+	src.negate = fsrc->SrcRegister.Negate;
+	src.swz[0] = fsrc->SrcRegister.SwizzleX;
+	src.swz[1] = fsrc->SrcRegister.SwizzleY;
+	src.swz[2] = fsrc->SrcRegister.SwizzleZ;
+	src.swz[3] = fsrc->SrcRegister.SwizzleW;
+	return src;
+}
+
+static INLINE struct nv40_sreg
+tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nv40_sreg dst = nv40_sr(NV40SR_NONE, 0);	/* defined fallback; emit_dst() asserts on NONE */
+	/* Look up the register previously assigned to this TGSI destination. */
+	switch (fdst->DstRegister.File) {
+	case TGSI_FILE_OUTPUT:
+		dst = vpc->r_result[fdst->DstRegister.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = vpc->r_temp[fdst->DstRegister.Index];
+		break;
+	case TGSI_FILE_ADDRESS:
+		dst = vpc->r_address[fdst->DstRegister.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");	/* dst stays NV40SR_NONE instead of being uninitialised */
+		break;
+	}
+
+	return dst;
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	/* Translate a TGSI write mask into the MASK_* bit layout used by
+	 * this file; bits other than X/Y/Z/W in 'tgsi' are ignored.
+	 */
+	return ((tgsi & TGSI_WRITEMASK_X) ? MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? MASK_W : 0);
+}
+
+static boolean	/* TRUE: operand needs no fixup and *src is untouched; FALSE: *src is a temp holding the resolved value */
+src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc,
+	       struct nv40_sreg *src)
+{
+	const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	struct nv40_sreg tgsi = tgsi_src(vpc, fsrc);
+	uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0;
+	uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX,	/* array named like the neg() macro; only "neg(" expands the macro */
+			fsrc->SrcRegisterExtSwz.NegateY,
+			fsrc->SrcRegisterExtSwz.NegateZ,
+			fsrc->SrcRegisterExtSwz.NegateW };
+	uint c;
+	/* Classify each component: plain swizzle, constant 0, constant 1, per-component negate. */
+	for (c = 0; c < 4; c++) {
+		switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) {
+		case TGSI_EXTSWIZZLE_X:
+		case TGSI_EXTSWIZZLE_Y:
+		case TGSI_EXTSWIZZLE_Z:
+		case TGSI_EXTSWIZZLE_W:
+			mask |= tgsi_mask(1 << c);
+			break;
+		case TGSI_EXTSWIZZLE_ZERO:
+			zero_mask |= tgsi_mask(1 << c);
+			tgsi.swz[c] = SWZ_X;	/* placeholder selector; this component is overwritten below */
+			break;
+		case TGSI_EXTSWIZZLE_ONE:
+			one_mask |= tgsi_mask(1 << c);
+			tgsi.swz[c] = SWZ_X;
+			break;
+		default:
+			assert(0);
+		}
+
+		if (!tgsi.negate && neg[c])	/* only tracked when the whole-register negate is not set */
+			neg_mask |= tgsi_mask(1 << c);
+	}
+
+	if (mask == MASK_ALL && !neg_mask)
+		return TRUE;
+	/* Build the operand in a fresh temp, one masked write per category. */
+	*src = temp(vpc);
+
+	if (mask)
+		arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none);
+
+	if (zero_mask)
+		arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none);	/* presumably "set false" -> 0.0 - confirm */
+
+	if (one_mask)
+		arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none);	/* presumably "set true" -> 1.0 - confirm */
+
+	if (neg_mask) {	/* negate selected components by multiplying with -(STR result) */
+		struct nv40_sreg one = temp(vpc);
+		arith(vpc, 0, OP_STR, one, neg_mask, one, none, none);
+		arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none);
+	}
+
+	return FALSE;
+}
+
+static boolean
+nv40_vertprog_parse_instruction(struct nv40_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{	/* Translate one TGSI instruction; returns FALSE on an unsupported source file or opcode. */
+	struct nv40_sreg src[3], dst, tmp;
+	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	int mask;
+	int ai = -1, ci = -1, ii = -1;	/* input / constant / immediate index already used directly; -1 = none */
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+	/* Pass 1: fetch all temporary sources. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+		if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+	/* Pass 2: resolve extended swizzles, then inputs, constants and immediates. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->FullSrcRegisters[i];
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:
+		case TGSI_FILE_CONSTANT:
+		case TGSI_FILE_TEMPORARY:
+			if (!src_native_swz(vpc, fsrc, &src[i]))	/* FALSE: operand was already built in a temp */
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		switch (fsrc->SrcRegister.File) {
+		case TGSI_FILE_INPUT:	/* one input index per instruction; a different one is copied to a temp first */
+			if (ai == -1 || ai == fsrc->SrcRegister.Index) {
+				ai = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:	/* constants and immediates compete for the same single slot */
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->SrcRegister.Index) {
+				ci = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->SrcRegister.Index) {
+				ii = fsrc->SrcRegister.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, 0, OP_MOV, src[i], MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->FullDstRegisters[0]);
+	mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask);
+	/* Slot-0 ops take operands in s0/s1 (ADD: s0/s2); slot-1 ops take their single operand in s2. */
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:	/* x^y computed as EX2(y * LG2(x)) on the .x components */
+		tmp = temp(vpc);
+		arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, 1, OP_EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET:	/* no code is emitted for RET */
+		break;
+	case TGSI_OPCODE_RSQ:
+		arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:	/* a - b emitted as ADD with the second operand negated */
+		arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD:	/* cross product: tmp = a.zxy * b.yzx, dst.xyz = a.yzx * b.zxy - tmp */
+		tmp = temp(vpc);
+		arith(vpc, 0, OP_MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+	/* Free the temps taken for this instruction (not reached on the FALSE returns above). */
+	release_temps(vpc);
+	return TRUE;
+}
+
+static boolean
+nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{	/* Map a declared TGSI output to its hw destination in vpc->r_result[]; FALSE if the semantic is unsupported. */
+	unsigned idx = fdec->DeclarationRange.First;	/* only the first register of the declared range is mapped */
+	int hw;
+
+	switch (fdec->Semantic.SemanticName) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NV40_VP_INST_DEST_POS;
+		vpc->hpos_idx = idx;	/* remembered so position can be redirected for user clip planes */
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_VP_INST_DEST_COL0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_VP_INST_DEST_COL1;
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR:
+		if (fdec->Semantic.SemanticIndex == 0) {
+			hw = NV40_VP_INST_DEST_BFC0;
+		} else
+		if (fdec->Semantic.SemanticIndex == 1) {
+			hw = NV40_VP_INST_DEST_BFC1;
+		} else {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NV40_VP_INST_DEST_FOGC;
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NV40_VP_INST_DEST_PSZ;
+		break;
+	case TGSI_SEMANTIC_GENERIC:	/* generic outputs 0..7 go to the TC(n) destinations */
+		if (fdec->Semantic.SemanticIndex <= 7) {
+			hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex);
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+	/* NOTE(review): idx is not range-checked against PIPE_MAX_SHADER_OUTPUTS here. */
+	vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw);
+	return TRUE;
+}
+
+static boolean
+nv40_vertprog_prepare(struct nv40_vpc *vpc)
+{	/* Pre-pass over the tokens: count immediates, size the temp/address tables, map outputs. FALSE on a bad output. */
+	struct tgsi_parse_context p;
+	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->DeclarationRange.Last > high_temp) {
+					high_temp =
+						fdec->DeclarationRange.Last;
+				}
+				break;
+			/* TGSI_FILE_ADDRESS declarations would be handled the
+			 * same way, but gallium doesn't track them; address
+			 * registers are found from instruction dsts below.
+			 */
+			case TGSI_FILE_OUTPUT:
+				if (!nv40_vertprog_parse_decl_output(vpc, fdec)) {
+					tgsi_parse_free(&p);	/* pair every tgsi_parse_init() with a free */
+					return FALSE;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->FullDstRegisters[0];
+
+			if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) {
+				if (fdst->DstRegister.Index > high_addr)
+					high_addr = fdst->DstRegister.Index;
+			}
+		
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg));
+		assert(vpc->imm);
+	}
+
+	if (++high_temp) {	/* high_temp is now a count; give each TGSI temp its own hw temp */
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg));
+		assert(vpc->r_temp);
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);
+	}
+
+	if (++high_addr) {	/* address registers are backed by hw temps as well */
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg));
+		assert(vpc->r_address);
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);
+	}
+
+	vpc->r_temps_discard = 0;	/* keep the temps above allocated across release_temps() */
+	return TRUE;
+}
+
+static void
+nv40_vertprog_translate(struct nv40_context *nv40,
+			struct nv40_vertex_program *vp)
+{	/* Build vp->insns / vp->consts from vp->pipe.tokens; sets vp->translated only on success. */
+	struct tgsi_parse_context parse;
+	struct nv40_vpc *vpc = NULL;
+	struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nv40_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+
+	if (!nv40_vertprog_prepare(vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code to the vtxprog
+	 * to handle clip planes later.
+	 */
+	if (vp->ucp.nr)  {
+		vpc->r_result[vpc->hpos_idx] = temp(vpc);
+		vpc->r_temps_discard = 0;	/* keep this temp alive for the whole program */
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+//			assert(imm->Immediate.Size == 4);
+			vpc->imm[vpc->nr_imm++] =	/* immediates become constants with index -1 */
+				constant(vpc, -1,
+					 imm->u.ImmediateFloat32[0].Float,
+					 imm->u.ImmediateFloat32[1].Float,
+					 imm->u.ImmediateFloat32[2].Float,
+					 imm->u.ImmediateFloat32[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nv40_vertprog_parse_instruction(vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Write out HPOS if it was redirected to a temp earlier */
+	if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) {
+		struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT,
+						NV40_VP_INST_DEST_POS);
+		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];
+
+		arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none);
+	}
+
+	/* Insert code to handle user clip planes */
+	for (i = 0; i < vp->ucp.nr; i++) {
+		struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT,
+						NV40_VP_INST_DEST_CLIP(i));
+		struct nv40_sreg ceqn = constant(vpc, -1,
+						 nv40->clip.ucp[i][0],
+						 nv40->clip.ucp[i][1],
+						 nv40->clip.ucp[i][2],
+						 nv40->clip.ucp[i][3]);
+		struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		unsigned mask;
+		/* Each clip distance is written to one component of the output emit_dst() redirects it to. */
+		switch (i) {
+		case 0: case 3: mask = MASK_Y; break;
+		case 1: case 4: mask = MASK_Z; break;
+		case 2: case 5: mask = MASK_W; break;
+		default:
+			NOUVEAU_ERR("invalid clip dist #%d\n", i);
+			goto out_err;
+		}
+
+		arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none);
+	}
+	/* NOTE(review): the next line assumes at least one instruction was emitted (nr_insns > 0). */
+	vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:	/* also reached on success: shared cleanup of the scratch state */
+	tgsi_parse_free(&parse);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp); 
+	if (vpc->r_address)
+		FREE(vpc->r_address); 
+	if (vpc->imm)	
+		FREE(vpc->imm); 
+	FREE(vpc);
+}
+
+static boolean
+nv40_vertprog_validate(struct nv40_context *nv40)
+{	/* Translate (if needed), place and upload the active vertex program; TRUE if the bound hw state object changed. */
+	struct nouveau_winsys *nvws = nv40->nvws;
+	struct pipe_winsys *ws = nv40->pipe.winsys;
+	struct nouveau_grobj *curie = nv40->screen->curie;
+	struct nv40_vertex_program *vp;
+	struct pipe_buffer *constbuf;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+	/* HW mode: user clip code is baked into the program, so a clip change forces a rebuild. */
+	if (nv40->render_mode == HW) {
+		vp = nv40->vertprog;
+		constbuf = nv40->constbuf[PIPE_SHADER_VERTEX];
+
+		if ((nv40->dirty & NV40_NEW_UCP) ||
+		    memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) {
+			nv40_vertprog_destroy(nv40, vp);
+			memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp));
+		}
+	} else {
+		vp = nv40->swtnl.vertprog;
+		constbuf = NULL;	/* no constant buffer is read in this mode */
+	}
+
+	/* Translate TGSI shader into hw bytecode */
+	if (vp->translated)
+		goto check_gpu_resources;
+
+	nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG;
+	nv40_vertprog_translate(nv40, vp);
+	if (!vp->translated) {
+		nv40->fallback_swtnl |= NV40_NEW_VERTPROG;	/* translation failed: request the swtnl fallback */
+		return FALSE;
+	}
+
+check_gpu_resources:
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nv40->screen->vp_exec_heap;
+		struct nouveau_stateobj *so;
+		uint vplen = vp->nr_insns;
+
+		if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) {
+			while (heap->next && heap->size < vplen) {	/* evict other programs until the heap reports enough room */
+				struct nv40_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->exec);
+			}
+
+			if (nvws->res_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		so = so_new(7, 0);
+		so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1);
+		so_data  (so, vp->exec->start);
+		so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2);
+		so_data  (so, vp->ir);
+		so_data  (so, vp->or);
+		so_method(so, curie,  NV40TCL_CLIP_PLANE_ENABLE, 1);
+		so_data  (so, vp->clip_ctrl);
+		so_ref(so, &vp->so);
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nv40->screen->vp_data_heap;
+
+		if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nv40_vertex_program *evict;
+				
+				evict = heap->next->priv;
+				nvws->res_free(&evict->data);
+			}
+
+			if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv40_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->has_branch_offset) {
+				assert(0);	/* branch relocation is not implemented */
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+	/* NOTE(review): skipped when data->start equals the stored data_start (0 after destroy) - confirm const indices get patched then. */
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nv40_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK;
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NV40_VP_INST_CONST_SRC_SHIFT;
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = ws->buffer_map(ws, constbuf,
+					     PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nv40_vertex_program_data *vpd = &vp->consts[i];
+
+			if (map && vpd->index >= 0) {	/* map is NULL without a constbuf or if mapping failed */
+				if (!upload_data &&
+				    !memcmp(vpd->value, &map[vpd->index * 4],
+					    4 * sizeof(float)))
+					continue;	/* value unchanged and slots not moved: nothing to upload */
+				memcpy(vpd->value, &map[vpd->index * 4],
+				       4 * sizeof(float));
+			}
+
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (i + vp->data->start);
+			OUT_RINGp ((uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf)
+			ws->buffer_unmap(ws, constbuf);
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (vp->insns[i].data, 4);
+		}
+	}
+
+	if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) {
+		so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+void
+nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp)
+{	/* Drop everything derived from translation so the next validate rebuilds it; 'vp' itself is not freed. */
+	struct nouveau_winsys *nvws = nv40->screen->nvws;
+
+	vp->translated = FALSE;
+	/* NOTE(review): insns/consts are grown with realloc() but released with FREE() - confirm the allocators match. */
+	if (vp->nr_insns) {
+		FREE(vp->insns);
+		vp->insns = NULL;
+		vp->nr_insns = 0;
+	}
+
+	if (vp->nr_consts) {
+		FREE(vp->consts);
+		vp->consts = NULL;
+		vp->nr_consts = 0;
+	}
+	/* Give back the exec and const heap allocations and forget their placement. */
+	nvws->res_free(&vp->exec);
+	vp->exec_start = 0;
+	nvws->res_free(&vp->data);
+	vp->data_start = 0;
+	vp->data_start_min = 0;
+
+	vp->ir = vp->or = vp->clip_ctrl = 0;	/* these masks are rebuilt by emit_src()/emit_dst() */
+	so_ref(NULL, &vp->so);
+}
+
+struct nv40_state_entry nv40_state_vertprog = {	/* state entry tying vertex program / user clip changes to nv40_vertprog_validate() */
+	.validate = nv40_vertprog_validate,
+	.dirty = {
+		.pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP,	/* NOTE(review): presumably the pipe dirty bits that trigger validate - confirm */
+		.hw = NV40_STATE_VERTPROG,	/* same index validate uses for nv40->state.hw[] */
+	}
+};
+