-rw-r--r--  src/gallium/drivers/svga/include/VGPU10ShaderTokens.h | 2
-rw-r--r--  src/gallium/drivers/svga/include/svga3d_types.h | 18
-rw-r--r--  src/gallium/drivers/svga/meson.build | 2
-rw-r--r--  src/gallium/drivers/svga/svga_cmd.h | 29
-rw-r--r--  src/gallium/drivers/svga/svga_cmd_vgpu10.c | 158
-rw-r--r--  src/gallium/drivers/svga/svga_context.c | 30
-rw-r--r--  src/gallium/drivers/svga/svga_context.h | 263
-rw-r--r--  src/gallium/drivers/svga/svga_debug.h | 1
-rw-r--r--  src/gallium/drivers/svga/svga_draw.c | 156
-rw-r--r--  src/gallium/drivers/svga/svga_draw.h | 3
-rw-r--r--  src/gallium/drivers/svga/svga_draw_arrays.c | 19
-rw-r--r--  src/gallium/drivers/svga/svga_draw_elements.c | 24
-rw-r--r--  src/gallium/drivers/svga/svga_draw_private.h | 20
-rw-r--r--  src/gallium/drivers/svga/svga_format.c | 18
-rw-r--r--  src/gallium/drivers/svga/svga_link.c | 18
-rw-r--r--  src/gallium/drivers/svga/svga_link.h | 1
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_blend.c | 304
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_blit.c | 31
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_clear.c | 68
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_draw.c | 209
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_fs.c | 38
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_misc.c | 17
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_query.c | 78
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_rasterizer.c | 18
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_streamout.c | 388
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_ts.c | 219
-rw-r--r--  src/gallium/drivers/svga/svga_pipe_vs.c | 50
-rw-r--r--  src/gallium/drivers/svga/svga_resource_buffer.c | 6
-rw-r--r--  src/gallium/drivers/svga/svga_resource_buffer_upload.c | 5
-rw-r--r--  src/gallium/drivers/svga/svga_resource_texture.c | 33
-rw-r--r--  src/gallium/drivers/svga/svga_screen.c | 97
-rw-r--r--  src/gallium/drivers/svga/svga_screen.h | 3
-rw-r--r--  src/gallium/drivers/svga/svga_screen_cache.c | 16
-rw-r--r--  src/gallium/drivers/svga/svga_shader.c | 105
-rw-r--r--  src/gallium/drivers/svga/svga_shader.h | 168
-rw-r--r--  src/gallium/drivers/svga/svga_state.c | 96
-rw-r--r--  src/gallium/drivers/svga/svga_state.h | 19
-rw-r--r--  src/gallium/drivers/svga/svga_state_constants.c | 219
-rw-r--r--  src/gallium/drivers/svga/svga_state_framebuffer.c | 306
-rw-r--r--  src/gallium/drivers/svga/svga_state_fs.c | 23
-rw-r--r--  src/gallium/drivers/svga/svga_state_gs.c | 17
-rw-r--r--  src/gallium/drivers/svga/svga_state_need_swtnl.c | 6
-rw-r--r--  src/gallium/drivers/svga/svga_state_rss.c | 6
-rw-r--r--  src/gallium/drivers/svga/svga_state_sampler.c | 28
-rw-r--r--  src/gallium/drivers/svga/svga_state_tgsi_transform.c | 205
-rw-r--r--  src/gallium/drivers/svga/svga_state_ts.c | 392
-rw-r--r--  src/gallium/drivers/svga/svga_state_tss.c | 18
-rw-r--r--  src/gallium/drivers/svga/svga_state_vdecl.c | 4
-rw-r--r--  src/gallium/drivers/svga/svga_state_vs.c | 15
-rw-r--r--  src/gallium/drivers/svga/svga_streamout.h | 19
-rw-r--r--  src/gallium/drivers/svga/svga_surface.c | 10
-rw-r--r--  src/gallium/drivers/svga/svga_surface.h | 2
-rw-r--r--  src/gallium/drivers/svga/svga_swtnl_backend.c | 39
-rw-r--r--  src/gallium/drivers/svga/svga_swtnl_state.c | 38
-rw-r--r--  src/gallium/drivers/svga/svga_tgsi.c | 20
-rw-r--r--  src/gallium/drivers/svga/svga_tgsi.h | 2
-rw-r--r--  src/gallium/drivers/svga/svga_tgsi_vgpu10.c | 5368
-rw-r--r--  src/gallium/drivers/svga/svga_winsys.h | 10
-rw-r--r--  src/gallium/winsys/svga/drm/vmw_context.c | 16
-rw-r--r--  src/gallium/winsys/svga/drm/vmw_shader.c | 53
-rw-r--r--  src/gallium/winsys/svga/drm/vmw_shader.h | 8
61 files changed, 8033 insertions, 1521 deletions
diff --git a/src/gallium/drivers/svga/include/VGPU10ShaderTokens.h b/src/gallium/drivers/svga/include/VGPU10ShaderTokens.h
index 77af6d39a5a..e23ee53ffb1 100644
--- a/src/gallium/drivers/svga/include/VGPU10ShaderTokens.h
+++ b/src/gallium/drivers/svga/include/VGPU10ShaderTokens.h
@@ -201,7 +201,7 @@ typedef enum {
VGPU10_OPCODE_DCL_GLOBAL_FLAGS = 106,
/* GL guest */
- VGPU10_OPCODE_IDIV = 107,
+ VGPU10_OPCODE_VMWARE = 107,
/* DX10.1 */
VGPU10_OPCODE_LOD = 108,
diff --git a/src/gallium/drivers/svga/include/svga3d_types.h b/src/gallium/drivers/svga/include/svga3d_types.h
index 48eafe72202..94262314e29 100644
--- a/src/gallium/drivers/svga/include/svga3d_types.h
+++ b/src/gallium/drivers/svga/include/svga3d_types.h
@@ -436,8 +436,9 @@ typedef uint32 SVGA3dSurfaceFlags;
* mob-backing to store all the samples.
*/
#define SVGA3D_SURFACE_MULTISAMPLE (CONST64U(1) << 32)
+#define SVGA3D_SURFACE_DRAWINDIRECT_ARGS (CONST64U(1) << 38)
-#define SVGA3D_SURFACE_FLAG_MAX (CONST64U(1) << 33)
+#define SVGA3D_SURFACE_FLAG_MAX (CONST64U(1) << 42)
/*
* Surface flags types:
@@ -464,7 +465,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_HINT_INDIRECT_UPDATE | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
- SVGA3D_SURFACE_MULTISAMPLE \
+ SVGA3D_SURFACE_MULTISAMPLE | \
+ SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_2D_DISALLOWED_MASK \
@@ -480,7 +482,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_BIND_STREAM_OUTPUT | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
- SVGA3D_SURFACE_MULTISAMPLE \
+ SVGA3D_SURFACE_MULTISAMPLE | \
+ SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_BASICOPS_DISALLOWED_MASK \
@@ -508,7 +511,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_HINT_INDIRECT_UPDATE | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
- SVGA3D_SURFACE_MULTISAMPLE \
+ SVGA3D_SURFACE_MULTISAMPLE | \
+ SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_BUFFER_DISALLOWED_MASK \
@@ -527,7 +531,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_VOLUME | \
SVGA3D_SURFACE_1D | \
SVGA3D_SURFACE_SCREENTARGET | \
- SVGA3D_SURFACE_MOB_PITCH \
+ SVGA3D_SURFACE_MOB_PITCH | \
+ SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
#define SVGA3D_SURFACE_DX_ONLY_MASK \
@@ -636,7 +641,8 @@ typedef uint64 SVGA3dSurfaceAllFlags;
SVGA3D_SURFACE_BIND_STREAM_OUTPUT | \
SVGA3D_SURFACE_TRANSFER_FROM_BUFFER | \
SVGA3D_SURFACE_VADECODE | \
- SVGA3D_SURFACE_MULTISAMPLE \
+ SVGA3D_SURFACE_MULTISAMPLE | \
+ SVGA3D_SURFACE_DRAWINDIRECT_ARGS \
)
diff --git a/src/gallium/drivers/svga/meson.build b/src/gallium/drivers/svga/meson.build
index 368d0c7f342..8dcdadd6e1d 100644
--- a/src/gallium/drivers/svga/meson.build
+++ b/src/gallium/drivers/svga/meson.build
@@ -36,6 +36,7 @@ files_svga = files(
'svga_pipe_flush.c',
'svga_pipe_fs.c',
'svga_pipe_gs.c',
+ 'svga_pipe_ts.c',
'svga_pipe_misc.c',
'svga_pipe_query.c',
'svga_pipe_rasterizer.c',
@@ -56,6 +57,7 @@ files_svga = files(
'svga_state_framebuffer.c',
'svga_state_fs.c',
'svga_state_gs.c',
+ 'svga_state_ts.c',
'svga_state_need_swtnl.c',
'svga_state_rss.c',
'svga_state_sampler.c',
diff --git a/src/gallium/drivers/svga/svga_cmd.h b/src/gallium/drivers/svga/svga_cmd.h
index f6cb4fc27c1..22a40cf05cb 100644
--- a/src/gallium/drivers/svga/svga_cmd.h
+++ b/src/gallium/drivers/svga/svga_cmd.h
@@ -697,4 +697,33 @@ SVGA3D_vgpu10_ResolveCopy(struct svga_winsys_context *swc,
struct svga_winsys_surface *src,
const SVGA3dSurfaceFormat copyFormat);
+enum pipe_error
+SVGA3D_sm5_DrawIndexedInstancedIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ unsigned argOffset);
+
+enum pipe_error
+SVGA3D_sm5_DrawInstancedIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ unsigned argOffset);
+
+enum pipe_error
+SVGA3D_sm5_Dispatch(struct svga_winsys_context *swc,
+ const uint32 threadGroupCount[3]);
+
+enum pipe_error
+SVGA3D_sm5_DispatchIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ uint32 argOffset);
+
+enum pipe_error
+SVGA3D_sm5_DefineAndBindStreamOutput(struct svga_winsys_context *swc,
+ SVGA3dStreamOutputId soid,
+ uint32 numOutputStreamEntries,
+ uint32 numOutputStreamStrides,
+ uint32 streamOutputStrideInBytes[SVGA3D_DX_MAX_SOTARGETS],
+ struct svga_winsys_buffer *declBuf,
+ uint32 rasterizedStream,
+ uint32 sizeInBytes);
+
#endif /* __SVGA3D_H__ */
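
For reference, a minimal sketch of how one of the new SM5 entry points declared above could be driven from the driver side (illustrative only, not part of the patch; it assumes an svga context and the SVGA_RETRY helper that this series adds to svga_context.h, and the grid size is made up):

/* Hypothetical caller: launch an 8x4x1 grid of thread groups, flushing
 * the context and retrying once if the command buffer runs out of space.
 */
static void
example_launch_grid(struct svga_context *svga)
{
   const uint32 threadGroupCount[3] = { 8, 4, 1 };

   SVGA_RETRY(svga, SVGA3D_sm5_Dispatch(svga->swc, threadGroupCount));
}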
diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
index 1ca050ecb7a..eb5a482d9ba 100644
--- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
@@ -1130,7 +1130,7 @@ SVGA3D_vgpu10_DefineStreamOutput(struct svga_winsys_context *swc,
memcpy(cmd->decl, decl,
sizeof(SVGA3dStreamOutputDeclarationEntry)
- * SVGA3D_MAX_STREAMOUT_DECLS);
+ * SVGA3D_MAX_DX10_STREAMOUT_DECLS);
cmd->rasterizedStream = 0;
swc->commit(swc);
@@ -1432,3 +1432,159 @@ SVGA3D_vgpu10_ResolveCopy(struct svga_winsys_context *swc,
return PIPE_OK;
}
+
+
+enum pipe_error
+SVGA3D_sm5_DrawIndexedInstancedIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ unsigned argOffset)
+{
+ SVGA3dCmdDXDrawIndexedInstancedIndirect *cmd =
+ SVGA3D_FIFOReserve(swc,
+ SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED_INDIRECT,
+ sizeof(SVGA3dCmdDXDrawIndexedInstancedIndirect),
+ 1); /* one relocation */
+ if (!cmd)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
+ SVGA_RELOC_READ);
+ cmd->byteOffsetForArgs = argOffset;
+
+ swc->commit(swc);
+
+ return PIPE_OK;
+}
+
+
+enum pipe_error
+SVGA3D_sm5_DrawInstancedIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ unsigned argOffset)
+{
+ SVGA3dCmdDXDrawInstancedIndirect *cmd =
+ SVGA3D_FIFOReserve(swc,
+ SVGA_3D_CMD_DX_DRAW_INSTANCED_INDIRECT,
+ sizeof(SVGA3dCmdDXDrawInstancedIndirect),
+ 1); /* one relocation */
+ if (!cmd)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
+ SVGA_RELOC_READ);
+ cmd->byteOffsetForArgs = argOffset;
+
+ swc->commit(swc);
+
+ return PIPE_OK;
+}
+
+
+enum pipe_error
+SVGA3D_sm5_Dispatch(struct svga_winsys_context *swc,
+ const uint32 threadGroupCount[3])
+{
+ SVGA3dCmdDXDispatch *cmd;
+
+ cmd = SVGA3D_FIFOReserve(swc,
+ SVGA_3D_CMD_DX_DISPATCH,
+ sizeof(SVGA3dCmdDXDispatch),
+ 0);
+ if (!cmd)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ cmd->threadGroupCountX = threadGroupCount[0];
+ cmd->threadGroupCountY = threadGroupCount[1];
+ cmd->threadGroupCountZ = threadGroupCount[2];
+
+ swc->commit(swc);
+ return PIPE_OK;
+}
+
+
+enum pipe_error
+SVGA3D_sm5_DispatchIndirect(struct svga_winsys_context *swc,
+ struct svga_winsys_surface *argBuffer,
+ uint32 argOffset)
+{
+ SVGA3dCmdDXDispatchIndirect *cmd;
+
+ cmd = SVGA3D_FIFOReserve(swc,
+ SVGA_3D_CMD_DX_DISPATCH_INDIRECT,
+ sizeof(SVGA3dCmdDXDispatchIndirect),
+ 1);
+ if (!cmd)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ swc->surface_relocation(swc, &cmd->argsBufferSid, NULL, argBuffer,
+ SVGA_RELOC_READ);
+ cmd->byteOffsetForArgs = argOffset;
+
+ swc->commit(swc);
+ return PIPE_OK;
+}
+
+
+/**
+ * We don't want any flush between DefineStreamOutputWithMob and
+ * BindStreamOutput because it would leave partial state in the command
+ * buffer. This function makes sure there is enough room for both
+ * commands before issuing them.
+ */
+
+enum pipe_error
+SVGA3D_sm5_DefineAndBindStreamOutput(struct svga_winsys_context *swc,
+ SVGA3dStreamOutputId soid,
+ uint32 numOutputStreamEntries,
+ uint32 numOutputStreamStrides,
+ uint32 streamOutputStrideInBytes[SVGA3D_DX_MAX_SOTARGETS],
+ struct svga_winsys_buffer *declBuf,
+ uint32 rasterizedStream,
+ uint32 sizeInBytes)
+{
+ unsigned i;
+ SVGA3dCmdHeader *header;
+ SVGA3dCmdDXDefineStreamOutputWithMob *dcmd;
+ SVGA3dCmdDXBindStreamOutput *bcmd;
+
+ unsigned totalSize = 2 * sizeof(*header) +
+ sizeof(*dcmd) + sizeof(*bcmd);
+
+ /* Make sure there is room for both commands */
+ header = swc->reserve(swc, totalSize, 2);
+ if (!header)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+
+ /* DXDefineStreamOutputWithMob command */
+ header->id = SVGA_3D_CMD_DX_DEFINE_STREAMOUTPUT_WITH_MOB;
+ header->size = sizeof(*dcmd);
+ dcmd = (SVGA3dCmdDXDefineStreamOutputWithMob *)(header + 1);
+ dcmd->soid= soid;
+ dcmd->numOutputStreamEntries = numOutputStreamEntries;
+ dcmd->numOutputStreamStrides = numOutputStreamStrides;
+ dcmd->rasterizedStream = rasterizedStream;
+
+ for (i = 0; i < ARRAY_SIZE(dcmd->streamOutputStrideInBytes); i++)
+ dcmd->streamOutputStrideInBytes[i] = streamOutputStrideInBytes[i];
+
+
+ /* DXBindStreamOutput command */
+ header = (SVGA3dCmdHeader *)(dcmd + 1);
+
+ header->id = SVGA_3D_CMD_DX_BIND_STREAMOUTPUT;
+ header->size = sizeof(*bcmd);
+ bcmd = (SVGA3dCmdDXBindStreamOutput *)(header + 1);
+
+ bcmd->soid = soid;
+ bcmd->offsetInBytes = 0;
+ swc->mob_relocation(swc, &bcmd->mobid,
+ &bcmd->offsetInBytes, declBuf, 0,
+ SVGA_RELOC_WRITE);
+
+ bcmd->sizeInBytes = sizeInBytes;
+ bcmd->offsetInBytes = 0;
+
+
+ swc->commit(swc);
+ return PIPE_OK;
+}
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index cdc222e2438..4ef99efe989 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -44,6 +44,7 @@
#include "svga_debug.h"
#include "svga_state.h"
#include "svga_winsys.h"
+#include "svga_streamout.h"
#define CONST0_UPLOAD_DEFAULT_SIZE 65536
@@ -79,6 +80,9 @@ svga_destroy(struct pipe_context *pipe)
pipe->delete_blend_state(pipe, svga->noop_blend);
+ /* destroy stream output statistics queries */
+ svga_destroy_stream_output_queries(svga);
+
/* free query gb object */
if (svga->gb_query) {
pipe->destroy_query(pipe, NULL);
@@ -91,6 +95,7 @@ svga_destroy(struct pipe_context *pipe)
svga_cleanup_framebuffer(svga);
svga_cleanup_tss_binding(svga);
svga_cleanup_vertex_state(svga);
+ svga_cleanup_tcs_state(svga);
svga_destroy_swtnl(svga);
svga_hwtnl_destroy(svga->hwtnl);
@@ -174,12 +179,14 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga_init_fs_functions(svga);
svga_init_vs_functions(svga);
svga_init_gs_functions(svga);
+ svga_init_ts_functions(svga);
svga_init_vertex_functions(svga);
svga_init_constbuffer_functions(svga);
svga_init_query_functions(svga);
svga_init_surface_functions(svga);
svga_init_stream_output_functions(svga);
svga_init_clear_functions(svga);
+ svga_init_tracked_state(svga);
/* init misc state */
svga->curr.sample_mask = ~0;
@@ -250,6 +257,7 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
memset(&svga->state.hw_clear, 0xcd, sizeof(svga->state.hw_clear));
memset(&svga->state.hw_clear.framebuffer, 0x0,
sizeof(svga->state.hw_clear.framebuffer));
+ memset(&svga->state.hw_clear.rtv, 0, sizeof(svga->state.hw_clear.rtv));
svga->state.hw_clear.num_rendertargets = 0;
svga->state.hw_clear.dsv = NULL;
@@ -269,6 +277,8 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga->state.hw_draw.vs = NULL;
svga->state.hw_draw.gs = NULL;
svga->state.hw_draw.fs = NULL;
+ svga->state.hw_draw.tcs = NULL;
+ svga->state.hw_draw.tes = NULL;
/* Initialize the currently bound buffer resources */
memset(svga->state.hw_draw.constbuf, 0,
@@ -303,10 +313,16 @@ svga_context_create(struct pipe_screen *screen, void *priv, unsigned flags)
svga->noop_blend = svga->pipe.create_blend_state(&svga->pipe, &noop_tmpl);
}
- svga->dirty = ~0;
+ svga->dirty = SVGA_NEW_ALL;
svga->pred.query_id = SVGA3D_INVALID_ID;
svga->disable_rasterizer = FALSE;
+ /**
+ * Create stream output statistics queries used in the workaround for auto
+ * draw with stream instancing.
+ */
+ svga_create_stream_output_queries(svga);
+
goto done;
cleanup:
@@ -398,6 +414,11 @@ svga_context_flush(struct svga_context *svga,
svga->rebind.flags.fs = TRUE;
svga->rebind.flags.gs = TRUE;
+ if (svga_have_sm5(svga)) {
+ svga->rebind.flags.tcs = TRUE;
+ svga->rebind.flags.tes = TRUE;
+ }
+
if (svga_need_to_rebind_resources(svga)) {
svga->rebind.flags.query = TRUE;
}
@@ -447,12 +468,7 @@ svga_hwtnl_flush_retry(struct svga_context *svga)
{
enum pipe_error ret = PIPE_OK;
- ret = svga_hwtnl_flush(svga->hwtnl);
- if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
- svga_context_flush(svga, NULL);
- ret = svga_hwtnl_flush(svga->hwtnl);
- }
-
+ SVGA_RETRY_OOM(svga, ret, svga_hwtnl_flush(svga->hwtnl));
assert(ret == PIPE_OK);
}
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index eef8b88f594..c0c315119f6 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -43,7 +43,7 @@
#include "svga_winsys.h"
#include "svga_hw_reg.h"
#include "svga3d_shaderdefs.h"
-
+#include "svga_debug.h"
/** Non-GPU queries for gallium HUD */
enum svga_hud {
@@ -56,6 +56,7 @@ enum svga_hud {
SVGA_QUERY_NUM_BUFFERS_MAPPED,
SVGA_QUERY_NUM_TEXTURES_MAPPED,
SVGA_QUERY_NUM_BYTES_UPLOADED,
+ SVGA_QUERY_NUM_COMMAND_BUFFERS,
SVGA_QUERY_COMMAND_BUFFER_SIZE,
SVGA_QUERY_FLUSH_TIME,
SVGA_QUERY_SURFACE_WRITE_FLUSHES,
@@ -64,6 +65,8 @@ enum svga_hud {
SVGA_QUERY_NUM_BUFFER_UPLOADS,
SVGA_QUERY_NUM_CONST_BUF_UPDATES,
SVGA_QUERY_NUM_CONST_UPDATES,
+ SVGA_QUERY_NUM_SHADER_RELOCATIONS,
+ SVGA_QUERY_NUM_SURFACE_RELOCATIONS,
/* running total counters */
SVGA_QUERY_MEMORY_USED,
@@ -74,6 +77,7 @@ enum svga_hud {
SVGA_QUERY_NUM_GENERATE_MIPMAP,
SVGA_QUERY_NUM_FAILED_ALLOCATIONS,
SVGA_QUERY_NUM_COMMANDS_PER_DRAW,
+ SVGA_QUERY_SHADER_MEM_USED,
/*SVGA_QUERY_MAX has to be last because it is size of an array*/
SVGA_QUERY_MAX
@@ -109,6 +113,8 @@ struct svga_blend_state {
unsigned alpha_to_coverage:1;
unsigned alpha_to_one:1;
unsigned blend_color_alpha:1; /**< set blend color to alpha value */
+ unsigned logicop_enabled:1;
+ unsigned logicop_mode:5;
/** Per-render target state */
struct {
@@ -269,6 +275,11 @@ struct svga_state
struct svga_vertex_shader *vs;
struct svga_geometry_shader *user_gs; /* user-specified GS */
struct svga_geometry_shader *gs; /* derived GS */
+ /* derived tessellation control shader */
+ struct svga_tcs_shader *tcs;
+ /* derived tessellation evaluation shader */
+ struct svga_tes_shader *tes;
+ struct svga_compute_shader *cs;
struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
/** Constant buffers for each shader.
@@ -286,11 +297,11 @@ struct svga_state
int nr_fbs;
struct pipe_poly_stipple poly_stipple;
- struct pipe_scissor_state scissor;
+ struct pipe_scissor_state scissor[SVGA3D_DX_MAX_VIEWPORTS];
struct pipe_blend_color blend_color;
struct pipe_stencil_ref stencil_ref;
struct pipe_clip_state clip;
- struct pipe_viewport_state viewport;
+ struct pipe_viewport_state viewport[SVGA3D_DX_MAX_VIEWPORTS];
unsigned num_samplers[PIPE_SHADER_TYPES];
unsigned num_sampler_views[PIPE_SHADER_TYPES];
@@ -303,6 +314,14 @@ struct svga_state
} tex_flags;
unsigned sample_mask;
+ unsigned vertices_per_patch;
+ float default_tesslevels[6]; /* tessellation (outer[4] + inner[2]) levels */
+ struct {
+ /* Determine the layout of the grid (in block units) to be used. */
+ unsigned size[3];
+      /* If DispatchIndirect is used, this will have the grid size info */
+ struct pipe_resource *indirect;
+ } grid_info;
};
struct svga_prescale {
@@ -311,21 +330,27 @@ struct svga_prescale {
boolean enabled;
};
+struct svga_depthrange {
+ float zmin;
+ float zmax;
+};
/* Updated by calling svga_update_state( SVGA_STATE_HW_CLEAR )
*/
struct svga_hw_clear_state
{
- SVGA3dRect viewport;
-
- struct {
- float zmin, zmax;
- } depthrange;
-
struct pipe_framebuffer_state framebuffer;
- struct svga_prescale prescale;
+
+ /* VGPU9 only */
+ SVGA3dRect viewport;
+ struct svga_depthrange depthrange;
/* VGPU10 state */
+ SVGA3dViewport viewports[SVGA3D_DX_MAX_VIEWPORTS];
+ struct svga_prescale prescale[SVGA3D_DX_MAX_VIEWPORTS];
+ struct pipe_scissor_state scissors[SVGA3D_DX_MAX_VIEWPORTS];
+ unsigned num_prescale;
+
unsigned num_rendertargets;
struct pipe_surface *rtv[SVGA3D_MAX_RENDER_TARGETS];
struct pipe_surface *dsv;
@@ -361,6 +386,9 @@ struct svga_hw_draw_state
struct svga_shader_variant *fs;
struct svga_shader_variant *vs;
struct svga_shader_variant *gs;
+ struct svga_shader_variant *tcs;
+ struct svga_shader_variant *tes;
+ struct svga_shader_variant *cs;
/** Currently bound constant buffer, per shader stage */
struct pipe_resource *constbuf[PIPE_SHADER_TYPES];
@@ -495,7 +523,7 @@ struct svga_context
struct util_bitmask *query_id_bm;
struct {
- unsigned dirty[SVGA_STATE_MAX];
+ uint64_t dirty[SVGA_STATE_MAX];
/** bitmasks of which const buffers are changed */
unsigned dirty_constbufs[PIPE_SHADER_TYPES];
@@ -508,7 +536,7 @@ struct svga_context
} state;
struct svga_state curr; /* state from the gallium frontend */
- unsigned dirty; /* statechanges since last update_state() */
+ uint64_t dirty; /* statechanges since last update_state() */
union {
struct {
@@ -518,6 +546,9 @@ struct svga_context
unsigned vs:1;
unsigned fs:1;
unsigned gs:1;
+ unsigned tcs:1;
+ unsigned tes:1;
+ unsigned cs:1;
unsigned query:1;
} flags;
unsigned val;
@@ -531,7 +562,10 @@ struct svga_context
struct util_bitmask *gb_query_alloc_mask; /**< gb query object allocation mask */
struct svga_qmem_alloc_entry *gb_query_map[SVGA_QUERY_MAX];
/**< query mem block mapping */
- struct svga_query *sq[SVGA_QUERY_MAX]; /**< queries currently in progress */
+ struct svga_query *sq[SVGA_QUERY_MAX+12]; /**< queries currently in progress */
+ /* The last 12 entries are for streamout
+ * queries for stream 0..3
+ */
/** List of buffers with queued transfers */
struct list_head dirty_buffers;
@@ -545,6 +579,7 @@ struct svga_context
uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */
uint64_t num_buffers_mapped; /**< SVGA_QUERY_NUM_BUFFERS_MAPPED */
uint64_t num_textures_mapped; /**< SVGA_QUERY_NUM_TEXTURES_MAPPED */
+ uint64_t num_command_buffers; /**< SVGA_QUERY_NUM_COMMAND_BUFFERS */
uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */
uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
@@ -566,16 +601,28 @@ struct svga_context
uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
uint64_t num_generate_mipmap; /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
+ uint64_t shader_mem_used; /**< SVGA_QUERY_SHADER_MEM_USED */
boolean uses_time; /**< os_time_get() calls needed? */
} hud;
/** The currently bound stream output targets */
+ boolean in_streamout; /* Set if streamout is active */
unsigned num_so_targets;
struct svga_winsys_surface *so_surfaces[SVGA3D_DX_MAX_SOTARGETS];
struct pipe_stream_output_target *so_targets[SVGA3D_DX_MAX_SOTARGETS];
struct svga_stream_output *current_so;
+ /**
+ * The following states are used in the workaround for auto draw with
+ * stream instancing.
+ */
+
+ /* Last bound SO targets that can be used to get vertex count */
+ struct pipe_stream_output_target *vcount_so_targets[SVGA3D_DX_MAX_SOTARGETS];
+ unsigned vcount_buffer_stream; /* SO buffer to stream index mask */
+ struct pipe_query *so_queries[4]; /* SO stat queries for each stream */
+
/** A blend state with blending disabled, for falling back to when blending
* is illegal (e.g. an integer texture is bound)
*/
@@ -601,41 +648,58 @@ struct svga_context
boolean render_condition;
boolean disable_rasterizer; /* Set if to disable rasterization */
+
+ struct {
+ struct svga_tcs_shader *passthrough_tcs;
+ struct svga_vertex_shader *vs;
+ struct svga_tes_shader *tes;
+ unsigned vertices_per_patch;
+ boolean passthrough;
+ } tcs;
+
};
/* A flag for each frontend state object:
*/
-#define SVGA_NEW_BLEND 0x1
-#define SVGA_NEW_DEPTH_STENCIL_ALPHA 0x2
-#define SVGA_NEW_RAST 0x4
-#define SVGA_NEW_SAMPLER 0x8
-#define SVGA_NEW_TEXTURE 0x10
-#define SVGA_NEW_VBUFFER 0x20
-#define SVGA_NEW_VELEMENT 0x40
-#define SVGA_NEW_FS 0x80
-#define SVGA_NEW_VS 0x100
-#define SVGA_NEW_FS_CONST_BUFFER 0x200
-#define SVGA_NEW_VS_CONST_BUFFER 0x400
-#define SVGA_NEW_FRAME_BUFFER 0x800
-#define SVGA_NEW_STIPPLE 0x1000
-#define SVGA_NEW_SCISSOR 0x2000
-#define SVGA_NEW_BLEND_COLOR 0x4000
-#define SVGA_NEW_CLIP 0x8000
-#define SVGA_NEW_VIEWPORT 0x10000
-#define SVGA_NEW_PRESCALE 0x20000
-#define SVGA_NEW_REDUCED_PRIMITIVE 0x40000
-#define SVGA_NEW_TEXTURE_BINDING 0x80000
-#define SVGA_NEW_NEED_PIPELINE 0x100000
-#define SVGA_NEW_NEED_SWVFETCH 0x200000
-#define SVGA_NEW_NEED_SWTNL 0x400000
-#define SVGA_NEW_FS_VARIANT 0x800000
-#define SVGA_NEW_VS_VARIANT 0x1000000
-#define SVGA_NEW_TEXTURE_FLAGS 0x4000000
-#define SVGA_NEW_STENCIL_REF 0x8000000
-#define SVGA_NEW_GS 0x10000000
-#define SVGA_NEW_GS_CONST_BUFFER 0x20000000
-#define SVGA_NEW_GS_VARIANT 0x40000000
-#define SVGA_NEW_TEXTURE_CONSTS 0x80000000
+#define SVGA_NEW_BLEND ((uint64_t) 0x1)
+#define SVGA_NEW_DEPTH_STENCIL_ALPHA ((uint64_t) 0x2)
+#define SVGA_NEW_RAST ((uint64_t) 0x4)
+#define SVGA_NEW_SAMPLER ((uint64_t) 0x8)
+#define SVGA_NEW_TEXTURE ((uint64_t) 0x10)
+#define SVGA_NEW_VBUFFER ((uint64_t) 0x20)
+#define SVGA_NEW_VELEMENT ((uint64_t) 0x40)
+#define SVGA_NEW_FS ((uint64_t) 0x80)
+#define SVGA_NEW_VS ((uint64_t) 0x100)
+#define SVGA_NEW_FS_CONST_BUFFER ((uint64_t) 0x200)
+#define SVGA_NEW_VS_CONST_BUFFER ((uint64_t) 0x400)
+#define SVGA_NEW_FRAME_BUFFER ((uint64_t) 0x800)
+#define SVGA_NEW_STIPPLE ((uint64_t) 0x1000)
+#define SVGA_NEW_SCISSOR ((uint64_t) 0x2000)
+#define SVGA_NEW_BLEND_COLOR ((uint64_t) 0x4000)
+#define SVGA_NEW_CLIP ((uint64_t) 0x8000)
+#define SVGA_NEW_VIEWPORT ((uint64_t) 0x10000)
+#define SVGA_NEW_PRESCALE ((uint64_t) 0x20000)
+#define SVGA_NEW_REDUCED_PRIMITIVE ((uint64_t) 0x40000)
+#define SVGA_NEW_TEXTURE_BINDING ((uint64_t) 0x80000)
+#define SVGA_NEW_NEED_PIPELINE ((uint64_t) 0x100000)
+#define SVGA_NEW_NEED_SWVFETCH ((uint64_t) 0x200000)
+#define SVGA_NEW_NEED_SWTNL ((uint64_t) 0x400000)
+#define SVGA_NEW_FS_VARIANT ((uint64_t) 0x800000)
+#define SVGA_NEW_VS_VARIANT ((uint64_t) 0x1000000)
+#define SVGA_NEW_TEXTURE_FLAGS ((uint64_t) 0x4000000)
+#define SVGA_NEW_STENCIL_REF ((uint64_t) 0x8000000)
+#define SVGA_NEW_GS ((uint64_t) 0x10000000)
+#define SVGA_NEW_GS_CONST_BUFFER ((uint64_t) 0x20000000)
+#define SVGA_NEW_GS_VARIANT ((uint64_t) 0x40000000)
+#define SVGA_NEW_TEXTURE_CONSTS ((uint64_t) 0x80000000)
+#define SVGA_NEW_TCS ((uint64_t) 0x100000000)
+#define SVGA_NEW_TES ((uint64_t) 0x200000000)
+#define SVGA_NEW_TCS_VARIANT ((uint64_t) 0x400000000)
+#define SVGA_NEW_TES_VARIANT ((uint64_t) 0x800000000)
+#define SVGA_NEW_TCS_CONST_BUFFER ((uint64_t) 0x1000000000)
+#define SVGA_NEW_TES_CONST_BUFFER ((uint64_t) 0x2000000000)
+#define SVGA_NEW_TCS_PARAM ((uint64_t) 0x4000000000)
+#define SVGA_NEW_ALL ((uint64_t) 0xFFFFFFFFFFFFFFFF)
void svga_init_state_functions( struct svga_context *svga );
@@ -648,9 +712,11 @@ void svga_init_depth_stencil_functions( struct svga_context *svga );
void svga_init_misc_functions( struct svga_context *svga );
void svga_init_rasterizer_functions( struct svga_context *svga );
void svga_init_sampler_functions( struct svga_context *svga );
+void svga_init_cs_functions( struct svga_context *svga );
void svga_init_fs_functions( struct svga_context *svga );
void svga_init_vs_functions( struct svga_context *svga );
void svga_init_gs_functions( struct svga_context *svga );
+void svga_init_ts_functions( struct svga_context *svga );
void svga_init_vertex_functions( struct svga_context *svga );
void svga_init_constbuffer_functions( struct svga_context *svga );
void svga_init_draw_functions( struct svga_context *svga );
@@ -663,6 +729,7 @@ void svga_cleanup_vertex_state( struct svga_context *svga );
void svga_cleanup_sampler_state( struct svga_context *svga );
void svga_cleanup_tss_binding( struct svga_context *svga );
void svga_cleanup_framebuffer( struct svga_context *svga );
+void svga_cleanup_tcs_state( struct svga_context *svga );
void svga_context_flush( struct svga_context *svga,
struct pipe_fence_handle **pfence );
@@ -724,6 +791,12 @@ svga_have_sm4_1(const struct svga_context *svga)
}
static inline boolean
+svga_have_sm5(const struct svga_context *svga)
+{
+ return svga_screen(svga->pipe.screen)->sws->have_sm5;
+}
+
+static inline boolean
svga_need_to_rebind_resources(const struct svga_context *svga)
{
return svga_screen(svga->pipe.screen)->sws->need_to_rebind_resources;
@@ -745,5 +818,107 @@ svga_get_time(struct svga_context *svga)
return svga->hud.uses_time ? os_time_get() : 0;
}
+/*
+ * The SVGA_TRY_XX family of macros can be used to optionally replace a
+ * function call with an error value, the purpose is to trigger and test
+ * retry path handling.
+ */
+#ifdef DEBUG
+
+/*
+ * Optionally replace a function call with a PIPE_ERROR_OUT_OF_MEMORY
+ * return value
+ */
+#define SVGA_TRY(_func) \
+ ((SVGA_DEBUG & DEBUG_RETRY) ? PIPE_ERROR_OUT_OF_MEMORY : (_func))
+
+/* Optionally replace a function call with a NULL return value */
+#define SVGA_TRY_PTR(_func) \
+ ((SVGA_DEBUG & DEBUG_RETRY) ? NULL : (_func))
+
+/*
+ * Optionally replace a function call with a NULL return value, and set
+ * the _retry parameter to TRUE.
+ */
+#define SVGA_TRY_MAP(_func, _retry) \
+ ((SVGA_DEBUG & DEBUG_RETRY) ? (_retry) = TRUE, NULL : (_func))
+#else
+
+#define SVGA_TRY(_func) (_func)
+
+#define SVGA_TRY_PTR(_func) (_func)
+
+#define SVGA_TRY_MAP(_func, _retry) (_func)
+#endif
+
+/**
+ * Enter retry processing after hitting out-of-command space
+ */
+static inline void
+svga_retry_enter(struct svga_context *svga)
+{
+ /* We shouldn't nest retries, but currently we do. */
+ if ((SVGA_DEBUG & DEBUG_RETRY) && svga->swc->in_retry) {
+ debug_printf("WARNING: Recursive retry. Level: %u.\n",
+ svga->swc->in_retry);
+ }
+ svga->swc->in_retry++;
+}
+
+/**
+ * Exit retry processing after hitting out-of-command space
+ */
+static inline void
+svga_retry_exit(struct svga_context *svga)
+{
+ assert(svga->swc->in_retry > 0);
+ svga->swc->in_retry--;
+}
+
+/**
+ * Perform a function call, and on failure flush the context and retry,
+ * asserting that the retry succeeded. On return, the boolean argument
+ * _retried indicates whether the function call was retried or not.
+ */
+#define SVGA_RETRY_CHECK(_svga, _func, _retried) \
+ do { \
+ enum pipe_error ret; \
+ \
+ ret = SVGA_TRY(_func); \
+ (_retried) = (ret != PIPE_OK); \
+ if (_retried) { \
+ svga_retry_enter(_svga); \
+ svga_context_flush(_svga, NULL); \
+ ret = (_func); \
+ assert(ret == PIPE_OK); \
+ svga_retry_exit(_svga); \
+ } \
+ } while(0)
+
+/**
+ * Perform a function call, and on failure flush the context and retry,
+ * asserting that the retry succeeded.
+ */
+#define SVGA_RETRY(_svga, _func) \
+ do { \
+ UNUSED boolean retried; \
+ \
+ SVGA_RETRY_CHECK(_svga, _func, retried); \
+ } while(0)
+
+/**
+ * Perform a function call, and on out-of-memory, flush the context and
+ * retry. The retry return value is stored in _ret for reuse.
+ */
+#define SVGA_RETRY_OOM(_svga, _ret, _func) \
+ do { \
+ (_ret) = SVGA_TRY(_func); \
+ if ((_ret) == PIPE_ERROR_OUT_OF_MEMORY) { \
+ svga_retry_enter(_svga); \
+ svga_context_flush(_svga, NULL); \
+ (_ret) = (_func); \
+ svga_retry_exit(_svga); \
+ } \
+ } while (0);
#endif
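
A short usage sketch for the retry helpers above (illustrative only; svga_emit_something() is a made-up stand-in for any SVGA3D_* command-emission call):

/* Fire-and-assert: emit, and on failure flush the context and emit
 * again, asserting that the second attempt succeeds. */
SVGA_RETRY(svga, svga_emit_something(svga->swc));

/* Keep the return value, and retry only on out-of-command-buffer space. */
enum pipe_error ret;
SVGA_RETRY_OOM(svga, ret, svga_emit_something(svga->swc));
assert(ret == PIPE_OK);

/* In DEBUG builds with the DEBUG_RETRY flag set, SVGA_TRY forces the
 * first attempt to fail so the flush-and-retry path gets exercised. */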
diff --git a/src/gallium/drivers/svga/svga_debug.h b/src/gallium/drivers/svga/svga_debug.h
index 3686cc6d9cc..cdad858b045 100644
--- a/src/gallium/drivers/svga/svga_debug.h
+++ b/src/gallium/drivers/svga/svga_debug.h
@@ -46,6 +46,7 @@
#define DEBUG_CACHE 0x8000
#define DEBUG_STREAMOUT 0x10000
#define DEBUG_SAMPLERS 0x20000
+#define DEBUG_RETRY 0x100000
#ifdef DEBUG
extern int SVGA_DEBUG;
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index e0e55f129b8..f8db818b3d0 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -349,7 +349,7 @@ validate_sampler_resources(struct svga_context *svga)
assert(svga_have_vgpu10(svga));
- for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
+ for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_COMPUTE; shader++) {
unsigned count = svga->curr.num_sampler_views[shader];
unsigned i;
struct svga_winsys_surface *surfaces[PIPE_MAX_SAMPLERS];
@@ -379,7 +379,8 @@ validate_sampler_resources(struct svga_context *svga)
if (shader == PIPE_SHADER_FRAGMENT &&
svga->curr.rast->templ.poly_stipple_enable) {
- const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
+ const unsigned unit =
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_pipe_sampler_view *sv =
svga->polygon_stipple.sampler_view;
@@ -415,7 +416,7 @@ validate_constant_buffers(struct svga_context *svga)
assert(svga_have_vgpu10(svga));
- for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
+ for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_COMPUTE; shader++) {
enum pipe_error ret;
struct svga_buffer *buffer;
struct svga_winsys_surface *handle;
@@ -482,6 +483,8 @@ last_command_was_draw(const struct svga_context *svga)
case SVGA_3D_CMD_DX_DRAW_INSTANCED:
case SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED:
case SVGA_3D_CMD_DX_DRAW_AUTO:
+ case SVGA_3D_CMD_DX_DRAW_INDEXED_INSTANCED_INDIRECT:
+ case SVGA_3D_CMD_DX_DRAW_INSTANCED_INDIRECT:
return true;
default:
return false;
@@ -511,17 +514,51 @@ vertex_buffers_equal(unsigned count,
* Prepare the vertex buffers for a drawing command.
*/
static enum pipe_error
-validate_vertex_buffers(struct svga_hwtnl *hwtnl)
+validate_vertex_buffers(struct svga_hwtnl *hwtnl,
+ const struct pipe_stream_output_target *so_vertex_count)
{
struct svga_context *svga = hwtnl->svga;
struct pipe_resource *vbuffers[SVGA3D_INPUTREG_MAX];
struct svga_winsys_surface *vbuffer_handles[SVGA3D_INPUTREG_MAX];
- const unsigned vbuf_count = hwtnl->cmd.vbuf_count;
+ struct svga_winsys_surface *so_vertex_count_handle;
+ const unsigned vbuf_count = so_vertex_count ? 1 : hwtnl->cmd.vbuf_count;
int last_vbuf = -1;
unsigned i;
assert(svga_have_vgpu10(svga));
+ /* Get handle for each referenced vertex buffer, unless we're using a
+ * stream-out buffer to specify the drawing information (DrawAuto).
+ */
+ if (so_vertex_count) {
+ i = 0;
+ }
+ else {
+ for (i = 0; i < vbuf_count; i++) {
+ struct svga_buffer *sbuf =
+ svga_buffer(hwtnl->cmd.vbufs[i].buffer.resource);
+
+ if (sbuf) {
+ vbuffer_handles[i] = svga_buffer_handle(svga, &sbuf->b.b,
+ PIPE_BIND_VERTEX_BUFFER);
+ assert(sbuf->key.flags & SVGA3D_SURFACE_BIND_VERTEX_BUFFER);
+ if (vbuffer_handles[i] == NULL)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ vbuffers[i] = &sbuf->b.b;
+ last_vbuf = i;
+ }
+ else {
+ vbuffers[i] = NULL;
+ vbuffer_handles[i] = NULL;
+ }
+ }
+ }
+
+ for (; i < svga->state.hw_draw.num_vbuffers; i++) {
+ vbuffers[i] = NULL;
+ vbuffer_handles[i] = NULL;
+ }
+
/* Get handle for each referenced vertex buffer */
for (i = 0; i < vbuf_count; i++) {
struct svga_buffer *sbuf =
@@ -558,14 +595,38 @@ validate_vertex_buffers(struct svga_hwtnl *hwtnl)
svga->state.hw_draw.layout_id = hwtnl->cmd.vdecl_layout_id;
}
+ /* Get handle for the stream out buffer */
+ if (so_vertex_count) {
+ so_vertex_count_handle = svga_buffer_handle(svga,
+ so_vertex_count->buffer,
+ (PIPE_BIND_VERTEX_BUFFER |
+ PIPE_BIND_STREAM_OUTPUT));
+ if (!so_vertex_count_handle)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+ else {
+ so_vertex_count_handle = NULL;
+ }
+
/* setup vertex buffers */
{
SVGA3dVertexBuffer vbuffer_attrs[PIPE_MAX_ATTRIBS];
- for (i = 0; i < vbuf_count; i++) {
- vbuffer_attrs[i].stride = hwtnl->cmd.vbufs[i].stride;
- vbuffer_attrs[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
- vbuffer_attrs[i].sid = 0;
+ if (so_vertex_count) {
+ /* Set IA slot0 input buffer to the SO buffer */
+ assert(vbuf_count == 1);
+ vbuffer_attrs[0].stride = hwtnl->cmd.vbufs[0].stride;
+ vbuffer_attrs[0].offset = hwtnl->cmd.vbufs[0].buffer_offset;
+ vbuffer_attrs[0].sid = 0;
+ vbuffers[0] = so_vertex_count->buffer;
+ vbuffer_handles[0] = so_vertex_count_handle;
+ }
+ else {
+ for (i = 0; i < vbuf_count; i++) {
+ vbuffer_attrs[i].stride = hwtnl->cmd.vbufs[i].stride;
+ vbuffer_attrs[i].offset = hwtnl->cmd.vbufs[i].buffer_offset;
+ vbuffer_attrs[i].sid = 0;
+ }
}
/* If any of the vertex buffer state has changed, issue
@@ -736,10 +797,14 @@ static enum pipe_error
draw_vgpu10(struct svga_hwtnl *hwtnl,
const SVGA3dPrimitiveRange *range,
unsigned vcount,
+ unsigned min_index, unsigned max_index,
struct pipe_resource *ib,
- unsigned start_instance, unsigned instance_count)
+ unsigned start_instance, unsigned instance_count,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_stream_output_target *so_vertex_count)
{
struct svga_context *svga = hwtnl->svga;
+ struct svga_winsys_surface *indirect_handle;
enum pipe_error ret;
assert(svga_have_vgpu10(svga));
@@ -779,7 +844,7 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
if (ret != PIPE_OK)
return ret;
- ret = validate_vertex_buffers(hwtnl);
+ ret = validate_vertex_buffers(hwtnl, so_vertex_count);
if (ret != PIPE_OK)
return ret;
@@ -789,6 +854,16 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
return ret;
}
+ if (indirect) {
+ indirect_handle = svga_buffer_handle(svga, indirect->buffer,
+ PIPE_BIND_COMMAND_ARGS_BUFFER);
+ if (!indirect_handle)
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+ else {
+ indirect_handle = NULL;
+ }
+
/* Set primitive type (line, tri, etc) */
if (svga->state.hw_draw.topology != range->primType) {
ret = SVGA3D_vgpu10_SetTopology(svga->swc, range->primType);
@@ -800,15 +875,18 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
if (ib) {
/* indexed drawing */
- if (instance_count > 1) {
+ if (indirect) {
+ ret = SVGA3D_sm5_DrawIndexedInstancedIndirect(svga->swc,
+ indirect_handle,
+ indirect->offset);
+ }
+ else if (instance_count > 1) {
ret = SVGA3D_vgpu10_DrawIndexedInstanced(svga->swc,
vcount,
instance_count,
0, /* startIndexLocation */
range->indexBias,
start_instance);
- if (ret != PIPE_OK)
- return ret;
}
else {
/* non-instanced drawing */
@@ -816,8 +894,9 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
vcount,
0, /* startIndexLocation */
range->indexBias);
- if (ret != PIPE_OK)
- return ret;
+ }
+ if (ret != PIPE_OK) {
+ return ret;
}
}
else {
@@ -835,22 +914,30 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
assert(svga->state.hw_draw.ib == NULL);
- if (instance_count > 1) {
+ if (so_vertex_count) {
+ /* Stream-output drawing */
+ ret = SVGA3D_vgpu10_DrawAuto(svga->swc);
+ }
+ else if (indirect) {
+ ret = SVGA3D_sm5_DrawInstancedIndirect(svga->swc,
+ indirect_handle,
+ indirect->offset);
+ }
+ else if (instance_count > 1) {
ret = SVGA3D_vgpu10_DrawInstanced(svga->swc,
vcount,
instance_count,
range->indexBias,
start_instance);
- if (ret != PIPE_OK)
- return ret;
}
else {
/* non-instanced */
ret = SVGA3D_vgpu10_Draw(svga->swc,
vcount,
range->indexBias);
- if (ret != PIPE_OK)
- return ret;
+ }
+ if (ret != PIPE_OK) {
+ return ret;
}
}
@@ -1044,14 +1131,20 @@ check_draw_params(struct svga_hwtnl *hwtnl,
/**
* All drawing filters down into this function, either directly
* on the hardware path or after doing software vertex processing.
+ * \param indirect if non-null, get the vertex count, first vertex, etc.
+ * from a buffer.
+ * \param so_vertex_count if non-null, get the vertex count from a
+ * stream-output target.
*/
enum pipe_error
svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
- const SVGA3dPrimitiveRange * range,
+ const SVGA3dPrimitiveRange *range,
unsigned vcount,
- unsigned min_index,
- unsigned max_index, struct pipe_resource *ib,
- unsigned start_instance, unsigned instance_count)
+ unsigned min_index, unsigned max_index,
+ struct pipe_resource *ib,
+ unsigned start_instance, unsigned instance_count,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_stream_output_target *so_vertex_count)
{
enum pipe_error ret = PIPE_OK;
@@ -1059,17 +1152,14 @@ svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
if (svga_have_vgpu10(hwtnl->svga)) {
/* draw immediately */
- ret = draw_vgpu10(hwtnl, range, vcount, ib,
- start_instance, instance_count);
- if (ret != PIPE_OK) {
- svga_context_flush(hwtnl->svga, NULL);
- ret = draw_vgpu10(hwtnl, range, vcount, ib,
- start_instance, instance_count);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(hwtnl->svga, draw_vgpu10(hwtnl, range, vcount, min_index,
+ max_index, ib, start_instance,
+ instance_count, indirect,
+ so_vertex_count));
}
else {
/* batch up drawing commands */
+ assert(indirect == NULL);
#ifdef DEBUG
check_draw_params(hwtnl, range, min_index, max_index, ib);
assert(start_instance == 0);
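
A hypothetical call site for the extended svga_hwtnl_prim() interface (a sketch, not from the patch): a non-indexed draw whose vertex and instance counts come from an indirect-arguments buffer.

SVGA3dPrimitiveRange range;
enum pipe_error ret;

memset(&range, 0, sizeof(range));
range.primType = SVGA3D_PRIMITIVE_TRIANGLELIST;

ret = svga_hwtnl_prim(hwtnl, &range,
                      0, 0, 0,   /* vcount, min/max index unused here */
                      NULL,      /* no index buffer */
                      0, 1,      /* start_instance, instance_count */
                      indirect,  /* counts read from indirect->buffer */
                      NULL);     /* not a DrawAuto-style draw */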
diff --git a/src/gallium/drivers/svga/svga_draw.h b/src/gallium/drivers/svga/svga_draw.h
index 9d79676d3f9..56d5127051d 100644
--- a/src/gallium/drivers/svga/svga_draw.h
+++ b/src/gallium/drivers/svga/svga_draw.h
@@ -60,7 +60,8 @@ svga_hwtnl_vertex_buffers(struct svga_hwtnl *hwtnl,
enum pipe_error
svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
- unsigned start_instance, unsigned instance_count);
+ unsigned start_instance, unsigned instance_count,
+ ubyte vertices_per_patch);
enum pipe_error
svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index 19d5e503137..af27e038bc8 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -175,13 +175,14 @@ done:
static enum pipe_error
simple_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
- unsigned start_instance, unsigned instance_count)
+ unsigned start_instance, unsigned instance_count,
+ ubyte vertices_per_patch)
{
SVGA3dPrimitiveRange range;
unsigned hw_prim;
unsigned hw_count;
- hw_prim = svga_translate_prim(prim, count, &hw_count);
+ hw_prim = svga_translate_prim(prim, count, &hw_count, vertices_per_patch);
if (hw_count == 0)
return PIPE_ERROR_BAD_INPUT;
@@ -200,14 +201,16 @@ simple_draw_arrays(struct svga_hwtnl *hwtnl,
*/
return svga_hwtnl_prim(hwtnl, &range, count,
0, count - 1, NULL,
- start_instance, instance_count);
+ start_instance, instance_count,
+ NULL, NULL);
}
enum pipe_error
svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start, unsigned count,
- unsigned start_instance, unsigned instance_count)
+ unsigned start_instance, unsigned instance_count,
+ ubyte vertices_per_patch)
{
enum pipe_prim_type gen_prim;
unsigned gen_size, gen_nr;
@@ -225,7 +228,7 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
}
if (svga->curr.rast->templ.flatshade &&
- svga->state.hw_draw.fs->constant_color_output) {
+ svga_fs_variant(svga->state.hw_draw.fs)->constant_color_output) {
/* The fragment color is a constant, not per-vertex so the whole
* primitive will be the same color (except for possible blending).
* We can ignore the current provoking vertex state and use whatever
@@ -273,7 +276,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
if (gen_type == U_GENERATE_LINEAR) {
ret = simple_draw_arrays(hwtnl, gen_prim, start, count,
- start_instance, instance_count);
+ start_instance, instance_count,
+ vertices_per_patch);
}
else {
struct pipe_resource *gen_buf = NULL;
@@ -299,7 +303,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
count - 1,
gen_prim, 0, gen_nr,
start_instance,
- instance_count);
+ instance_count,
+ vertices_per_patch);
}
if (gen_buf) {
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
index 41cd4d18993..b17fe44f747 100644
--- a/src/gallium/drivers/svga/svga_draw_elements.c
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -186,14 +186,15 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
enum pipe_prim_type prim, unsigned start,
unsigned count,
unsigned start_instance,
- unsigned instance_count)
+ unsigned instance_count,
+ ubyte vertices_per_patch)
{
SVGA3dPrimitiveRange range;
unsigned hw_prim;
unsigned hw_count;
unsigned index_offset = start * index_size;
- hw_prim = svga_translate_prim(prim, count, &hw_count);
+ hw_prim = svga_translate_prim(prim, count, &hw_count, vertices_per_patch);
if (hw_count == 0)
return PIPE_OK; /* nothing to draw */
@@ -206,7 +207,8 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
return svga_hwtnl_prim(hwtnl, &range, count,
min_index, max_index, index_buffer,
- start_instance, instance_count);
+ start_instance, instance_count,
+ NULL, NULL);
}
@@ -234,12 +236,20 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
&gen_size, &gen_nr, &gen_func);
}
else {
+ unsigned hw_pv;
+
+ /* There is no geometry ordering with PATCH, so no need to
+ * consider provoking vertex mode for the translation.
+ * So use the same api_pv as the hw_pv.
+ */
+ hw_pv = info->mode == PIPE_PRIM_PATCHES ? hwtnl->api_pv :
+ hwtnl->hw_pv;
gen_type = u_index_translator(svga_hw_prims,
info->mode,
info->index_size,
count,
hwtnl->api_pv,
- hwtnl->hw_pv,
+ hw_pv,
PR_DISABLE,
&gen_prim, &gen_size, &gen_nr, &gen_func);
}
@@ -271,7 +281,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
info->max_index,
gen_prim, index_offset, count,
info->start_instance,
- info->instance_count);
+ info->instance_count,
+ info->vertices_per_patch);
pipe_resource_reference(&index_buffer, NULL);
}
else {
@@ -299,7 +310,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
gen_prim, gen_offset,
gen_nr,
info->start_instance,
- info->instance_count);
+ info->instance_count,
+ info->vertices_per_patch);
}
if (gen_buf) {
diff --git a/src/gallium/drivers/svga/svga_draw_private.h b/src/gallium/drivers/svga/svga_draw_private.h
index 52a2c0f18b3..475ccc5aae0 100644
--- a/src/gallium/drivers/svga/svga_draw_private.h
+++ b/src/gallium/drivers/svga/svga_draw_private.h
@@ -52,7 +52,8 @@ static const unsigned svga_hw_prims =
(1 << PIPE_PRIM_LINES_ADJACENCY) |
(1 << PIPE_PRIM_LINE_STRIP_ADJACENCY) |
(1 << PIPE_PRIM_TRIANGLES_ADJACENCY) |
- (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY));
+ (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) |
+ (1 << PIPE_PRIM_PATCHES));
/**
@@ -64,7 +65,8 @@ static const unsigned svga_hw_prims =
* those to other types of primitives with index/translation code.
*/
static inline SVGA3dPrimitiveType
-svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count)
+svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count,
+ ubyte vertices_per_patch)
{
switch (mode) {
case PIPE_PRIM_POINTS:
@@ -107,6 +109,13 @@ svga_translate_prim(unsigned mode, unsigned vcount, unsigned *prim_count)
*prim_count = vcount / 2 - 2 ;
return SVGA3D_PRIMITIVE_TRIANGLESTRIP_ADJ;
+ case PIPE_PRIM_PATCHES:
+ *prim_count = vcount / vertices_per_patch ;
+ assert(vertices_per_patch >= 1);
+ assert(vertices_per_patch <= 32);
+ return (SVGA3D_PRIMITIVE_1_CONTROL_POINT_PATCH - 1)
+ + vertices_per_patch;
+
default:
assert(0);
*prim_count = 0;
@@ -218,7 +227,9 @@ svga_hwtnl_prim(struct svga_hwtnl *hwtnl,
unsigned min_index,
unsigned max_index,
struct pipe_resource *ib,
- unsigned start_instance, unsigned instance_count);
+ unsigned start_instance, unsigned instance_count,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_stream_output_target *so_vertex_count);
enum pipe_error
svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
@@ -231,6 +242,7 @@ svga_hwtnl_simple_draw_range_elements(struct svga_hwtnl *hwtnl,
unsigned start,
unsigned count,
unsigned start_instance,
- unsigned instance_count);
+ unsigned instance_count,
+ ubyte vertices_per_patch);
#endif
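
A small worked example of the PIPE_PRIM_PATCHES translation above (a sketch; it relies only on the control-point-patch primitive enums being consecutive, which the expression above already assumes):

/* 12 vertices drawn with 4 control points per patch:
 * prim_count = 12 / 4 = 3 patches, and the primitive type resolves to
 * (SVGA3D_PRIMITIVE_1_CONTROL_POINT_PATCH - 1) + 4, i.e. the
 * 4-control-point patch topology. */
unsigned prim_count;
SVGA3dPrimitiveType prim =
   svga_translate_prim(PIPE_PRIM_PATCHES, 12, &prim_count, 4);
assert(prim_count == 3);
assert(prim == SVGA3D_PRIMITIVE_1_CONTROL_POINT_PATCH + 3);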
diff --git a/src/gallium/drivers/svga/svga_format.c b/src/gallium/drivers/svga/svga_format.c
index 3f68f0cd67e..bb2f546d67d 100644
--- a/src/gallium/drivers/svga/svga_format.c
+++ b/src/gallium/drivers/svga/svga_format.c
@@ -71,10 +71,10 @@ static const struct vgpu10_format_entry format_conversion_table[] =
[ PIPE_FORMAT_Z32_FLOAT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D32_FLOAT, SVGA3D_D32_FLOAT, 0 },
[ PIPE_FORMAT_Z24_UNORM_S8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D24_UNORM_S8_UINT, 0 },
[ PIPE_FORMAT_Z24X8_UNORM ] = { SVGA3D_FORMAT_INVALID, SVGA3D_D24_UNORM_S8_UINT, SVGA3D_D24_UNORM_S8_UINT, 0 },
- [ PIPE_FORMAT_R32_FLOAT ] = { SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, TF_GEN_MIPS },
- [ PIPE_FORMAT_R32G32_FLOAT ] = { SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, TF_GEN_MIPS },
+ [ PIPE_FORMAT_R32_FLOAT ] = { SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, SVGA3D_R32_FLOAT, TF_GEN_MIPS },
+ [ PIPE_FORMAT_R32G32_FLOAT ] = { SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, SVGA3D_R32G32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32G32B32_FLOAT ] = { SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, SVGA3D_R32G32B32_FLOAT, TF_GEN_MIPS },
- [ PIPE_FORMAT_R32G32B32A32_FLOAT ] = { SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, TF_GEN_MIPS },
+ [ PIPE_FORMAT_R32G32B32A32_FLOAT ] = { SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, SVGA3D_R32G32B32A32_FLOAT, TF_GEN_MIPS },
[ PIPE_FORMAT_R32_USCALED ] = { SVGA3D_R32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
[ PIPE_FORMAT_R32G32_USCALED ] = { SVGA3D_R32G32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
[ PIPE_FORMAT_R32G32B32_USCALED ] = { SVGA3D_R32G32B32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, VF_U_TO_F_CAST },
@@ -176,11 +176,11 @@ static const struct vgpu10_format_entry format_conversion_table[] =
[ PIPE_FORMAT_R16G16B16A16_SINT ] = { SVGA3D_R16G16B16A16_SINT, SVGA3D_R16G16B16A16_SINT, SVGA3D_R16G16B16A16_SINT, 0 },
[ PIPE_FORMAT_R32_UINT ] = { SVGA3D_R32_UINT, SVGA3D_R32_UINT, SVGA3D_R32_UINT, 0 },
[ PIPE_FORMAT_R32G32_UINT ] = { SVGA3D_R32G32_UINT, SVGA3D_R32G32_UINT, SVGA3D_R32G32_UINT, 0 },
- [ PIPE_FORMAT_R32G32B32_UINT ] = { SVGA3D_R32G32B32_UINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ [ PIPE_FORMAT_R32G32B32_UINT ] = { SVGA3D_R32G32B32_UINT, SVGA3D_R32G32B32_UINT, SVGA3D_R32G32B32_UINT, 0 },
[ PIPE_FORMAT_R32G32B32A32_UINT ] = { SVGA3D_R32G32B32A32_UINT, SVGA3D_R32G32B32A32_UINT, SVGA3D_R32G32B32A32_UINT, 0 },
[ PIPE_FORMAT_R32_SINT ] = { SVGA3D_R32_SINT, SVGA3D_R32_SINT, SVGA3D_R32_SINT, 0 },
[ PIPE_FORMAT_R32G32_SINT ] = { SVGA3D_R32G32_SINT, SVGA3D_R32G32_SINT, SVGA3D_R32G32_SINT, 0 },
- [ PIPE_FORMAT_R32G32B32_SINT ] = { SVGA3D_R32G32B32_SINT, SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, 0 },
+ [ PIPE_FORMAT_R32G32B32_SINT ] = { SVGA3D_R32G32B32_SINT, SVGA3D_R32G32B32_SINT, SVGA3D_R32G32B32_SINT, 0 },
[ PIPE_FORMAT_R32G32B32A32_SINT ] = { SVGA3D_R32G32B32A32_SINT, SVGA3D_R32G32B32A32_SINT, SVGA3D_R32G32B32A32_SINT, 0 },
[ PIPE_FORMAT_A8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R8_UINT, TF_000X },
[ PIPE_FORMAT_I8_UINT ] = { SVGA3D_FORMAT_INVALID, SVGA3D_FORMAT_INVALID, SVGA3D_R8_UINT, TF_XXXX },
@@ -2137,7 +2137,7 @@ svga_is_format_supported(struct pipe_screen *screen,
}
if (util_format_is_srgb(format) &&
- (bindings & PIPE_BIND_DISPLAY_TARGET)) {
+ (bindings & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_RENDER_TARGET))) {
/* We only support sRGB rendering with vgpu10 */
return false;
}
@@ -2252,6 +2252,12 @@ svga_is_dx_format_supported(struct pipe_screen *screen,
return svga_format != SVGA3D_FORMAT_INVALID;
}
+ if (bindings & PIPE_BIND_SAMPLER_VIEW && target == PIPE_BUFFER) {
+ unsigned flags;
+ svga_translate_texture_buffer_view_format(format, &svga_format, &flags);
+ return svga_format != SVGA3D_FORMAT_INVALID;
+ }
+
svga_format = svga_translate_format(ss, format, bindings);
if (svga_format == SVGA3D_FORMAT_INVALID) {
return false;
diff --git a/src/gallium/drivers/svga/svga_link.c b/src/gallium/drivers/svga/svga_link.c
index 0bf40d153b7..c9861a7e481 100644
--- a/src/gallium/drivers/svga/svga_link.c
+++ b/src/gallium/drivers/svga/svga_link.c
@@ -87,6 +87,15 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
}
}
+ /* Find the index for position */
+ linkage->position_index = 0;
+ for (i = 0; i < outshader_info->num_outputs; i++) {
+ if (outshader_info->output_semantic_name[i] == TGSI_SEMANTIC_POSITION) {
+ linkage->position_index = i;
+ break;
+ }
+ }
+
linkage->num_inputs = inshader_info->num_inputs;
/* Things like the front-face register are handled here */
@@ -100,7 +109,8 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
/* Debug */
if (SVGA_DEBUG & DEBUG_TGSI) {
- unsigned reg = 0;
+ uint64_t reg = 0;
+ uint64_t one = 1;
debug_printf("### linkage info: num_inputs=%d input_map_max=%d\n",
linkage->num_inputs, linkage->input_map_max);
@@ -116,10 +126,8 @@ svga_link_shaders(const struct tgsi_shader_info *outshader_info,
tgsi_interpolate_names[inshader_info->input_interpolate[i]]);
/* make sure no repeating register index */
- if (reg & 1 << linkage->input_map[i]) {
- assert(0);
- }
- reg |= 1 << linkage->input_map[i];
+ assert((reg & (one << linkage->input_map[i])) == 0);
+ reg |= one << linkage->input_map[i];
}
}
}
diff --git a/src/gallium/drivers/svga/svga_link.h b/src/gallium/drivers/svga/svga_link.h
index c21686eef59..8d3517ea28a 100644
--- a/src/gallium/drivers/svga/svga_link.h
+++ b/src/gallium/drivers/svga/svga_link.h
@@ -9,6 +9,7 @@ struct svga_context;
struct shader_linkage
{
unsigned num_inputs;
+ unsigned position_index; /* position register index */
unsigned input_map_max; /* highest index of mapped inputs */
ubyte input_map[PIPE_MAX_SHADER_INPUTS];
};
diff --git a/src/gallium/drivers/svga/svga_pipe_blend.c b/src/gallium/drivers/svga/svga_pipe_blend.c
index b5557d31f44..e24a6beb0e4 100644
--- a/src/gallium/drivers/svga/svga_pipe_blend.c
+++ b/src/gallium/drivers/svga/svga_pipe_blend.c
@@ -92,6 +92,51 @@ svga_translate_blend_func(unsigned mode)
/**
+ * Translate gallium logicop mode to SVGA3D logicop mode.
+ */
+static int
+translate_logicop(enum pipe_logicop op)
+{
+ switch (op) {
+ case PIPE_LOGICOP_CLEAR:
+ return SVGA3D_DX11_LOGICOP_CLEAR;
+ case PIPE_LOGICOP_NOR:
+ return SVGA3D_DX11_LOGICOP_NOR;
+ case PIPE_LOGICOP_AND_INVERTED:
+ return SVGA3D_DX11_LOGICOP_AND_INVERTED;
+ case PIPE_LOGICOP_COPY_INVERTED:
+ return SVGA3D_DX11_LOGICOP_COPY_INVERTED;
+ case PIPE_LOGICOP_AND_REVERSE:
+ return SVGA3D_DX11_LOGICOP_AND_REVERSE;
+ case PIPE_LOGICOP_INVERT:
+ return SVGA3D_DX11_LOGICOP_INVERT;
+ case PIPE_LOGICOP_XOR:
+ return SVGA3D_DX11_LOGICOP_XOR;
+ case PIPE_LOGICOP_NAND:
+ return SVGA3D_DX11_LOGICOP_NAND;
+ case PIPE_LOGICOP_AND:
+ return SVGA3D_DX11_LOGICOP_AND;
+ case PIPE_LOGICOP_EQUIV:
+ return SVGA3D_DX11_LOGICOP_EQUIV;
+ case PIPE_LOGICOP_NOOP:
+ return SVGA3D_DX11_LOGICOP_NOOP;
+ case PIPE_LOGICOP_OR_INVERTED:
+ return SVGA3D_DX11_LOGICOP_OR_INVERTED;
+ case PIPE_LOGICOP_COPY:
+ return SVGA3D_DX11_LOGICOP_COPY;
+ case PIPE_LOGICOP_OR_REVERSE:
+ return SVGA3D_DX11_LOGICOP_OR_REVERSE;
+ case PIPE_LOGICOP_OR:
+ return SVGA3D_DX11_LOGICOP_OR;
+ case PIPE_LOGICOP_SET:
+ return SVGA3D_DX11_LOGICOP_SET;
+ default:
+ return SVGA3D_DX11_LOGICOP_COPY;
+ }
+};
+
+
+/**
* Define a vgpu10 blend state object for the given
* svga blend state.
*/
@@ -100,7 +145,6 @@ define_blend_state_object(struct svga_context *svga,
struct svga_blend_state *bs)
{
SVGA3dDXBlendStatePerRT perRT[SVGA3D_MAX_RENDER_TARGETS];
- unsigned try;
int i;
assert(svga_have_vgpu10(svga));
@@ -116,31 +160,141 @@ define_blend_state_object(struct svga_context *svga,
perRT[i].destBlendAlpha = bs->rt[i].dstblend_alpha;
perRT[i].blendOpAlpha = bs->rt[i].blendeq_alpha;
perRT[i].renderTargetWriteMask = bs->rt[i].writemask;
- perRT[i].logicOpEnable = 0;
- perRT[i].logicOp = SVGA3D_LOGICOP_COPY;
+ perRT[i].logicOpEnable = bs->logicop_enabled;
+ perRT[i].logicOp = bs->logicop_mode;
}
- /* Loop in case command buffer is full and we need to flush and retry */
- for (try = 0; try < 2; try++) {
- enum pipe_error ret;
-
- ret = SVGA3D_vgpu10_DefineBlendState(svga->swc,
- bs->id,
- bs->alpha_to_coverage,
- bs->independent_blend_enable,
- perRT);
- if (ret == PIPE_OK)
- return;
- svga_context_flush(svga, NULL);
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DefineBlendState(svga->swc,
+ bs->id,
+ bs->alpha_to_coverage,
+ bs->independent_blend_enable,
+ perRT));
+}
+
+
+/**
+ * If SVGA3D_DEVCAP_LOGIC_BLENDOPS is false, we can't directly implement
+ * GL's logicops. But we can emulate some of them. We set up the blending
+ * state for that here.
+ */
+static void
+emulate_logicop(struct svga_context *svga,
+ unsigned logicop_func,
+ struct svga_blend_state *blend,
+ unsigned buffer)
+{
+ switch (logicop_func) {
+ case PIPE_LOGICOP_XOR:
+ case PIPE_LOGICOP_INVERT:
+ blend->need_white_fragments = TRUE;
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_SUBTRACT;
+ break;
+ case PIPE_LOGICOP_CLEAR:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
+ break;
+ case PIPE_LOGICOP_COPY:
+ blend->rt[buffer].blend_enable = FALSE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
+ break;
+ case PIPE_LOGICOP_COPY_INVERTED:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
+ break;
+ case PIPE_LOGICOP_NOOP:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
+ break;
+ case PIPE_LOGICOP_SET:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
+ break;
+ case PIPE_LOGICOP_AND:
+ /* Approximate with minimum - works for the 0 & anything case: */
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
+ break;
+ case PIPE_LOGICOP_AND_REVERSE:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
+ break;
+ case PIPE_LOGICOP_AND_INVERTED:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MINIMUM;
+ break;
+ case PIPE_LOGICOP_OR:
+ /* Approximate with maximum - works for the 1 | anything case: */
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
+ break;
+ case PIPE_LOGICOP_OR_REVERSE:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
+ break;
+ case PIPE_LOGICOP_OR_INVERTED:
+ blend->rt[buffer].blend_enable = TRUE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
+ break;
+ case PIPE_LOGICOP_NAND:
+ case PIPE_LOGICOP_NOR:
+ case PIPE_LOGICOP_EQUIV:
+ /* Fill these in with plausible values */
+ blend->rt[buffer].blend_enable = FALSE;
+ blend->rt[buffer].srcblend = SVGA3D_BLENDOP_ONE;
+ blend->rt[buffer].dstblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[buffer].blendeq = SVGA3D_BLENDEQ_ADD;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ blend->rt[buffer].srcblend_alpha = blend->rt[buffer].srcblend;
+ blend->rt[buffer].dstblend_alpha = blend->rt[buffer].dstblend;
+ blend->rt[buffer].blendeq_alpha = blend->rt[buffer].blendeq;
+
+ if (logicop_func == PIPE_LOGICOP_XOR) {
+ pipe_debug_message(&svga->debug.callback, CONFORMANCE,
+ "XOR logicop mode has limited support");
+ }
+ else if (logicop_func != PIPE_LOGICOP_COPY) {
+ pipe_debug_message(&svga->debug.callback, CONFORMANCE,
+ "general logicops are not supported");
}
}
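For PIPE_LOGICOP_INVERT (and the XOR approximation), the setup above forces white fragments and a ONE/ONE SUBTRACT blend, so the framebuffer ends up holding 1 - dst. The standalone sketch below works through that arithmetic on normalized colors; it is an illustration of the emulation, not driver code.

#include <stdio.h>

/* Emulated INVERT via blending, matching the PIPE_LOGICOP_INVERT setup above:
 * the shader is forced to output white (1.0), the blend factors are ONE/ONE
 * and the equation is SUBTRACT, so dst' = 1*white - 1*dst = 1 - dst.
 * Colors are normalized floats here; illustration only.
 */
static float blend_subtract(float src, float src_factor,
                            float dst, float dst_factor)
{
   float r = src * src_factor - dst * dst_factor;
   if (r < 0.0f) r = 0.0f;          /* hardware clamps the blend result */
   if (r > 1.0f) r = 1.0f;
   return r;
}

int main(void)
{
   float dst = 0.25f;               /* existing color in the render target */
   float white = 1.0f;              /* forced fragment color */
   float out = blend_subtract(white, 1.0f, dst, 1.0f);
   printf("dst=%.2f -> %.2f (expected %.2f)\n", dst, out, 1.0f - dst);
   return 0;
}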
+
static void *
svga_create_blend_state(struct pipe_context *pipe,
const struct pipe_blend_state *templ)
{
struct svga_context *svga = svga_context(pipe);
+ struct svga_screen *ss = svga_screen(pipe->screen);
struct svga_blend_state *blend = CALLOC_STRUCT( svga_blend_state );
unsigned i;
@@ -166,107 +320,18 @@ svga_create_blend_state(struct pipe_context *pipe,
* top of D3D9 API. Instead we try to simulate with various blend modes.
*/
if (templ->logicop_enable) {
- switch (templ->logicop_func) {
- case PIPE_LOGICOP_XOR:
- case PIPE_LOGICOP_INVERT:
- blend->need_white_fragments = TRUE;
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_SUBTRACT;
- break;
- case PIPE_LOGICOP_CLEAR:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
- break;
- case PIPE_LOGICOP_COPY:
- blend->rt[i].blend_enable = FALSE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
- break;
- case PIPE_LOGICOP_COPY_INVERTED:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
- break;
- case PIPE_LOGICOP_NOOP:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
- break;
- case PIPE_LOGICOP_SET:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
- break;
- case PIPE_LOGICOP_AND:
- /* Approximate with minimum - works for the 0 & anything case: */
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
- break;
- case PIPE_LOGICOP_AND_REVERSE:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
- break;
- case PIPE_LOGICOP_AND_INVERTED:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MINIMUM;
- break;
- case PIPE_LOGICOP_OR:
- /* Approximate with maximum - works for the 1 | anything case: */
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
- break;
- case PIPE_LOGICOP_OR_REVERSE:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_SRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_INVDESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
- break;
- case PIPE_LOGICOP_OR_INVERTED:
- blend->rt[i].blend_enable = TRUE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_INVSRCCOLOR;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_DESTCOLOR;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_MAXIMUM;
- break;
- case PIPE_LOGICOP_NAND:
- case PIPE_LOGICOP_NOR:
- case PIPE_LOGICOP_EQUIV:
- /* Fill these in with plausible values */
- blend->rt[i].blend_enable = FALSE;
- blend->rt[i].srcblend = SVGA3D_BLENDOP_ONE;
- blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
- blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
- break;
- default:
- assert(0);
- break;
- }
- blend->rt[i].srcblend_alpha = blend->rt[i].srcblend;
- blend->rt[i].dstblend_alpha = blend->rt[i].dstblend;
- blend->rt[i].blendeq_alpha = blend->rt[i].blendeq;
-
- if (templ->logicop_func == PIPE_LOGICOP_XOR) {
- pipe_debug_message(&svga->debug.callback, CONFORMANCE,
- "XOR logicop mode has limited support");
+ if (ss->haveBlendLogicops) {
+ blend->logicop_enabled = TRUE;
+ blend->logicop_mode = translate_logicop(templ->logicop_func);
+ blend->rt[i].blendeq = SVGA3D_BLENDEQ_ADD;
+ blend->rt[i].blendeq_alpha = SVGA3D_BLENDEQ_ADD;
+ blend->rt[i].srcblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[i].dstblend = SVGA3D_BLENDOP_ZERO;
+ blend->rt[i].srcblend_alpha = SVGA3D_BLENDOP_ZERO;
+ blend->rt[i].dstblend_alpha = SVGA3D_BLENDOP_ZERO;
}
- else if (templ->logicop_func != PIPE_LOGICOP_COPY) {
- pipe_debug_message(&svga->debug.callback, CONFORMANCE,
- "general logicops are not supported");
+ else {
+ emulate_logicop(svga, templ->logicop_func, blend, i);
}
}
else {
@@ -374,14 +439,7 @@ static void svga_delete_blend_state(struct pipe_context *pipe,
(struct svga_blend_state *) blend;
if (svga_have_vgpu10(svga) && bs->id != SVGA3D_INVALID_ID) {
- enum pipe_error ret;
-
- ret = SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyBlendState(svga->swc, bs->id));
if (bs->id == svga->state.hw_draw.blend_id)
svga->state.hw_draw.blend_id = SVGA3D_INVALID_ID;
diff --git a/src/gallium/drivers/svga/svga_pipe_blit.c b/src/gallium/drivers/svga/svga_pipe_blit.c
index 31806ceb1e1..a756509ce76 100644
--- a/src/gallium/drivers/svga/svga_pipe_blit.c
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -80,7 +80,6 @@ intra_surface_copy(struct svga_context *svga, struct pipe_resource *tex,
unsigned dst_x, unsigned dst_y, unsigned dst_z,
unsigned width, unsigned height, unsigned depth)
{
- enum pipe_error ret;
SVGA3dCopyBox box;
struct svga_texture *stex;
@@ -102,15 +101,8 @@ intra_surface_copy(struct svga_context *svga, struct pipe_resource *tex,
box.srcy = src_y;
box.srcz = src_z;
- ret = SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc,
- stex->handle, level, layer_face, &box);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc,
- stex->handle, level, layer_face, &box);
- assert(ret == PIPE_OK);
- }
-
+ SVGA_RETRY(svga, SVGA3D_vgpu10_IntraSurfaceCopy(svga->swc, stex->handle,
+ level, layer_face, &box));
/* Mark the texture subresource as rendered-to. */
svga_set_texture_rendered_to(stex, layer_face, level);
}
@@ -630,11 +622,13 @@ try_blit(struct svga_context *svga, const struct pipe_blit_info *blit_info)
util_blitter_save_vertex_elements(svga->blitter, (void*)svga->curr.velems);
util_blitter_save_vertex_shader(svga->blitter, svga->curr.vs);
util_blitter_save_geometry_shader(svga->blitter, svga->curr.user_gs);
+ util_blitter_save_tessctrl_shader(svga->blitter, svga->curr.tcs);
+ util_blitter_save_tesseval_shader(svga->blitter, svga->curr.tes);
util_blitter_save_so_targets(svga->blitter, svga->num_so_targets,
(struct pipe_stream_output_target**)svga->so_targets);
util_blitter_save_rasterizer(svga->blitter, (void*)svga->curr.rast);
- util_blitter_save_viewport(svga->blitter, &svga->curr.viewport);
- util_blitter_save_scissor(svga->blitter, &svga->curr.scissor);
+ util_blitter_save_viewport(svga->blitter, &svga->curr.viewport[0]);
+ util_blitter_save_scissor(svga->blitter, &svga->curr.scissor[0]);
util_blitter_save_fragment_shader(svga->blitter, svga->curr.fs);
util_blitter_save_blend(svga->blitter, (void*)svga->curr.blend);
util_blitter_save_depth_stencil_alpha(svga->blitter,
@@ -835,7 +829,6 @@ svga_resource_copy_region(struct pipe_context *pipe,
if (dst_tex->target == PIPE_BUFFER && src_tex->target == PIPE_BUFFER) {
/* can't copy within the same buffer, unfortunately */
if (svga_have_vgpu10(svga) && src_tex != dst_tex) {
- enum pipe_error ret;
struct svga_winsys_surface *src_surf;
struct svga_winsys_surface *dst_surf;
struct svga_buffer *dbuffer = svga_buffer(dst_tex);
@@ -844,15 +837,9 @@ svga_resource_copy_region(struct pipe_context *pipe,
src_surf = svga_buffer_handle(svga, src_tex, sbuffer->bind_flags);
dst_surf = svga_buffer_handle(svga, dst_tex, dbuffer->bind_flags);
- ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
- src_box->x, dstx, src_box->width);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf, dst_surf,
- src_box->x, dstx, src_box->width);
- assert(ret == PIPE_OK);
- }
-
+ SVGA_RETRY(svga, SVGA3D_vgpu10_BufferCopy(svga->swc, src_surf,
+ dst_surf, src_box->x, dstx,
+ src_box->width));
dbuffer->dirty = TRUE;
}
else {
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
index 89a9b533f91..490f91b5fc9 100644
--- a/src/gallium/drivers/svga/svga_pipe_clear.c
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -45,11 +45,13 @@ begin_blit(struct svga_context *svga)
util_blitter_save_vertex_elements(svga->blitter, (void*)svga->curr.velems);
util_blitter_save_vertex_shader(svga->blitter, svga->curr.vs);
util_blitter_save_geometry_shader(svga->blitter, svga->curr.gs);
+ util_blitter_save_tessctrl_shader(svga->blitter, svga->curr.tcs);
+ util_blitter_save_tesseval_shader(svga->blitter, svga->curr.tes);
util_blitter_save_so_targets(svga->blitter, svga->num_so_targets,
(struct pipe_stream_output_target**)svga->so_targets);
util_blitter_save_rasterizer(svga->blitter, (void*)svga->curr.rast);
- util_blitter_save_viewport(svga->blitter, &svga->curr.viewport);
- util_blitter_save_scissor(svga->blitter, &svga->curr.scissor);
+ util_blitter_save_viewport(svga->blitter, &svga->curr.viewport[0]);
+ util_blitter_save_scissor(svga->blitter, &svga->curr.scissor[0]);
util_blitter_save_fragment_shader(svga->blitter, svga->curr.fs);
util_blitter_save_blend(svga->blitter, (void*)svga->curr.blend);
util_blitter_save_depth_stencil_alpha(svga->blitter,
@@ -248,15 +250,7 @@ svga_clear(struct pipe_context *pipe, unsigned buffers, const struct pipe_scisso
/* flush any queued prims (don't want them to appear after the clear!) */
svga_hwtnl_flush_retry(svga);
- ret = try_clear( svga, buffers, color, depth, stencil );
-
- if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
- /* Flush command buffer and retry:
- */
- svga_context_flush( svga, NULL );
-
- ret = try_clear( svga, buffers, color, depth, stencil );
- }
+ SVGA_RETRY_OOM(svga, ret, try_clear( svga, buffers, color, depth, stencil));
/*
* Mark target surfaces as dirty
@@ -277,7 +271,6 @@ svga_clear_texture(struct pipe_context *pipe,
{
struct svga_context *svga = svga_context(pipe);
struct svga_surface *svga_surface_dst;
- enum pipe_error ret;
struct pipe_surface tmpl;
struct pipe_surface *surface;
@@ -309,8 +302,8 @@ svga_clear_texture(struct pipe_context *pipe,
stencil = 0;
}
else {
- util_format_unpack_z_float(surface->format, &depth, data, 1);
- util_format_unpack_s_8uint(surface->format, &stencil, data, 1);
+ desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+ desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
}
if (util_format_has_depth(desc)) {
@@ -334,17 +327,9 @@ svga_clear_texture(struct pipe_context *pipe,
/* clearing whole surface, use direct VGPU10 command */
- ret = SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
- clear_flags,
- stencil, depth);
- if (ret != PIPE_OK) {
- /* flush and try again */
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
- clear_flags,
- stencil, depth);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_ClearDepthStencilView(svga->swc, dsv,
+ clear_flags,
+ stencil, depth));
}
else {
/* To clear subtexture use software fallback */
@@ -367,7 +352,18 @@ svga_clear_texture(struct pipe_context *pipe,
color.f[0] = color.f[1] = color.f[2] = color.f[3] = 0;
}
else {
- util_format_unpack_rgba(surface->format, color.ui, data, 1);
+ if (util_format_is_pure_sint(surface->format)) {
+ /* signed integer */
+ desc->unpack_rgba_sint(color.i, 0, data, 0, 1, 1);
+ }
+ else if (util_format_is_pure_uint(surface->format)) {
+ /* unsigned integer */
+ desc->unpack_rgba_uint(color.ui, 0, data, 0, 1, 1);
+ }
+ else {
+ /* floating point */
+ desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
+ }
}
/* Setup render target view */
@@ -390,14 +386,8 @@ svga_clear_texture(struct pipe_context *pipe,
}
else {
/* clearing whole surface using VGPU10 command */
- ret = SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
- color.f);
- if (ret != PIPE_OK) {
- svga_context_flush(svga,NULL);
- ret = SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
- color.f);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_ClearRenderTargetView(svga->swc, rtv,
+ color.f));
}
}
else {
@@ -526,13 +516,9 @@ svga_clear_render_target(struct pipe_context *pipe,
height);
} else {
enum pipe_error ret;
-
- ret = svga_try_clear_render_target(svga, dst, color);
- if (ret == PIPE_ERROR_OUT_OF_MEMORY) {
- svga_context_flush( svga, NULL );
- ret = svga_try_clear_render_target(svga, dst, color);
- }
-
+
+ SVGA_RETRY_OOM(svga, ret, svga_try_clear_render_target(svga, dst,
+ color));
assert (ret == PIPE_OK);
}
svga_toggle_render_condition(svga, render_condition_enabled, TRUE);
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 5ebd17cf0ea..e6fabfc995e 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -24,12 +24,16 @@
**********************************************************/
+#include "util/u_draw.h"
+#include "util/format/u_format.h"
#include "util/u_helpers.h"
#include "util/u_inlines.h"
#include "util/u_prim.h"
#include "util/u_prim_restart.h"
#include "svga_context.h"
+#include "svga_draw_private.h"
+#include "svga_screen.h"
#include "svga_draw.h"
#include "svga_shader.h"
#include "svga_surface.h"
@@ -37,60 +41,138 @@
#include "svga_debug.h"
#include "svga_resource_buffer.h"
-/* Returns TRUE if we are currently using flat shading.
- */
-static boolean
-is_using_flat_shading(const struct svga_context *svga)
-{
- return
- svga->state.hw_draw.fs ? svga->state.hw_draw.fs->uses_flat_interp : FALSE;
-}
-
static enum pipe_error
retry_draw_range_elements(struct svga_context *svga,
const struct pipe_draw_info *info,
unsigned count)
{
- enum pipe_error ret;
-
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_DRAWELEMENTS);
- ret = svga_hwtnl_draw_range_elements(svga->hwtnl, info, count);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = svga_hwtnl_draw_range_elements(svga->hwtnl, info, count);
- }
+ SVGA_RETRY(svga, svga_hwtnl_draw_range_elements(svga->hwtnl, info, count));
- assert (ret == PIPE_OK);
SVGA_STATS_TIME_POP(svga_sws(svga));
- return ret;
+ return PIPE_OK;
}
static enum pipe_error
-retry_draw_arrays(struct svga_context *svga,
- enum pipe_prim_type prim, unsigned start, unsigned count,
- unsigned start_instance, unsigned instance_count)
+retry_draw_arrays( struct svga_context *svga,
+ enum pipe_prim_type prim, unsigned start, unsigned count,
+ unsigned start_instance, unsigned instance_count,
+ ubyte vertices_per_patch)
{
enum pipe_error ret;
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_DRAWARRAYS);
- for (unsigned try = 0; try < 2; try++) {
- ret = svga_hwtnl_draw_arrays(svga->hwtnl, prim, start, count,
- start_instance, instance_count);
- if (ret == PIPE_OK)
- break;
- svga_context_flush(svga, NULL);
- }
-
+ SVGA_RETRY_OOM(svga, ret, svga_hwtnl_draw_arrays(svga->hwtnl, prim, start,
+ count, start_instance,
+ instance_count,
+ vertices_per_patch));
SVGA_STATS_TIME_POP(svga_sws(svga));
return ret;
}
/**
+ * Auto draw (get vertex count from a transform feedback result).
+ */
+static enum pipe_error
+retry_draw_auto(struct svga_context *svga,
+ const struct pipe_draw_info *info)
+{
+ assert(svga_have_sm5(svga));
+ assert(info->count_from_stream_output);
+ assert(info->instance_count == 1);
+ /* SO drawing implies core profile and none of these prim types */
+ assert(info->mode != PIPE_PRIM_QUADS &&
+ info->mode != PIPE_PRIM_QUAD_STRIP &&
+ info->mode != PIPE_PRIM_POLYGON);
+
+ if (info->mode == PIPE_PRIM_LINE_LOOP) {
+ /* XXX need to do a fallback */
+ assert(!"draw auto fallback not supported yet");
+ return PIPE_OK;
+ }
+ else {
+ SVGA3dPrimitiveRange range;
+ unsigned hw_count;
+
+ range.primType = svga_translate_prim(info->mode, 12, &hw_count,
+ info->vertices_per_patch);
+ range.primitiveCount = 0;
+ range.indexArray.surfaceId = SVGA3D_INVALID_ID;
+ range.indexArray.offset = 0;
+ range.indexArray.stride = 0;
+ range.indexWidth = 0;
+ range.indexBias = 0;
+
+ SVGA_RETRY(svga, svga_hwtnl_prim
+ (svga->hwtnl, &range,
+ 0, /* vertex count comes from SO buffer */
+ 0, /* don't know min index */
+ ~0u, /* don't know max index */
+ NULL, /* no index buffer */
+ 0, /* start instance */
+ 1, /* only 1 instance supported */
+ NULL, /* indirect drawing info */
+ info->count_from_stream_output));
+
+ return PIPE_OK;
+ }
+}
+
+
+/**
+ * Indirect draw (get vertex count, start index, etc. from a buffer object).
+ */
+static enum pipe_error
+retry_draw_indirect(struct svga_context *svga,
+ const struct pipe_draw_info *info)
+{
+ assert(svga_have_sm5(svga));
+ assert(info->indirect);
+ /* indirect drawing implies core profile and none of these prim types */
+ assert(info->mode != PIPE_PRIM_QUADS &&
+ info->mode != PIPE_PRIM_QUAD_STRIP &&
+ info->mode != PIPE_PRIM_POLYGON);
+
+ if (info->mode == PIPE_PRIM_LINE_LOOP) {
+ /* need to do a fallback */
+ util_draw_indirect(&svga->pipe, info);
+ return PIPE_OK;
+ }
+ else {
+ SVGA3dPrimitiveRange range;
+ unsigned hw_count;
+
+ range.primType = svga_translate_prim(info->mode, 12, &hw_count,
+ info->vertices_per_patch);
+ range.primitiveCount = 0; /* specified in indirect buffer */
+ range.indexArray.surfaceId = SVGA3D_INVALID_ID;
+ range.indexArray.offset = 0;
+ range.indexArray.stride = 0;
+ range.indexWidth = info->index_size;
+ range.indexBias = 0; /* specified in indirect buffer */
+
+ SVGA_RETRY(svga, svga_hwtnl_prim
+ (svga->hwtnl, &range,
+ 0, /* vertex count is in indirect buffer */
+ 0, /* don't know min index */
+ ~0u, /* don't know max index */
+ info->index.resource,
+ info->start_instance,
+ 0, /* don't know instance count */
+ info->indirect,
+ NULL)); /* SO vertex count */
+
+ return PIPE_OK;
+ }
+}
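The LINE_LOOP case falls back to util_draw_indirect, which reads the draw parameters out of the application-supplied buffer on the CPU. The record below is modeled on the GL DrawElementsIndirectCommand layout; treat the exact field list as an assumption for illustration, not an SVGA device format.

#include <stdint.h>
#include <stdio.h>

/* Indirect-draw argument record, modeled on the GL DrawElementsIndirectCommand
 * layout. Assumed layout for illustration only.
 */
struct indirect_elements_args {
   uint32_t count;            /* indices per instance */
   uint32_t instance_count;   /* number of instances */
   uint32_t first_index;      /* starting index within the index buffer */
   int32_t  base_vertex;      /* value added to each fetched index */
   uint32_t base_instance;    /* first instance id */
};

int main(void)
{
   struct indirect_elements_args args = { 36, 4, 0, 0, 0 };
   printf("draw %u indices, %u instance(s)\n", args.count, args.instance_count);
   return 0;
}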
+
+
+/**
* Determine if we need to implement primitive restart with a fallback
* path which breaks the original primitive into sub-primitive at the
* restart indexes.
@@ -116,6 +198,21 @@ need_fallback_prim_restart(const struct svga_context *svga,
}
+/**
+ * A helper function to compute the vertex count from the primitive count
+ * reported by the stream output statistics query for the specified stream.
+ */
+static unsigned
+get_vcount_from_stream_output(struct svga_context *svga,
+ const struct pipe_draw_info *info,
+ unsigned stream)
+{
+ unsigned primcount;
+ primcount = svga_get_primcount_from_stream_output(svga, stream);
+ return u_vertices_for_prims(info->mode, primcount);
+}
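u_vertices_for_prims() converts the primitive count reported by the statistics query back into a vertex count for the draw. A sketch of the conversion for a few common topologies (assumed formulas; the real helper lives in util/u_prim.h):

#include <stdio.h>

/* Primitive-count -> vertex-count conversion for a few common topologies. */
enum prim { PRIM_POINTS, PRIM_LINES, PRIM_LINE_STRIP,
            PRIM_TRIANGLES, PRIM_TRIANGLE_STRIP };

static unsigned vertices_for_prims(enum prim mode, unsigned prims)
{
   if (prims == 0)
      return 0;
   switch (mode) {
   case PRIM_POINTS:         return prims;
   case PRIM_LINES:          return prims * 2;
   case PRIM_LINE_STRIP:     return prims + 1;
   case PRIM_TRIANGLES:      return prims * 3;
   case PRIM_TRIANGLE_STRIP: return prims + 2;
   }
   return 0;
}

int main(void)
{
   /* e.g. 10 triangles written by the previous streamout pass */
   printf("triangles: %u verts\n", vertices_for_prims(PRIM_TRIANGLES, 10));
   printf("strip:     %u verts\n", vertices_for_prims(PRIM_TRIANGLE_STRIP, 10));
   return 0;
}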
+
+
static void
svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
@@ -147,6 +244,18 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
svga->dirty |= SVGA_NEW_REDUCED_PRIMITIVE;
}
+ if (svga->curr.vertices_per_patch != info->vertices_per_patch) {
+ svga->curr.vertices_per_patch = info->vertices_per_patch;
+
+ /* If input patch size changes, we need to notify the TCS
+ * code to reevaluate the shader variant since the
+ * vertices per patch count is a constant in the control
+ * point count declaration.
+ */
+ if (svga->curr.tcs || svga->curr.tes)
+ svga->dirty |= SVGA_NEW_TCS_PARAM;
+ }
+
if (need_fallback_prim_restart(svga, info)) {
enum pipe_error r;
r = util_draw_vbo_without_prim_restart(pipe, info);
@@ -155,7 +264,8 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
goto done;
}
- if (!u_trim_pipe_prim(info->mode, &count))
+ if (!info->indirect && !info->count_from_stream_output &&
+ !u_trim_pipe_prim(info->mode, &count))
goto done;
needed_swtnl = svga->state.sw.need_swtnl;
@@ -189,20 +299,53 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
}
svga_hwtnl_set_fillmode(svga->hwtnl, svga->curr.rast->hw_fillmode);
+ svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
+
/** determine if flatshade is to be used after svga_update_state()
* in case the fragment shader is changed.
*/
svga_hwtnl_set_flatshade(svga->hwtnl,
svga->curr.rast->templ.flatshade ||
- is_using_flat_shading(svga),
+ svga_is_using_flat_shading(svga),
svga->curr.rast->templ.flatshade_first);
- if (info->index_size) {
+ if (info->count_from_stream_output) {
+ unsigned stream = 0;
+ assert(count == 0);
+
+ /* If the vertex count is from the stream output of a non-zero stream
+ * or the draw info specifies instancing, we will need a workaround
+ * since the draw_auto command does not support stream instancing.
+ * The workaround requires querying the vertex count from the
+ * stream output statistics query for the specified stream and then
+ * fallback to the regular draw function.
+ */
+
+ /* Check the stream index of the specified stream output target */
+ for (unsigned i = 0; i < ARRAY_SIZE(svga->so_targets); i++) {
+ if (svga->vcount_so_targets[i] == info->count_from_stream_output) {
+ stream = (svga->vcount_buffer_stream >> (i * 4)) & 0xf;
+ break;
+ }
+ }
+ if (info->instance_count > 1 || stream > 0) {
+ count = get_vcount_from_stream_output(svga, info, stream);
+ }
+ }
+
+ if (info->count_from_stream_output && count == 0) {
+ ret = retry_draw_auto(svga, info);
+ }
+ else if (info->indirect) {
+ ret = retry_draw_indirect(svga, info);
+ }
+ else if (info->index_size) {
ret = retry_draw_range_elements(svga, info, count);
}
else {
ret = retry_draw_arrays(svga, info->mode, info->start, count,
- info->start_instance, info->instance_count);
+ info->start_instance, info->instance_count,
+ info->vertices_per_patch);
}
}
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
index a2f00b1d290..7795afbfe1f 100644
--- a/src/gallium/drivers/svga/svga_pipe_fs.c
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -37,7 +37,7 @@
#include "svga_shader.h"
-static void *
+void *
svga_create_fs_state(struct pipe_context *pipe,
const struct pipe_shader_state *templ)
{
@@ -69,7 +69,7 @@ svga_create_fs_state(struct pipe_context *pipe,
}
-static void
+void
svga_bind_fs_state(struct pipe_context *pipe, void *shader)
{
struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
@@ -85,6 +85,7 @@ svga_delete_fs_state(struct pipe_context *pipe, void *shader)
{
struct svga_context *svga = svga_context(pipe);
struct svga_fragment_shader *fs = (struct svga_fragment_shader *) shader;
+ struct svga_fragment_shader *next_fs;
struct svga_shader_variant *variant, *tmp;
enum pipe_error ret;
@@ -92,27 +93,32 @@ svga_delete_fs_state(struct pipe_context *pipe, void *shader)
assert(fs->base.parent == NULL);
- draw_delete_fragment_shader(svga->swtnl.draw, fs->draw_shader);
+ while (fs) {
+ next_fs = (struct svga_fragment_shader *) fs->base.next;
+
+ draw_delete_fragment_shader(svga->swtnl.draw, fs->draw_shader);
- for (variant = fs->base.variants; variant; variant = tmp) {
- tmp = variant->next;
+ for (variant = fs->base.variants; variant; variant = tmp) {
+ tmp = variant->next;
- /* Check if deleting currently bound shader */
- if (variant == svga->state.hw_draw.fs) {
- ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
+ /* Check if deleting currently bound shader */
+ if (variant == svga->state.hw_draw.fs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
- assert(ret == PIPE_OK);
+ if (ret != PIPE_OK) {
+ svga_context_flush(svga, NULL);
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_PS, NULL);
+ assert(ret == PIPE_OK);
+ }
+ svga->state.hw_draw.fs = NULL;
}
- svga->state.hw_draw.fs = NULL;
+
+ svga_destroy_shader_variant(svga, variant);
}
- svga_destroy_shader_variant(svga, variant);
+ FREE((void *)fs->base.tokens);
+ FREE(fs);
+ fs = next_fs;
}
-
- FREE((void *)fs->base.tokens);
- FREE(fs);
}
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
index e0d1e51f412..61b4897c5d6 100644
--- a/src/gallium/drivers/svga/svga_pipe_misc.c
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -40,9 +40,16 @@ svga_set_scissor_states(struct pipe_context *pipe,
unsigned num_scissors,
const struct pipe_scissor_state *scissors)
{
+ ASSERTED struct svga_screen *svgascreen = svga_screen(pipe->screen);
struct svga_context *svga = svga_context(pipe);
+ unsigned i, num_sc;
+
+ assert(start_slot + num_scissors <= svgascreen->max_viewports);
+
+ for (i = 0, num_sc = start_slot; i < num_scissors; i++) {
+ svga->curr.scissor[num_sc++] = scissors[i]; /* struct copy */
+ }
- memcpy(&svga->curr.scissor, scissors, sizeof(*scissors));
svga->dirty |= SVGA_NEW_SCISSOR;
}
@@ -199,8 +206,14 @@ svga_set_viewport_states(struct pipe_context *pipe,
const struct pipe_viewport_state *viewports)
{
struct svga_context *svga = svga_context(pipe);
+ ASSERTED struct svga_screen *svgascreen = svga_screen(pipe->screen);
+ unsigned i, num_vp;
- svga->curr.viewport = *viewports; /* struct copy */
+ assert(start_slot + num_viewports <= svgascreen->max_viewports);
+
+ for (i = 0, num_vp = start_slot; i < num_viewports; i++) {
+ svga->curr.viewport[num_vp++] = viewports[i]; /* struct copy */
+ }
svga->dirty |= SVGA_NEW_VIEWPORT;
}
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 1b9b17e2a8e..38874deb414 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -50,6 +50,7 @@ struct svga_query {
SVGA3dQueryType svga_type; /**< SVGA3D_QUERYTYPE_x or unused */
unsigned id; /** Per-context query identifier */
+ boolean active; /** TRUE if query is active */
struct pipe_fence_handle *fence;
@@ -214,10 +215,10 @@ get_query_result_vgpu9(struct svga_context *svga, struct svga_query *sq,
* will hold queries of the same type. Multiple memory blocks can be allocated
* for a particular query type.
*
- * Currently each memory block is of 184 bytes. We support up to 128
+ * Currently each memory block is of 184 bytes. We support up to 512
* memory blocks. The query memory size is arbitrary right now.
* Each occlusion query takes about 8 bytes. One memory block can accommodate
- * 23 occlusion queries. 128 of those blocks can support up to 2944 occlusion
+ * 23 occlusion queries. 512 of those blocks can support up to 11K occlusion
* queries. That seems reasonable for now. If we think this limit is
* not enough, we can increase the limit or try to grow the mob in runtime.
* Note, SVGA device does not impose one mob per context for queries,
@@ -228,7 +229,7 @@ get_query_result_vgpu9(struct svga_context *svga, struct svga_query *sq,
* following commands: DXMoveQuery, DXBindAllQuery & DXReadbackAllQuery.
*/
#define SVGA_QUERY_MEM_BLOCK_SIZE (sizeof(SVGADXQueryResultUnion) * 2)
-#define SVGA_QUERY_MEM_SIZE (128 * SVGA_QUERY_MEM_BLOCK_SIZE)
+#define SVGA_QUERY_MEM_SIZE (512 * SVGA_QUERY_MEM_BLOCK_SIZE)
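The sizing in the comment above can be checked with a few lines of arithmetic; the 8-byte occlusion result size is the figure quoted there, and the 184-byte block size is what SVGA_QUERY_MEM_BLOCK_SIZE works out to in this tree.

#include <stdio.h>

/* Worked arithmetic for the query memory sizing described above. */
int main(void)
{
   const unsigned block_size = 184;        /* SVGA_QUERY_MEM_BLOCK_SIZE */
   const unsigned num_blocks = 512;        /* blocks in SVGA_QUERY_MEM_SIZE */
   const unsigned occlusion_result = 8;    /* bytes per occlusion query */

   unsigned per_block = block_size / occlusion_result;   /* 23 */
   unsigned total = per_block * num_blocks;              /* 11776, i.e. ~11K */

   printf("%u occlusion queries per block, %u total\n", per_block, total);
   return 0;
}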
struct svga_qmem_alloc_entry
{
@@ -243,31 +244,34 @@ struct svga_qmem_alloc_entry
/**
* Allocate a memory block from the query object memory
- * \return -1 if out of memory, else index of the query memory block
+ * \return NULL if out of memory, else pointer to the query memory block
*/
-static int
+static struct svga_qmem_alloc_entry *
allocate_query_block(struct svga_context *svga)
{
int index;
unsigned offset;
+ struct svga_qmem_alloc_entry *alloc_entry = NULL;
/* Find the next available query block */
index = util_bitmask_add(svga->gb_query_alloc_mask);
if (index == UTIL_BITMASK_INVALID_INDEX)
- return -1;
+ return NULL;
offset = index * SVGA_QUERY_MEM_BLOCK_SIZE;
if (offset >= svga->gb_query_len) {
unsigned i;
+ /* Deallocate the out-of-range index */
+ util_bitmask_clear(svga->gb_query_alloc_mask, index);
+ index = -1;
+
/**
* All the memory blocks are allocated, let's see if there is
* any empty memory block around that can be freed up.
*/
- index = -1;
for (i = 0; i < SVGA3D_QUERYTYPE_MAX && index == -1; i++) {
- struct svga_qmem_alloc_entry *alloc_entry;
struct svga_qmem_alloc_entry *prev_alloc_entry = NULL;
alloc_entry = svga->gb_query_map[i];
@@ -286,9 +290,20 @@ allocate_query_block(struct svga_context *svga)
}
}
}
+
+ if (index == -1) {
+ debug_printf("Query memory object is full\n");
+ return NULL;
+ }
}
- return index;
+ if (!alloc_entry) {
+ assert(index != -1);
+ alloc_entry = CALLOC_STRUCT(svga_qmem_alloc_entry);
+ alloc_entry->block_index = index;
+ }
+
+ return alloc_entry;
}
/**
@@ -346,17 +361,14 @@ allocate_query_block_entry(struct svga_context *svga,
unsigned len)
{
struct svga_qmem_alloc_entry *alloc_entry;
- int block_index = -1;
- block_index = allocate_query_block(svga);
- if (block_index == -1)
- return NULL;
- alloc_entry = CALLOC_STRUCT(svga_qmem_alloc_entry);
+ alloc_entry = allocate_query_block(svga);
if (!alloc_entry)
return NULL;
- alloc_entry->block_index = block_index;
- alloc_entry->start_offset = block_index * SVGA_QUERY_MEM_BLOCK_SIZE;
+ assert(alloc_entry->block_index != -1);
+ alloc_entry->start_offset =
+ alloc_entry->block_index * SVGA_QUERY_MEM_BLOCK_SIZE;
alloc_entry->nquery = 0;
alloc_entry->alloc_mask = util_bitmask_create();
alloc_entry->next = NULL;
@@ -508,17 +520,16 @@ define_query_vgpu10(struct svga_context *svga,
sq->gb_query = svga->gb_query;
- /* Allocate an integer ID for this query */
- sq->id = util_bitmask_add(svga->query_id_bm);
- if (sq->id == UTIL_BITMASK_INVALID_INDEX)
- return PIPE_ERROR_OUT_OF_MEMORY;
+ /* Make sure query length is in multiples of 8 bytes */
+ qlen = align(resultLen + sizeof(SVGA3dQueryState), 8);
/* Find a slot for this query in the gb object */
- qlen = resultLen + sizeof(SVGA3dQueryState);
sq->offset = allocate_query(svga, sq->svga_type, qlen);
if (sq->offset == -1)
return PIPE_ERROR_OUT_OF_MEMORY;
+ assert((sq->offset & 7) == 0);
+
SVGA_DBG(DEBUG_QUERY, " query type=%d qid=0x%x offset=%d\n",
sq->svga_type, sq->id, sq->offset);
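The align() call above rounds each query slot length up to an 8-byte boundary before the slot is placed in the guest-backed query buffer. A minimal sketch of that rounding (the usual power-of-two trick, which is what the util helper does for these arguments):

#include <assert.h>
#include <stdio.h>

/* Round x up to the next multiple of a power-of-two alignment, as done by
 * align(resultLen + sizeof(SVGA3dQueryState), 8) above.
 */
static unsigned align_pot(unsigned x, unsigned a)
{
   assert((a & (a - 1)) == 0);        /* alignment must be a power of two */
   return (x + a - 1) & ~(a - 1);
}

int main(void)
{
   printf("%u -> %u\n", 12u, align_pot(12, 8));   /* 12 -> 16 */
   printf("%u -> %u\n", 16u, align_pot(16, 8));   /* 16 -> 16 */
   return 0;
}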
@@ -731,7 +742,19 @@ svga_create_query(struct pipe_context *pipe,
case PIPE_QUERY_PRIMITIVES_EMITTED:
case PIPE_QUERY_SO_STATISTICS:
assert(svga_have_vgpu10(svga));
- sq->svga_type = SVGA3D_QUERYTYPE_STREAMOUTPUTSTATS;
+
+ /* Until the device supports the new query type for multiple streams,
+ * we will use the single stream query type for stream 0.
+ */
+ if (svga_have_sm5(svga) && index > 0) {
+ assert(index < 4);
+
+ sq->svga_type = SVGA3D_QUERYTYPE_SOSTATS_STREAM0 + index;
+ }
+ else {
+ assert(index == 0);
+ sq->svga_type = SVGA3D_QUERYTYPE_STREAMOUTPUTSTATS;
+ }
ret = define_query_vgpu10(svga, sq,
sizeof(SVGADXStreamOutStatisticsQueryResult));
if (ret != PIPE_OK)
@@ -969,7 +992,10 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
assert(!"unexpected query type in svga_begin_query()");
}
- svga->sq[sq->type] = sq;
+ SVGA_DBG(DEBUG_QUERY, "%s sq=0x%x id=%d type=%d svga_type=%d\n",
+ __FUNCTION__, sq, sq->id, sq->type, sq->svga_type);
+
+ sq->active = TRUE;
return true;
}
@@ -988,12 +1014,12 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
SVGA_DBG(DEBUG_QUERY, "%s sq=0x%x id=%d\n", __FUNCTION__,
sq, sq->id);
- if (sq->type == PIPE_QUERY_TIMESTAMP && svga->sq[sq->type] != sq)
+ if (sq->type == PIPE_QUERY_TIMESTAMP && !sq->active)
svga_begin_query(pipe, q);
svga_hwtnl_flush_retry(svga);
- assert(svga->sq[sq->type] == sq);
+ assert(sq->active);
switch (sq->type) {
case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -1083,7 +1109,7 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
default:
assert(!"unexpected query type in svga_end_query()");
}
- svga->sq[sq->type] = NULL;
+ sq->active = FALSE;
return true;
}
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index 7d5936fa1ec..7764a855391 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -118,6 +118,9 @@ define_rasterizer_object(struct svga_context *svga,
rast->templ.line_stipple_factor : 0;
const uint16 line_pattern = rast->templ.line_stipple_enable ?
rast->templ.line_stipple_pattern : 0;
+ const uint8 pv_last = !rast->templ.flatshade_first &&
+ svgascreen->haveProvokingVertex;
+
unsigned try;
rast->id = util_bitmask_add(svga->rast_object_id_bm);
@@ -194,7 +197,18 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
rast->templ.point_smooth = TRUE;
}
- if (templ->point_smooth) {
+ if (rast->templ.point_smooth &&
+ rast->templ.point_size_per_vertex == 0 &&
+ rast->templ.point_size <= screen->pointSmoothThreshold) {
+ /* If the point size does not exceed the threshold, disable smoothing.
+ * Note that this only affects point rendering when we use the
+ * pipe_rasterizer_state::point_size value, not when the point size
+ * is set in the VS.
+ */
+ rast->templ.point_smooth = FALSE;
+ }
+
+ if (rast->templ.point_smooth) {
/* For smooth points we need to generate fragments for at least
* a 2x2 region. Otherwise the quad we draw may be too small and
* we may generate no fragments at all.
@@ -237,7 +251,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
}
}
- if (!svga_have_vgpu10(svga) && templ->point_smooth) {
+ if (!svga_have_vgpu10(svga) && rast->templ.point_smooth) {
rast->need_pipeline |= SVGA_PIPELINE_FLAG_POINTS;
rast->need_pipeline_points_str = "smooth points";
}
diff --git a/src/gallium/drivers/svga/svga_pipe_streamout.c b/src/gallium/drivers/svga/svga_pipe_streamout.c
index 0c6c034751c..380ceaa3aa7 100644
--- a/src/gallium/drivers/svga/svga_pipe_streamout.c
+++ b/src/gallium/drivers/svga/svga_pipe_streamout.c
@@ -44,6 +44,89 @@ svga_stream_output_target(struct pipe_stream_output_target *s)
return (struct svga_stream_output_target *)s;
}
+
+/**
+ * A helper function to send the appropriate version of the DefineStreamOutput
+ * command depending on whether the device is SM5 capable or not.
+ */
+static enum pipe_error
+svga_define_stream_output(struct svga_context *svga,
+ SVGA3dStreamOutputId soid,
+ uint32 numOutputStreamEntries,
+ uint32 numOutputStreamStrides,
+ uint32 streamStrides[SVGA3D_DX_MAX_SOTARGETS],
+ const SVGA3dStreamOutputDeclarationEntry decls[SVGA3D_MAX_STREAMOUT_DECLS],
+ uint32 rasterizedStream,
+ struct svga_stream_output *streamout)
+{
+ unsigned i;
+
+ SVGA_DBG(DEBUG_STREAMOUT, "%s: id=%d\n", __FUNCTION__, soid);
+ SVGA_DBG(DEBUG_STREAMOUT,
+ "numOutputStreamEntries=%d\n", numOutputStreamEntries);
+
+ for (i = 0; i < numOutputStreamEntries; i++) {
+ SVGA_DBG(DEBUG_STREAMOUT,
+ " %d: slot=%d regIdx=%d regMask=0x%x stream=%d\n",
+ i, decls[i].outputSlot, decls[i].registerIndex,
+ decls[i].registerMask, decls[i].stream);
+ }
+
+ SVGA_DBG(DEBUG_STREAMOUT,
+ "numOutputStreamStrides=%d\n", numOutputStreamStrides);
+ for (i = 0; i < numOutputStreamStrides; i++) {
+ SVGA_DBG(DEBUG_STREAMOUT, " %d ", streamStrides[i]);
+ }
+ SVGA_DBG(DEBUG_STREAMOUT, "\n");
+
+ if (svga_have_sm5(svga) &&
+ (numOutputStreamEntries > SVGA3D_MAX_DX10_STREAMOUT_DECLS ||
+ numOutputStreamStrides > 1)) {
+ unsigned bufSize = sizeof(SVGA3dStreamOutputDeclarationEntry)
+ * numOutputStreamEntries;
+ struct svga_winsys_buffer *declBuf;
+ struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
+ void *map;
+
+ declBuf = svga_winsys_buffer_create(svga, 1, SVGA_BUFFER_USAGE_PINNED,
+ bufSize);
+ if (!declBuf)
+ return PIPE_ERROR;
+ map = sws->buffer_map(sws, declBuf, PIPE_TRANSFER_WRITE);
+ if (!map) {
+ sws->buffer_destroy(sws, declBuf);
+ return PIPE_ERROR;
+ }
+
+ /* copy decls to buffer */
+ memcpy(map, decls, bufSize);
+
+ /* unmap buffer */
+ sws->buffer_unmap(sws, declBuf);
+ streamout->declBuf = declBuf;
+
+ SVGA_RETRY(svga, SVGA3D_sm5_DefineAndBindStreamOutput
+ (svga->swc, soid,
+ numOutputStreamEntries,
+ numOutputStreamStrides,
+ streamStrides,
+ streamout->declBuf,
+ rasterizedStream,
+ bufSize));
+ } else {
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DefineStreamOutput(svga->swc, soid,
+ numOutputStreamEntries,
+ streamStrides,
+ decls));
+ }
+
+ return PIPE_OK;
+}
+
+
+/**
+ * Creates stream output from the stream output info.
+ */
struct svga_stream_output *
svga_create_stream_output(struct svga_context *svga,
struct svga_shader *shader,
@@ -52,9 +135,13 @@ svga_create_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout;
SVGA3dStreamOutputDeclarationEntry decls[SVGA3D_MAX_STREAMOUT_DECLS];
unsigned strides[SVGA3D_DX_MAX_SOTARGETS];
+ unsigned dstOffset[SVGA3D_DX_MAX_SOTARGETS];
+ unsigned numStreamStrides = 0;
+ unsigned numDecls;
unsigned i;
enum pipe_error ret;
unsigned id;
+ ASSERTED unsigned maxDecls;
assert(info->num_outputs <= PIPE_MAX_SO_OUTPUTS);
@@ -64,7 +151,12 @@ svga_create_stream_output(struct svga_context *svga,
if (!svga_have_vgpu10(svga))
return NULL;
- assert(info->num_outputs <= SVGA3D_MAX_STREAMOUT_DECLS);
+ if (svga_have_sm5(svga))
+ maxDecls = SVGA3D_MAX_STREAMOUT_DECLS;
+ else if (svga_have_vgpu10(svga))
+ maxDecls = SVGA3D_MAX_DX10_STREAMOUT_DECLS;
+
+ assert(info->num_outputs <= maxDecls);
/* Allocate an integer ID for the stream output */
id = util_bitmask_add(svga->stream_output_id_bm);
@@ -81,15 +173,17 @@ svga_create_stream_output(struct svga_context *svga,
streamout->info = *info;
streamout->id = id;
streamout->pos_out_index = -1;
+ streamout->streammask = 0;
- SVGA_DBG(DEBUG_STREAMOUT, "%s, num_outputs=%d id=%d\n", __FUNCTION__,
- info->num_outputs, id);
-
- /* init whole decls and stride arrays to zero to avoid garbage values */
+ /* Init whole decls and stride arrays to zero to avoid garbage values */
memset(decls, 0, sizeof(decls));
memset(strides, 0, sizeof(strides));
+ memset(dstOffset, 0, sizeof(dstOffset));
+
+ SVGA_DBG(DEBUG_STREAMOUT, "%s: num_outputs=%d\n",
+ __FUNCTION__, info->num_outputs);
- for (i = 0; i < info->num_outputs; i++) {
+ for (i = 0, numDecls = 0; i < info->num_outputs; i++, numDecls++) {
unsigned reg_idx = info->output[i].register_index;
unsigned buf_idx = info->output[i].output_buffer;
const enum tgsi_semantic sem_name =
@@ -97,17 +191,59 @@ svga_create_stream_output(struct svga_context *svga,
assert(buf_idx <= PIPE_MAX_SO_BUFFERS);
+ numStreamStrides = MAX2(numStreamStrides, buf_idx);
+
+ SVGA_DBG(DEBUG_STREAMOUT,
+ " %d: register_index=%d output_buffer=%d stream=%d\n",
+ i, reg_idx, buf_idx, info->output[i].stream);
+
+ SVGA_DBG(DEBUG_STREAMOUT,
+ " dst_offset=%d start_component=%d num_components=%d\n",
+ info->output[i].dst_offset,
+ info->output[i].start_component,
+ info->output[i].num_components);
+
+ streamout->buffer_stream |= info->output[i].stream << (buf_idx * 4);
+
+ /**
+ * Check if the destination offset of the current output
+ * is at the expected offset. If it is greater, then that means
+ * there is a gap in the stream output. We need to insert
+ * extra declaration entries with an invalid register index
+ * to specify a gap.
+ */
+ while (info->output[i].dst_offset > dstOffset[buf_idx]) {
+
+ unsigned numComponents = info->output[i].dst_offset -
+ dstOffset[buf_idx];
+
+ assert(svga_have_sm5(svga));
+
+ /* We can only specify at most 4 components to skip in each
+ * declaration entry.
+ */
+ numComponents = numComponents > 4 ? 4 : numComponents;
+
+ decls[numDecls].outputSlot = buf_idx;
+ decls[numDecls].stream = info->output[i].stream;
+ decls[numDecls].registerIndex = SVGA3D_INVALID_ID;
+ decls[numDecls].registerMask = (1 << numComponents) - 1;
+
+ dstOffset[buf_idx] += numComponents;
+ numDecls++;
+ }
+
if (sem_name == TGSI_SEMANTIC_POSITION) {
/**
* Check if streaming out POSITION. If so, replace the
* register index with the index for NON_ADJUSTED POSITION.
*/
- decls[i].registerIndex = shader->info.num_outputs;
+ decls[numDecls].registerIndex = shader->info.num_outputs;
/* Save this output index, so we can tell later if this stream output
* includes an output of a vertex position
*/
- streamout->pos_out_index = i;
+ streamout->pos_out_index = numDecls;
}
else if (sem_name == TGSI_SEMANTIC_CLIPDIST) {
/**
@@ -116,44 +252,49 @@ svga_create_stream_output(struct svga_context *svga,
* It's valid to write to ClipDistance variable for non-enabled
* clip planes.
*/
- decls[i].registerIndex = shader->info.num_outputs + 1 +
- shader->info.output_semantic_index[reg_idx];
+ decls[numDecls].registerIndex =
+ shader->info.num_outputs + 1 +
+ shader->info.output_semantic_index[reg_idx];
}
else {
- decls[i].registerIndex = reg_idx;
+ decls[numDecls].registerIndex = reg_idx;
}
- decls[i].outputSlot = buf_idx;
- decls[i].registerMask =
+ decls[numDecls].outputSlot = buf_idx;
+ decls[numDecls].registerMask =
((1 << info->output[i].num_components) - 1)
<< info->output[i].start_component;
- SVGA_DBG(DEBUG_STREAMOUT, "%d slot=%d regIdx=%d regMask=0x%x\n",
- i, decls[i].outputSlot, decls[i].registerIndex,
- decls[i].registerMask);
+ decls[numDecls].stream = info->output[i].stream;
+ assert(decls[numDecls].stream == 0 || svga_have_sm5(svga));
+
+ /* Set the bit in streammask for the enabled stream */
+ streamout->streammask |= 1 << info->output[i].stream;
+
+ /* Update the expected offset for the next output */
+ dstOffset[buf_idx] += info->output[i].num_components;
strides[buf_idx] = info->stride[buf_idx] * sizeof(float);
}
- ret = SVGA3D_vgpu10_DefineStreamOutput(svga->swc, id,
- info->num_outputs,
- strides,
- decls);
+ assert(numDecls <= maxDecls);
+
+ /* Send the DefineStreamOutput command.
+ * Note, rasterizedStream is always 0.
+ */
+ ret = svga_define_stream_output(svga, id,
+ numDecls, numStreamStrides+1,
+ strides, decls, 0, streamout);
+
if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DefineStreamOutput(svga->swc, id,
- info->num_outputs,
- strides,
- decls);
- if (ret != PIPE_OK) {
- util_bitmask_clear(svga->stream_output_id_bm, id);
- FREE(streamout);
- streamout = NULL;
- }
+ util_bitmask_clear(svga->stream_output_id_bm, id);
+ FREE(streamout);
+ streamout = NULL;
}
return streamout;
}
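The gap handling in svga_create_stream_output can be pictured with a standalone sketch: whenever an output's dst_offset is ahead of the running offset for its buffer, filler declarations with an invalid register index are emitted, skipping at most four components each. The structure and names below are illustrative stand-ins for the SVGA3d declaration entries.

#include <stdio.h>

/* Skip components in the streamout buffer by emitting declarations with an
 * invalid register index, at most 4 components per filler entry.
 */
#define INVALID_REG 0xffffffffu

struct decl { unsigned slot, reg, mask; };

int main(void)
{
   struct decl decls[16];
   unsigned ndecls = 0;
   unsigned cur_offset = 2;     /* components already written to buffer 0 */
   unsigned dst_offset = 9;     /* where the next real output wants to land */

   while (dst_offset > cur_offset) {
      unsigned skip = dst_offset - cur_offset;
      if (skip > 4)
         skip = 4;
      decls[ndecls].slot = 0;
      decls[ndecls].reg = INVALID_REG;
      decls[ndecls].mask = (1u << skip) - 1;
      cur_offset += skip;
      ndecls++;
   }

   /* prints 2: one entry skipping 4 components, one skipping 3 */
   printf("%u filler declaration(s) emitted\n", ndecls);
   return 0;
}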
+
enum pipe_error
svga_set_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout)
@@ -168,12 +309,28 @@ svga_set_stream_output(struct svga_context *svga,
streamout, id);
if (svga->current_so != streamout) {
+
+ /* Before unbinding the current stream output, stop the stream output
+ * statistics queries for the active streams.
+ */
+ if (svga_have_sm5(svga) && svga->current_so) {
+ svga->vcount_buffer_stream = svga->current_so->buffer_stream;
+ svga_end_stream_output_queries(svga, svga->current_so->streammask);
+ }
+
enum pipe_error ret = SVGA3D_vgpu10_SetStreamOutput(svga->swc, id);
if (ret != PIPE_OK) {
return ret;
}
svga->current_so = streamout;
+
+ /* After binding the new stream output, start the stream output
+ * statistics queries for the active streams.
+ */
+ if (svga_have_sm5(svga) && svga->current_so) {
+ svga_begin_stream_output_queries(svga, svga->current_so->streammask);
+ }
}
return PIPE_OK;
@@ -183,17 +340,18 @@ void
svga_delete_stream_output(struct svga_context *svga,
struct svga_stream_output *streamout)
{
- enum pipe_error ret;
+ struct svga_winsys_screen *sws = svga_screen(svga->pipe.screen)->sws;
SVGA_DBG(DEBUG_STREAMOUT, "%s streamout=0x%x\n", __FUNCTION__, streamout);
assert(svga_have_vgpu10(svga));
assert(streamout != NULL);
- ret = SVGA3D_vgpu10_DestroyStreamOutput(svga->swc, streamout->id);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DestroyStreamOutput(svga->swc, streamout->id);
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyStreamOutput(svga->swc,
+ streamout->id));
+
+ if (svga_have_sm5(svga) && streamout->declBuf) {
+ sws->buffer_destroy(sws, streamout->declBuf);
}
/* Release the ID */
@@ -203,6 +361,7 @@ svga_delete_stream_output(struct svga_context *svga,
FREE(streamout);
}
+
static struct pipe_stream_output_target *
svga_create_stream_output_target(struct pipe_context *pipe,
struct pipe_resource *buffer,
@@ -252,9 +411,9 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
{
struct svga_context *svga = svga_context(pipe);
struct SVGA3dSoTarget soBindings[SVGA3D_DX_MAX_SOTARGETS];
- enum pipe_error ret;
unsigned i;
unsigned num_so_targets;
+ boolean begin_so_queries = num_targets > 0;
SVGA_DBG(DEBUG_STREAMOUT, "%s num_targets=%d\n", __FUNCTION__,
num_targets);
@@ -269,6 +428,14 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
sbuf->dirty = TRUE;
}
+ /* Before the currently bound streamout targets are unbound,
+ * save them in case they need to be referenced to retrieve the
+ * number of vertices being streamed out.
+ */
+ for (i = 0; i < ARRAY_SIZE(svga->so_targets); i++) {
+ svga->vcount_so_targets[i] = svga->so_targets[i];
+ }
+
assert(num_targets <= SVGA3D_DX_MAX_SOTARGETS);
for (i = 0; i < num_targets; i++) {
@@ -283,7 +450,16 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
& SVGA3D_SURFACE_BIND_STREAM_OUTPUT);
svga->so_targets[i] = &sot->base;
- soBindings[i].offset = sot->base.buffer_offset;
+ if (offsets[i] == -1) {
+ soBindings[i].offset = -1;
+
+ /* The streamout is being resumed. There is no need to restart streamout statistics
+ * queries for the draw-auto fallback since those queries are still active.
+ */
+ begin_so_queries = FALSE;
+ }
+ else
+ soBindings[i].offset = sot->base.buffer_offset + offsets[i];
/* The size cannot extend beyond the end of the buffer. Clamp it. */
size = MIN2(sot->base.buffer_size,
@@ -299,15 +475,22 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
}
num_so_targets = MAX2(svga->num_so_targets, num_targets);
- ret = SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
- soBindings, svga->so_surfaces);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
- soBindings, svga->so_surfaces);
- }
-
+ SVGA_RETRY(svga, SVGA3D_vgpu10_SetSOTargets(svga->swc, num_so_targets,
+ soBindings, svga->so_surfaces));
svga->num_so_targets = num_targets;
+
+ if (svga_have_sm5(svga) && svga->current_so && begin_so_queries) {
+
+ /* If there are already active queries and we need to start a new streamout,
+ * we need to stop the current active queries first.
+ */
+ if (svga->in_streamout) {
+ svga_end_stream_output_queries(svga, svga->current_so->streammask);
+ }
+
+ /* Start stream out statistics queries for the new streamout */
+ svga_begin_stream_output_queries(svga, svga->current_so->streammask);
+ }
}
/**
@@ -329,6 +512,7 @@ svga_rebind_stream_output_targets(struct svga_context *svga)
return PIPE_OK;
}
+
void
svga_init_stream_output_functions(struct svga_context *svga)
{
@@ -336,3 +520,117 @@ svga_init_stream_output_functions(struct svga_context *svga)
svga->pipe.stream_output_target_destroy = svga_destroy_stream_output_target;
svga->pipe.set_stream_output_targets = svga_set_stream_output_targets;
}
+
+
+/**
+ * A helper function to create stream output statistics queries for each stream.
+ * These queries are created as a workaround for DrawTransformFeedbackInstanced or
+ * DrawTransformFeedbackStreamInstanced when auto draw doesn't support
+ * instancing or non-0 stream. In this case, the vertex count will
+ * be retrieved from the stream output statistics query.
+ */
+void
+svga_create_stream_output_queries(struct svga_context *svga)
+{
+ unsigned i;
+
+ if (!svga_have_sm5(svga))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
+ svga->so_queries[i] = svga->pipe.create_query(&svga->pipe,
+ PIPE_QUERY_SO_STATISTICS, i);
+ assert(svga->so_queries[i] != NULL);
+ }
+}
+
+
+/**
+ * Destroy the stream output statistics queries for the draw-auto workaround.
+ */
+void
+svga_destroy_stream_output_queries(struct svga_context *svga)
+{
+ unsigned i;
+
+ if (!svga_have_sm5(svga))
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
+ svga->pipe.destroy_query(&svga->pipe, svga->so_queries[i]);
+ }
+}
+
+
+/**
+ * Start stream output statistics queries for the active streams.
+ */
+void
+svga_begin_stream_output_queries(struct svga_context *svga,
+ unsigned streammask)
+{
+ assert(svga_have_sm5(svga));
+ assert(!svga->in_streamout);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
+ bool ret;
+ if (streammask & (1 << i)) {
+ ret = svga->pipe.begin_query(&svga->pipe, svga->so_queries[i]);
+ }
+ (void) ret;
+ }
+ svga->in_streamout = TRUE;
+
+ return;
+}
+
+
+/**
+ * Stop stream output statistics queries for the active streams.
+ */
+void
+svga_end_stream_output_queries(struct svga_context *svga,
+ unsigned streammask)
+{
+ assert(svga_have_sm5(svga));
+
+ if (!svga->in_streamout)
+ return;
+
+ for (unsigned i = 0; i < ARRAY_SIZE(svga->so_queries); i++) {
+ bool ret;
+ if (streammask & (1 << i)) {
+ ret = svga->pipe.end_query(&svga->pipe, svga->so_queries[i]);
+ }
+ (void) ret;
+ }
+ svga->in_streamout = FALSE;
+
+ return;
+}
+
+
+/**
+ * Return the primitive count returned from the stream output statistics query
+ * for the specified stream.
+ */
+unsigned
+svga_get_primcount_from_stream_output(struct svga_context *svga,
+ unsigned stream)
+{
+ unsigned primcount = 0;
+ union pipe_query_result result;
+ bool ret;
+
+ if (svga->current_so) {
+ svga_end_stream_output_queries(svga, svga->current_so->streammask);
+ }
+
+ ret = svga->pipe.get_query_result(&svga->pipe,
+ svga->so_queries[stream],
+ TRUE, &result);
+ if (ret)
+ primcount = result.so_statistics.num_primitives_written;
+
+ return primcount;
+}
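A small sketch of the 4-bits-per-buffer packing used by buffer_stream/vcount_buffer_stream in this file: svga_create_stream_output() records which stream feeds each streamout buffer, and svga_draw_vbo() later unpacks it to choose which statistics query to read. Standalone illustration only, not driver code.

#include <stdio.h>

/* Each streamout buffer's stream id occupies a 4-bit field: buffer i uses
 * bits [i*4, i*4+3] of the packed word.
 */
static unsigned pack_stream(unsigned packed, unsigned buf_idx, unsigned stream)
{
   return packed | (stream << (buf_idx * 4));
}

static unsigned unpack_stream(unsigned packed, unsigned buf_idx)
{
   return (packed >> (buf_idx * 4)) & 0xf;
}

int main(void)
{
   unsigned packed = 0;
   packed = pack_stream(packed, 0, 0);   /* buffer 0 fed by stream 0 */
   packed = pack_stream(packed, 1, 2);   /* buffer 1 fed by stream 2 */

   printf("buffer 0 -> stream %u\n", unpack_stream(packed, 0));
   printf("buffer 1 -> stream %u\n", unpack_stream(packed, 1));
   return 0;
}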
diff --git a/src/gallium/drivers/svga/svga_pipe_ts.c b/src/gallium/drivers/svga/svga_pipe_ts.c
new file mode 100644
index 00000000000..12a3bf486b7
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_pipe_ts.c
@@ -0,0 +1,219 @@
+/**********************************************************
+ * Copyright 2018-2020 VMware, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "svga_context.h"
+#include "svga_shader.h"
+
+static void
+svga_set_tess_state(struct pipe_context *pipe,
+ const float default_outer_level[4],
+ const float default_inner_level[2])
+{
+ struct svga_context *svga = svga_context(pipe);
+ unsigned i;
+
+ for (i = 0; i < 4; i++) {
+ svga->curr.default_tesslevels[i] = default_outer_level[i];
+ }
+ for (i = 0; i < 2; i++) {
+ svga->curr.default_tesslevels[i + 4] = default_inner_level[i];
+ }
+}
+
+
+static void *
+svga_create_tcs_state(struct pipe_context *pipe,
+ const struct pipe_shader_state *templ)
+{
+ struct svga_context *svga = svga_context(pipe);
+ struct svga_tcs_shader *tcs;
+
+ tcs = CALLOC_STRUCT(svga_tcs_shader);
+ if (!tcs)
+ return NULL;
+
+ SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_CREATETCS);
+
+ tcs->base.tokens = tgsi_dup_tokens(templ->tokens);
+
+ /* Collect basic info that we'll need later:
+ */
+ tgsi_scan_shader(tcs->base.tokens, &tcs->base.info);
+
+ tcs->base.id = svga->debug.shader_id++;
+
+ tcs->generic_outputs = svga_get_generic_outputs_mask(&tcs->base.info);
+
+ SVGA_STATS_TIME_POP(svga_sws(svga));
+ return tcs;
+}
+
+
+static void
+svga_bind_tcs_state(struct pipe_context *pipe, void *shader)
+{
+ struct svga_tcs_shader *tcs = (struct svga_tcs_shader *) shader;
+ struct svga_context *svga = svga_context(pipe);
+
+ if (tcs == svga->curr.tcs)
+ return;
+
+ svga->curr.tcs = tcs;
+ svga->dirty |= SVGA_NEW_TCS;
+}
+
+
+static void
+svga_delete_tcs_state(struct pipe_context *pipe, void *shader)
+{
+ struct svga_context *svga = svga_context(pipe);
+ struct svga_tcs_shader *tcs = (struct svga_tcs_shader *) shader;
+ struct svga_tcs_shader *next_tcs;
+ struct svga_shader_variant *variant, *tmp;
+
+ svga_hwtnl_flush_retry(svga);
+
+ assert(tcs->base.parent == NULL);
+
+ while (tcs) {
+ next_tcs = (struct svga_tcs_shader *)tcs->base.next;
+ for (variant = tcs->base.variants; variant; variant = tmp) {
+ tmp = variant->next;
+
+ /* Check if deleting currently bound shader */
+ if (variant == svga->state.hw_draw.tcs) {
+ SVGA_RETRY(svga, svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, NULL));
+ svga->state.hw_draw.tcs = NULL;
+ }
+
+ svga_destroy_shader_variant(svga, variant);
+ }
+
+ FREE((void *)tcs->base.tokens);
+ FREE(tcs);
+ tcs = next_tcs;
+ }
+}
+
+
+void
+svga_cleanup_tcs_state(struct svga_context *svga)
+{
+ if (svga->tcs.passthrough_tcs) {
+ svga_delete_tcs_state(&svga->pipe, svga->tcs.passthrough_tcs);
+ }
+}
+
+
+static void *
+svga_create_tes_state(struct pipe_context *pipe,
+ const struct pipe_shader_state *templ)
+{
+ struct svga_context *svga = svga_context(pipe);
+ struct svga_tes_shader *tes;
+
+ tes = CALLOC_STRUCT(svga_tes_shader);
+ if (!tes)
+ return NULL;
+
+ SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_CREATETES);
+
+ tes->base.tokens = tgsi_dup_tokens(templ->tokens);
+
+ /* Collect basic info that we'll need later:
+ */
+ tgsi_scan_shader(tes->base.tokens, &tes->base.info);
+
+ tes->base.id = svga->debug.shader_id++;
+
+ tes->generic_inputs = svga_get_generic_inputs_mask(&tes->base.info);
+
+ SVGA_STATS_TIME_POP(svga_sws(svga));
+ return tes;
+}
+
+
+static void
+svga_bind_tes_state(struct pipe_context *pipe, void *shader)
+{
+ struct svga_tes_shader *tes = (struct svga_tes_shader *) shader;
+ struct svga_context *svga = svga_context(pipe);
+
+ if (tes == svga->curr.tes)
+ return;
+
+ svga->curr.tes = tes;
+ svga->dirty |= SVGA_NEW_TES;
+}
+
+
+static void
+svga_delete_tes_state(struct pipe_context *pipe, void *shader)
+{
+ struct svga_context *svga = svga_context(pipe);
+ struct svga_tes_shader *tes = (struct svga_tes_shader *) shader;
+ struct svga_tes_shader *next_tes;
+ struct svga_shader_variant *variant, *tmp;
+
+ svga_hwtnl_flush_retry(svga);
+
+ assert(tes->base.parent == NULL);
+
+ while (tes) {
+ next_tes = (struct svga_tes_shader *)tes->base.next;
+ for (variant = tes->base.variants; variant; variant = tmp) {
+ tmp = variant->next;
+
+ /* Check if deleting currently bound shader */
+ if (variant == svga->state.hw_draw.tes) {
+ SVGA_RETRY(svga, svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, NULL));
+ svga->state.hw_draw.tes = NULL;
+ }
+
+ svga_destroy_shader_variant(svga, variant);
+ }
+
+ FREE((void *)tes->base.tokens);
+ FREE(tes);
+ tes = next_tes;
+ }
+}
+
+
+void
+svga_init_ts_functions(struct svga_context *svga)
+{
+ svga->pipe.set_tess_state = svga_set_tess_state;
+ svga->pipe.create_tcs_state = svga_create_tcs_state;
+ svga->pipe.bind_tcs_state = svga_bind_tcs_state;
+ svga->pipe.delete_tcs_state = svga_delete_tcs_state;
+ svga->pipe.create_tes_state = svga_create_tes_state;
+ svga->pipe.bind_tes_state = svga_bind_tes_state;
+ svga->pipe.delete_tes_state = svga_delete_tes_state;
+}
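The svga_delete_tcs_state() and svga_delete_tes_state() functions above walk the chain of shaders cloned from the same tokens (base.next) and free every compiled variant, first unbinding a variant from the hardware if it happens to be the one currently set. A minimal, self-contained sketch of that cleanup pattern follows; the types (shader_node, variant_node) are illustrative stand-ins, not the driver's structs.

#include <stdlib.h>

struct variant_node {
   struct variant_node *next;
};

struct shader_node {
   struct shader_node *next;          /* shaders cloned from the same tokens */
   struct variant_node *variants;     /* compiled variants of this shader */
};

/* Free a whole shader chain; clear *bound if a freed variant is the one
 * currently bound, mirroring the unbind-before-destroy step above. */
static void
delete_shader_chain(struct shader_node *s, struct variant_node **bound)
{
   while (s) {
      struct shader_node *next_s = s->next;
      struct variant_node *v = s->variants;
      while (v) {
         struct variant_node *next_v = v->next;
         if (*bound == v)
            *bound = NULL;
         free(v);
         v = next_v;
      }
      free(s);
      s = next_s;
   }
}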
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
index a475e000f2e..aa7396c2c6b 100644
--- a/src/gallium/drivers/svga/svga_pipe_vs.c
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -166,6 +166,7 @@ svga_delete_vs_state(struct pipe_context *pipe, void *shader)
{
struct svga_context *svga = svga_context(pipe);
struct svga_vertex_shader *vs = (struct svga_vertex_shader *)shader;
+ struct svga_vertex_shader *next_vs;
struct svga_shader_variant *variant, *tmp;
enum pipe_error ret;
@@ -173,37 +174,42 @@ svga_delete_vs_state(struct pipe_context *pipe, void *shader)
assert(vs->base.parent == NULL);
- /* Check if there is a generated geometry shader to go with this
- * vertex shader. If there is, then delete the geometry shader as well.
- */
- if (vs->gs != NULL) {
- svga->pipe.delete_gs_state(&svga->pipe, vs->gs);
- }
+ while (vs) {
+ next_vs = (struct svga_vertex_shader *)vs->base.next;
- if (vs->base.stream_output != NULL)
- svga_delete_stream_output(svga, vs->base.stream_output);
+ /* Check if there is a generated geometry shader to go with this
+ * vertex shader. If there is, then delete the geometry shader as well.
+ */
+ if (vs->gs != NULL) {
+ svga->pipe.delete_gs_state(&svga->pipe, vs->gs);
+ }
- draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
+ if (vs->base.stream_output != NULL)
+ svga_delete_stream_output(svga, vs->base.stream_output);
- for (variant = vs->base.variants; variant; variant = tmp) {
- tmp = variant->next;
+ draw_delete_vertex_shader(svga->swtnl.draw, vs->draw_shader);
- /* Check if deleting currently bound shader */
- if (variant == svga->state.hw_draw.vs) {
- ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
+ for (variant = vs->base.variants; variant; variant = tmp) {
+ tmp = variant->next;
+
+ /* Check if deleting currently bound shader */
+ if (variant == svga->state.hw_draw.vs) {
ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
- assert(ret == PIPE_OK);
+ if (ret != PIPE_OK) {
+ svga_context_flush(svga, NULL);
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_VS, NULL);
+ assert(ret == PIPE_OK);
+ }
+ svga->state.hw_draw.vs = NULL;
}
- svga->state.hw_draw.vs = NULL;
+
+ svga_destroy_shader_variant(svga, variant);
}
- svga_destroy_shader_variant(svga, variant);
+ FREE((void *)vs->base.tokens);
+ FREE(vs);
+ vs = next_vs;
}
-
- FREE((void *)vs->base.tokens);
- FREE(vs);
}
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index 6629a8cc14d..4f19b8ca035 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -53,7 +53,8 @@ svga_buffer_needs_hw_storage(const struct svga_screen *ss,
const struct pipe_resource *template)
{
unsigned bind_mask = (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER |
- PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT);
+ PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
+ PIPE_BIND_SHADER_BUFFER | PIPE_BIND_COMMAND_ARGS_BUFFER);
if (ss->sws->have_vgpu10) {
/*
@@ -478,6 +479,9 @@ svga_buffer_create(struct pipe_screen *screen,
*/
bind_flags |= (PIPE_BIND_VERTEX_BUFFER |
PIPE_BIND_INDEX_BUFFER);
+
+ /* It may be used as a shader resource as well. */
+ bind_flags |= PIPE_BIND_SAMPLER_VIEW;
}
if (svga_buffer_create_host_surface(ss, sbuf, bind_flags) != PIPE_OK)
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 2e9ca060059..5d2b934e7c1 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -175,6 +175,11 @@ svga_buffer_create_host_surface(struct svga_screen *ss,
if (bind_flags & PIPE_BIND_SAMPLER_VIEW)
sbuf->key.flags |= SVGA3D_SURFACE_BIND_SHADER_RESOURCE;
+ if (bind_flags & PIPE_BIND_COMMAND_ARGS_BUFFER) {
+ assert(ss->sws->have_sm5);
+ sbuf->key.flags |= SVGA3D_SURFACE_DRAWINDIRECT_ARGS;
+ }
+
if (!bind_flags && sbuf->b.b.usage == PIPE_USAGE_STAGING) {
/* This surface is to be used with the
* SVGA3D_CMD_DX_TRANSFER_FROM_BUFFER command, and no other
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 2aa4e52faa7..1bae8c39595 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -133,26 +133,25 @@ svga_transfer_dma(struct svga_context *svga,
}
}
else {
- int y, h, y_max;
+ int y, h, srcy;
unsigned blockheight =
util_format_get_blockheight(st->base.resource->format);
h = st->hw_nblocksy * blockheight;
- y_max = st->box.y + st->box.h;
+ srcy = 0;
- for (y = st->box.y; y < y_max; y += h) {
+ for (y = 0; y < st->box.h; y += h) {
unsigned offset, length;
void *hw, *sw;
- if (y + h > y_max)
- h = y_max - y;
+ if (y + h > st->box.h)
+ h = st->box.h - y;
/* Transfer band must be aligned to pixel block boundaries */
assert(y % blockheight == 0);
assert(h % blockheight == 0);
- /* First band starts at the top of the SW buffer. */
- offset = (y - st->box.y) * st->base.stride / blockheight;
+ offset = y * st->base.stride / blockheight;
length = h * st->base.stride / blockheight;
sw = (uint8_t *) st->swbuf + offset;
@@ -160,9 +159,9 @@ svga_transfer_dma(struct svga_context *svga,
if (transfer == SVGA3D_WRITE_HOST_VRAM) {
unsigned usage = PIPE_TRANSFER_WRITE;
- /* Don't write to an in-flight DMA buffer. Synchronize or
- * discard in-flight storage. */
- if (y != st->box.y) {
+ /* Wait for the previous DMAs to complete */
+ /* TODO: keep one DMA (at half the size) in the background */
+ if (y) {
svga_context_flush(svga, NULL);
usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
}
@@ -178,7 +177,7 @@ svga_transfer_dma(struct svga_context *svga,
svga_transfer_dma_band(svga, st, transfer,
st->box.x, y, st->box.z,
st->box.w, h, st->box.d,
- 0, 0, 0, flags);
+ 0, srcy, 0, flags);
/*
* Prevent the texture contents to be discarded on the next band
@@ -488,6 +487,18 @@ svga_texture_transfer_map_direct(struct svga_context *svga,
svga_context_flush(svga, NULL);
}
+ if (map && rebind) {
+ enum pipe_error ret;
+
+ ret = SVGA3D_BindGBSurface(swc, surf);
+ if (ret != PIPE_OK) {
+ svga_context_flush(svga, NULL);
+ ret = SVGA3D_BindGBSurface(swc, surf);
+ assert(ret == PIPE_OK);
+ }
+ svga_context_flush(svga, NULL);
+ }
+
/*
* Make sure we return NULL if the map fails
*/
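The rebind hunk above uses the driver's usual flush-and-retry idiom: if a command cannot be queued because the command buffer is full, flush and try exactly once more (this is what the SVGA_RETRY macro wraps elsewhere in the series). A hypothetical, self-contained sketch of that idiom, with stand-in names rather than the driver's API:

enum cmd_status { CMD_OK, CMD_NO_SPACE };

static unsigned buffer_space = 0;                /* pretend command buffer */

static enum cmd_status try_emit(void) { return buffer_space ? CMD_OK : CMD_NO_SPACE; }
static void flush(void) { buffer_space = 64; }   /* flushing frees space */

/* Emit a command; on a full buffer, flush and try exactly once more. */
static enum cmd_status
emit_with_retry(void)
{
   enum cmd_status ret = try_emit();
   if (ret != CMD_OK) {
      flush();
      ret = try_emit();
   }
   return ret;
}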
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 2975bfefdfa..f7e3a900290 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -49,6 +49,10 @@
/* NOTE: this constant may get moved into a svga3d*.h header file */
#define SVGA3D_DX_MAX_RESOURCE_SIZE (128 * 1024 * 1024)
+#ifndef MESA_GIT_SHA1
+#define MESA_GIT_SHA1 "(unknown git revision)"
+#endif
+
#ifdef DEBUG
int SVGA_DEBUG = 0;
@@ -249,7 +253,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
12 /* 2048x2048 */);
case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
- return sws->have_vgpu10 ? SVGA3D_MAX_SURFACE_ARRAYSIZE : 0;
+ return sws->have_sm5 ? SVGA3D_SM5_MAX_SURFACE_ARRAYSIZE :
+ (sws->have_vgpu10 ? SVGA3D_SM4_MAX_SURFACE_ARRAYSIZE : 0);
case PIPE_CAP_BLEND_EQUATION_SEPARATE: /* req. for GL 1.5 */
return 1;
@@ -266,7 +271,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
return 1; /* The color outputs of vertex shaders are not clamped */
case PIPE_CAP_VERTEX_COLOR_CLAMPED:
- return 0; /* The driver can't clamp vertex colors */
+ return sws->have_vgpu10;
case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
return 0; /* The driver can't clamp fragment colors */
@@ -274,10 +279,16 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 1; /* expected for GL_ARB_framebuffer_object */
case PIPE_CAP_GLSL_FEATURE_LEVEL:
- return sws->have_vgpu10 ? 330 : 120;
+ if (sws->have_sm5) {
+ return 410;
+ } else if (sws->have_vgpu10) {
+ return 330;
+ } else {
+ return 120;
+ }
case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
- return sws->have_vgpu10 ? 330 : 120;
+ return sws->have_sm5 ? 410 : (sws->have_vgpu10 ? 330 : 120);
case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE:
@@ -303,10 +314,12 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
return sws->have_vgpu10 ? 4 : 0;
case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
- return sws->have_vgpu10 ? SVGA3D_MAX_STREAMOUT_DECLS : 0;
+ return sws->have_sm5 ? SVGA3D_MAX_STREAMOUT_DECLS :
+ (sws->have_vgpu10 ? SVGA3D_MAX_DX10_STREAMOUT_DECLS : 0);
case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+ return sws->have_sm5;
case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
- return 0;
+ return sws->have_sm5;
case PIPE_CAP_TEXTURE_MULTISAMPLE:
return svgascreen->ms_samples ? 1 : 0;
@@ -350,7 +363,16 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return sws->have_sm4_1;
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
- return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */
+ /* SM4_1 supports only single-channel textures whereas SM5 supports
+ * all four-channel textures */
+ return sws->have_sm5 ? 4 :
+ (sws->have_sm4_1 ? 1 : 0);
+ case PIPE_CAP_DRAW_INDIRECT:
+ return sws->have_sm5;
+ case PIPE_CAP_MAX_VERTEX_STREAMS:
+ return sws->have_sm5 ? 4 : 0;
+ case PIPE_CAP_COMPUTE:
+ return 0;
case PIPE_CAP_MAX_VARYINGS:
return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10;
case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
@@ -362,9 +384,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_SHADER_STENCIL_EXPORT:
case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
case PIPE_CAP_TEXTURE_BARRIER:
- case PIPE_CAP_MAX_VERTEX_STREAMS:
case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
- case PIPE_CAP_COMPUTE:
case PIPE_CAP_START_INSTANCE:
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
@@ -372,7 +392,6 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_TEXTURE_GATHER_SM5:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
- case PIPE_CAP_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
@@ -410,7 +429,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
return 2048;
case PIPE_CAP_MAX_VIEWPORTS:
- return 1;
+ assert((!sws->have_vgpu10 && svgascreen->max_viewports == 1) ||
+ (sws->have_vgpu10 &&
+ svgascreen->max_viewports == SVGA3D_DX_MAX_VIEWPORTS));
+ return svgascreen->max_viewports;
case PIPE_CAP_ENDIANNESS:
return PIPE_ENDIAN_LITTLE;
@@ -427,10 +449,11 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return sws->have_vgpu10;
case PIPE_CAP_CLEAR_TEXTURE:
return sws->have_vgpu10;
+ case PIPE_CAP_DOUBLES:
+ return sws->have_sm5;
case PIPE_CAP_UMA:
case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
- case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
case PIPE_CAP_DEPTH_BOUNDS_TEST:
@@ -453,7 +476,6 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
case PIPE_CAP_FBFETCH:
case PIPE_CAP_TGSI_MUL_ZERO_WINS:
- case PIPE_CAP_DOUBLES:
case PIPE_CAP_INT64:
case PIPE_CAP_INT64_DIVMOD:
case PIPE_CAP_TGSI_TEX_TXF_LZ:
@@ -487,6 +509,9 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
return 32;
case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
return 1 << 27;
+ /* Verify this once the protocol is finalized. Set to the minimum value for now. */
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ return sws->have_sm5 ? 30 : 0;
default:
return u_pipe_screen_get_param_defaults(screen, param);
}
@@ -674,12 +699,12 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
assert(sws->have_vgpu10);
(void) sws; /* silence unused var warnings in non-debug builds */
- /* Only VS, GS, FS supported */
- if (shader != PIPE_SHADER_VERTEX &&
- shader != PIPE_SHADER_GEOMETRY &&
- shader != PIPE_SHADER_FRAGMENT) {
+ if ((!sws->have_sm5) &&
+ (shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL))
+ return 0;
+
+ if (shader == PIPE_SHADER_COMPUTE)
return 0;
- }
/* NOTE: we do not query the device for any caps/limits at this time */
@@ -697,6 +722,10 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
return VGPU10_MAX_FS_INPUTS;
else if (shader == PIPE_SHADER_GEOMETRY)
return VGPU10_MAX_GS_INPUTS;
+ else if (shader == PIPE_SHADER_TESS_CTRL)
+ return VGPU11_MAX_HS_INPUTS;
+ else if (shader == PIPE_SHADER_TESS_EVAL)
+ return VGPU11_MAX_DS_INPUT_CONTROL_POINTS;
else
return VGPU10_MAX_VS_INPUTS;
case PIPE_SHADER_CAP_MAX_OUTPUTS:
@@ -704,6 +733,10 @@ vgpu10_get_shader_param(struct pipe_screen *screen,
return VGPU10_MAX_FS_OUTPUTS;
else if (shader == PIPE_SHADER_GEOMETRY)
return VGPU10_MAX_GS_OUTPUTS;
+ else if (shader == PIPE_SHADER_TESS_CTRL)
+ return VGPU11_MAX_HS_OUTPUTS;
+ else if (shader == PIPE_SHADER_TESS_EVAL)
+ return VGPU11_MAX_DS_OUTPUTS;
else
return VGPU10_MAX_VS_OUTPUTS;
case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
@@ -844,6 +877,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-bytes-uploaded", SVGA_QUERY_NUM_BYTES_UPLOADED,
PIPE_DRIVER_QUERY_TYPE_BYTES),
+ QUERY("num-command-buffers", SVGA_QUERY_NUM_COMMAND_BUFFERS,
+ PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("command-buffer-size", SVGA_QUERY_COMMAND_BUFFER_SIZE,
PIPE_DRIVER_QUERY_TYPE_BYTES),
QUERY("flush-time", SVGA_QUERY_FLUSH_TIME,
@@ -860,6 +895,10 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-const-updates", SVGA_QUERY_NUM_CONST_UPDATES,
PIPE_DRIVER_QUERY_TYPE_UINT64),
+ QUERY("num-shader-relocations", SVGA_QUERY_NUM_SHADER_RELOCATIONS,
+ PIPE_DRIVER_QUERY_TYPE_UINT64),
+ QUERY("num-surface-relocations", SVGA_QUERY_NUM_SURFACE_RELOCATIONS,
+ PIPE_DRIVER_QUERY_TYPE_UINT64),
/* running total counters */
QUERY("memory-used", SVGA_QUERY_MEMORY_USED,
@@ -878,6 +917,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
PIPE_DRIVER_QUERY_TYPE_UINT64),
QUERY("num-commands-per-draw", SVGA_QUERY_NUM_COMMANDS_PER_DRAW,
PIPE_DRIVER_QUERY_TYPE_FLOAT),
+ QUERY("shader-mem-used", SVGA_QUERY_SHADER_MEM_USED,
+ PIPE_DRIVER_QUERY_TYPE_UINT64),
};
#undef QUERY
@@ -1012,9 +1053,10 @@ svga_screen_create(struct svga_winsys_screen *sws)
goto error2;
}
- debug_printf("%s enabled = %u\n",
- sws->have_sm4_1 ? "SM4_1" : "VGPU10",
- sws->have_sm4_1 ? 1 : sws->have_vgpu10);
+ debug_printf("%s enabled\n",
+ sws->have_sm5 ? "SM5" :
+ sws->have_sm4_1 ? "SM4_1" :
+ sws->have_vgpu10 ? "VGPU10" : "VGPU9");
debug_printf("Mesa: %s %s (%s)\n", svga_get_name(screen),
PACKAGE_VERSION, MESA_GIT_SHA1);
@@ -1081,13 +1123,23 @@ svga_screen_create(struct svga_winsys_screen *sws)
svgascreen->ms_samples |= 1 << 3;
}
+ if (sws->have_sm5 && debug_get_bool_option("SVGA_MSAA", TRUE)) {
+ if (get_bool_cap(sws, SVGA3D_DEVCAP_MULTISAMPLE_8X, FALSE))
+ svgascreen->ms_samples |= 1 << 7;
+ }
+
/* Maximum number of constant buffers */
svgascreen->max_const_buffers =
get_uint_cap(sws, SVGA3D_DEVCAP_DX_MAX_CONSTANT_BUFFERS, 1);
svgascreen->max_const_buffers = MIN2(svgascreen->max_const_buffers,
SVGA_MAX_CONST_BUFS);
+ svgascreen->haveBlendLogicops =
+ get_bool_cap(sws, SVGA3D_DEVCAP_LOGIC_BLENDOPS, FALSE);
+
screen->is_format_supported = svga_is_dx_format_supported;
+
+ svgascreen->max_viewports = SVGA3D_DX_MAX_VIEWPORTS;
}
else {
/* VGPU9 */
@@ -1122,6 +1174,9 @@ svga_screen_create(struct svga_winsys_screen *sws)
/* No multisampling */
svgascreen->ms_samples = 0;
+
+ /* Only one viewport */
+ svgascreen->max_viewports = 1;
}
/* common VGPU9 / VGPU10 caps */
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index 12b93468da2..aa0001b11e5 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -50,10 +50,13 @@ struct svga_screen
/** Device caps */
boolean haveProvokingVertex;
boolean haveLineStipple, haveLineSmooth;
+ boolean haveBlendLogicops;
float maxLineWidth, maxLineWidthAA;
float maxPointSize;
+ float pointSmoothThreshold; /** Disable point AA for sizes less than this */
unsigned max_color_buffers;
unsigned max_const_buffers;
+ unsigned max_viewports;
unsigned ms_samples;
struct {
diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c
index b5bcd51a7fc..a0e5f5ff2b9 100644
--- a/src/gallium/drivers/svga/svga_screen_cache.c
+++ b/src/gallium/drivers/svga/svga_screen_cache.c
@@ -311,6 +311,9 @@ svga_screen_cache_add(struct svga_screen *svgascreen,
}
+/* Maximum number of invalidate surface commands in a command buffer */
+# define SVGA_MAX_SURFACE_TO_INVALIDATE 1000
+
/**
* Called during the screen flush to move all buffers not in a validate list
* into the unused list.
@@ -354,6 +357,7 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
next = curr->next;
}
+ unsigned nsurf = 0;
curr = cache->validated.next;
next = curr->next;
while (curr != &cache->validated) {
@@ -381,12 +385,14 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
* this function itself is called inside svga_context_flush().
*/
svga->swc->flush(svga->swc, NULL);
+ nsurf = 0;
ret = SVGA3D_InvalidateGBSurface(svga->swc, entry->handle);
assert(ret == PIPE_OK);
}
/* add the entry to the invalidated list */
list_add(&entry->head, &cache->invalidated);
+ nsurf++;
}
curr = next;
@@ -394,6 +400,16 @@ svga_screen_cache_flush(struct svga_screen *svgascreen,
}
mtx_unlock(&cache->mutex);
+
+ /**
+ * In some rare cases (when running ARK survival), we hit the max number
+ * of surface relocations with invalidated surfaces during context flush.
+ * So if the number of invalidated surfaces exceeds a certain limit (1000),
+ * we'll do another winsys flush.
+ */
+ if (nsurf > SVGA_MAX_SURFACE_TO_INVALIDATE) {
+ svga->swc->flush(svga->swc, NULL);
+ }
}
diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index 3a7516945c6..52f1153fd61 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -229,22 +229,25 @@ static const enum pipe_swizzle set_XXXY[PIPE_SWIZZLE_MAX] = {
*/
void
svga_init_shader_key_common(const struct svga_context *svga,
- enum pipe_shader_type shader,
+ enum pipe_shader_type shader_type,
+ const struct svga_shader *shader,
struct svga_compile_key *key)
{
unsigned i, idx = 0;
- assert(shader < ARRAY_SIZE(svga->curr.num_sampler_views));
+ assert(shader_type < ARRAY_SIZE(svga->curr.num_sampler_views));
/* In case the number of samplers and sampler_views doesn't match,
* loop over the lower of the two counts.
*/
- key->num_textures = MAX2(svga->curr.num_sampler_views[shader],
- svga->curr.num_samplers[shader]);
+ key->num_textures = MAX2(svga->curr.num_sampler_views[shader_type],
+ svga->curr.num_samplers[shader_type]);
for (i = 0; i < key->num_textures; i++) {
- struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
- const struct svga_sampler_state *sampler = svga->curr.sampler[shader][i];
+ struct pipe_sampler_view *view = svga->curr.sampler_views[shader_type][i];
+ const struct svga_sampler_state
+ *sampler = svga->curr.sampler[shader_type][i];
+
if (view) {
assert(view->texture);
assert(view->texture->target < (1 << 4)); /* texture_target:4 */
@@ -304,6 +307,12 @@ svga_init_shader_key_common(const struct svga_context *svga,
if (view->texture->format == PIPE_FORMAT_DXT1_RGB ||
view->texture->format == PIPE_FORMAT_DXT1_SRGB)
swizzle_tab = set_alpha;
+
+ /* Save the compare function as we need to handle
+ * depth compare in the shader.
+ */
+ key->tex[i].compare_mode = sampler->compare_mode;
+ key->tex[i].compare_func = sampler->compare_func;
}
key->tex[i].swizzle_r = swizzle_tab[view->swizzle_r];
@@ -314,8 +323,10 @@ svga_init_shader_key_common(const struct svga_context *svga,
if (sampler) {
if (!sampler->normalized_coords) {
- assert(idx < (1 << 5)); /* width_height_idx:5 bitfield */
- key->tex[i].width_height_idx = idx++;
+ if (view) {
+ assert(idx < (1 << 5)); /* width_height_idx:5 bitfield */
+ key->tex[i].width_height_idx = idx++;
+ }
key->tex[i].unnormalized = TRUE;
++key->num_unnormalized_coords;
@@ -326,6 +337,9 @@ svga_init_shader_key_common(const struct svga_context *svga,
}
}
}
+
+ key->clamp_vertex_color = svga->curr.rast ?
+ svga->curr.rast->templ.clamp_vertex_color : 0;
}
@@ -380,6 +394,8 @@ define_gb_shader_vgpu9(struct svga_context *svga,
variant->gb_shader = sws->shader_create(sws, variant->type,
variant->tokens, codeLen);
+ svga->hud.shader_mem_used += codeLen;
+
if (!variant->gb_shader)
return PIPE_ERROR_OUT_OF_MEMORY;
@@ -398,6 +414,7 @@ define_gb_shader_vgpu10(struct svga_context *svga,
{
struct svga_winsys_context *swc = svga->swc;
enum pipe_error ret;
+ unsigned len = codeLen + variant->signatureLen;
/**
* Shaders in VGPU10 enabled device reside in the device COTable.
@@ -412,7 +429,11 @@ define_gb_shader_vgpu10(struct svga_context *svga,
/* Create gb memory for the shader and upload the shader code */
variant->gb_shader = swc->shader_create(swc,
variant->id, variant->type,
- variant->tokens, codeLen);
+ variant->tokens, codeLen,
+ variant->signature,
+ variant->signatureLen);
+
+ svga->hud.shader_mem_used += len;
if (!variant->gb_shader) {
/* Free the shader ID */
@@ -429,7 +450,8 @@ define_gb_shader_vgpu10(struct svga_context *svga,
* the shader creation and return an error.
*/
ret = SVGA3D_vgpu10_DefineAndBindShader(swc, variant->gb_shader,
- variant->id, variant->type, codeLen);
+ variant->id, variant->type,
+ len);
if (ret != PIPE_OK)
goto fail;
@@ -511,7 +533,10 @@ svga_set_shader(struct svga_context *svga,
assert(type == SVGA3D_SHADERTYPE_VS ||
type == SVGA3D_SHADERTYPE_GS ||
- type == SVGA3D_SHADERTYPE_PS);
+ type == SVGA3D_SHADERTYPE_PS ||
+ type == SVGA3D_SHADERTYPE_HS ||
+ type == SVGA3D_SHADERTYPE_DS ||
+ type == SVGA3D_SHADERTYPE_CS);
if (svga_have_gb_objects(svga)) {
struct svga_winsys_gb_shader *gbshader =
@@ -533,7 +558,27 @@ svga_set_shader(struct svga_context *svga,
struct svga_shader_variant *
svga_new_shader_variant(struct svga_context *svga, enum pipe_shader_type type)
{
- struct svga_shader_variant *variant = CALLOC_STRUCT(svga_shader_variant);
+ struct svga_shader_variant *variant;
+
+ switch (type) {
+ case PIPE_SHADER_FRAGMENT:
+ variant = CALLOC(1, sizeof(struct svga_fs_variant));
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ variant = CALLOC(1, sizeof(struct svga_gs_variant));
+ break;
+ case PIPE_SHADER_VERTEX:
+ variant = CALLOC(1, sizeof(struct svga_vs_variant));
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ variant = CALLOC(1, sizeof(struct svga_tes_variant));
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ variant = CALLOC(1, sizeof(struct svga_tcs_variant));
+ break;
+ default:
+ return NULL;
+ }
if (variant) {
variant->type = svga_shader_type(type);
@@ -547,19 +592,11 @@ void
svga_destroy_shader_variant(struct svga_context *svga,
struct svga_shader_variant *variant)
{
- enum pipe_error ret = PIPE_OK;
-
if (svga_have_gb_objects(svga) && variant->gb_shader) {
if (svga_have_vgpu10(svga)) {
struct svga_winsys_context *swc = svga->swc;
swc->shader_destroy(swc, variant->gb_shader);
- ret = SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id);
- if (ret != PIPE_OK) {
- /* flush and try again */
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyShader(svga->swc, variant->id));
util_bitmask_clear(svga->shader_id_bm, variant->id);
}
else {
@@ -570,17 +607,13 @@ svga_destroy_shader_variant(struct svga_context *svga,
}
else {
if (variant->id != UTIL_BITMASK_INVALID_INDEX) {
- ret = SVGA3D_DestroyShader(svga->swc, variant->id, variant->type);
- if (ret != PIPE_OK) {
- /* flush and try again */
- svga_context_flush(svga, NULL);
- ret = SVGA3D_DestroyShader(svga->swc, variant->id, variant->type);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_DestroyShader(svga->swc, variant->id,
+ variant->type));
util_bitmask_clear(svga->shader_id_bm, variant->id);
}
}
+ FREE(variant->signature);
FREE((unsigned *)variant->tokens);
FREE(variant);
@@ -612,6 +645,8 @@ svga_rebind_shaders(struct svga_context *svga)
svga->rebind.flags.vs = 0;
svga->rebind.flags.gs = 0;
svga->rebind.flags.fs = 0;
+ svga->rebind.flags.tcs = 0;
+ svga->rebind.flags.tes = 0;
return PIPE_OK;
}
@@ -637,5 +672,19 @@ svga_rebind_shaders(struct svga_context *svga)
}
svga->rebind.flags.fs = 0;
+ if (svga->rebind.flags.tcs && hw->tcs && hw->tcs->gb_shader) {
+ ret = swc->resource_rebind(swc, NULL, hw->tcs->gb_shader, SVGA_RELOC_READ);
+ if (ret != PIPE_OK)
+ return ret;
+ }
+ svga->rebind.flags.tcs = 0;
+
+ if (svga->rebind.flags.tes && hw->tes && hw->tes->gb_shader) {
+ ret = swc->resource_rebind(swc, NULL, hw->tes->gb_shader, SVGA_RELOC_READ);
+ if (ret != PIPE_OK)
+ return ret;
+ }
+ svga->rebind.flags.tes = 0;
+
return PIPE_OK;
}
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index 67f6b5aeb63..31ccf97d39a 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -68,6 +68,8 @@ struct svga_compile_key
unsigned need_prescale:1;
unsigned writes_psize:1;
unsigned wide_point:1;
+ unsigned writes_viewport_index:1;
+ unsigned num_prescale:5;
} gs;
/* fragment shader only */
@@ -83,15 +85,42 @@ struct svga_compile_key
unsigned alpha_func:4; /**< SVGA3D_CMP_x */
unsigned write_color0_to_n_cbufs:4;
unsigned aa_point:1;
+ unsigned layer_to_zero:1;
int aa_point_coord_index;
float alpha_ref;
} fs;
+ /* tessellation control shader */
+ struct {
+ unsigned vertices_per_patch:8;
+ enum pipe_prim_type prim_mode:8;
+ enum pipe_tess_spacing spacing:3;
+ unsigned vertices_order_cw:1;
+ unsigned point_mode:1;
+ unsigned passthrough:1;
+ } tcs;
+
+ /* tessellation evaluation shader */
+ struct {
+ unsigned vertices_per_patch:8;
+ unsigned tessfactor_index:8;
+ unsigned need_prescale:1;
+ unsigned need_tessouter:1;
+ unsigned need_tessinner:1;
+ } tes;
+
+ /* compute shader */
+ struct {
+ unsigned grid_size[3];
+ } cs;
+
/* any shader type */
int8_t generic_remap_table[MAX_GENERIC_VARYING];
unsigned num_textures:8;
unsigned num_unnormalized_coords:8;
unsigned clip_plane_enable:PIPE_MAX_CLIP_PLANES;
+ unsigned last_vertex_stage:1;
+ unsigned clamp_vertex_color:1;
unsigned sprite_origin_lower_left:1;
uint16_t sprite_coord_enable;
struct {
@@ -121,6 +150,10 @@ struct svga_token_key {
unsigned writes_psize:1;
unsigned aa_point:1;
} gs;
+ struct {
+ unsigned write_position:1;
+ } vs;
+ unsigned dynamic_indexing:1;
};
/**
@@ -143,6 +176,10 @@ struct svga_shader_variant
const unsigned *tokens;
unsigned nr_tokens;
+ /* shader signature */
+ unsigned signatureLen;
+ SVGA3dDXShaderSignatureHeader *signature;
+
/** Per-context shader identifier used with SVGA_3D_CMD_SHADER_DEFINE,
* SVGA_3D_CMD_SET_SHADER and SVGA_3D_CMD_SHADER_DESTROY.
*/
@@ -154,6 +191,18 @@ struct svga_shader_variant
/* GB object buffer containing the bytecode */
struct svga_winsys_gb_shader *gb_shader;
+ /** Next variant */
+ struct svga_shader_variant *next;
+};
+
+
+/**
+ * Shader variant for fragment shader
+ */
+struct svga_fs_variant
+{
+ struct svga_shader_variant base;
+
boolean uses_flat_interp; /** TRUE if flat interpolation qualifier is
* applied to any of the varyings.
*/
@@ -168,9 +217,56 @@ struct svga_shader_variant
/** For FS-based polygon stipple */
unsigned pstipple_sampler_unit;
+};
- /** Next variant */
- struct svga_shader_variant *next;
+
+/**
+ * Shader variant for geometry shader
+ */
+struct svga_gs_variant
+{
+ struct svga_shader_variant base;
+};
+
+
+/**
+ * Shader variant for vertex shader
+ */
+struct svga_vs_variant
+{
+ struct svga_shader_variant base;
+};
+
+
+/**
+ * Shader variant for tessellation evaluation shader
+ */
+struct svga_tes_variant
+{
+ struct svga_shader_variant base;
+
+ enum pipe_prim_type prim_mode:8;
+ enum pipe_tess_spacing spacing:3;
+ unsigned vertices_order_cw:1;
+ unsigned point_mode:1;
+};
+
+
+/**
+ * Shader variant for tessellation control shader
+ */
+struct svga_tcs_variant
+{
+ struct svga_shader_variant base;
+};
+
+
+/**
+ * Shader variant for compute shader
+ */
+struct svga_cs_variant
+{
+ struct svga_shader_variant base;
};
@@ -237,6 +333,30 @@ struct svga_geometry_shader
};
+struct svga_tcs_shader
+{
+ struct svga_shader base;
+
+ /** Mask of which generic varying variables are written by this shader */
+ uint64_t generic_outputs;
+};
+
+
+struct svga_tes_shader
+{
+ struct svga_shader base;
+
+ /** Mask of which generic varying variables are written by this shader */
+ uint64_t generic_inputs;
+};
+
+
+struct svga_compute_shader
+{
+ struct svga_shader base;
+};
+
+
static inline boolean
svga_compile_keys_equal(const struct svga_compile_key *a,
const struct svga_compile_key *b)
@@ -264,7 +384,8 @@ svga_remap_generic_index(int8_t remap_table[MAX_GENERIC_VARYING],
void
svga_init_shader_key_common(const struct svga_context *svga,
- enum pipe_shader_type shader,
+ enum pipe_shader_type shader_type,
+ const struct svga_shader *shader,
struct svga_compile_key *key);
struct svga_shader_variant *
@@ -328,6 +449,12 @@ svga_shader_type(enum pipe_shader_type shader)
return SVGA3D_SHADERTYPE_GS;
case PIPE_SHADER_FRAGMENT:
return SVGA3D_SHADERTYPE_PS;
+ case PIPE_SHADER_TESS_CTRL:
+ return SVGA3D_SHADERTYPE_HS;
+ case PIPE_SHADER_TESS_EVAL:
+ return SVGA3D_SHADERTYPE_DS;
+ case PIPE_SHADER_COMPUTE:
+ return SVGA3D_SHADERTYPE_CS;
default:
assert(!"Invalid shader type");
return SVGA3D_SHADERTYPE_VS;
@@ -351,4 +478,39 @@ svga_have_gs_streamout(const struct svga_context *svga)
}
+static inline struct svga_fs_variant *
+svga_fs_variant(struct svga_shader_variant *variant)
+{
+ assert(!variant || variant->type == SVGA3D_SHADERTYPE_PS);
+ return (struct svga_fs_variant *)variant;
+}
+
+
+static inline struct svga_tes_variant *
+svga_tes_variant(struct svga_shader_variant *variant)
+{
+ assert(!variant || variant->type == SVGA3D_SHADERTYPE_DS);
+ return (struct svga_tes_variant *)variant;
+}
+
+
+static inline struct svga_cs_variant *
+svga_cs_variant(struct svga_shader_variant *variant)
+{
+ assert(!variant || variant->type == SVGA3D_SHADERTYPE_CS);
+ return (struct svga_cs_variant *)variant;
+}
+
+
+/* Returns TRUE if we are currently using flat shading.
+ */
+static inline boolean
+svga_is_using_flat_shading(const struct svga_context *svga)
+{
+ return
+ svga->state.hw_draw.fs ?
+ svga_fs_variant(svga->state.hw_draw.fs)->uses_flat_interp : FALSE;
+}
+
+
#endif /* SVGA_SHADER_H */
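The per-stage variant structs above use a simple "subclassing" scheme: the common svga_shader_variant is embedded as the first member, and inline helpers such as svga_fs_variant() downcast a base pointer after asserting its type. A minimal, compilable sketch of that pattern with stand-in names (not the driver's types):

#include <assert.h>

enum stage_type { STAGE_PS, STAGE_DS };

struct variant_base {
   enum stage_type type;
};

struct fs_variant {
   struct variant_base base;   /* must be the first member */
   int uses_flat_interp;
};

/* The downcast is safe because 'base' is the first member, so the derived
 * struct and its base share the same address. */
static inline struct fs_variant *
as_fs_variant(struct variant_base *v)
{
   assert(!v || v->type == STAGE_PS);
   return (struct fs_variant *) v;
}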
diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c
index dad78389a23..ad647d8784c 100644
--- a/src/gallium/drivers/svga/svga_state.c
+++ b/src/gallium/drivers/svga/svga_state.c
@@ -60,19 +60,40 @@ static const struct svga_tracked_state *hw_clear_state[] =
};
-/* Atoms to update hardware state prior to emitting a draw packet.
+/**
+ * Atoms to update hardware state prior to emitting a draw packet
+ * for a VGPU9 device.
*/
-static const struct svga_tracked_state *hw_draw_state[] =
+static const struct svga_tracked_state *hw_draw_state_vgpu9[] =
+{
+ &svga_hw_fs,
+ &svga_hw_vs,
+ &svga_hw_rss,
+ &svga_hw_tss,
+ &svga_hw_tss_binding,
+ &svga_hw_clip_planes,
+ &svga_hw_vdecl,
+ &svga_hw_fs_constants,
+ &svga_hw_vs_constants,
+ NULL
+};
+
+
+/**
+ * Atoms to update hardware state prior to emitting a draw packet
+ * for a VGPU10 device.
+ * The geometry shader is new to VGPU10.
+ * TSS and TSS bindings are replaced by samplers and sampler bindings.
+ */
+static const struct svga_tracked_state *hw_draw_state_vgpu10[] =
{
&svga_need_tgsi_transform,
&svga_hw_fs,
&svga_hw_gs,
&svga_hw_vs,
&svga_hw_rss,
- &svga_hw_sampler, /* VGPU10 */
- &svga_hw_sampler_bindings, /* VGPU10 */
- &svga_hw_tss, /* pre-VGPU10 */
- &svga_hw_tss_binding, /* pre-VGPU10 */
+ &svga_hw_sampler,
+ &svga_hw_sampler_bindings,
&svga_hw_clip_planes,
&svga_hw_vdecl,
&svga_hw_fs_constants,
@@ -82,6 +103,33 @@ static const struct svga_tracked_state *hw_draw_state[] =
};
+/**
+ * Atoms to update hardware state prior to emitting a draw packet
+ * for an SM5 device.
+ * TCS and TES shaders are new to the SM5 device.
+ */
+static const struct svga_tracked_state *hw_draw_state_sm5[] =
+{
+ &svga_need_tgsi_transform,
+ &svga_hw_fs,
+ &svga_hw_gs,
+ &svga_hw_tes,
+ &svga_hw_tcs,
+ &svga_hw_vs,
+ &svga_hw_rss,
+ &svga_hw_sampler,
+ &svga_hw_sampler_bindings,
+ &svga_hw_clip_planes,
+ &svga_hw_vdecl,
+ &svga_hw_fs_constants,
+ &svga_hw_gs_constants,
+ &svga_hw_tes_constants,
+ &svga_hw_tcs_constants,
+ &svga_hw_vs_constants,
+ NULL
+};
+
+
static const struct svga_tracked_state *swtnl_draw_state[] =
{
&svga_update_swtnl_draw,
@@ -89,6 +137,7 @@ static const struct svga_tracked_state *swtnl_draw_state[] =
NULL
};
+
/* Flattens the graph of state dependencies. Could swap the positions
* of hw_clear_state and need_swtnl_state without breaking anything.
*/
@@ -96,27 +145,26 @@ static const struct svga_tracked_state **state_levels[] =
{
need_swtnl_state,
hw_clear_state,
- hw_draw_state,
+ NULL, /* hw_draw_state, to be set to the right version */
swtnl_draw_state
};
-
-static unsigned
-check_state(unsigned a, unsigned b)
+static uint64_t
+check_state(uint64_t a, uint64_t b)
{
return (a & b);
}
static void
-accumulate_state(unsigned *a, unsigned b)
+accumulate_state(uint64_t *a, uint64_t b)
{
*a |= b;
}
static void
-xor_states(unsigned *result, unsigned a, unsigned b)
+xor_states(uint64_t *result, uint64_t a, uint64_t b)
{
*result = a ^ b;
}
@@ -125,7 +173,7 @@ xor_states(unsigned *result, unsigned a, unsigned b)
static enum pipe_error
update_state(struct svga_context *svga,
const struct svga_tracked_state *atoms[],
- unsigned *state)
+ uint64_t *state)
{
#ifdef DEBUG
boolean debug = TRUE;
@@ -144,13 +192,13 @@ update_state(struct svga_context *svga,
* state flags which are generated and checked to help ensure
* state atoms are ordered correctly in the list.
*/
- unsigned examined, prev;
+ uint64_t examined, prev;
examined = 0;
prev = *state;
for (i = 0; atoms[i] != NULL; i++) {
- unsigned generated;
+ uint64_t generated;
assert(atoms[i]->dirty);
assert(atoms[i]->update);
@@ -247,12 +295,7 @@ svga_update_state_retry(struct svga_context *svga, unsigned max_level)
{
enum pipe_error ret;
- ret = svga_update_state( svga, max_level );
-
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = svga_update_state( svga, max_level );
- }
+ SVGA_RETRY_OOM(svga, ret, svga_update_state( svga, max_level ));
return ret == PIPE_OK;
}
@@ -325,3 +368,14 @@ svga_emit_initial_state(struct svga_context *svga)
return PIPE_OK;
}
}
+
+
+void
+svga_init_tracked_state(struct svga_context *svga)
+{
+ /* Set the hw_draw_state atom list to the one for the particular GPU version.
+ */
+ state_levels[2] = svga_have_sm5(svga) ? hw_draw_state_sm5 :
+ (svga_have_vgpu10(svga) ? hw_draw_state_vgpu10 :
+ hw_draw_state_vgpu9);
+}
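svga_init_tracked_state() above picks one of three hw_draw atom lists depending on the device generation, and the dirty flags are widened to uint64_t now that there are more than 32 state bits. A small, self-contained sketch of the dirty-bit dispatch that update_state() performs; the atom list, flag value, and callback here are illustrative only:

#include <stdint.h>

typedef void (*update_fn)(uint64_t dirty);

struct tracked_state {
   const char *name;
   uint64_t dirty;               /* bits this atom depends on */
   update_fn update;
};

static void update_example(uint64_t dirty) { (void) dirty; }

static const struct tracked_state example_atoms[] = {
   { "example atom", UINT64_C(1) << 40, update_example },
   { NULL, 0, NULL }
};

/* Run every atom whose dependency mask intersects the dirty bits, then
 * clear the accumulated dirty state. */
static void
run_atoms(const struct tracked_state *atoms, uint64_t *dirty)
{
   for (unsigned i = 0; atoms[i].name; i++) {
      if (*dirty & atoms[i].dirty)
         atoms[i].update(*dirty);
   }
   *dirty = 0;
}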
diff --git a/src/gallium/drivers/svga/svga_state.h b/src/gallium/drivers/svga/svga_state.h
index 963a27941ba..76befebe4a3 100644
--- a/src/gallium/drivers/svga/svga_state.h
+++ b/src/gallium/drivers/svga/svga_state.h
@@ -39,8 +39,8 @@ void svga_destroy_state( struct svga_context *svga );
struct svga_tracked_state {
const char *name;
- unsigned dirty;
- enum pipe_error (*update)( struct svga_context *svga, unsigned dirty );
+ uint64_t dirty;
+ enum pipe_error (*update)( struct svga_context *svga, uint64_t dirty );
};
/* NEED_SWTNL
@@ -61,6 +61,8 @@ extern struct svga_tracked_state svga_need_tgsi_transform;
extern struct svga_tracked_state svga_hw_vs;
extern struct svga_tracked_state svga_hw_fs;
extern struct svga_tracked_state svga_hw_gs;
+extern struct svga_tracked_state svga_hw_tcs;
+extern struct svga_tracked_state svga_hw_tes;
extern struct svga_tracked_state svga_hw_rss;
extern struct svga_tracked_state svga_hw_pstipple;
extern struct svga_tracked_state svga_hw_sampler;
@@ -72,6 +74,8 @@ extern struct svga_tracked_state svga_hw_vdecl;
extern struct svga_tracked_state svga_hw_fs_constants;
extern struct svga_tracked_state svga_hw_gs_constants;
extern struct svga_tracked_state svga_hw_vs_constants;
+extern struct svga_tracked_state svga_hw_tes_constants;
+extern struct svga_tracked_state svga_hw_tcs_constants;
/* SWTNL_DRAW
*/
@@ -105,4 +109,15 @@ enum pipe_error svga_reemit_vs_bindings(struct svga_context *svga);
enum pipe_error svga_reemit_fs_bindings(struct svga_context *svga);
+void svga_init_tracked_state(struct svga_context *svga);
+
+void *
+svga_create_fs_state(struct pipe_context *pipe,
+ const struct pipe_shader_state *templ);
+
+void
+svga_bind_fs_state(struct pipe_context *pipe, void *shader);
+
+bool svga_update_compute_state(struct svga_context *svga);
+
#endif
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index e2c5bf0163a..9d9f8934ec4 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -133,12 +133,13 @@ svga_get_extra_fs_constants(const struct svga_context *svga, float *dest)
* will be returned in 'dest'.
*/
static unsigned
-svga_get_prescale_constants(const struct svga_context *svga, float **dest)
+svga_get_prescale_constants(const struct svga_context *svga, float **dest,
+ const struct svga_prescale *prescale)
{
- memcpy(*dest, svga->state.hw_clear.prescale.scale, 4 * sizeof(float));
+ memcpy(*dest, prescale->scale, 4 * sizeof(float));
*dest += 4;
- memcpy(*dest, svga->state.hw_clear.prescale.translate, 4 * sizeof(float));
+ memcpy(*dest, prescale->translate, 4 * sizeof(float));
*dest += 4;
return 2;
@@ -153,8 +154,8 @@ svga_get_pt_sprite_constants(const struct svga_context *svga, float **dest)
const struct svga_screen *screen = svga_screen(svga->pipe.screen);
float *dst = *dest;
- dst[0] = 1.0 / (svga->curr.viewport.scale[0] * 2);
- dst[1] = 1.0 / (svga->curr.viewport.scale[1] * 2);
+ dst[0] = 1.0 / (svga->curr.viewport[0].scale[0] * 2);
+ dst[1] = 1.0 / (svga->curr.viewport[0].scale[1] * 2);
dst[2] = svga->curr.rast->pointsize;
dst[3] = screen->maxPointSize;
*dest = *dest + 4;
@@ -186,6 +187,7 @@ svga_get_clip_plane_constants(const struct svga_context *svga,
return count;
}
+
/**
* Emit any extra vertex shader constants into the buffer pointed
* to by 'dest'.
@@ -203,15 +205,16 @@ svga_get_extra_vs_constants(const struct svga_context *svga, float *dest)
/* SVGA_NEW_VS_VARIANT
*/
if (variant->key.vs.need_prescale) {
- count += svga_get_prescale_constants(svga, &dest);
+ count += svga_get_prescale_constants(svga, &dest,
+ &svga->state.hw_clear.prescale[0]);
}
if (variant->key.vs.undo_viewport) {
/* Used to convert window coords back to NDC coords */
- dest[0] = 1.0f / svga->curr.viewport.scale[0];
- dest[1] = 1.0f / svga->curr.viewport.scale[1];
- dest[2] = -svga->curr.viewport.translate[0];
- dest[3] = -svga->curr.viewport.translate[1];
+ dest[0] = 1.0f / svga->curr.viewport[0].scale[0];
+ dest[1] = 1.0f / svga->curr.viewport[0].scale[1];
+ dest[2] = -svga->curr.viewport[0].translate[0];
+ dest[3] = -svga->curr.viewport[0].translate[1];
dest += 4;
count += 1;
}
@@ -250,7 +253,20 @@ svga_get_extra_gs_constants(const struct svga_context *svga, float *dest)
}
if (variant->key.gs.need_prescale) {
- count += svga_get_prescale_constants(svga, &dest);
+ unsigned i, num_prescale = 1;
+
+ /* If prescale is needed and the geometry shader writes to viewport
+ * index, then prescale for all viewports will be added to the
+ * constant buffer.
+ */
+ if (variant->key.gs.writes_viewport_index)
+ num_prescale = svga->state.hw_clear.num_prescale;
+
+ for (i = 0; i < num_prescale; i++) {
+ count +=
+ svga_get_prescale_constants(svga, &dest,
+ &svga->state.hw_clear.prescale[i]);
+ }
}
/* SVGA_NEW_CLIP */
@@ -265,6 +281,77 @@ svga_get_extra_gs_constants(const struct svga_context *svga, float *dest)
}
+/**
+ * Emit any extra tessellation control shader constants into the
+ * buffer pointed to by 'dest'.
+ */
+static unsigned
+svga_get_extra_tcs_constants(struct svga_context *svga, float *dest)
+{
+ const struct svga_shader_variant *variant = svga->state.hw_draw.tcs;
+ unsigned count = 0;
+
+ /* SVGA_NEW_CLIP */
+ count += svga_get_clip_plane_constants(svga, variant, &dest);
+
+ /* common constants */
+ count += svga_get_extra_constants_common(svga, variant,
+ PIPE_SHADER_TESS_CTRL,
+ dest);
+
+ assert(count <= MAX_EXTRA_CONSTS);
+ return count;
+}
+
+
+/**
+ * Emit any extra tessellation evaluation shader constants into
+ * the buffer pointed to by 'dest'.
+ */
+static unsigned
+svga_get_extra_tes_constants(struct svga_context *svga, float *dest)
+{
+ const struct svga_shader_variant *variant = svga->state.hw_draw.tes;
+ unsigned count = 0;
+
+ if (variant->key.tes.need_prescale) {
+ count += svga_get_prescale_constants(svga, &dest,
+ &svga->state.hw_clear.prescale[0]);
+ }
+
+ /* SVGA_NEW_CLIP */
+ count += svga_get_clip_plane_constants(svga, variant, &dest);
+
+ /* common constants */
+ count += svga_get_extra_constants_common(svga, variant,
+ PIPE_SHADER_TESS_EVAL,
+ dest);
+
+ assert(count <= MAX_EXTRA_CONSTS);
+ return count;
+}
+
+
+/**
+ * Emit any extra compute shader constants into
+ * the buffer pointed to by 'dest'.
+ */
+static unsigned
+svga_get_extra_cs_constants(struct svga_context *svga, float *dest)
+{
+ const struct svga_shader_variant *variant = svga->state.hw_draw.cs;
+ unsigned count = 0;
+
+ /* common constants */
+ count += svga_get_extra_constants_common(svga, variant,
+ PIPE_SHADER_COMPUTE,
+ dest);
+
+ assert(count <= MAX_EXTRA_CONSTS);
+ return count;
+}
+
+
/*
* Check and emit a range of shader constant registers, trying to coalesce
* successive shader constant updates in a single command in order to save
@@ -490,6 +577,15 @@ emit_constbuf_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
const struct svga_shader_variant *variant;
unsigned alloc_buf_size;
+ assert(shader == PIPE_SHADER_VERTEX ||
+ shader == PIPE_SHADER_GEOMETRY ||
+ shader == PIPE_SHADER_FRAGMENT ||
+ shader == PIPE_SHADER_TESS_CTRL ||
+ shader == PIPE_SHADER_TESS_EVAL ||
+ shader == PIPE_SHADER_COMPUTE);
+
+ cbuf = &svga->curr.constbufs[shader][0];
+
switch (shader) {
case PIPE_SHADER_VERTEX:
variant = svga->state.hw_draw.vs;
@@ -503,6 +599,18 @@ emit_constbuf_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
variant = svga->state.hw_draw.gs;
extra_count = svga_get_extra_gs_constants(svga, (float *) extras);
break;
+ case PIPE_SHADER_TESS_CTRL:
+ variant = svga->state.hw_draw.tcs;
+ extra_count = svga_get_extra_tcs_constants(svga, (float *) extras);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ variant = svga->state.hw_draw.tes;
+ extra_count = svga_get_extra_tes_constants(svga, (float *) extras);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ variant = svga->state.hw_draw.cs;
+ extra_count = svga_get_extra_cs_constants(svga, (float *) extras);
+ break;
default:
assert(!"Unexpected shader type");
/* Don't return an error code since we don't want to keep re-trying
@@ -706,7 +814,7 @@ emit_consts_vgpu10(struct svga_context *svga, enum pipe_shader_type shader)
}
static enum pipe_error
-emit_fs_consts(struct svga_context *svga, unsigned dirty)
+emit_fs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.fs;
enum pipe_error ret = PIPE_OK;
@@ -741,7 +849,7 @@ struct svga_tracked_state svga_hw_fs_constants =
static enum pipe_error
-emit_vs_consts(struct svga_context *svga, unsigned dirty)
+emit_vs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.vs;
enum pipe_error ret = PIPE_OK;
@@ -776,7 +884,7 @@ struct svga_tracked_state svga_hw_vs_constants =
static enum pipe_error
-emit_gs_consts(struct svga_context *svga, unsigned dirty)
+emit_gs_consts(struct svga_context *svga, uint64_t dirty)
{
const struct svga_shader_variant *variant = svga->state.hw_draw.gs;
enum pipe_error ret = PIPE_OK;
@@ -788,17 +896,17 @@ emit_gs_consts(struct svga_context *svga, unsigned dirty)
/* SVGA_NEW_GS_CONST_BUFFER
*/
- if (svga_have_vgpu10(svga)) {
- /**
- * If only the rasterizer state has changed and the current geometry
- * shader does not emit wide points, then there is no reason to
- * re-emit the GS constants, so skip it.
- */
- if (dirty == SVGA_NEW_RAST && !variant->key.gs.wide_point)
- return PIPE_OK;
+ assert(svga_have_vgpu10(svga));
- ret = emit_consts_vgpu10(svga, PIPE_SHADER_GEOMETRY);
- }
+ /**
+ * If only the rasterizer state has changed and the current geometry
+ * shader does not emit wide points, then there is no reason to
+ * re-emit the GS constants, so skip it.
+ */
+ if (dirty == SVGA_NEW_RAST && !variant->key.gs.wide_point)
+ return PIPE_OK;
+
+ ret = emit_consts_vgpu10(svga, PIPE_SHADER_GEOMETRY);
return ret;
}
@@ -814,3 +922,66 @@ struct svga_tracked_state svga_hw_gs_constants =
SVGA_NEW_TEXTURE_CONSTS),
emit_gs_consts
};
+
+
+/**
+ * Emit constant buffer for tessellation control shader
+ */
+static enum pipe_error
+emit_tcs_consts(struct svga_context *svga, uint64_t dirty)
+{
+ const struct svga_shader_variant *variant = svga->state.hw_draw.tcs;
+ enum pipe_error ret = PIPE_OK;
+
+ assert(svga_have_sm5(svga));
+
+ /* SVGA_NEW_TCS_VARIANT */
+ if (!variant)
+ return PIPE_OK;
+
+ /* SVGA_NEW_TCS_CONST_BUFFER */
+
+ ret = emit_consts_vgpu10(svga, PIPE_SHADER_TESS_CTRL);
+
+ return ret;
+}
+
+
+struct svga_tracked_state svga_hw_tcs_constants =
+{
+ "hw tcs params",
+ (SVGA_NEW_TCS_CONST_BUFFER |
+ SVGA_NEW_TCS_VARIANT),
+ emit_tcs_consts
+};
+
+
+/**
+ * Emit constant buffer for tessellation evaluation shader
+ */
+static enum pipe_error
+emit_tes_consts(struct svga_context *svga, uint64_t dirty)
+{
+ const struct svga_shader_variant *variant = svga->state.hw_draw.tes;
+ enum pipe_error ret = PIPE_OK;
+
+ assert(svga_have_sm5(svga));
+
+ /* SVGA_NEW_TES_VARIANT */
+ if (!variant)
+ return PIPE_OK;
+
+ ret = emit_consts_vgpu10(svga, PIPE_SHADER_TESS_EVAL);
+
+ return ret;
+}
+
+
+struct svga_tracked_state svga_hw_tes_constants =
+{
+ "hw tes params",
+ (SVGA_NEW_PRESCALE |
+ SVGA_NEW_TES_CONST_BUFFER |
+ SVGA_NEW_TES_VARIANT),
+ emit_tes_consts
+};
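The prescale-constant helper reworked above copies one viewport's scale and translate vectors into the constant buffer and reports how many vec4 slots it used, so the GS path can emit one set per viewport. A reduced sketch of that helper with a stand-in prescale type (not the driver's struct):

#include <string.h>

struct prescale {
   float scale[4];
   float translate[4];
};

/* Append one prescale (scale then translate) to the constant buffer and
 * return the number of vec4 constants written. */
static unsigned
get_prescale_constants(float **dest, const struct prescale *p)
{
   memcpy(*dest, p->scale, 4 * sizeof(float));
   *dest += 4;
   memcpy(*dest, p->translate, 4 * sizeof(float));
   *dest += 4;
   return 2;
}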
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
index 99fede51254..dacf86c4277 100644
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -212,9 +212,13 @@ emit_fb_vgpu10(struct svga_context *svga)
if (curr->cbufs[i]) {
struct pipe_surface *s = curr->cbufs[i];
- rtv[i] = svga_validate_surface_view(svga, svga_surface(s));
- if (rtv[i] == NULL) {
- return PIPE_ERROR_OUT_OF_MEMORY;
+ if (curr->cbufs[i] != hw->cbufs[i]) {
+ rtv[i] = svga_validate_surface_view(svga, svga_surface(s));
+ if (rtv[i] == NULL) {
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+ } else {
+ rtv[i] = svga->state.hw_clear.rtv[i];
}
assert(svga_surface(rtv[i])->view_id != SVGA3D_INVALID_ID);
@@ -233,9 +237,13 @@ emit_fb_vgpu10(struct svga_context *svga)
if (curr->zsbuf) {
struct pipe_surface *s = curr->zsbuf;
- dsv = svga_validate_surface_view(svga, svga_surface(curr->zsbuf));
- if (!dsv) {
- return PIPE_ERROR_OUT_OF_MEMORY;
+ if (curr->zsbuf != hw->zsbuf) {
+ dsv = svga_validate_surface_view(svga, svga_surface(curr->zsbuf));
+ if (!dsv) {
+ return PIPE_ERROR_OUT_OF_MEMORY;
+ }
+ } else {
+ dsv = svga->state.hw_clear.dsv;
}
/* Set the rendered-to flag */
@@ -258,10 +266,6 @@ emit_fb_vgpu10(struct svga_context *svga)
/* number of render targets sent to the device, not including trailing
* unbound render targets.
*/
- svga->state.hw_clear.num_rendertargets = last_rtv + 1;
- svga->state.hw_clear.dsv = dsv;
- memcpy(svga->state.hw_clear.rtv, rtv, num_color * sizeof(rtv[0]));
-
for (i = 0; i < ss->max_color_buffers; i++) {
if (hw->cbufs[i] != curr->cbufs[i]) {
/* propagate the backed view surface before unbinding it */
@@ -270,19 +274,32 @@ emit_fb_vgpu10(struct svga_context *svga)
&svga_surface(hw->cbufs[i])->backed->base,
TRUE);
}
+ else if (svga->state.hw_clear.rtv[i] != hw->cbufs[i] &&
+ svga->state.hw_clear.rtv[i]) {
+ /* Free the alternate surface view when it is unbound. */
+ svga->pipe.surface_destroy(&svga->pipe, svga->state.hw_clear.rtv[i]);
+ }
pipe_surface_reference(&hw->cbufs[i], curr->cbufs[i]);
}
}
+ svga->state.hw_clear.num_rendertargets = last_rtv + 1;
+ memcpy(svga->state.hw_clear.rtv, rtv, num_color * sizeof(rtv[0]));
hw->nr_cbufs = curr->nr_cbufs;
if (hw->zsbuf != curr->zsbuf) {
/* propagate the backed view surface before unbinding it */
if (hw->zsbuf && svga_surface(hw->zsbuf)->backed) {
- svga_propagate_surface(svga, &svga_surface(hw->zsbuf)->backed->base,
+ svga_propagate_surface(svga,
+ &svga_surface(hw->zsbuf)->backed->base,
TRUE);
}
+ else if (svga->state.hw_clear.dsv != hw->zsbuf && svga->state.hw_clear.dsv) {
+ /* Free the alternate surface view when it is unbound. */
+ svga->pipe.surface_destroy(&svga->pipe, svga->state.hw_clear.dsv);
+ }
pipe_surface_reference(&hw->zsbuf, curr->zsbuf);
}
+ svga->state.hw_clear.dsv = dsv;
}
return ret;
@@ -290,7 +307,7 @@ emit_fb_vgpu10(struct svga_context *svga)
static enum pipe_error
-emit_framebuffer(struct svga_context *svga, unsigned dirty)
+emit_framebuffer(struct svga_context *svga, uint64_t dirty)
{
if (svga_have_vgpu10(svga)) {
return emit_fb_vgpu10(svga);
@@ -383,13 +400,14 @@ struct svga_tracked_state svga_hw_framebuffer =
/***********************************************************************
*/
-static enum pipe_error
-emit_viewport( struct svga_context *svga,
- unsigned dirty )
+static void
+get_viewport_prescale(struct svga_context *svga,
+ struct pipe_viewport_state *viewport,
+ SVGA3dViewport *vp,
+ struct svga_prescale *prescale)
{
- const struct pipe_viewport_state *viewport = &svga->curr.viewport;
- struct svga_prescale prescale;
SVGA3dRect rect;
+
/* Not sure if this state is relevant with POSITIONT. Probably
* not, but setting to 0,1 avoids some state pingponging.
*/
@@ -398,7 +416,6 @@ emit_viewport( struct svga_context *svga,
float flip = -1.0;
boolean degenerate = FALSE;
boolean invertY = FALSE;
- enum pipe_error ret;
float fb_width = (float) svga->curr.framebuffer.width;
float fb_height = (float) svga->curr.framebuffer.height;
@@ -407,9 +424,8 @@ emit_viewport( struct svga_context *svga,
float fy = flip * viewport->scale[1] * -1.0f + viewport->translate[1];
float fw = viewport->scale[0] * 2.0f;
float fh = flip * viewport->scale[1] * 2.0f;
- boolean emit_vgpu10_viewport = FALSE;
- memset( &prescale, 0, sizeof(prescale) );
+ memset(prescale, 0, sizeof(*prescale));
/* Examine gallium viewport transformation and produce a screen
* rectangle and possibly vertex shader pre-transformation to
@@ -423,14 +439,14 @@ emit_viewport( struct svga_context *svga,
fw,
fh);
- prescale.scale[0] = 1.0;
- prescale.scale[1] = 1.0;
- prescale.scale[2] = 1.0;
- prescale.scale[3] = 1.0;
- prescale.translate[0] = 0;
- prescale.translate[1] = 0;
- prescale.translate[2] = 0;
- prescale.translate[3] = 0;
+ prescale->scale[0] = 1.0;
+ prescale->scale[1] = 1.0;
+ prescale->scale[2] = 1.0;
+ prescale->scale[3] = 1.0;
+ prescale->translate[0] = 0;
+ prescale->translate[1] = 0;
+ prescale->translate[2] = 0;
+ prescale->translate[3] = 0;
/* Enable prescale to adjust vertex positions to match
VGPU10 convention only if rasterization is enabled.
@@ -439,12 +455,12 @@ emit_viewport( struct svga_context *svga,
degenerate = TRUE;
goto out;
} else {
- prescale.enabled = TRUE;
+ prescale->enabled = TRUE;
}
if (fw < 0) {
- prescale.scale[0] *= -1.0f;
- prescale.translate[0] += -fw;
+ prescale->scale[0] *= -1.0f;
+ prescale->translate[0] += -fw;
fw = -fw;
fx = viewport->scale[0] * 1.0f + viewport->translate[0];
}
@@ -452,54 +468,54 @@ emit_viewport( struct svga_context *svga,
if (fh < 0.0) {
if (svga_have_vgpu10(svga)) {
/* floating point viewport params below */
- prescale.translate[1] = fh + fy * 2.0f;
+ prescale->translate[1] = fh + fy * 2.0f;
}
else {
/* integer viewport params below */
- prescale.translate[1] = fh - 1.0f + fy * 2.0f;
+ prescale->translate[1] = fh - 1.0f + fy * 2.0f;
}
fh = -fh;
fy -= fh;
- prescale.scale[1] = -1.0f;
+ prescale->scale[1] = -1.0f;
invertY = TRUE;
}
if (fx < 0) {
- prescale.translate[0] += fx;
- prescale.scale[0] *= fw / (fw + fx);
+ prescale->translate[0] += fx;
+ prescale->scale[0] *= fw / (fw + fx);
fw += fx;
fx = 0.0f;
}
if (fy < 0) {
if (invertY) {
- prescale.translate[1] -= fy;
+ prescale->translate[1] -= fy;
}
else {
- prescale.translate[1] += fy;
+ prescale->translate[1] += fy;
}
- prescale.scale[1] *= fh / (fh + fy);
+ prescale->scale[1] *= fh / (fh + fy);
fh += fy;
fy = 0.0f;
}
if (fx + fw > fb_width) {
- prescale.scale[0] *= fw / (fb_width - fx);
- prescale.translate[0] -= fx * (fw / (fb_width - fx));
- prescale.translate[0] += fx;
+ prescale->scale[0] *= fw / (fb_width - fx);
+ prescale->translate[0] -= fx * (fw / (fb_width - fx));
+ prescale->translate[0] += fx;
fw = fb_width - fx;
}
if (fy + fh > fb_height) {
- prescale.scale[1] *= fh / (fb_height - fy);
+ prescale->scale[1] *= fh / (fb_height - fy);
if (invertY) {
float in = fb_height - fy; /* number of vp pixels inside view */
float out = fy + fh - fb_height; /* number of vp pixels out of view */
- prescale.translate[1] += fy * out / in;
+ prescale->translate[1] += fy * out / in;
}
else {
- prescale.translate[1] -= fy * (fh / (fb_height - fy));
- prescale.translate[1] += fy;
+ prescale->translate[1] -= fy * (fh / (fb_height - fy));
+ prescale->translate[1] += fy;
}
fh = fb_height - fy;
}
@@ -566,10 +582,10 @@ emit_viewport( struct svga_context *svga,
if (invertY)
adjust_y = -adjust_y;
- prescale.translate[0] += adjust_x;
- prescale.translate[1] += adjust_y;
- prescale.translate[2] = 0.5; /* D3D clip space */
- prescale.scale[2] = 0.5; /* D3D clip space */
+ prescale->translate[0] += adjust_x;
+ prescale->translate[1] += adjust_y;
+ prescale->translate[2] = 0.5; /* D3D clip space */
+ prescale->scale[2] = 0.5; /* D3D clip space */
}
range_min = viewport->scale[2] * -1.0f + viewport->translate[2];
@@ -584,7 +600,7 @@ emit_viewport( struct svga_context *svga,
range_tmp = range_min;
range_min = range_max;
range_max = range_tmp;
- prescale.scale[2] = -prescale.scale[2];
+ prescale->scale[2] = -prescale->scale[2];
}
/* If zmin is less than 0, clamp zmin to 0 and adjust the prescale.
@@ -594,21 +610,21 @@ emit_viewport( struct svga_context *svga,
if (range_min < 0.0f) {
range_min = -0.5f * viewport->scale[2] + 0.5f + viewport->translate[2];
range_max = 0.5f * viewport->scale[2] + 0.5f + viewport->translate[2];
- prescale.scale[2] *= 2.0f;
- prescale.translate[2] -= 0.5f;
+ prescale->scale[2] *= 2.0f;
+ prescale->translate[2] -= 0.5f;
}
- if (prescale.enabled) {
+ if (prescale->enabled) {
float H[2];
float J[2];
int i;
SVGA_DBG(DEBUG_VIEWPORT,
"prescale %f,%f %fx%f\n",
- prescale.translate[0],
- prescale.translate[1],
- prescale.scale[0],
- prescale.scale[1]);
+ prescale->translate[0],
+ prescale->translate[1],
+ prescale->scale[0],
+ prescale->scale[1]);
H[0] = (float)rect.w / 2.0f;
H[1] = -(float)rect.h / 2.0f;
@@ -645,16 +661,16 @@ emit_viewport( struct svga_context *svga,
* Overwrite prescale.translate with values for K:
*/
for (i = 0; i < 2; i++) {
- prescale.translate[i] = ((prescale.translate[i] +
- (prescale.scale[i] - 1.0f) * J[i]) / H[i]);
+ prescale->translate[i] = ((prescale->translate[i] +
+ (prescale->scale[i] - 1.0f) * J[i]) / H[i]);
}
SVGA_DBG(DEBUG_VIEWPORT,
"clipspace %f,%f %fx%f\n",
- prescale.translate[0],
- prescale.translate[1],
- prescale.scale[0],
- prescale.scale[1]);
+ prescale->translate[0],
+ prescale->translate[1],
+ prescale->scale[0],
+ prescale->scale[1]);
}
out:
@@ -663,59 +679,90 @@ out:
rect.y = 0;
rect.w = 1;
rect.h = 1;
- prescale.enabled = FALSE;
+ prescale->enabled = FALSE;
}
- if (!svga_rects_equal(&rect, &svga->state.hw_clear.viewport)) {
- if (svga_have_vgpu10(svga)) {
- emit_vgpu10_viewport = TRUE;
- }
- else {
+ vp->x = (float) rect.x;
+ vp->y = (float) rect.y;
+ vp->width = (float) rect.w;
+ vp->height = (float) rect.h;
+ vp->minDepth = range_min;
+ vp->maxDepth = range_max;
+}
+
+
+static enum pipe_error
+emit_viewport( struct svga_context *svga,
+ uint64_t dirty )
+{
+ struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
+ SVGA3dViewport viewports[SVGA3D_DX_MAX_VIEWPORTS];
+ struct svga_prescale prescale[SVGA3D_DX_MAX_VIEWPORTS];
+ unsigned i;
+ enum pipe_error ret;
+ unsigned max_viewports = svgascreen->max_viewports;
+
+ for (i = 0; i < max_viewports; i++) {
+ get_viewport_prescale(svga, &svga->curr.viewport[i],
+ &viewports[i], &prescale[i]);
+ }
+
+ if (memcmp(viewports, svga->state.hw_clear.viewports,
+ max_viewports * sizeof viewports[0]) != 0) {
+
+ if (!svga_have_vgpu10(svga)) {
+ SVGA3dRect rect;
+ SVGA3dViewport *vp = &viewports[0];
+
+ rect.x = (uint32)vp->x;
+ rect.y = (uint32)vp->y;
+ rect.w = (uint32)vp->width;
+ rect.h = (uint32)vp->height;
+
ret = SVGA3D_SetViewport(svga->swc, &rect);
if (ret != PIPE_OK)
return ret;
- svga->state.hw_clear.viewport = rect;
- }
- }
+ ret = SVGA3D_SetZRange(svga->swc, vp->minDepth, vp->maxDepth);
+ if (ret != PIPE_OK)
+ return ret;
- if (svga->state.hw_clear.depthrange.zmin != range_min ||
- svga->state.hw_clear.depthrange.zmax != range_max)
- {
- if (svga_have_vgpu10(svga)) {
- emit_vgpu10_viewport = TRUE;
+ svga->state.hw_clear.viewport = rect;
+ svga->state.hw_clear.depthrange.zmin = vp->minDepth;
+ svga->state.hw_clear.depthrange.zmax = vp->maxDepth;
}
else {
- ret = SVGA3D_SetZRange(svga->swc, range_min, range_max );
+ ret = SVGA3D_vgpu10_SetViewports(svga->swc, max_viewports,
+ viewports);
if (ret != PIPE_OK)
return ret;
-
- svga->state.hw_clear.depthrange.zmin = range_min;
- svga->state.hw_clear.depthrange.zmax = range_max;
}
+ memcpy(svga->state.hw_clear.viewports, viewports,
+ max_viewports * sizeof viewports[0]);
}
- if (emit_vgpu10_viewport) {
- SVGA3dViewport vp;
- vp.x = (float) rect.x;
- vp.y = (float) rect.y;
- vp.width = (float) rect.w;
- vp.height = (float) rect.h;
- vp.minDepth = range_min;
- vp.maxDepth = range_max;
- ret = SVGA3D_vgpu10_SetViewports(svga->swc, 1, &vp);
- if (ret != PIPE_OK)
- return ret;
-
- svga->state.hw_clear.viewport = rect;
-
- svga->state.hw_clear.depthrange.zmin = range_min;
- svga->state.hw_clear.depthrange.zmax = range_max;
- }
-
- if (memcmp(&prescale, &svga->state.hw_clear.prescale, sizeof prescale) != 0) {
+ if (memcmp(prescale, svga->state.hw_clear.prescale,
+ max_viewports * sizeof prescale[0]) != 0) {
svga->dirty |= SVGA_NEW_PRESCALE;
- svga->state.hw_clear.prescale = prescale;
+ memcpy(svga->state.hw_clear.prescale, prescale,
+ max_viewports * sizeof prescale[0]);
+
+ /*
+       * Determine the number of unique prescales. This minimizes the
+       * number of checks needed in the geometry shader to identify the
+       * prescale for the specified viewport.
+ */
+ unsigned last_prescale = SVGA3D_DX_MAX_VIEWPORTS - 1;
+ unsigned i;
+ for (i = SVGA3D_DX_MAX_VIEWPORTS-1; i > 0; i--) {
+ if (memcmp(&svga->state.hw_clear.prescale[i],
+ &svga->state.hw_clear.prescale[i-1],
+ sizeof svga->state.hw_clear.prescale[0])) {
+ break;
+ }
+ last_prescale--;
+ }
+ svga->state.hw_clear.num_prescale = last_prescale + 1;
}
return PIPE_OK;
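
For reference, the unique-prescale count computed above collapses identical trailing entries so the geometry shader only needs to distinguish prescales that actually differ. A minimal standalone sketch of the same backwards adjacent-compare (hypothetical helper and element type, not part of this change):

/* Illustration of the num_prescale computation: walk backwards and stop
 * at the first adjacent pair that differs, e.g. {A, B, B, B} -> 2 and
 * {A, A, A, A} -> 1.  Uses a plain array element so memcmp() is a valid
 * comparison, mirroring the struct compare in the driver code above.
 */
#include <string.h>

static unsigned
count_unique_prescales(const float (*prescale)[8], unsigned n)
{
   unsigned last = n - 1;
   unsigned i;
   for (i = n - 1; i > 0; i--) {
      if (memcmp(&prescale[i], &prescale[i - 1], sizeof prescale[0]) != 0)
         break;      /* entries i-1 and i differ */
      last--;        /* entry i duplicates its predecessor */
   }
   return last + 1;
}
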
@@ -738,32 +785,49 @@ struct svga_tracked_state svga_hw_viewport =
*/
static enum pipe_error
emit_scissor_rect( struct svga_context *svga,
- unsigned dirty )
+ uint64_t dirty )
{
- const struct pipe_scissor_state *scissor = &svga->curr.scissor;
+ struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
+ const struct pipe_scissor_state *scissor = svga->curr.scissor;
+ unsigned max_viewports = svgascreen->max_viewports;
+ enum pipe_error ret;
- if (svga_have_vgpu10(svga)) {
- SVGASignedRect rect;
+ if (memcmp(&svga->state.hw_clear.scissors[0], scissor,
+ max_viewports * sizeof *scissor) != 0) {
- rect.left = scissor->minx;
- rect.top = scissor->miny;
- rect.right = scissor->maxx;
- rect.bottom = scissor->maxy;
+ if (svga_have_vgpu10(svga)) {
+ SVGASignedRect rect[SVGA3D_DX_MAX_VIEWPORTS];
+ unsigned i;
+
+ for (i = 0; i < max_viewports; i++) {
+ rect[i].left = scissor[i].minx;
+ rect[i].top = scissor[i].miny;
+ rect[i].right = scissor[i].maxx;
+ rect[i].bottom = scissor[i].maxy;
+ }
- return SVGA3D_vgpu10_SetScissorRects(svga->swc, 1, &rect);
- }
- else {
- SVGA3dRect rect;
+ ret = SVGA3D_vgpu10_SetScissorRects(svga->swc, max_viewports, rect);
+ }
+ else {
+ SVGA3dRect rect;
- rect.x = scissor->minx;
- rect.y = scissor->miny;
- rect.w = scissor->maxx - scissor->minx; /* + 1 ?? */
- rect.h = scissor->maxy - scissor->miny; /* + 1 ?? */
+ rect.x = scissor[0].minx;
+ rect.y = scissor[0].miny;
+ rect.w = scissor[0].maxx - scissor[0].minx; /* + 1 ?? */
+ rect.h = scissor[0].maxy - scissor[0].miny; /* + 1 ?? */
- return SVGA3D_SetScissorRect(svga->swc, &rect);
+ ret = SVGA3D_SetScissorRect(svga->swc, &rect);
+ }
+
+ if (ret != PIPE_OK)
+ return ret;
+
+ memcpy(svga->state.hw_clear.scissors, scissor,
+ max_viewports * sizeof *scissor);
}
-}
+ return PIPE_OK;
+}
struct svga_tracked_state svga_hw_scissor =
{
@@ -779,7 +843,7 @@ struct svga_tracked_state svga_hw_scissor =
static enum pipe_error
emit_clip_planes( struct svga_context *svga,
- unsigned dirty )
+ uint64_t dirty )
{
unsigned i;
enum pipe_error ret;
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index d55a799d435..675fec96cf8 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -196,8 +196,10 @@ make_fs_key(const struct svga_context *svga,
*/
if (svga->curr.gs) {
key->fs.gs_generic_outputs = svga->curr.gs->generic_outputs;
+ key->fs.layer_to_zero = !svga->curr.gs->base.info.writes_layer;
} else {
key->fs.vs_generic_outputs = svga->curr.vs->generic_outputs;
+ key->fs.layer_to_zero = 1;
}
/* Only need fragment shader fixup for twoside lighting if doing
@@ -276,7 +278,7 @@ make_fs_key(const struct svga_context *svga,
*
* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
*/
- svga_init_shader_key_common(svga, shader, key);
+ svga_init_shader_key_common(svga, shader, &fs->base, key);
for (i = 0; i < svga->curr.num_samplers[shader]; ++i) {
struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
@@ -317,15 +319,6 @@ make_fs_key(const struct svga_context *svga,
debug_warn_once("Unsupported shadow compare function");
}
}
- else {
- /* For other texture formats, just use the compare func/mode
- * as-is. Should be no-ops for color textures. For depth
- * textures, we do not get automatic depth compare. We have
- * to do it ourselves in the shader. And we don't get PCF.
- */
- key->tex[i].compare_mode = sampler->compare_mode;
- key->tex[i].compare_func = sampler->compare_func;
- }
}
}
}
@@ -401,22 +394,26 @@ svga_reemit_fs_bindings(struct svga_context *svga)
static enum pipe_error
-emit_hw_fs(struct svga_context *svga, unsigned dirty)
+emit_hw_fs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant = NULL;
enum pipe_error ret = PIPE_OK;
struct svga_fragment_shader *fs = svga->curr.fs;
struct svga_compile_key key;
+ struct svga_shader *prevShader = NULL; /* shader in the previous stage */
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITFS);
+ prevShader = svga->curr.gs ?
+ &svga->curr.gs->base : (svga->curr.tes ?
+ &svga->curr.tes->base : &svga->curr.vs->base);
+
/* Disable rasterization if rasterizer_discard flag is set or
* vs/gs does not output position.
*/
svga->disable_rasterizer =
svga->curr.rast->templ.rasterizer_discard ||
- (svga->curr.gs && !svga->curr.gs->base.info.writes_position) ||
- (!svga->curr.gs && !svga->curr.vs->base.info.writes_position);
+ !prevShader->info.writes_position;
/* Set FS to NULL when rasterization is to be disabled */
if (svga->disable_rasterizer) {
diff --git a/src/gallium/drivers/svga/svga_state_gs.c b/src/gallium/drivers/svga/svga_state_gs.c
index 1eb4cebc08d..670b757c45f 100644
--- a/src/gallium/drivers/svga/svga_state_gs.c
+++ b/src/gallium/drivers/svga/svga_state_gs.c
@@ -109,34 +109,45 @@ make_gs_key(struct svga_context *svga, struct svga_compile_key *key)
/*
* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
*/
- svga_init_shader_key_common(svga, PIPE_SHADER_GEOMETRY, key);
+ svga_init_shader_key_common(svga, PIPE_SHADER_GEOMETRY, &gs->base, key);
memcpy(key->generic_remap_table, gs->generic_remap_table,
sizeof(gs->generic_remap_table));
key->gs.vs_generic_outputs = svga->curr.vs->generic_outputs;
- key->gs.need_prescale = svga->state.hw_clear.prescale.enabled;
+ key->gs.need_prescale = svga->state.hw_clear.prescale[0].enabled;
key->gs.writes_psize = gs->base.info.writes_psize;
key->gs.wide_point = gs->wide_point;
+ key->gs.writes_viewport_index = gs->base.info.writes_viewport_index;
+ if (key->gs.writes_viewport_index) {
+ key->gs.num_prescale = svga->state.hw_clear.num_prescale;
+ } else {
+ key->gs.num_prescale = 1;
+ }
key->sprite_coord_enable = svga->curr.rast->templ.sprite_coord_enable;
key->sprite_origin_lower_left = (svga->curr.rast->templ.sprite_coord_mode
== PIPE_SPRITE_COORD_LOWER_LEFT);
/* SVGA_NEW_RAST */
key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
+
+ /* Mark this as the last shader in the vertex processing stage */
+ key->last_vertex_stage = 1;
}
static enum pipe_error
-emit_hw_gs(struct svga_context *svga, unsigned dirty)
+emit_hw_gs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant;
struct svga_geometry_shader *gs = svga->curr.gs;
enum pipe_error ret = PIPE_OK;
struct svga_compile_key key;
+ assert(svga_have_vgpu10(svga));
+
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITGS);
/* If there's a user-defined GS, we should have a pointer to a derived
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
index f9cea143ac9..5a52c25a4c1 100644
--- a/src/gallium/drivers/svga/svga_state_need_swtnl.c
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -33,7 +33,7 @@
static enum pipe_error
-update_need_swvfetch(struct svga_context *svga, unsigned dirty)
+update_need_swvfetch(struct svga_context *svga, uint64_t dirty)
{
if (!svga->curr.velems) {
/* No vertex elements bound. */
@@ -58,7 +58,7 @@ struct svga_tracked_state svga_update_need_swvfetch =
static enum pipe_error
-update_need_pipeline(struct svga_context *svga, unsigned dirty)
+update_need_pipeline(struct svga_context *svga, uint64_t dirty)
{
boolean need_pipeline = FALSE;
struct svga_vertex_shader *vs = svga->curr.vs;
@@ -156,7 +156,7 @@ struct svga_tracked_state svga_update_need_pipeline =
static enum pipe_error
-update_need_swtnl(struct svga_context *svga, unsigned dirty)
+update_need_swtnl(struct svga_context *svga, uint64_t dirty)
{
boolean need_swtnl;
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
index 3c42b4e8595..3549ce2938d 100644
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -97,7 +97,7 @@ translate_fill_mode(unsigned fill)
* the "to" state.
*/
static enum pipe_error
-emit_rss_vgpu9(struct svga_context *svga, unsigned dirty)
+emit_rss_vgpu9(struct svga_context *svga, uint64_t dirty)
{
struct svga_screen *screen = svga_screen(svga->pipe.screen);
struct rs_queue queue;
@@ -363,7 +363,7 @@ get_no_depth_stencil_test_state(struct svga_context *svga)
static enum pipe_error
-emit_rss_vgpu10(struct svga_context *svga, unsigned dirty)
+emit_rss_vgpu10(struct svga_context *svga, uint64_t dirty)
{
enum pipe_error ret = PIPE_OK;
@@ -487,7 +487,7 @@ emit_rss_vgpu10(struct svga_context *svga, unsigned dirty)
static enum pipe_error
-emit_rss(struct svga_context *svga, unsigned dirty)
+emit_rss(struct svga_context *svga, uint64_t dirty)
{
if (svga_have_vgpu10(svga)) {
return emit_rss_vgpu10(svga, dirty);
diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c
index 306c55dbb11..bbfd889e9f4 100644
--- a/src/gallium/drivers/svga/svga_state_sampler.c
+++ b/src/gallium/drivers/svga/svga_state_sampler.c
@@ -131,7 +131,7 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
if (sv->id == SVGA3D_INVALID_ID) {
struct svga_screen *ss = svga_screen(svga->pipe.screen);
struct pipe_resource *texture = sv->base.texture;
- struct svga_winsys_surface *surface = svga_resource_handle(texture);
+ struct svga_winsys_surface *surface;
SVGA3dSurfaceFormat format;
SVGA3dResourceType resourceDim;
SVGA3dShaderResourceViewDesc viewDesc;
@@ -154,6 +154,7 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
svga_translate_texture_buffer_view_format(viewFormat,
&format,
&pf_flags);
+ surface = svga_buffer_handle(svga, texture, PIPE_BIND_SAMPLER_VIEW);
}
else {
format = svga_translate_format(ss, viewFormat,
@@ -161,6 +162,8 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
/* Convert the format to a sampler-friendly format, if needed */
format = svga_sampler_format(format);
+
+ surface = svga_texture(texture)->handle;
}
assert(format != SVGA3D_FORMAT_INVALID);
@@ -234,15 +237,14 @@ svga_validate_pipe_sampler_view(struct svga_context *svga,
static enum pipe_error
-update_sampler_resources(struct svga_context *svga, unsigned dirty)
+update_sampler_resources(struct svga_context *svga, uint64_t dirty)
{
enum pipe_error ret = PIPE_OK;
enum pipe_shader_type shader;
- if (!svga_have_vgpu10(svga))
- return PIPE_OK;
+ assert(svga_have_vgpu10(svga));
- for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
+ for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_TESS_EVAL; shader++) {
SVGA3dShaderResourceViewId ids[PIPE_MAX_SAMPLERS];
struct svga_winsys_surface *surfaces[PIPE_MAX_SAMPLERS];
struct pipe_sampler_view *sampler_views[PIPE_MAX_SAMPLERS];
@@ -349,7 +351,8 @@ update_sampler_resources(struct svga_context *svga, unsigned dirty)
/* Handle polygon stipple sampler view */
if (svga->curr.rast->templ.poly_stipple_enable) {
- const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
+ const unsigned unit =
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_pipe_sampler_view *sv = svga->polygon_stipple.sampler_view;
struct svga_winsys_surface *surface;
@@ -385,15 +388,14 @@ struct svga_tracked_state svga_hw_sampler_bindings = {
static enum pipe_error
-update_samplers(struct svga_context *svga, unsigned dirty )
+update_samplers(struct svga_context *svga, uint64_t dirty )
{
enum pipe_error ret = PIPE_OK;
enum pipe_shader_type shader;
- if (!svga_have_vgpu10(svga))
- return PIPE_OK;
+ assert(svga_have_vgpu10(svga));
- for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_GEOMETRY; shader++) {
+ for (shader = PIPE_SHADER_VERTEX; shader <= PIPE_SHADER_TESS_EVAL; shader++) {
const unsigned count = svga->curr.num_samplers[shader];
SVGA3dSamplerId ids[PIPE_MAX_SAMPLERS];
unsigned i;
@@ -404,7 +406,8 @@ update_samplers(struct svga_context *svga, unsigned dirty )
/* _NEW_FS */
if (shader == PIPE_SHADER_FRAGMENT) {
- struct svga_shader_variant *fs = svga->state.hw_draw.fs;
+ struct svga_fs_variant *fs =
+ svga_fs_variant(svga->state.hw_draw.fs);
/* If the fragment shader is doing the shadow comparison
* for this texture unit, don't enable shadow compare in
* the texture sampler state.
@@ -449,7 +452,8 @@ update_samplers(struct svga_context *svga, unsigned dirty )
/* Handle polygon stipple sampler texture */
if (svga->curr.rast->templ.poly_stipple_enable) {
- const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
+ const unsigned unit =
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_sampler_state *sampler = svga->polygon_stipple.sampler;
assert(sampler);
diff --git a/src/gallium/drivers/svga/svga_state_tgsi_transform.c b/src/gallium/drivers/svga/svga_state_tgsi_transform.c
index 1dcc05cfaf0..e0b054acbcf 100644
--- a/src/gallium/drivers/svga/svga_state_tgsi_transform.c
+++ b/src/gallium/drivers/svga/svga_state_tgsi_transform.c
@@ -29,7 +29,10 @@
#include "util/u_simple_shaders.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_point_sprite.h"
+#include "tgsi/tgsi_dynamic_indexing.h"
+#include "tgsi/tgsi_vpos.h"
#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
#include "svga_context.h"
#include "svga_shader.h"
@@ -49,6 +52,171 @@ bind_gs_state(struct svga_context *svga,
}
+static void
+insert_at_head(struct svga_shader *head, struct svga_shader *shader)
+{
+ shader->parent = head;
+ shader->next = head->next;
+ head->next = shader;
+}
+
+
+/**
+ * Bind shader
+ */
+static void
+bind_shader(struct svga_context *svga,
+ const enum pipe_shader_type shader_type,
+ struct svga_shader *shader)
+{
+ switch (shader_type) {
+ case PIPE_SHADER_VERTEX:
+ svga->pipe.bind_vs_state(&svga->pipe, shader);
+ break;
+ case PIPE_SHADER_FRAGMENT:
+ /**
+ * Avoid pipe->bind_fs_state call because it goes through aapoint
+       * layer. We would lose the linked list of all transformed shaders
+       * if aapoint is used.
+ */
+ svga_bind_fs_state(&svga->pipe, shader);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ svga->pipe.bind_gs_state(&svga->pipe, shader);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ svga->pipe.bind_tcs_state(&svga->pipe, shader);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ svga->pipe.bind_tes_state(&svga->pipe, shader);
+ break;
+ default:
+ return;
+ }
+}
+
+
+
+/**
+ * Create shader
+ */
+static void *
+create_shader(struct svga_context *svga,
+ const enum pipe_shader_type shader_type,
+ struct pipe_shader_state *state)
+{
+ switch (shader_type) {
+ case PIPE_SHADER_VERTEX:
+ return svga->pipe.create_vs_state(&svga->pipe, state);
+ case PIPE_SHADER_FRAGMENT:
+ /**
+ * Avoid pipe->create_fs_state call because it goes through aapoint
+       * layer. We would lose the linked list of all transformed shaders
+       * if aapoint is used.
+ */
+ return svga_create_fs_state(&svga->pipe, state);
+ case PIPE_SHADER_GEOMETRY:
+ return svga->pipe.create_gs_state(&svga->pipe, state);
+ case PIPE_SHADER_TESS_CTRL:
+ return svga->pipe.create_tcs_state(&svga->pipe, state);
+ case PIPE_SHADER_TESS_EVAL:
+ return svga->pipe.create_tes_state(&svga->pipe, state);
+ default:
+ return NULL;
+ }
+}
+
+
+static void
+write_vpos(struct svga_context *svga,
+ struct svga_shader *shader)
+{
+ struct svga_token_key key;
+ boolean use_existing = FALSE;
+ struct svga_shader *transform_shader;
+ const struct tgsi_shader_info *info = &shader->info;
+
+ /* Create a token key */
+ memset(&key, 0, sizeof key);
+ key.vs.write_position = 1;
+
+ if (shader->next) {
+ transform_shader = svga_search_shader_token_key(shader->next, &key);
+ if (transform_shader) {
+ use_existing = TRUE;
+ }
+ }
+
+ if (!use_existing) {
+ struct pipe_shader_state state;
+ struct tgsi_token *new_tokens = NULL;
+
+ new_tokens = tgsi_write_vpos(shader->tokens,
+ info->immediate_count);
+ if (!new_tokens)
+ return;
+
+ pipe_shader_state_from_tgsi(&state, new_tokens);
+
+ transform_shader = create_shader(svga, info->processor, &state);
+ insert_at_head(shader, transform_shader);
+ FREE(new_tokens);
+ }
+ transform_shader->token_key = key;
+ bind_shader(svga, info->processor, transform_shader);
+}
+
+
+/**
+ * transform_dynamic_indexing searches the shader variant list to see if
+ * we already have a transformed shader for dynamic indexing and, if so,
+ * reuses and binds it. Otherwise it creates a new shader with the dynamic
+ * indexing removed, adds it to the shader variant list, and binds the new
+ * shader to the current svga state.
+ */
+static void
+transform_dynamic_indexing(struct svga_context *svga,
+ struct svga_shader *shader)
+{
+ struct svga_token_key key;
+ boolean use_existing = FALSE;
+ struct svga_shader *transform_shader;
+ const struct tgsi_shader_info *info = &shader->info;
+
+ /* Create a token key */
+ memset(&key, 0, sizeof key);
+ key.dynamic_indexing = 1;
+
+ if (shader->next) {
+ transform_shader = svga_search_shader_token_key(shader->next, &key);
+ if (transform_shader) {
+ use_existing = TRUE;
+ }
+ }
+
+ struct tgsi_token *new_tokens = NULL;
+
+ if (!use_existing) {
+ struct pipe_shader_state state;
+ new_tokens = tgsi_remove_dynamic_indexing(shader->tokens,
+ info->const_buffers_declared,
+ info->samplers_declared,
+ info->immediate_count);
+ if (!new_tokens)
+ return;
+
+ pipe_shader_state_from_tgsi(&state, new_tokens);
+
+ transform_shader = create_shader(svga, info->processor, &state);
+ insert_at_head(shader, transform_shader);
+ }
+ transform_shader->token_key = key;
+ bind_shader(svga, info->processor, transform_shader);
+ if (new_tokens)
+ FREE(new_tokens);
+}
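
Both write_vpos() and transform_dynamic_indexing() above follow the same caching discipline: search the shader's next chain for a derivative whose token key matches, otherwise transform the tokens, insert the new shader at the head of the chain, and bind it. A hedged sketch of that lookup-or-create flow, using hypothetical node and key types (illustrative only, not driver code):

/* Hypothetical sketch of the token-key caching used above: each original
 * shader keeps a chain of transformed derivatives (newest first) and a
 * derivative is reused when its key matches.
 */
struct xfrm_shader {
   struct xfrm_shader *parent;   /* the original shader */
   struct xfrm_shader *next;     /* older derivatives */
   unsigned key;                 /* stands in for struct svga_token_key */
};

static struct xfrm_shader *
find_or_create(struct xfrm_shader *orig, unsigned key,
               struct xfrm_shader *(*transform)(struct xfrm_shader *))
{
   struct xfrm_shader *s;

   for (s = orig->next; s; s = s->next) {
      if (s->key == key)
         return s;               /* reuse the cached derivative */
   }

   s = transform(orig);          /* build a new transformed shader */
   s->key = key;
   s->parent = orig;
   s->next = orig->next;         /* insert at head, as insert_at_head() does */
   orig->next = s;
   return s;
}
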
+
+
/**
 * emulate_point_sprite searches the shader variants list to see if there is
* a shader variant with a token string that matches the emulation
@@ -233,18 +401,49 @@ add_point_sprite_shader(struct svga_context *svga)
return &new_gs->base;
}
+
+static boolean
+has_dynamic_indexing(const struct tgsi_shader_info *info)
+{
+ return (info->dim_indirect_files & (1u << TGSI_FILE_CONSTANT)) ||
+ (info->indirect_files & (1u << TGSI_FILE_SAMPLER));
+}
+
+
/* update_tgsi_transform provides a hook to transform a shader if needed.
*/
static enum pipe_error
-update_tgsi_transform(struct svga_context *svga, unsigned dirty)
+update_tgsi_transform(struct svga_context *svga, uint64_t dirty)
{
struct svga_geometry_shader *gs = svga->curr.user_gs; /* current gs */
struct svga_vertex_shader *vs = svga->curr.vs; /* currently bound vs */
+ struct svga_fragment_shader *fs = svga->curr.fs; /* currently bound fs */
+ struct svga_tcs_shader *tcs = svga->curr.tcs; /* currently bound tcs */
+ struct svga_tes_shader *tes = svga->curr.tes; /* currently bound tes */
struct svga_shader *orig_gs; /* original gs */
struct svga_shader *new_gs; /* new gs */
- if (!svga_have_vgpu10(svga))
- return PIPE_OK;
+ assert(svga_have_vgpu10(svga));
+
+ if (vs->base.info.num_outputs == 0) {
+ write_vpos(svga, &vs->base);
+ }
+
+ if (vs && has_dynamic_indexing(&vs->base.info)) {
+ transform_dynamic_indexing(svga, &vs->base);
+ }
+ if (fs && has_dynamic_indexing(&fs->base.info)) {
+ transform_dynamic_indexing(svga, &fs->base);
+ }
+ if (gs && has_dynamic_indexing(&gs->base.info)) {
+ transform_dynamic_indexing(svga, &gs->base);
+ }
+ if (tcs && has_dynamic_indexing(&tcs->base.info)) {
+ transform_dynamic_indexing(svga, &tcs->base);
+ }
+ if (tes && has_dynamic_indexing(&tes->base.info)) {
+ transform_dynamic_indexing(svga, &tes->base);
+ }
if (svga->curr.reduced_prim == PIPE_PRIM_POINTS) {
/* If the current prim type is POINTS and the current geometry shader
diff --git a/src/gallium/drivers/svga/svga_state_ts.c b/src/gallium/drivers/svga/svga_state_ts.c
new file mode 100644
index 00000000000..890d153c7d6
--- /dev/null
+++ b/src/gallium/drivers/svga/svga_state_ts.c
@@ -0,0 +1,392 @@
+/**********************************************************
+ * Copyright 2018-2020 VMware, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ **********************************************************/
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_simple_shaders.h"
+
+#include "svga_context.h"
+#include "svga_cmd.h"
+#include "svga_tgsi.h"
+#include "svga_shader.h"
+
+
+/**
+ * Translate TGSI shader into an svga shader variant.
+ */
+static enum pipe_error
+compile_tcs(struct svga_context *svga,
+ struct svga_tcs_shader *tcs,
+ const struct svga_compile_key *key,
+ struct svga_shader_variant **out_variant)
+{
+ struct svga_shader_variant *variant;
+ enum pipe_error ret = PIPE_ERROR;
+
+ variant = svga_tgsi_vgpu10_translate(svga, &tcs->base, key,
+ PIPE_SHADER_TESS_CTRL);
+ if (!variant)
+ return PIPE_ERROR;
+
+ ret = svga_define_shader(svga, variant);
+ if (ret != PIPE_OK) {
+ svga_destroy_shader_variant(svga, variant);
+ return ret;
+ }
+
+ *out_variant = variant;
+
+ return PIPE_OK;
+}
+
+
+static void
+make_tcs_key(struct svga_context *svga, struct svga_compile_key *key)
+{
+ struct svga_tcs_shader *tcs = svga->curr.tcs;
+
+ memset(key, 0, sizeof *key);
+
+ /*
+ * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
+ */
+ svga_init_shader_key_common(svga, PIPE_SHADER_TESS_CTRL, &tcs->base, key);
+
+ /* SVGA_NEW_TCS_PARAM */
+ key->tcs.vertices_per_patch = svga->curr.vertices_per_patch;
+
+ /* The tessellator parameters come from the layout section in the
+ * tessellation evaluation shader. Get these parameters from the
+ * current tessellation evaluation shader variant.
+ * Note: this requires the tessellation evaluation shader to be
+ * compiled first.
+ */
+ struct svga_tes_variant *tes = svga_tes_variant(svga->state.hw_draw.tes);
+ key->tcs.prim_mode = tes->prim_mode;
+ key->tcs.spacing = tes->spacing;
+ key->tcs.vertices_order_cw = tes->vertices_order_cw;
+ key->tcs.point_mode = tes->point_mode;
+
+ if (svga->tcs.passthrough)
+ key->tcs.passthrough = 1;
+
+ key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
+
+ /* tcs is always followed by tes */
+ key->last_vertex_stage = 0;
+}
+
+
+static enum pipe_error
+emit_hw_tcs(struct svga_context *svga, uint64_t dirty)
+{
+ struct svga_shader_variant *variant;
+ struct svga_tcs_shader *tcs = svga->curr.tcs;
+ enum pipe_error ret = PIPE_OK;
+ struct svga_compile_key key;
+
+ assert(svga_have_sm5(svga));
+
+ SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITTCS);
+
+ if (!tcs) {
+ /* If there is no active tcs, then there should not be
+       * an active tes either
+ */
+ assert(!svga->curr.tes);
+ if (svga->state.hw_draw.tcs != NULL) {
+
+      /** The previous tessellation control shader is no longer active,
+       * so unbind the tessellation control shader.
+ */
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, NULL);
+ if (ret != PIPE_OK)
+ goto done;
+ svga->state.hw_draw.tcs = NULL;
+ }
+ goto done;
+ }
+
+ make_tcs_key(svga, &key);
+
+ /* See if we already have a TCS variant that matches the key */
+ variant = svga_search_shader_key(&tcs->base, &key);
+
+ if (!variant) {
+ ret = compile_tcs(svga, tcs, &key, &variant);
+ if (ret != PIPE_OK)
+ goto done;
+
+ /* insert the new variant at head of linked list */
+ assert(variant);
+ variant->next = tcs->base.variants;
+ tcs->base.variants = variant;
+ }
+
+ if (variant != svga->state.hw_draw.tcs) {
+ /* Bind the new variant */
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_HS, variant);
+ if (ret != PIPE_OK)
+ goto done;
+
+ svga->rebind.flags.tcs = FALSE;
+ svga->dirty |= SVGA_NEW_TCS_VARIANT;
+ svga->state.hw_draw.tcs = variant;
+ }
+
+done:
+ SVGA_STATS_TIME_POP(svga_sws(svga));
+ return ret;
+}
+
+
+struct svga_tracked_state svga_hw_tcs =
+{
+ "tessellation control shader (hwtnl)",
+ (SVGA_NEW_VS |
+ SVGA_NEW_TCS |
+ SVGA_NEW_TES |
+ SVGA_NEW_TEXTURE_BINDING |
+ SVGA_NEW_SAMPLER |
+ SVGA_NEW_RAST),
+ emit_hw_tcs
+};
+
+
+/**
+ * Translate TGSI shader into an svga shader variant.
+ */
+static enum pipe_error
+compile_tes(struct svga_context *svga,
+ struct svga_tes_shader *tes,
+ const struct svga_compile_key *key,
+ struct svga_shader_variant **out_variant)
+{
+ struct svga_shader_variant *variant;
+ enum pipe_error ret = PIPE_ERROR;
+
+ variant = svga_tgsi_vgpu10_translate(svga, &tes->base, key,
+ PIPE_SHADER_TESS_EVAL);
+ if (!variant)
+ return PIPE_ERROR;
+
+ ret = svga_define_shader(svga, variant);
+ if (ret != PIPE_OK) {
+ svga_destroy_shader_variant(svga, variant);
+ return ret;
+ }
+
+ *out_variant = variant;
+
+ return PIPE_OK;
+}
+
+
+static void
+make_tes_key(struct svga_context *svga, struct svga_compile_key *key)
+{
+ struct svga_tes_shader *tes = svga->curr.tes;
+
+ memset(key, 0, sizeof *key);
+
+ /*
+ * SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER
+ */
+ svga_init_shader_key_common(svga, PIPE_SHADER_TESS_EVAL, &tes->base, key);
+
+ assert(svga->curr.tcs);
+ key->tes.vertices_per_patch =
+ svga->curr.tcs->base.info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
+
+ key->tes.need_prescale = svga->state.hw_clear.prescale[0].enabled &&
+ (svga->curr.gs == NULL);
+
+ /* tcs emits tessellation factors as extra outputs.
+ * Since tes depends on them, save the tessFactor output index
+ * from tcs in the tes compile key, so that if a different
+ * tcs is bound and if the tessFactor index is different,
+ * a different tes variant will be generated.
+ */
+ key->tes.tessfactor_index = svga->curr.tcs->base.info.num_outputs;
+
+ key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
+
+ /* This is the last vertex stage if there is no geometry shader. */
+ key->last_vertex_stage = !svga->curr.gs;
+
+ key->tes.need_tessinner = 0;
+ key->tes.need_tessouter = 0;
+
+ for (int i = 0; i < svga->curr.tcs->base.info.num_outputs; i++) {
+ switch (svga->curr.tcs->base.info.output_semantic_name[i]) {
+ case TGSI_SEMANTIC_TESSOUTER:
+ key->tes.need_tessouter = 1;
+ break;
+ case TGSI_SEMANTIC_TESSINNER:
+ key->tes.need_tessinner = 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+}
+
+
+static void
+get_passthrough_tcs(struct svga_context *svga)
+{
+ if (svga->tcs.passthrough_tcs &&
+ svga->tcs.vs == svga->curr.vs &&
+ svga->tcs.tes == svga->curr.tes &&
+ svga->tcs.vertices_per_patch == svga->curr.vertices_per_patch) {
+ svga->pipe.bind_tcs_state(&svga->pipe,
+ svga->tcs.passthrough_tcs);
+ }
+ else {
+ struct svga_tcs_shader *new_tcs;
+
+      /* delete the old passthrough shader */
+ if (svga->tcs.passthrough_tcs) {
+ svga->pipe.delete_tcs_state(&svga->pipe,
+ svga->tcs.passthrough_tcs);
+ }
+
+ new_tcs = (struct svga_tcs_shader *)
+ util_make_tess_ctrl_passthrough_shader(&svga->pipe,
+ svga->curr.vs->base.info.num_outputs,
+ svga->curr.tes->base.info.num_inputs,
+ svga->curr.vs->base.info.output_semantic_name,
+ svga->curr.vs->base.info.output_semantic_index,
+ svga->curr.tes->base.info.input_semantic_name,
+ svga->curr.tes->base.info.input_semantic_index,
+ svga->curr.vertices_per_patch);
+ svga->pipe.bind_tcs_state(&svga->pipe, new_tcs);
+ svga->tcs.passthrough_tcs = new_tcs;
+ svga->tcs.vs = svga->curr.vs;
+ svga->tcs.tes = svga->curr.tes;
+ svga->tcs.vertices_per_patch = svga->curr.vertices_per_patch;
+ }
+
+ struct pipe_constant_buffer cb;
+
+ cb.buffer = NULL;
+ cb.user_buffer = (void *) svga->curr.default_tesslevels;
+ cb.buffer_offset = 0;
+ cb.buffer_size = 2 * 4 * sizeof(float);
+ svga->pipe.set_constant_buffer(&svga->pipe, PIPE_SHADER_TESS_CTRL, 0, &cb);
+}
+
+
+static enum pipe_error
+emit_hw_tes(struct svga_context *svga, uint64_t dirty)
+{
+ struct svga_shader_variant *variant;
+ struct svga_tes_shader *tes = svga->curr.tes;
+ enum pipe_error ret = PIPE_OK;
+ struct svga_compile_key key;
+
+ assert(svga_have_sm5(svga));
+
+ SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_EMITTES);
+
+ if (!tes) {
+ /* The GL spec implies that TES is optional when there's a TCS,
+ * but that's apparently a spec error. Assert if we have a TCS
+ * but no TES.
+ */
+ assert(!svga->curr.tcs);
+ if (svga->state.hw_draw.tes != NULL) {
+
+      /** The previous tessellation evaluation shader is no longer active,
+       * so unbind the tessellation evaluation shader.
+ */
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, NULL);
+ if (ret != PIPE_OK)
+ goto done;
+ svga->state.hw_draw.tes = NULL;
+ }
+ goto done;
+ }
+
+ if (!svga->curr.tcs) {
+ /* TES state is processed before the TCS
+ * shader and that's why we're checking for and creating the
+       * passthrough TCS in the emit_hw_tes() function.
+ */
+ get_passthrough_tcs(svga);
+ svga->tcs.passthrough = TRUE;
+ }
+ else {
+ svga->tcs.passthrough = FALSE;
+ }
+
+ make_tes_key(svga, &key);
+
+ /* See if we already have a TES variant that matches the key */
+ variant = svga_search_shader_key(&tes->base, &key);
+
+ if (!variant) {
+ ret = compile_tes(svga, tes, &key, &variant);
+ if (ret != PIPE_OK)
+ goto done;
+
+ /* insert the new variant at head of linked list */
+ assert(variant);
+ variant->next = tes->base.variants;
+ tes->base.variants = variant;
+ }
+
+ if (variant != svga->state.hw_draw.tes) {
+ /* Bind the new variant */
+ ret = svga_set_shader(svga, SVGA3D_SHADERTYPE_DS, variant);
+ if (ret != PIPE_OK)
+ goto done;
+
+ svga->rebind.flags.tes = FALSE;
+ svga->dirty |= SVGA_NEW_TES_VARIANT;
+ svga->state.hw_draw.tes = variant;
+ }
+
+done:
+ SVGA_STATS_TIME_POP(svga_sws(svga));
+ return ret;
+}
+
+
+struct svga_tracked_state svga_hw_tes =
+{
+ "tessellation evaluation shader (hwtnl)",
+   /* TBD: whether SVGA_NEW_VS/SVGA_NEW_FS/SVGA_NEW_GS are required */
+ (SVGA_NEW_VS |
+ SVGA_NEW_FS |
+ SVGA_NEW_GS |
+ SVGA_NEW_TCS |
+ SVGA_NEW_TES |
+ SVGA_NEW_TEXTURE_BINDING |
+ SVGA_NEW_SAMPLER |
+ SVGA_NEW_RAST),
+ emit_hw_tes
+};
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 95b1a9e952d..75b0ac60f58 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -139,7 +139,7 @@ emit_tex_binding_unit(struct svga_context *svga,
static enum pipe_error
-update_tss_binding(struct svga_context *svga, unsigned dirty)
+update_tss_binding(struct svga_context *svga, uint64_t dirty )
{
const enum pipe_shader_type shader = PIPE_SHADER_FRAGMENT;
boolean reemit = svga->rebind.flags.texture_samplers;
@@ -149,8 +149,7 @@ update_tss_binding(struct svga_context *svga, unsigned dirty)
struct bind_queue queue;
- if (svga_have_vgpu10(svga))
- return PIPE_OK;
+ assert(!svga_have_vgpu10(svga));
queue.bind_count = 0;
@@ -167,7 +166,8 @@ update_tss_binding(struct svga_context *svga, unsigned dirty)
/* Polygon stipple */
if (svga->curr.rast->templ.poly_stipple_enable) {
- const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
+ const unsigned unit =
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
emit_tex_binding_unit(svga, unit,
svga->polygon_stipple.sampler,
&svga->polygon_stipple.sampler_view->base,
@@ -257,7 +257,8 @@ svga_reemit_tss_bindings(struct svga_context *svga)
/* Polygon stipple */
if (svga->curr.rast && svga->curr.rast->templ.poly_stipple_enable) {
- const unsigned unit = svga->state.hw_draw.fs->pstipple_sampler_unit;
+ const unsigned unit =
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit;
struct svga_hw_view_state *view = &svga->state.hw_draw.views[unit];
if (view->v) {
@@ -380,14 +381,13 @@ emit_tss_unit(struct svga_context *svga, unsigned unit,
}
static enum pipe_error
-update_tss(struct svga_context *svga, unsigned dirty)
+update_tss(struct svga_context *svga, uint64_t dirty )
{
const enum pipe_shader_type shader = PIPE_SHADER_FRAGMENT;
unsigned i;
struct ts_queue queue;
- if (svga_have_vgpu10(svga))
- return PIPE_OK;
+ assert(!svga_have_vgpu10(svga));
queue.ts_count = 0;
for (i = 0; i < svga->curr.num_samplers[shader]; i++) {
@@ -400,7 +400,7 @@ update_tss(struct svga_context *svga, unsigned dirty)
/* polygon stipple sampler */
if (svga->curr.rast->templ.poly_stipple_enable) {
emit_tss_unit(svga,
- svga->state.hw_draw.fs->pstipple_sampler_unit,
+ svga_fs_variant(svga->state.hw_draw.fs)->pstipple_sampler_unit,
svga->polygon_stipple.sampler,
&queue);
}
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
index fd6a238ef16..a49bcd0a263 100644
--- a/src/gallium/drivers/svga/svga_state_vdecl.c
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -40,7 +40,7 @@
static enum pipe_error
-emit_hw_vs_vdecl(struct svga_context *svga, unsigned dirty)
+emit_hw_vs_vdecl(struct svga_context *svga, uint64_t dirty)
{
const struct pipe_vertex_element *ve = svga->curr.velems->velem;
SVGA3dVertexDecl decls[SVGA3D_INPUTREG_MAX];
@@ -136,7 +136,7 @@ emit_hw_vs_vdecl(struct svga_context *svga, unsigned dirty)
static enum pipe_error
-emit_hw_vdecl(struct svga_context *svga, unsigned dirty)
+emit_hw_vdecl(struct svga_context *svga, uint64_t dirty)
{
/* SVGA_NEW_NEED_SWTNL
*/
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index d63b52454ca..147b07aaeb1 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -164,7 +164,7 @@ compile_vs(struct svga_context *svga,
static void
make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
{
- const enum pipe_shader_type shader = PIPE_SHADER_VERTEX;
+ struct svga_vertex_shader *vs = svga->curr.vs;
memset(key, 0, sizeof *key);
@@ -176,7 +176,8 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
}
/* SVGA_NEW_PRESCALE */
- key->vs.need_prescale = svga->state.hw_clear.prescale.enabled &&
+ key->vs.need_prescale = svga->state.hw_clear.prescale[0].enabled &&
+ (svga->curr.tes == NULL) &&
(svga->curr.gs == NULL);
/* SVGA_NEW_RAST */
@@ -199,10 +200,16 @@ make_vs_key(struct svga_context *svga, struct svga_compile_key *key)
key->vs.attrib_puint_to_sscaled = svga->curr.velems->attrib_puint_to_sscaled;
/* SVGA_NEW_TEXTURE_BINDING | SVGA_NEW_SAMPLER */
- svga_init_shader_key_common(svga, shader, key);
+ svga_init_shader_key_common(svga, PIPE_SHADER_VERTEX, &vs->base, key);
/* SVGA_NEW_RAST */
key->clip_plane_enable = svga->curr.rast->templ.clip_plane_enable;
+
+ /* Determine if this shader is the last shader in the vertex
+ * processing stage.
+ */
+ key->last_vertex_stage = !(svga->curr.gs ||
+ svga->curr.tcs || svga->curr.tes);
}
@@ -338,7 +345,7 @@ compile_passthrough_vs(struct svga_context *svga,
static enum pipe_error
-emit_hw_vs(struct svga_context *svga, unsigned dirty)
+emit_hw_vs(struct svga_context *svga, uint64_t dirty)
{
struct svga_shader_variant *variant;
struct svga_vertex_shader *vs = svga->curr.vs;
diff --git a/src/gallium/drivers/svga/svga_streamout.h b/src/gallium/drivers/svga/svga_streamout.h
index 1daa1ad5352..5e6db247b53 100644
--- a/src/gallium/drivers/svga/svga_streamout.h
+++ b/src/gallium/drivers/svga/svga_streamout.h
@@ -32,6 +32,9 @@ struct svga_stream_output {
struct pipe_stream_output_info info;
unsigned pos_out_index; // position output index
unsigned id;
+ unsigned streammask; // bitmask to specify which streams are enabled
+ unsigned buffer_stream;
+ struct svga_winsys_buffer *declBuf;
};
struct svga_stream_output *
@@ -50,4 +53,20 @@ svga_delete_stream_output(struct svga_context *svga,
enum pipe_error
svga_rebind_stream_output_targets(struct svga_context *svga);
+void
+svga_create_stream_output_queries(struct svga_context *svga);
+
+void
+svga_destroy_stream_output_queries(struct svga_context *svga);
+
+void
+svga_begin_stream_output_queries(struct svga_context *svga, unsigned mask);
+
+void
+svga_end_stream_output_queries(struct svga_context *svga, unsigned mask);
+
+unsigned
+svga_get_primcount_from_stream_output(struct svga_context *svga,
+ unsigned stream);
+
#endif /* SVGA_STREAMOUT_H */
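
A hypothetical usage sketch of the query helpers declared above; the call sites and the interpretation of the mask argument as a per-stream bitmask are assumptions here, not something this header spells out:

/* Assumed call pattern for the stream-output query helpers (illustrative
 * only): create the queries once per context, bracket draws that feed
 * stream output, and read back the primitive count when a later draw
 * needs to source its vertex count from transform feedback.
 */
static void
stream_output_query_example(struct svga_context *svga, unsigned streammask)
{
   svga_create_stream_output_queries(svga);        /* once per context */

   svga_begin_stream_output_queries(svga, streammask);
   /* ... emit draws that write the bound stream-output buffers ... */
   svga_end_stream_output_queries(svga, streammask);

   /* How many primitives did stream 0 actually write? */
   unsigned primcount = svga_get_primcount_from_stream_output(svga, 0);
   (void) primcount;

   svga_destroy_stream_output_queries(svga);       /* at context teardown */
}
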
diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c
index 2c48a66186f..d3dd23d2d81 100644
--- a/src/gallium/drivers/svga/svga_surface.c
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -578,6 +578,16 @@ svga_validate_surface_view(struct svga_context *svga, struct svga_surface *s)
}
}
+ /**
+ * Create an alternate surface view for the specified context if the
+ * view was created for another context.
+ */
+ if (s && s->base.context != &svga->pipe) {
+ struct pipe_surface *surf;
+ surf = svga_create_surface_view(&svga->pipe, s->base.texture, &s->base, FALSE);
+ s = svga_surface(surf);
+ }
+
if (s && s->view_id == SVGA3D_INVALID_ID) {
SVGA3dResourceType resType;
SVGA3dRenderTargetViewDesc desc;
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
index 587632d0eb6..1413e3a4b52 100644
--- a/src/gallium/drivers/svga/svga_surface.h
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -146,6 +146,8 @@ static inline SVGA3dResourceType
svga_resource_type(enum pipe_texture_target target)
{
switch (target) {
+ case PIPE_BUFFER:
+ return SVGA3D_RESOURCE_BUFFER;
case PIPE_TEXTURE_1D:
case PIPE_TEXTURE_1D_ARRAY:
return SVGA3D_RESOURCE_TEXTURE1D;
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
index b6fd07fe346..3e8c90d8e1e 100644
--- a/src/gallium/drivers/svga/svga_swtnl_backend.c
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -90,11 +90,12 @@ svga_vbuf_render_allocate_vertices(struct vbuf_render *render,
if (!svga_render->vbuf) {
svga_render->vbuf_size = MAX2(size, svga_render->vbuf_alloc_size);
- svga_render->vbuf = pipe_buffer_create(screen,
- PIPE_BIND_VERTEX_BUFFER,
- PIPE_USAGE_STREAM,
- svga_render->vbuf_size);
+ svga_render->vbuf = SVGA_TRY_PTR(pipe_buffer_create
+ (screen, PIPE_BIND_VERTEX_BUFFER,
+ PIPE_USAGE_STREAM,
+ svga_render->vbuf_size));
if (!svga_render->vbuf) {
+ svga_retry_enter(svga);
svga_context_flush(svga, NULL);
assert(!svga_render->vbuf);
svga_render->vbuf = pipe_buffer_create(screen,
@@ -104,6 +105,7 @@ svga_vbuf_render_allocate_vertices(struct vbuf_render *render,
/* The buffer allocation may fail if we run out of memory.
* The draw module's vbuf code should handle that without crashing.
*/
+ svga_retry_exit(svga);
}
svga->swtnl.new_vdecl = TRUE;
@@ -267,7 +269,7 @@ svga_vbuf_submit_state(struct svga_vbuf_render *svga_render)
else {
svga_hwtnl_set_flatshade(svga->hwtnl,
svga->curr.rast->templ.flatshade ||
- svga->state.hw_draw.fs->uses_flat_interp,
+ svga_is_using_flat_shading(svga),
svga->curr.rast->templ.flatshade_first);
svga_hwtnl_set_fillmode(svga->hwtnl, svga->curr.rast->hw_fillmode);
@@ -286,10 +288,10 @@ svga_vbuf_render_draw_arrays(struct vbuf_render *render,
struct svga_context *svga = svga_render->svga;
unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
/ svga_render->vertex_size;
- enum pipe_error ret = PIPE_OK;
/* instancing will already have been resolved at this point by 'draw' */
const unsigned start_instance = 0;
const unsigned instance_count = 1;
+ boolean retried;
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_VBUFDRAWARRAYS);
@@ -301,17 +303,13 @@ svga_vbuf_render_draw_arrays(struct vbuf_render *render,
* redbook/polys.c
*/
svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
-
- ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim, start + bias,
- nr, start_instance, instance_count);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = svga_hwtnl_draw_arrays(svga->hwtnl, svga_render->prim,
- start + bias, nr,
- start_instance, instance_count);
+ SVGA_RETRY_CHECK(svga, svga_hwtnl_draw_arrays
+ (svga->hwtnl, svga_render->prim, start + bias,
+ nr, start_instance, instance_count, 0), retried);
+ if (retried) {
svga->swtnl.new_vbuf = TRUE;
- assert(ret == PIPE_OK);
}
+
SVGA_STATS_TIME_POP(svga_sws(svga));
}
@@ -325,7 +323,7 @@ svga_vbuf_render_draw_elements(struct vbuf_render *render,
struct svga_context *svga = svga_render->svga;
int bias = (svga_render->vbuf_offset - svga_render->vdecl_offset)
/ svga_render->vertex_size;
- boolean ret;
+ boolean retried;
/* instancing will already have been resolved at this point by 'draw' */
const struct pipe_draw_info info = {
.index_size = 2,
@@ -354,13 +352,12 @@ svga_vbuf_render_draw_elements(struct vbuf_render *render,
* redbook/polys.c
*/
svga_update_state_retry(svga, SVGA_STATE_HW_DRAW);
- ret = svga_hwtnl_draw_range_elements(svga->hwtnl, &info, nr_indices);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = svga_hwtnl_draw_range_elements(svga->hwtnl, &info, nr_indices);
+ SVGA_RETRY_CHECK(svga, svga_hwtnl_draw_range_elements(svga->hwtnl, &info,
+ nr_indices), retried);
+ if (retried) {
svga->swtnl.new_vbuf = TRUE;
- assert(ret == PIPE_OK);
}
+
SVGA_STATS_TIME_POP(svga_sws(svga));
}
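
The SVGA_RETRY and SVGA_RETRY_CHECK macros used in the hunks above replace the open-coded flush-and-retry blocks being deleted; their real definitions are not part of this excerpt. A plausible sketch of what such a wrapper does, stated as an assumption rather than the driver's actual macro:

/* Assumed shape of a flush-and-retry wrapper (not the actual SVGA_RETRY
 * definition): issue the command, and if it fails for lack of command
 * buffer space, flush the context and try exactly once more, reporting
 * whether a retry happened.
 */
#define EXAMPLE_RETRY_CHECK(svga, cmd, retried)              \
   do {                                                      \
      enum pipe_error ret_ = (cmd);                          \
      (retried) = FALSE;                                     \
      if (ret_ != PIPE_OK) {                                 \
         svga_context_flush((svga), NULL);                   \
         ret_ = (cmd);                                       \
         (retried) = TRUE;                                   \
         assert(ret_ == PIPE_OK);                            \
      }                                                      \
   } while (0)

#define EXAMPLE_RETRY(svga, cmd)                             \
   do {                                                      \
      boolean retried_;                                      \
      EXAMPLE_RETRY_CHECK(svga, cmd, retried_);              \
      (void) retried_;                                       \
   } while (0)
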
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 816fef1c4ea..789ed23e88b 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -51,7 +51,7 @@
static void
set_draw_viewport(struct svga_context *svga)
{
- struct pipe_viewport_state vp = svga->curr.viewport;
+ struct pipe_viewport_state vp = svga->curr.viewport[0];
float adjx = 0.0f;
float adjy = 0.0f;
@@ -98,7 +98,7 @@ set_draw_viewport(struct svga_context *svga)
}
static enum pipe_error
-update_swtnl_draw(struct svga_context *svga, unsigned dirty)
+update_swtnl_draw(struct svga_context *svga, uint64_t dirty)
{
SVGA_STATS_TIME_PUSH(svga_sws(svga), SVGA_STATS_TIME_SWTNLUPDATEDRAW);
@@ -191,7 +191,6 @@ svga_vdecl_to_input_element(struct svga_context *svga,
{
SVGA3dElementLayoutId id;
SVGA3dInputElementDesc elements[PIPE_MAX_ATTRIBS];
- enum pipe_error ret;
unsigned i;
assert(num_decls <= PIPE_MAX_ATTRIBS);
@@ -208,13 +207,8 @@ svga_vdecl_to_input_element(struct svga_context *svga,
id = util_bitmask_add(svga->input_element_object_id_bm);
- ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id, elements);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls,
- id, elements);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DefineElementLayout(svga->swc, num_decls, id,
+ elements));
return id;
}
@@ -306,22 +300,14 @@ svga_swtnl_update_vdecl(struct svga_context *svga)
any_change = memcmp(svga_render->vdecl, vdecl, sizeof(vdecl));
if (svga_have_vgpu10(svga)) {
- enum pipe_error ret;
-
if (!any_change && svga_render->layout_id != SVGA3D_INVALID_ID) {
goto done;
}
if (svga_render->layout_id != SVGA3D_INVALID_ID) {
/* destroy old */
- ret = SVGA3D_vgpu10_DestroyElementLayout(svga->swc,
- svga_render->layout_id);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_DestroyElementLayout(svga->swc,
- svga_render->layout_id);
- assert(ret == PIPE_OK);
- }
+ SVGA_RETRY(svga, SVGA3D_vgpu10_DestroyElementLayout
+ (svga->swc, svga_render->layout_id));
/**
* reset current layout id state after the element layout is
@@ -340,14 +326,8 @@ svga_swtnl_update_vdecl(struct svga_context *svga)
/* bind new */
if (svga->state.hw_draw.layout_id != svga_render->layout_id) {
- ret = SVGA3D_vgpu10_SetInputLayout(svga->swc, svga_render->layout_id);
- if (ret != PIPE_OK) {
- svga_context_flush(svga, NULL);
- ret = SVGA3D_vgpu10_SetInputLayout(svga->swc,
- svga_render->layout_id);
- assert(ret == PIPE_OK);
- }
-
+ SVGA_RETRY(svga, SVGA3D_vgpu10_SetInputLayout(svga->swc,
+ svga_render->layout_id));
svga->state.hw_draw.layout_id = svga_render->layout_id;
}
}
@@ -366,7 +346,7 @@ done:
static enum pipe_error
-update_swtnl_vdecl(struct svga_context *svga, unsigned dirty)
+update_swtnl_vdecl(struct svga_context *svga, uint64_t dirty)
{
return svga_swtnl_update_vdecl(svga);
}
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index 5c3afee3845..0f7597f6157 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -238,14 +238,18 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
memcpy(&variant->key, key, sizeof(*key));
variant->id = UTIL_BITMASK_INVALID_INDEX;
- variant->pstipple_sampler_unit = emit.pstipple_sampler_unit;
-
- /* If there was exactly one write to a fragment shader output register
- * and it came from a constant buffer, we know all fragments will have
- * the same color (except for blending).
- */
- variant->constant_color_output =
- emit.constant_color_output && emit.num_output_writes == 1;
+ if (unit == PIPE_SHADER_FRAGMENT) {
+ struct svga_fs_variant *fs_variant = svga_fs_variant(variant);
+
+ fs_variant->pstipple_sampler_unit = emit.pstipple_sampler_unit;
+
+ /* If there was exactly one write to a fragment shader output register
+ * and it came from a constant buffer, we know all fragments will have
+ * the same color (except for blending).
+ */
+ fs_variant->constant_color_output =
+ emit.constant_color_output && emit.num_output_writes == 1;
+ }
#if 0
if (!svga_shader_verify(variant->tokens, variant->nr_tokens) ||
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index e98601127f4..9c467cc7814 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -30,7 +30,7 @@
#include "svga3d_reg.h"
-#define MAX_VGPU10_ADDR_REGS 2
+#define MAX_VGPU10_ADDR_REGS 4
struct svga_compile_key;
struct svga_context;
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 099ede6017d..6e607cd0616 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -40,6 +40,7 @@
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_two_side.h"
#include "tgsi/tgsi_aa_point.h"
#include "tgsi/tgsi_util.h"
@@ -87,6 +88,100 @@ enum clipping_mode
};
+/* Shader signature info */
+struct svga_shader_signature
+{
+ SVGA3dDXShaderSignatureHeader header;
+ SVGA3dDXShaderSignatureEntry inputs[PIPE_MAX_SHADER_INPUTS];
+ SVGA3dDXShaderSignatureEntry outputs[PIPE_MAX_SHADER_OUTPUTS];
+ SVGA3dDXShaderSignatureEntry patchConstants[PIPE_MAX_SHADER_OUTPUTS];
+};
+
+static inline void
+set_shader_signature_entry(SVGA3dDXShaderSignatureEntry *e,
+ unsigned index,
+ SVGA3dDXSignatureSemanticName sgnName,
+ unsigned mask,
+ SVGA3dDXSignatureRegisterComponentType compType,
+ SVGA3dDXSignatureMinPrecision minPrecision)
+{
+ e->registerIndex = index;
+ e->semanticName = sgnName;
+ e->mask = mask;
+ e->componentType = compType;
+ e->minPrecision = minPrecision;
+};
+
+static const SVGA3dDXSignatureSemanticName
+tgsi_semantic_to_sgn_name[TGSI_SEMANTIC_COUNT] = {
+ SVGADX_SIGNATURE_SEMANTIC_NAME_POSITION,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_IS_FRONT_FACE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_PRIMITIVE_ID,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_INSTANCE_ID,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_VERTEX_ID,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_CLIP_DISTANCE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_VIEWPORT_ARRAY_INDEX,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_RENDER_TARGET_ARRAY_INDEX,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_SAMPLE_INDEX,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_INSTANCE_ID,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_VERTEX_ID,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED
+};
+
+
+/**
+ * Map tgsi semantic name to SVGA signature semantic name
+ */
+static inline SVGA3dDXSignatureSemanticName
+map_tgsi_semantic_to_sgn_name(enum tgsi_semantic name)
+{
+ assert(name < TGSI_SEMANTIC_COUNT);
+
+ /* Do a few asserts here to spot check the mapping */
+ assert(tgsi_semantic_to_sgn_name[TGSI_SEMANTIC_PRIMID] ==
+ SVGADX_SIGNATURE_SEMANTIC_NAME_PRIMITIVE_ID);
+ assert(tgsi_semantic_to_sgn_name[TGSI_SEMANTIC_VIEWPORT_INDEX] ==
+ SVGADX_SIGNATURE_SEMANTIC_NAME_VIEWPORT_ARRAY_INDEX);
+ assert(tgsi_semantic_to_sgn_name[TGSI_SEMANTIC_INVOCATIONID] ==
+ SVGADX_SIGNATURE_SEMANTIC_NAME_INSTANCE_ID);
+
+ return tgsi_semantic_to_sgn_name[name];
+}
+
+
struct svga_shader_emitter_v10
{
/* The token output buffer */
@@ -100,12 +195,16 @@ struct svga_shader_emitter_v10
unsigned unit;
unsigned version; /**< Either 40 or 41 at this time */
+ unsigned cur_tgsi_token; /**< current tgsi token position */
unsigned inst_start_token;
boolean discard_instruction; /**< throw away current instruction? */
+ boolean reemit_instruction; /**< reemit current instruction */
+ boolean skip_instruction; /**< skip current instruction */
union tgsi_immediate_data immediates[MAX_IMMEDIATE_COUNT][4];
+ double (*immediates_dbl)[2];
unsigned num_immediates; /**< Number of immediates emitted */
- unsigned common_immediate_pos[8]; /**< literals for common immediates */
+ unsigned common_immediate_pos[10]; /**< literals for common immediates */
unsigned num_common_immediates;
boolean immediates_emitted;
@@ -126,8 +225,11 @@ struct svga_shader_emitter_v10
/** Map TGSI temp registers to VGPU10 temp array IDs and indexes */
struct {
unsigned arrayId, index;
+ boolean initialized;
} temp_map[VGPU10_MAX_TEMPS]; /**< arrayId, element */
+ unsigned initialize_temp_index;
+
/** Number of constants used by original shader for each constant buffer.
* The size should probably always match with that of svga_state.constbufs.
*/
@@ -139,6 +241,16 @@ struct svga_shader_emitter_v10
ubyte sampler_target[PIPE_MAX_SAMPLERS]; /**< TGSI_TEXTURE_x */
ubyte sampler_return_type[PIPE_MAX_SAMPLERS]; /**< TGSI_RETURN_TYPE_x */
+ /* Index Range declaration */
+ struct {
+ unsigned start_index;
+ unsigned count;
+ boolean required;
+ unsigned operandType;
+ unsigned size;
+ unsigned dim;
+ } index_range;
+
/* Address regs (really implemented with temps) */
unsigned num_address_regs;
unsigned address_reg_index[MAX_VGPU10_ADDR_REGS];
@@ -154,8 +266,12 @@ struct svga_shader_emitter_v10
unsigned out_index; /**< the real position output reg */
unsigned tmp_index; /**< the fake/temp position output reg */
unsigned so_index; /**< the non-adjusted position output reg */
+ unsigned prescale_cbuf_index; /* index to the const buf for prescale */
unsigned prescale_scale_index, prescale_trans_index;
- boolean need_prescale;
+      unsigned num_prescale; /* number of prescale factors in the const buf */
+ unsigned viewport_index;
+ unsigned need_prescale:1;
+ unsigned have_prescale:1;
} vposition;
/* For vertex shaders only */
@@ -183,13 +299,20 @@ struct svga_shader_emitter_v10
unsigned fragcoord_input_index; /**< real fragment position input reg */
unsigned fragcoord_tmp_index; /**< 1/w modified position temp reg */
- /** Which texture units are doing shadow comparison in the FS code */
- unsigned shadow_compare_units;
-
unsigned sample_id_sys_index; /**< TGSI index of sample id sys value */
unsigned sample_pos_sys_index; /**< TGSI index of sample pos sys value */
unsigned sample_pos_tmp_index; /**< which temp reg has the sample pos */
+
+ /** TGSI index of sample mask input sys value */
+ unsigned sample_mask_in_sys_index;
+
+ /** Which texture units are doing shadow comparison in the FS code */
+ unsigned shadow_compare_units;
+
+ /* layer */
+ unsigned layer_input_index; /**< TGSI index of layer */
+ unsigned layer_imm_index; /**< immediate for default layer 0 */
} fs;
/* For geometry shaders only */
@@ -199,8 +322,63 @@ struct svga_shader_emitter_v10
unsigned input_size; /**< size of input arrays */
unsigned prim_id_index; /**< primitive id register index */
unsigned max_out_vertices; /**< maximum number of output vertices */
+ unsigned invocations;
+ unsigned invocation_id_sys_index;
+
+ unsigned viewport_index_out_index;
+ unsigned viewport_index_tmp_index;
} gs;
+ /* For tessellation control shaders only */
+ struct {
+ unsigned vertices_per_patch_index; /**< vertices_per_patch system value index */
+ unsigned imm_index; /**< immediate for tcs */
+ unsigned vertices_out;
+ unsigned invocation_id_sys_index; /**< invocation id */
+ unsigned invocation_id_tmp_index;
+ unsigned instruction_token_pos; /* token pos for the first instruction */
+ unsigned control_point_input_index; /* control point input register index */
+ unsigned control_point_addr_index; /* control point input address register */
+ unsigned control_point_out_index; /* control point output register index */
+ unsigned control_point_tmp_index; /* control point temporary register */
+ unsigned control_point_out_count; /* control point output count */
+ boolean control_point_phase; /* true if in control point phase */
+ unsigned patch_generic_out_count; /* per-patch generic output count */
+ unsigned patch_generic_out_index; /* per-patch generic output register index*/
+ unsigned patch_generic_tmp_index; /* per-patch generic temporary register index*/
+ unsigned prim_id_index; /* primitive id */
+ struct {
+ unsigned out_index; /* real tessinner output register */
+ unsigned temp_index; /* tessinner temp register */
+ unsigned tgsi_index; /* tgsi tessinner output register */
+ } inner;
+ struct {
+ unsigned out_index; /* real tessouter output register */
+ unsigned temp_index; /* tessouter temp register */
+ unsigned tgsi_index; /* tgsi tessouter output register */
+ } outer;
+ } tcs;
+
+ /* For tessellation evaluation shaders only */
+ struct {
+ enum pipe_prim_type prim_mode;
+ enum pipe_tess_spacing spacing;
+ boolean vertices_order_cw;
+ boolean point_mode;
+ unsigned tesscoord_sys_index;
+ unsigned prim_id_index; /* primitive id */
+ struct {
+ unsigned in_index; /* real tessinner input register */
+ unsigned temp_index; /* tessinner temp register */
+ unsigned tgsi_index; /* tgsi tessinner input register */
+ } inner;
+ struct {
+ unsigned in_index; /* real tessouter input register */
+ unsigned temp_index; /* tessouter temp register */
+ unsigned tgsi_index; /* tgsi tessouter input register */
+ } outer;
+ } tes;
+
/* For vertex or geometry shaders */
enum clipping_mode clip_mode;
unsigned clip_dist_out_index; /**< clip distance output register index */
@@ -219,19 +397,41 @@ struct svga_shader_emitter_v10
boolean uses_flat_interp;
+ unsigned reserved_token; /* index to the reserved token */
+ boolean uses_precise_qualifier;
+
/* For all shaders: const reg index for RECT coord scaling */
unsigned texcoord_scale_index[PIPE_MAX_SAMPLERS];
/* For all shaders: const reg index for texture buffer size */
unsigned texture_buffer_size_index[PIPE_MAX_SAMPLERS];
- /* VS/GS/FS Linkage info */
+ /* VS/TCS/TES/GS/FS Linkage info */
struct shader_linkage linkage;
+ /* Shader signature */
+ struct svga_shader_signature signature;
+
bool register_overflow; /**< Set if we exceed a VGPU10 register limit */
+
+ /* For pipe_debug_message */
+ struct pipe_debug_callback svga_debug_callback;
+
+ /* current loop depth in shader */
+ unsigned current_loop_depth;
};
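The temp_map table above maps each flat TGSI temporary onto a VGPU10 indexable temp array slot (an arrayId plus an element index). A minimal standalone sketch of that mapping idea; the struct and function names here are illustrative, not the driver's:

/* Illustrative only: temps belonging to a declared array get a non-zero
 * array id and an element offset; plain temps keep array id 0.
 */
struct temp_slot {
   unsigned array_id;   /* which x#[] indexable array, 0 = plain temp */
   unsigned element;    /* element within that array */
};

static void
assign_temp_array(struct temp_slot *map, unsigned first, unsigned count,
                  unsigned array_id)
{
   for (unsigned i = 0; i < count; i++) {
      map[first + i].array_id = array_id;
      map[first + i].element = i;
   }
}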
+static void emit_tcs_input_declarations(struct svga_shader_emitter_v10 *emit);
+static void emit_tcs_output_declarations(struct svga_shader_emitter_v10 *emit);
+static boolean emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit);
+static boolean emit_constant_declaration(struct svga_shader_emitter_v10 *emit);
+static boolean emit_sampler_declarations(struct svga_shader_emitter_v10 *emit);
+static boolean emit_resource_declarations(struct svga_shader_emitter_v10 *emit);
+static boolean emit_vgpu10_immediates_block(struct svga_shader_emitter_v10 *emit);
+static boolean emit_index_range_declaration(struct svga_shader_emitter_v10 *emit);
+static void emit_temp_prescale_instructions(struct svga_shader_emitter_v10 *emit);
+
static boolean
emit_post_helpers(struct svga_shader_emitter_v10 *emit);
@@ -239,6 +439,26 @@ static boolean
emit_vertex(struct svga_shader_emitter_v10 *emit,
const struct tgsi_full_instruction *inst);
+static boolean
+emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
+ unsigned inst_number,
+ const struct tgsi_full_instruction *inst);
+
+static void
+emit_input_declaration(struct svga_shader_emitter_v10 *emit,
+ unsigned opcodeType, unsigned operandType,
+ unsigned dim, unsigned index, unsigned size,
+ unsigned name, unsigned numComp,
+ unsigned selMode, unsigned usageMask,
+ unsigned interpMode,
+ boolean addSignature,
+ SVGA3dDXSignatureSemanticName sgnName);
+
+static void
+create_temp_array(struct svga_shader_emitter_v10 *emit,
+ unsigned arrayID, unsigned first, unsigned count,
+ unsigned startIndex);
+
static char err_buf[128];
static boolean
@@ -381,7 +601,11 @@ check_register_index(struct svga_shader_emitter_v10 *emit,
(emit->unit == PIPE_SHADER_GEOMETRY &&
index >= VGPU10_MAX_GS_INPUTS) ||
(emit->unit == PIPE_SHADER_FRAGMENT &&
- index >= VGPU10_MAX_FS_INPUTS)) {
+ index >= VGPU10_MAX_FS_INPUTS) ||
+ (emit->unit == PIPE_SHADER_TESS_CTRL &&
+ index >= VGPU11_MAX_HS_INPUTS) ||
+ (emit->unit == PIPE_SHADER_TESS_EVAL &&
+ index >= VGPU11_MAX_DS_INPUT_CONTROL_POINTS)) {
emit->register_overflow = TRUE;
}
break;
@@ -389,12 +613,22 @@ check_register_index(struct svga_shader_emitter_v10 *emit,
case VGPU10_OPCODE_DCL_OUTPUT:
case VGPU10_OPCODE_DCL_OUTPUT_SGV:
case VGPU10_OPCODE_DCL_OUTPUT_SIV:
+   /* Note: we reserve two extra output indices in the tcs for the
+    * tessinner/outer levels, so an output index may go beyond
+    * VGPU11_MAX_HS_OUTPUTS. The declared output count is never
+    * exceeded; the index will never be >= VGPU11_MAX_HS_OUTPUTS + 2.
+    */
if ((emit->unit == PIPE_SHADER_VERTEX &&
index >= VGPU10_MAX_VS_OUTPUTS) ||
(emit->unit == PIPE_SHADER_GEOMETRY &&
index >= VGPU10_MAX_GS_OUTPUTS) ||
(emit->unit == PIPE_SHADER_FRAGMENT &&
- index >= VGPU10_MAX_FS_OUTPUTS)) {
+ index >= VGPU10_MAX_FS_OUTPUTS) ||
+ (emit->unit == PIPE_SHADER_TESS_CTRL &&
+ index >= VGPU11_MAX_HS_OUTPUTS + 2) ||
+ (emit->unit == PIPE_SHADER_TESS_EVAL &&
+ index >= VGPU11_MAX_DS_OUTPUTS)) {
emit->register_overflow = TRUE;
}
break;
@@ -436,13 +670,33 @@ check_register_index(struct svga_shader_emitter_v10 *emit,
static void
determine_clipping_mode(struct svga_shader_emitter_v10 *emit)
{
+   /* num_written_clipdistance in the shader info is always 0 for a
+    * tessellation control shader because TGSI_PROPERTY_NUM_CLIPDIST_ENABLED
+    * is not defined for that stage. So we go through all the output
+    * declarations to compute num_written_clipdistance; this is only used
+    * to determine the clipping mode.
+    */
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ unsigned i;
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ if (emit->info.output_semantic_name[i] == TGSI_SEMANTIC_CLIPDIST) {
+ emit->info.num_written_clipdistance =
+ 4 * (emit->info.output_semantic_index[i] + 1);
+ }
+ }
+ }
+
if (emit->info.num_written_clipdistance > 0) {
emit->clip_mode = CLIP_DISTANCE;
}
else if (emit->info.writes_clipvertex) {
emit->clip_mode = CLIP_VERTEX;
}
- else if (emit->key.clip_plane_enable) {
+ else if (emit->key.clip_plane_enable && emit->key.last_vertex_stage) {
+ /*
+ * Only the last shader in the vertex processing stage needs to
+ * handle the legacy clip mode.
+ */
emit->clip_mode = CLIP_LEGACY;
}
else {
@@ -497,6 +751,12 @@ translate_shader_type(unsigned type)
return VGPU10_GEOMETRY_SHADER;
case PIPE_SHADER_FRAGMENT:
return VGPU10_PIXEL_SHADER;
+ case PIPE_SHADER_TESS_CTRL:
+ return VGPU10_HULL_SHADER;
+ case PIPE_SHADER_TESS_EVAL:
+ return VGPU10_DOMAIN_SHADER;
+ case PIPE_SHADER_COMPUTE:
+ return VGPU10_COMPUTE_SHADER;
default:
assert(!"Unexpected shader type");
return VGPU10_VERTEX_SHADER;
@@ -550,7 +810,7 @@ translate_opcode(enum tgsi_opcode opcode)
case TGSI_OPCODE_DIV:
return VGPU10_OPCODE_DIV;
case TGSI_OPCODE_IDIV:
- return VGPU10_OPCODE_IDIV;
+ return VGPU10_OPCODE_VMWARE;
case TGSI_OPCODE_DP2:
return VGPU10_OPCODE_DP2;
case TGSI_OPCODE_BRK:
@@ -652,6 +912,64 @@ translate_opcode(enum tgsi_opcode opcode)
return VGPU10_OPCODE_LT;
case TGSI_OPCODE_ROUND:
return VGPU10_OPCODE_ROUND_NE;
+ /* Begin SM5 opcodes */
+ case TGSI_OPCODE_F2D:
+ return VGPU10_OPCODE_FTOD;
+ case TGSI_OPCODE_D2F:
+ return VGPU10_OPCODE_DTOF;
+ case TGSI_OPCODE_DMUL:
+ return VGPU10_OPCODE_DMUL;
+ case TGSI_OPCODE_DADD:
+ return VGPU10_OPCODE_DADD;
+ case TGSI_OPCODE_DMAX:
+ return VGPU10_OPCODE_DMAX;
+ case TGSI_OPCODE_DMIN:
+ return VGPU10_OPCODE_DMIN;
+ case TGSI_OPCODE_DSEQ:
+ return VGPU10_OPCODE_DEQ;
+ case TGSI_OPCODE_DSGE:
+ return VGPU10_OPCODE_DGE;
+ case TGSI_OPCODE_DSLT:
+ return VGPU10_OPCODE_DLT;
+ case TGSI_OPCODE_DSNE:
+ return VGPU10_OPCODE_DNE;
+ case TGSI_OPCODE_IBFE:
+ return VGPU10_OPCODE_IBFE;
+ case TGSI_OPCODE_UBFE:
+ return VGPU10_OPCODE_UBFE;
+ case TGSI_OPCODE_BFI:
+ return VGPU10_OPCODE_BFI;
+ case TGSI_OPCODE_BREV:
+ return VGPU10_OPCODE_BFREV;
+ case TGSI_OPCODE_POPC:
+ return VGPU10_OPCODE_COUNTBITS;
+ case TGSI_OPCODE_LSB:
+ return VGPU10_OPCODE_FIRSTBIT_LO;
+ case TGSI_OPCODE_IMSB:
+ return VGPU10_OPCODE_FIRSTBIT_SHI;
+ case TGSI_OPCODE_UMSB:
+ return VGPU10_OPCODE_FIRSTBIT_HI;
+ case TGSI_OPCODE_INTERP_CENTROID:
+ return VGPU10_OPCODE_EVAL_CENTROID;
+ case TGSI_OPCODE_INTERP_SAMPLE:
+ return VGPU10_OPCODE_EVAL_SAMPLE_INDEX;
+ case TGSI_OPCODE_BARRIER:
+ return VGPU10_OPCODE_SYNC;
+
+ /* DX11.1 Opcodes */
+ case TGSI_OPCODE_DDIV:
+ return VGPU10_OPCODE_DDIV;
+ case TGSI_OPCODE_DRCP:
+ return VGPU10_OPCODE_DRCP;
+ case TGSI_OPCODE_D2I:
+ return VGPU10_OPCODE_DTOI;
+ case TGSI_OPCODE_D2U:
+ return VGPU10_OPCODE_DTOU;
+ case TGSI_OPCODE_I2D:
+ return VGPU10_OPCODE_ITOD;
+ case TGSI_OPCODE_U2D:
+ return VGPU10_OPCODE_UTOD;
+
case TGSI_OPCODE_SAMPLE_POS:
/* Note: we never actually get this opcode because there's no GLSL
* function to query multisample resource sample positions. There's
@@ -761,66 +1079,60 @@ remap_temp_index(const struct svga_shader_emitter_v10 *emit,
/**
* Setup the operand0 fields related to indexing (1D, 2D, relative, etc).
* Note: the operandType field must already be initialized.
+ * \param file the register file being accessed
+ * \param indirect using indirect addressing of the register file?
+ * \param index2D if true, 2-D indexing is being used (const or temp registers)
+ * \param indirect2D if true, 2-D indirect indexing being used (for const buf)
*/
static VGPU10OperandToken0
setup_operand0_indexing(struct svga_shader_emitter_v10 *emit,
VGPU10OperandToken0 operand0,
enum tgsi_file_type file,
- boolean indirect, boolean index2D,
- unsigned tempArrayID)
+ boolean indirect,
+ boolean index2D, bool indirect2D)
{
- unsigned indexDim, index0Rep, index1Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+ VGPU10_OPERAND_INDEX_REPRESENTATION index0Rep, index1Rep;
+ VGPU10_OPERAND_INDEX_DIMENSION indexDim;
/*
* Compute index dimensions
*/
if (operand0.operandType == VGPU10_OPERAND_TYPE_IMMEDIATE32 ||
- operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID) {
+ operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID ||
+ operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID ||
+ operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_THREAD_ID ||
+ operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP ||
+ operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID) {
/* there's no swizzle for in-line immediates */
indexDim = VGPU10_OPERAND_INDEX_0D;
assert(operand0.selectionMode == 0);
}
+ else if (operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_DOMAIN_POINT) {
+ indexDim = VGPU10_OPERAND_INDEX_0D;
+ }
else {
- if (index2D ||
- tempArrayID > 0 ||
- operand0.operandType == VGPU10_OPERAND_TYPE_CONSTANT_BUFFER) {
- indexDim = VGPU10_OPERAND_INDEX_2D;
- }
- else {
- indexDim = VGPU10_OPERAND_INDEX_1D;
- }
+ indexDim = index2D ? VGPU10_OPERAND_INDEX_2D : VGPU10_OPERAND_INDEX_1D;
}
/*
- * Compute index representations (immediate, relative, etc).
+ * Compute index representation(s) (immediate vs relative).
*/
- if (tempArrayID > 0) {
- assert(file == TGSI_FILE_TEMPORARY);
- /* First index is the array ID, second index is the array element */
- index0Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
- if (indirect) {
- index1Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE;
- }
- else {
- index1Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
- }
+ if (indexDim == VGPU10_OPERAND_INDEX_2D) {
+ index0Rep = indirect2D ? VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE
+ : VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+ index1Rep = indirect ? VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE
+ : VGPU10_OPERAND_INDEX_IMMEDIATE32;
}
- else if (indirect) {
- if (file == TGSI_FILE_CONSTANT) {
- /* index[0] indicates which constant buffer while index[1] indicates
- * the position in the constant buffer.
- */
- index0Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
- index1Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE;
- }
- else {
- /* All other register files are 1-dimensional */
- index0Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE;
- }
+ else if (indexDim == VGPU10_OPERAND_INDEX_1D) {
+ index0Rep = indirect ? VGPU10_OPERAND_INDEX_IMMEDIATE32_PLUS_RELATIVE
+ : VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+ index1Rep = 0;
}
else {
- index0Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
- index1Rep = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+ index0Rep = 0;
+ index1Rep = 0;
}
operand0.indexDimension = indexDim;
@@ -879,13 +1191,18 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
const unsigned sem_index = emit->info.output_semantic_index[index];
unsigned writemask = reg->Register.WriteMask;
const boolean indirect = reg->Register.Indirect;
- const unsigned tempArrayId = get_temp_array_id(emit, file, index);
- const boolean index2d = reg->Register.Dimension;
+ unsigned tempArrayId = get_temp_array_id(emit, file, index);
+ boolean index2d = reg->Register.Dimension || tempArrayId > 0;
VGPU10OperandToken0 operand0;
+ if (file == TGSI_FILE_TEMPORARY) {
+ emit->temp_map[index].initialized = TRUE;
+ }
+
if (file == TGSI_FILE_OUTPUT) {
if (emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY) {
+ emit->unit == PIPE_SHADER_GEOMETRY ||
+ emit->unit == PIPE_SHADER_TESS_EVAL) {
if (index == emit->vposition.out_index &&
emit->vposition.tmp_index != INVALID_INDEX) {
/* replace OUTPUT[POS] with TEMP[POS]. We need to store the
@@ -913,6 +1230,21 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
file = TGSI_FILE_TEMPORARY;
index = emit->clip_vertex_tmp_index;
}
+ else if (sem_name == TGSI_SEMANTIC_COLOR &&
+ emit->key.clamp_vertex_color) {
+
+ /* set the saturate modifier of the instruction
+ * to clamp the vertex color.
+ */
+ VGPU10OpcodeToken0 *token =
+ (VGPU10OpcodeToken0 *)emit->buf + emit->inst_start_token;
+ token->saturate = TRUE;
+ }
+ else if (sem_name == TGSI_SEMANTIC_VIEWPORT_INDEX &&
+ emit->gs.viewport_index_out_index != INVALID_INDEX) {
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->gs.viewport_index_tmp_index;
+ }
}
else if (emit->unit == PIPE_SHADER_FRAGMENT) {
if (sem_name == TGSI_SEMANTIC_POSITION) {
@@ -955,6 +1287,116 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
emit->num_output_writes++;
}
}
+ else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ if (index == emit->tcs.inner.tgsi_index) {
+         /* replace OUTPUT[TESSLEVEL] with a temp. We store the value in a
+          * temporary for now; it will be stored into the appropriate
+          * registers in post_helper() during the patch constant phase.
+          */
+ if (emit->tcs.control_point_phase) {
+ /* Discard writing into tessfactor in control point phase */
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tcs.inner.temp_index;
+ }
+ }
+ else if (index == emit->tcs.outer.tgsi_index) {
+         /* replace OUTPUT[TESSLEVEL] with a temp. We store the value in a
+          * temporary for now; it will be stored into the appropriate
+          * registers in post_helper().
+          */
+ if (emit->tcs.control_point_phase) {
+ /* Discard writing into tessfactor in control point phase */
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tcs.outer.temp_index;
+ }
+ }
+ else if (index >= emit->tcs.patch_generic_out_index &&
+ index < (emit->tcs.patch_generic_out_index +
+ emit->tcs.patch_generic_out_count)) {
+ if (emit->tcs.control_point_phase) {
+ /* Discard writing into generic patch constant outputs in
+ control point phase */
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ if (emit->reemit_instruction) {
+ /* Store results of reemitted instruction in temporary register. */
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tcs.patch_generic_tmp_index +
+ (index - emit->tcs.patch_generic_out_index);
+ /**
+ * Temporaries for patch constant data can be done
+ * as indexable temporaries.
+ */
+ tempArrayId = get_temp_array_id(emit, file, index);
+ index2d = tempArrayId > 0;
+
+ emit->reemit_instruction = FALSE;
+ }
+ else {
+               /* If per-patch outputs are read in the shader, we reemit
+                * the instruction and store its results in temporaries in
+                * the patch constant phase. */
+ if (emit->info.reads_perpatch_outputs) {
+ emit->reemit_instruction = TRUE;
+ }
+ }
+ }
+ }
+ else if (reg->Register.Dimension) {
+ /* Only control point outputs are declared 2D in tgsi */
+ if (emit->tcs.control_point_phase) {
+ if (emit->reemit_instruction) {
+ /* Store results of reemitted instruction in temporary register. */
+ index2d = FALSE;
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tcs.control_point_tmp_index +
+ (index - emit->tcs.control_point_out_index);
+ emit->reemit_instruction = FALSE;
+ }
+ else {
+ /* The mapped control point outputs are 1-D */
+ index2d = FALSE;
+ if (emit->info.reads_pervertex_outputs) {
+                  /* If per-vertex outputs are read in the shader, we reemit
+                   * the instruction and store its results in temporaries in
+                   * the control point phase. */
+ emit->reemit_instruction = TRUE;
+ }
+ }
+
+ if (sem_name == TGSI_SEMANTIC_CLIPDIST &&
+ emit->clip_dist_tmp_index != INVALID_INDEX) {
+ /* replace OUTPUT[CLIPDIST] with TEMP[CLIPDIST].
+ * We store the clip distance in a temporary first, then
+ * we'll copy it to the shadow copy and to CLIPDIST with the
+ * enabled planes mask in emit_clip_distance_instructions().
+ */
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->clip_dist_tmp_index + sem_index;
+ }
+ else if (sem_name == TGSI_SEMANTIC_CLIPVERTEX &&
+ emit->clip_vertex_tmp_index != INVALID_INDEX) {
+ /* replace the CLIPVERTEX output register with a temporary */
+ assert(emit->clip_mode == CLIP_VERTEX);
+ assert(sem_index == 0);
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->clip_vertex_tmp_index;
+ }
+ }
+ else {
+ /* Discard writing into control point outputs in
+ patch constant phase */
+ emit->discard_instruction = TRUE;
+ }
+ }
+ }
}
/* init operand tokens to all zero */
@@ -977,7 +1419,7 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
check_register_index(emit, operand0.operandType, index);
operand0 = setup_operand0_indexing(emit, operand0, file, indirect,
- index2d, tempArrayId);
+ index2d, FALSE);
/* Emit tokens */
emit_dword(emit, operand0.value);
@@ -994,6 +1436,28 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
/**
+ * Check if a temporary register needs to be initialized. We only report
+ * this when the shader does not use indirect addressing for temporaries
+ * and we are not inside a loop; with indirect addressing or inside a loop
+ * we cannot determine whether the temporary is initialized or not.
+ */
+static boolean
+need_temp_reg_initialization(struct svga_shader_emitter_v10 *emit,
+ unsigned index)
+{
+ if (!(emit->info.indirect_files & (1u << TGSI_FILE_TEMPORARY))
+ && emit->current_loop_depth == 0) {
+ if (!emit->temp_map[index].initialized &&
+ emit->temp_map[index].index < emit->num_shader_temps) {
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+
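A self-contained sketch of the bookkeeping behind need_temp_reg_initialization: while instructions are emitted in order, record which temps have been written and flag a read of a never-written temp, but only when the shader uses no indirect temp addressing and the read happens outside any loop. The names and the size limit below are illustrative:

#include <stdbool.h>

#define MAX_TEMPS 256

struct temp_tracker {
   bool written[MAX_TEMPS];
   bool needs_init[MAX_TEMPS];
   unsigned loop_depth;      /* current loop nesting while scanning */
   bool indirect_temps;      /* shader indexes temps indirectly */
};

/* Call for every temp read; outside loops and without indirect
 * addressing a read-before-write can be detected statically.
 */
static void
note_temp_read(struct temp_tracker *t, unsigned index)
{
   if (!t->indirect_temps && t->loop_depth == 0 &&
       index < MAX_TEMPS && !t->written[index])
      t->needs_init[index] = true;
}

/* Call for every temp write. */
static void
note_temp_write(struct temp_tracker *t, unsigned index)
{
   if (index < MAX_TEMPS)
      t->written[index] = true;
}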
+/**
* Translate a src register of a TGSI instruction and emit VGPU10 tokens.
* In quite a few cases, we do register substitution. For example, if
* the TGSI register is the front/back-face register, we replace that with
@@ -1006,19 +1470,23 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
enum tgsi_file_type file = reg->Register.File;
unsigned index = reg->Register.Index;
const boolean indirect = reg->Register.Indirect;
- const unsigned tempArrayId = get_temp_array_id(emit, file, index);
- const boolean index2d = reg->Register.Dimension;
- const unsigned swizzleX = reg->Register.SwizzleX;
- const unsigned swizzleY = reg->Register.SwizzleY;
- const unsigned swizzleZ = reg->Register.SwizzleZ;
- const unsigned swizzleW = reg->Register.SwizzleW;
+ unsigned tempArrayId = get_temp_array_id(emit, file, index);
+ boolean index2d = (reg->Register.Dimension ||
+ tempArrayId > 0 ||
+ file == TGSI_FILE_CONSTANT);
+ unsigned index2 = tempArrayId > 0 ? tempArrayId : reg->Dimension.Index;
+ boolean indirect2d = reg->Dimension.Indirect;
+ unsigned swizzleX = reg->Register.SwizzleX;
+ unsigned swizzleY = reg->Register.SwizzleY;
+ unsigned swizzleZ = reg->Register.SwizzleZ;
+ unsigned swizzleW = reg->Register.SwizzleW;
const boolean absolute = reg->Register.Absolute;
const boolean negate = reg->Register.Negate;
- bool is_prim_id = FALSE;
-
VGPU10OperandToken0 operand0;
VGPU10OperandToken1 operand1;
+ operand0.value = operand1.value = 0;
+
if (emit->unit == PIPE_SHADER_FRAGMENT){
if (file == TGSI_FILE_INPUT) {
if (index == emit->fs.face_input_index) {
@@ -1031,6 +1499,12 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
file = TGSI_FILE_TEMPORARY;
index = emit->fs.fragcoord_tmp_index;
}
+ else if (index == emit->fs.layer_input_index) {
+ /* Replace INPUT[LAYER] with zero.x */
+ file = TGSI_FILE_IMMEDIATE;
+ index = emit->fs.layer_imm_index;
+ swizzleX = swizzleY = swizzleZ = swizzleW = TGSI_SWIZZLE_X;
+ }
else {
/* We remap fragment shader inputs to that FS input indexes
* match up with VS/GS output indexes.
@@ -1045,6 +1519,23 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
file = TGSI_FILE_TEMPORARY;
index = emit->fs.sample_pos_tmp_index;
}
+ else if (index == emit->fs.sample_mask_in_sys_index) {
+ /* Emitted as vCoverage0.x */
+ /* According to GLSL spec, the gl_SampleMaskIn array has ceil(s / 32)
+ * elements where s is the maximum number of color samples supported
+          * by the implementation. With the current implementation we should
+          * never have more than one element, so assert that Index == 0.
+ */
+ assert((!reg->Register.Indirect && reg->Register.Index == 0) ||
+ reg->Register.Indirect);
+ operand0.value = 0;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_COVERAGE_MASK;
+ operand0.indexDimension = VGPU10_OPERAND_INDEX_0D;
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_SELECT_1_MODE;
+ emit_dword(emit, operand0.value);
+ return;
+ }
else {
/* Map the TGSI system value to a VGPU10 input register */
assert(index < ARRAY_SIZE(emit->system_value_indexes));
@@ -1055,9 +1546,19 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
}
else if (emit->unit == PIPE_SHADER_GEOMETRY) {
if (file == TGSI_FILE_INPUT) {
- is_prim_id = (index == emit->gs.prim_id_index);
+ if (index == emit->gs.prim_id_index) {
+ operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
+ }
index = emit->linkage.input_map[index];
}
+ else if (file == TGSI_FILE_SYSTEM_VALUE &&
+ index == emit->gs.invocation_id_sys_index) {
+ /* Emitted as vGSInstanceID0.x */
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID;
+ index = 0;
+ }
}
else if (emit->unit == PIPE_SHADER_VERTEX) {
if (file == TGSI_FILE_INPUT) {
@@ -1080,23 +1581,178 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
index = emit->system_value_indexes[index];
}
}
+ else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
- operand0.value = operand1.value = 0;
+ if (file == TGSI_FILE_SYSTEM_VALUE) {
+ if (index == emit->tcs.vertices_per_patch_index) {
+ /**
+ * if source register is the system value for vertices_per_patch,
+ * replace it with the immediate.
+ */
+ file = TGSI_FILE_IMMEDIATE;
+ index = emit->tcs.imm_index;
+ swizzleX = swizzleY = swizzleZ = swizzleW = TGSI_SWIZZLE_X;
+ }
+ else if (index == emit->tcs.invocation_id_sys_index) {
+ if (emit->tcs.control_point_phase) {
+ /**
+ * Emitted as vOutputControlPointID.x
+ */
+ operand0.numComponents = VGPU10_OPERAND_1_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID;
+ index = 0;
+ }
+ else {
+ /* There is no control point ID input declaration in
+ * the patch constant phase in hull shader.
+ * Since for now we are emitting all instructions in
+ * the patch constant phase, we are replacing the
+ * control point ID reference with the immediate 0.
+ */
+ file = TGSI_FILE_IMMEDIATE;
+ index = emit->tcs.imm_index;
+ swizzleX = swizzleY = swizzleZ = swizzleW = TGSI_SWIZZLE_W;
+ }
+ }
+ else if (index == emit->tcs.prim_id_index) {
+ /**
+ * Emitted as vPrim.x
+ */
+ operand0.numComponents = VGPU10_OPERAND_1_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
+ index = 0;
+ }
+ }
+ else if (file == TGSI_FILE_INPUT) {
+ index = emit->linkage.input_map[index];
+ if (!emit->tcs.control_point_phase) {
+ /* Emitted as vicp */
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT;
+ assert(reg->Register.Dimension);
+ }
+ }
+ else if (file == TGSI_FILE_OUTPUT) {
+ if ((index >= emit->tcs.patch_generic_out_index &&
+ index < (emit->tcs.patch_generic_out_index +
+ emit->tcs.patch_generic_out_count)) ||
+ index == emit->tcs.inner.tgsi_index ||
+ index == emit->tcs.outer.tgsi_index) {
+ if (emit->tcs.control_point_phase) {
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ /* Device doesn't allow reading from output so
+ * use corresponding temporary register as source */
+ file = TGSI_FILE_TEMPORARY;
+ if (index == emit->tcs.inner.tgsi_index) {
+ index = emit->tcs.inner.temp_index;
+ }
+ else if (index == emit->tcs.outer.tgsi_index) {
+ index = emit->tcs.outer.temp_index;
+ }
+ else {
+ index = emit->tcs.patch_generic_tmp_index +
+ (index - emit->tcs.patch_generic_out_index);
+ }
- if (is_prim_id) {
- /* NOTE: we should be using VGPU10_OPERAND_1_COMPONENT here, but
- * our virtual GPU accepts this as-is.
- */
- operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
- operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
+ /**
+ * Temporaries for patch constant data can be done
+ * as indexable temporaries.
+ */
+ tempArrayId = get_temp_array_id(emit, file, index);
+ index2d = tempArrayId > 0;
+ index2 = tempArrayId > 0 ? tempArrayId : reg->Dimension.Index;
+ }
+ }
+ else if (index2d) {
+ if (emit->tcs.control_point_phase) {
+ /* Device doesn't allow reading from output so
+ * use corresponding temporary register as source */
+ file = TGSI_FILE_TEMPORARY;
+ index2d = FALSE;
+ index = emit->tcs.control_point_tmp_index +
+ (index - emit->tcs.control_point_out_index);
+ }
+ else {
+ emit->discard_instruction = TRUE;
+ }
+ }
+ }
}
- else {
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ if (file == TGSI_FILE_SYSTEM_VALUE) {
+ if (index == emit->tes.tesscoord_sys_index) {
+ /**
+ * Emitted as vDomain
+ */
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_DOMAIN_POINT;
+ index = 0;
+ }
+ else if (index == emit->tes.inner.tgsi_index) {
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tes.inner.temp_index;
+ }
+ else if (index == emit->tes.outer.tgsi_index) {
+ file = TGSI_FILE_TEMPORARY;
+ index = emit->tes.outer.temp_index;
+ }
+ else if (index == emit->tes.prim_id_index) {
+ /**
+ * Emitted as vPrim.x
+ */
+ operand0.numComponents = VGPU10_OPERAND_1_COMPONENT;
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
+ index = 0;
+ }
+
+ }
+ else if (file == TGSI_FILE_INPUT) {
+ if (index2d) {
+ /* 2D input is emitted as vcp (input control point). */
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT;
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+
+ /* index specifies the element index and is remapped
+ * to align with the tcs output index.
+ */
+ index = emit->linkage.input_map[index];
+ }
+ else {
+ if (index < emit->key.tes.tessfactor_index)
+ /* index specifies the generic patch index.
+ * Remapped to match up with the tcs output index.
+ */
+ index = emit->linkage.input_map[index];
+
+ operand0.operandType = VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT;
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ }
+ }
+ }
+
+ if (file == TGSI_FILE_ADDRESS) {
+ index = emit->address_reg_index[index];
+ file = TGSI_FILE_TEMPORARY;
+ }
+
+ if (file == TGSI_FILE_TEMPORARY) {
+ if (need_temp_reg_initialization(emit, index)) {
+ emit->initialize_temp_index = index;
+ emit->discard_instruction = TRUE;
+ }
+ }
+
+ if (operand0.value == 0) {
+ /* if operand0 was not set above for a special case, do the general
+ * case now.
+ */
operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
operand0.operandType = translate_register_file(file, tempArrayId > 0);
}
-
operand0 = setup_operand0_indexing(emit, operand0, file, indirect,
- index2d, tempArrayId);
+ index2d, indirect2d);
if (operand0.operandType != VGPU10_OPERAND_TYPE_IMMEDIATE32 &&
operand0.operandType != VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID) {
@@ -1149,13 +1805,12 @@ emit_src_register(struct svga_shader_emitter_v10 *emit,
}
else if (operand0.indexDimension >= VGPU10_OPERAND_INDEX_1D) {
/* Emit the register index(es) */
- if (index2d ||
- operand0.operandType == VGPU10_OPERAND_TYPE_CONSTANT_BUFFER) {
- emit_dword(emit, reg->Dimension.Index);
- }
+ if (index2d) {
+ emit_dword(emit, index2);
- if (tempArrayId > 0) {
- emit_dword(emit, tempArrayId);
+ if (indirect2d) {
+ emit_indirect_register(emit, reg->DimIndirect.Index);
+ }
}
emit_dword(emit, remap_temp_index(emit, file, index));
@@ -1271,12 +1926,34 @@ emit_rasterizer_register(struct svga_shader_emitter_v10 *emit)
/**
- * Emit the token for a VGPU10 opcode.
+ * Emit tokens for the "stream" register used by the
+ * DCL_STREAM, CUT_STREAM, EMIT_STREAM instructions.
+ */
+static void
+emit_stream_register(struct svga_shader_emitter_v10 *emit, unsigned index)
+{
+ VGPU10OperandToken0 operand0;
+
+ /* init */
+ operand0.value = 0;
+
+   /* The stream register takes a single 1-D index (the stream number) */
+ operand0.operandType = VGPU10_OPERAND_TYPE_STREAM;
+ operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+ operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
+
+ emit_dword(emit, operand0.value);
+ emit_dword(emit, index);
+}
+
+
+/**
+ * Emit the token for a VGPU10 opcode, with precise parameter.
* \param saturate clamp result to [0,1]?
*/
static void
-emit_opcode(struct svga_shader_emitter_v10 *emit,
- VGPU10_OPCODE_TYPE vgpu10_opcode, boolean saturate)
+emit_opcode_precise(struct svga_shader_emitter_v10 *emit,
+ unsigned vgpu10_opcode, boolean saturate, boolean precise)
{
VGPU10OpcodeToken0 token0;
@@ -1285,7 +1962,26 @@ emit_opcode(struct svga_shader_emitter_v10 *emit,
token0.instructionLength = 0; /* Filled in by end_emit_instruction() */
token0.saturate = saturate;
+ /* Mesa's GLSL IR -> TGSI translator will set the TGSI precise flag for
+ * 'invariant' declarations. Only set preciseValues=1 if we have SM5.
+ */
+ token0.preciseValues = precise && emit->version >= 50;
+
emit_dword(emit, token0.value);
+
+ emit->uses_precise_qualifier |= token0.preciseValues;
+}
+
+
+/**
+ * Emit the token for a VGPU10 opcode.
+ * \param saturate clamp result to [0,1]?
+ */
+static void
+emit_opcode(struct svga_shader_emitter_v10 *emit,
+ unsigned vgpu10_opcode, boolean saturate)
+{
+ emit_opcode_precise(emit, vgpu10_opcode, saturate, FALSE);
}
@@ -1695,6 +2391,32 @@ find_immediate(struct svga_shader_emitter_v10 *emit,
/**
+ * As above, but search for a double[2] pair.
+ */
+static int
+find_immediate_dbl(struct svga_shader_emitter_v10 *emit,
+ double x, double y)
+{
+ const unsigned endIndex = emit->num_immediates;
+ unsigned i;
+
+ assert(emit->immediates_emitted);
+
+   /* Search immediates for the x, y pair */
+ for (i = 0; i < endIndex; i++) {
+ if (x == emit->immediates_dbl[i][0] &&
+ y == emit->immediates_dbl[i][1]) {
+ return i;
+ }
+ }
+ /* Should never try to use an immediate value that wasn't pre-declared */
+ assert(!"find_immediate_dbl() failed!");
+ return -1;
+}
+
+
+
+/**
* Return a tgsi_full_src_register for an immediate/literal
* union tgsi_immediate_data[4] value.
* Note: the values must have been previously declared/allocated in
@@ -1831,6 +2553,26 @@ make_immediate_reg_int(struct svga_shader_emitter_v10 *emit, int value)
}
+static struct tgsi_full_src_register
+make_immediate_reg_double(struct svga_shader_emitter_v10 *emit, double value)
+{
+ struct tgsi_full_src_register reg;
+ int immpos = find_immediate_dbl(emit, value, value);
+
+ assert(immpos >= 0);
+
+ memset(&reg, 0, sizeof(reg));
+ reg.Register.File = TGSI_FILE_IMMEDIATE;
+ reg.Register.Index = immpos;
+ reg.Register.SwizzleX = TGSI_SWIZZLE_X;
+ reg.Register.SwizzleY = TGSI_SWIZZLE_Y;
+ reg.Register.SwizzleZ = TGSI_SWIZZLE_Z;
+ reg.Register.SwizzleW = TGSI_SWIZZLE_W;
+
+ return reg;
+}
+
+
/**
* Allocate space for a union tgsi_immediate_data[4] immediate.
* \return the index/position of the immediate.
@@ -1884,6 +2626,20 @@ alloc_immediate_int4(struct svga_shader_emitter_v10 *emit,
}
+static unsigned
+alloc_immediate_double2(struct svga_shader_emitter_v10 *emit,
+ double x, double y)
+{
+ unsigned n = emit->num_immediates++;
+ assert(!emit->immediates_emitted);
+ assert(n < ARRAY_SIZE(emit->immediates));
+ emit->immediates_dbl[n][0] = x;
+ emit->immediates_dbl[n][1] = y;
+   return n;
+}
+
+
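The double-precision paths above reuse the regular immediate storage: each four-component immediate slot is reinterpreted as a double[2] pair through the immediates_dbl alias. A small standalone sketch of that aliasing, with illustrative names and none of the driver's bookkeeping:

/* Illustrative only: one immediate slot holds either four 32-bit values
 * or two doubles, so a double view can alias the same storage.
 */
union imm_slot {
   unsigned u32[4];
   float    f32[4];
   double   f64[2];
};

static unsigned
alloc_double2(union imm_slot *slots, unsigned *count, double x, double y)
{
   unsigned n = (*count)++;
   slots[n].f64[0] = x;
   slots[n].f64[1] = y;
   return n;
}

static int
find_double2(const union imm_slot *slots, unsigned count, double x, double y)
{
   for (unsigned i = 0; i < count; i++) {
      if (slots[i].f64[0] == x && slots[i].f64[1] == y)
         return (int) i;
   }
   return -1;   /* the value should have been pre-declared */
}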
/**
* Allocate a shader input to store a system value.
*/
@@ -2057,8 +2813,39 @@ emit_vgpu10_property(struct svga_shader_emitter_v10 *emit,
emit->gs.max_out_vertices = prop->u[0].Data;
break;
- default:
+ case TGSI_PROPERTY_GS_INVOCATIONS:
+ emit->gs.invocations = prop->u[0].Data;
+ break;
+
+ case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+ case TGSI_PROPERTY_NEXT_SHADER:
+ case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
+ /* no-op */
break;
+
+ case TGSI_PROPERTY_TCS_VERTICES_OUT:
+ emit->tcs.vertices_out = prop->u[0].Data;
+ break;
+
+ case TGSI_PROPERTY_TES_PRIM_MODE:
+ emit->tes.prim_mode = prop->u[0].Data;
+ break;
+
+ case TGSI_PROPERTY_TES_SPACING:
+ emit->tes.spacing = prop->u[0].Data;
+ break;
+
+ case TGSI_PROPERTY_TES_VERTEX_ORDER_CW:
+ emit->tes.vertices_order_cw = prop->u[0].Data;
+ break;
+
+ case TGSI_PROPERTY_TES_POINT_MODE:
+ emit->tes.point_mode = prop->u[0].Data;
+ break;
+
+ default:
+ debug_printf("Unexpected TGSI property %s\n",
+ tgsi_property_names[prop->Property.PropertyName]);
}
return TRUE;
@@ -2094,16 +2881,498 @@ emit_property_instructions(struct svga_shader_emitter_v10 *emit)
opcode0.primitive = emit->gs.prim_type;
emit_property_instruction(emit, opcode0, 0, 0);
- /* emit output primitive topology declaration */
- opcode0.value = 0;
- opcode0.opcodeType = VGPU10_OPCODE_DCL_GS_OUTPUT_PRIMITIVE_TOPOLOGY;
- opcode0.primitiveTopology = emit->gs.prim_topology;
- emit_property_instruction(emit, opcode0, 0, 0);
-
/* emit max output vertices */
opcode0.value = 0;
opcode0.opcodeType = VGPU10_OPCODE_DCL_MAX_OUTPUT_VERTEX_COUNT;
emit_property_instruction(emit, opcode0, 1, emit->gs.max_out_vertices);
+
+ if (emit->version >= 50 && emit->gs.invocations > 0) {
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_GS_INSTANCE_COUNT;
+ emit_property_instruction(emit, opcode0, 1, emit->gs.invocations);
+ }
+}
+
+
+/**
+ * A helper function to declare tessellator domain in a hull shader or
+ * in the domain shader.
+ */
+static void
+emit_tessellator_domain(struct svga_shader_emitter_v10 *emit,
+ enum pipe_prim_type prim_mode)
+{
+ VGPU10OpcodeToken0 opcode0;
+
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_TESS_DOMAIN;
+ switch (prim_mode) {
+ case PIPE_PRIM_QUADS:
+ case PIPE_PRIM_LINES:
+ opcode0.tessDomain = VGPU10_TESSELLATOR_DOMAIN_QUAD;
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ opcode0.tessDomain = VGPU10_TESSELLATOR_DOMAIN_TRI;
+ break;
+ default:
+ debug_printf("Invalid tessellator prim mode %d\n", prim_mode);
+ opcode0.tessDomain = VGPU10_TESSELLATOR_DOMAIN_UNDEFINED;
+ }
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+}
+
+
+/**
+ * Emit domain shader declarations.
+ */
+static void
+emit_domain_shader_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ VGPU10OpcodeToken0 opcode0;
+
+ assert(emit->unit == PIPE_SHADER_TESS_EVAL);
+
+ /* Emit the input control point count */
+ assert(emit->key.tes.vertices_per_patch > 0 &&
+ emit->key.tes.vertices_per_patch <= 32);
+
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_INPUT_CONTROL_POINT_COUNT;
+ opcode0.controlPointCount = emit->key.tes.vertices_per_patch;
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ emit_tessellator_domain(emit, emit->tes.prim_mode);
+}
+
+
+/**
+ * Some common values like 0.0, 1.0, 0.5, etc. are frequently needed
+ * to implement some instructions. We pre-allocate those values here
+ * in the immediate constant buffer.
+ */
+static void
+alloc_common_immediates(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned n = 0;
+
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_float4(emit, 0.0f, 1.0f, 0.5f, -1.0f);
+
+ if (emit->info.opcode_count[TGSI_OPCODE_LIT] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_float4(emit, 128.0f, -128.0f, 0.0f, 0.0f);
+ }
+
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, 0, 1, 0, -1);
+
+ if (emit->info.opcode_count[TGSI_OPCODE_IMSB] > 0 ||
+ emit->info.opcode_count[TGSI_OPCODE_UMSB] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, 31, 0, 0, 0);
+ }
+
+ if (emit->info.opcode_count[TGSI_OPCODE_UBFE] > 0 ||
+ emit->info.opcode_count[TGSI_OPCODE_IBFE] > 0 ||
+ emit->info.opcode_count[TGSI_OPCODE_BFI] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, 32, 0, 0, 0);
+ }
+
+ if (emit->key.vs.attrib_puint_to_snorm) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_float4(emit, -2.0f, 2.0f, 3.0f, -1.66666f);
+ }
+
+ if (emit->key.vs.attrib_puint_to_uscaled) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_float4(emit, 1023.0f, 3.0f, 0.0f, 0.0f);
+ }
+
+ if (emit->key.vs.attrib_puint_to_sscaled) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, 22, 12, 2, 0);
+
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, 22, 30, 0, 0);
+ }
+
+ if (emit->vposition.num_prescale > 1) {
+ unsigned i;
+ for (i = 0; i < emit->vposition.num_prescale; i+=4) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_int4(emit, i, i+1, i+2, i+3);
+ }
+ }
+
+ emit->immediates_dbl = (double (*)[2]) emit->immediates;
+
+ if (emit->info.opcode_count[TGSI_OPCODE_DNEG] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_double2(emit, -1.0, -1.0);
+ }
+
+ if (emit->info.opcode_count[TGSI_OPCODE_DSQRT] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_double2(emit, 0.0, 0.0);
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_double2(emit, 1.0, 1.0);
+ }
+
+ if (emit->info.opcode_count[TGSI_OPCODE_INTERP_OFFSET] > 0) {
+ emit->common_immediate_pos[n++] =
+ alloc_immediate_float4(emit, 16.0f, -16.0f, 0.0, 0.0);
+ }
+
+ assert(n <= ARRAY_SIZE(emit->common_immediate_pos));
+
+ unsigned i;
+
+ for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+ if (emit->key.tex[i].texel_bias) {
+ /* Replace 0.0f if more immediate float value is needed */
+         /* The trailing 0.0f components can be reused if more immediate
+          * float values are needed. */
+ alloc_immediate_float4(emit, 0.0001f, 0.0f, 0.0f, 0.0f);
+ break;
+ }
+ }
+
+ assert(n <= ARRAY_SIZE(emit->common_immediate_pos));
+ emit->num_common_immediates = n;
+}
+
+
+/**
+ * Emit hull shader declarations.
+ */
+static void
+emit_hull_shader_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ VGPU10OpcodeToken0 opcode0;
+
+ /* Emit the input control point count */
+ assert(emit->key.tcs.vertices_per_patch > 0 &&
+ emit->key.tcs.vertices_per_patch <= 32);
+
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_INPUT_CONTROL_POINT_COUNT;
+ opcode0.controlPointCount = emit->key.tcs.vertices_per_patch;
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Emit the output control point count */
+ assert(emit->tcs.vertices_out >= 0 && emit->tcs.vertices_out <= 32);
+
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_OUTPUT_CONTROL_POINT_COUNT;
+ opcode0.controlPointCount = emit->tcs.vertices_out;
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Emit tessellator domain */
+ emit_tessellator_domain(emit, emit->key.tcs.prim_mode);
+
+ /* Emit tessellator output primitive */
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_TESS_OUTPUT_PRIMITIVE;
+ if (emit->key.tcs.point_mode) {
+ opcode0.tessOutputPrimitive = VGPU10_TESSELLATOR_OUTPUT_POINT;
+ }
+ else if (emit->key.tcs.prim_mode == PIPE_PRIM_LINES) {
+ opcode0.tessOutputPrimitive = VGPU10_TESSELLATOR_OUTPUT_LINE;
+ }
+ else {
+ assert(emit->key.tcs.prim_mode == PIPE_PRIM_QUADS ||
+ emit->key.tcs.prim_mode == PIPE_PRIM_TRIANGLES);
+
+ if (emit->key.tcs.vertices_order_cw)
+ opcode0.tessOutputPrimitive = VGPU10_TESSELLATOR_OUTPUT_TRIANGLE_CCW;
+ else
+ opcode0.tessOutputPrimitive = VGPU10_TESSELLATOR_OUTPUT_TRIANGLE_CW;
+ }
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Emit tessellator partitioning */
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_TESS_PARTITIONING;
+ switch (emit->key.tcs.spacing) {
+ case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+ opcode0.tessPartitioning = VGPU10_TESSELLATOR_PARTITIONING_FRACTIONAL_ODD;
+ break;
+ case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+ opcode0.tessPartitioning = VGPU10_TESSELLATOR_PARTITIONING_FRACTIONAL_EVEN;
+ break;
+ case PIPE_TESS_SPACING_EQUAL:
+ opcode0.tessPartitioning = VGPU10_TESSELLATOR_PARTITIONING_INTEGER;
+ break;
+ default:
+ debug_printf("invalid tessellator spacing %d\n", emit->key.tcs.spacing);
+ opcode0.tessPartitioning = VGPU10_TESSELLATOR_PARTITIONING_UNDEFINED;
+ }
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Declare constant registers */
+ emit_constant_declaration(emit);
+
+ /* Declare samplers and resources */
+ emit_sampler_declarations(emit);
+ emit_resource_declarations(emit);
+
+ alloc_common_immediates(emit);
+
+ int nVertices = emit->key.tcs.vertices_per_patch;
+ emit->tcs.imm_index =
+ alloc_immediate_int4(emit, nVertices, nVertices, nVertices, 0);
+
+ /* Now, emit the constant block containing all the immediates
+ * declared by shader, as well as the extra ones seen above.
+ */
+   emit_vgpu10_immediates_block(emit);
+}
+
+
+/**
+ * A helper function to determine if a control point phase is needed.
+ * Returns TRUE if the shader writes control point outputs or if the
+ * output control point count differs from the input count.
+ */
+static boolean
+needs_control_point_phase(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+
+ assert(emit->unit == PIPE_SHADER_TESS_CTRL);
+
+ /* If output control point count does not match the input count,
+ * we need a control point phase to explicitly set the output control
+ * points.
+ */
+ if (emit->key.tcs.vertices_per_patch != emit->tcs.vertices_out)
+ return TRUE;
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ switch (emit->info.output_semantic_name[i]) {
+ case TGSI_SEMANTIC_PATCH:
+ case TGSI_SEMANTIC_TESSOUTER:
+ case TGSI_SEMANTIC_TESSINNER:
+ break;
+ default:
+ return TRUE;
+ }
+ }
+ return FALSE;
+}
+
+
+/**
+ * Start the hull shader control point phase
+ */
+static boolean
+emit_hull_shader_control_point_phase(struct svga_shader_emitter_v10 *emit)
+{
+ VGPU10OpcodeToken0 opcode0;
+
+ /* If there is no control point output, skip the control point phase. */
+ if (!needs_control_point_phase(emit))
+ return FALSE;
+
+ /* Start the control point phase in the hull shader */
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_HS_CONTROL_POINT_PHASE;
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Declare the output control point ID */
+ if (emit->tcs.invocation_id_sys_index == INVALID_INDEX) {
+ /* Add invocation id declaration if it does not exist */
+ emit->tcs.invocation_id_sys_index = emit->info.num_system_values + 1;
+ }
+
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID,
+ VGPU10_OPERAND_INDEX_0D,
+ 0, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_0_COMPONENT, 0,
+ 0,
+ VGPU10_INTERPOLATION_CONSTANT, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+
+ if (emit->tcs.prim_id_index != INVALID_INDEX) {
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID,
+ VGPU10_OPERAND_INDEX_0D,
+ 0, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_0_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ 0,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_PRIMITIVE_ID);
+ }
+
+ return TRUE;
+}
+
+
+/**
+ * Start the hull shader patch constant phase and
+ * do the second pass of the tcs translation and emit
+ * the relevant declarations and instructions for this phase.
+ */
+static boolean
+emit_hull_shader_patch_constant_phase(struct svga_shader_emitter_v10 *emit,
+ struct tgsi_parse_context *parse)
+{
+ unsigned inst_number = 0;
+ boolean ret = TRUE;
+ VGPU10OpcodeToken0 opcode0;
+
+ emit->skip_instruction = FALSE;
+
+ /* Start the patch constant phase */
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_HS_FORK_PHASE;
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ end_emit_instruction(emit);
+
+ /* Set the current phase to patch constant phase */
+ emit->tcs.control_point_phase = FALSE;
+
+ if (emit->tcs.prim_id_index != INVALID_INDEX) {
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID,
+ VGPU10_OPERAND_INDEX_0D,
+ 0, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_0_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ 0,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_PRIMITIVE_ID);
+ }
+
+ /* Emit declarations for this phase */
+ emit->index_range.required =
+ emit->info.indirect_files & (1 << TGSI_FILE_INPUT) ? TRUE : FALSE;
+ emit_tcs_input_declarations(emit);
+
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+
+ emit->index_range.required =
+ emit->info.indirect_files & (1 << TGSI_FILE_OUTPUT) ? TRUE : FALSE;
+ emit_tcs_output_declarations(emit);
+
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+ emit->index_range.required = FALSE;
+
+ emit_temporaries_declaration(emit);
+
+ /* Reset the token position to the first instruction token
+ * in preparation for the second pass of the shader
+ */
+ parse->Position = emit->tcs.instruction_token_pos;
+
+ while (!tgsi_parse_end_of_tokens(parse)) {
+ tgsi_parse_token(parse);
+
+ assert(parse->FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION);
+ ret = emit_vgpu10_instruction(emit, inst_number++,
+ &parse->FullToken.FullInstruction);
+
+      /* Usually this applies to TCS only. If the shader reads patch
+       * constant outputs in the fork phase, we reemit every instruction
+       * that writes a patch constant output in the fork phase so its
+       * results are also stored into temporaries.
+       */
+ if (emit->reemit_instruction) {
+ assert(emit->unit == PIPE_SHADER_TESS_CTRL);
+ ret = emit_vgpu10_instruction(emit, inst_number,
+ &parse->FullToken.FullInstruction);
+ }
+
+ if (!ret)
+ return FALSE;
+ }
+
+ return TRUE;
+}
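The fork phase is produced by a second pass over the same instruction tokens, restarting from the saved instruction_token_pos. A stripped-down sketch of that rewind-and-reemit pattern; the cursor type and callback are illustrative, not the TGSI parser API:

/* Illustrative only: run the same instruction stream twice, once per
 * hull shader phase, by saving and restoring the read position.
 */
struct token_cursor {
   const unsigned *tokens;
   unsigned count;
};

static void
emit_two_phases(const struct token_cursor *c, unsigned first_inst_pos,
                void (*emit_inst)(unsigned token, int phase))
{
   unsigned pos;

   /* first pass: control point phase */
   for (pos = first_inst_pos; pos < c->count; pos++)
      emit_inst(c->tokens[pos], 0);

   /* second pass: rewind and emit the patch constant phase */
   for (pos = first_inst_pos; pos < c->count; pos++)
      emit_inst(c->tokens[pos], 1);
}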
+
+
+/**
+ * Emit index range declaration.
+ */
+static boolean
+emit_index_range_declaration(struct svga_shader_emitter_v10 *emit)
+{
+ if (emit->version < 50)
+ return TRUE;
+
+ assert(emit->index_range.start_index != INVALID_INDEX);
+ assert(emit->index_range.count != 0);
+ assert(emit->index_range.required);
+ assert(emit->index_range.operandType != VGPU10_NUM_OPERANDS);
+ assert(emit->index_range.dim != 0);
+ assert(emit->index_range.size != 0);
+
+ VGPU10OpcodeToken0 opcode0;
+ VGPU10OperandToken0 operand0;
+
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_INDEX_RANGE;
+
+ operand0.value = 0;
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.indexDimension = emit->index_range.dim;
+ operand0.operandType = emit->index_range.operandType;
+ operand0.mask = VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
+ operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+ if (emit->index_range.dim == VGPU10_OPERAND_INDEX_2D)
+ operand0.index1Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+ begin_emit_instruction(emit);
+ emit_dword(emit, opcode0.value);
+ emit_dword(emit, operand0.value);
+
+ if (emit->index_range.dim == VGPU10_OPERAND_INDEX_2D) {
+ emit_dword(emit, emit->index_range.size);
+ emit_dword(emit, emit->index_range.start_index);
+ emit_dword(emit, emit->index_range.count);
+ }
+ else {
+ emit_dword(emit, emit->index_range.start_index);
+ emit_dword(emit, emit->index_range.count);
+ }
+
+ end_emit_instruction(emit);
+
+ /* Reset fields in emit->index_range struct except
+ * emit->index_range.required which will be reset afterwards
+ */
+ emit->index_range.count = 0;
+ emit->index_range.operandType = VGPU10_NUM_OPERANDS;
+ emit->index_range.start_index = INVALID_INDEX;
+ emit->index_range.size = 0;
+ emit->index_range.dim = 0;
+
+ return TRUE;
}
@@ -2123,8 +3392,14 @@ emit_decl_instruction(struct svga_shader_emitter_v10 *emit,
{
assert(opcode0.opcodeType);
assert(operand0.mask ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT) ||
(operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT_DEPTH) ||
- (operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT_COVERAGE_MASK));
+ (operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT_COVERAGE_MASK) ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID) ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID) ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID) ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_INPUT_COVERAGE_MASK) ||
+ (operand0.operandType == VGPU10_OPERAND_TYPE_STREAM));
begin_emit_instruction(emit);
emit_dword(emit, opcode0.value);
@@ -2159,7 +3434,8 @@ emit_decl_instruction(struct svga_shader_emitter_v10 *emit,
* \param index the input register index
* \param size array size of the operand. In most cases, it is 1,
* but for inputs to geometry shader, the array size varies
- * depending on the primitive type.
+ *              depending on the primitive type. For a tessellation control
+ *              shader, the array size is the vertex count per patch.
* \param name one of VGPU10_NAME_x
* \parma numComp number of components
* \param selMode component selection mode
@@ -2176,7 +3452,9 @@ emit_input_declaration(struct svga_shader_emitter_v10 *emit,
VGPU10_OPERAND_NUM_COMPONENTS numComp,
VGPU10_OPERAND_4_COMPONENT_SELECTION_MODE selMode,
unsigned usageMask,
- VGPU10_INTERPOLATION_MODE interpMode)
+ VGPU10_INTERPOLATION_MODE interpMode,
+ boolean addSignature,
+ SVGA3dDXSignatureSemanticName sgnName)
{
VGPU10OpcodeToken0 opcode0;
VGPU10OperandToken0 operand0;
@@ -2185,11 +3463,22 @@ emit_input_declaration(struct svga_shader_emitter_v10 *emit,
assert(usageMask <= VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
assert(opcodeType == VGPU10_OPCODE_DCL_INPUT ||
opcodeType == VGPU10_OPCODE_DCL_INPUT_SIV ||
+ opcodeType == VGPU10_OPCODE_DCL_INPUT_SGV ||
opcodeType == VGPU10_OPCODE_DCL_INPUT_PS ||
opcodeType == VGPU10_OPCODE_DCL_INPUT_PS_SIV ||
opcodeType == VGPU10_OPCODE_DCL_INPUT_PS_SGV);
assert(operandType == VGPU10_OPERAND_TYPE_INPUT ||
- operandType == VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID);
+ operandType == VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_COVERAGE_MASK ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID ||
+ operandType == VGPU10_OPERAND_TYPE_OUTPUT_CONTROL_POINT_ID ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_DOMAIN_POINT ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_THREAD_ID ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_THREAD_GROUP_ID ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_THREAD_ID_IN_GROUP);
+
assert(numComp <= VGPU10_OPERAND_4_COMPONENT);
assert(selMode <= VGPU10_OPERAND_4_COMPONENT_MASK_MODE);
assert(dim <= VGPU10_OPERAND_INDEX_3D);
@@ -2199,7 +3488,9 @@ emit_input_declaration(struct svga_shader_emitter_v10 *emit,
name == VGPU10_NAME_VERTEX_ID ||
name == VGPU10_NAME_PRIMITIVE_ID ||
name == VGPU10_NAME_IS_FRONT_FACE ||
- name == VGPU10_NAME_SAMPLE_INDEX);
+ name == VGPU10_NAME_SAMPLE_INDEX ||
+ name == VGPU10_NAME_RENDER_TARGET_ARRAY_INDEX ||
+ name == VGPU10_NAME_VIEWPORT_ARRAY_INDEX);
assert(interpMode == VGPU10_INTERPOLATION_UNDEFINED ||
interpMode == VGPU10_INTERPOLATION_CONSTANT ||
@@ -2229,6 +3520,78 @@ emit_input_declaration(struct svga_shader_emitter_v10 *emit,
name_token.name = name;
emit_decl_instruction(emit, opcode0, operand0, name_token, index, size);
+
+ if (addSignature) {
+ struct svga_shader_signature *sgn = &emit->signature;
+ if (operandType == VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT) {
+ /* Set patch constant signature */
+ SVGA3dDXShaderSignatureEntry *sgnEntry =
+ &sgn->patchConstants[sgn->header.numPatchConstantSignatures++];
+ set_shader_signature_entry(sgnEntry, index,
+ sgnName, usageMask,
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+
+ } else if (operandType == VGPU10_OPERAND_TYPE_INPUT ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT) {
+ /* Set input signature */
+ SVGA3dDXShaderSignatureEntry *sgnEntry =
+ &sgn->inputs[sgn->header.numInputSignatures++];
+ set_shader_signature_entry(sgnEntry, index,
+ sgnName, usageMask,
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+ }
+ }
+
+ if (emit->index_range.required) {
+ /* Here, index_range declaration is only applicable for opcodeType
+ * VGPU10_OPCODE_DCL_INPUT and VGPU10_OPCODE_DCL_INPUT_PS and
+ * for operandType VGPU10_OPERAND_TYPE_INPUT,
+ * VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT and
+ * VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT.
+ */
+ if ((opcodeType != VGPU10_OPCODE_DCL_INPUT &&
+ opcodeType != VGPU10_OPCODE_DCL_INPUT_PS) ||
+ (operandType != VGPU10_OPERAND_TYPE_INPUT &&
+ operandType != VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT &&
+ operandType != VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT)) {
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+ return;
+ }
+
+ if (emit->index_range.operandType == VGPU10_NUM_OPERANDS) {
+         /* Need to record a new index_range */
+ emit->index_range.count = 1;
+ emit->index_range.operandType = operandType;
+ emit->index_range.start_index = index;
+ emit->index_range.size = size;
+ emit->index_range.dim = dim;
+ }
+ else if (index !=
+ (emit->index_range.start_index + emit->index_range.count) ||
+ emit->index_range.operandType != operandType) {
+ /* Input index is not contiguous with index range or operandType is
+ * different from index range's operandType. We need to emit current
+ * index_range first and then start recording next index range.
+ */
+ emit_index_range_declaration(emit);
+
+ emit->index_range.count = 1;
+ emit->index_range.operandType = operandType;
+ emit->index_range.start_index = index;
+ emit->index_range.size = size;
+ emit->index_range.dim = dim;
+ }
+ else if (emit->index_range.operandType == operandType) {
+ /* Since input index is contiguous with index range and operandType
+ * is same as index range's operandType, increment index range count.
+ */
+ emit->index_range.count++;
+ }
+ }
}
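The addSignature path above records one signature entry per declared register so that stages can be matched by semantic name rather than by raw register slot. A minimal sketch of that bookkeeping with illustrative types; the real entries also carry the component type and minimum precision:

/* Illustrative only: append a (register, semantic, mask) entry to an
 * input signature table as each input declaration is emitted.
 */
struct sig_entry {
   unsigned reg;
   unsigned semantic;
   unsigned mask;
};

struct signature {
   struct sig_entry inputs[32];
   unsigned num_inputs;
};

static void
add_input_signature(struct signature *sig, unsigned reg,
                    unsigned semantic, unsigned mask)
{
   struct sig_entry *e = &sig->inputs[sig->num_inputs++];
   e->reg = reg;
   e->semantic = semantic;
   e->mask = mask;
}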
@@ -2243,13 +3606,15 @@ static void
emit_output_declaration(struct svga_shader_emitter_v10 *emit,
VGPU10_OPCODE_TYPE type, unsigned index,
VGPU10_SYSTEM_NAME name,
- unsigned usageMask)
+ unsigned writemask,
+ boolean addSignature,
+ SVGA3dDXSignatureSemanticName sgnName)
{
VGPU10OpcodeToken0 opcode0;
VGPU10OperandToken0 operand0;
VGPU10NameToken name_token;
- assert(usageMask <= VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
+ assert(writemask <= VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
assert(type == VGPU10_OPCODE_DCL_OUTPUT ||
type == VGPU10_OPCODE_DCL_OUTPUT_SGV ||
type == VGPU10_OPCODE_DCL_OUTPUT_SIV);
@@ -2257,6 +3622,7 @@ emit_output_declaration(struct svga_shader_emitter_v10 *emit,
name == VGPU10_NAME_POSITION ||
name == VGPU10_NAME_PRIMITIVE_ID ||
name == VGPU10_NAME_RENDER_TARGET_ARRAY_INDEX ||
+ name == VGPU10_NAME_VIEWPORT_ARRAY_INDEX ||
name == VGPU10_NAME_CLIP_DISTANCE);
check_register_index(emit, type, index);
@@ -2267,13 +3633,66 @@ emit_output_declaration(struct svga_shader_emitter_v10 *emit,
operand0.operandType = VGPU10_OPERAND_TYPE_OUTPUT;
operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE;
- operand0.mask = usageMask;
+ operand0.mask = writemask;
operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
name_token.name = name;
emit_decl_instruction(emit, opcode0, operand0, name_token, index, 1);
+
+ /* Capture output signature */
+ if (addSignature) {
+ struct svga_shader_signature *sgn = &emit->signature;
+ SVGA3dDXShaderSignatureEntry *sgnEntry =
+ &sgn->outputs[sgn->header.numOutputSignatures++];
+ set_shader_signature_entry(sgnEntry, index,
+ sgnName, writemask,
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+ }
+
+ if (emit->index_range.required) {
+ /* Here, index_range declaration is only applicable for opcodeType
+ * VGPU10_OPCODE_DCL_OUTPUT and for operandType
+ * VGPU10_OPERAND_TYPE_OUTPUT.
+ */
+ if (type != VGPU10_OPCODE_DCL_OUTPUT) {
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+ return;
+ }
+
+ if (emit->index_range.operandType == VGPU10_NUM_OPERANDS) {
+         /* Need to record a new index_range */
+ emit->index_range.count = 1;
+ emit->index_range.operandType = VGPU10_OPERAND_TYPE_OUTPUT;
+ emit->index_range.start_index = index;
+ emit->index_range.size = 1;
+ emit->index_range.dim = VGPU10_OPERAND_INDEX_1D;
+ }
+ else if (index !=
+ (emit->index_range.start_index + emit->index_range.count)) {
+ /* Output index is not contiguous with index range. We need to
+ * emit current index_range first and then start recording next
+ * index range.
+ */
+ emit_index_range_declaration(emit);
+
+ emit->index_range.count = 1;
+ emit->index_range.operandType = VGPU10_OPERAND_TYPE_OUTPUT;
+ emit->index_range.start_index = index;
+ emit->index_range.size = 1;
+ emit->index_range.dim = VGPU10_OPERAND_INDEX_1D;
+ }
+ else {
+ /* Since output index is contiguous with index range, increment
+ * index range count.
+ */
+ emit->index_range.count++;
+ }
+ }
}
@@ -2327,6 +3746,563 @@ emit_samplemask_output_declaration(struct svga_shader_emitter_v10 *emit)
/**
+ * Emit output declarations for fragment shader.
+ */
+static void
+emit_fs_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned int i;
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ /*const unsigned usage_mask = emit->info.output_usage_mask[i];*/
+ const enum tgsi_semantic semantic_name =
+ emit->info.output_semantic_name[i];
+ const unsigned semantic_index = emit->info.output_semantic_index[i];
+ unsigned index = i;
+
+ if (semantic_name == TGSI_SEMANTIC_COLOR) {
+ assert(semantic_index < ARRAY_SIZE(emit->fs.color_out_index));
+
+ emit->fs.color_out_index[semantic_index] = index;
+
+ emit->fs.num_color_outputs = MAX2(emit->fs.num_color_outputs,
+ index + 1);
+
+ /* The semantic index is the shader's color output/buffer index */
+ emit_output_declaration(emit,
+ VGPU10_OPCODE_DCL_OUTPUT, semantic_index,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+
+ if (semantic_index == 0) {
+ if (emit->key.fs.write_color0_to_n_cbufs > 1) {
+ /* Emit declarations for the additional color outputs
+ * for broadcasting.
+ */
+ unsigned j;
+ for (j = 1; j < emit->key.fs.write_color0_to_n_cbufs; j++) {
+ /* Allocate a new output index */
+ unsigned idx = emit->info.num_outputs + j - 1;
+ emit->fs.color_out_index[j] = idx;
+ emit_output_declaration(emit,
+ VGPU10_OPCODE_DCL_OUTPUT, idx,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ emit->info.output_semantic_index[idx] = j;
+ }
+
+ emit->fs.num_color_outputs =
+ emit->key.fs.write_color0_to_n_cbufs;
+ }
+ }
+ else {
+ assert(!emit->key.fs.write_color0_to_n_cbufs);
+ }
+ }
+ else if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ /* Fragment depth output */
+ emit_fragdepth_output_declaration(emit);
+ }
+ else if (semantic_name == TGSI_SEMANTIC_SAMPLEMASK) {
+ /* Sample mask output */
+ emit_samplemask_output_declaration(emit);
+ }
+ else {
+ assert(!"Bad output semantic name");
+ }
+ }
+}
+
+
+/**
+ * Emit common output declaration for vertex processing.
+ */
+static void
+emit_vertex_output_declaration(struct svga_shader_emitter_v10 *emit,
+ unsigned index, unsigned writemask,
+ boolean addSignature)
+{
+ const enum tgsi_semantic semantic_name =
+ emit->info.output_semantic_name[index];
+ const unsigned semantic_index = emit->info.output_semantic_index[index];
+ unsigned name, type;
+ unsigned final_mask = VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
+
+ assert(emit->unit != PIPE_SHADER_FRAGMENT &&
+ emit->unit != PIPE_SHADER_COMPUTE);
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_POSITION:
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ /* position will be declared in control point only */
+ assert(emit->tcs.control_point_phase);
+ type = VGPU10_OPCODE_DCL_OUTPUT;
+ name = VGPU10_NAME_UNDEFINED;
+ emit_output_declaration(emit, type, index, name, final_mask, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+ return;
+ }
+ else {
+ type = VGPU10_OPCODE_DCL_OUTPUT_SIV;
+ name = VGPU10_NAME_POSITION;
+ }
+ /* Save the index of the vertex position output register */
+ emit->vposition.out_index = index;
+ break;
+ case TGSI_SEMANTIC_CLIPDIST:
+ type = VGPU10_OPCODE_DCL_OUTPUT_SIV;
+ name = VGPU10_NAME_CLIP_DISTANCE;
+ /* save the starting index of the clip distance output register */
+ if (semantic_index == 0)
+ emit->clip_dist_out_index = index;
+ final_mask = apply_clip_plane_mask(emit, writemask, semantic_index);
+ if (final_mask == 0x0)
+ return; /* discard this do-nothing declaration */
+ break;
+ case TGSI_SEMANTIC_CLIPVERTEX:
+ type = VGPU10_OPCODE_DCL_OUTPUT;
+ name = VGPU10_NAME_UNDEFINED;
+ emit->clip_vertex_out_index = index;
+ break;
+ default:
+ /* generic output */
+ type = VGPU10_OPCODE_DCL_OUTPUT;
+ name = VGPU10_NAME_UNDEFINED;
+ }
+
+ emit_output_declaration(emit, type, index, name, final_mask, addSignature,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+}
+
+
+/**
+ * Emit declaration for outputs in vertex shader.
+ */
+static void
+emit_vs_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ emit_vertex_output_declaration(emit, i, emit->output_usage_mask[i], TRUE);
+ }
+}
+
+
+/**
+ * A helper function to determine the writemask for an output
+ * for the specified stream.
+ */
+static unsigned
+output_writemask_for_stream(unsigned stream, ubyte output_streams,
+ ubyte output_usagemask)
+{
+ unsigned i;
+ unsigned writemask = 0;
+
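+   /* output_streams packs a 2-bit stream index for each of the 4 components */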
+ for (i = 0; i < 4; i++) {
+ if ((output_streams & 0x3) == stream)
+ writemask |= (VGPU10_OPERAND_4_COMPONENT_MASK_X << i);
+ output_streams >>= 2;
+ }
+ return writemask & output_usagemask;
+}
+
+
+/**
+ * Emit declaration for outputs in geometry shader.
+ */
+static void
+emit_gs_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+ VGPU10OpcodeToken0 opcode0;
+ unsigned numStreamsSupported = 1;
+ int s;
+
+ if (emit->version >= 50) {
+ numStreamsSupported = ARRAY_SIZE(emit->info.num_stream_output_components);
+ }
+
+ /**
+    * Start emitting from the last stream and work down to stream 0,
+    * so that any auxiliary output declarations end up in stream 0.
+ */
+ for (s = numStreamsSupported-1; s >= 0; s--) {
+
+ if (emit->info.num_stream_output_components[s] == 0)
+ continue;
+
+ if (emit->version >= 50) {
+ /* DCL_STREAM stream */
+ begin_emit_instruction(emit);
+ emit_opcode(emit, VGPU10_OPCODE_DCL_STREAM, FALSE);
+ emit_stream_register(emit, s);
+ end_emit_instruction(emit);
+ }
+
+ /* emit output primitive topology declaration */
+ opcode0.value = 0;
+ opcode0.opcodeType = VGPU10_OPCODE_DCL_GS_OUTPUT_PRIMITIVE_TOPOLOGY;
+ opcode0.primitiveTopology = emit->gs.prim_topology;
+ emit_property_instruction(emit, opcode0, 0, 0);
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ unsigned writemask;
+
+ /* find out the writemask for this stream */
+ writemask = output_writemask_for_stream(s, emit->info.output_streams[i],
+ emit->output_usage_mask[i]);
+
+ if (writemask) {
+ enum tgsi_semantic semantic_name =
+ emit->info.output_semantic_name[i];
+
+ /* TODO: Still need to take care of a special case where a
+ * single varying spans across multiple output registers.
+ */
+ switch(semantic_name) {
+ case TGSI_SEMANTIC_PRIMID:
+ emit_output_declaration(emit,
+ VGPU10_OPCODE_DCL_OUTPUT_SGV, i,
+ VGPU10_NAME_PRIMITIVE_ID,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ FALSE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ break;
+ case TGSI_SEMANTIC_LAYER:
+ emit_output_declaration(emit,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, i,
+ VGPU10_NAME_RENDER_TARGET_ARRAY_INDEX,
+ VGPU10_OPERAND_4_COMPONENT_MASK_X,
+ FALSE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ break;
+ case TGSI_SEMANTIC_VIEWPORT_INDEX:
+ emit_output_declaration(emit,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, i,
+ VGPU10_NAME_VIEWPORT_ARRAY_INDEX,
+ VGPU10_OPERAND_4_COMPONENT_MASK_X,
+ FALSE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ emit->gs.viewport_index_out_index = i;
+ break;
+ default:
+ emit_vertex_output_declaration(emit, i, writemask, FALSE);
+ }
+ }
+ }
+ }
+
+   /* For geometry shader outputs, it is possible that the same register is
+    * declared multiple times for different streams. To avoid redundant
+    * signature entries, the geometry shader output signature is emitted
+    * outside of the declarations.
+ */
+ struct svga_shader_signature *sgn = &emit->signature;
+ SVGA3dDXShaderSignatureEntry *sgnEntry;
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ if (emit->output_usage_mask[i]) {
+ enum tgsi_semantic sem_name = emit->info.output_semantic_name[i];
+
+ sgnEntry = &sgn->outputs[sgn->header.numOutputSignatures++];
+ set_shader_signature_entry(sgnEntry, i,
+ map_tgsi_semantic_to_sgn_name(sem_name),
+ emit->output_usage_mask[i],
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+ }
+ }
+}
+
+
+/**
+ * Emit the declaration for the tess inner/outer output.
+ * \param opcodeType either VGPU10_OPCODE_DCL_OUTPUT_SIV or _INPUT_SIV
+ * \param operandType either VGPU10_OPERAND_TYPE_OUTPUT or _INPUT
+ * \param name VGPU10_NAME_FINAL_*_TESSFACTOR value
+ */
+static void
+emit_tesslevel_declaration(struct svga_shader_emitter_v10 *emit,
+ unsigned index, unsigned opcodeType,
+ unsigned operandType, VGPU10_SYSTEM_NAME name,
+ SVGA3dDXSignatureSemanticName sgnName)
+{
+ VGPU10OpcodeToken0 opcode0;
+ VGPU10OperandToken0 operand0;
+ VGPU10NameToken name_token;
+
+ assert(emit->version >= 50);
+ assert(name >= VGPU10_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR ||
+ (emit->key.tcs.prim_mode == PIPE_PRIM_LINES &&
+ name == VGPU10_NAME_UNDEFINED));
+ assert(name <= VGPU10_NAME_FINAL_LINE_DENSITY_TESSFACTOR);
+
+ assert(operandType == VGPU10_OPERAND_TYPE_OUTPUT ||
+ operandType == VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT);
+
+ opcode0.value = operand0.value = name_token.value = 0;
+
+ opcode0.opcodeType = opcodeType;
+ operand0.operandType = operandType;
+ operand0.numComponents = VGPU10_OPERAND_4_COMPONENT;
+ operand0.indexDimension = VGPU10_OPERAND_INDEX_1D;
+ operand0.mask = VGPU10_OPERAND_4_COMPONENT_MASK_X;
+ operand0.selectionMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE;
+ operand0.index0Representation = VGPU10_OPERAND_INDEX_IMMEDIATE32;
+
+ name_token.name = name;
+ emit_decl_instruction(emit, opcode0, operand0, name_token, index, 1);
+
+ /* Capture patch constant signature */
+ struct svga_shader_signature *sgn = &emit->signature;
+ SVGA3dDXShaderSignatureEntry *sgnEntry =
+ &sgn->patchConstants[sgn->header.numPatchConstantSignatures++];
+ set_shader_signature_entry(sgnEntry, index,
+ sgnName, SVGA3DWRITEMASK_0,
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+}
+
+
+/**
+ * Emit output declarations for tessellation control shader.
+ */
+static void
+emit_tcs_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned int i;
+ unsigned outputIndex = emit->num_outputs;
+ struct svga_shader_signature *sgn = &emit->signature;
+
+ /**
+ * Initialize patch_generic_out_count so it won't be counted twice
+    * since this function is called twice, once for the control point phase
+    * and again for the patch constant phase.
+ */
+ emit->tcs.patch_generic_out_count = 0;
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ unsigned index = i;
+ const enum tgsi_semantic semantic_name =
+ emit->info.output_semantic_name[i];
+
+ switch (semantic_name) {
+ case TGSI_SEMANTIC_TESSINNER:
+ emit->tcs.inner.tgsi_index = i;
+
+ /* skip per-patch output declarations in control point phase */
+ if (emit->tcs.control_point_phase)
+ break;
+
+ emit->tcs.inner.out_index = outputIndex;
+ switch (emit->key.tcs.prim_mode) {
+ case PIPE_PRIM_QUADS:
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR);
+
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR);
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_TRI_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_INSIDE_TESSFACTOR);
+ break;
+ case PIPE_PRIM_LINES:
+ break;
+ default:
+ debug_printf("Unsupported primitive type");
+ }
+ break;
+
+ case TGSI_SEMANTIC_TESSOUTER:
+ emit->tcs.outer.tgsi_index = i;
+
+ /* skip per-patch output declarations in control point phase */
+ if (emit->tcs.control_point_phase)
+ break;
+
+ emit->tcs.outer.out_index = outputIndex;
+ switch (emit->key.tcs.prim_mode) {
+ case PIPE_PRIM_QUADS:
+ for (int j = 0; j < 4; j++) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + j,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + j);
+ }
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ for (int j = 0; j < 3; j++) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + j,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + j);
+ }
+ break;
+ case PIPE_PRIM_LINES:
+ for (int j = 0; j < 2; j++) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_LINE_DETAIL_TESSFACTOR + j,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_LINE_DETAIL_TESSFACTOR + j);
+ }
+ break;
+ default:
+ debug_printf("Unsupported primitive type");
+ }
+ break;
+
+ case TGSI_SEMANTIC_PATCH:
+ if (emit->tcs.patch_generic_out_index == INVALID_INDEX)
+            emit->tcs.patch_generic_out_index = i;
+ emit->tcs.patch_generic_out_count++;
+
+ /* skip per-patch output declarations in control point phase */
+ if (emit->tcs.control_point_phase)
+ break;
+
+ emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT, index,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ FALSE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+
+ SVGA3dDXShaderSignatureEntry *sgnEntry =
+ &sgn->patchConstants[sgn->header.numPatchConstantSignatures++];
+ set_shader_signature_entry(sgnEntry, index,
+ map_tgsi_semantic_to_sgn_name(semantic_name),
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ SVGADX_SIGNATURE_REGISTER_COMPONENT_UNKNOWN,
+ SVGADX_SIGNATURE_MIN_PRECISION_DEFAULT);
+
+ break;
+
+ default:
+ /* save the starting index of control point outputs */
+ if (emit->tcs.control_point_out_index == INVALID_INDEX)
+ emit->tcs.control_point_out_index = i;
+ emit->tcs.control_point_out_count++;
+
+ /* skip control point output declarations in patch constant phase */
+ if (!emit->tcs.control_point_phase)
+ break;
+
+ emit_vertex_output_declaration(emit, i, emit->output_usage_mask[i],
+ TRUE);
+
+ }
+ }
+
+ if (emit->tcs.control_point_phase) {
+ /**
+ * Add missing control point output in control point phase.
+ */
+ if (emit->tcs.control_point_out_index == INVALID_INDEX) {
+ /* use register index after tessellation factors */
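+         /* quads use 6 tess factor registers, triangles 4, isolines 2 */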
+ switch (emit->key.tcs.prim_mode) {
+ case PIPE_PRIM_QUADS:
+ emit->tcs.control_point_out_index = outputIndex + 6;
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ emit->tcs.control_point_out_index = outputIndex + 4;
+ break;
+ default:
+ emit->tcs.control_point_out_index = outputIndex + 2;
+ break;
+ }
+ emit->tcs.control_point_out_count++;
+ emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT_SIV,
+ emit->tcs.control_point_out_index,
+ VGPU10_NAME_POSITION,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_POSITION);
+
+         /* If the TCS does not write any control point outputs,
+ * we can end the hull shader control point phase here
+ * after emitting the default control point output.
+ */
+ emit->skip_instruction = TRUE;
+ }
+ }
+ else {
+ if (emit->tcs.outer.out_index == INVALID_INDEX) {
+         /* Since the TCS did not declare the outer tess level output register,
+          * we declare it here for the patch constant phase only.
+ */
+ emit->tcs.outer.out_index = outputIndex;
+ if (emit->key.tcs.prim_mode == PIPE_PRIM_QUADS) {
+ for (int i = 0; i < 4; i++) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + i,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + i);
+ }
+ }
+ else if (emit->key.tcs.prim_mode == PIPE_PRIM_TRIANGLES) {
+ for (int i = 0; i < 3; i++) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + i,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + i);
+ }
+ }
+ }
+
+ if (emit->tcs.inner.out_index == INVALID_INDEX) {
+         /* Since the TCS did not declare the inner tess level output register,
+          * we declare it here.
+ */
+ emit->tcs.inner.out_index = outputIndex;
+ if (emit->key.tcs.prim_mode == PIPE_PRIM_QUADS) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR);
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR);
+ }
+ else if (emit->key.tcs.prim_mode == PIPE_PRIM_TRIANGLES) {
+ emit_tesslevel_declaration(emit, outputIndex++,
+ VGPU10_OPCODE_DCL_OUTPUT_SIV, VGPU10_OPERAND_TYPE_OUTPUT,
+ VGPU10_NAME_FINAL_TRI_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_INSIDE_TESSFACTOR);
+ }
+ }
+ }
+ emit->num_outputs = outputIndex;
+}
+
+
+/**
+ * Emit output declarations for tessellation evaluation shader.
+ */
+static void
+emit_tes_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned int i;
+
+ for (i = 0; i < emit->info.num_outputs; i++) {
+ emit_vertex_output_declaration(emit, i, emit->output_usage_mask[i], TRUE);
+ }
+}
+
+
+/**
* Emit the declaration for a system value input/output.
*/
static void
@@ -2344,7 +4320,8 @@ emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
VGPU10_OPERAND_4_COMPONENT,
VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
VGPU10_OPERAND_4_COMPONENT_MASK_X,
- VGPU10_INTERPOLATION_UNDEFINED);
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
break;
case TGSI_SEMANTIC_VERTEXID:
index = alloc_system_value_index(emit, index);
@@ -2356,7 +4333,8 @@ emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
VGPU10_OPERAND_4_COMPONENT,
VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
VGPU10_OPERAND_4_COMPONENT_MASK_X,
- VGPU10_INTERPOLATION_UNDEFINED);
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
break;
case TGSI_SEMANTIC_SAMPLEID:
assert(emit->unit == PIPE_SHADER_FRAGMENT);
@@ -2370,7 +4348,8 @@ emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
VGPU10_OPERAND_4_COMPONENT,
VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
VGPU10_OPERAND_4_COMPONENT_MASK_X,
- VGPU10_INTERPOLATION_CONSTANT);
+ VGPU10_INTERPOLATION_CONSTANT, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
break;
case TGSI_SEMANTIC_SAMPLEPOS:
/* This system value contains the position of the current sample
@@ -2382,9 +4361,118 @@ emit_system_value_declaration(struct svga_shader_emitter_v10 *emit,
emit->fs.sample_pos_sys_index = index;
index = alloc_system_value_index(emit, index);
break;
+ case TGSI_SEMANTIC_INVOCATIONID:
+      /* Note: the invocation id input is mapped to a different register
+       * depending on the shader type. In GS, it will be mapped to
+       * vGSInstanceID#. In TCS, it will be mapped to vOutputControlPointID#.
+       * In both cases the mapped name is unique rather than just a generic
+       * input name ("v#"), so there is no need to remap the index value.
+ */
+ assert(emit->unit == PIPE_SHADER_GEOMETRY ||
+ emit->unit == PIPE_SHADER_TESS_CTRL);
+ assert(emit->version >= 50);
+
+ if (emit->unit == PIPE_SHADER_GEOMETRY) {
+ emit->gs.invocation_id_sys_index = index;
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_GS_INSTANCE_ID,
+ VGPU10_OPERAND_INDEX_0D,
+ index, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_0_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ 0,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+ } else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ /* The emission of the control point id will be done
+ * in the control point phase in emit_hull_shader_control_point_phase().
+ */
+ emit->tcs.invocation_id_sys_index = index;
+ }
+ break;
+ case TGSI_SEMANTIC_SAMPLEMASK:
+ /* Note: the PS sample mask input has a unique name ("vCoverage#")
+ * rather than just a generic input name ("v#") so no need to remap the
+ * index value.
+ */
+ assert(emit->unit == PIPE_SHADER_FRAGMENT);
+ assert(emit->version >= 50);
+ emit->fs.sample_mask_in_sys_index = index;
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_COVERAGE_MASK,
+ VGPU10_OPERAND_INDEX_0D,
+ index, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_1_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ 0,
+ VGPU10_INTERPOLATION_CONSTANT, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+ break;
+ case TGSI_SEMANTIC_TESSCOORD:
+ assert(emit->version >= 50);
+
+ unsigned usageMask = 0;
+
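+      /* The domain point has three coordinates (u,v,w) for triangles and
+       * two coordinates (u,v) for lines and quads.
+       */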
+ if (emit->tes.prim_mode == PIPE_PRIM_TRIANGLES) {
+ usageMask = VGPU10_OPERAND_4_COMPONENT_MASK_XYZ;
+ }
+ else if (emit->tes.prim_mode == PIPE_PRIM_LINES ||
+ emit->tes.prim_mode == PIPE_PRIM_QUADS) {
+ usageMask = VGPU10_OPERAND_4_COMPONENT_MASK_XY;
+ }
+
+ emit->tes.tesscoord_sys_index = index;
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_DOMAIN_POINT,
+ VGPU10_OPERAND_INDEX_0D,
+ index, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ usageMask,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+ break;
+ case TGSI_SEMANTIC_TESSINNER:
+ assert(emit->version >= 50);
+ emit->tes.inner.tgsi_index = index;
+ break;
+ case TGSI_SEMANTIC_TESSOUTER:
+ assert(emit->version >= 50);
+ emit->tes.outer.tgsi_index = index;
+ break;
+ case TGSI_SEMANTIC_VERTICESIN:
+ assert(emit->unit == PIPE_SHADER_TESS_CTRL);
+ assert(emit->version >= 50);
+
+ /* save the system value index */
+ emit->tcs.vertices_per_patch_index = index;
+ break;
+ case TGSI_SEMANTIC_PRIMID:
+ assert(emit->version >= 50);
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ emit->tcs.prim_id_index = index;
+ }
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ emit->tes.prim_id_index = index;
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID,
+ VGPU10_OPERAND_INDEX_0D,
+ index, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_0_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ 0,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ }
+ break;
default:
- debug_printf("unexpected sytem value semantic index %u\n",
- semantic_name);
+ debug_printf("unexpected system value semantic index %u / %s\n",
+ semantic_name, tgsi_semantic_names[semantic_name]);
}
}
@@ -2414,24 +4502,12 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
* and the size of the array.
*/
const unsigned arrayID = MIN2(decl->Array.ArrayID, MAX_TEMP_ARRAYS);
- unsigned i;
-
assert(arrayID < ARRAY_SIZE(emit->temp_arrays));
/* Save this array so we can emit the declaration for it later */
- emit->temp_arrays[arrayID].start = decl->Range.First;
- emit->temp_arrays[arrayID].size =
- decl->Range.Last - decl->Range.First + 1;
-
- emit->num_temp_arrays = MAX2(emit->num_temp_arrays, arrayID + 1);
- assert(emit->num_temp_arrays <= MAX_TEMP_ARRAYS);
- emit->num_temp_arrays = MIN2(emit->num_temp_arrays, MAX_TEMP_ARRAYS);
-
- /* Fill in the temp_map entries for this array */
- for (i = decl->Range.First; i <= decl->Range.Last; i++) {
- emit->temp_map[i].arrayId = arrayID;
- emit->temp_map[i].index = i - decl->Range.First;
- }
+ create_temp_array(emit, arrayID, decl->Range.First,
+ decl->Range.Last - decl->Range.First + 1,
+ decl->Range.First);
}
/* for all temps, indexed or not, keep track of highest index */
@@ -2514,275 +4590,472 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
/**
- * Emit all input declarations.
+ * Emit input declarations for fragment shader.
*/
-static boolean
-emit_input_declarations(struct svga_shader_emitter_v10 *emit)
+static void
+emit_fs_input_declarations(struct svga_shader_emitter_v10 *emit)
{
unsigned i;
- if (emit->unit == PIPE_SHADER_FRAGMENT) {
-
- for (i = 0; i < emit->linkage.num_inputs; i++) {
- enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
- unsigned usage_mask = emit->info.input_usage_mask[i];
- unsigned index = emit->linkage.input_map[i];
- VGPU10_OPCODE_TYPE type;
- VGPU10_INTERPOLATION_MODE interpolationMode;
- VGPU10_SYSTEM_NAME name;
-
- if (usage_mask == 0)
- continue; /* register is not actually used */
-
- if (semantic_name == TGSI_SEMANTIC_POSITION) {
- /* fragment position input */
- type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
- interpolationMode = VGPU10_INTERPOLATION_LINEAR;
- name = VGPU10_NAME_POSITION;
- if (usage_mask & TGSI_WRITEMASK_W) {
- /* we need to replace use of 'w' with '1/w' */
- emit->fs.fragcoord_input_index = i;
- }
+ for (i = 0; i < emit->linkage.num_inputs; i++) {
+ enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
+ unsigned usage_mask = emit->info.input_usage_mask[i];
+ unsigned index = emit->linkage.input_map[i];
+ unsigned type, interpolationMode, name;
+ unsigned mask = VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
+
+ if (usage_mask == 0)
+ continue; /* register is not actually used */
+
+ if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ /* fragment position input */
+ type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
+ interpolationMode = VGPU10_INTERPOLATION_LINEAR;
+ name = VGPU10_NAME_POSITION;
+ if (usage_mask & TGSI_WRITEMASK_W) {
+ /* we need to replace use of 'w' with '1/w' */
+ emit->fs.fragcoord_input_index = i;
}
- else if (semantic_name == TGSI_SEMANTIC_FACE) {
- /* fragment front-facing input */
- type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
- interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
- name = VGPU10_NAME_IS_FRONT_FACE;
- emit->fs.face_input_index = i;
- }
- else if (semantic_name == TGSI_SEMANTIC_PRIMID) {
- /* primitive ID */
- type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
- interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
- name = VGPU10_NAME_PRIMITIVE_ID;
- }
- else if (semantic_name == TGSI_SEMANTIC_SAMPLEID) {
- /* sample index / ID */
+ }
+ else if (semantic_name == TGSI_SEMANTIC_FACE) {
+ /* fragment front-facing input */
+ type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
+ interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
+ name = VGPU10_NAME_IS_FRONT_FACE;
+ emit->fs.face_input_index = i;
+ }
+ else if (semantic_name == TGSI_SEMANTIC_PRIMID) {
+ /* primitive ID */
+ type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
+ interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
+ name = VGPU10_NAME_PRIMITIVE_ID;
+ }
+ else if (semantic_name == TGSI_SEMANTIC_SAMPLEID) {
+ /* sample index / ID */
+ type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
+ interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
+ name = VGPU10_NAME_SAMPLE_INDEX;
+ }
+ else if (semantic_name == TGSI_SEMANTIC_LAYER) {
+ /* render target array index */
+ if (emit->key.fs.layer_to_zero) {
+ /**
+ * The shader from the previous stage does not write to layer,
+ * so reading the layer index in fragment shader should return 0.
+ */
+ emit->fs.layer_input_index = i;
+ continue;
+ } else {
type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
- name = VGPU10_NAME_SAMPLE_INDEX;
+ name = VGPU10_NAME_RENDER_TARGET_ARRAY_INDEX;
+ mask = VGPU10_OPERAND_4_COMPONENT_MASK_X;
}
- else {
- /* general fragment input */
- type = VGPU10_OPCODE_DCL_INPUT_PS;
- interpolationMode =
+ }
+ else if (semantic_name == TGSI_SEMANTIC_VIEWPORT_INDEX) {
+ /* viewport index */
+ type = VGPU10_OPCODE_DCL_INPUT_PS_SGV;
+ interpolationMode = VGPU10_INTERPOLATION_CONSTANT;
+ name = VGPU10_NAME_VIEWPORT_ARRAY_INDEX;
+ mask = VGPU10_OPERAND_4_COMPONENT_MASK_X;
+ }
+ else {
+ /* general fragment input */
+ type = VGPU10_OPCODE_DCL_INPUT_PS;
+ interpolationMode =
translate_interpolation(emit,
emit->info.input_interpolate[i],
emit->info.input_interpolate_loc[i]);
- /* keeps track if flat interpolation mode is being used */
- emit->uses_flat_interp |=
+ /* keeps track if flat interpolation mode is being used */
+ emit->uses_flat_interp = emit->uses_flat_interp ||
(interpolationMode == VGPU10_INTERPOLATION_CONSTANT);
- name = VGPU10_NAME_UNDEFINED;
- }
-
- emit_input_declaration(emit, type,
- VGPU10_OPERAND_TYPE_INPUT,
- VGPU10_OPERAND_INDEX_1D, index, 1,
- name,
- VGPU10_OPERAND_4_COMPONENT,
- VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
- interpolationMode);
+ name = VGPU10_NAME_UNDEFINED;
}
+
+ emit_input_declaration(emit, type,
+ VGPU10_OPERAND_TYPE_INPUT,
+ VGPU10_OPERAND_INDEX_1D, index, 1,
+ name,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ mask,
+ interpolationMode, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
}
- else if (emit->unit == PIPE_SHADER_GEOMETRY) {
+}
- for (i = 0; i < emit->info.num_inputs; i++) {
- enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
- unsigned usage_mask = emit->info.input_usage_mask[i];
- unsigned index = emit->linkage.input_map[i];
- VGPU10_OPCODE_TYPE opcodeType, operandType;
- VGPU10_OPERAND_NUM_COMPONENTS numComp;
- VGPU10_OPERAND_4_COMPONENT_SELECTION_MODE selMode;
- VGPU10_SYSTEM_NAME name;
- VGPU10_OPERAND_INDEX_DIMENSION dim;
-
- if (usage_mask == 0)
- continue; /* register is not actually used */
-
- opcodeType = VGPU10_OPCODE_DCL_INPUT;
- operandType = VGPU10_OPERAND_TYPE_INPUT;
- numComp = VGPU10_OPERAND_4_COMPONENT;
- selMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE;
- name = VGPU10_NAME_UNDEFINED;
- /* all geometry shader inputs are two dimensional except
- * gl_PrimitiveID
+/**
+ * Emit input declarations for vertex shader.
+ */
+static void
+emit_vs_input_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+
+ for (i = 0; i < emit->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
+ unsigned usage_mask = emit->info.input_usage_mask[i];
+ unsigned index = i;
+
+ if (usage_mask == 0)
+ continue; /* register is not actually used */
+
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT,
+ VGPU10_OPERAND_INDEX_1D, index, 1,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED);
+ }
+}
+
+
+/**
+ * Emit input declarations for geometry shader.
+ */
+static void
+emit_gs_input_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+
+ for (i = 0; i < emit->info.num_inputs; i++) {
+ enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
+ unsigned usage_mask = emit->info.input_usage_mask[i];
+ unsigned index = emit->linkage.input_map[i];
+ unsigned opcodeType, operandType;
+ unsigned numComp, selMode;
+ unsigned name;
+ unsigned dim;
+
+ if (usage_mask == 0)
+ continue; /* register is not actually used */
+
+ opcodeType = VGPU10_OPCODE_DCL_INPUT;
+ operandType = VGPU10_OPERAND_TYPE_INPUT;
+ numComp = VGPU10_OPERAND_4_COMPONENT;
+ selMode = VGPU10_OPERAND_4_COMPONENT_MASK_MODE;
+ name = VGPU10_NAME_UNDEFINED;
+
+ /* all geometry shader inputs are two dimensional except
+ * gl_PrimitiveID
+ */
+ dim = VGPU10_OPERAND_INDEX_2D;
+
+ if (semantic_name == TGSI_SEMANTIC_PRIMID) {
+ /* Primitive ID */
+ operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
+ dim = VGPU10_OPERAND_INDEX_0D;
+ numComp = VGPU10_OPERAND_0_COMPONENT;
+ selMode = 0;
+
+ /* also save the register index so we can check for
+          * primitive id when emitting a src register. We need to modify the
+          * operand type and index dimension when emitting the primitive id
+          * src reg.
*/
- dim = VGPU10_OPERAND_INDEX_2D;
+ emit->gs.prim_id_index = i;
+ }
+ else if (semantic_name == TGSI_SEMANTIC_POSITION) {
+ /* vertex position input */
+ opcodeType = VGPU10_OPCODE_DCL_INPUT_SIV;
+ name = VGPU10_NAME_POSITION;
+ }
- if (semantic_name == TGSI_SEMANTIC_PRIMID) {
- /* Primitive ID */
- operandType = VGPU10_OPERAND_TYPE_INPUT_PRIMITIVEID;
- dim = VGPU10_OPERAND_INDEX_0D;
- numComp = VGPU10_OPERAND_0_COMPONENT;
- selMode = 0;
+ emit_input_declaration(emit, opcodeType, operandType,
+ dim, index,
+ emit->gs.input_size,
+ name,
+ numComp, selMode,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+ }
+}
- /* also save the register index so we can check for
- * primitive id when emit src register. We need to modify the
- * operand type, index dimension when emit primitive id src reg.
- */
- emit->gs.prim_id_index = i;
- }
- else if (semantic_name == TGSI_SEMANTIC_POSITION) {
- /* vertex position input */
- opcodeType = VGPU10_OPCODE_DCL_INPUT_SIV;
- name = VGPU10_NAME_POSITION;
- }
- emit_input_declaration(emit, opcodeType, operandType,
- dim, index,
- emit->gs.input_size,
- name,
- numComp, selMode,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
- VGPU10_INTERPOLATION_UNDEFINED);
+/**
+ * Emit input declarations for tessellation control shader.
+ */
+static void
+emit_tcs_input_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ unsigned i;
+ unsigned size = emit->key.tcs.vertices_per_patch;
+ unsigned indicesMask = 0;
+
+ for (i = 0; i < emit->info.num_inputs; i++) {
+ unsigned usage_mask = emit->info.input_usage_mask[i];
+ unsigned index = emit->linkage.input_map[i];
+ enum tgsi_semantic semantic_name = emit->info.input_semantic_name[i];
+ VGPU10_SYSTEM_NAME name = VGPU10_NAME_UNDEFINED;
+ VGPU10_OPERAND_TYPE operandType = VGPU10_OPERAND_TYPE_INPUT;
+ boolean addSignature = TRUE;
+
+ /* indices that are declared */
+ indicesMask |= 1 << index;
+
+ if (semantic_name == TGSI_SEMANTIC_POSITION ||
+ index == emit->linkage.position_index) {
+ /* save the input control point index for later use */
+ emit->tcs.control_point_input_index = i;
+ }
+ else if (usage_mask == 0) {
+ continue; /* register is not actually used */
+ }
+
+ /* input control points in the patch constant phase are emitted in the
+ * vicp register rather than the v register.
+ */
+ if (!emit->tcs.control_point_phase) {
+ operandType = VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT;
+ addSignature = emit->tcs.control_point_out_count == 0;
}
+
+ /* Tessellation control shader inputs are two dimensional.
+ * The array size is determined by the patch vertex count.
+ */
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ operandType,
+ VGPU10_OPERAND_INDEX_2D,
+ index, size, name,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ VGPU10_INTERPOLATION_UNDEFINED,
+ addSignature,
+ map_tgsi_semantic_to_sgn_name(semantic_name));
+
+ }
+
+ if (emit->tcs.control_point_phase) {
+ if (emit->tcs.control_point_input_index == INVALID_INDEX) {
+
+ /* Add input control point declaration if it does not exist */
+ if ((indicesMask & (1 << emit->linkage.position_index)) == 0) {
+ emit->linkage.input_map[emit->linkage.num_inputs] =
+ emit->linkage.position_index;
+ emit->tcs.control_point_input_index = emit->linkage.num_inputs++;
+
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
+ VGPU10_OPERAND_TYPE_INPUT,
+ VGPU10_OPERAND_INDEX_2D,
+ emit->linkage.position_index,
+ emit->key.tcs.vertices_per_patch,
+ VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ VGPU10_INTERPOLATION_UNDEFINED, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_POSITION);
+ }
+ }
+
+ /* Also add an address register for the indirection to the
+ * input control points
+ */
+ emit->tcs.control_point_addr_index = emit->num_address_regs++;
}
- else {
- assert(emit->unit == PIPE_SHADER_VERTEX);
+}
- for (i = 0; i < emit->info.file_max[TGSI_FILE_INPUT] + 1; i++) {
- unsigned usage_mask = emit->info.input_usage_mask[i];
- unsigned index = i;
- if (usage_mask == 0)
- continue; /* register is not actually used */
+static void
+emit_tessfactor_input_declarations(struct svga_shader_emitter_v10 *emit)
+{
- emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT,
- VGPU10_OPERAND_TYPE_INPUT,
- VGPU10_OPERAND_INDEX_1D, index, 1,
- VGPU10_NAME_UNDEFINED,
- VGPU10_OPERAND_4_COMPONENT,
- VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
- VGPU10_INTERPOLATION_UNDEFINED);
+ /* In tcs, tess factors are emitted as extra outputs.
+ * The starting register index for the tess factors is captured
+ * in the compile key.
+ */
+ unsigned inputIndex = emit->key.tes.tessfactor_index;
+
+ if (emit->tes.prim_mode == PIPE_PRIM_QUADS) {
+ if (emit->key.tes.need_tessouter) {
+ emit->tes.outer.in_index = inputIndex;
+ for (int i = 0; i < 4; i++) {
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + i,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_EQ_0_EDGE_TESSFACTOR + i);
+ }
+ }
+
+ if (emit->key.tes.need_tessinner) {
+ emit->tes.inner.in_index = inputIndex;
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_U_INSIDE_TESSFACTOR);
+
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_QUAD_V_INSIDE_TESSFACTOR);
}
}
+ else if (emit->tes.prim_mode == PIPE_PRIM_TRIANGLES) {
+ if (emit->key.tes.need_tessouter) {
+ emit->tes.outer.in_index = inputIndex;
+ for (int i = 0; i < 3; i++) {
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + i,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_U_EQ_0_EDGE_TESSFACTOR + i);
+ }
+ }
- return TRUE;
+ if (emit->key.tes.need_tessinner) {
+ emit->tes.inner.in_index = inputIndex;
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_TRI_INSIDE_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_TRI_INSIDE_TESSFACTOR);
+ }
+ }
+ else if (emit->tes.prim_mode == PIPE_PRIM_LINES) {
+ if (emit->key.tes.need_tessouter) {
+ emit->tes.outer.in_index = inputIndex;
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_LINE_DETAIL_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_LINE_DETAIL_TESSFACTOR);
+
+ emit_tesslevel_declaration(emit, inputIndex++,
+ VGPU10_OPCODE_DCL_INPUT_SIV,
+ VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT,
+ VGPU10_NAME_FINAL_LINE_DENSITY_TESSFACTOR,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_FINAL_LINE_DENSITY_TESSFACTOR);
+ }
+ }
}
/**
- * Emit all output declarations.
+ * Emit input declarations for tessellation evaluation shader.
*/
-static boolean
-emit_output_declarations(struct svga_shader_emitter_v10 *emit)
+static void
+emit_tes_input_declarations(struct svga_shader_emitter_v10 *emit)
{
unsigned i;
- for (i = 0; i < emit->info.num_outputs; i++) {
- /*const unsigned usage_mask = emit->info.output_usage_mask[i];*/
+ for (i = 0; i < emit->info.num_inputs; i++) {
+ unsigned usage_mask = emit->info.input_usage_mask[i];
+ unsigned index = emit->linkage.input_map[i];
+ unsigned size;
const enum tgsi_semantic semantic_name =
- emit->info.output_semantic_name[i];
- const unsigned semantic_index = emit->info.output_semantic_index[i];
- unsigned index = i;
+ emit->info.input_semantic_name[i];
+ SVGA3dDXSignatureSemanticName sgn_name;
+ VGPU10_OPERAND_TYPE operandType;
+ VGPU10_OPERAND_INDEX_DIMENSION dim;
+
+ if (usage_mask == 0)
+ usage_mask = 1; /* at least set usage mask to one */
+
+ if (semantic_name == TGSI_SEMANTIC_PATCH) {
+ operandType = VGPU10_OPERAND_TYPE_INPUT_PATCH_CONSTANT;
+ dim = VGPU10_OPERAND_INDEX_1D;
+ size = 1;
+ sgn_name = map_tgsi_semantic_to_sgn_name(semantic_name);
+ }
+ else {
+ operandType = VGPU10_OPERAND_TYPE_INPUT_CONTROL_POINT;
+ dim = VGPU10_OPERAND_INDEX_2D;
+ size = emit->key.tes.vertices_per_patch;
+ sgn_name = SVGADX_SIGNATURE_SEMANTIC_NAME_UNDEFINED;
+ }
+
+ emit_input_declaration(emit, VGPU10_OPCODE_DCL_INPUT, operandType,
+ dim, index, size, VGPU10_NAME_UNDEFINED,
+ VGPU10_OPERAND_4_COMPONENT,
+ VGPU10_OPERAND_4_COMPONENT_MASK_MODE,
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ VGPU10_INTERPOLATION_UNDEFINED,
+ TRUE, sgn_name);
+ }
- if (emit->unit == PIPE_SHADER_FRAGMENT) {
- if (semantic_name == TGSI_SEMANTIC_COLOR) {
- assert(semantic_index < ARRAY_SIZE(emit->fs.color_out_index));
+ emit_tessfactor_input_declarations(emit);
+}
- emit->fs.color_out_index[semantic_index] = index;
- emit->fs.num_color_outputs = MAX2(emit->fs.num_color_outputs,
- index + 1);
+/**
+ * Emit all input declarations.
+ */
+static boolean
+emit_input_declarations(struct svga_shader_emitter_v10 *emit)
+{
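+   /* An index range declaration is needed when inputs are indirectly addressed. */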
+ emit->index_range.required =
+ emit->info.indirect_files & (1 << TGSI_FILE_INPUT) ? TRUE : FALSE;
- /* The semantic index is the shader's color output/buffer index */
- emit_output_declaration(emit,
- VGPU10_OPCODE_DCL_OUTPUT, semantic_index,
- VGPU10_NAME_UNDEFINED,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
+ switch (emit->unit) {
+ case PIPE_SHADER_FRAGMENT:
+ emit_fs_input_declarations(emit);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ emit_gs_input_declarations(emit);
+ break;
+ case PIPE_SHADER_VERTEX:
+ emit_vs_input_declarations(emit);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ emit_tcs_input_declarations(emit);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ emit_tes_input_declarations(emit);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ //XXX emit_cs_input_declarations(emit);
+ break;
+ default:
+ assert(0);
+ }
- if (semantic_index == 0) {
- if (emit->key.fs.write_color0_to_n_cbufs > 1) {
- /* Emit declarations for the additional color outputs
- * for broadcasting.
- */
- unsigned j;
- for (j = 1; j < emit->key.fs.write_color0_to_n_cbufs; j++) {
- /* Allocate a new output index */
- unsigned idx = emit->info.num_outputs + j - 1;
- emit->fs.color_out_index[j] = idx;
- emit_output_declaration(emit,
- VGPU10_OPCODE_DCL_OUTPUT, idx,
- VGPU10_NAME_UNDEFINED,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
- emit->info.output_semantic_index[idx] = j;
- }
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+ emit->index_range.required = FALSE;
+ return TRUE;
+}
- emit->fs.num_color_outputs =
- emit->key.fs.write_color0_to_n_cbufs;
- }
- }
- else {
- assert(!emit->key.fs.write_color0_to_n_cbufs);
- }
- }
- else if (semantic_name == TGSI_SEMANTIC_POSITION) {
- /* Fragment depth output */
- emit_fragdepth_output_declaration(emit);
- }
- else if (semantic_name == TGSI_SEMANTIC_SAMPLEMASK) {
- /* Fragment depth output */
- emit_samplemask_output_declaration(emit);
- }
- else {
- assert(!"Bad output semantic name");
- }
- }
- else {
- /* VS or GS */
- VGPU10_COMPONENT_NAME name;
- VGPU10_OPCODE_TYPE type;
- unsigned writemask = VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
-
- switch (semantic_name) {
- case TGSI_SEMANTIC_POSITION:
- assert(emit->unit != PIPE_SHADER_FRAGMENT);
- type = VGPU10_OPCODE_DCL_OUTPUT_SIV;
- name = VGPU10_NAME_POSITION;
- /* Save the index of the vertex position output register */
- emit->vposition.out_index = index;
- break;
- case TGSI_SEMANTIC_CLIPDIST:
- type = VGPU10_OPCODE_DCL_OUTPUT_SIV;
- name = VGPU10_NAME_CLIP_DISTANCE;
- /* save the starting index of the clip distance output register */
- if (semantic_index == 0)
- emit->clip_dist_out_index = index;
- writemask = emit->output_usage_mask[index];
- writemask = apply_clip_plane_mask(emit, writemask, semantic_index);
- if (writemask == 0x0) {
- continue; /* discard this do-nothing declaration */
- }
- break;
- case TGSI_SEMANTIC_PRIMID:
- assert(emit->unit == PIPE_SHADER_GEOMETRY);
- type = VGPU10_OPCODE_DCL_OUTPUT_SGV;
- name = VGPU10_NAME_PRIMITIVE_ID;
- break;
- case TGSI_SEMANTIC_LAYER:
- assert(emit->unit == PIPE_SHADER_GEOMETRY);
- type = VGPU10_OPCODE_DCL_OUTPUT_SGV;
- name = VGPU10_NAME_RENDER_TARGET_ARRAY_INDEX;
- break;
- case TGSI_SEMANTIC_CLIPVERTEX:
- type = VGPU10_OPCODE_DCL_OUTPUT;
- name = VGPU10_NAME_UNDEFINED;
- emit->clip_vertex_out_index = index;
- break;
- default:
- /* generic output */
- type = VGPU10_OPCODE_DCL_OUTPUT;
- name = VGPU10_NAME_UNDEFINED;
- }
- emit_output_declaration(emit, type, index, name, writemask);
- }
+/**
+ * Emit all output declarations.
+ */
+static boolean
+emit_output_declarations(struct svga_shader_emitter_v10 *emit)
+{
+ emit->index_range.required =
+ emit->info.indirect_files & (1 << TGSI_FILE_OUTPUT) ? TRUE : FALSE;
+
+ switch (emit->unit) {
+ case PIPE_SHADER_FRAGMENT:
+ emit_fs_output_declarations(emit);
+ break;
+ case PIPE_SHADER_GEOMETRY:
+ emit_gs_output_declarations(emit);
+ break;
+ case PIPE_SHADER_VERTEX:
+ emit_vs_output_declarations(emit);
+ break;
+ case PIPE_SHADER_TESS_CTRL:
+ emit_tcs_output_declarations(emit);
+ break;
+ case PIPE_SHADER_TESS_EVAL:
+ emit_tes_output_declarations(emit);
+ break;
+ case PIPE_SHADER_COMPUTE:
+ //XXX emit_cs_output_declarations(emit);
+ break;
+ default:
+ assert(0);
}
if (emit->vposition.so_index != INVALID_INDEX &&
@@ -2796,7 +5069,9 @@ emit_output_declarations(struct svga_shader_emitter_v10 *emit)
emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT,
emit->vposition.so_index,
VGPU10_NAME_UNDEFINED,
- VGPU10_OPERAND_4_COMPONENT_MASK_ALL);
+ VGPU10_OPERAND_4_COMPONENT_MASK_ALL,
+ TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_POSITION);
}
if (emit->clip_dist_so_index != INVALID_INDEX &&
@@ -2811,22 +5086,56 @@ emit_output_declarations(struct svga_shader_emitter_v10 *emit)
emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT,
emit->clip_dist_so_index,
VGPU10_NAME_UNDEFINED,
- emit->output_usage_mask[emit->clip_dist_out_index]);
+ emit->output_usage_mask[emit->clip_dist_out_index],
+ TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_CLIP_DISTANCE);
if (emit->info.num_written_clipdistance > 4) {
/* for the second clip distance register, each handles 4 planes */
emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT,
emit->clip_dist_so_index + 1,
VGPU10_NAME_UNDEFINED,
- emit->output_usage_mask[emit->clip_dist_out_index+1]);
+ emit->output_usage_mask[emit->clip_dist_out_index+1],
+ TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_CLIP_DISTANCE);
}
}
+ if (emit->index_range.start_index != INVALID_INDEX) {
+ emit_index_range_declaration(emit);
+ }
+ emit->index_range.required = FALSE;
return TRUE;
}
/**
+ * A helper function to create a temporary indexable array
+ * and initialize the corresponding entries in the temp_map array.
+ */
+static void
+create_temp_array(struct svga_shader_emitter_v10 *emit,
+ unsigned arrayID, unsigned first, unsigned count,
+ unsigned startIndex)
+{
+ unsigned i, tempIndex = startIndex;
+
+ emit->num_temp_arrays = MAX2(emit->num_temp_arrays, arrayID + 1);
+ assert(emit->num_temp_arrays <= MAX_TEMP_ARRAYS);
+ emit->num_temp_arrays = MIN2(emit->num_temp_arrays, MAX_TEMP_ARRAYS);
+
+ emit->temp_arrays[arrayID].start = first;
+ emit->temp_arrays[arrayID].size = count;
+
+ /* Fill in the temp_map entries for this temp array */
+ for (i = 0; i < count; i++, tempIndex++) {
+ emit->temp_map[tempIndex].arrayId = arrayID;
+ emit->temp_map[tempIndex].index = i;
+ }
+}
+
+
+/**
* Emit the declaration for the temporary registers.
*/
static boolean
@@ -2844,18 +5153,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
*/
if ((emit->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) &&
emit->num_temp_arrays == 0) {
- unsigned arrayID;
-
- arrayID = 1;
- emit->num_temp_arrays = arrayID + 1;
- emit->temp_arrays[arrayID].start = 0;
- emit->temp_arrays[arrayID].size = total_temps;
-
- /* Fill in the temp_map entries for this temp array */
- for (i = 0; i < total_temps; i++) {
- emit->temp_map[i].arrayId = arrayID;
- emit->temp_map[i].index = i;
- }
+ create_temp_array(emit, 1, 0, total_temps, 0);
}
/* Allocate extra temps for specially-implemented instructions,
@@ -2863,6 +5161,29 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
*/
total_temps += MAX_INTERNAL_TEMPS;
+ /* Allocate extra temps for clip distance or clip vertex.
+ */
+ if (emit->clip_mode == CLIP_DISTANCE) {
+ /* We need to write the clip distance to a temporary register
+ * first. Then it will be copied to the shadow copy for
+ * the clip distance varying variable and stream output purpose.
+ * It will also be copied to the actual CLIPDIST register
+ * according to the enabled clip planes
+ */
+ emit->clip_dist_tmp_index = total_temps++;
+ if (emit->info.num_written_clipdistance > 4)
+ total_temps++; /* second clip register */
+ }
+ else if (emit->clip_mode == CLIP_VERTEX && emit->key.last_vertex_stage) {
+ /* If the current shader is in the last vertex processing stage,
+       * we need to convert the TGSI CLIPVERTEX output to one or more
+ * clip distances. Allocate a temp reg for the clipvertex here.
+ */
+ assert(emit->info.writes_clipvertex > 0);
+ emit->clip_vertex_tmp_index = total_temps;
+ total_temps++;
+ }
+
if (emit->unit == PIPE_SHADER_VERTEX || emit->unit == PIPE_SHADER_GEOMETRY) {
if (emit->vposition.need_prescale || emit->key.vs.undo_viewport ||
emit->key.clip_plane_enable ||
@@ -2871,6 +5192,11 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
total_temps += 1;
}
+ if (emit->vposition.need_prescale) {
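+      /* temp registers to hold the prescale scale and translation factors */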
+ emit->vposition.prescale_scale_index = total_temps++;
+ emit->vposition.prescale_trans_index = total_temps++;
+ }
+
if (emit->unit == PIPE_SHADER_VERTEX) {
unsigned attrib_mask = (emit->key.vs.adjust_attrib_w_1 |
emit->key.vs.adjust_attrib_itof |
@@ -2884,25 +5210,9 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
emit->vs.adjusted_input[index] = total_temps++;
}
}
-
- if (emit->clip_mode == CLIP_DISTANCE) {
- /* We need to write the clip distance to a temporary register
- * first. Then it will be copied to the shadow copy for
- * the clip distance varying variable and stream output purpose.
- * It will also be copied to the actual CLIPDIST register
- * according to the enabled clip planes
- */
- emit->clip_dist_tmp_index = total_temps++;
- if (emit->info.num_written_clipdistance > 4)
- total_temps++; /* second clip register */
- }
- else if (emit->clip_mode == CLIP_VERTEX) {
- /* We need to convert the TGSI CLIPVERTEX output to one or more
- * clip distances. Allocate a temp reg for the clipvertex here.
- */
- assert(emit->info.writes_clipvertex > 0);
- emit->clip_vertex_tmp_index = total_temps;
- total_temps++;
+ else if (emit->unit == PIPE_SHADER_GEOMETRY) {
+ if (emit->key.gs.writes_viewport_index)
+ emit->gs.viewport_index_tmp_index = total_temps++;
}
}
else if (emit->unit == PIPE_SHADER_FRAGMENT) {
@@ -2930,6 +5240,63 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
emit->fs.sample_pos_tmp_index = total_temps++;
}
}
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ if (emit->vposition.need_prescale) {
+ emit->vposition.tmp_index = total_temps++;
+ emit->vposition.prescale_scale_index = total_temps++;
+ emit->vposition.prescale_trans_index = total_temps++;
+ }
+
+ if (emit->tes.inner.tgsi_index) {
+ emit->tes.inner.temp_index = total_temps;
+ total_temps += 1;
+ }
+
+ if (emit->tes.outer.tgsi_index) {
+ emit->tes.outer.temp_index = total_temps;
+ total_temps += 1;
+ }
+ }
+ else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ if (emit->tcs.inner.tgsi_index != INVALID_INDEX) {
+ if (!emit->tcs.control_point_phase) {
+ emit->tcs.inner.temp_index = total_temps;
+ total_temps += 1;
+ }
+ }
+ if (emit->tcs.outer.tgsi_index != INVALID_INDEX) {
+ if (!emit->tcs.control_point_phase) {
+ emit->tcs.outer.temp_index = total_temps;
+ total_temps += 1;
+ }
+ }
+
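+      /* temps for the control point outputs that are read back */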
+ if (emit->tcs.control_point_phase &&
+ emit->info.reads_pervertex_outputs) {
+ emit->tcs.control_point_tmp_index = total_temps;
+ total_temps += emit->tcs.control_point_out_count;
+ }
+ else if (!emit->tcs.control_point_phase &&
+ emit->info.reads_perpatch_outputs) {
+
+ /* If there is indirect access to the patch constant outputs
+ * in the control point phase, then an indexable temporary array
+ * will be created for these patch constant outputs.
+ * Note, indirect access can only be applicable to
+ * patch constant outputs in the control point phase.
+ */
+ if (emit->info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
+ unsigned arrayID =
+ emit->num_temp_arrays ? emit->num_temp_arrays : 1;
+ create_temp_array(emit, arrayID, 0,
+ emit->tcs.patch_generic_out_count, total_temps);
+ }
+ emit->tcs.patch_generic_tmp_index = total_temps;
+ total_temps += emit->tcs.patch_generic_out_count;
+ }
+
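+      /* temp register for the invocation id */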
+ emit->tcs.invocation_id_tmp_index = total_temps++;
+ }
for (i = 0; i < emit->num_address_regs; i++) {
emit->address_reg_index[i] = total_temps++;
@@ -3065,8 +5432,8 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit)
/* Vertex position scale/translation */
if (emit->vposition.need_prescale) {
- emit->vposition.prescale_scale_index = total_consts++;
- emit->vposition.prescale_trans_index = total_consts++;
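+      /* two constant slots (scale and translation) are used per prescale set */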
+ emit->vposition.prescale_cbuf_index = total_consts;
+ total_consts += (2 * emit->vposition.num_prescale);
}
if (emit->unit == PIPE_SHADER_VERTEX) {
@@ -3078,8 +5445,8 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit)
/* user-defined clip planes */
if (emit->key.clip_plane_enable) {
unsigned n = util_bitcount(emit->key.clip_plane_enable);
- assert(emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY);
+ assert(emit->unit != PIPE_SHADER_FRAGMENT &&
+ emit->unit != PIPE_SHADER_COMPUTE);
for (i = 0; i < n; i++) {
emit->clip_plane_const[i] = total_consts++;
}
@@ -3309,34 +5676,48 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit)
return TRUE;
}
+/**
+ * Emit instruction with n=1, 2 or 3 source registers.
+ */
static void
-emit_instruction_op1(struct svga_shader_emitter_v10 *emit,
- VGPU10_OPCODE_TYPE opcode,
+emit_instruction_opn(struct svga_shader_emitter_v10 *emit,
+ unsigned opcode,
const struct tgsi_full_dst_register *dst,
- const struct tgsi_full_src_register *src,
- boolean saturate)
+ const struct tgsi_full_src_register *src1,
+ const struct tgsi_full_src_register *src2,
+ const struct tgsi_full_src_register *src3,
+ boolean saturate, bool precise)
{
begin_emit_instruction(emit);
- emit_opcode(emit, opcode, saturate);
+ emit_opcode_precise(emit, opcode, saturate, precise);
emit_dst_register(emit, dst);
- emit_src_register(emit, src);
+ emit_src_register(emit, src1);
+ if (src2) {
+ emit_src_register(emit, src2);
+ }
+ if (src3) {
+ emit_src_register(emit, src3);
+ }
end_emit_instruction(emit);
}
static void
+emit_instruction_op1(struct svga_shader_emitter_v10 *emit,
+ unsigned opcode,
+ const struct tgsi_full_dst_register *dst,
+ const struct tgsi_full_src_register *src)
+{
+ emit_instruction_opn(emit, opcode, dst, src, NULL, NULL, FALSE, FALSE);
+}
+
+static void
emit_instruction_op2(struct svga_shader_emitter_v10 *emit,
VGPU10_OPCODE_TYPE opcode,
const struct tgsi_full_dst_register *dst,
const struct tgsi_full_src_register *src1,
- const struct tgsi_full_src_register *src2,
- boolean saturate)
+ const struct tgsi_full_src_register *src2)
{
- begin_emit_instruction(emit);
- emit_opcode(emit, opcode, saturate);
- emit_dst_register(emit, dst);
- emit_src_register(emit, src1);
- emit_src_register(emit, src2);
- end_emit_instruction(emit);
+ emit_instruction_opn(emit, opcode, dst, src1, src2, NULL, FALSE, FALSE);
}
static void
@@ -3345,19 +5726,115 @@ emit_instruction_op3(struct svga_shader_emitter_v10 *emit,
const struct tgsi_full_dst_register *dst,
const struct tgsi_full_src_register *src1,
const struct tgsi_full_src_register *src2,
- const struct tgsi_full_src_register *src3,
- boolean saturate)
+ const struct tgsi_full_src_register *src3)
+{
+ emit_instruction_opn(emit, opcode, dst, src1, src2, src3, FALSE, FALSE);
+}
+
+static void
+emit_instruction_op0(struct svga_shader_emitter_v10 *emit,
+ VGPU10_OPCODE_TYPE opcode)
{
begin_emit_instruction(emit);
- emit_opcode(emit, opcode, saturate);
- emit_dst_register(emit, dst);
- emit_src_register(emit, src1);
- emit_src_register(emit, src2);
- emit_src_register(emit, src3);
+ emit_opcode(emit, opcode, FALSE);
end_emit_instruction(emit);
}
/**
+ * Tessellation inner/outer levels need to be stored into their
+ * appropriate registers depending on prim_mode.
+ */
+static void
+store_tesslevels(struct svga_shader_emitter_v10 *emit)
+{
+ int i;
+
+ /* Tessellation levels are required hull shader outputs.
+ * Emit the inner/outer tessellation levels, either from the values
+ * provided in the TCS or from the fallback default value of 1.0.
+ */
+ if (emit->key.tcs.prim_mode == PIPE_PRIM_QUADS) {
+ struct tgsi_full_src_register temp_src;
+
+ if (emit->tcs.inner.tgsi_index != INVALID_INDEX)
+ temp_src = make_src_temp_reg(emit->tcs.inner.temp_index);
+ else
+ temp_src = make_immediate_reg_float(emit, 1.0f);
+
+ for (i = 0; i < 2; i++) {
+ struct tgsi_full_src_register src =
+ scalar_src(&temp_src, TGSI_SWIZZLE_X + i);
+ struct tgsi_full_dst_register dst =
+ make_dst_reg(TGSI_FILE_OUTPUT, emit->tcs.inner.out_index + i);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ }
+
+ if (emit->tcs.outer.tgsi_index != INVALID_INDEX)
+ temp_src = make_src_temp_reg(emit->tcs.outer.temp_index);
+ else
+ temp_src = make_immediate_reg_float(emit, 1.0f);
+
+ for (i = 0; i < 4; i++) {
+ struct tgsi_full_src_register src =
+ scalar_src(&temp_src, TGSI_SWIZZLE_X + i);
+ struct tgsi_full_dst_register dst =
+ make_dst_reg(TGSI_FILE_OUTPUT, emit->tcs.outer.out_index + i);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ }
+ }
+ else if (emit->key.tcs.prim_mode == PIPE_PRIM_TRIANGLES) {
+ struct tgsi_full_src_register temp_src;
+
+ if (emit->tcs.inner.tgsi_index != INVALID_INDEX)
+ temp_src = make_src_temp_reg(emit->tcs.inner.temp_index);
+ else
+ temp_src = make_immediate_reg_float(emit, 1.0f);
+
+ struct tgsi_full_src_register src =
+ scalar_src(&temp_src, TGSI_SWIZZLE_X);
+ struct tgsi_full_dst_register dst =
+ make_dst_reg(TGSI_FILE_OUTPUT, emit->tcs.inner.out_index);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+
+ if (emit->tcs.outer.tgsi_index != INVALID_INDEX)
+ temp_src = make_src_temp_reg(emit->tcs.outer.temp_index);
+ else
+ temp_src = make_immediate_reg_float(emit, 1.0f);
+
+ for (i = 0; i < 3; i++) {
+ struct tgsi_full_src_register src =
+ scalar_src(&temp_src, TGSI_SWIZZLE_X + i);
+ struct tgsi_full_dst_register dst =
+ make_dst_reg(TGSI_FILE_OUTPUT, emit->tcs.outer.out_index + i);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ }
+ }
+ else if (emit->key.tcs.prim_mode == PIPE_PRIM_LINES) {
+ if (emit->tcs.outer.tgsi_index != INVALID_INDEX) {
+ struct tgsi_full_src_register temp_src =
+ make_src_temp_reg(emit->tcs.outer.temp_index);
+ for (i = 0; i < 2; i++) {
+ struct tgsi_full_src_register src =
+ scalar_src(&temp_src, TGSI_SWIZZLE_X + i);
+ struct tgsi_full_dst_register dst =
+ make_dst_reg(TGSI_FILE_OUTPUT,
+ emit->tcs.outer.out_index + i);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ }
+ }
+ }
+ else {
+ debug_printf("Unsupported primitive type");
+ }
+}
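A side-by-side view of the per-domain factor counts written by the MOV loops above may help; this is a minimal C sketch, not part of the patch, with an illustrative helper name.

   /* Illustrative only: outer/inner tess factor counts per domain,
    * matching the loops in store_tesslevels() above. */
   static void
   tess_factor_counts(enum pipe_prim_type prim, unsigned *outer, unsigned *inner)
   {
      switch (prim) {
      case PIPE_PRIM_QUADS:     *outer = 4; *inner = 2; break;
      case PIPE_PRIM_TRIANGLES: *outer = 3; *inner = 1; break;
      case PIPE_PRIM_LINES:     *outer = 2; *inner = 0; break;  /* isolines */
      default:                  *outer = 0; *inner = 0; break;
      }
   }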
+
+
+/**
* Emit the actual clip distance instructions to be used for clipping
* by copying the clip distance from the temporary registers to the
* CLIPDIST registers written with the enabled planes mask.
@@ -3399,7 +5876,7 @@ emit_clip_distance_instructions(struct svga_shader_emitter_v10 *emit)
/* MOV clip_dist_so, tmp_clip_dist */
emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &clip_dist_dst,
- &tmp_clip_dist_src, FALSE);
+ &tmp_clip_dist_src);
/**
* copy those clip distances to enabled clipping planes
@@ -3412,7 +5889,7 @@ emit_clip_distance_instructions(struct svga_shader_emitter_v10 *emit)
/* MOV CLIPDIST, tmp_clip_dist */
emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &clip_dist_dst,
- &tmp_clip_dist_src, FALSE);
+ &tmp_clip_dist_src);
}
/* four clip planes per clip register */
clip_plane_enable >>= 4;
@@ -3434,8 +5911,7 @@ emit_clip_distance_declarations(struct svga_shader_emitter_v10 *emit)
unsigned index = emit->num_outputs;
unsigned plane_mask;
- assert(emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY);
+ assert(emit->unit != PIPE_SHADER_FRAGMENT);
assert(num_clip_planes <= 8);
if (emit->clip_mode != CLIP_LEGACY &&
@@ -3446,6 +5922,10 @@ emit_clip_distance_declarations(struct svga_shader_emitter_v10 *emit)
if (num_clip_planes == 0)
return;
+ /* Convert clip vertex to clip distances only in the last vertex stage */
+ if (!emit->key.last_vertex_stage)
+ return;
+
/* Declare one or two clip output registers. The number of components
* in the mask reflects the number of clip planes. For example, if 5
* clip planes are needed, we'll declare outputs similar to:
@@ -3458,13 +5938,15 @@ emit_clip_distance_declarations(struct svga_shader_emitter_v10 *emit)
if (plane_mask & 0xf) {
unsigned cmask = plane_mask & VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT_SIV, index,
- VGPU10_NAME_CLIP_DISTANCE, cmask);
+ VGPU10_NAME_CLIP_DISTANCE, cmask, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_CLIP_DISTANCE);
emit->num_outputs++;
}
if (plane_mask & 0xf0) {
unsigned cmask = (plane_mask >> 4) & VGPU10_OPERAND_4_COMPONENT_MASK_ALL;
emit_output_declaration(emit, VGPU10_OPCODE_DCL_OUTPUT_SIV, index + 1,
- VGPU10_NAME_CLIP_DISTANCE, cmask);
+ VGPU10_NAME_CLIP_DISTANCE, cmask, TRUE,
+ SVGADX_SIGNATURE_SEMANTIC_NAME_CLIP_DISTANCE);
emit->num_outputs++;
}
}
@@ -3488,7 +5970,8 @@ emit_clip_distance_from_vpos(struct svga_shader_emitter_v10 *emit,
assert(num_clip_planes <= 8);
assert(emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY);
+ emit->unit == PIPE_SHADER_GEOMETRY ||
+ emit->unit == PIPE_SHADER_TESS_EVAL);
for (i = 0; i < num_clip_planes; i++) {
struct tgsi_full_dst_register dst;
@@ -3506,7 +5989,7 @@ emit_clip_distance_from_vpos(struct svga_shader_emitter_v10 *emit,
/* DP4 clip_dist, plane, vpos */
emit_instruction_op2(emit, VGPU10_OPCODE_DP4, &dst,
- &plane_src, &vpos_src, FALSE);
+ &plane_src, &vpos_src);
}
}
@@ -3527,7 +6010,8 @@ emit_clip_vertex_instructions(struct svga_shader_emitter_v10 *emit)
const unsigned clip_vertex_tmp = emit->clip_vertex_tmp_index;
assert(emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY);
+ emit->unit == PIPE_SHADER_GEOMETRY ||
+ emit->unit == PIPE_SHADER_TESS_EVAL);
assert(emit->clip_mode == CLIP_VERTEX);
@@ -3547,7 +6031,7 @@ emit_clip_vertex_instructions(struct svga_shader_emitter_v10 *emit)
/* DP4 clip_dist, plane, vpos */
emit_instruction_op2(emit, VGPU10_OPCODE_DP4, &dst,
- &plane_src, &clipvert_src, FALSE);
+ &plane_src, &clipvert_src);
}
/* copy temporary clip vertex register to the clip vertex register */
@@ -3564,7 +6048,7 @@ emit_clip_vertex_instructions(struct svga_shader_emitter_v10 *emit)
/* MOV clip_vertex, clip_vertex_tmp */
dst = make_dst_reg(TGSI_FILE_OUTPUT, emit->clip_vertex_out_index);
emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &dst, &clipvert_src, FALSE);
+ &dst, &clipvert_src);
/**
* set the temporary clip vertex register index back to the
@@ -3613,20 +6097,18 @@ emit_puint_to_snorm(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_src_register bias_src = make_src_temp_reg(bias_tmp);
/* val = src * 2.0 */
- emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &val_dst,
- src, &two, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &val_dst, src, &two);
/* bias = src > 0.5 */
- emit_instruction_op2(emit, VGPU10_OPCODE_GE, &bias_dst,
- src, &half, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_GE, &bias_dst, src, &half);
/* bias = bias & -2.0 */
emit_instruction_op2(emit, VGPU10_OPCODE_AND, &bias_dst,
- &bias_src, &neg_two, FALSE);
+ &bias_src, &neg_two);
/* dst = val + bias */
emit_instruction_op2(emit, VGPU10_OPCODE_ADD, dst,
- &val_src, &bias_src, FALSE);
+ &val_src, &bias_src);
free_temp_indexes(emit);
}
@@ -3642,7 +6124,7 @@ emit_puint_to_uscaled(struct svga_shader_emitter_v10 *emit,
make_immediate_reg_float4(emit, 1023.0f, 1023.0f, 1023.0f, 3.0f);
/* dst = src * scale */
- emit_instruction_op2(emit, VGPU10_OPCODE_MUL, dst, src, &scale, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_MUL, dst, src, &scale);
}
@@ -3671,10 +6153,10 @@ emit_puint_to_sscaled(struct svga_shader_emitter_v10 *emit,
* dst = i_to_f(r,g,b,a); # convert to float
*/
emit_instruction_op2(emit, VGPU10_OPCODE_ISHL, &tmp_dst,
- &src_xxxx, &lshift, FALSE);
+ &src_xxxx, &lshift);
emit_instruction_op2(emit, VGPU10_OPCODE_ISHR, &tmp_dst,
- &tmp_src, &rshift, FALSE);
- emit_instruction_op1(emit, VGPU10_OPCODE_ITOF, dst, &tmp_src, FALSE);
+ &tmp_src, &rshift);
+ emit_instruction_op1(emit, VGPU10_OPCODE_ITOF, dst, &tmp_src);
free_temp_indexes(emit);
}
@@ -3693,6 +6175,7 @@ emit_arl_uarl(struct svga_shader_emitter_v10 *emit,
assert(index < MAX_VGPU10_ADDR_REGS);
dst = make_dst_temp_reg(emit->address_reg_index[index]);
+ dst = writemask_dst(&dst, inst->Dst[0].Register.WriteMask);
/* ARL dst, s0
* Translates into:
@@ -3707,7 +6190,7 @@ emit_arl_uarl(struct svga_shader_emitter_v10 *emit,
else
opcode = VGPU10_OPCODE_MOV;
- emit_instruction_op1(emit, opcode, &dst, &inst->Src[0], FALSE);
+ emit_instruction_op1(emit, opcode, &dst, &inst->Src[0]);
return TRUE;
}
@@ -3751,7 +6234,7 @@ emit_iabs(struct svga_shader_emitter_v10 *emit,
*/
struct tgsi_full_src_register neg_src = negate_src(&inst->Src[0]);
emit_instruction_op2(emit, VGPU10_OPCODE_IMAX, &inst->Dst[0],
- &inst->Src[0], &neg_src, FALSE);
+ &inst->Src[0], &neg_src);
return TRUE;
}
@@ -3778,11 +6261,12 @@ emit_cmp(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
- emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp_dst,
- &inst->Src[0], &zero, FALSE);
- emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0],
+ emit_instruction_opn(emit, VGPU10_OPCODE_LT, &tmp_dst,
+ &inst->Src[0], &zero, NULL, FALSE,
+ inst->Instruction.Precise);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0],
&tmp_src, &inst->Src[1], &inst->Src[2],
- inst->Instruction.Saturate);
+ inst->Instruction.Saturate, FALSE);
free_temp_indexes(emit);
@@ -3827,7 +6311,7 @@ emit_dst(struct svga_shader_emitter_v10 *emit,
writemask_dst(&move_dst, TGSI_WRITEMASK_X);
struct tgsi_full_src_register one = make_immediate_reg_float(emit, 1.0f);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_x, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_x, &one);
}
/* MUL dst.y, s0.y, s1.y */
@@ -3835,8 +6319,9 @@ emit_dst(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_y =
writemask_dst(&move_dst, TGSI_WRITEMASK_Y);
- emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &dst_y, &s0_yyyy,
- &s1_yyyy, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MUL, &dst_y, &s0_yyyy,
+ &s1_yyyy, NULL, inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
/* MOV dst.z, s0.z */
@@ -3844,8 +6329,10 @@ emit_dst(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_z =
writemask_dst(&move_dst, TGSI_WRITEMASK_Z);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_z, &s0_zzzz,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &dst_z, &s0_zzzz, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
/* MOV dst.w, s1.w */
@@ -3853,18 +6340,30 @@ emit_dst(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_w =
writemask_dst(&move_dst, TGSI_WRITEMASK_W);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &s1_wwww,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &dst_w, &s1_wwww, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src);
free_temp_indexes(emit);
return TRUE;
}
+/**
+ * A helper function to return the stream index as specified in
+ * the immediate register
+ */
+static inline unsigned
+find_stream_index(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_src_register *src)
+{
+ return emit->immediates[src->Register.Index][src->Register.SwizzleX].Int;
+}
+
/**
* Emit code for TGSI_OPCODE_ENDPRIM (GS only)
@@ -3875,11 +6374,25 @@ emit_endprim(struct svga_shader_emitter_v10 *emit,
{
assert(emit->unit == PIPE_SHADER_GEOMETRY);
- /* We can't use emit_simple() because the TGSI instruction has one
- * operand (vertex stream number) which we must ignore for VGPU10.
- */
begin_emit_instruction(emit);
- emit_opcode(emit, VGPU10_OPCODE_CUT, FALSE);
+ if (emit->version >= 50) {
+ unsigned streamIndex = find_stream_index(emit, &inst->Src[0]);
+
+ if (emit->info.num_stream_output_components[streamIndex] == 0) {
+ /**
+ * If there is no output for this stream, discard this instruction.
+ */
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_CUT_STREAM, FALSE);
+ assert(inst->Src[0].Register.File == TGSI_FILE_IMMEDIATE);
+ emit_stream_register(emit, streamIndex);
+ }
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_CUT, FALSE);
+ }
end_emit_instruction(emit);
return TRUE;
}
@@ -3904,8 +6417,10 @@ emit_ex2(struct svga_shader_emitter_v10 *emit,
TGSI_SWIZZLE_X, TGSI_SWIZZLE_X);
/* EXP tmp, s0.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &inst->Dst[0], &src_xxxx,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_EXP, &inst->Dst[0], &src_xxxx,
+ NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
return TRUE;
}
@@ -3945,15 +6460,17 @@ emit_exp(struct svga_shader_emitter_v10 *emit,
/* ROUND_NI tmp.x, s0.x */
emit_instruction_op1(emit, VGPU10_OPCODE_ROUND_NI, &tmp_dst,
- &src_xxxx, FALSE); /* round to -infinity */
+ &src_xxxx); /* round to -infinity */
/* EXP dst.x, tmp.x */
if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
struct tgsi_full_dst_register dst_x =
writemask_dst(&move_dst, TGSI_WRITEMASK_X);
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &dst_x, &tmp_src,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_EXP, &dst_x, &tmp_src,
+ NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
/* ADD dst.y, s0.x, -tmp */
@@ -3962,8 +6479,10 @@ emit_exp(struct svga_shader_emitter_v10 *emit,
writemask_dst(&move_dst, TGSI_WRITEMASK_Y);
struct tgsi_full_src_register neg_tmp_src = negate_src(&tmp_src);
- emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &dst_y, &src_xxxx,
- &neg_tmp_src, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_ADD, &dst_y, &src_xxxx,
+ &neg_tmp_src, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
/* EXP dst.z, s0.x */
@@ -3971,8 +6490,10 @@ emit_exp(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_z =
writemask_dst(&move_dst, TGSI_WRITEMASK_Z);
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &dst_z, &src_xxxx,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_EXP, &dst_z, &src_xxxx,
+ NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
}
/* MOV dst.w, 1.0 */
@@ -3981,12 +6502,10 @@ emit_exp(struct svga_shader_emitter_v10 *emit,
writemask_dst(&move_dst, TGSI_WRITEMASK_W);
struct tgsi_full_src_register one = make_immediate_reg_float(emit, 1.0f);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one);
}
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src);
free_temp_indexes(emit);
@@ -3999,14 +6518,14 @@ emit_exp(struct svga_shader_emitter_v10 *emit,
*/
static boolean
emit_if(struct svga_shader_emitter_v10 *emit,
- const struct tgsi_full_instruction *inst)
+ const struct tgsi_full_src_register *src)
{
VGPU10OpcodeToken0 opcode0;
/* The src register should be a scalar */
- assert(inst->Src[0].Register.SwizzleX == inst->Src[0].Register.SwizzleY &&
- inst->Src[0].Register.SwizzleX == inst->Src[0].Register.SwizzleZ &&
- inst->Src[0].Register.SwizzleX == inst->Src[0].Register.SwizzleW);
+ assert(src->Register.SwizzleX == src->Register.SwizzleY &&
+ src->Register.SwizzleX == src->Register.SwizzleZ &&
+ src->Register.SwizzleX == src->Register.SwizzleW);
/* The only special thing here is that we need to set the
* VGPU10_INSTRUCTION_TEST_NONZERO flag since we want to test if
@@ -4018,7 +6537,7 @@ emit_if(struct svga_shader_emitter_v10 *emit,
begin_emit_instruction(emit);
emit_dword(emit, opcode0.value);
- emit_src_register(emit, &inst->Src[0]);
+ emit_src_register(emit, src);
end_emit_instruction(emit);
return TRUE;
@@ -4045,8 +6564,7 @@ emit_kill_if(struct svga_shader_emitter_v10 *emit,
scalar_src(&tmp_src, TGSI_SWIZZLE_X);
/* tmp = src[0] < 0.0 */
- emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp_dst, &inst->Src[0],
- &zero, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp_dst, &inst->Src[0], &zero);
if (!same_swizzle_terms(&inst->Src[0])) {
/* If the swizzle is not XXXX, YYYY, ZZZZ or WWWW we need to
@@ -4061,11 +6579,11 @@ emit_kill_if(struct svga_shader_emitter_v10 *emit,
scalar_src(&tmp_src, TGSI_SWIZZLE_W);
emit_instruction_op2(emit, VGPU10_OPCODE_OR, &tmp_dst_x, &tmp_src_xxxx,
- &tmp_src_yyyy, FALSE);
+ &tmp_src_yyyy);
emit_instruction_op2(emit, VGPU10_OPCODE_OR, &tmp_dst_x, &tmp_src_xxxx,
- &tmp_src_zzzz, FALSE);
+ &tmp_src_zzzz);
emit_instruction_op2(emit, VGPU10_OPCODE_OR, &tmp_dst_x, &tmp_src_xxxx,
- &tmp_src_wwww, FALSE);
+ &tmp_src_wwww);
}
begin_emit_instruction(emit);
@@ -4117,8 +6635,10 @@ emit_lg2(struct svga_shader_emitter_v10 *emit,
TGSI_SWIZZLE_X, TGSI_SWIZZLE_X);
/* LOG tmp, s0.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &inst->Dst[0], &src_xxxx,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_LOG,
+ &inst->Dst[0], &src_xxxx, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
return TRUE;
}
@@ -4152,14 +6672,14 @@ emit_lit(struct svga_shader_emitter_v10 *emit,
if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
struct tgsi_full_dst_register dst_x =
writemask_dst(&move_dst, TGSI_WRITEMASK_X);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_x, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_x, &one);
}
/* MOV dst.w, 1.0 */
if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
struct tgsi_full_dst_register dst_w =
writemask_dst(&move_dst, TGSI_WRITEMASK_W);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one);
}
/* MAX dst.y, src.x, 0.0 */
@@ -4172,8 +6692,8 @@ emit_lit(struct svga_shader_emitter_v10 *emit,
swizzle_src(&inst->Src[0], TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
TGSI_SWIZZLE_X, TGSI_SWIZZLE_X);
- emit_instruction_op2(emit, VGPU10_OPCODE_MAX, &dst_y, &src_xxxx,
- &zero, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MAX, &dst_y, &src_xxxx,
+ &zero, NULL, inst->Instruction.Saturate, FALSE);
}
/*
@@ -4223,42 +6743,37 @@ emit_lit(struct svga_shader_emitter_v10 *emit,
make_immediate_reg_float(emit, 128.0f);
emit_instruction_op2(emit, VGPU10_OPCODE_MAX, &tmp1_dst, &src_wwww,
- &lowerbound, FALSE);
+ &lowerbound);
emit_instruction_op2(emit, VGPU10_OPCODE_MIN, &tmp1_dst, &tmp1_src,
- &upperbound, FALSE);
+ &upperbound);
emit_instruction_op2(emit, VGPU10_OPCODE_MAX, &tmp2_dst, &src_yyyy,
- &zero, FALSE);
+ &zero);
/* POW tmp1, tmp2, tmp1 */
/* LOG tmp2, tmp2 */
- emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &tmp2_dst, &tmp2_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &tmp2_dst, &tmp2_src);
/* MUL tmp1, tmp2, tmp1 */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp1_dst, &tmp2_src,
- &tmp1_src, FALSE);
+ &tmp1_src);
/* EXP tmp1, tmp1 */
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &tmp1_dst, &tmp1_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &tmp1_dst, &tmp1_src);
/* EQ tmp2, 0, src.w */
- emit_instruction_op2(emit, VGPU10_OPCODE_EQ, &tmp2_dst, &zero,
- &src_wwww, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_EQ, &tmp2_dst, &zero, &src_wwww);
/* MOVC tmp1.z, tmp2, tmp1, 1.0 */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &tmp1_dst,
- &tmp2_src, &one, &tmp1_src, FALSE);
+ &tmp2_src, &one, &tmp1_src);
/* LT tmp2, 0, src.x */
- emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp2_dst, &zero,
- &src_xxxx, FALSE);
+ emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp2_dst, &zero, &src_xxxx);
/* MOVC dst.z, tmp2, tmp1, 0.0 */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &dst_z,
- &tmp2_src, &tmp1_src, &zero, FALSE);
+ &tmp2_src, &tmp1_src, &zero);
}
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &move_src);
free_temp_indexes(emit);
return TRUE;
@@ -4316,8 +6831,7 @@ emit_log(struct svga_shader_emitter_v10 *emit,
/* LOG tmp.x, abs(s0.x) */
if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
- emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &tmp_dst,
- &abs_src_xxxx, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &tmp_dst, &abs_src_xxxx);
}
/* MOV dst.z, tmp.x */
@@ -4325,14 +6839,14 @@ emit_log(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_z =
writemask_dst(&inst->Dst[0], TGSI_WRITEMASK_Z);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_z,
- &tmp_src, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &dst_z, &tmp_src, NULL, NULL,
+ inst->Instruction.Saturate, FALSE);
}
/* FLR tmp.x, tmp.x */
if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) {
- emit_instruction_op1(emit, VGPU10_OPCODE_ROUND_NI, &tmp_dst,
- &tmp_src, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_ROUND_NI, &tmp_dst, &tmp_src);
}
/* MOV dst.x, tmp.x */
@@ -4340,8 +6854,9 @@ emit_log(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_x =
writemask_dst(&inst->Dst[0], TGSI_WRITEMASK_X);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_x, &tmp_src,
- inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &dst_x, &tmp_src, NULL, NULL,
+ inst->Instruction.Saturate, FALSE);
}
/* EXP tmp.x, tmp.x */
@@ -4350,10 +6865,9 @@ emit_log(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register dst_y =
writemask_dst(&inst->Dst[0], TGSI_WRITEMASK_Y);
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &tmp_dst, &tmp_src,
- FALSE);
- emit_instruction_op2(emit, VGPU10_OPCODE_DIV, &dst_y, &abs_src_xxxx,
- &tmp_src, inst->Instruction.Saturate);
+ emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &tmp_dst, &tmp_src);
+ emit_instruction_opn(emit, VGPU10_OPCODE_DIV, &dst_y, &abs_src_xxxx,
+ &tmp_src, NULL, inst->Instruction.Saturate, FALSE);
}
/* MOV dst.w, 1.0 */
@@ -4363,7 +6877,7 @@ emit_log(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_src_register one =
make_immediate_reg_float(emit, 1.0f);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst_w, &one);
}
free_temp_indexes(emit);
@@ -4391,13 +6905,15 @@ emit_lrp(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_src_register neg_src2 = negate_src(&inst->Src[2]);
/* ADD tmp, s1, -s2 */
- emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &dst_tmp,
- &inst->Src[1], &neg_src2, FALSE);
+ emit_instruction_opn(emit, VGPU10_OPCODE_ADD, &dst_tmp,
+ &inst->Src[1], &neg_src2, NULL, FALSE,
+ inst->Instruction.Precise);
/* MAD dst, s1, tmp, s3 */
- emit_instruction_op3(emit, VGPU10_OPCODE_MAD, &inst->Dst[0],
+ emit_instruction_opn(emit, VGPU10_OPCODE_MAD, &inst->Dst[0],
&inst->Src[0], &src_tmp, &inst->Src[2],
- inst->Instruction.Saturate);
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
free_temp_indexes(emit);
@@ -4429,16 +6945,20 @@ emit_pow(struct svga_shader_emitter_v10 *emit,
TGSI_SWIZZLE_X, TGSI_SWIZZLE_X);
/* LOG tmp, s0.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_LOG, &tmp_dst, &src0_xxxx,
- FALSE);
+ emit_instruction_opn(emit, VGPU10_OPCODE_LOG,
+ &tmp_dst, &src0_xxxx, NULL, NULL,
+ FALSE, inst->Instruction.Precise);
/* MUL tmp, tmp, s1.xxxx */
- emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst, &tmp_src,
- &src1_xxxx, FALSE);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MUL,
+ &tmp_dst, &tmp_src, &src1_xxxx, NULL,
+ FALSE, inst->Instruction.Precise);
/* EXP tmp, s0.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_EXP, &inst->Dst[0],
- &tmp_src, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_EXP,
+ &inst->Dst[0], &tmp_src, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
/* free tmp */
free_temp_indexes(emit);
@@ -4454,26 +6974,49 @@ static boolean
emit_rcp(struct svga_shader_emitter_v10 *emit,
const struct tgsi_full_instruction *inst)
{
- struct tgsi_full_src_register one = make_immediate_reg_float(emit, 1.0f);
+ if (emit->version >= 50) {
+ /* Use the SM5 RCP instruction. But VGPU10_OPCODE_RCP is component-wise,
+ * while TGSI_OPCODE_RCP computes dst.xyzw = 1.0 / src.xxxx, so we need
+ * to replicate the X component across the src register's swizzle.
+ */
+ struct tgsi_full_src_register src = inst->Src[0];
+ src.Register.SwizzleY =
+ src.Register.SwizzleZ =
+ src.Register.SwizzleW = src.Register.SwizzleX;
- unsigned tmp = get_temp_index(emit);
- struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
- struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
+ begin_emit_instruction(emit);
+ emit_opcode_precise(emit, VGPU10_OPCODE_RCP,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
+ emit_dst_register(emit, &inst->Dst[0]);
+ emit_src_register(emit, &src);
+ end_emit_instruction(emit);
+ }
+ else {
+ struct tgsi_full_src_register one = make_immediate_reg_float(emit, 1.0f);
- struct tgsi_full_dst_register tmp_dst_x =
- writemask_dst(&tmp_dst, TGSI_WRITEMASK_X);
- struct tgsi_full_src_register tmp_src_xxxx =
- scalar_src(&tmp_src, TGSI_SWIZZLE_X);
+ unsigned tmp = get_temp_index(emit);
+ struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
+ struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
- /* DIV tmp.x, 1.0, s0 */
- emit_instruction_op2(emit, VGPU10_OPCODE_DIV, &tmp_dst_x, &one,
- &inst->Src[0], FALSE);
+ struct tgsi_full_dst_register tmp_dst_x =
+ writemask_dst(&tmp_dst, TGSI_WRITEMASK_X);
+ struct tgsi_full_src_register tmp_src_xxxx =
+ scalar_src(&tmp_src, TGSI_SWIZZLE_X);
- /* MOV dst, tmp.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0],
- &tmp_src_xxxx, inst->Instruction.Saturate);
+ /* DIV tmp.x, 1.0, s0 */
+ emit_instruction_opn(emit, VGPU10_OPCODE_DIV,
+ &tmp_dst_x, &one, &inst->Src[0], NULL,
+ FALSE, inst->Instruction.Precise);
- free_temp_indexes(emit);
+ /* MOV dst, tmp.xxxx */
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &inst->Dst[0], &tmp_src_xxxx, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
+
+ free_temp_indexes(emit);
+ }
return TRUE;
}
@@ -4503,12 +7046,15 @@ emit_rsq(struct svga_shader_emitter_v10 *emit,
scalar_src(&tmp_src, TGSI_SWIZZLE_X);
/* RSQ tmp, src.x */
- emit_instruction_op1(emit, VGPU10_OPCODE_RSQ, &tmp_dst_x,
- &inst->Src[0], FALSE);
+ emit_instruction_opn(emit, VGPU10_OPCODE_RSQ,
+ &tmp_dst_x, &inst->Src[0], NULL, NULL,
+ FALSE, inst->Instruction.Precise);
/* MOV dst, tmp.xxxx */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0],
- &tmp_src_xxxx, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &inst->Dst[0], &tmp_src_xxxx, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
/* free tmp */
free_temp_indexes(emit);
@@ -4538,11 +7084,11 @@ emit_seq(struct svga_shader_emitter_v10 *emit,
/* EQ tmp, s0, s1 */
emit_instruction_op2(emit, VGPU10_OPCODE_EQ, &tmp_dst, &inst->Src[0],
- &inst->Src[1], FALSE);
+ &inst->Src[1]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4571,11 +7117,11 @@ emit_sge(struct svga_shader_emitter_v10 *emit,
/* GE tmp, s0, s1 */
emit_instruction_op2(emit, VGPU10_OPCODE_GE, &tmp_dst, &inst->Src[0],
- &inst->Src[1], FALSE);
+ &inst->Src[1]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4604,11 +7150,11 @@ emit_sgt(struct svga_shader_emitter_v10 *emit,
/* LT tmp, s1, s0 */
emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp_dst, &inst->Src[1],
- &inst->Src[0], FALSE);
+ &inst->Src[0]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4648,8 +7194,10 @@ emit_sincos(struct svga_shader_emitter_v10 *emit,
emit_src_register(emit, &inst->Src[0]);
end_emit_instruction(emit);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0],
- &tmp_src_xxxx, inst->Instruction.Saturate);
+ emit_instruction_opn(emit, VGPU10_OPCODE_MOV,
+ &inst->Dst[0], &tmp_src_xxxx, NULL, NULL,
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
free_temp_indexes(emit);
@@ -4678,11 +7226,11 @@ emit_sle(struct svga_shader_emitter_v10 *emit,
/* GE tmp, s1, s0 */
emit_instruction_op2(emit, VGPU10_OPCODE_GE, &tmp_dst, &inst->Src[1],
- &inst->Src[0], FALSE);
+ &inst->Src[0]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4711,11 +7259,11 @@ emit_slt(struct svga_shader_emitter_v10 *emit,
/* LT tmp, s0, s1 */
emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp_dst, &inst->Src[0],
- &inst->Src[1], FALSE);
+ &inst->Src[1]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4744,11 +7292,11 @@ emit_sne(struct svga_shader_emitter_v10 *emit,
/* NE tmp, s0, s1 */
emit_instruction_op2(emit, VGPU10_OPCODE_NE, &tmp_dst, &inst->Src[0],
- &inst->Src[1], FALSE);
+ &inst->Src[1]);
/* MOVC dst, tmp, one, zero */
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp_src,
- &one, &zero, FALSE);
+ &one, &zero);
free_temp_indexes(emit);
@@ -4789,13 +7337,13 @@ emit_ssg(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_dst_register tmp2_dst = make_dst_temp_reg(tmp2);
emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp1_dst, &inst->Src[0],
- &zero, FALSE);
+ &zero);
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &tmp2_dst, &tmp1_src,
- &neg_one, &zero, FALSE);
+ &neg_one, &zero);
emit_instruction_op2(emit, VGPU10_OPCODE_LT, &tmp1_dst, &zero,
- &inst->Src[0], FALSE);
+ &inst->Src[0]);
emit_instruction_op3(emit, VGPU10_OPCODE_MOVC, &inst->Dst[0], &tmp1_src,
- &one, &tmp2_src, FALSE);
+ &one, &tmp2_src);
free_temp_indexes(emit);
@@ -4832,11 +7380,11 @@ emit_issg(struct svga_shader_emitter_v10 *emit,
struct tgsi_full_src_register neg_tmp2 = negate_src(&tmp2_src);
emit_instruction_op2(emit, VGPU10_OPCODE_ILT, &tmp1_dst,
- &inst->Src[0], &zero, FALSE);
+ &inst->Src[0], &zero);
emit_instruction_op2(emit, VGPU10_OPCODE_ILT, &tmp2_dst,
- &zero, &inst->Src[0], FALSE);
+ &zero, &inst->Src[0]);
emit_instruction_op2(emit, VGPU10_OPCODE_IADD, &inst->Dst[0],
- &tmp1_src, &neg_tmp2, FALSE);
+ &tmp1_src, &neg_tmp2);
free_temp_indexes(emit);
@@ -4982,15 +7530,15 @@ setup_texcoord(struct svga_shader_emitter_v10 *emit,
/* ADD tmp, coord, offset */
emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp_dst,
- coord, &offset, FALSE);
+ coord, &offset);
/* MUL tmp, tmp, scale */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst,
- &tmp_src, &scale_src, FALSE);
+ &tmp_src, &scale_src);
}
else {
/* MUL tmp, coord, const[] */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_dst,
- coord, &scale_src, FALSE);
+ coord, &scale_src);
}
return tmp_src;
}
@@ -5118,8 +7666,6 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
/* convert gallium comparison func to SVGA comparison func */
SVGA3dCmpFunc compare_func = emit->key.tex[swz->unit].compare_func + 1;
- assert(emit->unit == PIPE_SHADER_FRAGMENT);
-
int component =
tgsi_util_get_shadow_ref_src_index(swz->texture_target) % 4;
assert(component >= 0);
@@ -5161,7 +7707,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
/* MOV dst, color(tmp).<swizzle> */
emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- swz->inst_dst, &src_swizzled, FALSE);
+ swz->inst_dst, &src_swizzled);
/* handle swizzle zero terms */
writemask_0 = (((swz_r == PIPE_SWIZZLE_0) << 0) |
@@ -5178,8 +7724,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
writemask_dst(swz->inst_dst, writemask_0);
/* MOV dst.writemask_0, {0,0,0,0} */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &dst, &zero, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &zero);
}
/* handle swizzle one terms */
@@ -5197,7 +7742,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
writemask_dst(swz->inst_dst, writemask_1);
/* MOV dst.writemask_1, {1,1,1,1} */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &one);
}
}
}
@@ -5339,45 +7884,160 @@ emit_tg4(struct svga_shader_emitter_v10 *emit,
{
const uint unit = inst->Src[2].Register.Index;
struct tgsi_full_src_register src;
+ struct tgsi_full_src_register offset_src, sampler, ref;
int offsets[3];
/* check that the sampler returns a float */
if (!is_valid_tex_instruction(emit, inst))
return TRUE;
- /* Only a single channel is supported in SM4_1 and we report
- * PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS = 1.
- * Only the 0th component will be gathered.
- */
- switch (emit->key.tex[unit].swizzle_r) {
- case PIPE_SWIZZLE_X:
- get_texel_offsets(emit, inst, offsets);
+ if (emit->version >= 50) {
+ unsigned target = inst->Texture.Texture;
+ int index = inst->Src[1].Register.Index;
+ const union tgsi_immediate_data *imm = emit->immediates[index];
+ int select_comp = imm[inst->Src[1].Register.SwizzleX].Int;
+ unsigned select_swizzle = PIPE_SWIZZLE_X;
+
+ if (!tgsi_is_shadow_target(target)) {
+ switch (select_comp) {
+ case 0:
+ select_swizzle = emit->key.tex[unit].swizzle_r;
+ break;
+ case 1:
+ select_swizzle = emit->key.tex[unit].swizzle_g;
+ break;
+ case 2:
+ select_swizzle = emit->key.tex[unit].swizzle_b;
+ break;
+ case 3:
+ select_swizzle = emit->key.tex[unit].swizzle_a;
+ break;
+ default:
+ assert(!"Unexpected component in texture gather swizzle");
+ }
+ }
+ else {
+ select_swizzle = emit->key.tex[unit].swizzle_r;
+ }
+
+ if (select_swizzle == PIPE_SWIZZLE_1) {
+ src = make_immediate_reg_float(emit, 1.0);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &src);
+ return TRUE;
+ }
+ else if (select_swizzle == PIPE_SWIZZLE_0) {
+ src = make_immediate_reg_float(emit, 0.0);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &src);
+ return TRUE;
+ }
+
src = setup_texcoord(emit, unit, &inst->Src[0]);
- /* Gather dst, coord, resource, sampler */
+ /* GATHER4 dst, coord, resource, sampler */
+ /* GATHER4_C dst, coord, resource, sampler ref */
+ /* GATHER4_PO dst, coord, offset resource, sampler */
+ /* GATHER4_PO_C dst, coord, offset resource, sampler, ref */
begin_emit_instruction(emit);
- emit_sample_opcode(emit, VGPU10_OPCODE_GATHER4,
- inst->Instruction.Saturate, offsets);
+ if (inst->Texture.NumOffsets == 1) {
+ if (tgsi_is_shadow_target(target)) {
+ emit_opcode(emit, VGPU10_OPCODE_GATHER4_PO_C,
+ inst->Instruction.Saturate);
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_GATHER4_PO,
+ inst->Instruction.Saturate);
+ }
+ }
+ else {
+ if (tgsi_is_shadow_target(target)) {
+ emit_opcode(emit, VGPU10_OPCODE_GATHER4_C,
+ inst->Instruction.Saturate);
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_GATHER4,
+ inst->Instruction.Saturate);
+ }
+ }
+
emit_dst_register(emit, &inst->Dst[0]);
emit_src_register(emit, &src);
+ if (inst->Texture.NumOffsets == 1) {
+ /* offset */
+ offset_src = make_src_reg(inst->TexOffsets[0].File,
+ inst->TexOffsets[0].Index);
+ offset_src = swizzle_src(&offset_src, inst->TexOffsets[0].SwizzleX,
+ inst->TexOffsets[0].SwizzleY,
+ inst->TexOffsets[0].SwizzleZ,
+ TGSI_SWIZZLE_W);
+ emit_src_register(emit, &offset_src);
+ }
+
+ /* resource */
emit_resource_register(emit, unit);
- emit_sampler_register(emit, unit);
+
+ /* sampler */
+ sampler = make_src_reg(TGSI_FILE_SAMPLER, unit);
+ sampler.Register.SwizzleX =
+ sampler.Register.SwizzleY =
+ sampler.Register.SwizzleZ =
+ sampler.Register.SwizzleW = select_swizzle;
+ emit_src_register(emit, &sampler);
+
+ if (tgsi_is_shadow_target(target)) {
+ /* ref */
+ if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
+ ref = scalar_src(&inst->Src[1], TGSI_SWIZZLE_X);
+ emit_tex_compare_refcoord(emit, target, &ref);
+ }
+ else {
+ emit_tex_compare_refcoord(emit, target, &src);
+ }
+ }
+
end_emit_instruction(emit);
- break;
- case PIPE_SWIZZLE_W:
- case PIPE_SWIZZLE_1:
- src = make_immediate_reg_float(emit, 1.0);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &inst->Dst[0], &src, FALSE);
- break;
- case PIPE_SWIZZLE_Y:
- case PIPE_SWIZZLE_Z:
- case PIPE_SWIZZLE_0:
- default:
- src = make_immediate_reg_float(emit, 0.0);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &inst->Dst[0], &src, FALSE);
- break;
+ free_temp_indexes(emit);
+ }
+ else {
+ /* Only a single channel is supported in SM4_1 and we report
+ * PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS = 1.
+ * Only the 0th component will be gathered.
+ */
+ switch (emit->key.tex[unit].swizzle_r) {
+ case PIPE_SWIZZLE_X:
+ get_texel_offsets(emit, inst, offsets);
+ src = setup_texcoord(emit, unit, &inst->Src[0]);
+
+ /* Gather dst, coord, resource, sampler */
+ begin_emit_instruction(emit);
+ emit_sample_opcode(emit, VGPU10_OPCODE_GATHER4,
+ inst->Instruction.Saturate, offsets);
+ emit_dst_register(emit, &inst->Dst[0]);
+ emit_src_register(emit, &src);
+ emit_resource_register(emit, unit);
+
+ /* sampler */
+ sampler = make_src_reg(TGSI_FILE_SAMPLER, unit);
+ sampler.Register.SwizzleX =
+ sampler.Register.SwizzleY =
+ sampler.Register.SwizzleZ =
+ sampler.Register.SwizzleW = PIPE_SWIZZLE_X;
+ emit_src_register(emit, &sampler);
+
+ end_emit_instruction(emit);
+ break;
+ case PIPE_SWIZZLE_W:
+ case PIPE_SWIZZLE_1:
+ src = make_immediate_reg_float(emit, 1.0);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &src);
+ break;
+ case PIPE_SWIZZLE_Y:
+ case PIPE_SWIZZLE_Z:
+ case PIPE_SWIZZLE_0:
+ default:
+ src = make_immediate_reg_float(emit, 0.0);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &src);
+ break;
+ }
}
return TRUE;
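The four-way opcode choice in the SM5 path above follows from whether a texel offset operand is present and whether the target is a shadow sampler. A compact sketch of the same decision, not part of the patch and with an illustrative helper name:

   /* Illustrative only: SM5 gather opcode selection used in emit_tg4(). */
   static VGPU10_OPCODE_TYPE
   pick_gather_opcode(boolean has_offset, boolean is_shadow)
   {
      if (has_offset)
         return is_shadow ? VGPU10_OPCODE_GATHER4_PO_C : VGPU10_OPCODE_GATHER4_PO;
      else
         return is_shadow ? VGPU10_OPCODE_GATHER4_C : VGPU10_OPCODE_GATHER4;
   }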
@@ -5459,7 +8119,7 @@ emit_txp(struct svga_shader_emitter_v10 *emit,
/* DIV tmp, coord, coord.wwww */
emit_instruction_op2(emit, VGPU10_OPCODE_DIV, &tmp_dst,
- &coord, &src0_wwww, FALSE);
+ &coord, &src0_wwww);
/* SAMPLE dst, coord(tmp), resource, sampler */
begin_emit_instruction(emit);
@@ -5703,8 +8363,7 @@ emit_txq(struct svga_shader_emitter_v10 *emit,
*/
struct tgsi_full_src_register size_src =
make_src_const_reg(emit->texture_buffer_size_index[unit]);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &size_src,
- FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0], &size_src);
} else {
/* RESINFO dst, srcMipLevel, resource */
begin_emit_instruction(emit);
@@ -5722,6 +8381,319 @@ emit_txq(struct svga_shader_emitter_v10 *emit,
/**
+ * Does this opcode produce a double-precision result?
+ * XXX perhaps move this to a TGSI utility.
+ */
+static bool
+opcode_has_dbl_dst(unsigned opcode)
+{
+ switch (opcode) {
+ case TGSI_OPCODE_F2D:
+ case TGSI_OPCODE_DABS:
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DFRAC:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DNEG:
+ case TGSI_OPCODE_I2D:
+ case TGSI_OPCODE_U2D:
+ // XXX more TBD
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+/**
+ * Does this opcode use double-precision source registers?
+ */
+static bool
+opcode_has_dbl_src(unsigned opcode)
+{
+ switch (opcode) {
+ case TGSI_OPCODE_D2F:
+ case TGSI_OPCODE_DABS:
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DFRAC:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DNEG:
+ case TGSI_OPCODE_D2I:
+ case TGSI_OPCODE_D2U:
+ // XXX more TBD
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+/**
+ * Check that the swizzle for reading from a double-precision register
+ * is valid.
+ */
+static void
+check_double_src_swizzle(const struct tgsi_full_src_register *reg)
+{
+ assert((reg->Register.SwizzleX == PIPE_SWIZZLE_X &&
+ reg->Register.SwizzleY == PIPE_SWIZZLE_Y) ||
+ (reg->Register.SwizzleX == PIPE_SWIZZLE_Z &&
+ reg->Register.SwizzleY == PIPE_SWIZZLE_W));
+
+ assert((reg->Register.SwizzleZ == PIPE_SWIZZLE_X &&
+ reg->Register.SwizzleW == PIPE_SWIZZLE_Y) ||
+ (reg->Register.SwizzleZ == PIPE_SWIZZLE_Z &&
+ reg->Register.SwizzleW == PIPE_SWIZZLE_W));
+}
+
+
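The swizzle restriction checked here comes from how doubles are packed into 32-bit register components; a minimal illustration, not part of the patch:

   /* Illustrative only: a double spans two 32-bit components, so a vec4
    * register holds at most two doubles: d0 in .xy and d1 in .zw.  Legal
    * double swizzles therefore select whole pairs (xy or zw) per operand,
    * and legal double writemasks are .xy, .zw or .xyzw. */
   union double_reg_pair {
      double   d;        /* one double value                */
      unsigned u32[2];   /* the two components it occupies  */
   };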
+/**
+ * Check that the writemask for a double-precision instruction is valid.
+ */
+static void
+check_double_dst_writemask(const struct tgsi_full_instruction *inst)
+{
+ ASSERTED unsigned writemask = inst->Dst[0].Register.WriteMask;
+
+ switch (inst->Instruction.Opcode) {
+ case TGSI_OPCODE_DABS:
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DFRAC:
+ case TGSI_OPCODE_DNEG:
+ case TGSI_OPCODE_DMAD:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DRCP:
+ case TGSI_OPCODE_DSQRT:
+ case TGSI_OPCODE_F2D:
+ assert(writemask == TGSI_WRITEMASK_XYZW ||
+ writemask == TGSI_WRITEMASK_XY ||
+ writemask == TGSI_WRITEMASK_ZW);
+ break;
+ case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_D2I:
+ case TGSI_OPCODE_D2U:
+ /* Write to 1 or 2 components only */
+ assert(util_bitcount(writemask) <= 2);
+ break;
+ default:
+ /* XXX this list may be incomplete */
+ ;
+ }
+}
+
+
+/**
+ * Double-precision absolute value.
+ */
+static boolean
+emit_dabs(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ assert(emit->version >= 50);
+ check_double_src_swizzle(&inst->Src[0]);
+ check_double_dst_writemask(inst);
+
+ struct tgsi_full_src_register abs_src = absolute_src(&inst->Src[0]);
+
+ /* DMOV dst, |src| */
+ emit_instruction_op1(emit, VGPU10_OPCODE_DMOV, &inst->Dst[0], &abs_src);
+
+ return TRUE;
+}
+
+
+/**
+ * Double-precision negation
+ */
+static boolean
+emit_dneg(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ assert(emit->version >= 50);
+ check_double_src_swizzle(&inst->Src[0]);
+ check_double_dst_writemask(inst);
+
+ struct tgsi_full_src_register neg_src = negate_src(&inst->Src[0]);
+
+ /* DMOV dst, -src */
+ emit_instruction_op1(emit, VGPU10_OPCODE_DMOV, &inst->Dst[0], &neg_src);
+
+ return TRUE;
+}
+
+
+/**
+ * SM5 has no DMAD opcode. Implement multiply-add with DMUL/DADD.
+ */
+static boolean
+emit_dmad(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ assert(emit->version >= 50);
+ check_double_src_swizzle(&inst->Src[0]);
+ check_double_src_swizzle(&inst->Src[1]);
+ check_double_src_swizzle(&inst->Src[2]);
+ check_double_dst_writemask(inst);
+
+ unsigned tmp = get_temp_index(emit);
+ struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
+ struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
+
+ /* DMUL tmp, src[0], src[1] */
+ emit_instruction_opn(emit, VGPU10_OPCODE_DMUL,
+ &tmp_dst, &inst->Src[0], &inst->Src[1], NULL,
+ FALSE, inst->Instruction.Precise);
+
+ /* DADD dst, tmp, src[2] */
+ emit_instruction_opn(emit, VGPU10_OPCODE_DADD,
+ &inst->Dst[0], &tmp_src, &inst->Src[2], NULL,
+ inst->Instruction.Saturate, inst->Instruction.Precise);
+ free_temp_indexes(emit);
+
+ return TRUE;
+}
+
+
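In scalar terms the lowering above is simply an unfused multiply-add. A minimal sketch, not from the driver; note the two separate roundings may differ from a true fused MAD:

   /* Illustrative only: DMAD lowered to DMUL + DADD (two roundings). */
   static double
   dmad_ref(double a, double b, double c)
   {
      double t = a * b;   /* DMUL tmp, src0, src1 */
      return t + c;       /* DADD dst, tmp, src2  */
   }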
+/**
+ * Double precision reciprocal square root
+ */
+static boolean
+emit_drsq(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_dst_register *dst,
+ const struct tgsi_full_src_register *src)
+{
+ assert(emit->version >= 50);
+
+ VGPU10OpcodeToken0 token0;
+ begin_emit_instruction(emit);
+
+ token0.value = 0;
+ token0.opcodeType = VGPU10_OPCODE_VMWARE;
+ token0.vmwareOpcodeType = VGPU10_VMWARE_OPCODE_DRSQ;
+ emit_dword(emit, token0.value);
+
+ emit_dst_register(emit, dst);
+
+ check_double_src_swizzle(src);
+ emit_src_register(emit, src);
+
+ end_emit_instruction(emit);
+
+ return TRUE;
+}
+
+
+/**
+ * There is no SM5 opcode for double precision square root.
+ * It will be implemented with DRSQ.
+ * dst = src * DRSQ(src)
+ */
+static boolean
+emit_dsqrt(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ assert(emit->version >= 50);
+
+ check_double_src_swizzle(&inst->Src[0]);
+
+ /* temporary register to hold the source */
+ unsigned tmp = get_temp_index(emit);
+ struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
+ struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
+
+ /* temporary register to hold the DEQ result */
+ unsigned tmp_cond = get_temp_index(emit);
+ struct tgsi_full_dst_register tmp_cond_dst = make_dst_temp_reg(tmp_cond);
+ struct tgsi_full_dst_register tmp_cond_dst_xy =
+ writemask_dst(&tmp_cond_dst, TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y);
+ struct tgsi_full_src_register tmp_cond_src = make_src_temp_reg(tmp_cond);
+ struct tgsi_full_src_register tmp_cond_src_xy =
+ swizzle_src(&tmp_cond_src,
+ PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y,
+ PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y);
+
+ /* The reciprocal square root of zero yields INF.
+ * So if the source is 0, we replace it with 1 in the tmp register.
+ * The subsequent multiplication by the original (zero) source then
+ * yields 0 in the result.
+ */
+
+ /* tmp = (src == 0) ? 1 : src;
+ * DEQ tmp_cond, 0, src
+ * DMOVC tmp, tmp_cond, 1.0, src
+ */
+ struct tgsi_full_src_register zero =
+ make_immediate_reg_double(emit, 0);
+
+ struct tgsi_full_src_register one =
+ make_immediate_reg_double(emit, 1.0);
+
+ emit_instruction_op2(emit, VGPU10_OPCODE_DEQ, &tmp_cond_dst_xy,
+ &zero, &inst->Src[0]);
+ emit_instruction_op3(emit, VGPU10_OPCODE_DMOVC, &tmp_dst,
+ &tmp_cond_src_xy, &one, &inst->Src[0]);
+
+ struct tgsi_full_dst_register tmp_rsq_dst = make_dst_temp_reg(tmp);
+ struct tgsi_full_src_register tmp_rsq_src = make_src_temp_reg(tmp);
+
+ /* DRSQ tmp_rsq, tmp */
+ emit_drsq(emit, &tmp_rsq_dst, &tmp_src);
+
+ /* DMUL dst, tmp_rsq, src[0] */
+ emit_instruction_op2(emit, VGPU10_OPCODE_DMUL, &inst->Dst[0],
+ &tmp_rsq_src, &inst->Src[0]);
+
+ free_temp_indexes(emit);
+
+ return TRUE;
+}
+
+
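A scalar sketch, not from the driver, of the zero-guarded lowering above; replacing a zero source before DRSQ avoids the INF that would otherwise turn the final multiply into 0 * INF = NaN:

   #include <math.h>

   /* Illustrative only: dst = src * DRSQ(src), with src == 0 guarded. */
   static double
   dsqrt_ref(double x)
   {
      double guarded = (x == 0.0) ? 1.0 : x;   /* DEQ + DMOVC */
      double rsq = 1.0 / sqrt(guarded);        /* DRSQ        */
      return x * rsq;                          /* DMUL        */
   }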
+static boolean
+emit_interp_offset(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ assert(emit->version >= 50);
+
+ /* The src1.xy offset is a float with values in the range [-0.5, 0.5]
+ * where (0,0) is the center of the pixel. We need to translate that
+ * into an integer offset on a 16x16 grid in the range [-8/16, 7/16].
+ * We also need to flip the Y axis (hence the -16.0 scale on Y below).
+ */
+ unsigned tmp = get_temp_index(emit);
+ struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
+ struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
+ struct tgsi_full_dst_register tmp_dst_xy =
+ writemask_dst(&tmp_dst, TGSI_WRITEMASK_X | TGSI_WRITEMASK_Y);
+ struct tgsi_full_src_register const16 =
+ make_immediate_reg_float4(emit, 16.0f, -16.0, 0, 0);
+
+ /* MUL tmp.xy, src1, {16, -16, 0, 0} */
+ emit_instruction_op2(emit, VGPU10_OPCODE_MUL,
+ &tmp_dst_xy, &inst->Src[1], &const16);
+
+ /* FTOI tmp.xy, tmp */
+ emit_instruction_op1(emit, VGPU10_OPCODE_FTOI, &tmp_dst_xy, &tmp_src);
+
+ /* EVAL_SNAPPED dst, src0, tmp */
+ emit_instruction_op2(emit, VGPU10_OPCODE_EVAL_SNAPPED,
+ &inst->Dst[0], &inst->Src[0], &tmp_src);
+
+ free_temp_indexes(emit);
+
+ return TRUE;
+}
+
+
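The offset snapping above reduced to one scalar in C, with an illustrative helper name, not part of the patch: each float offset in [-0.5, 0.5] is scaled by +/-16 and truncated toward zero by FTOI, giving a position on the 16x16 sub-pixel grid.

   /* Illustrative only: MUL by {16, -16} then FTOI, per component. */
   static int
   snap_interp_offset(float offset, boolean flip_y)
   {
      float scaled = offset * (flip_y ? -16.0f : 16.0f);  /* MUL  */
      return (int) scaled;                                /* FTOI */
   }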
+/**
* Emit a simple instruction (like ADD, MUL, MIN, etc).
*/
static boolean
@@ -5730,14 +8702,31 @@ emit_simple(struct svga_shader_emitter_v10 *emit,
{
const enum tgsi_opcode opcode = inst->Instruction.Opcode;
const struct tgsi_opcode_info *op = tgsi_get_opcode_info(opcode);
+ const bool dbl_dst = opcode_has_dbl_dst(inst->Instruction.Opcode);
+ const bool dbl_src = opcode_has_dbl_src(inst->Instruction.Opcode);
unsigned i;
+ if (inst->Instruction.Opcode == TGSI_OPCODE_BGNLOOP) {
+ emit->current_loop_depth++;
+ }
+ else if (inst->Instruction.Opcode == TGSI_OPCODE_ENDLOOP) {
+ emit->current_loop_depth--;
+ }
+
begin_emit_instruction(emit);
- emit_opcode(emit, translate_opcode(opcode), inst->Instruction.Saturate);
+ emit_opcode_precise(emit, translate_opcode(inst->Instruction.Opcode),
+ inst->Instruction.Saturate,
+ inst->Instruction.Precise);
for (i = 0; i < op->num_dst; i++) {
+ if (dbl_dst) {
+ check_double_dst_writemask(inst);
+ }
emit_dst_register(emit, &inst->Dst[i]);
}
for (i = 0; i < op->num_src; i++) {
+ if (dbl_src) {
+ check_double_src_swizzle(&inst->Src[i]);
+ }
emit_src_register(emit, &inst->Src[i]);
}
end_emit_instruction(emit);
@@ -5747,6 +8736,222 @@ emit_simple(struct svga_shader_emitter_v10 *emit,
/**
+ * Emit MSB instruction (like IMSB, UMSB).
+ *
+ * GLSL returns the bit index counting from the LSB, whereas the SM5
+ * firstbit_hi/shi instructions return the index counting from the MSB.
+ * To get the GLSL-style result from the SM5 instruction we return
+ * (31 - index) whenever the returned index is not -1.
+ */
+static boolean
+emit_msb(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ const struct tgsi_full_dst_register *index_dst = &inst->Dst[0];
+
+ assert(index_dst->Register.File != TGSI_FILE_OUTPUT);
+
+ struct tgsi_full_src_register index_src =
+ make_src_reg(index_dst->Register.File, index_dst->Register.Index);
+ struct tgsi_full_src_register imm31 =
+ make_immediate_reg_int(emit, 31);
+ imm31 = scalar_src(&imm31, TGSI_SWIZZLE_X);
+ struct tgsi_full_src_register neg_one =
+ make_immediate_reg_int(emit, -1);
+ neg_one = scalar_src(&neg_one, TGSI_SWIZZLE_X);
+ unsigned tmp = get_temp_index(emit);
+ const struct tgsi_full_dst_register tmp_dst =
+ make_dst_temp_reg(tmp);
+ const struct tgsi_full_dst_register tmp_dst_x =
+ writemask_dst(&tmp_dst, TGSI_WRITEMASK_X);
+ const struct tgsi_full_src_register tmp_src_x =
+ make_src_scalar_reg(TGSI_FILE_TEMPORARY, tmp, TGSI_SWIZZLE_X);
+ int writemask = TGSI_WRITEMASK_X;
+ int src_swizzle = TGSI_SWIZZLE_X;
+ int dst_writemask = index_dst->Register.WriteMask;
+
+ emit_simple(emit, inst);
+
+ /* index conversion from SM5 to GLSL */
+ while (writemask & dst_writemask) {
+ struct tgsi_full_src_register index_src_comp =
+ scalar_src(&index_src, src_swizzle);
+ struct tgsi_full_dst_register index_dst_comp =
+ writemask_dst(index_dst, writemask);
+
+ /* check if index_src_comp != -1 */
+ emit_instruction_op2(emit, VGPU10_OPCODE_INE,
+ &tmp_dst_x, &index_src_comp, &neg_one);
+
+ /* if */
+ emit_if(emit, &tmp_src_x);
+
+ index_src_comp = negate_src(&index_src_comp);
+ /* dst = 31 - index (IADD with the negated index) */
+ emit_instruction_op2(emit, VGPU10_OPCODE_IADD,
+ &index_dst_comp, &imm31, &index_src_comp);
+
+ /* endif */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ENDIF);
+
+ writemask = writemask << 1;
+ src_swizzle = src_swizzle + 1;
+ }
+ free_temp_indexes(emit);
+ return TRUE;
+}
+
+
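A worked example of the conversion above, as a sketch with an illustrative helper name, not part of the patch: for value 0x00000F00, SM5 firstbit_hi returns 20 (counting from the MSB) while GLSL findMSB() expects 11, and 31 - 20 == 11; an all-zero input returns -1 in both conventions, so the IF skips the subtraction.

   /* Illustrative only: SM5 firstbit_hi result converted to GLSL findMSB(). */
   static int
   sm5_to_glsl_msb(int sm5_index)
   {
      return (sm5_index == -1) ? -1 : 31 - sm5_index;
   }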
+/**
+ * Emit a BFE instruction (like UBFE, IBFE).
+ * tgsi representation:
+ * U/IBFE dst, value, offset, width
+ * SM5 representation:
+ * U/IBFE dst, width, offset, value
+ * Note: SM5 limits width & offset to the range 0-31,
+ * whereas GLSL allows the range 0-32.
+ */
+static boolean
+emit_bfe(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ const enum tgsi_opcode opcode = inst->Instruction.Opcode;
+ struct tgsi_full_src_register imm32 = make_immediate_reg_int(emit, 32);
+ imm32 = scalar_src(&imm32, TGSI_SWIZZLE_X);
+ struct tgsi_full_src_register zero = make_immediate_reg_int(emit, 0);
+ zero = scalar_src(&zero, TGSI_SWIZZLE_X);
+
+ unsigned tmp1 = get_temp_index(emit);
+ const struct tgsi_full_dst_register cond1_dst = make_dst_temp_reg(tmp1);
+ const struct tgsi_full_dst_register cond1_dst_x =
+ writemask_dst(&cond1_dst, TGSI_WRITEMASK_X);
+ const struct tgsi_full_src_register cond1_src_x =
+ make_src_scalar_reg(TGSI_FILE_TEMPORARY, tmp1, TGSI_SWIZZLE_X);
+
+ unsigned tmp2 = get_temp_index(emit);
+ const struct tgsi_full_dst_register cond2_dst = make_dst_temp_reg(tmp2);
+ const struct tgsi_full_dst_register cond2_dst_x =
+ writemask_dst(&cond2_dst, TGSI_WRITEMASK_X);
+ const struct tgsi_full_src_register cond2_src_x =
+ make_src_scalar_reg(TGSI_FILE_TEMPORARY, tmp2, TGSI_SWIZZLE_X);
+
+ /**
+ * In SM5, when width = 32 and offset = 0, the instruction returns 0.
+ * GLSL, on the other hand, expects the value to be copied unchanged to dst.
+ */
+
+ /* cond1 = (width == 32) */
+ emit_instruction_op2(emit, VGPU10_OPCODE_IEQ,
+ &cond1_dst_x, &inst->Src[2], &imm32);
+
+ /* cond2 = (offset == 0) */
+ emit_instruction_op2(emit, VGPU10_OPCODE_IEQ,
+ &cond2_dst_x, &inst->Src[1], &zero);
+
+ /* cond2 = cond1 & cond2 */
+ emit_instruction_op2(emit, VGPU10_OPCODE_AND, &cond2_dst_x,
+ &cond2_src_x,
+ &cond1_src_x);
+ /* IF */
+ emit_if(emit, &cond2_src_x);
+
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0],
+ &inst->Src[0]);
+
+ /* ELSE */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ELSE);
+
+ /* U/IBFE dst, width, offset, value */
+ emit_instruction_op3(emit, translate_opcode(opcode), &inst->Dst[0],
+ &inst->Src[2], &inst->Src[1], &inst->Src[0]);
+
+ /* ENDIF */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ENDIF);
+
+ free_temp_indexes(emit);
+ return TRUE;
+}
+
+
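A reference sketch, not from the driver, of the semantics the emitted code reproduces for the unsigned case, assuming 32-bit operands and offset + width <= 32 otherwise; the width == 32 / offset == 0 branch matches the IF above, and other width == 32 cases match what the SM5 instruction does with its 5-bit width field.

   /* Illustrative only: GLSL-style bitfieldExtract for unsigned values. */
   static unsigned
   ubfe_ref(unsigned value, unsigned offset, unsigned width)
   {
      if (width == 32)
         return (offset == 0) ? value : 0;   /* SM5 masks width to 5 bits */
      return (value >> offset) & ((1u << width) - 1u);
   }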
+/**
+ * Emit BFI instruction
+ * tgsi representation:
+ * BFI dst, base, insert, offset, width
+ * SM5 representation:
+ * BFI dst, width, offset, insert, base
+ * Note: SM5 limits width & offset to the range 0-31,
+ * whereas GLSL allows the range 0-32.
+ */
+static boolean
+emit_bfi(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst)
+{
+ const enum tgsi_opcode opcode = inst->Instruction.Opcode;
+ struct tgsi_full_src_register imm32 = make_immediate_reg_int(emit, 32);
+ imm32 = scalar_src(&imm32, TGSI_SWIZZLE_X);
+
+ struct tgsi_full_src_register zero = make_immediate_reg_int(emit, 0);
+ zero = scalar_src(&zero, TGSI_SWIZZLE_X);
+
+ unsigned tmp1 = get_temp_index(emit);
+ const struct tgsi_full_dst_register cond1_dst = make_dst_temp_reg(tmp1);
+ const struct tgsi_full_dst_register cond1_dst_x =
+ writemask_dst(&cond1_dst, TGSI_WRITEMASK_X);
+ const struct tgsi_full_src_register cond1_src_x =
+ make_src_scalar_reg(TGSI_FILE_TEMPORARY, tmp1, TGSI_SWIZZLE_X);
+
+ unsigned tmp2 = get_temp_index(emit);
+ const struct tgsi_full_dst_register cond2_dst = make_dst_temp_reg(tmp2);
+ const struct tgsi_full_dst_register cond2_dst_x =
+ writemask_dst(&cond2_dst, TGSI_WRITEMASK_X);
+ const struct tgsi_full_src_register cond2_src_x =
+ make_src_scalar_reg(TGSI_FILE_TEMPORARY, tmp2, TGSI_SWIZZLE_X);
+
+ /**
+ * In SM5, when width = 32 and offset = 0, the instruction returns 0.
+ * GLSL, on the other hand, expects the insert value to be copied unchanged to dst.
+ */
+
+ /* cond1 = width == 32 */
+ emit_instruction_op2(emit, VGPU10_OPCODE_IEQ,
+ &cond1_dst_x, &inst->Src[3], &imm32);
+
+ /* cond2 = (offset == 0) */
+ emit_instruction_op2(emit, VGPU10_OPCODE_IEQ,
+ &cond2_dst_x, &inst->Src[2], &zero);
+
+ /* cond2 = cond1 & cond2 */
+ emit_instruction_op2(emit, VGPU10_OPCODE_AND,
+ &cond2_dst_x, &cond2_src_x, &cond1_src_x);
+
+ /* if */
+ emit_if(emit, &cond2_src_x);
+
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &inst->Dst[0],
+ &inst->Src[1]);
+
+ /* else */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ELSE);
+
+ /* BFI dst, width, offset, insert, base */
+ begin_emit_instruction(emit);
+ emit_opcode(emit, translate_opcode(opcode), inst->Instruction.Saturate);
+ emit_dst_register(emit, &inst->Dst[0]);
+ emit_src_register(emit, &inst->Src[3]);
+ emit_src_register(emit, &inst->Src[2]);
+ emit_src_register(emit, &inst->Src[1]);
+ emit_src_register(emit, &inst->Src[0]);
+ end_emit_instruction(emit);
+
+ /* endif */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ENDIF);
+
+ free_temp_indexes(emit);
+ return TRUE;
+}
+
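As the comment above notes, TGSI orders the BFI operands as (base, insert, offset, width) while SM5 expects (width, offset, insert, base), which is why the emitted instruction reads Src[3] down to Src[0]. A scalar model of the GLSL behavior being matched, as a sketch only (the helper name is hypothetical):

   #include <stdint.h>

   /* Scalar model of GLSL bitfieldInsert(), including the width == 32,
    * offset == 0 case that emit_bfi() handles before falling back to SM5 BFI. */
   static uint32_t bfi_glsl(uint32_t base, uint32_t insert,
                            uint32_t offset, uint32_t width)
   {
      if (width == 32 && offset == 0)
         return insert;                             /* GLSL: insert replaces dst */
      uint32_t mask = ((width < 32) ? ((1u << width) - 1u) : ~0u) << offset;
      return (base & ~mask) | ((insert << offset) & mask);
   }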
+
+/**
* We only special case the MOV instruction to try to detect constant
* color writes in the fragment shader.
*/
@@ -5804,6 +9009,56 @@ emit_simple_1dst(struct svga_shader_emitter_v10 *emit,
/**
+ * Emit a vmware specific VGPU10 instruction.
+ */
+static boolean
+emit_vmware(struct svga_shader_emitter_v10 *emit,
+ const struct tgsi_full_instruction *inst,
+ VGPU10_VMWARE_OPCODE_TYPE subopcode)
+{
+ VGPU10OpcodeToken0 token0;
+ const enum tgsi_opcode opcode = inst->Instruction.Opcode;
+ const struct tgsi_opcode_info *op = tgsi_get_opcode_info(opcode);
+ const bool dbl_dst = opcode_has_dbl_dst(inst->Instruction.Opcode);
+ const bool dbl_src = opcode_has_dbl_src(inst->Instruction.Opcode);
+
+ unsigned i;
+
+ begin_emit_instruction(emit);
+
+ assert((subopcode > 0 && emit->version >= 50) || subopcode == 0);
+
+ token0.value = 0;
+ token0.opcodeType = VGPU10_OPCODE_VMWARE;
+ token0.vmwareOpcodeType = subopcode;
+ emit_dword(emit, token0.value);
+
+ if (subopcode == VGPU10_VMWARE_OPCODE_IDIV) {
+ /* IDIV only uses the first dest register. */
+ emit_dst_register(emit, &inst->Dst[0]);
+ emit_null_dst_register(emit);
+ } else {
+ for (i = 0; i < op->num_dst; i++) {
+ if (dbl_dst) {
+ check_double_dst_writemask(inst);
+ }
+ emit_dst_register(emit, &inst->Dst[i]);
+ }
+ }
+
+ for (i = 0; i < op->num_src; i++) {
+ if (dbl_src) {
+ check_double_src_swizzle(&inst->Src[i]);
+ }
+ emit_src_register(emit, &inst->Src[i]);
+ }
+ end_emit_instruction(emit);
+
+ return TRUE;
+}
+
+
+/**
* Translate a single TGSI instruction to VGPU10.
*/
static boolean
@@ -5813,6 +9068,9 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
{
const enum tgsi_opcode opcode = inst->Instruction.Opcode;
+ if (emit->skip_instruction)
+ return TRUE;
+
switch (opcode) {
case TGSI_OPCODE_ADD:
case TGSI_OPCODE_AND:
@@ -5852,7 +9110,6 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
case TGSI_OPCODE_NOP:
case TGSI_OPCODE_NOT:
case TGSI_OPCODE_OR:
- case TGSI_OPCODE_RET:
case TGSI_OPCODE_UADD:
case TGSI_OPCODE_USEQ:
case TGSI_OPCODE_USGE:
@@ -5869,9 +9126,41 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
case TGSI_OPCODE_USHR:
case TGSI_OPCODE_USNE:
case TGSI_OPCODE_XOR:
+ /* Begin SM5 opcodes */
+ case TGSI_OPCODE_F2D:
+ case TGSI_OPCODE_D2F:
+ case TGSI_OPCODE_DADD:
+ case TGSI_OPCODE_DMUL:
+ case TGSI_OPCODE_DMAX:
+ case TGSI_OPCODE_DMIN:
+ case TGSI_OPCODE_DSGE:
+ case TGSI_OPCODE_DSLT:
+ case TGSI_OPCODE_DSEQ:
+ case TGSI_OPCODE_DSNE:
+ case TGSI_OPCODE_BREV:
+ case TGSI_OPCODE_POPC:
+ case TGSI_OPCODE_LSB:
+ case TGSI_OPCODE_INTERP_CENTROID:
+ case TGSI_OPCODE_INTERP_SAMPLE:
/* simple instructions */
return emit_simple(emit, inst);
+ case TGSI_OPCODE_RET:
+ if (emit->unit == PIPE_SHADER_TESS_CTRL &&
+ !emit->tcs.control_point_phase) {
+
+ /* store the tessellation levels in the patch constant phase only */
+ store_tesslevels(emit);
+ }
+ return emit_simple(emit, inst);
+ case TGSI_OPCODE_IMSB:
+ case TGSI_OPCODE_UMSB:
+ return emit_msb(emit, inst);
+ case TGSI_OPCODE_IBFE:
+ case TGSI_OPCODE_UBFE:
+ return emit_bfe(emit, inst);
+ case TGSI_OPCODE_BFI:
+ return emit_bfi(emit, inst);
case TGSI_OPCODE_MOV:
return emit_mov(emit, inst);
case TGSI_OPCODE_EMIT:
@@ -5900,7 +9189,7 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
case TGSI_OPCODE_EXP:
return emit_exp(emit, inst);
case TGSI_OPCODE_IF:
- return emit_if(emit, inst);
+ return emit_if(emit, &inst->Src[0]);
case TGSI_OPCODE_KILL:
return emit_kill(emit, inst);
case TGSI_OPCODE_KILL_IF:
@@ -5962,18 +9251,90 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
case TGSI_OPCODE_TXQ:
return emit_txq(emit, inst);
case TGSI_OPCODE_UIF:
- return emit_if(emit, inst);
+ return emit_if(emit, &inst->Src[0]);
case TGSI_OPCODE_UMUL_HI:
case TGSI_OPCODE_IMUL_HI:
case TGSI_OPCODE_UDIV:
- case TGSI_OPCODE_IDIV:
/* These cases use only the FIRST of two destination registers */
return emit_simple_1dst(emit, inst, 2, 0);
+ case TGSI_OPCODE_IDIV:
+ return emit_vmware(emit, inst, VGPU10_VMWARE_OPCODE_IDIV);
case TGSI_OPCODE_UMUL:
case TGSI_OPCODE_UMOD:
case TGSI_OPCODE_MOD:
/* These cases use only the SECOND of two destination registers */
return emit_simple_1dst(emit, inst, 2, 1);
+
+ /* Begin SM5 opcodes */
+ case TGSI_OPCODE_DABS:
+ return emit_dabs(emit, inst);
+ case TGSI_OPCODE_DNEG:
+ return emit_dneg(emit, inst);
+ case TGSI_OPCODE_DRCP:
+ return emit_simple(emit, inst);
+ case TGSI_OPCODE_DSQRT:
+ return emit_dsqrt(emit, inst);
+ case TGSI_OPCODE_DMAD:
+ return emit_dmad(emit, inst);
+ case TGSI_OPCODE_DFRAC:
+ return emit_vmware(emit, inst, VGPU10_VMWARE_OPCODE_DFRC);
+ case TGSI_OPCODE_D2I:
+ case TGSI_OPCODE_D2U:
+ return emit_simple(emit, inst);
+ case TGSI_OPCODE_I2D:
+ case TGSI_OPCODE_U2D:
+ return emit_simple(emit, inst);
+ case TGSI_OPCODE_DRSQ:
+ return emit_drsq(emit, &inst->Dst[0], &inst->Src[0]);
+ case TGSI_OPCODE_DDIV:
+ return emit_simple(emit, inst);
+ case TGSI_OPCODE_INTERP_OFFSET:
+ return emit_interp_offset(emit, inst);
+
+ /* The following opcodes should never be seen here. We return zero
+ * for all the PIPE_CAP_TGSI_DROUND_SUPPORTED, DFRACEXP_DLDEXP_SUPPORTED,
+ * FMA_SUPPORTED, LDEXP_SUPPORTED queries.
+ */
+ case TGSI_OPCODE_FMA:
+ case TGSI_OPCODE_LDEXP:
+ case TGSI_OPCODE_DSSG:
+ case TGSI_OPCODE_DFRACEXP:
+ case TGSI_OPCODE_DLDEXP:
+ case TGSI_OPCODE_DTRUNC:
+ case TGSI_OPCODE_DCEIL:
+ case TGSI_OPCODE_DFLR:
+ debug_printf("Unexpected TGSI opcode %s. "
+ "Should have been translated away by the GLSL compiler.\n",
+ tgsi_get_opcode_name(opcode));
+ return FALSE;
+
+ case TGSI_OPCODE_LOAD:
+ case TGSI_OPCODE_STORE:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMIMAX:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMXOR:
+ return FALSE;
+ case TGSI_OPCODE_BARRIER:
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ /* The SM5 device doesn't support BARRIER in the TCS. If a barrier is
+ * used in the shader, do nothing for this opcode and continue with the
+ * rest of the shader translation.
+ */
+ pipe_debug_message(&emit->svga_debug_callback, INFO,
+ "barrier instruction is not supported in tessellation control shader\n");
+ return TRUE;
+ }
+ else {
+ return emit_simple(emit, inst);
+ }
+
case TGSI_OPCODE_END:
if (!emit_post_helpers(emit))
return FALSE;
@@ -5998,11 +9359,11 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
* \param vs_pos_tmp_index which temporary register contains the vertex pos.
*/
static void
-emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
- unsigned vs_pos_tmp_index)
+emit_vpos_instructions(struct svga_shader_emitter_v10 *emit)
{
struct tgsi_full_src_register tmp_pos_src;
struct tgsi_full_dst_register pos_dst;
+ const unsigned vs_pos_tmp_index = emit->vposition.tmp_index;
/* Don't bother to emit any extra vertex instructions if vertex position is
* not written out
@@ -6010,6 +9371,12 @@ emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
if (emit->vposition.out_index == INVALID_INDEX)
return;
+ /**
+ * Reset the temporary vertex position register index
+ * so that emit_dst_register() will use the real vertex position output
+ */
+ emit->vposition.tmp_index = INVALID_INDEX;
+
tmp_pos_src = make_src_temp_reg(vs_pos_tmp_index);
pos_dst = make_dst_output_reg(emit->vposition.out_index);
@@ -6023,8 +9390,7 @@ emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
make_dst_output_reg(emit->vposition.so_index);
/* MOV pos_so, tmp_pos */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &pos_so_dst,
- &tmp_pos_src, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &pos_so_dst, &tmp_pos_src);
}
if (emit->vposition.need_prescale) {
@@ -6045,17 +9411,17 @@ emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
writemask_dst(&tmp_pos_dst, TGSI_WRITEMASK_XYZ);
struct tgsi_full_src_register prescale_scale =
- make_src_const_reg(emit->vposition.prescale_scale_index);
+ make_src_temp_reg(emit->vposition.prescale_scale_index);
struct tgsi_full_src_register prescale_trans =
- make_src_const_reg(emit->vposition.prescale_trans_index);
+ make_src_temp_reg(emit->vposition.prescale_trans_index);
/* MUL tmp_pos.xyz, tmp_pos, prescale.scale */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_pos_dst_xyz,
- &tmp_pos_src, &prescale_scale, FALSE);
+ &tmp_pos_src, &prescale_scale);
/* MAD pos, tmp_pos.wwww, prescale.trans, tmp_pos */
emit_instruction_op3(emit, VGPU10_OPCODE_MAD, &pos_dst, &tmp_pos_src_w,
- &prescale_trans, &tmp_pos_src, FALSE);
+ &prescale_trans, &tmp_pos_src);
}
else if (emit->key.vs.undo_viewport) {
/* This code computes the final vertex position from the temporary
@@ -6090,19 +9456,18 @@ emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
/* ADD tmp_pos.xy, tmp_pos.xy, viewport.zwww */
emit_instruction_op2(emit, VGPU10_OPCODE_ADD, &tmp_pos_dst_xy,
- &tmp_pos_src, &vp_zwww, FALSE);
+ &tmp_pos_src, &vp_zwww);
/* MUL tmp_pos.xy, tmp_pos.xyzw, viewport.xyzy */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &tmp_pos_dst_xy,
- &tmp_pos_src, &vp_xyzw, FALSE);
+ &tmp_pos_src, &vp_xyzw);
/* MUL pos.xyz, tmp_pos.xyz, tmp_pos.www */
emit_instruction_op2(emit, VGPU10_OPCODE_MUL, &pos_dst_xyz,
- &tmp_pos_src, &tmp_pos_src_wwww, FALSE);
+ &tmp_pos_src, &tmp_pos_src_wwww);
/* MOV pos.w, tmp_pos.w */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &pos_dst_w,
- &tmp_pos_src, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &pos_dst_w, &tmp_pos_src);
}
else if (vs_pos_tmp_index != INVALID_INDEX) {
/* This code is to handle the case where the temporary vertex
@@ -6120,6 +9485,11 @@ emit_vpos_instructions(struct svga_shader_emitter_v10 *emit,
emit_src_register(emit, &tmp_pos_src);
end_emit_instruction(emit);
}
+
+ /* Restore original vposition.tmp_index value for the next GS vertex.
+ * It doesn't matter for VS.
+ */
+ emit->vposition.tmp_index = vs_pos_tmp_index;
}
static void
@@ -6129,7 +9499,8 @@ emit_clipping_instructions(struct svga_shader_emitter_v10 *emit)
/* Copy from copy distance temporary to CLIPDIST & the shadow copy */
emit_clip_distance_instructions(emit);
- } else if (emit->clip_mode == CLIP_VERTEX) {
+ } else if (emit->clip_mode == CLIP_VERTEX &&
+ emit->key.last_vertex_stage) {
/* Convert TGSI CLIPVERTEX to CLIPDIST */
emit_clip_vertex_instructions(emit);
}
@@ -6150,7 +9521,7 @@ emit_clipping_instructions(struct svga_shader_emitter_v10 *emit)
* emit_vpos_instructions() call since the later function will change
* the TEMP[vs_pos_tmp_index] value.
*/
- if (emit->clip_mode == CLIP_LEGACY) {
+ if (emit->clip_mode == CLIP_LEGACY && emit->key.last_vertex_stage) {
/* Emit CLIPDIST for legacy user defined clip planes */
emit_clip_distance_from_vpos(emit, emit->vposition.tmp_index);
}
@@ -6165,26 +9536,14 @@ emit_clipping_instructions(struct svga_shader_emitter_v10 *emit)
static void
emit_vertex_instructions(struct svga_shader_emitter_v10 *emit)
{
- const unsigned vs_pos_tmp_index = emit->vposition.tmp_index;
-
/* Emit clipping instructions based on clipping mode */
emit_clipping_instructions(emit);
- /**
- * Reset the temporary vertex position register index
- * so that emit_dst_register() will use the real vertex position output
- */
- emit->vposition.tmp_index = INVALID_INDEX;
-
/* Emit vertex position instructions */
- emit_vpos_instructions(emit, vs_pos_tmp_index);
-
- /* Restore original vposition.tmp_index value for the next GS vertex.
- * It doesn't matter for VS.
- */
- emit->vposition.tmp_index = vs_pos_tmp_index;
+ emit_vpos_instructions(emit);
}
+
/**
* Translate the TGSI_OPCODE_EMIT GS instruction.
*/
@@ -6196,13 +9555,66 @@ emit_vertex(struct svga_shader_emitter_v10 *emit,
assert(emit->unit == PIPE_SHADER_GEOMETRY);
- emit_vertex_instructions(emit);
+ /**
+ * Emit the viewport array index for the first vertex.
+ */
+ if (emit->gs.viewport_index_out_index != INVALID_INDEX) {
+ struct tgsi_full_dst_register viewport_index_out =
+ make_dst_output_reg(emit->gs.viewport_index_out_index);
+ struct tgsi_full_dst_register viewport_index_out_x =
+ writemask_dst(&viewport_index_out, TGSI_WRITEMASK_X);
+ struct tgsi_full_src_register viewport_index_tmp =
+ make_src_temp_reg(emit->gs.viewport_index_tmp_index);
+
+ /* Set the out index to INVALID_INDEX, so it will not
+ * be assigned to a temp again in emit_dst_register, and
+ * the viewport index will not be assigned again in the
+ * subsequent vertices.
+ */
+ emit->gs.viewport_index_out_index = INVALID_INDEX;
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
+ &viewport_index_out_x, &viewport_index_tmp);
+ }
- /* We can't use emit_simple() because the TGSI instruction has one
- * operand (vertex stream number) which we must ignore for VGPU10.
+ /**
+ * Find the stream index associated with this emit vertex instruction.
*/
+ assert(inst->Src[0].Register.File == TGSI_FILE_IMMEDIATE);
+ unsigned streamIndex = find_stream_index(emit, &inst->Src[0]);
+
+ /**
+ * According to the ARB_gpu_shader5 spec, the built-in geometry shader
+ * outputs are always associated with vertex stream zero.
+ * So emit the extra vertex instructions for position or clip distance
+ * for stream zero only.
+ */
+ if (streamIndex == 0) {
+ /**
+ * Before emitting vertex instructions, emit the temporaries for
+ * the prescale constants based on the viewport index if needed.
+ */
+ if (emit->vposition.need_prescale && !emit->vposition.have_prescale)
+ emit_temp_prescale_instructions(emit);
+
+ emit_vertex_instructions(emit);
+ }
+
begin_emit_instruction(emit);
- emit_opcode(emit, VGPU10_OPCODE_EMIT, FALSE);
+ if (emit->version >= 50) {
+ if (emit->info.num_stream_output_components[streamIndex] == 0) {
+ /**
+ * If there is no output for this stream, discard this instruction.
+ */
+ emit->discard_instruction = TRUE;
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_EMIT_STREAM, FALSE);
+ emit_stream_register(emit, streamIndex);
+ }
+ }
+ else {
+ emit_opcode(emit, VGPU10_OPCODE_EMIT, FALSE);
+ }
end_emit_instruction(emit);
return ret;
@@ -6399,11 +9811,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
/* ITOF/UTOF/MOV tmp, input[index] */
if (save_itof_mask & (1 << index)) {
emit_instruction_op1(emit, VGPU10_OPCODE_ITOF,
- &tmp_dst, &input_src, FALSE);
+ &tmp_dst, &input_src);
}
else if (save_utof_mask & (1 << index)) {
emit_instruction_op1(emit, VGPU10_OPCODE_UTOF,
- &tmp_dst, &input_src, FALSE);
+ &tmp_dst, &input_src);
}
else if (save_puint_to_snorm_mask & (1 << index)) {
emit_puint_to_snorm(emit, &tmp_dst, &input_src);
@@ -6417,7 +9829,7 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
else {
assert((save_w_1_mask | save_is_bgra_mask) & (1 << index));
emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &tmp_dst, &input_src, FALSE);
+ &tmp_dst, &input_src);
}
if (save_is_bgra_mask & (1 << index)) {
@@ -6428,11 +9840,11 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
/* MOV tmp.w, 1.0 */
if (emit->key.vs.attrib_is_pure_int & (1 << index)) {
emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &tmp_dst_w, &one_int, FALSE);
+ &tmp_dst_w, &one_int);
}
else {
emit_instruction_op1(emit, VGPU10_OPCODE_MOV,
- &tmp_dst_w, &one, FALSE);
+ &tmp_dst_w, &one);
}
}
}
@@ -6448,58 +9860,281 @@ emit_vertex_attrib_instructions(struct svga_shader_emitter_v10 *emit)
}
-/**
- * Some common values like 0.0, 1.0, 0.5, etc. are frequently needed
- * to implement some instructions. We pre-allocate those values here
- * in the immediate constant buffer.
- */
+/* Find zero-value immediate for default layer index */
static void
-alloc_common_immediates(struct svga_shader_emitter_v10 *emit)
+emit_default_layer_instructions(struct svga_shader_emitter_v10 *emit)
{
- unsigned n = 0;
+ assert(emit->unit == PIPE_SHADER_FRAGMENT);
- emit->common_immediate_pos[n++] =
- alloc_immediate_float4(emit, 0.0f, 1.0f, 0.5f, -1.0f);
+ /* immediate for default layer index 0 */
+ if (emit->fs.layer_input_index != INVALID_INDEX) {
+ union tgsi_immediate_data imm;
+ imm.Int = 0;
+ emit->fs.layer_imm_index = find_immediate(emit, imm, 0);
+ }
+}
- if (emit->info.opcode_count[TGSI_OPCODE_LIT] > 0) {
- emit->common_immediate_pos[n++] =
- alloc_immediate_float4(emit, 128.0f, -128.0f, 0.0f, 0.0f);
+
+static void
+emit_temp_prescale_from_cbuf(struct svga_shader_emitter_v10 *emit,
+ unsigned cbuf_index,
+ struct tgsi_full_dst_register *scale,
+ struct tgsi_full_dst_register *translate)
+{
+ struct tgsi_full_src_register scale_cbuf = make_src_const_reg(cbuf_index);
+ struct tgsi_full_src_register trans_cbuf = make_src_const_reg(cbuf_index+1);
+
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, scale, &scale_cbuf);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, translate, &trans_cbuf);
+}
+
+
+/**
+ * A recursive helper function to find the prescale from the constant buffer
+ */
+static void
+find_prescale_from_cbuf(struct svga_shader_emitter_v10 *emit,
+ unsigned index, unsigned num_prescale,
+ struct tgsi_full_src_register *vp_index,
+ struct tgsi_full_dst_register *scale,
+ struct tgsi_full_dst_register *translate,
+ struct tgsi_full_src_register *tmp_src,
+ struct tgsi_full_dst_register *tmp_dst)
+{
+ if (num_prescale == 0)
+ return;
+
+ if (index > 0) {
+ /* ELSE */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ELSE);
}
- emit->common_immediate_pos[n++] =
- alloc_immediate_int4(emit, 0, 1, 0, -1);
+ struct tgsi_full_src_register index_src =
+ make_immediate_reg_int(emit, index);
- if (emit->key.vs.attrib_puint_to_snorm) {
- emit->common_immediate_pos[n++] =
- alloc_immediate_float4(emit, -2.0f, 2.0f, 3.0f, -1.66666f);
+ if (index == 0) {
+ /* GE tmp, vp_index, index */
+ emit_instruction_op2(emit, VGPU10_OPCODE_GE, tmp_dst,
+ vp_index, &index_src);
+ } else {
+ /* EQ tmp, vp_index, index */
+ emit_instruction_op2(emit, VGPU10_OPCODE_EQ, tmp_dst,
+ vp_index, &index_src);
}
- if (emit->key.vs.attrib_puint_to_uscaled) {
- emit->common_immediate_pos[n++] =
- alloc_immediate_float4(emit, 1023.0f, 3.0f, 0.0f, 0.0f);
+ /* IF tmp */
+ emit_if(emit, tmp_src);
+ emit_temp_prescale_from_cbuf(emit,
+ emit->vposition.prescale_cbuf_index + 2 * index,
+ scale, translate);
+
+ find_prescale_from_cbuf(emit, index+1, num_prescale-1,
+ vp_index, scale, translate,
+ tmp_src, tmp_dst);
+
+ /* ENDIF */
+ emit_instruction_op0(emit, VGPU10_OPCODE_ENDIF);
+}
+
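Because the recursion above unrolls into nested IF/ELSE blocks, the emitted shader code effectively performs a first-match scan over the viewport indices. A C sketch of that control flow (not driver code; it assumes the constant-buffer layout used by emit_temp_prescale_from_cbuf(), with scale at slot 2*i and translate at slot 2*i+1):

   #include <string.h>

   /* Model of the unrolled viewport selection emitted by
    * find_prescale_from_cbuf(): entry 0 is tested with GE, later entries
    * with EQ, and the first hit loads its scale/translate pair. */
   static void select_prescale(int vp_index, unsigned num_prescale,
                               const float cbuf[][4],
                               float scale[4], float translate[4])
   {
      for (unsigned i = 0; i < num_prescale; i++) {
         int hit = (i == 0) ? (vp_index >= (int)i) : (vp_index == (int)i);
         if (hit) {
            memcpy(scale,     cbuf[2 * i],     4 * sizeof(float));
            memcpy(translate, cbuf[2 * i + 1], 4 * sizeof(float));
            return;  /* corresponds to falling through the nested ELSE/ENDIFs */
         }
      }
   }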
+
+/**
+ * This helper function emits instructions to set the prescale
+ * and translate temporaries to the correct constants from the
+ * constant buffer according to the designated viewport.
+ */
+static void
+emit_temp_prescale_instructions(struct svga_shader_emitter_v10 *emit)
+{
+ struct tgsi_full_dst_register prescale_scale =
+ make_dst_temp_reg(emit->vposition.prescale_scale_index);
+ struct tgsi_full_dst_register prescale_translate =
+ make_dst_temp_reg(emit->vposition.prescale_trans_index);
+
+ unsigned prescale_cbuf_index = emit->vposition.prescale_cbuf_index;
+
+ if (emit->vposition.num_prescale == 1) {
+ emit_temp_prescale_from_cbuf(emit,
+ prescale_cbuf_index,
+ &prescale_scale, &prescale_translate);
+ } else {
+ /**
+ * Since the SM5 device does not support dynamic indexing, we need
+ * an if-else chain to find the prescale constants for the
+ * specified viewport.
+ */
+ struct tgsi_full_src_register vp_index_src =
+ make_src_temp_reg(emit->gs.viewport_index_tmp_index);
+
+ struct tgsi_full_src_register vp_index_src_x =
+ scalar_src(&vp_index_src, TGSI_SWIZZLE_X);
+
+ unsigned tmp = get_temp_index(emit);
+ struct tgsi_full_src_register tmp_src = make_src_temp_reg(tmp);
+ struct tgsi_full_src_register tmp_src_x =
+ scalar_src(&tmp_src, TGSI_SWIZZLE_X);
+ struct tgsi_full_dst_register tmp_dst = make_dst_temp_reg(tmp);
+
+ find_prescale_from_cbuf(emit, 0, emit->vposition.num_prescale,
+ &vp_index_src_x,
+ &prescale_scale, &prescale_translate,
+ &tmp_src_x, &tmp_dst);
}
- if (emit->key.vs.attrib_puint_to_sscaled) {
- emit->common_immediate_pos[n++] =
- alloc_immediate_int4(emit, 22, 12, 2, 0);
+ /* Mark that the prescale temporaries have been emitted */
+ emit->vposition.have_prescale = 1;
+}
- emit->common_immediate_pos[n++] =
- alloc_immediate_int4(emit, 22, 30, 0, 0);
+
+/**
+ * A hull shader must have control point outputs, but a tessellation
+ * control shader can return without writing to a control point output.
+ * In this case, the control point output is assumed to be a passthrough
+ * of the control point input.
+ * This helper function writes out a control point output up front in case
+ * the tessellation control shader returns before writing one.
+ */
+static void
+emit_tcs_default_control_point_output(struct svga_shader_emitter_v10 *emit)
+{
+ assert(emit->unit == PIPE_SHADER_TESS_CTRL);
+ assert(emit->tcs.control_point_phase);
+ assert(emit->tcs.control_point_input_index != INVALID_INDEX);
+ assert(emit->tcs.control_point_out_index != INVALID_INDEX);
+ assert(emit->tcs.invocation_id_sys_index != INVALID_INDEX);
+
+ /* UARL ADDR[INDEX].x INVOCATION.xxxx */
+
+ struct tgsi_full_src_register invocation_src;
+ struct tgsi_full_dst_register addr_dst;
+ struct tgsi_full_dst_register addr_dst_x;
+ unsigned addr_tmp;
+
+ addr_tmp = emit->address_reg_index[emit->tcs.control_point_addr_index];
+ addr_dst = make_dst_temp_reg(addr_tmp);
+ addr_dst_x = writemask_dst(&addr_dst, TGSI_WRITEMASK_X);
+
+ invocation_src = make_src_reg(TGSI_FILE_SYSTEM_VALUE,
+ emit->tcs.invocation_id_sys_index);
+
+ begin_emit_instruction(emit);
+ emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE);
+ emit_dst_register(emit, &addr_dst_x);
+ emit_src_register(emit, &invocation_src);
+ end_emit_instruction(emit);
+
+
+ /* MOV OUTPUT INPUT[ADDR[INDEX].x][POSITION] */
+
+ struct tgsi_full_src_register input_control_point;
+ struct tgsi_full_dst_register output_control_point;
+
+ input_control_point = make_src_reg(TGSI_FILE_INPUT,
+ emit->tcs.control_point_input_index);
+ input_control_point.Register.Dimension = 1;
+ input_control_point.Dimension.Indirect = 1;
+ input_control_point.DimIndirect.File = TGSI_FILE_ADDRESS;
+ input_control_point.DimIndirect.Index = emit->tcs.control_point_addr_index;
+ output_control_point =
+ make_dst_output_reg(emit->tcs.control_point_out_index);
+
+ begin_emit_instruction(emit);
+ emit_opcode_precise(emit, VGPU10_OPCODE_MOV, FALSE, FALSE);
+ emit_dst_register(emit, &output_control_point);
+ emit_src_register(emit, &input_control_point);
+ end_emit_instruction(emit);
+}
+
+/**
+ * This function constructs temporary tessfactor vectors from the
+ * VGPU10*_TESSFACTOR values in the domain shader. SM5 exposes tessfactors
+ * as scalar floating-point values, whereas TGSI emits them as vectors.
+ * The temporaries built here mimic TGSI_SEMANTIC_TESSINNER/OUTER, filled
+ * with the values from VGPU10*_TESSFACTOR, and are used wherever
+ * TGSI_SEMANTIC_TESSINNER/OUTER is referenced in the shader.
+ */
+static void
+emit_temp_tessfactor_instructions(struct svga_shader_emitter_v10 *emit)
+{
+ struct tgsi_full_src_register src;
+ struct tgsi_full_dst_register dst;
+
+ if (emit->tes.inner.tgsi_index != INVALID_INDEX) {
+ dst = make_dst_temp_reg(emit->tes.inner.temp_index);
+
+ switch (emit->tes.prim_mode) {
+ case PIPE_PRIM_QUADS:
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.inner.in_index + 1, TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_Y);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ case PIPE_PRIM_TRIANGLES:
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.inner.in_index, TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ break;
+ case PIPE_PRIM_LINES:
+ /**
+ * As per the SM5 spec, InsideTessFactor is unused for isolines.
+ * The GLSL tessInnerLevel for isolines is likewise meaningless, but if
+ * an application tries to read tessInnerLevel in the TES when the
+ * primitive type is isolines, return vec(1.0f) instead of letting the
+ * driver fault on the access.
+ */
+ src = make_immediate_reg_float(emit, 1.0f);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ break;
+ default:
+ break;
+ }
}
- unsigned i;
+ if (emit->tes.outer.tgsi_index != INVALID_INDEX) {
+ dst = make_dst_temp_reg(emit->tes.outer.temp_index);
+
+ switch (emit->tes.prim_mode) {
+ case PIPE_PRIM_QUADS:
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.outer.in_index + 3, TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_W);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ case PIPE_PRIM_TRIANGLES:
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.outer.in_index + 2, TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_Z);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ case PIPE_PRIM_LINES:
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.outer.in_index + 1, TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_Y);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+
+ src = make_src_scalar_reg(TGSI_FILE_INPUT,
+ emit->tes.outer.in_index , TGSI_SWIZZLE_X);
+ dst = writemask_dst(&dst, TGSI_WRITEMASK_X);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
- for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
- if (emit->key.tex[i].texel_bias) {
- /* Replace 0.0f if more immediate float value is needed */
- emit->common_immediate_pos[n++] =
- alloc_immediate_float4(emit, 0.0001f, 0.0f, 0.0f, 0.0f);
+ break;
+ default:
break;
}
}
+}
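The fall-through switches above pack the scalar SM5 tessfactor inputs back into TGSI-style vectors. A compact C model of the outer-factor packing, as a sketch only (the prim_mode enum and the in[] layout are illustrative stand-ins, not driver definitions):

   enum prim_mode { PRIM_LINES, PRIM_TRIANGLES, PRIM_QUADS };

   /* in[] holds the scalar outer tessfactors in declaration order
    * (in_index .. in_index + 3); outer[] models TGSI_SEMANTIC_TESSOUTER. */
   static void pack_outer_tessfactors(enum prim_mode mode, const float in[4],
                                      float outer[4])
   {
      switch (mode) {
      case PRIM_QUADS:
         outer[3] = in[3];          /* fall through */
      case PRIM_TRIANGLES:
         outer[2] = in[2];          /* fall through */
      case PRIM_LINES:
         outer[1] = in[1];
         outer[0] = in[0];
         break;
      }
   }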
- assert(n <= ARRAY_SIZE(emit->common_immediate_pos));
- emit->num_common_immediates = n;
+
+static void
+emit_initialize_temp_instruction(struct svga_shader_emitter_v10 *emit)
+{
+ struct tgsi_full_src_register src;
+ struct tgsi_full_dst_register dst;
+ unsigned vgpu10_temp_index = remap_temp_index(emit, TGSI_FILE_TEMPORARY,
+ emit->initialize_temp_index);
+ src = make_immediate_reg_float(emit, 0.0f);
+ dst = make_dst_temp_reg(vgpu10_temp_index);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &dst, &src);
+ emit->temp_map[emit->initialize_temp_index].initialized = TRUE;
+ emit->initialize_temp_index = INVALID_INDEX;
}
@@ -6513,6 +10148,25 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit)
/* Properties */
if (emit->unit == PIPE_SHADER_GEOMETRY)
emit_property_instructions(emit);
+ else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ emit_hull_shader_declarations(emit);
+
+ /* Save the position of the first instruction token so that we can
+ * do a second pass of the instructions for the patch constant phase.
+ */
+ emit->tcs.instruction_token_pos = emit->cur_tgsi_token;
+
+ if (!emit_hull_shader_control_point_phase(emit)) {
+ emit->skip_instruction = TRUE;
+ return TRUE;
+ }
+
+ /* Set the current tcs phase to control point phase */
+ emit->tcs.control_point_phase = TRUE;
+ }
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ emit_domain_shader_declarations(emit);
+ }
/* Declare inputs */
if (!emit_input_declarations(emit))
@@ -6525,20 +10179,30 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit)
/* Declare temporary registers */
emit_temporaries_declaration(emit);
- /* Declare constant registers */
- emit_constant_declaration(emit);
+ /* For PIPE_SHADER_TESS_CTRL, constants, samplers, resources and immediates
+ * will already be declared in hs_decls (emit_hull_shader_declarations)
+ */
+ if (emit->unit != PIPE_SHADER_TESS_CTRL) {
+ /* Declare constant registers */
+ emit_constant_declaration(emit);
- /* Declare samplers and resources */
- emit_sampler_declarations(emit);
- emit_resource_declarations(emit);
+ /* Declare samplers and resources */
+ emit_sampler_declarations(emit);
+ emit_resource_declarations(emit);
- /* Declare clip distance output registers */
- if (emit->unit == PIPE_SHADER_VERTEX ||
- emit->unit == PIPE_SHADER_GEOMETRY) {
- emit_clip_distance_declarations(emit);
+ alloc_common_immediates(emit);
+ /* Now, emit the constant block containing all the immediates
+ * declared by shader, as well as the extra ones seen above.
+ */
}
- alloc_common_immediates(emit);
+ if (emit->unit != PIPE_SHADER_FRAGMENT) {
+ /*
+ * Declare clip distance output registers for ClipVertex or
+ * user defined planes
+ */
+ emit_clip_distance_declarations(emit);
+ }
if (emit->unit == PIPE_SHADER_FRAGMENT &&
emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) {
@@ -6547,19 +10211,36 @@ emit_pre_helpers(struct svga_shader_emitter_v10 *emit)
alloc_immediate_float4(emit, alpha, alpha, alpha, alpha);
}
- /* Now, emit the constant block containing all the immediates
- * declared by shader, as well as the extra ones seen above.
- */
- emit_vgpu10_immediates_block(emit);
+ if (emit->unit != PIPE_SHADER_TESS_CTRL) {
+ /**
+ * For PIPE_SHADER_TESS_CTRL, immediates are already declared in
+ * hs_decls
+ */
+ emit_vgpu10_immediates_block(emit);
+ }
+ else {
+ emit_tcs_default_control_point_output(emit);
+ }
if (emit->unit == PIPE_SHADER_FRAGMENT) {
emit_frontface_instructions(emit);
emit_fragcoord_instructions(emit);
emit_sample_position_instructions(emit);
+ emit_default_layer_instructions(emit);
}
else if (emit->unit == PIPE_SHADER_VERTEX) {
emit_vertex_attrib_instructions(emit);
}
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ emit_temp_tessfactor_instructions(emit);
+ }
+
+ /**
+ * For a geometry shader that writes to the viewport index, the prescale
+ * temporaries will be set up at the first vertex emission.
+ */
+ if (emit->vposition.need_prescale && emit->vposition.num_prescale == 1)
+ emit_temp_prescale_instructions(emit);
return TRUE;
}
@@ -6601,7 +10282,7 @@ emit_alpha_to_one_instructions(struct svga_shader_emitter_v10 *emit,
color_dst = writemask_dst(&color_dst, TGSI_WRITEMASK_W);
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &one, FALSE);
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &one);
}
}
@@ -6646,8 +10327,7 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
*/
if (emit->key.fs.write_color0_to_n_cbufs <= 1) {
/* MOV output.color, tempcolor */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst,
- &color_src, FALSE); /* XXX saturate? */
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &color_src);
}
free_temp_indexes(emit);
@@ -6694,8 +10374,7 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit,
emit->info.output_semantic_name[output_reg] = TGSI_SEMANTIC_COLOR;
/* MOV output.color[i], tempcolor */
- emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst,
- &color_src, FALSE); /* XXX saturate? */
+ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &color_src);
}
}
@@ -6734,6 +10413,18 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit)
emit_broadcast_color_instructions(emit, fs_color_tmp_index);
}
}
+ else if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ if (!emit->tcs.control_point_phase) {
+ /* store the tessellation levels in the patch constant phase only */
+ store_tesslevels(emit);
+ }
+ else {
+ emit_clipping_instructions(emit);
+ }
+ }
+ else if (emit->unit == PIPE_SHADER_TESS_EVAL) {
+ emit_vertex_instructions(emit);
+ }
return TRUE;
}
@@ -6754,6 +10445,10 @@ emit_vgpu10_instructions(struct svga_shader_emitter_v10 *emit,
tgsi_parse_init(&parse, tokens);
while (!tgsi_parse_end_of_tokens(&parse)) {
+
+ /* Save the current tgsi token starting position */
+ emit->cur_tgsi_token = parse.Position;
+
tgsi_parse_token(&parse);
switch (parse.FullToken.Token.Type) {
@@ -6778,6 +10473,24 @@ emit_vgpu10_instructions(struct svga_shader_emitter_v10 *emit,
}
ret = emit_vgpu10_instruction(emit, inst_number++,
&parse.FullToken.FullInstruction);
+
+ /* Usually this applies to the TCS only. If the shader reads control
+ * point outputs in the control point phase, we reemit all instructions
+ * that write to control point outputs in that phase so the results are
+ * also stored in temporaries.
+ */
+ if (emit->reemit_instruction) {
+ assert(emit->unit == PIPE_SHADER_TESS_CTRL);
+ ret = emit_vgpu10_instruction(emit, inst_number,
+ &parse.FullToken.FullInstruction);
+ }
+ else if (emit->initialize_temp_index != INVALID_INDEX) {
+ emit_initialize_temp_instruction(emit);
+ emit->initialize_temp_index = INVALID_INDEX;
+ ret = emit_vgpu10_instruction(emit, inst_number - 1,
+ &parse.FullToken.FullInstruction);
+ }
+
if (!ret)
goto done;
break;
@@ -6793,6 +10506,10 @@ emit_vgpu10_instructions(struct svga_shader_emitter_v10 *emit,
}
}
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ ret = emit_hull_shader_patch_constant_phase(emit, &parse);
+ }
+
done:
tgsi_parse_free(&parse);
return ret;
@@ -6808,6 +10525,7 @@ emit_vgpu10_header(struct svga_shader_emitter_v10 *emit)
VGPU10ProgramToken ptoken;
/* First token: VGPU10ProgramToken (version info, program type (VS,GS,PS)) */
+ ptoken.value = 0; /* init whole token to zero */
ptoken.majorVersion = emit->version / 10;
ptoken.minorVersion = emit->version % 10;
ptoken.programType = translate_shader_type(emit->unit);
@@ -6817,7 +10535,49 @@ emit_vgpu10_header(struct svga_shader_emitter_v10 *emit)
/* Second token: total length of shader, in tokens. We can't fill this
* in until we're all done. Emit zero for now.
*/
- return emit_dword(emit, 0);
+ if (!emit_dword(emit, 0))
+ return FALSE;
+
+ if (emit->version >= 50) {
+ VGPU10OpcodeToken0 token;
+
+ if (emit->unit == PIPE_SHADER_TESS_CTRL) {
+ /* For the hull shader, we need to start the declarations phase before
+ * emitting any declarations, including the global flags.
+ */
+ token.value = 0;
+ token.opcodeType = VGPU10_OPCODE_HS_DECLS;
+ begin_emit_instruction(emit);
+ emit_dword(emit, token.value);
+ end_emit_instruction(emit);
+ }
+
+ /* Emit global flags */
+ token.value = 0; /* init whole token to zero */
+ token.opcodeType = VGPU10_OPCODE_DCL_GLOBAL_FLAGS;
+ token.enableDoublePrecisionFloatOps = 1; /* set bit */
+ token.instructionLength = 1;
+ if (!emit_dword(emit, token.value))
+ return FALSE;
+ }
+
+ if (emit->version >= 40) {
+ VGPU10OpcodeToken0 token;
+
+ /* Reserved for a global flag such as refactoringAllowed.
+ * If the shader does not use the precise qualifier, we will set the
+ * refactoringAllowed global flag; otherwise, we will leave the reserved
+ * token as a NOP.
+ */
+ emit->reserved_token = (emit->ptr - emit->buf) / sizeof(VGPU10OpcodeToken0);
+ token.value = 0;
+ token.opcodeType = VGPU10_OPCODE_NOP;
+ token.instructionLength = 1;
+ if (!emit_dword(emit, token.value))
+ return FALSE;
+ }
+
+ return TRUE;
}
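For an SM5 hull shader, the code above therefore emits a fixed token-stream prefix before any other declarations. A rough sketch of the resulting dword sequence, assuming each of these declarations occupies a single token (indices are illustrative):

   /* tokens[0]  version / programType      (VGPU10ProgramToken)
    * tokens[1]  total shader length        (patched later by emit_vgpu10_tail)
    * tokens[2]  VGPU10_OPCODE_HS_DECLS                   (hull shaders only)
    * tokens[3]  VGPU10_OPCODE_DCL_GLOBAL_FLAGS, double-precision ops enabled
    * tokens[4]  reserved VGPU10_OPCODE_NOP, rewritten to DCL_GLOBAL_FLAGS with
    *            refactoringAllowed when the shader has no precise qualifier
    */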
@@ -6830,6 +10590,16 @@ emit_vgpu10_tail(struct svga_shader_emitter_v10 *emit)
tokens = (VGPU10ProgramToken *) emit->buf;
tokens[1].value = emit_get_num_tokens(emit);
+ if (emit->version >= 40 && !emit->uses_precise_qualifier) {
+ /* Replace the reserved token with the RefactoringAllowed global flag */
+ VGPU10OpcodeToken0 *ptoken;
+
+ ptoken = (VGPU10OpcodeToken0 *)&tokens[emit->reserved_token];
+ assert(ptoken->opcodeType == VGPU10_OPCODE_NOP);
+ ptoken->opcodeType = VGPU10_OPCODE_DCL_GLOBAL_FLAGS;
+ ptoken->refactoringAllowed = 1;
+ }
+
return TRUE;
}
@@ -6908,6 +10678,97 @@ transform_fs_aapoint(const struct tgsi_token *tokens,
return tokens;
}
+
+/**
+ * A helper function to determine the shader in the previous stage and
+ * then call the linker so that this shader's input mapping matches the
+ * output indices of the shader in the previous stage.
+ */
+static void
+compute_input_mapping(struct svga_context *svga,
+ struct svga_shader_emitter_v10 *emit,
+ enum pipe_shader_type unit)
+{
+ struct svga_shader *prevShader = NULL; /* shader in the previous stage */
+
+ if (unit == PIPE_SHADER_FRAGMENT) {
+ prevShader = svga->curr.gs ?
+ &svga->curr.gs->base : (svga->curr.tes ?
+ &svga->curr.tes->base : &svga->curr.vs->base);
+ } else if (unit == PIPE_SHADER_GEOMETRY) {
+ prevShader = svga->curr.tes ? &svga->curr.tes->base : &svga->curr.vs->base;
+ } else if (unit == PIPE_SHADER_TESS_EVAL) {
+ assert(svga->curr.tcs);
+ prevShader = &svga->curr.tcs->base;
+ } else if (unit == PIPE_SHADER_TESS_CTRL) {
+ assert(svga->curr.vs);
+ prevShader = &svga->curr.vs->base;
+ }
+
+ if (prevShader != NULL) {
+ svga_link_shaders(&prevShader->info, &emit->info, &emit->linkage);
+ }
+ else {
+ /**
+ * Since the vertex shader does not need to go through the linker to
+ * establish the input map, we need to make sure the highest index
+ * of input registers is set properly here.
+ */
+ emit->linkage.input_map_max = MAX2((int)emit->linkage.input_map_max,
+ emit->info.file_max[TGSI_FILE_INPUT]);
+ }
+}
+
+
+/**
+ * Copies the shader signature info to the shader variant
+ */
+static void
+copy_shader_signature(struct svga_shader_signature *sgn,
+ struct svga_shader_variant *variant)
+{
+ SVGA3dDXShaderSignatureHeader *header = &sgn->header;
+
+ /* Calculate the signature length */
+ variant->signatureLen = sizeof(SVGA3dDXShaderSignatureHeader) +
+ (header->numInputSignatures +
+ header->numOutputSignatures +
+ header->numPatchConstantSignatures) *
+ sizeof(SVGA3dDXShaderSignatureEntry);
+
+ /* Allocate buffer for the signature info */
+ variant->signature =
+ (SVGA3dDXShaderSignatureHeader *)CALLOC(1, variant->signatureLen);
+
+ char *sgnBuf = (char *)variant->signature;
+ unsigned sgnLen;
+
+ /* Copy the signature info to the shader variant structure */
+ memcpy(sgnBuf, &sgn->header, sizeof(SVGA3dDXShaderSignatureHeader));
+ sgnBuf += sizeof(SVGA3dDXShaderSignatureHeader);
+
+ if (header->numInputSignatures) {
+ sgnLen =
+ header->numInputSignatures * sizeof(SVGA3dDXShaderSignatureEntry);
+ memcpy(sgnBuf, &sgn->inputs[0], sgnLen);
+ sgnBuf += sgnLen;
+ }
+
+ if (header->numOutputSignatures) {
+ sgnLen =
+ header->numOutputSignatures * sizeof(SVGA3dDXShaderSignatureEntry);
+ memcpy(sgnBuf, &sgn->outputs[0], sgnLen);
+ sgnBuf += sgnLen;
+ }
+
+ if (header->numPatchConstantSignatures) {
+ sgnLen =
+ header->numPatchConstantSignatures * sizeof(SVGA3dDXShaderSignatureEntry);
+ memcpy(sgnBuf, &sgn->patchConstants[0], sgnLen);
+ }
+}
+
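The blob copied above is laid out as the header followed by the input, output and patch-constant entry arrays packed back to back, which is what the length computation reflects. A hypothetical reader for that layout (not part of the driver; it assumes the SVGA3dDXShaderSignature types declared in the driver headers):

   /* Walk the blob produced by copy_shader_signature(): header first,
    * then input, output and patch-constant entries back to back. */
   static void
   walk_signature(const SVGA3dDXShaderSignatureHeader *hdr,
                  const SVGA3dDXShaderSignatureEntry **inputs,
                  const SVGA3dDXShaderSignatureEntry **outputs,
                  const SVGA3dDXShaderSignatureEntry **patchConsts)
   {
      /* Entries start immediately after the header. */
      const SVGA3dDXShaderSignatureEntry *e =
         (const SVGA3dDXShaderSignatureEntry *)(hdr + 1);

      *inputs = e;
      e += hdr->numInputSignatures;
      *outputs = e;
      e += hdr->numOutputSignatures;
      *patchConsts = e;
   }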
+
/**
* This is the main entrypoint for the TGSI -> VPGU10 translator.
*/
@@ -6920,12 +10781,15 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
struct svga_shader_variant *variant = NULL;
struct svga_shader_emitter_v10 *emit;
const struct tgsi_token *tokens = shader->tokens;
- struct svga_vertex_shader *vs = svga->curr.vs;
- struct svga_geometry_shader *gs = svga->curr.gs;
+
+ (void) make_immediate_reg_double; /* unused at this time */
assert(unit == PIPE_SHADER_VERTEX ||
unit == PIPE_SHADER_GEOMETRY ||
- unit == PIPE_SHADER_FRAGMENT);
+ unit == PIPE_SHADER_FRAGMENT ||
+ unit == PIPE_SHADER_TESS_CTRL ||
+ unit == PIPE_SHADER_TESS_EVAL ||
+ unit == PIPE_SHADER_COMPUTE);
/* These two flags cannot be used together */
assert(key->vs.need_prescale + key->vs.undo_viewport <= 1);
@@ -6939,12 +10803,29 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
goto done;
emit->unit = unit;
- emit->version = svga_have_sm4_1(svga) ? 41 : 40;
+ if (svga_have_sm5(svga)) {
+ emit->version = 50;
+ } else if (svga_have_sm4_1(svga)) {
+ emit->version = 41;
+ } else {
+ emit->version = 40;
+ }
+
+ emit->signature.header.headerVersion = SVGADX_SIGNATURE_HEADER_VERSION_0;
emit->key = *key;
emit->vposition.need_prescale = (emit->key.vs.need_prescale ||
- emit->key.gs.need_prescale);
+ emit->key.gs.need_prescale ||
+ emit->key.tes.need_prescale);
+
+ /* Determine how many prescale factors in the constant buffer */
+ emit->vposition.num_prescale = 1;
+ if (emit->vposition.need_prescale && emit->key.gs.writes_viewport_index) {
+ assert(emit->unit == PIPE_SHADER_GEOMETRY);
+ emit->vposition.num_prescale = emit->key.gs.num_prescale;
+ }
+
emit->vposition.tmp_index = INVALID_INDEX;
emit->vposition.so_index = INVALID_INDEX;
emit->vposition.out_index = INVALID_INDEX;
@@ -6954,13 +10835,60 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
emit->fs.fragcoord_input_index = INVALID_INDEX;
emit->fs.sample_id_sys_index = INVALID_INDEX;
emit->fs.sample_pos_sys_index = INVALID_INDEX;
+ emit->fs.sample_mask_in_sys_index = INVALID_INDEX;
+ emit->fs.layer_input_index = INVALID_INDEX;
+ emit->fs.layer_imm_index = INVALID_INDEX;
emit->gs.prim_id_index = INVALID_INDEX;
+ emit->gs.invocation_id_sys_index = INVALID_INDEX;
+ emit->gs.viewport_index_out_index = INVALID_INDEX;
+ emit->gs.viewport_index_tmp_index = INVALID_INDEX;
+
+ emit->tcs.vertices_per_patch_index = INVALID_INDEX;
+ emit->tcs.invocation_id_sys_index = INVALID_INDEX;
+ emit->tcs.control_point_input_index = INVALID_INDEX;
+ emit->tcs.control_point_addr_index = INVALID_INDEX;
+ emit->tcs.control_point_out_index = INVALID_INDEX;
+ emit->tcs.control_point_tmp_index = INVALID_INDEX;
+ emit->tcs.control_point_out_count = 0;
+ emit->tcs.inner.out_index = INVALID_INDEX;
+ emit->tcs.inner.temp_index = INVALID_INDEX;
+ emit->tcs.inner.tgsi_index = INVALID_INDEX;
+ emit->tcs.outer.out_index = INVALID_INDEX;
+ emit->tcs.outer.temp_index = INVALID_INDEX;
+ emit->tcs.outer.tgsi_index = INVALID_INDEX;
+ emit->tcs.patch_generic_out_count = 0;
+ emit->tcs.patch_generic_out_index = INVALID_INDEX;
+ emit->tcs.patch_generic_tmp_index = INVALID_INDEX;
+ emit->tcs.prim_id_index = INVALID_INDEX;
+
+ emit->tes.tesscoord_sys_index = INVALID_INDEX;
+ emit->tes.inner.in_index = INVALID_INDEX;
+ emit->tes.inner.temp_index = INVALID_INDEX;
+ emit->tes.inner.tgsi_index = INVALID_INDEX;
+ emit->tes.outer.in_index = INVALID_INDEX;
+ emit->tes.outer.temp_index = INVALID_INDEX;
+ emit->tes.outer.tgsi_index = INVALID_INDEX;
+ emit->tes.prim_id_index = INVALID_INDEX;
emit->clip_dist_out_index = INVALID_INDEX;
emit->clip_dist_tmp_index = INVALID_INDEX;
emit->clip_dist_so_index = INVALID_INDEX;
emit->clip_vertex_out_index = INVALID_INDEX;
+ emit->clip_vertex_tmp_index = INVALID_INDEX;
+ emit->svga_debug_callback = svga->debug.callback;
+
+ emit->index_range.start_index = INVALID_INDEX;
+ emit->index_range.count = 0;
+ emit->index_range.required = FALSE;
+ emit->index_range.operandType = VGPU10_NUM_OPERANDS;
+ emit->index_range.dim = 0;
+ emit->index_range.size = 0;
+
+ emit->current_loop_depth = 0;
+
+ emit->initialize_temp_index = INVALID_INDEX;
if (emit->key.fs.alpha_func == SVGA3D_CMP_INVALID) {
emit->key.fs.alpha_func = SVGA3D_CMP_ALWAYS;
@@ -7002,34 +10930,21 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
emit->num_outputs = emit->info.num_outputs;
- if (unit == PIPE_SHADER_FRAGMENT) {
- /* Compute FS input remapping to match the output from VS/GS */
- if (gs) {
- svga_link_shaders(&gs->base.info, &emit->info, &emit->linkage);
- } else {
- assert(vs);
- svga_link_shaders(&vs->base.info, &emit->info, &emit->linkage);
- }
- } else if (unit == PIPE_SHADER_GEOMETRY) {
- assert(vs);
- svga_link_shaders(&vs->base.info, &emit->info, &emit->linkage);
- }
-
- /* Since vertex shader does not need to go through the linker to
- * establish the input map, we need to make sure the highest index
- * of input registers is set properly here.
+ /**
+ * Compute input mapping to match the outputs from shader
+ * in the previous stage
*/
- emit->linkage.input_map_max = MAX2((int)emit->linkage.input_map_max,
- emit->info.file_max[TGSI_FILE_INPUT]);
+ compute_input_mapping(svga, emit, unit);
determine_clipping_mode(emit);
- if (unit == PIPE_SHADER_GEOMETRY || unit == PIPE_SHADER_VERTEX) {
+ if (unit == PIPE_SHADER_GEOMETRY || unit == PIPE_SHADER_VERTEX ||
+ unit == PIPE_SHADER_TESS_CTRL || unit == PIPE_SHADER_TESS_EVAL) {
if (shader->stream_output != NULL || emit->clip_mode == CLIP_DISTANCE) {
/* if there is stream output declarations associated
* with this shader or the shader writes to ClipDistance
* then reserve extra registers for the non-adjusted vertex position
- * and the ClipDistance shadow copy
+ * and the ClipDistance shadow copy.
*/
emit->vposition.so_index = emit->num_outputs++;
@@ -7073,6 +10988,12 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
variant->shader = shader;
variant->nr_tokens = emit_get_num_tokens(emit);
variant->tokens = (const unsigned *)emit->buf;
+
+ /* Copy shader signature info to the shader variant */
+ if (svga_have_sm5(svga)) {
+ copy_shader_signature(&emit->signature, variant);
+ }
+
emit->buf = NULL; /* buffer is no longer owed by emitter context */
memcpy(&variant->key, key, sizeof(*key));
variant->id = UTIL_BITMASK_INVALID_INDEX;
@@ -7091,23 +11012,38 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
variant->extra_const_start--;
}
- variant->pstipple_sampler_unit = emit->fs.pstipple_sampler_unit;
+ if (unit == PIPE_SHADER_FRAGMENT) {
+ struct svga_fs_variant *fs_variant = svga_fs_variant(variant);
- /* If there was exactly one write to a fragment shader output register
- * and it came from a constant buffer, we know all fragments will have
- * the same color (except for blending).
- */
- variant->constant_color_output =
- emit->constant_color_output && emit->num_output_writes == 1;
+ fs_variant->pstipple_sampler_unit = emit->fs.pstipple_sampler_unit;
- /** keep track in the variant if flat interpolation is used
- * for any of the varyings.
- */
- variant->uses_flat_interp = emit->uses_flat_interp;
+ /* If there was exactly one write to a fragment shader output register
+ * and it came from a constant buffer, we know all fragments will have
+ * the same color (except for blending).
+ */
+ fs_variant->constant_color_output =
+ emit->constant_color_output && emit->num_output_writes == 1;
+
+ /** keep track in the variant if flat interpolation is used
+ * for any of the varyings.
+ */
+ fs_variant->uses_flat_interp = emit->uses_flat_interp;
- variant->fs_shadow_compare_units = emit->fs.shadow_compare_units;
+ fs_variant->fs_shadow_compare_units = emit->fs.shadow_compare_units;
+ }
+ else if (unit == PIPE_SHADER_TESS_EVAL) {
+ struct svga_tes_variant *tes_variant = svga_tes_variant(variant);
+
+ /* Keep track in the tes variant some of the layout parameters.
+ * These parameters will be referenced by the tcs to emit
+ * the necessary declarations for the hull shader.
+ */
+ tes_variant->prim_mode = emit->tes.prim_mode;
+ tes_variant->spacing = emit->tes.spacing;
+ tes_variant->vertices_order_cw = emit->tes.vertices_order_cw;
+ tes_variant->point_mode = emit->tes.point_mode;
+ }
- variant->fs_shadow_compare_units = emit->fs.shadow_compare_units;
if (tokens != shader->tokens) {
tgsi_free_tokens(tokens);
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 88c1c6c7983..717e56caccf 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -427,7 +427,9 @@ struct svga_winsys_context
uint32 shaderId,
SVGA3dShaderType shaderType,
const uint32 *bytecode,
- uint32 bytecodeLen);
+ uint32 bytecodeLen,
+ const SVGA3dDXShaderSignatureHeader *sgnInfo,
+ uint32 sgnLen);
/**
* Destroy a DX GB shader.
@@ -457,7 +459,13 @@ struct svga_winsys_context
/** For HUD queries */
uint64_t num_commands;
+ uint64_t num_command_buffers;
uint64_t num_draw_commands;
+ uint64_t num_shader_reloc;
+ uint64_t num_surf_reloc;
+
+ /* Whether we are in retry processing */
+ unsigned int in_retry;
};
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 432f9afcd1f..da7506e7797 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -65,6 +65,7 @@
#define VMW_MAX_SURF_MEM_FACTOR 2
+
struct vmw_buffer_relocation
{
struct pb_buffer *buffer;
@@ -701,20 +702,19 @@ vmw_svga_winsys_vgpu10_shader_create(struct svga_winsys_context *swc,
uint32 shaderId,
SVGA3dShaderType shaderType,
const uint32 *bytecode,
- uint32 bytecodeLen)
+ uint32 bytecodeLen,
+ const SVGA3dDXShaderSignatureHeader *sgnInfo,
+ uint32 sgnLen)
{
struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc);
struct vmw_svga_winsys_shader *shader;
- struct svga_winsys_gb_shader *gb_shader =
- vmw_svga_winsys_shader_create(&vswc->vws->base, shaderType, bytecode,
- bytecodeLen);
- if (!gb_shader)
+ shader = vmw_svga_shader_create(&vswc->vws->base, shaderType, bytecode,
+ bytecodeLen, sgnInfo, sgnLen);
+ if (!shader)
return NULL;
- shader = vmw_svga_winsys_shader(gb_shader);
shader->shid = shaderId;
-
- return gb_shader;
+ return svga_winsys_shader(shader);
}
/**
diff --git a/src/gallium/winsys/svga/drm/vmw_shader.c b/src/gallium/winsys/svga/drm/vmw_shader.c
index 56ffdd16f79..dbf63c59234 100644
--- a/src/gallium/winsys/svga/drm/vmw_shader.c
+++ b/src/gallium/winsys/svga/drm/vmw_shader.c
@@ -28,7 +28,9 @@
#include "util/u_debug.h"
#include "util/u_memory.h"
+#include "vmw_context.h"
#include "vmw_shader.h"
+#include "vmw_buffer.h"
#include "vmw_screen.h"
void
@@ -63,3 +65,54 @@ vmw_svga_winsys_shader_reference(struct vmw_svga_winsys_shader **pdst,
*pdst = src;
}
+
+
+/**
+ * A helper function to create a shader object and upload the shader
+ * bytecode, and the signature if specified, to shader memory.
+ */
+struct vmw_svga_winsys_shader *
+vmw_svga_shader_create(struct svga_winsys_screen *sws,
+ SVGA3dShaderType type,
+ const uint32 *bytecode,
+ uint32 bytecodeLen,
+ const SVGA3dDXShaderSignatureHeader *sgnInfo,
+ uint32 sgnLen)
+{
+ struct vmw_svga_winsys_shader *shader;
+ void *map;
+
+ shader = CALLOC_STRUCT(vmw_svga_winsys_shader);
+ if (!shader)
+ return NULL;
+
+ pipe_reference_init(&shader->refcnt, 1);
+ p_atomic_set(&shader->validated, 0);
+ shader->screen = vmw_winsys_screen(sws);
+ shader->buf = sws->buffer_create(sws, 64,
+ SVGA_BUFFER_USAGE_SHADER,
+ bytecodeLen + sgnLen);
+ if (!shader->buf) {
+ FREE(shader);
+ return NULL;
+ }
+
+ map = sws->buffer_map(sws, shader->buf, PIPE_TRANSFER_WRITE);
+ if (!map) {
+ FREE(shader);
+ return NULL;
+ }
+
+ /* copy the shader bytecode */
+ memcpy(map, bytecode, bytecodeLen);
+
+ /* if shader signature is specified, append it to the bytecode. */
+ if (sgnLen) {
+ assert(sws->have_sm5);
+ map = (char *)map + bytecodeLen;
+ memcpy(map, sgnInfo, sgnLen);
+ }
+ sws->buffer_unmap(sws, shader->buf);
+
+ return shader;
+}
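The buffer created above is sized bytecodeLen + sgnLen because the optional signature blob is appended directly after the bytecode. A sketch of the resulting layout (byte offsets):

   /*   0                          bytecodeLen            bytecodeLen + sgnLen
    *   +---------------------------+------------------------------+
    *   |      shader bytecode      |   signature blob (optional)  |
    *   +---------------------------+------------------------------+
    */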
diff --git a/src/gallium/winsys/svga/drm/vmw_shader.h b/src/gallium/winsys/svga/drm/vmw_shader.h
index ae557bcc8e4..a62a814471d 100644
--- a/src/gallium/winsys/svga/drm/vmw_shader.h
+++ b/src/gallium/winsys/svga/drm/vmw_shader.h
@@ -65,4 +65,12 @@ void
vmw_svga_winsys_shader_reference(struct vmw_svga_winsys_shader **pdst,
struct vmw_svga_winsys_shader *src);
+struct vmw_svga_winsys_shader *
+vmw_svga_shader_create(struct svga_winsys_screen *sws,
+ SVGA3dShaderType type,
+ const uint32 *bytecode,
+ uint32 bytecodeLen,
+ const SVGA3dDXShaderSignatureHeader *sgnInfo,
+ uint32 sgnLen);
+
#endif /* VMW_SHADER_H_ */