crocus: initial gallium driver for Intel gfx 4-7

This is a gallium driver for the Intel gfx 4-7 GPUs. It was initially cloned from the iris driver by Ilia Mirkin, then I ported over large reams of code from i965 until it worked. Acked-by: Jason Ekstrand <jason@jlekstrand.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11146>
author: Dave Airlie <airlied@gmail.com> 2021-06-01 13:14:51 +1000
committer: Dave Airlie <airlied@gmail.com> 2021-06-14 06:34:05 +1000
commit: f3630548f1da904ec6c63b43ece7e68afdb8867e (patch)
tree: 05cfc909591aba9d8bf4bdeb9ba32ce8db2c58f4 /src
parent: 8da92b5c0a358e30be557cae3303a4027b24db1c (diff)
49 files changed, 28504 insertions, 3 deletions
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 8147c3ca346..ca5bf121a88 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -70,6 +70,7 @@ static const struct pipe_loader_ops pipe_loader_drm_ops;
 static const struct drm_driver_descriptor *driver_descriptors[] = {
    &i915_driver_descriptor,
    &iris_driver_descriptor,
+   &crocus_driver_descriptor,
    &nouveau_driver_descriptor,
    &r300_driver_descriptor,
    &r600_driver_descriptor,
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 6bab07a40e7..ff4621e1a88 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -112,6 +112,26 @@ DRM_DRIVER_DESCRIPTOR(iris, iris_driconf, ARRAY_SIZE(iris_driconf))
 DRM_DRIVER_DESCRIPTOR_STUB(iris)
 #endif
 
+#ifdef GALLIUM_CROCUS
+#include "crocus/drm/crocus_drm_public.h"
+
+/* Create the crocus screen for fd; on success pass it to debug_screen_wrap(). */
+static struct pipe_screen *
+pipe_crocus_create_screen(int fd, const struct pipe_screen_config *config)
+{
+   struct pipe_screen *crocus = crocus_drm_screen_create(fd, config);
+
+   return crocus ? debug_screen_wrap(crocus) : NULL;
+}
+
+const driOptionDescription crocus_driconf[] = { /* entries come from the include */
+      #include "crocus/driinfo_crocus.h"
+};
+DRM_DRIVER_DESCRIPTOR(crocus, crocus_driconf, ARRAY_SIZE(crocus_driconf))
+#else
+DRM_DRIVER_DESCRIPTOR_STUB(crocus)
+#endif
+
 #ifdef GALLIUM_NOUVEAU
 #include "nouveau/drm/nouveau_drm_public.h"
 
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
index 5fd3084dfdb..478e72b8525 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -6,6 +6,7 @@ struct pipe_screen_config;
 
 extern const struct drm_driver_descriptor i915_driver_descriptor;
 extern const struct drm_driver_descriptor iris_driver_descriptor;
+extern const struct drm_driver_descriptor crocus_driver_descriptor;
 extern const struct drm_driver_descriptor nouveau_driver_descriptor;
 extern const struct drm_driver_descriptor r300_driver_descriptor;
 extern const struct drm_driver_descriptor r600_driver_descriptor;
diff --git a/src/gallium/drivers/crocus/crocus_batch.c b/src/gallium/drivers/crocus/crocus_batch.c
new file mode 100644
index 00000000000..63cfe282de4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_batch.c
+ *
+ * Batchbuffer and command submission module.
+ *
+ * Every API draw call results in a number of GPU commands, which we
+ * collect into a "batch buffer".  Typically, many draw calls are grouped
+ * into a single batch to amortize command submission overhead.
+ *
+ * We submit batches to the kernel using the I915_GEM_EXECBUFFER2 ioctl.
+ * One critical piece of data is the "validation list", which contains a
+ * list of the buffer objects (BOs) which the commands in the GPU need.
+ * The kernel will make sure these are resident and pinned at the correct
+ * virtual memory address before executing our batch.  If a BO is not in
+ * the validation list, it effectively does not exist, so take care.
+ */
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#include "intel/common/intel_gem.h"
+#include "main/macros.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "util/u_upload_mgr.h"
+
+#include <errno.h>
+#include <xf86drm.h>
+
+#if HAVE_VALGRIND
+#include <memcheck.h>
+#include <valgrind.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
+/* Terminating the batch takes either 4 bytes for MI_BATCH_BUFFER_END
+ * or 12 bytes for MI_BATCH_BUFFER_START (when chaining).  Plus, we may
+ * need an extra 4 bytes to pad out to the nearest QWord.  So reserve 16
+ * bytes; the macro below doubles that to 32 on Haswell. */
+#define BATCH_RESERVED(devinfo) ((devinfo)->is_haswell ? 32 : 16)
+
+static void crocus_batch_reset(struct crocus_batch *batch);
+
+/* Report how many exec fences are queued on this batch. */
+static unsigned
+num_fences(struct crocus_batch *batch)
+{
+   return util_dynarray_num_elements(&batch->exec_fences, struct drm_i915_gem_exec_fence);
+}
+
+/**
+ * Debug helper (INTEL_DEBUG=submit): print the fence list on one stderr line.
+ * A "..." prefix marks a wait fence, a "!" suffix marks a signal fence.
+ */
+static void
+dump_fence_list(struct crocus_batch *batch)
+{
+   fprintf(stderr, "Fence list (length %u):      ", num_fences(batch));
+
+   util_dynarray_foreach(&batch->exec_fences,
+                         struct drm_i915_gem_exec_fence, fence) {
+      const char *pre = (fence->flags & I915_EXEC_FENCE_WAIT) ? "..." : "";
+      const char *post = (fence->flags & I915_EXEC_FENCE_SIGNAL) ? "!" : "";
+      fprintf(stderr, "%s%u%s ", pre, fence->handle, post);
+   }
+
+   fprintf(stderr, "\n");
+}
+
+/**
+ * Debug helper (INTEL_DEBUG=submit): print one line per validation list
+ * entry with its handle, name, offset, size, refcount and write flag.
+ */
+static void
+dump_validation_list(struct crocus_batch *batch)
+{
+   fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
+
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[i];
+      const struct crocus_bo *bo = batch->exec_bos[i];
+
+      assert(entry->handle == bo->gem_handle);
+      fprintf(stderr,
+              "[%2d]: %2d %-14s @ 0x%016llx (%" PRIu64 "B)\t %2d refs %s\n", i,
+              entry->handle, bo->name, entry->offset, bo->size, bo->refcount,
+              (entry->flags & EXEC_OBJECT_WRITE) ? " (write)" : "");
+   }
+}
+
+/**
+ * Return BO information to the batch decoder (for debugging): the range
+ * from address to the end of the exec-list BO containing it, if any.
+ */
+static struct intel_batch_decode_bo
+decode_get_bo(void *v_batch, bool ppgtt, uint64_t address)
+{
+   struct crocus_batch *batch = v_batch;
+
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct crocus_bo *bo = batch->exec_bos[i];
+      /* The decoder zeroes out the top 16 bits, so we need to as well */
+      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
+      if (address < bo_address || address >= bo_address + bo->size)
+         continue;
+      /* Report only the bytes from address on; a NULL map must stay NULL. */
+      uint64_t skip = address - bo_address;
+      char *map = crocus_bo_map(batch->dbg, bo, MAP_READ);
+      return (struct intel_batch_decode_bo) {
+         .addr = address, .size = bo->size - skip, .map = map ? map + skip : NULL,
+      };
+   }
+
+   return (struct intel_batch_decode_bo) { };
+}
+
+/* Decoder callback: look up the recorded size of the state at address. */
+static unsigned
+decode_get_state_size(void *v_batch, uint64_t address,
+                      uint64_t base_address)
+{
+   struct crocus_batch *batch = v_batch;
+   const uint64_t key = address - base_address;
+
+   /* The decoder hands us an address plus the base it is relative to, and
+    * batch->state_sizes is keyed by the offset alone.  Binding tables use
+    * the surface state base and other state uses the dynamic state base, so
+    * keys could alias; surface offsets stay in [0, 64K) while dynamic state
+    * is assigned from the top of the 4GB range, which makes that unlikely
+    * in practice.  We should fix this but it's likely good enough for now.
+    */
+   void *stored = _mesa_hash_table_u64_search(batch->state_sizes, key);
+
+   return (unsigned)(uintptr_t)stored;
+}
+
+/** Map the first exec BO for reading and print its decoded commands. */
+static void
+decode_batch(struct crocus_batch *batch)
+{
+   struct crocus_bo *primary = batch->exec_bos[0];
+   void *cmds = crocus_bo_map(batch->dbg, primary, MAP_READ);
+
+   intel_print_batch(&batch->decoder, cmds, batch->primary_batch_size,
+                     primary->gtt_offset, false);
+}
+
+/* Start an empty relocation list with room for count entries. */
+static void
+init_reloc_list(struct crocus_reloc_list *rlist, int count)
+{
+   rlist->reloc_array_size = count;
+   rlist->relocs = malloc(rlist->reloc_array_size * sizeof(rlist->relocs[0]));
+   rlist->reloc_count = 0;
+}
+
+/**
+ * Set up ice->batches[name]: hardware context, relocation and validation
+ * lists, caches and (when DEBUG_BATCH is set) the decoder, then reset it.
+ */
+void
+crocus_init_batch(struct crocus_context *ice,
+                  enum crocus_batch_name name,
+                  int priority)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_batch *batch = &ice->batches[name];
+   const bool decode = (INTEL_DEBUG & DEBUG_BATCH) != 0;
+
+   batch->ice = ice;
+   batch->screen = screen;
+   batch->dbg = &ice->dbg;
+   batch->reset = &ice->reset;
+   batch->name = name;
+   batch->contains_fence_signal = false;
+
+   if (devinfo->ver >= 7) {
+      batch->fine_fences.uploader =
+         u_upload_create(&ice->ctx, 4096, PIPE_BIND_CUSTOM,
+                         PIPE_USAGE_STAGING, 0);
+   }
+   crocus_fine_fence_init(batch);
+
+   batch->hw_ctx_id = crocus_create_hw_context(screen->bufmgr);
+   assert(batch->hw_ctx_id);
+   crocus_hw_context_set_priority(screen->bufmgr, batch->hw_ctx_id, priority);
+
+   batch->valid_reloc_flags =
+      EXEC_OBJECT_WRITE | (devinfo->ver == 6 ? EXEC_OBJECT_NEEDS_GTT : 0);
+
+   /* The shadow doesn't get relocs written so state decode fails. */
+   batch->use_shadow_copy = !decode && !devinfo->has_llc;
+
+   util_dynarray_init(&batch->exec_fences, ralloc_context(NULL));
+   util_dynarray_init(&batch->syncobjs, ralloc_context(NULL));
+
+   init_reloc_list(&batch->command.relocs, 250);
+   init_reloc_list(&batch->state.relocs, 250);
+
+   batch->exec_count = 0;
+   batch->exec_array_size = 100;
+   batch->exec_bos =
+      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
+   batch->validation_list =
+      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
+
+   batch->cache.render = _mesa_hash_table_create(NULL, NULL,
+                                                 _mesa_key_pointer_equal);
+   batch->cache.depth = _mesa_set_create(NULL, NULL,
+                                         _mesa_key_pointer_equal);
+
+   /* Record every batch of the context except this one. */
+   memset(batch->other_batches, 0, sizeof(batch->other_batches));
+   for (int i = 0, j = 0; i < ice->batch_count; i++) {
+      if (i != name)
+         batch->other_batches[j++] = &ice->batches[i];
+   }
+
+   if (decode) {
+      unsigned decode_flags = INTEL_BATCH_DECODE_FULL |
+                              INTEL_BATCH_DECODE_OFFSETS |
+                              INTEL_BATCH_DECODE_FLOATS;
+      if (INTEL_DEBUG & DEBUG_COLOR)
+         decode_flags |= INTEL_BATCH_DECODE_IN_COLOR;
+
+      batch->state_sizes = _mesa_hash_table_u64_create(NULL);
+      intel_batch_decode_ctx_init(&batch->decoder, &screen->devinfo, stderr,
+                                  decode_flags, NULL, decode_get_bo,
+                                  decode_get_state_size, batch);
+      batch->decoder.max_vbo_decoded_lines = 32;
+   }
+
+   crocus_batch_reset(batch);
+}
+
+/* Find bo's slot in this batch's validation list, or NULL if it has none. */
+static struct drm_i915_gem_exec_object2 *
+find_validation_entry(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+   /* bo->index is only a hint: it goes stale when the BO is shared
+    * between multiple active batches, hence the linear fallback. */
+   unsigned hint = READ_ONCE(bo->index);
+   if (hint < batch->exec_count && batch->exec_bos[hint] == bo)
+      return &batch->validation_list[hint];
+
+   for (unsigned i = 0; i < batch->exec_count; i++) {
+      if (batch->exec_bos[i] == bo)
+         return &batch->validation_list[i];
+   }
+   return NULL;
+}
+
+/* Double the exec_bos/validation_list arrays until count more entries fit. */
+static void
+ensure_exec_obj_space(struct crocus_batch *batch, uint32_t count)
+{
+   while (batch->exec_array_size < batch->exec_count + count) {
+      batch->exec_array_size *= 2;
+      const size_t n = batch->exec_array_size;
+      batch->exec_bos = realloc(batch->exec_bos, n * sizeof(batch->exec_bos[0]));
+      batch->validation_list =
+         realloc(batch->validation_list, n * sizeof(batch->validation_list[0]));
+   }
+}
+
+/* Put bo on this batch's validation list (if not there yet); return its entry. */
+static struct drm_i915_gem_exec_object2 *
+crocus_use_bo(struct crocus_batch *batch, struct crocus_bo *bo, bool writable)
+{
+   assert(bo->bufmgr == batch->command.bo->bufmgr);
+   /* The context's workaround BO is never flagged as written. */
+   if (bo == batch->ice->workaround_bo)
+      writable = false;
+
+   struct drm_i915_gem_exec_object2 *existing_entry =
+      find_validation_entry(batch, bo);
+
+   if (existing_entry) {
+      /* The BO is already in the validation list; add the write flag if asked */
+      if (writable)
+         existing_entry->flags |= EXEC_OBJECT_WRITE;
+      return existing_entry;
+   }
+
+   if (bo != batch->command.bo && bo != batch->state.bo) {
+      /* This is the first time our batch has seen this BO.  Before we use it,
+       * we may need to flush and synchronize with other batches.
+       */
+      for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
+         if (!batch->other_batches[b])
+            continue;
+         struct drm_i915_gem_exec_object2 *other_entry =
+            find_validation_entry(batch->other_batches[b], bo);
+
+         /* If the buffer is referenced by another batch, and either batch
+          * intends to write it, then flush the other batch and synchronize.
+          *
+          * Consider these cases:
+          *
+          * 1. They read, we read   =>  No synchronization required.
+          * 2. They read, we write  =>  Synchronize (they need the old value)
+          * 3. They write, we read  =>  Synchronize (we need their new value)
+          * 4. They write, we write =>  Synchronize (order writes)
+          *
+          * The read/read case is very common, as multiple batches usually
+          * share a streaming state buffer or shader assembly buffer, and
+          * we want to avoid synchronizing in this case.
+          */
+         if (other_entry &&
+             ((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
+            crocus_batch_flush(batch->other_batches[b]);
+            crocus_batch_add_syncobj(batch,
+                                     batch->other_batches[b]->last_fence->syncobj,
+                                     I915_EXEC_FENCE_WAIT);
+         }
+      }
+   }
+
+   /* Bump the ref count since the batch is now using this bo. */
+   crocus_bo_reference(bo);
+
+   ensure_exec_obj_space(batch, 1);
+
+   batch->validation_list[batch->exec_count] =
+      (struct drm_i915_gem_exec_object2) {
+         .handle = bo->gem_handle,
+         .offset = bo->gtt_offset,
+         .flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
+      };
+
+   bo->index = batch->exec_count;
+   batch->exec_bos[batch->exec_count] = bo;
+   batch->aperture_space += bo->size;
+
+   batch->exec_count++;
+
+   return &batch->validation_list[batch->exec_count - 1];
+}
+
+/* Record a relocation to target in rlist; returns the presumed address. */
+static uint64_t
+emit_reloc(struct crocus_batch *batch,
+           struct crocus_reloc_list *rlist, uint32_t offset,
+           struct crocus_bo *target, int32_t target_offset,
+           unsigned int reloc_flags)
+{
+   assert(target != NULL);
+   bool writable = reloc_flags & RELOC_WRITE;
+
+   struct drm_i915_gem_exec_object2 *entry =
+      crocus_use_bo(batch, target, writable);
+
+   if (rlist->reloc_count == rlist->reloc_array_size) {
+      rlist->reloc_array_size *= 2;
+      rlist->relocs = realloc(rlist->relocs, rlist->reloc_array_size *
+                                             sizeof(rlist->relocs[0]));
+   }
+
+   if (reloc_flags & RELOC_32BIT) {
+      /* Restrict this buffer to the low 32 bits of the address space.
+       *
+       * Altering the validation list flags restricts it for this batch,
+       * but we also alter the BO's kflags to restrict it permanently
+       * (until the BO is destroyed and put back in the cache).  Buffers
+       * may stay bound across batches, and we want keep it constrained.
+       */
+      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
+      reloc_flags &= ~RELOC_32BIT;
+   }
+
+   entry->flags |= reloc_flags & batch->valid_reloc_flags;
+
+   /* bo->index can name a slot in another batch when the BO is shared, so
+    * use this batch's own slot: the entry crocus_use_bo() returned. */
+   rlist->relocs[rlist->reloc_count++] =
+      (struct drm_i915_gem_relocation_entry) {
+         .offset = offset,
+         .delta = target_offset,
+         .target_handle = entry - batch->validation_list,
+         .presumed_offset = entry->offset,
+      };
+
+   /* Using the old buffer offset, write in what the right data would be, in
+    * case the buffer doesn't move and we can short-circuit the relocation
+    * processing in the kernel
+    */
+   return entry->offset + target_offset;
+}
+
+/* Emit a relocation located at batch_offset inside the command buffer. */
+uint64_t
+crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+                     struct crocus_bo *target, uint32_t target_offset,
+                     unsigned int reloc_flags)
+{
+   assert(batch_offset <= batch->command.bo->size - sizeof(uint32_t));
+   return emit_reloc(batch, &batch->command.relocs, batch_offset,
+                     target, target_offset, reloc_flags);
+}
+
+/* Emit a relocation located at state_offset inside the state buffer. */
+uint64_t
+crocus_state_reloc(struct crocus_batch *batch, uint32_t state_offset,
+                   struct crocus_bo *target, uint32_t target_offset,
+                   unsigned int reloc_flags)
+{
+   assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
+   return emit_reloc(batch, &batch->state.relocs, state_offset,
+                     target, target_offset, reloc_flags);
+}
+
+/* Give grow a fresh BO of the given size and restart its CPU-side map. */
+static void
+recreate_growing_buffer(struct crocus_batch *batch,
+                        struct crocus_growing_bo *grow,
+                        const char *name, unsigned size)
+{
+   grow->bo = crocus_bo_alloc(batch->screen->bufmgr, name, size);
+   grow->bo->kflags |= EXEC_OBJECT_CAPTURE;
+   grow->partial_bytes = 0;
+   grow->partial_bo_map = NULL;
+   grow->partial_bo = NULL;
+   /* With a shadow copy the map is heap memory, otherwise the BO's mapping. */
+   if (!batch->use_shadow_copy)
+      grow->map = crocus_bo_map(NULL, grow->bo, MAP_READ | MAP_WRITE);
+   else
+      grow->map = realloc(grow->map, grow->bo->size);
+   grow->map_next = grow->map;
+}
+
+/* Allocate new command and state buffers and put both of them on the
+ * batch's exec list. */
+static void
+create_batch(struct crocus_batch *batch)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   const unsigned command_size = BATCH_SZ + BATCH_RESERVED(devinfo);
+
+   /* The command buffer goes first; crocus_batch_reset() asserts that it
+    * ends up at exec list index 0. */
+   recreate_growing_buffer(batch, &batch->command, "command buffer",
+                           command_size);
+   crocus_use_bo(batch, batch->command.bo, false);
+
+   recreate_growing_buffer(batch, &batch->state, "state buffer", STATE_SZ);
+   batch->state.used = 1;
+   crocus_use_bo(batch, batch->state.bo, false);
+}
+
+/* When no-op mode is enabled, make the (still empty) batch end immediately. */
+static void
+crocus_batch_maybe_noop(struct crocus_batch *batch)
+{
+   /* We only insert the NOOP at the beginning of the batch. */
+   assert(crocus_batch_bytes_used(batch) == 0);
+
+   if (!batch->noop_enabled)
+      return;
+
+   /* Emit MI_BATCH_BUFFER_END to prevent any further command to be
+    * executed.
+    */
+   uint32_t *cmd = batch->command.map_next;
+   cmd[0] = (0xA << 23);
+   batch->command.map_next += 4;
+}
+
+/* Start a new batch: fresh buffers, cleared flags and a new signal syncobj. */
+static void
+crocus_batch_reset(struct crocus_batch *batch)
+{
+   struct crocus_screen *screen = batch->screen;
+
+   crocus_bo_unreference(batch->command.bo);
+   crocus_bo_unreference(batch->state.bo);
+   batch->primary_batch_size = 0;
+   batch->contains_draw = false;
+   batch->contains_fence_signal = false;
+   batch->state_base_address_emitted = false;
+   screen->vtbl.batch_reset_dirty(batch);
+
+   create_batch(batch);
+   assert(batch->command.bo->index == 0);
+   if (batch->state_sizes)
+      _mesa_hash_table_u64_clear(batch->state_sizes);
+   /* Drop our local reference to the syncobj after adding it to the batch. */
+   struct crocus_syncobj *signal = crocus_create_syncobj(screen);
+   crocus_batch_add_syncobj(batch, signal, I915_EXEC_FENCE_SIGNAL);
+   crocus_syncobj_reference(screen, &signal, NULL);
+   crocus_cache_sets_clear(batch);
+}
+
+/* Release the batch's buffers, lists, fences, caches and hardware context. */
+void
+crocus_batch_free(struct crocus_batch *batch)
+{
+   struct crocus_screen *screen = batch->screen;
+   struct crocus_bufmgr *bufmgr = screen->bufmgr;
+
+   if (batch->use_shadow_copy) {
+      free(batch->command.map);
+      free(batch->state.map);
+   }
+
+   for (int i = 0; i < batch->exec_count; i++)
+      crocus_bo_unreference(batch->exec_bos[i]);
+
+   pipe_resource_reference(&batch->fine_fences.ref.res, NULL);
+   free(batch->command.relocs.relocs);
+   free(batch->state.relocs.relocs);
+   free(batch->exec_bos);
+   free(batch->validation_list);
+
+   ralloc_free(batch->exec_fences.mem_ctx);
+   util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+      crocus_syncobj_reference(screen, s, NULL);
+   ralloc_free(batch->syncobjs.mem_ctx);
+
+   crocus_fine_fence_reference(batch->screen, &batch->last_fence, NULL);
+   if (batch_has_fine_fence(batch))
+      u_upload_destroy(batch->fine_fences.uploader);
+
+   /* The loop above only dropped the exec-list references; release the
+    * allocation reference of both buffers, as crocus_batch_reset() does. */
+   crocus_bo_unreference(batch->command.bo);
+   crocus_bo_unreference(batch->state.bo);
+   batch->command.bo = batch->state.bo = NULL;
+   batch->command.map = batch->state.map = NULL;
+   batch->command.map_next = batch->state.map_next = NULL;
+
+   crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+   _mesa_hash_table_destroy(batch->cache.render, NULL);
+   _mesa_set_destroy(batch->cache.depth, NULL);
+
+   if (batch->state_sizes) {
+      _mesa_hash_table_u64_destroy(batch->state_sizes);
+      intel_batch_decode_ctx_finish(&batch->decoder);
+   }
+}
+
+/**
+ * Flush when the batch has chained into a secondary buffer, or when adding
+ * estimate more bytes would reach BATCH_SZ.  Only call this between draws.
+ */
+void
+crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate)
+{
+   const bool chained = batch->command.bo != batch->exec_bos[0];
+
+   if (chained || crocus_batch_bytes_used(batch) + estimate >= BATCH_SZ)
+      crocus_batch_flush(batch);
+}
+
+/**
+ * Complete a deferred grow: copy the bytes that were in use from the old
+ * map into the new one and drop the reference on the old BO.
+ */
+static void
+finish_growing_bos(struct crocus_growing_bo *grow)
+{
+   struct crocus_bo *stale = grow->partial_bo;
+
+   if (stale) {
+      memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
+
+      grow->partial_bytes = 0;
+      grow->partial_bo_map = NULL;
+      grow->partial_bo = NULL;
+
+      crocus_bo_unreference(stale);
+   }
+}
+
+/* Swap the command or state buffer for a new_size BO; the copy is deferred. */
+void
+crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+                   unsigned used,
+                   unsigned new_size)
+{
+   struct crocus_screen *screen = batch->screen;
+   struct crocus_bufmgr *bufmgr = screen->bufmgr;
+   struct crocus_growing_bo *grow = grow_state ? &batch->state : &batch->command;
+   struct crocus_bo *bo = grow->bo;
+
+   if (grow->partial_bo) {
+      /* We've already grown once, and now we need to do it again.
+       * Finish our last grow operation so we can start a new one.
+       * This should basically never happen. */
+      finish_growing_bos(grow);
+   }
+
+   struct crocus_bo *new_bo = crocus_bo_alloc(bufmgr, bo->name, new_size);
+
+   /* Remember the old map; finish_growing_bos() copies from it later. */
+   grow->partial_bo_map = grow->map;
+
+   if (batch->use_shadow_copy) {
+      /* We can't safely use realloc, as it may move the existing buffer,
+       * breaking existing pointers the caller may still be using.  Just
+       * malloc a new copy and memcpy it like the normal BO path.
+       *
+       * Use new_bo->size rather than new_size because the bufmgr may have
+       * rounded up the size, and we want the shadow size to match.
+       */
+      grow->map = malloc(new_bo->size);
+   } else {
+      grow->map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
+   }
+   /* Try to put the new BO at the same GTT offset as the old BO (which
+    * we're throwing away, so it doesn't need to be there).
+    *
+    * This guarantees that our relocations continue to work: values we've
+    * already written into the buffer, values we're going to write into the
+    * buffer, and the validation/relocation lists all will match.
+    *
+    * Also preserve kflags for EXEC_OBJECT_CAPTURE.
+    */
+   new_bo->gtt_offset = bo->gtt_offset;
+   new_bo->index = bo->index;
+   new_bo->kflags = bo->kflags;
+
+   /* Batch/state buffers are per-context, and if we've run out of space,
+    * we must have actually used them before, so...they will be in the list.
+    */
+   assert(bo->index < batch->exec_count);
+   assert(batch->exec_bos[bo->index] == bo);
+
+   /* Update the validation list to use the new BO. */
+   batch->validation_list[bo->index].handle = new_bo->gem_handle;
+   /* Exchange the two BOs...without breaking pointers to the old BO.
+    *
+    * Consider this scenario:
+    *
+    * 1. Somebody allocates a region of state memory and builds an address
+    *    that points at batch->state.bo.
+    * 2. They then allocate a second region, which happens to grow and
+    *    replace the state buffer.  They then try to emit a relocation to
+    *    their first section of memory.
+    *
+    * If we replaced the batch->state.bo pointer at step 2, we would break
+    * the address created in step 1.  They'd have a pointer to the old
+    * destroyed BO.  Emitting a relocation would add this dead BO to the
+    * validation list...causing /both/ statebuffers to be in the list, and
+    * all kinds of disasters.
+    *
+    * This is not a contrived case - BLORP vertex data upload hits this.
+    *
+    * There are worse scenarios too.  Fences for sync objects may reference
+    * the batch BO.  If we replaced the batch pointer when growing, we'd
+    * need to chase down every fence and update it to point to the new BO.
+    * Otherwise, it would refer to a "batch" that never actually gets
+    * submitted, and would fail to trigger.
+    *
+    * To work around both of these issues, we transmutate the buffers in
+    * place, making the existing struct crocus_bo represent the new buffer,
+    * and "new_bo" represent the old BO.  This is highly unusual, but it
+    * seems like a necessary evil.
+    *
+    * We also defer the memcpy of the existing batch's contents.  Callers
+    * may make multiple state allocations, and retain pointers to the old
+    * BO's map.  We'll perform the memcpy in finish_growing_bos() when we
+    * finally submit the batch, at which point we've finished uploading
+    * state, and nobody should have any old references anymore.
+    *
+    * To do that, we keep a reference to the old BO in grow->partial_bo,
+    * and store the number of bytes to copy in grow->partial_bytes.  We
+    * can monkey with the refcounts directly without atomics because these
+    * are per-context BOs and they can only be touched by this thread.
+    */
+   assert(new_bo->refcount == 1);
+   new_bo->refcount = bo->refcount;
+   bo->refcount = 1;
+
+   struct crocus_bo tmp;
+   memcpy(&tmp, bo, sizeof(struct crocus_bo));
+   memcpy(bo, new_bo, sizeof(struct crocus_bo));
+   memcpy(new_bo, &tmp, sizeof(struct crocus_bo));
+
+   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
+   grow->partial_bytes = used;
+}
+
+/* Make a new end-of-batch fine fence the batch's last_fence, if one is made. */
+static void
+finish_seqno(struct crocus_batch *batch)
+{
+   struct crocus_fine_fence *end = crocus_fine_fence_new(batch, CROCUS_FENCE_END);
+   if (end) {
+      crocus_fine_fence_reference(batch->screen, &batch->last_fence, end);
+      crocus_fine_fence_reference(batch->screen, &end, NULL);
+   }
+}
+
+/**
+ * Close the batch: run the optional finish_batch hook and the end fence with
+ * no_wrap set, then append MI_BATCH_BUFFER_END.
+ */
+static void
+crocus_finish_batch(struct crocus_batch *batch)
+{
+   struct crocus_screen *screen = batch->screen;
+
+   batch->no_wrap = true;
+   if (screen->vtbl.finish_batch)
+      screen->vtbl.finish_batch(batch);
+   finish_seqno(batch);
+
+   /* Emit MI_BATCH_BUFFER_END to finish our batch. */
+   uint32_t *end_cmd = batch->command.map_next;
+   end_cmd[0] = (0xA << 23);
+   batch->command.map_next += 4;
+   VG(VALGRIND_CHECK_MEM_IS_DEFINED(batch->command.map, crocus_batch_bytes_used(batch)));
+
+   /* Only a batch whose command BO is still exec_bos[0] records its size. */
+   if (batch->command.bo == batch->exec_bos[0])
+      batch->primary_batch_size = crocus_batch_bytes_used(batch);
+   batch->no_wrap = false;
+}
+
+/**
+ * Swap the batch's GEM context for a clone of it (in case it got banned).
+ * Returns false, leaving the old context in place, when cloning fails.
+ */
+static bool
+replace_hw_ctx(struct crocus_batch *batch)
+{
+   struct crocus_bufmgr *bufmgr = batch->screen->bufmgr;
+   uint32_t clone = crocus_clone_hw_context(bufmgr, batch->hw_ctx_id);
+
+   if (clone == 0)
+      return false;
+
+   crocus_destroy_hw_context(bufmgr, batch->hw_ctx_id);
+   batch->hw_ctx_id = clone;
+
+   /* Notify the context that state must be re-initialized. */
+   crocus_lost_context_state(batch);
+
+   return true;
+}
+
+/* Query the kernel's reset statistics for the batch's context and report
+ * whether it was guilty of, or an innocent victim of, a GPU reset. */
+enum pipe_reset_status
+crocus_batch_check_for_reset(struct crocus_batch *batch)
+{
+   struct drm_i915_reset_stats stats = { .ctx_id = batch->hw_ctx_id };
+   enum pipe_reset_status status;
+
+   if (drmIoctl(batch->screen->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
+      DBG("DRM_IOCTL_I915_GET_RESET_STATS failed: %s\n", strerror(errno));
+
+   if (stats.batch_active != 0) {
+      /* A reset was observed while a batch from this hardware context was
+       * executing.  Assume that this context was at fault.
+       */
+      status = PIPE_GUILTY_CONTEXT_RESET;
+   } else if (stats.batch_pending != 0) {
+      /* A reset was observed while a batch from this context was in progress,
+       * but the batch was not executing.  In this case, assume that the
+       * context was not at fault.
+       */
+      status = PIPE_INNOCENT_CONTEXT_RESET;
+   } else {
+      return PIPE_NO_RESET;
+   }
+
+   /* Our context is likely banned, or at least in an unknown state.
+    * Throw it away and start with a fresh context.  Ideally this may
+    * catch the problem before our next execbuf fails with -EIO.
+    */
+   replace_hw_ctx(batch);
+   return status;
+}
+
+/**
+ * Submit the batch to the GPU via execbuffer2; returns 0 or -errno.
+ */
+static int
+submit_batch(struct crocus_batch *batch)
+{
+   /* When shadowing, copy the CPU-side command and state data into the BOs. */
+   if (batch->use_shadow_copy) {
+      void *bo_map = crocus_bo_map(batch->dbg, batch->command.bo, MAP_WRITE);
+      memcpy(bo_map, batch->command.map, crocus_batch_bytes_used(batch));
+      /* NOTE(review): neither crocus_bo_map() result is checked for NULL. */
+      bo_map = crocus_bo_map(batch->dbg, batch->state.bo, MAP_WRITE);
+      memcpy(bo_map, batch->state.map, batch->state.used);
+   }
+
+   crocus_bo_unmap(batch->command.bo);
+   crocus_bo_unmap(batch->state.bo);
+
+   /* The requirements for using I915_EXEC_NO_RELOC are:
+    *
+    *   The addresses written in the objects must match the corresponding
+    *   reloc.gtt_offset which in turn must match the corresponding
+    *   execobject.offset.
+    *
+    *   Any render targets written to in the batch must be flagged with
+    *   EXEC_OBJECT_WRITE.
+    *
+    *   To avoid stalling, execobject.offset should match the current
+    *   address of that object within the active context.
+    */
+   /* Set statebuffer relocations */
+   const unsigned state_index = batch->state.bo->index;
+   if (state_index < batch->exec_count &&
+       batch->exec_bos[state_index] == batch->state.bo) {
+      struct drm_i915_gem_exec_object2 *entry =
+         &batch->validation_list[state_index];
+      assert(entry->handle == batch->state.bo->gem_handle);
+      entry->relocation_count = batch->state.relocs.reloc_count;
+      entry->relocs_ptr = (uintptr_t)batch->state.relocs.relocs;
+   }
+
+   /* Set batchbuffer relocations */
+   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
+   assert(entry->handle == batch->command.bo->gem_handle);
+   entry->relocation_count = batch->command.relocs.reloc_count;
+   entry->relocs_ptr = (uintptr_t)batch->command.relocs.relocs;
+
+   struct drm_i915_gem_execbuffer2 execbuf = {
+      .buffers_ptr = (uintptr_t)batch->validation_list,
+      .buffer_count = batch->exec_count,
+      .batch_start_offset = 0,
+      /* This must be QWord aligned. */
+      .batch_len = ALIGN(batch->primary_batch_size, 8),
+      .flags = I915_EXEC_RENDER |
+               I915_EXEC_NO_RELOC |
+               I915_EXEC_BATCH_FIRST |
+               I915_EXEC_HANDLE_LUT,
+      .rsvd1 = batch->hw_ctx_id, /* rsvd1 is actually the context ID */
+   };
+
+   if (num_fences(batch)) {
+      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
+      execbuf.num_cliprects = num_fences(batch);
+      execbuf.cliprects_ptr =
+         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
+   }
+
+   int ret = 0;
+   if (!batch->screen->no_hw &&
+       intel_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf))
+      ret = -errno;
+
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct crocus_bo *bo = batch->exec_bos[i];
+
+      bo->idle = false;
+      bo->index = -1;
+
+      /* Update crocus_bo::gtt_offset */
+      if (batch->validation_list[i].offset != bo->gtt_offset) {
+         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
+             bo->gem_handle, bo->gtt_offset,
+             batch->validation_list[i].offset);
+         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
+         bo->gtt_offset = batch->validation_list[i].offset;
+      }
+   }
+
+   return ret;
+}
+
+static const char *
+batch_name_to_string(enum crocus_batch_name name)
+{
+   /* Read-only table with static storage, not rebuilt on every call. */
+   static const char *const names[CROCUS_BATCH_COUNT] = {
+      [CROCUS_BATCH_RENDER] = "render", [CROCUS_BATCH_COMPUTE] = "compute",
+   };
+   return names[name];
+}
+
+/**
+ * Flush the batch buffer, submitting it to the GPU and resetting it so
+ * we're ready to emit the next batch.
+ *
+ * \param file and \param line identify the call site (the crocus_batch_flush
+ * macro passes __FILE__ and __LINE__); they are only used in the debug
+ * message printed when INTEL_DEBUG batch/submit/pipe-control output is on.
+ *
+ * A failed submission other than a recovered -EIO aborts the process.
+ */
+void
+_crocus_batch_flush(struct crocus_batch *batch, const char *file, int line)
+{
+   struct crocus_screen *screen = batch->screen;
+
+   /* If a fence signals we need to flush it. */
+   if (crocus_batch_bytes_used(batch) == 0 && !batch->contains_fence_signal)
+      return;
+
+   assert(!batch->no_wrap);
+   crocus_finish_batch(batch);
+
+   finish_growing_bos(&batch->command);
+   finish_growing_bos(&batch->state);
+   int ret = submit_batch(batch);
+
+   if (unlikely(INTEL_DEBUG &
+                (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL))) {
+      int bytes_for_commands = crocus_batch_bytes_used(batch);
+      int second_bytes = 0;
+      if (batch->command.bo != batch->exec_bos[0]) {
+         second_bytes = bytes_for_commands;
+         bytes_for_commands += batch->primary_batch_size;
+      }
+      fprintf(stderr, "%19s:%-3d: %s batch [%u] flush with %5d+%5db (%0.1f%%) "
+              "(cmds), %4d BOs (%0.1fMb aperture),"
+              " %4d command relocs, %4d state relocs\n",
+              file, line, batch_name_to_string(batch->name), batch->hw_ctx_id,
+              batch->primary_batch_size, second_bytes,
+              100.0f * bytes_for_commands / BATCH_SZ,
+              batch->exec_count,
+              (float) batch->aperture_space / (1024 * 1024),
+              batch->command.relocs.reloc_count,
+              batch->state.relocs.reloc_count);
+
+      if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT)) {
+         dump_fence_list(batch);
+         dump_validation_list(batch);
+      }
+
+      if (INTEL_DEBUG & DEBUG_BATCH) {
+         decode_batch(batch);
+      }
+   }
+
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct crocus_bo *bo = batch->exec_bos[i];
+      crocus_bo_unreference(bo);
+   }
+
+   batch->command.relocs.reloc_count = 0;
+   batch->state.relocs.reloc_count = 0;
+   batch->exec_count = 0;
+   batch->aperture_space = 0;
+
+   util_dynarray_foreach(&batch->syncobjs, struct crocus_syncobj *, s)
+      crocus_syncobj_reference(screen, s, NULL);
+   util_dynarray_clear(&batch->syncobjs);
+
+   util_dynarray_clear(&batch->exec_fences);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
+      dbg_printf("waiting for idle\n");
+      crocus_bo_wait_rendering(batch->command.bo); /* if execbuf failed; this is a nop */
+   }
+
+   /* Start a new batch buffer. */
+   crocus_batch_reset(batch);
+
+   /* EIO means our context is banned.  In this case, try and replace it
+    * with a new logical context, and inform crocus_context that all state
+    * has been lost and needs to be re-initialized.  If this succeeds,
+    * dubiously claim success...
+    */
+   if (ret == -EIO && replace_hw_ctx(batch)) {
+      if (batch->reset->reset) {
+         /* Tell the state tracker the device is lost and it was our fault. */
+         batch->reset->reset(batch->reset->data, PIPE_GUILTY_CONTEXT_RESET);
+      }
+
+      ret = 0;
+   }
+
+   if (ret < 0) {
+#ifdef DEBUG
+      const bool color = INTEL_DEBUG & DEBUG_COLOR;
+      fprintf(stderr, "%scrocus: Failed to submit batchbuffer: %-80s%s\n",
+              color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
+#endif
+      abort();
+   }
+}
+
+/**
+ * Report whether \p bo is referenced by the batch currently being built,
+ * that is, whether find_validation_entry() locates it in the validation
+ * list.
+ */
+bool
+crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+   return !!find_validation_entry(batch, bo);
+}
+
+/**
+ * Updates the state of the noop feature.  Returns true if there was a noop
+ * transition that led to state invalidation.
+ */
+bool
+crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable)
+{
+   /* No transition requested: nothing to flush, nothing invalidated. */
+   if (batch->noop_enabled == noop_enable)
+      return false;
+
+   batch->noop_enabled = noop_enable;
+   crocus_batch_flush(batch);
+
+   /* If the batch was empty, flush had no effect, so insert our noop. */
+   if (crocus_batch_bytes_used(batch) == 0)
+      crocus_batch_maybe_noop(batch);
+
+   /* We only need to update the entire state if we transition from noop ->
+    * not-noop.
+    */
+   return !batch->noop_enabled;
+}
diff --git a/src/gallium/drivers/crocus/crocus_batch.h b/src/gallium/drivers/crocus/crocus_batch.h
new file mode 100644
index 00000000000..fe6857d83ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_batch.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BATCH_DOT_H
+#define CROCUS_BATCH_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "util/u_dynarray.h"
+
+#include "common/intel_decoder.h"
+#include "drm-uapi/i915_drm.h"
+
+#include "crocus_fence.h"
+#include "crocus_fine_fence.h"
+
+#include "crocus_bufmgr.h"
+/* The kernel assumes batchbuffers are smaller than 256kB. */
+#define MAX_BATCH_SIZE (256 * 1024)
+
+/* 3DSTATE_BINDING_TABLE_POINTERS has a U16 offset from Surface State Base
+ * Address, which means that we can't put binding tables beyond 64kB.  This
+ * effectively limits the maximum statebuffer size to 64kB.
+ */
+#define MAX_STATE_SIZE (64 * 1024)
+
+/* Our target batch size - flush approximately at this point. */
+#define BATCH_SZ (20 * 1024)
+#define STATE_SZ (16 * 1024)
+
+enum crocus_batch_name {
+   CROCUS_BATCH_RENDER,
+   CROCUS_BATCH_COMPUTE,
+};
+
+#define CROCUS_BATCH_COUNT 2
+
+struct crocus_address {
+   struct crocus_bo *bo;
+   int32_t offset;
+   uint32_t reloc_flags;
+};
+
+struct crocus_reloc_list {
+   struct drm_i915_gem_relocation_entry *relocs;
+   int reloc_count;
+   int reloc_array_size;
+};
+
+struct crocus_growing_bo {
+   struct crocus_bo *bo;
+   void *map;
+   void *map_next;
+   struct crocus_bo *partial_bo;
+   void *partial_bo_map;
+   unsigned partial_bytes;
+   struct crocus_reloc_list relocs;
+   unsigned used;
+};
+
+struct crocus_batch {
+   struct crocus_context *ice;
+   struct crocus_screen *screen;
+   struct pipe_debug_callback *dbg;
+   struct pipe_device_reset_callback *reset;
+
+   /** What batch is this? (e.g. CROCUS_BATCH_RENDER/COMPUTE) */
+   enum crocus_batch_name name;
+
+   /** buffers: command, state */
+   struct crocus_growing_bo command, state;
+
+   /** Size of the primary batch if we've moved on to a secondary. */
+   unsigned primary_batch_size;
+
+   bool state_base_address_emitted;
+   uint8_t pipe_controls_since_last_cs_stall;
+   /** Hardware context ID; handed to execbuffer2 in its rsvd1 field. */
+   uint32_t hw_ctx_id;
+
+   uint32_t valid_reloc_flags;
+
+   bool use_shadow_copy;
+   bool no_wrap;
+
+   /** The validation list */
+   struct drm_i915_gem_exec_object2 *validation_list;
+   struct crocus_bo **exec_bos;
+   int exec_count;
+   int exec_array_size;
+
+   /** Whether INTEL_BLACKHOLE_RENDER is enabled in the batch (aka first
+    * instruction is a MI_BATCH_BUFFER_END).
+    */
+   bool noop_enabled;
+
+   /**
+    * A list of crocus_syncobjs associated with this batch.
+    *
+    * The first list entry will always be a signalling sync-point, indicating
+    * that this batch has completed.  The others are likely to be sync-points
+    * to wait on before executing the batch.
+    */
+   struct util_dynarray syncobjs;
+
+   /** A list of drm_i915_exec_fences to have execbuf signal or wait on */
+   struct util_dynarray exec_fences;
+
+   /** The amount of aperture space (in bytes) used by all exec_bos */
+   int aperture_space;
+
+   struct {
+      /** Uploader to use for sequence numbers */
+      struct u_upload_mgr *uploader;
+
+      /** GPU buffer and CPU map where our seqno's will be written. */
+      struct crocus_state_ref ref;
+      uint32_t *map;
+
+      /** The sequence number to write the next time we add a fence. */
+      uint32_t next;
+   } fine_fences;
+
+   /** A seqno (and syncobj) for the last batch that was submitted. */
+   struct crocus_fine_fence *last_fence;
+
+   /** List of other batches which we might need to flush to use a BO */
+   struct crocus_batch *other_batches[CROCUS_BATCH_COUNT - 1];
+
+   struct {
+      /**
+       * Set of BOs that have been rendered to within this
+       * batchbuffer and would need flushing before being used from another
+       * cache domain that isn't coherent with it (i.e. the sampler).
+       */
+      struct hash_table *render;
+
+      /**
+       * Set of BOs that have been used as a depth buffer within
+       * this batchbuffer and would need flushing before being used from
+       * another cache domain that isn't coherent with it (i.e. the sampler).
+       */
+      struct set *depth;
+   } cache;
+
+   struct intel_batch_decode_ctx decoder;
+   struct hash_table_u64 *state_sizes;
+
+   /** Have we emitted any draw calls to this batch? */
+   bool contains_draw;
+
+   /** Batch contains fence signal operation. */
+   bool contains_fence_signal;
+};
+
+static inline bool
+batch_has_fine_fence(struct crocus_batch *batch)
+{
+   return batch->fine_fences.uploader != NULL;
+}
+
+#define BATCH_HAS_FINE_FENCES(batch) (!!(batch)->fine_fences.uploader)
+void crocus_init_batch(struct crocus_context *ctx,
+                       enum crocus_batch_name name,
+                       int priority);
+void crocus_batch_free(struct crocus_batch *batch);
+void crocus_batch_maybe_flush(struct crocus_batch *batch, unsigned estimate);
+
+void _crocus_batch_flush(struct crocus_batch *batch, const char *file, int line);
+#define crocus_batch_flush(batch) _crocus_batch_flush((batch), __FILE__, __LINE__)
+
+bool crocus_batch_references(struct crocus_batch *batch, struct crocus_bo *bo);
+
+bool crocus_batch_prepare_noop(struct crocus_batch *batch, bool noop_enable);
+
+#define RELOC_WRITE EXEC_OBJECT_WRITE
+#define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
+/* Inverted meaning, but using the same bit...emit_reloc will flip it. */
+#define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
+
+void crocus_use_pinned_bo(struct crocus_batch *batch, struct crocus_bo *bo,
+                          bool writable);
+uint64_t crocus_command_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+                              struct crocus_bo *target, uint32_t target_offset,
+                              unsigned int reloc_flags);
+uint64_t crocus_state_reloc(struct crocus_batch *batch, uint32_t batch_offset,
+                            struct crocus_bo *target, uint32_t target_offset,
+                            unsigned int reloc_flags);
+
+enum pipe_reset_status crocus_batch_check_for_reset(struct crocus_batch *batch);
+
+void crocus_grow_buffer(struct crocus_batch *batch, bool grow_state,
+                        unsigned used, unsigned new_size);
+
+static inline unsigned
+crocus_batch_bytes_used(struct crocus_batch *batch)
+{
+   return (char *)batch->command.map_next - (char *)batch->command.map;
+}
+
+/**
+ * Ensure the current command buffer has \param size bytes of space
+ * remaining.  If the batch would reach BATCH_SZ (and wrapping is allowed)
+ * it is flushed; otherwise the command buffer is grown when it is too small.
+ *
+ * Most callers want crocus_get_command_space() instead.
+ */
+static inline void
+crocus_require_command_space(struct crocus_batch *batch, unsigned size)
+{
+   const unsigned used = crocus_batch_bytes_used(batch);
+   const unsigned required_bytes = used + size;
+   if (required_bytes >= BATCH_SZ && !batch->no_wrap) {
+      crocus_batch_flush(batch);
+   } else if (required_bytes >= batch->command.bo->size) {
+      /* Grow by half, but never past what the kernel accepts. */
+      const unsigned new_size =
+         MIN2(batch->command.bo->size + batch->command.bo->size / 2,
+              MAX_BATCH_SIZE);
+      crocus_grow_buffer(batch, false, used, new_size);
+      batch->command.map_next = (char *)batch->command.map + used;
+      assert(crocus_batch_bytes_used(batch) + size < batch->command.bo->size);
+   }
+}
+
+/**
+ * Allocate space in the current command buffer, and return a pointer
+ * to the mapped area so the caller can write commands there.
+ *
+ * This should be called whenever emitting commands.
+ */
+static inline void *
+crocus_get_command_space(struct crocus_batch *batch, unsigned bytes)
+{
+   crocus_require_command_space(batch, bytes);
+   void *map = batch->command.map_next;
+   batch->command.map_next = (char *)map + bytes;
+   return map;
+}
+
+/**
+ * Copy \p size bytes of pre-built commands from \p data into the batch.
+ */
+static inline void
+crocus_batch_emit(struct crocus_batch *batch, const void *data, unsigned size)
+{
+   /* Reserve room first; the returned pointer is where the copy lands. */
+   memcpy(crocus_get_command_space(batch, size), data, size);
+}
+
+/**
+ * Return the batch's signalling syncobj without taking a reference.
+ */
+static inline struct crocus_syncobj *
+crocus_batch_get_signal_syncobj(struct crocus_batch *batch)
+{
+   /* The signalling syncobj occupies the first slot of the list. */
+   struct crocus_syncobj **list =
+      (struct crocus_syncobj **)util_dynarray_begin(&batch->syncobjs);
+   return list[0];
+}
+
+/**
+ * Take a reference to the batch's signalling syncobj.
+ *
+ * Callers can use this to wait for the current batch under construction
+ * to complete (after flushing it).
+ */
+static inline void
+crocus_batch_reference_signal_syncobj(struct crocus_batch *batch,
+                                      struct crocus_syncobj **out_syncobj)
+{
+   crocus_syncobj_reference(batch->screen, out_syncobj,
+                            crocus_batch_get_signal_syncobj(batch));
+}
+
+/**
+ * Record a state's size for INTEL_DEBUG=bat printing; no-op if \p ht is NULL.
+ */
+static inline void
+crocus_record_state_size(struct hash_table_u64 *ht, uint32_t offset_from_base,
+                         uint32_t size)
+{
+   if (!ht)
+      return;
+   _mesa_hash_table_u64_insert(ht, offset_from_base,
+                               (void *)(uintptr_t)size);
+}
+
+static inline bool
+crocus_ptr_in_state_buffer(struct crocus_batch *batch, void *p)
+{
+   const char *base = batch->state.map, *ptr = p;
+   return ptr >= base && ptr < base + batch->state.bo->size;
+}
+
+static inline void
+crocus_require_statebuffer_space(struct crocus_batch *batch, int size)
+{
+   if (batch->state.used + size >= STATE_SZ)
+      crocus_batch_flush(batch);
+}
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_blit.c b/src/gallium/drivers/crocus/crocus_blit.c
new file mode 100644
index 00000000000..9cae82e3e2d
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blit.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/format/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/ralloc.h"
+#include "intel/blorp/blorp.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond)
+{
+   util_blitter_save_vertex_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_VERTEX]);
+   util_blitter_save_tessctrl_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL]);
+   util_blitter_save_tesseval_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]);
+   util_blitter_save_geometry_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]);
+   util_blitter_save_so_targets(ice->blitter, ice->state.so_targets,
+                                (struct pipe_stream_output_target**)ice->state.so_target);
+   util_blitter_save_vertex_buffer_slot(ice->blitter, ice->state.vertex_buffers);
+   util_blitter_save_vertex_elements(ice->blitter, (void *)ice->state.cso_vertex_elements);
+   if (op & CROCUS_SAVE_FRAGMENT_STATE) {
+      util_blitter_save_blend(ice->blitter, ice->state.cso_blend);
+      util_blitter_save_depth_stencil_alpha(ice->blitter, ice->state.cso_zsa);
+      util_blitter_save_stencil_ref(ice->blitter, &ice->state.stencil_ref);
+      util_blitter_save_fragment_shader(ice->blitter, ice->shaders.uncompiled[MESA_SHADER_FRAGMENT]);
+      util_blitter_save_sample_mask(ice->blitter, ice->state.sample_mask);
+      util_blitter_save_rasterizer(ice->blitter, ice->state.cso_rast);
+      util_blitter_save_scissor(ice->blitter, &ice->state.scissors[0]);
+      util_blitter_save_viewport(ice->blitter, &ice->state.viewports[0]);
+      util_blitter_save_fragment_constant_buffer_slot(ice->blitter, &ice->state.shaders[MESA_SHADER_FRAGMENT].constbufs[0]);
+   }
+   /* The render condition is saved only when the caller passes false. */
+   if (!render_cond)
+      util_blitter_save_render_condition(ice->blitter,
+                                         (struct pipe_query *)ice->condition.query,
+                                         ice->condition.condition,
+                                         ice->condition.mode);
+
+   /* Framebuffer and texture state are saved only when requested in op. */
+   if (op & CROCUS_SAVE_FRAMEBUFFER)
+      util_blitter_save_framebuffer(ice->blitter, &ice->state.framebuffer);
+
+   if (op & CROCUS_SAVE_TEXTURES) {
+      util_blitter_save_fragment_sampler_states(ice->blitter, 1, (void **)ice->state.shaders[MESA_SHADER_FRAGMENT].samplers);
+      util_blitter_save_fragment_sampler_views(ice->blitter, 1, (struct pipe_sampler_view **)ice->state.shaders[MESA_SHADER_FRAGMENT].textures);
+   }
+}
+
+/**
+ * Put a coordinate pair into ascending order for mirror-image blits.
+ *
+ * Returns true when the pair had to be swapped, i.e. the blit is mirrored.
+ */
+static bool
+apply_mirror(float *coord0, float *coord1)
+{
+   /* Nothing to do unless the pair is strictly out of order. */
+   if (!(*coord0 > *coord1))
+      return false;
+   const float hi = *coord0;
+   *coord0 = *coord1;
+   *coord1 = hi;
+   return true;
+}
+
+/**
+ * Compute how far a rect overhangs a clipping region on each of its sides.
+ *
+ * \param x0 The rect's left coordinate
+ * \param y0 The rect's bottom coordinate
+ * \param x1 The rect's right coordinate
+ * \param y1 The rect's top coordinate
+ * \param min_x The clipping region's left coordinate
+ * \param min_y The clipping region's bottom coordinate
+ * \param max_x The clipping region's right coordinate
+ * \param max_y The clipping region's top coordinate
+ * \param clipped_x0 The number of pixels to clip from the left side
+ * \param clipped_y0 The number of pixels to clip from the bottom side
+ * \param clipped_x1 The number of pixels to clip from the right side
+ * \param clipped_y1 The number of pixels to clip from the top side
+ *
+ * The four outputs are written only when the function returns true; each
+ * one is zero for a side that already lies within the clipping region.
+ *
+ * \return false if we clip everything away (the rect or the region is
+ *         inverted, or the two do not overlap), true otherwise
+ *
+ */
+static inline bool
+compute_pixels_clipped(float x0, float y0, float x1, float y1,
+                       float min_x, float min_y, float max_x, float max_y,
+                       float *clipped_x0, float *clipped_y0,
+                       float *clipped_x1, float *clipped_y1)
+{
+   /* The tests are written in their positive "<=" form and negated as a
+    * whole, so a NaN in any argument makes the result "clipped away".
+    */
+   const bool region_ordered = min_x <= max_x && min_y <= max_y;
+   const bool rect_ordered = x0 <= x1 && y0 <= y1;
+   const bool starts_before_region_ends = x0 <= max_x && y0 <= max_y;
+   const bool ends_after_region_starts = min_x <= x1 && min_y <= y1;
+
+   /* If we are going to clip everything away, stop. */
+   if (!(region_ordered && rect_ordered &&
+         starts_before_region_ends && ends_after_region_starts))
+      return false;
+
+   /* Horizontal overhang past the left and right edges of the region;
+    * the integer 0 converts to 0.0f for sides that need no clipping.
+    */
+   *clipped_x0 = (x0 < min_x) ? min_x - x0 : 0;
+   *clipped_x1 = (max_x < x1) ? x1 - max_x : 0;
+
+   /* Vertical overhang past the bottom and top edges, computed the same
+    * way.
+    */
+   *clipped_y0 = (y0 < min_y) ? min_y - y0 : 0;
+   *clipped_y1 = (max_y < y1) ? y1 - max_y : 0;
+
+   return true;
+}
+
+/**
+ * Clips one side (left, right, bottom or top) of the dst rect and shifts
+ * the given src coordinate by the clipped amount times \p scale.  When
+ * mirroring, the opposite dst coordinate is the one that gets trimmed.
+ *
+ * \param mirror true if mirroring is required
+ * \param src the source rect coordinate (for example src_x0)
+ * \param dst0 the dst rect coordinate (for example dst_x0)
+ * \param dst1 the opposite dst rect coordinate (for example dst_x1)
+ * \param clipped_dst0 number of pixels to clip from the dst coordinate
+ * \param clipped_dst1 number of pixels to clip from the opposite dst coordinate
+ * \param scale the src vs dst scale involved for that coordinate
+ * \param is_left_or_bottom true if we are clipping the left or bottom sides
+ *        of the rect.
+ */
+static void
+clip_coordinates(bool mirror,
+                 float *src, float *dst0, float *dst1,
+                 float clipped_dst0,
+                 float clipped_dst1,
+                 float scale,
+                 bool is_left_or_bottom)
+{
+   /* Clipping the left/bottom side pushes coordinates up (+1), clipping
+    * the right/top side pulls them down (-1).  Everything below is written
+    * as an addition and scaled by this sign.
+    */
+   const float sign = is_left_or_bottom ? 1.0f : -1.0f;
+
+   if (mirror) {
+      /* Mirrored: trim the opposite dst edge by its own clip amount. */
+      *dst1 -= clipped_dst1 * sign;
+      *src += clipped_dst1 * scale * sign;
+   } else {
+      /* Not mirrored: trim this side's dst edge. */
+      *dst0 += clipped_dst0 * sign;
+      *src += clipped_dst0 * scale * sign;
+   }
+}
+
+/**
+ * Apply a scissor rectangle to blit coordinates (dst is clipped, src follows).
+ *
+ * Returns true if nothing is left to blit: scissored away or zero-sized.
+ */
+static bool
+apply_blit_scissor(const struct pipe_scissor_state *scissor,
+                   float *src_x0, float *src_y0,
+                   float *src_x1, float *src_y1,
+                   float *dst_x0, float *dst_y0,
+                   float *dst_x1, float *dst_y1,
+                   bool mirror_x, bool mirror_y)
+{
+   float clip_dst_x0, clip_dst_x1, clip_dst_y0, clip_dst_y1;
+
+   /* Compute number of pixels to scissor away. */
+   if (!compute_pixels_clipped(*dst_x0, *dst_y0, *dst_x1, *dst_y1,
+                               scissor->minx, scissor->miny,
+                               scissor->maxx, scissor->maxy,
+                               &clip_dst_x0, &clip_dst_y0,
+                               &clip_dst_x1, &clip_dst_y1))
+      return true;
+
+   /* Only the destination rect is tested against the scissor above; the
+    * source rect is never clipped on its own.  Instead, every amount trimmed
+    * from a destination edge is carried over to a source coordinate,
+    * multiplied by the src/dst scale of that axis, so that the remaining
+    * source area keeps corresponding to the remaining destination area.
+    * The scales are computed once, before any coordinate is modified.
+    *
+    * clip_coordinates() is called once per side (left, right, bottom, top):
+    *
+    *  - Without mirroring, the destination edge of that side moves inward
+    *    by the amount clipped from it and the source coordinate of the same
+    *    side moves by that amount times the scale.
+    *
+    *  - With mirroring, the opposite destination edge is moved instead, by
+    *    the amount computed for that opposite edge, and the source
+    *    coordinate passed for this side is adjusted by that amount times
+    *    the scale.
+    *
+    * Example without mirroring: if 4 pixels must be clipped from the left
+    * destination edge and scale_x is 2, dst_x0 moves right by 4 and src_x0
+    * moves right by 4 * 2 = 8.
+    */
+
+   /* A zero-sized source or destination rect leaves nothing to blit.
+    * Returning here also guarantees that the divisions computing scale_x
+    * and scale_y below never see a zero destination extent.
+    */
+   if (*src_x0 == *src_x1 || *src_y0 == *src_y1
+       || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1)
+      return true;
+
+   float scale_x = (float) (*src_x1 - *src_x0) / (*dst_x1 - *dst_x0);
+   float scale_y = (float) (*src_y1 - *src_y0) / (*dst_y1 - *dst_y0);
+
+   /* Clip left side */
+   clip_coordinates(mirror_x, src_x0, dst_x0, dst_x1,
+                    clip_dst_x0, clip_dst_x1, scale_x, true);
+
+   /* Clip right side */
+   clip_coordinates(mirror_x, src_x1, dst_x1, dst_x0,
+                    clip_dst_x1, clip_dst_x0, scale_x, false);
+
+   /* Clip bottom side */
+   clip_coordinates(mirror_y, src_y0, dst_y0, dst_y1,
+                    clip_dst_y0, clip_dst_y1, scale_y, true);
+
+   /* Clip top side */
+   clip_coordinates(mirror_y, src_y1, dst_y1, dst_y0,
+                    clip_dst_y1, clip_dst_y0, scale_y, false);
+
+   /* Check for invalid bounds
+    * Can't blit for 0-dimensions
+    */
+   return *src_x0 == *src_x1 || *src_y0 == *src_y1
+      || *dst_x0 == *dst_x1 || *dst_y0 == *dst_y1;
+}
+
+void
+crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+                               struct isl_device *isl_dev,
+                               struct blorp_surf *surf,
+                               struct pipe_resource *p_res,
+                               enum isl_aux_usage aux_usage,
+                               unsigned level,
+                               bool is_render_target)
+{
+   struct crocus_resource *res = (void *) p_res;
+
+   assert(!crocus_resource_unfinished_aux_import(res));
+   /* If HiZ aux is requested but the level check fails, use no aux at all. */
+   if (isl_aux_usage_has_hiz(aux_usage) &&
+       !crocus_resource_level_has_hiz(res, level))
+      aux_usage = ISL_AUX_USAGE_NONE;
+
+   *surf = (struct blorp_surf) {
+      .surf = &res->surf,
+      .addr = (struct blorp_address) {
+         .buffer = res->bo,
+         .offset = res->offset,
+         .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+         .mocs = crocus_mocs(res->bo, isl_dev),
+      },
+      .aux_usage = aux_usage,
+   };
+   /* NOTE(review): aux_addr.mocs below uses res->bo, not res->aux.bo. */
+   if (aux_usage != ISL_AUX_USAGE_NONE) {
+      surf->aux_surf = &res->aux.surf;
+      surf->aux_addr = (struct blorp_address) {
+         .buffer = res->aux.bo,
+         .offset = res->aux.offset,
+         .reloc_flags = is_render_target ? EXEC_OBJECT_WRITE : 0,
+         .mocs = crocus_mocs(res->bo, isl_dev),
+      };
+      surf->clear_color =
+         crocus_resource_get_clear_color(res);
+   }
+}
+
+static void
+tex_cache_flush_hack(struct crocus_batch *batch,
+                     enum isl_format view_format,
+                     enum isl_format surf_format)
+{
+   /* The WaSamplerCacheFlushBetweenRedescribedSurfaceReads workaround says:
+    *
+    *    "Currently Sampler assumes that a surface would not have two
+    *     different format associate with it.  It will not properly cache
+    *     the different views in the MT cache, causing a data corruption."
+    *
+    * Texture views in general may need this someday, but for now it is
+    * handled here, because copies and blits often reinterpret formats and
+    * are hurt particularly badly.
+    *
+    * The only trigger checked is a view format that differs from the
+    * surface format; when the two match there is nothing to work around
+    * and no flush is emitted.
+    */
+   if (view_format == surf_format)
+      return;
+
+   const char *wa_name =
+      "workaround: WaSamplerCacheFlushBetweenRedescribedSurfaceReads";
+
+   /* Two separate pipe controls: first the CS stall, then the texture
+    * cache invalidate.
+    */
+   crocus_emit_pipe_control_flush(batch, wa_name, PIPE_CONTROL_CS_STALL);
+   crocus_emit_pipe_control_flush(batch, wa_name,
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+}
+
+static struct crocus_resource *
+crocus_resource_for_aspect(const struct intel_device_info *devinfo,
+                           struct pipe_resource *p_res, unsigned pipe_mask)
+{
+   /* Only the stencil aspect is looked up; all others use p_res itself. */
+   if (pipe_mask != PIPE_MASK_S)
+      return (struct crocus_resource *)p_res;
+
+   struct crocus_resource *z_unused, *stencil;
+   crocus_get_depth_stencil_resources(devinfo, p_res, &z_unused, &stencil);
+   return stencil;
+}
+
+static enum pipe_format
+pipe_format_for_aspect(enum pipe_format format, unsigned pipe_mask)
+{
+   /* S selects the stencil-only format and Z the depth-only one; any
+    * other mask returns the format unchanged. */
+   switch (pipe_mask) {
+   case PIPE_MASK_S: return util_format_stencil_only(format);
+   case PIPE_MASK_Z: return util_format_get_depth_only(format);
+   default:          return format;
+   }
+}
+
+static void
+crocus_u_blitter(struct crocus_context *ice, const struct pipe_blit_info *info)
+{
+   struct pipe_blit_info local = *info;   /* leave the caller's copy alone */
+   if (!util_format_has_alpha(local.dst.resource->format))
+      local.mask &= ~PIPE_MASK_A;
+   crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES |
+                        CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+   util_blitter_blit(ice->blitter, &local);
+}
+
+/**
+ * The pipe->blit() driver hook: copy between two surfaces, possibly with
+ * format conversion, scaling, flipping and so on.  ver <= 5 tries the BLT
+ * hook first, then u_blitter (or BLORP for non-Z/S, non-3D-dst blits);
+ * ver 6 hands 3D-to-3D blits to u_blitter; everything else goes to BLORP.
+ */
+static void
+crocus_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   enum blorp_batch_flags blorp_flags = 0;
+
+   /* We don't support color masking. */
+   assert((info->mask & PIPE_MASK_RGBA) == PIPE_MASK_RGBA ||
+          (info->mask & PIPE_MASK_RGBA) == 0);
+
+   if (info->render_condition_enable)
+      if (!crocus_check_conditional_render(ice))
+         return;
+
+   if (devinfo->ver <= 5) {
+      if (!screen->vtbl.blit_blt(batch, info)) {
+         /* The BLT hook declined this blit; fall back to the 3D pipe. */
+         if (!util_format_is_depth_or_stencil(info->src.resource->format) &&
+             info->dst.resource->target != PIPE_TEXTURE_3D)
+            goto use_blorp;
+
+         if (!util_blitter_is_blit_supported(ice->blitter, info)) {
+            if (util_format_is_depth_or_stencil(info->src.resource->format)) {
+               /* Blit Z with u_blitter, then clear and copy stencil. */
+               struct pipe_blit_info depth_blit = *info;
+               depth_blit.mask = PIPE_MASK_Z;
+               crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+               util_blitter_blit(ice->blitter, &depth_blit);
+
+               struct pipe_surface *dst_view, dst_templ;
+               util_blitter_default_dst_texture(&dst_templ, info->dst.resource, info->dst.level, info->dst.box.z);
+               dst_view = ctx->create_surface(ctx, info->dst.resource, &dst_templ);
+
+               crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+
+               util_blitter_clear_depth_stencil(ice->blitter, dst_view, PIPE_CLEAR_STENCIL,
+                                                0, 0, info->dst.box.x, info->dst.box.y,
+                                                info->dst.box.width, info->dst.box.height);
+               crocus_blitter_begin(ice, CROCUS_SAVE_FRAMEBUFFER | CROCUS_SAVE_TEXTURES | CROCUS_SAVE_FRAGMENT_STATE, info->render_condition_enable);
+               util_blitter_stencil_fallback(ice->blitter,
+                                             info->dst.resource,
+                                             info->dst.level,
+                                             &info->dst.box,
+                                             info->src.resource,
+                                             info->src.level,
+                                             &info->src.box, NULL);
+               pipe_surface_release(ctx, &dst_view); /* we own this view */
+            }
+            return;
+         }
+
+         crocus_u_blitter(ice, info);
+      }
+      return;
+   }
+
+   if (devinfo->ver == 6) {
+      if (info->src.resource->target == PIPE_TEXTURE_3D &&
+          info->dst.resource->target == PIPE_TEXTURE_3D) {
+         crocus_u_blitter(ice, info);
+         return;
+      }
+   }
+
+use_blorp:
+   if (info->render_condition_enable) {
+      if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+         blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+   }
+
+   float src_x0 = info->src.box.x;
+   float src_x1 = info->src.box.x + info->src.box.width;
+   float src_y0 = info->src.box.y;
+   float src_y1 = info->src.box.y + info->src.box.height;
+   float dst_x0 = info->dst.box.x;
+   float dst_x1 = info->dst.box.x + info->dst.box.width;
+   float dst_y0 = info->dst.box.y;
+   float dst_y1 = info->dst.box.y + info->dst.box.height;
+   bool mirror_x = apply_mirror(&src_x0, &src_x1);
+   bool mirror_y = apply_mirror(&src_y0, &src_y1);
+   enum blorp_filter filter;
+
+   if (info->scissor_enable) {
+      bool noop = apply_blit_scissor(&info->scissor,
+                                     &src_x0, &src_y0, &src_x1, &src_y1,
+                                     &dst_x0, &dst_y0, &dst_x1, &dst_y1,
+                                     mirror_x, mirror_y);
+      if (noop)
+         return;
+   }
+
+   if (abs(info->dst.box.width) == abs(info->src.box.width) &&
+       abs(info->dst.box.height) == abs(info->src.box.height)) {
+      if (info->src.resource->nr_samples > 1 &&
+          info->dst.resource->nr_samples <= 1) {
+         /* The OpenGL ES 3.2 specification, section 16.2.1, says:
+          *
+          *    "If the read framebuffer is multisampled (its effective
+          *     value of SAMPLE_BUFFERS is one) and the draw framebuffer
+          *     is not (its value of SAMPLE_BUFFERS is zero), the samples
+          *     corresponding to each pixel location in the source are
+          *     converted to a single sample before being written to the
+          *     destination.  The filter parameter is ignored.  If the
+          *     source formats are integer types or stencil values, a
+          *     single sample’s value is selected for each pixel.  If the
+          *     source formats are floating-point or normalized types,
+          *     the sample values for each pixel are resolved in an
+          *     implementation-dependent manner.  If the source formats
+          *     are depth values, sample values are resolved in an
+          *     implementation-dependent manner where the result will be
+          *     between the minimum and maximum depth values in the pixel."
+          *
+          * When selecting a single sample, we always choose sample 0.
+          */
+         if (util_format_is_depth_or_stencil(info->src.format) ||
+             util_format_is_pure_integer(info->src.format)) {
+            filter = BLORP_FILTER_SAMPLE_0;
+         } else {
+            filter = BLORP_FILTER_AVERAGE;
+         }
+      } else {
+         /* The OpenGL 4.6 specification, section 18.3.1, says:
+          *
+          *    "If the source and destination dimensions are identical,
+          *     no filtering is applied."
+          *
+          * Using BLORP_FILTER_NONE will also handle the upsample case by
+          * replicating the one value in the source to all values in the
+          * destination.
+          */
+         filter = BLORP_FILTER_NONE;
+      }
+   } else if (info->filter == PIPE_TEX_FILTER_LINEAR) {
+      filter = BLORP_FILTER_BILINEAR;
+   } else {
+      filter = BLORP_FILTER_NEAREST;
+   }
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+   float src_z_step = (float)info->src.box.depth / (float)info->dst.box.depth;
+
+   /* There is no interpolation to the pixel center during rendering, so
+    * add the 0.5 offset ourselves here.
+    */
+   float depth_center_offset = 0;
+   if (info->src.resource->target == PIPE_TEXTURE_3D)
+      depth_center_offset = 0.5 / info->dst.box.depth * info->src.box.depth;
+
+   /* Perform a blit for each aspect requested by the caller. PIPE_MASK_R is
+    * used to represent the color aspect. */
+   unsigned aspect_mask = info->mask & (PIPE_MASK_R | PIPE_MASK_ZS);
+   while (aspect_mask) {
+      unsigned aspect = 1 << u_bit_scan(&aspect_mask);
+
+      struct crocus_resource *src_res =
+         crocus_resource_for_aspect(devinfo, info->src.resource, aspect);
+      struct crocus_resource *dst_res =
+         crocus_resource_for_aspect(devinfo, info->dst.resource, aspect);
+
+      enum pipe_format src_pfmt =
+         pipe_format_for_aspect(info->src.format, aspect);
+      enum pipe_format dst_pfmt =
+         pipe_format_for_aspect(info->dst.format, aspect);
+
+      if (crocus_resource_unfinished_aux_import(src_res))
+         crocus_resource_finish_aux_import(ctx->screen, src_res);
+      if (crocus_resource_unfinished_aux_import(dst_res))
+         crocus_resource_finish_aux_import(ctx->screen, dst_res);
+
+      struct crocus_format_info src_fmt =
+         crocus_format_for_usage(devinfo, src_pfmt, ISL_SURF_USAGE_TEXTURE_BIT);
+      enum isl_aux_usage src_aux_usage =
+         crocus_resource_texture_aux_usage(src_res);
+
+      crocus_resource_prepare_texture(ice, src_res, src_fmt.fmt,
+                                      info->src.level, 1, info->src.box.z,
+                                      info->src.box.depth);
+      //      crocus_emit_buffer_barrier_for(batch, src_res->bo,
+      //                                   CROCUS_DOMAIN_OTHER_READ);
+
+      struct crocus_format_info dst_fmt =
+         crocus_format_for_usage(devinfo, dst_pfmt,
+                                 ISL_SURF_USAGE_RENDER_TARGET_BIT);
+      enum isl_aux_usage dst_aux_usage =
+         crocus_resource_render_aux_usage(ice, dst_res, info->dst.level,
+                                          dst_fmt.fmt, false);
+
+      struct blorp_surf src_surf, dst_surf;
+      crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+                                     &src_res->base, src_aux_usage,
+                                     info->src.level, false);
+      crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+                                     &dst_res->base, dst_aux_usage,
+                                     info->dst.level, true);
+
+      crocus_resource_prepare_render(ice, dst_res, info->dst.level,
+                                     info->dst.box.z, info->dst.box.depth,
+                                     dst_aux_usage);
+      //      crocus_emit_buffer_barrier_for(batch, dst_res->bo,
+      //                                   CROCUS_DOMAIN_RENDER_WRITE);
+
+      if (crocus_batch_references(batch, src_res->bo))
+         tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+      if (dst_res->base.target == PIPE_BUFFER) {
+         util_range_add(&dst_res->base, &dst_res->valid_buffer_range,
+                        dst_x0, dst_x1);
+      }
+
+      struct isl_swizzle src_swiz = pipe_to_isl_swizzles(src_fmt.swizzles);
+      struct isl_swizzle dst_swiz = pipe_to_isl_swizzles(dst_fmt.swizzles);
+
+      for (int slice = 0; slice < info->dst.box.depth; slice++) {
+         unsigned dst_z = info->dst.box.z + slice;
+         float src_z = info->src.box.z + slice * src_z_step +
+            depth_center_offset;
+
+         crocus_batch_maybe_flush(batch, 1500);
+
+         blorp_blit(&blorp_batch,
+                    &src_surf, info->src.level, src_z,
+                    src_fmt.fmt, src_swiz,
+                    &dst_surf, info->dst.level, dst_z,
+                    dst_fmt.fmt, dst_swiz,
+                    src_x0, src_y0, src_x1, src_y1,
+                    dst_x0, dst_y0, dst_x1, dst_y1,
+                    filter, mirror_x, mirror_y);
+
+      }
+
+      tex_cache_flush_hack(batch, src_fmt.fmt, src_res->surf.format);
+
+      crocus_resource_finish_render(ice, dst_res, info->dst.level,
+                                    info->dst.box.z, info->dst.box.depth,
+                                    dst_aux_usage);
+   }
+
+   blorp_batch_finish(&blorp_batch);
+
+   crocus_flush_and_dirty_for_history(ice, batch, (struct crocus_resource *)
+                                      info->dst.resource,
+                                      PIPE_CONTROL_RENDER_TARGET_FLUSH,
+                                      "cache history: post-blit");
+}
+
+static void
+get_copy_region_aux_settings(struct crocus_resource *res,
+                             enum isl_aux_usage *out_aux_usage,
+                             bool is_render_target)
+{
+   /* Copies only ever keep MCS; every other aux usage maps to NONE. */
+   enum isl_aux_usage chosen = ISL_AUX_USAGE_NONE;
+
+   if (res->aux.usage == ISL_AUX_USAGE_MCS) {
+      /* A stencil resolve operation must be performed prior to doing
+       * resource copies or use by the CPU (see HSD 1209978162), so a
+       * stencil destination is accessed without aux.
+       */
+      const bool stencil_dst =
+         is_render_target && isl_surf_usage_is_stencil(res->surf.usage);
+
+      if (!stencil_dst)
+         chosen = ISL_AUX_USAGE_MCS;
+   }
+
+   *out_aux_usage = chosen;
+}
+
+/**
+ * Perform a GPU-based raw memory copy between compatible view classes.
+ *
+ * Apart from the sampler-cache workaround, no flushing is done here - new
+ * data may still sit in the render cache and stale data in other caches.
+ *
+ * Wraps blorp_copy() and blorp_buffer_copy(); ver <= 5 tries the BLT first.
+ */
+void
+crocus_copy_region(struct blorp_context *blorp,
+                   struct crocus_batch *batch,
+                   struct pipe_resource *dst,
+                   unsigned dst_level,
+                   unsigned dstx, unsigned dsty, unsigned dstz,
+                   struct pipe_resource *src,
+                   unsigned src_level,
+                   const struct pipe_box *src_box)
+{
+   struct blorp_batch blorp_batch;
+   struct crocus_context *ice = blorp->driver_ctx;
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_resource *src_res = (void *) src;
+   struct crocus_resource *dst_res = (void *) dst;
+
+   if (devinfo->ver <= 5) {
+      if (screen->vtbl.copy_region_blt(batch, dst_res,
+                                       dst_level, dstx, dsty, dstz,
+                                       src_res, src_level, src_box))
+         return;
+   }
+   enum isl_aux_usage src_aux_usage, dst_aux_usage;
+   get_copy_region_aux_settings(src_res, &src_aux_usage,
+                                false);
+   get_copy_region_aux_settings(dst_res, &dst_aux_usage,
+                                true);
+   /* Only needed if this batch already references the source BO. */
+   if (crocus_batch_references(batch, src_res->bo))
+      tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+   /* Extend the destination buffer's valid range over the written span. */
+   if (dst->target == PIPE_BUFFER)
+      util_range_add(&dst_res->base, &dst_res->valid_buffer_range, dstx, dstx + src_box->width);
+
+   if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+      struct blorp_address src_addr = {
+         .buffer = crocus_resource_bo(src), .offset = src_box->x,
+      };
+      struct blorp_address dst_addr = {
+         .buffer = crocus_resource_bo(dst), .offset = dstx,
+         .reloc_flags = EXEC_OBJECT_WRITE,
+      };
+
+      crocus_batch_maybe_flush(batch, 1500);
+
+      blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+      blorp_buffer_copy(&blorp_batch, src_addr, dst_addr, src_box->width);
+      blorp_batch_finish(&blorp_batch);
+   } else {
+      // XXX: what about one surface being a buffer and not the other?
+
+      struct blorp_surf src_surf, dst_surf;
+      crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &src_surf,
+                                     src, src_aux_usage, src_level, false);
+      crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &dst_surf,
+                                     dst, dst_aux_usage, dst_level, true);
+
+      crocus_resource_prepare_access(ice, src_res, src_level, 1,
+                                     src_box->z, src_box->depth,
+                                     src_aux_usage, false);
+      crocus_resource_prepare_access(ice, dst_res, dst_level, 1,
+                                     dstz, src_box->depth,
+                                     dst_aux_usage, false);
+
+      blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+
+      for (int slice = 0; slice < src_box->depth; slice++) {
+         crocus_batch_maybe_flush(batch, 1500);
+
+         blorp_copy(&blorp_batch, &src_surf, src_level, src_box->z + slice,
+                    &dst_surf, dst_level, dstz + slice,
+                    src_box->x, src_box->y, dstx, dsty,
+                    src_box->width, src_box->height);
+      }
+      blorp_batch_finish(&blorp_batch);
+
+      crocus_resource_finish_write(ice, dst_res, dst_level, dstz,
+                                   src_box->depth, dst_aux_usage);
+   }
+
+   tex_cache_flush_hack(batch, ISL_FORMAT_UNSUPPORTED, src_res->surf.format);
+}
+
+static struct crocus_batch *
+get_preferred_batch(struct crocus_context *ice, struct crocus_bo *bo)
+{
+   struct crocus_batch *compute = &ice->batches[CROCUS_BATCH_COMPUTE];
+   struct crocus_batch *render = &ice->batches[CROCUS_BATCH_RENDER];
+
+   /* If the compute batch already references this buffer, keep queueing
+    * there.
+    * In every other case the render batch is used.
+    */
+   return crocus_batch_references(compute, bo) ? compute : render;
+}
+
+
+/**
+ * The pipe->resource_copy_region() driver hook.
+ *
+ * Implements ARB_copy_image semantics - a raw memory copy between compatible
+ * view classes - via copy_mem_mem, util_resource_copy_region or
+ * crocus_copy_region(). */
+static void
+crocus_resource_copy_region(struct pipe_context *ctx,
+                            struct pipe_resource *p_dst,
+                            unsigned dst_level,
+                            unsigned dstx, unsigned dsty, unsigned dstz,
+                            struct pipe_resource *p_src,
+                            unsigned src_level,
+                            const struct pipe_box *src_box)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_resource *src = (void *) p_src;
+   struct crocus_resource *dst = (void *) p_dst;
+
+   if (crocus_resource_unfinished_aux_import(src))
+      crocus_resource_finish_aux_import(ctx->screen, src);
+   if (crocus_resource_unfinished_aux_import(dst))
+      crocus_resource_finish_aux_import(ctx->screen, dst);
+
+   /* Use MI_COPY_MEM_MEM for tiny (<= 16 byte, % 4) buffer copies. */
+   if (p_src->target == PIPE_BUFFER && p_dst->target == PIPE_BUFFER &&
+       (src_box->width % 4 == 0) && src_box->width <= 16 &&
+       screen->vtbl.copy_mem_mem) {
+      struct crocus_bo *dst_bo = crocus_resource_bo(p_dst);
+      batch = get_preferred_batch(ice, dst_bo);
+      crocus_batch_maybe_flush(batch, 24 + 5 * (src_box->width / 4));
+      crocus_emit_pipe_control_flush(batch,
+                                     "stall for MI_COPY_MEM_MEM copy_region",
+                                     PIPE_CONTROL_CS_STALL);
+      screen->vtbl.copy_mem_mem(batch, dst_bo, dstx, crocus_resource_bo(p_src),
+                                src_box->x, src_box->width);
+      return;
+   }
+   /* ver < 6 depth/stencil copies take the generic util path instead. */
+   if (devinfo->ver < 6 && util_format_is_depth_or_stencil(p_dst->format)) {
+      util_resource_copy_region(ctx, p_dst, dst_level, dstx, dsty, dstz,
+                                p_src, src_level, src_box);
+      return;
+   }
+   crocus_copy_region(&ice->blorp, batch, p_dst, dst_level, dstx, dsty, dstz,
+                      p_src, src_level, src_box);
+   /* ver >= 6 depth+stencil: copy the stencil resources in a second pass. */
+   if (util_format_is_depth_and_stencil(p_dst->format) &&
+       util_format_has_stencil(util_format_description(p_src->format)) &&
+       devinfo->ver >= 6) {
+      struct crocus_resource *junk, *s_src_res, *s_dst_res;
+      crocus_get_depth_stencil_resources(devinfo, p_src, &junk, &s_src_res);
+      crocus_get_depth_stencil_resources(devinfo, p_dst, &junk, &s_dst_res);
+
+      crocus_copy_region(&ice->blorp, batch, &s_dst_res->base, dst_level, dstx,
+                         dsty, dstz, &s_src_res->base, src_level, src_box);
+   }
+
+   crocus_flush_and_dirty_for_history(ice, batch, dst,
+                                      PIPE_CONTROL_RENDER_TARGET_FLUSH,
+                                      "cache history: post copy_region");
+}
+
+void
+crocus_init_blit_functions(struct pipe_context *ctx)
+{
+   ctx->resource_copy_region = crocus_resource_copy_region; /* raw copies */
+   ctx->blit = crocus_blit;                                 /* full blits */
+}
diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c
new file mode 100644
index 00000000000..75f0078d535
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blorp.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_blorp.c
+ *
+ * ============================= GENXML CODE =============================
+ *              [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * GenX specific code for working with BLORP (blitting, resolves, clears
+ * on the 3D engine).  This provides the driver-specific hooks needed to
+ * implement the BLORP API.
+ *
+ * See crocus_blit.c, crocus_clear.c, and so on.
+ */
+
+#include <assert.h>
+
+#include "crocus_batch.h"
+#include "crocus_resource.h"
+#include "crocus_context.h"
+
+#include "util/u_upload_mgr.h"
+#include "intel/common/intel_l3_config.h"
+
+#include "blorp/blorp_genX_exec.h"
+
+#if GFX_VER <= 5
+#include "gen4_blorp_exec.h"
+#endif
+
+static uint32_t *
+stream_state(struct crocus_batch *batch,
+             unsigned size,
+             unsigned alignment,
+             uint32_t *out_offset,
+             struct crocus_bo **out_bo)
+{
+   uint32_t start = ALIGN(batch->state.used, alignment);
+
+   if (start + size >= STATE_SZ && !batch->no_wrap) {
+      /* Would reach STATE_SZ and wrapping is allowed: flush, re-align. */
+      crocus_batch_flush(batch);
+      start = ALIGN(batch->state.used, alignment);
+   } else if (start + size >= batch->state.bo->size) {
+      /* Grow the state buffer by half, capped at MAX_STATE_SIZE. */
+      const unsigned grown =
+         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
+              MAX_STATE_SIZE);
+      crocus_grow_buffer(batch, true, batch->state.used, grown);
+      assert(start + size < batch->state.bo->size);
+   }
+   crocus_record_state_size(batch->state_sizes, start, size);
+   batch->state.used = start + size;
+   *out_offset = start;
+
+   /* A caller that asks for the BO must add bo->gtt_offset itself (say,
+    * by handing an address to genxml); without one, the offset is assumed
+    * to be wanted relative to a base address.
+    */
+   if (out_bo != NULL)
+      *out_bo = batch->state.bo;
+
+   return &((uint32_t *)batch->state.map)[start >> 2];
+}
+
+static void *
+blorp_emit_dwords(struct blorp_batch *blorp_batch, unsigned n)
+{
+   const size_t bytes = n * sizeof(uint32_t);   /* n dwords, in bytes */
+   return crocus_get_command_space(blorp_batch->driver_batch, bytes);
+}
+
+static uint64_t
+blorp_emit_reloc(struct blorp_batch *blorp_batch, UNUSED void *location,
+                 struct blorp_address addr, uint32_t delta)
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   const char *where = location;
+
+   /* Before Gen6 the relocation may sit in the state buffer instead. */
+   if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+      const uint32_t state_offset = where - (const char *)batch->state.map;
+      return crocus_state_reloc(batch, state_offset, addr.buffer,
+                                addr.offset + delta, addr.reloc_flags);
+   }
+
+   /* Everything else must be a location inside the command buffer. */
+   assert(!crocus_ptr_in_state_buffer(batch, location));
+
+   const uint32_t cmd_offset = where - (const char *)batch->command.map;
+   return crocus_command_reloc(batch, cmd_offset, addr.buffer,
+                               addr.offset + delta, addr.reloc_flags);
+}
+
+static void
+blorp_surface_reloc(struct blorp_batch *blorp_batch, uint32_t ss_offset,
+                    struct blorp_address addr, uint32_t delta)
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   /* Record the relocation for the surface state at ss_offset... */
+   const uint64_t address =
+      crocus_state_reloc(batch, ss_offset, addr.buffer,
+                         addr.offset + delta, addr.reloc_flags);
+
+   /* ...and store its low 32 bits at that offset in the state map. */
+   *(uint32_t *)((char *)batch->state.map + ss_offset) = address;
+}
+
+static uint64_t
+blorp_get_surface_address(struct blorp_batch *blorp_batch,
+                          struct blorp_address addr)
+{
+   /* Placeholder only: blorp_surface_reloc() writes the real address. */
+   return (uint64_t)0;
+}
+
+#if GFX_VER >= 7
+static struct blorp_address
+blorp_get_surface_base_address(struct blorp_batch *blorp_batch)
+{
+   /* The base address is offset 0 of the batch's state BO. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   struct blorp_address base = { .buffer = batch->state.bo };
+   base.offset = 0;
+   return base;
+}
+#endif
+
+static void *
+blorp_alloc_dynamic_state(struct blorp_batch *blorp_batch,
+                          uint32_t size,
+                          uint32_t alignment,
+                          uint32_t *offset)
+{
+   struct crocus_bo **no_bo = NULL;   /* caller only wants the offset */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   return stream_state(batch, size, alignment, offset, no_bo);
+}
+
+static void
+blorp_alloc_binding_table(struct blorp_batch *blorp_batch,
+                          unsigned num_entries,
+                          unsigned state_size,
+                          unsigned state_alignment,
+                          uint32_t *bt_offset,
+                          uint32_t *surface_offsets,
+                          void **surface_maps)
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   /* The table itself: one 32-bit entry per surface, 32-byte aligned. */
+   uint32_t *table = stream_state(batch, num_entries * sizeof(uint32_t), 32,
+                                  bt_offset, NULL);
+   /* One surface state per entry; each entry holds that state's offset. */
+   for (unsigned idx = 0; idx < num_entries; idx++) {
+      surface_maps[idx] = stream_state(batch, state_size, state_alignment,
+                                       &surface_offsets[idx], NULL);
+      table[idx] = surface_offsets[idx];
+   }
+}
+
+static void *
+blorp_alloc_vertex_buffer(struct blorp_batch *blorp_batch,
+                          uint32_t size,
+                          struct blorp_address *addr)
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   struct crocus_bo *vb_bo;
+   uint32_t vb_offset;
+
+   /* Vertex data comes from stream_state(), 64-byte aligned. */
+   void *vertices = stream_state(batch, size, 64, &vb_offset, &vb_bo);
+
+   *addr = (struct blorp_address) {
+      .buffer = vb_bo,
+      .offset = vb_offset,
+      .reloc_flags = RELOC_32BIT,
+#if GFX_VER >= 7
+      .mocs = crocus_mocs(vb_bo, &batch->screen->isl_dev),
+#endif
+   };
+
+   return vertices;
+}
+
+/** BLORP hook that this driver leaves empty: none of its arguments are
+ *  read and nothing is emitted. */
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *blorp_batch,
+                                           const struct blorp_address *addrs,
+                                           UNUSED uint32_t *sizes,
+                                           unsigned num_vbs)
+{
+}
+
+static struct blorp_address
+blorp_get_workaround_address(struct blorp_batch *blorp_batch)
+{
+   /* Hand BLORP the context's workaround BO and the offset into it. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   struct blorp_address wa = { .buffer = batch->ice->workaround_bo };
+
+   wa.offset = batch->ice->workaround_offset;
+   return wa;
+}
+
+static void
+blorp_flush_range(UNUSED struct blorp_batch *blorp_batch,
+                  UNUSED void *start,
+                  UNUSED size_t size)
+{
+   /* Intentionally empty: all allocated state comes from the batch, which
+    * is flushed before it is submitted, so there is nothing to do here.
+    */
+}
+
+#if GFX_VER >= 7
+static const struct intel_l3_config *
+blorp_get_l3_config(struct blorp_batch *blorp_batch)
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   return batch->screen->l3_config_3d;   /* the screen's 3D L3 config */
+}
+#else /* GFX_VER < 7 */
+static void
+blorp_emit_urb_config(struct blorp_batch *blorp_batch,
+                      unsigned vs_entry_size,
+                      UNUSED unsigned sf_entry_size) /* only read on ver <= 5 */
+{
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+#if GFX_VER <= 5
+   batch->screen->vtbl.calculate_urb_fence(batch, 0, vs_entry_size, sf_entry_size);
+#else /* GFX_VER == 6 */
+   genX(upload_urb)(batch, vs_entry_size, false, vs_entry_size);
+#endif
+}
+#endif
+
+/**
+ * BLORP exec hook (installed as ice->blorp.exec by genX(init_blorp)).
+ *
+ * Flushes the caches for every enabled surface in \p params, reserves
+ * command and state-buffer space, emits the drawing rectangle, runs
+ * blorp_exec(), then flags the context state that must be re-emitted and
+ * records the written surfaces in the render/depth caches.
+ *
+ * \param blorp_batch  BLORP batch; its driver_batch is the crocus_batch and
+ *                     its blorp->driver_ctx is the crocus_context.
+ * \param params       BLORP operation parameters (src/dst/depth/stencil
+ *                     surfaces and the x0/y0/x1/y1 rectangle).
+ */
+static void
+crocus_blorp_exec(struct blorp_batch *blorp_batch,
+                  const struct blorp_params *params)
+{
+   struct crocus_context *ice = blorp_batch->blorp->driver_ctx;
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   /* Flush the sampler and render caches.  We definitely need to flush the
+    * sampler cache so that we get updated contents from the render cache for
+    * the glBlitFramebuffer() source.  Also, we are sometimes warned in the
+    * docs to flush the cache between reinterpretations of the same surface
+    * data with different formats, which blorp does for stencil and depth
+    * data.
+    */
+   if (params->src.enabled)
+      crocus_cache_flush_for_read(batch, params->src.addr.buffer);
+   if (params->dst.enabled) {
+      crocus_cache_flush_for_render(batch, params->dst.addr.buffer,
+                                    params->dst.view.format,
+                                    params->dst.aux_usage);
+   }
+   if (params->depth.enabled)
+      crocus_cache_flush_for_depth(batch, params->depth.addr.buffer);
+   if (params->stencil.enabled)
+      crocus_cache_flush_for_depth(batch, params->stencil.addr.buffer);
+
+   /* Reserve space up front, then set no_wrap until blorp_exec() has
+    * returned (it is cleared again below).
+    * NOTE(review): presumably this keeps the whole operation in one batch;
+    * the 1400/600 figures are not derived anywhere in view - confirm.
+    */
+   crocus_require_command_space(batch, 1400);
+   crocus_require_statebuffer_space(batch, 600);
+   batch->no_wrap = true;
+#if GFX_VER == 6
+   /* Emit workaround flushes when we switch from drawing to blorping. */
+   crocus_emit_post_sync_nonzero_flush(batch);
+#endif
+
+#if GFX_VER >= 6
+   crocus_emit_depth_stall_flushes(batch);
+#endif
+
+   /* Only the max X/Y fields are assigned here; MAX2() makes the result
+    * independent of which of x0/x1 (y0/y1) is larger.
+    */
+   blorp_emit(blorp_batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+      rect.ClippedDrawingRectangleXMax = MAX2(params->x1, params->x0) - 1;
+      rect.ClippedDrawingRectangleYMax = MAX2(params->y1, params->y0) - 1;
+   }
+
+   batch->screen->vtbl.update_surface_base_address(batch);
+   crocus_handle_always_flush_cache(batch);
+
+   batch->contains_draw = true;
+   blorp_exec(blorp_batch, params);
+
+   batch->no_wrap = false;
+   crocus_handle_always_flush_cache(batch);
+
+   /* We've smashed all state compared to what the normal 3D pipeline
+    * rendering tracks for GL.
+    *
+    * Every dirty bit NOT listed in skip_bits / skip_stage_bits is flagged
+    * below, so these two masks are the "leave untouched" sets.
+    */
+
+   uint64_t skip_bits = (CROCUS_DIRTY_POLYGON_STIPPLE |
+                         CROCUS_DIRTY_GEN7_SO_BUFFERS |
+                         CROCUS_DIRTY_SO_DECL_LIST |
+                         CROCUS_DIRTY_LINE_STIPPLE |
+                         CROCUS_ALL_DIRTY_FOR_COMPUTE |
+                         CROCUS_DIRTY_GEN6_SCISSOR_RECT |
+                         CROCUS_DIRTY_GEN75_VF |
+                         CROCUS_DIRTY_SF_CL_VIEWPORT);
+
+   uint64_t skip_stage_bits = (CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE |
+                               CROCUS_STAGE_DIRTY_UNCOMPILED_VS |
+                               CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+                               CROCUS_STAGE_DIRTY_UNCOMPILED_TES |
+                               CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+                               CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+                               CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS |
+                               CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS |
+                               CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES |
+                               CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS);
+
+   if (!ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL]) {
+      /* BLORP disabled tessellation, that's fine for the next draw */
+     skip_stage_bits |= CROCUS_STAGE_DIRTY_TCS |
+                        CROCUS_STAGE_DIRTY_TES |
+                        CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
+                        CROCUS_STAGE_DIRTY_CONSTANTS_TES |
+                        CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+                        CROCUS_STAGE_DIRTY_BINDINGS_TES;
+   }
+
+   if (!ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]) {
+      /* BLORP disabled geometry shaders, that's fine for the next draw */
+     skip_stage_bits |= CROCUS_STAGE_DIRTY_GS |
+                        CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+                        CROCUS_STAGE_DIRTY_BINDINGS_GS;
+   }
+
+   /* we can skip flagging CROCUS_DIRTY_DEPTH_BUFFER, if
+    * BLORP_BATCH_NO_EMIT_DEPTH_STENCIL is set.
+    */
+   if (blorp_batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL)
+      skip_bits |= CROCUS_DIRTY_DEPTH_BUFFER;
+
+   /* Without a fragment program the blend state is left alone as well. */
+   if (!params->wm_prog_data)
+      skip_bits |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+
+   ice->state.dirty |= ~skip_bits;
+   ice->state.stage_dirty |= ~skip_stage_bits;
+
+   /* Reset the tracked URB configuration.
+    * NOTE(review): presumably so the next draw reprograms the URB - confirm.
+    */
+   ice->urb.vsize = 0;
+   ice->urb.gs_present = false;
+   ice->urb.gsize = 0;
+   ice->urb.tess_present = false;
+   ice->urb.hsize = 0;
+   ice->urb.dsize = 0;
+
+   /* Record the surfaces this operation wrote in the render/depth caches. */
+   if (params->dst.enabled) {
+      crocus_render_cache_add_bo(batch, params->dst.addr.buffer,
+                                 params->dst.view.format,
+                                 params->dst.aux_usage);
+   }
+   if (params->depth.enabled)
+      crocus_depth_cache_add_bo(batch, params->depth.addr.buffer);
+   if (params->stencil.enabled)
+      crocus_depth_cache_add_bo(batch, params->stencil.addr.buffer);
+}
+
+/**
+ * Measurement hook with an intentionally empty body: nothing is recorded.
+ *
+ * NOTE(review): presumably required by the shared BLORP code that this file
+ * is compiled against - confirm before removing.
+ */
+static void
+blorp_measure_start(struct blorp_batch *blorp_batch,
+                    const struct blorp_params *params)
+{
+}
+
+/**
+ * Set up the context's BLORP instance and hook up the crocus callbacks
+ * (shader lookup, shader upload and command execution).
+ */
+void
+genX(init_blorp)(struct crocus_context *ice)
+{
+   struct crocus_screen *scr = (struct crocus_screen *)ice->ctx.screen;
+   struct blorp_context *blorp = &ice->blorp;
+
+   /* The context itself is handed to BLORP as its driver context. */
+   blorp_init(blorp, ice, &scr->isl_dev);
+
+   blorp->compiler = scr->compiler;
+   blorp->exec = crocus_blorp_exec;
+   blorp->upload_shader = crocus_blorp_upload_shader;
+   blorp->lookup_shader = crocus_blorp_lookup_shader;
+}
diff --git a/src/gallium/drivers/crocus/crocus_blt.c b/src/gallium/drivers/crocus/crocus_blt.c
new file mode 100644
index 00000000000..d27891352bd
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_blt.c
@@ -0,0 +1,337 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* blt command encoding for gen4/5 */
+#include "crocus_context.h"
+
+#include "crocus_genx_macros.h"
+#include "crocus_genx_protos.h"
+#include "crocus_resource.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+
+#if GFX_VER <= 5
+
+/**
+ * Decide whether a pipe blit is simple enough for the BLT engine.
+ *
+ * \param batch  unused here; kept for symmetry with the other BLT helpers.
+ * \param info   blit description to check.
+ * \return true when the blit is an unscaled, unmirrored, unscissored copy
+ *         of a single layer; false when the caller must use another path.
+ */
+static bool validate_blit_for_blt(struct crocus_batch *batch,
+                                  const struct pipe_blit_info *info)
+{
+   /* If the source and destination are the same size with no mirroring,
+    * the rectangles are within the size of the texture and there is no
+    * scissor, then we can probably use the blit engine.
+    */
+   if (info->dst.box.width != info->src.box.width ||
+       info->dst.box.height != info->src.box.height)
+      return false;
+
+   if (info->scissor_enable)
+      return false;
+
+   /* A negative extent means mirroring along that axis.  Both axes must be
+    * rejected: the copy size is later stored in unsigned variables, so a
+    * negative width would turn into an enormous copy.
+    */
+   if (info->dst.box.width < 0 || info->src.box.width < 0)
+      return false;
+
+   if (info->dst.box.height < 0 || info->src.box.height < 0)
+      return false;
+
+   if (info->dst.box.depth > 1 || info->src.box.depth > 1)
+      return false;
+
+   return true;
+}
+
+/**
+ * Row pitch of \p res in the unit used for the blitter limit check:
+ * bytes for linear surfaces, the byte pitch divided by four otherwise.
+ */
+static inline int crocus_resource_blt_pitch(struct crocus_resource *res)
+{
+   const int pitch_B = res->surf.row_pitch_B;
+   const bool tiled = res->surf.tiling != ISL_TILING_LINEAR;
+
+   return tiled ? pitch_B / 4 : pitch_B;
+}
+
+/**
+ * Map a bytes-per-pixel value (1, 2 or 4) to the matching COLOR_DEPTH__*
+ * encoding.  Any other value is a programming error.
+ */
+static uint32_t
+color_depth_for_cpp(int cpp)
+{
+   if (cpp == 4)
+      return COLOR_DEPTH__32bit;
+
+   if (cpp == 2)
+      return COLOR_DEPTH__565;
+
+   if (cpp == 1)
+      return COLOR_DEPTH__8bit;
+
+   unreachable("not reached");
+}
+
+/**
+ * Emit one XY_SRC_COPY_BLT command followed by an MI flush.
+ *
+ * \param batch       batch to emit into.
+ * \param src, dst    source / destination resources (their BOs and tilings
+ *                    are used).
+ * \param cpp         bytes per element; values above 4 are emulated with a
+ *                    2- or 4-byte copy by scaling the X coordinates.
+ * \param src_pitch, dst_pitch    row pitches in bytes (converted to DWords
+ *                    below for non-linear surfaces).
+ * \param src_offset, dst_offset  offsets into the respective BOs.
+ * \param src_x, src_y, dst_x, dst_y  top-left corners.
+ * \param w, h        size of the copy.
+ * \return always true in the code visible here.
+ */
+static bool emit_copy_blt(struct crocus_batch *batch,
+                          struct crocus_resource *src,
+                          struct crocus_resource *dst,
+                          unsigned cpp,
+                          int32_t src_pitch,
+                          unsigned src_offset,
+                          int32_t dst_pitch,
+                          unsigned dst_offset,
+                          uint16_t src_x, uint16_t src_y,
+                          uint16_t dst_x, uint16_t dst_y,
+                          uint16_t w, uint16_t h)
+
+{
+   uint32_t src_tile_w, src_tile_h;
+   uint32_t dst_tile_w, dst_tile_h;
+   /* Bottom-right corner of the destination rectangle, kept in int so the
+    * sums are not truncated to 16 bits.
+    */
+   int dst_y2 = dst_y + h;
+   int dst_x2 = dst_x + w;
+
+   DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
+       __func__,
+       src, src_pitch, src_offset, src_x, src_y,
+       dst, dst_pitch, dst_offset, dst_x, dst_y, w, h);
+
+   /* Only the tile widths are used (by the asserts below). */
+   isl_get_tile_dims(src->surf.tiling, cpp, &src_tile_w, &src_tile_h);
+   isl_get_tile_dims(dst->surf.tiling, cpp, &dst_tile_w, &dst_tile_h);
+
+   /* For Tiled surfaces, the pitch has to be a multiple of the Tile width
+    * (X direction width of the Tile). This is ensured while allocating the
+    * buffer object.
+    */
+   assert(src->surf.tiling == ISL_TILING_LINEAR || (src_pitch % src_tile_w) == 0);
+   assert(dst->surf.tiling == ISL_TILING_LINEAR || (dst_pitch % dst_tile_w) == 0);
+
+   /* For big formats (such as floating point), do the copy using 16 or
+    * 32bpp and multiply the coordinates.
+    */
+   if (cpp > 4) {
+      if (cpp % 4 == 2) {
+         dst_x *= cpp / 2;
+         dst_x2 *= cpp / 2;
+         src_x *= cpp / 2;
+         cpp = 2;
+      } else {
+         assert(cpp % 4 == 0);
+         dst_x *= cpp / 4;
+         dst_x2 *= cpp / 4;
+         src_x *= cpp / 4;
+         cpp = 4;
+      }
+   }
+
+   /* For tiled source and destination, pitch value should be specified
+    * as a number of Dwords.
+    */
+   if (dst->surf.tiling != ISL_TILING_LINEAR)
+      dst_pitch /= 4;
+
+   if (src->surf.tiling != ISL_TILING_LINEAR)
+      src_pitch /= 4;
+
+   assert(cpp <= 4);
+   crocus_emit_cmd(batch, GENX(XY_SRC_COPY_BLT), xyblt) {
+      /* NOTE(review): 0xCC is presumably the ROP code for a plain source
+       * copy - confirm against the blitter documentation.
+       */
+      xyblt.RasterOperation = 0xCC;
+      xyblt.DestinationTilingEnable = dst->surf.tiling != ISL_TILING_LINEAR;
+      xyblt.SourceTilingEnable = src->surf.tiling != ISL_TILING_LINEAR;
+      xyblt.SourceBaseAddress = ro_bo(src->bo, src_offset);
+      xyblt.DestinationBaseAddress = rw_bo(dst->bo, dst_offset);
+      xyblt.ColorDepth = color_depth_for_cpp(cpp);
+      xyblt._32bppByteMask = cpp == 4 ? 0x3 : 0x1;
+      xyblt.DestinationX1Coordinate = dst_x;
+      xyblt.DestinationY1Coordinate = dst_y;
+      xyblt.DestinationX2Coordinate = dst_x2;
+      xyblt.DestinationY2Coordinate = dst_y2;
+      xyblt.DestinationPitch = dst_pitch;
+      xyblt.SourceX1Coordinate = src_x;
+      xyblt.SourceY1Coordinate = src_y;
+      xyblt.SourcePitch = src_pitch;
+   };
+
+   crocus_emit_mi_flush(batch);
+   return true;
+}
+
+/**
+ * Copy a 2D region between two resources with the BLT engine.
+ *
+ * Returns false, before emitting anything, when either surface is Y-tiled,
+ * when the ISL formats or bytes-per-block differ, or when either blitter
+ * pitch is 32768 or more.  Otherwise the region is copied in chunks of at
+ * most 16384 x 16384 elements and true is returned unless emit_copy_blt()
+ * reports a failure.
+ *
+ * The size of the copy comes from \p src_box only.  For compressed formats
+ * the coordinates and the source size are converted from pixels to blocks
+ * before use.
+ *
+ * \param dst_level, dst_z  destination miplevel and layer/slice.
+ * \param dst_x, dst_y      destination origin within that image.
+ * \param src_level         source miplevel; the layer is src_box->z.
+ */
+static bool crocus_emit_blt(struct crocus_batch *batch,
+                            struct crocus_resource *src,
+                            struct crocus_resource *dst,
+                            unsigned dst_level,
+                            unsigned dst_x, unsigned dst_y,
+                            unsigned dst_z,
+                            unsigned src_level,
+                            const struct pipe_box *src_box)
+{
+   const struct isl_format_layout *src_fmtl = isl_format_get_layout(src->surf.format);
+   unsigned src_cpp = src_fmtl->bpb / 8;
+   const struct isl_format_layout *dst_fmtl = isl_format_get_layout(dst->surf.format);
+   const unsigned dst_cpp = dst_fmtl->bpb / 8;
+   uint16_t src_x, src_y;
+   uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
+   uint32_t src_width = src_box->width, src_height = src_box->height;
+
+   /* gen4/5 can't handle Y tiled blits. */
+   if (src->surf.tiling == ISL_TILING_Y0 || dst->surf.tiling == ISL_TILING_Y0)
+      return false;
+
+   if (src->surf.format != dst->surf.format)
+      return false;
+
+   if (src_cpp != dst_cpp)
+      return false;
+
+   src_x = src_box->x;
+   src_y = src_box->y;
+
+   assert(src_cpp == dst_cpp);
+
+   /* Offset of the selected source level/layer inside the surface. */
+   crocus_resource_get_image_offset(src, src_level, src_box->z, &src_image_x,
+                                    &src_image_y);
+   if (util_format_is_compressed(src->base.format)) {
+      /* Work in units of compression blocks rather than pixels. */
+      int bw = util_format_get_blockwidth(src->base.format);
+      int bh = util_format_get_blockheight(src->base.format);
+      assert(src_x % bw == 0);
+      assert(src_y % bh == 0);
+      src_x /= (int)bw;
+      src_y /= (int)bh;
+      src_width = DIV_ROUND_UP(src_width, (int)bw);
+      src_height = DIV_ROUND_UP(src_height, (int)bh);
+   }
+
+   /* Same for the destination level/layer. */
+   crocus_resource_get_image_offset(dst, dst_level, dst_z, &dst_image_x,
+                                    &dst_image_y);
+   if (util_format_is_compressed(dst->base.format)) {
+      int bw = util_format_get_blockwidth(dst->base.format);
+      int bh = util_format_get_blockheight(dst->base.format);
+      assert(dst_x % bw == 0);
+      assert(dst_y % bh == 0);
+      dst_x /= (int)bw;
+      dst_y /= (int)bh;
+   }
+   src_x += src_image_x;
+   src_y += src_image_y;
+   dst_x += dst_image_x;
+   dst_y += dst_image_y;
+
+   /* According to the Ivy Bridge PRM, Vol1 Part4, section 1.2.1.2 (Graphics
+    * Data Size Limitations):
+    *
+    *    The BLT engine is capable of transferring very large quantities of
+    *    graphics data. Any graphics data read from and written to the
+    *    destination is permitted to represent a number of pixels that
+    *    occupies up to 65,536 scan lines and up to 32,768 bytes per scan line
+    *    at the destination. The maximum number of pixels that may be
+    *    represented per scan line’s worth of graphics data depends on the
+    *    color depth.
+    *
+    * The blitter's pitch is a signed 16-bit integer, but measured in bytes
+    * for linear surfaces and DWords for tiled surfaces.  So the maximum
+    * pitch is 32k linear and 128k tiled.
+    */
+   if (crocus_resource_blt_pitch(src) >= 32768 ||
+       crocus_resource_blt_pitch(dst) >= 32768) {
+      return false;
+   }
+
+   /* We need to split the blit into chunks that each fit within the blitter's
+    * restrictions.  We can't use a chunk size of 32768 because we need to
+    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
+    * a nice round power of two, big enough that performance won't suffer, and
+    * small enough to guarantee everything fits.
+    */
+   const uint32_t max_chunk_size = 16384;
+
+   for (uint32_t chunk_x = 0; chunk_x < src_width; chunk_x += max_chunk_size) {
+      for (uint32_t chunk_y = 0; chunk_y < src_height; chunk_y += max_chunk_size) {
+         const uint32_t chunk_w = MIN2(max_chunk_size, src_width - chunk_x);
+         const uint32_t chunk_h = MIN2(max_chunk_size, src_height - chunk_y);
+
+         /* Split each chunk origin into a byte offset into the BO plus a
+          * small x/y remainder; the remainder is what gets passed to the
+          * blit command as coordinates.
+          */
+         ASSERTED uint32_t z_offset_el, array_offset;
+         uint32_t src_offset, src_tile_x, src_tile_y;
+         isl_tiling_get_intratile_offset_el(src->surf.tiling,
+                                            src_cpp * 8, src->surf.row_pitch_B,
+                                            src->surf.array_pitch_el_rows,
+                                            src_x + chunk_x, src_y + chunk_y, 0, 0,
+                                            &src_offset,
+                                            &src_tile_x, &src_tile_y,
+                                            &z_offset_el, &array_offset);
+         assert(z_offset_el == 0);
+         assert(array_offset == 0);
+
+         uint32_t dst_offset, dst_tile_x, dst_tile_y;
+         isl_tiling_get_intratile_offset_el(dst->surf.tiling,
+                                            dst_cpp * 8, dst->surf.row_pitch_B,
+                                            dst->surf.array_pitch_el_rows,
+                                            dst_x + chunk_x, dst_y + chunk_y, 0, 0,
+                                            &dst_offset,
+                                            &dst_tile_x, &dst_tile_y,
+                                            &z_offset_el, &array_offset);
+         assert(z_offset_el == 0);
+         assert(array_offset == 0);
+         if (!emit_copy_blt(batch, src, dst,
+                            src_cpp, src->surf.row_pitch_B,
+                            src_offset,
+                            dst->surf.row_pitch_B, dst_offset,
+                            src_tile_x, src_tile_y,
+                            dst_tile_x, dst_tile_y,
+                            chunk_w, chunk_h)) {
+            return false;
+         }
+      }
+   }
+   return true;
+}
+
+/**
+ * vtbl.blit_blt implementation: run a pipe blit on the BLT engine when
+ * validate_blit_for_blt() accepts it.  Returns false if the blit was not
+ * performed.
+ */
+static bool crocus_blit_blt(struct crocus_batch *batch,
+                            const struct pipe_blit_info *info)
+{
+   struct crocus_resource *src_res =
+      (struct crocus_resource *)info->src.resource;
+   struct crocus_resource *dst_res =
+      (struct crocus_resource *)info->dst.resource;
+
+   if (validate_blit_for_blt(batch, info)) {
+      /* The destination origin comes from dst.box, the size from src.box. */
+      return crocus_emit_blt(batch, src_res, dst_res,
+                             info->dst.level,
+                             info->dst.box.x, info->dst.box.y, info->dst.box.z,
+                             info->src.level, &info->src.box);
+   }
+
+   return false;
+}
+
+
+/**
+ * vtbl.copy_region_blt implementation: copy a region between two texture
+ * resources on the BLT engine.  Buffer resources are refused (false).
+ */
+static bool crocus_copy_region_blt(struct crocus_batch *batch,
+                                   struct crocus_resource *dst,
+                                   unsigned dst_level,
+                                   unsigned dstx, unsigned dsty, unsigned dstz,
+                                   struct crocus_resource *src,
+                                   unsigned src_level,
+                                   const struct pipe_box *src_box)
+{
+   const bool both_textures = dst->base.target != PIPE_BUFFER &&
+                              src->base.target != PIPE_BUFFER;
+
+   if (!both_textures)
+      return false;
+
+   /* Note the argument order: crocus_emit_blt() takes the source first. */
+   return crocus_emit_blt(batch, src, dst,
+                          dst_level, dstx, dsty, dstz,
+                          src_level, src_box);
+}
+#endif
+
+/**
+ * Fill in the screen's BLT entry points.  They are only provided when
+ * building for GFX_VER <= 5; other versions get NULL for both.
+ */
+void
+genX(init_blt)(struct crocus_screen *screen)
+{
+   /* Start from "no BLT support" and override below where available. */
+   screen->vtbl.copy_region_blt = NULL;
+   screen->vtbl.blit_blt = NULL;
+
+#if GFX_VER <= 5
+   screen->vtbl.copy_region_blt = crocus_copy_region_blt;
+   screen->vtbl.blit_blt = crocus_blit_blt;
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.c b/src/gallium/drivers/crocus/crocus_bufmgr.c
new file mode 100644
index 00000000000..caca821cd7e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.c
@@ -0,0 +1,1689 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_bufmgr.c
+ *
+ * The crocus buffer manager.
+ *
+ * XXX: write better comments
+ * - BOs
+ * - Explain BO cache
+ * - main interface to GEM in the kernel
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <xf86drm.h>
+#include <util/u_atomic.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "errno.h"
+#include "common/intel_clflush.h"
+#include "dev/intel_debug.h"
+#include "common/intel_gem.h"
+#include "dev/intel_device_info.h"
+#include "main/macros.h"
+#include "util/debug.h"
+#include "util/macros.h"
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/os_file.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "string.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+/**
+ * Debugging helper: current CLOCK_MONOTONIC reading as seconds, with the
+ * nanosecond part folded in as a fraction.
+ */
+static double
+get_time(void)
+{
+   struct timespec now;
+   clock_gettime(CLOCK_MONOTONIC, &now);
+
+   const double frac_sec = now.tv_nsec / 1000000000.0;
+   return now.tv_sec + frac_sec;
+}
+
+/* VALGRIND_FREELIKE_BLOCK unfortunately does not actually undo the earlier
+ * VALGRIND_MALLOCLIKE_BLOCK but instead leaves vg convinced the memory is
+ * leaked. All because it does not call VG(cli_free) from its
+ * VG_USERREQ__FREELIKE_BLOCK handler. Instead of treating the memory like
+ * and allocation, we mark it available for use upon mmapping and remove
+ * it upon unmapping.
+ */
+#define VG_DEFINED(ptr, size) VG(VALGRIND_MAKE_MEM_DEFINED(ptr, size))
+#define VG_NOACCESS(ptr, size) VG(VALGRIND_MAKE_MEM_NOACCESS(ptr, size))
+
+#define PAGE_SIZE 4096
+
+/* Print "WARNING: " followed by the printf-style message to stderr the
+ * first time `cond` is true at a given expansion site.  Each expansion has
+ * its own static flag, so later hits at that site are silent.
+ */
+#define WARN_ONCE(cond, fmt...) do {                            \
+   if (unlikely(cond)) {                                        \
+      static bool _warned = false;                              \
+      if (!_warned) {                                           \
+         fprintf(stderr, "WARNING: ");                          \
+         fprintf(stderr, fmt);                                  \
+         _warned = true;                                        \
+      }                                                         \
+   }                                                            \
+} while (0)
+
+#define FILE_DEBUG_FLAG DEBUG_BUFMGR
+
+/**
+ * Atomically add \p add to \p *v unless the value equals \p unless.
+ *
+ * \return non-zero when \p *v was equal to \p unless (nothing was added),
+ *         zero when the addition was performed.
+ */
+static inline int
+atomic_add_unless(int *v, int add, int unless)
+{
+   int c, old;
+   c = p_atomic_read(v);
+   /* Compare-and-swap retry loop: this relies on p_atomic_cmpxchg()
+    * returning the value it found in *v, so a result different from `c`
+    * means another thread got in first and we retry with the fresh value.
+    */
+   while (c != unless && (old = p_atomic_cmpxchg(v, c, c + add)) != c)
+      c = old;
+   return c == unless;
+}
+
+/** One size class of the BO cache. */
+struct bo_cache_bucket {
+   /** List of cached BOs. */
+   struct list_head head;
+
+   /** Size of this bucket, in bytes. */
+   uint64_t size;
+};
+
+/** Record of one export of a BO's handle through a DRM file descriptor. */
+struct bo_export {
+   /** File descriptor associated with a handle export. */
+   int drm_fd;
+
+   /** GEM handle in drm_fd */
+   uint32_t gem_handle;
+
+   /** List node; presumably chained onto a BO's `exports` list - confirm. */
+   struct list_head link;
+};
+
+/** Buffer manager state: DRM fd, lock, BO cache buckets and lookup tables. */
+struct crocus_bufmgr {
+   /**
+    * Node linking this buffer manager into a list of buffer managers
+    * (presumably global_bufmgr_list - confirm against the insertion code).
+    */
+   struct list_head link;
+
+   uint32_t refcount;
+
+   int fd;
+
+   mtx_t lock;
+
+   /**
+    * Cached GEM objects grouped by size.  bucket_for_size() indexes this
+    * array as row * 4 + column, i.e. four buckets per power-of-two row.
+    */
+   struct bo_cache_bucket cache_bucket[14 * 4];
+   /** Number of usable cache_bucket entries; larger indices are rejected. */
+   int num_buckets;
+   time_t time;
+
+   struct hash_table *name_table;
+   struct hash_table *handle_table;
+
+   /**
+    * List of BOs which we've effectively freed, but are hanging on to
+    * until they're idle before closing and returning the VMA.
+    */
+   struct list_head zombie_list;
+
+   bool has_llc:1;
+   bool has_mmap_offset:1;
+   bool has_tiling_uapi:1;
+   bool bo_reuse:1;
+};
+
+/* List of buffer managers, statically initialised as an empty list (both
+ * links point back at the head).  NOTE(review): the mutex presumably guards
+ * this list - confirm at the use sites.
+ */
+static mtx_t global_bufmgr_list_mutex = _MTX_INITIALIZER_NP;
+static struct list_head global_bufmgr_list = {
+   .next = &global_bufmgr_list,
+   .prev = &global_bufmgr_list,
+};
+
+static int bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+                                  uint32_t stride);
+
+static void bo_free(struct crocus_bo *bo);
+
+/** Hash callback for tables keyed by a pointer to a 32-bit integer. */
+static uint32_t
+key_hash_uint(const void *key)
+{
+   /* Hash exactly the four bytes of the key. */
+   return _mesa_hash_data(key, sizeof(uint32_t));
+}
+
+/** Equality callback: compares the two unsigned integers pointed to. */
+static bool
+key_uint_equal(const void *a, const void *b)
+{
+   const unsigned *lhs = a;
+   const unsigned *rhs = b;
+
+   return *lhs == *rhs;
+}
+
+/**
+ * Look \p key up in \p ht and, if a BO is found, take a reference on it.
+ *
+ * \return the referenced BO, or NULL when the table has no BO for the key.
+ */
+static struct crocus_bo *
+find_and_ref_external_bo(struct hash_table *ht, unsigned int key)
+{
+   struct hash_entry *hit = _mesa_hash_table_search(ht, &key);
+   if (!hit || !hit->data)
+      return NULL;
+
+   struct crocus_bo *bo = hit->data;
+
+   assert(bo->external);
+   assert(!bo->reusable);
+
+   /* A non-reusable BO is never on a cache list, but it can still be on
+    * the zombie list: it may have dropped to zero references without being
+    * closed yet and then been imported again.  Unlink it in that case, as
+    * it is alive again.
+    */
+   if (bo->head.prev || bo->head.next)
+      list_del(&bo->head);
+
+   crocus_bo_reference(bo);
+   return bo;
+}
+
+/**
+ * This function finds the correct bucket fit for the input size.
+ * The bucket index is computed directly from the size in O(1), instead of
+ * iterating over all the buckets.
+ *
+ * \param bufmgr  buffer manager owning the cache buckets.
+ * \param size    requested allocation size in bytes.
+ * \return the bucket to use, or NULL when no bucket covers this size
+ *         (including a size of zero and sizes too large to cache).
+ */
+static struct bo_cache_bucket *
+bucket_for_size(struct crocus_bufmgr *bufmgr, uint64_t size)
+{
+   /* Calculating the pages and rounding up to the page size.  Written as
+    * divide-then-round so that sizes near UINT64_MAX cannot wrap around.
+    */
+   const uint64_t pages64 = size / PAGE_SIZE + (size % PAGE_SIZE != 0);
+
+   /* Zero pages has no bucket, and a page count that does not fit in 32
+    * bits must not be truncated into a (much too small) bucket.
+    */
+   if (pages64 == 0 || pages64 > UINT32_MAX)
+      return NULL;
+
+   const unsigned pages = (unsigned)pages64;
+
+   /* Row  Bucket sizes    clz((x-1) | 3)   Row    Column
+    *        in pages                      stride   size
+    *   0:   1  2  3  4 -> 30 30 30 30        4       1
+    *   1:   5  6  7  8 -> 29 29 29 29        4       1
+    *   2:  10 12 14 16 -> 28 28 28 28        8       2
+    *   3:  20 24 28 32 -> 27 27 27 27       16       4
+    */
+   const unsigned row = 30 - __builtin_clz((pages - 1) | 3);
+
+   /* Unsigned shift: row can reach 30 for huge page counts, where a signed
+    * `4 << row` would overflow.  Those rows are rejected by the index check
+    * at the end.
+    */
+   const unsigned row_max_pages = 4u << row;
+
+   /* The '& ~2' is the special case for the first row (row 0 in the table
+    * above). There, max pages / 2 is 2, but the previous row maximum is
+    * zero (because there is no previous row). All row maximum sizes are
+    * power of 2, so that is the only case where that bit will be set.
+    */
+   const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2;
+   int col_size_log2 = row - 1;
+   col_size_log2 += (col_size_log2 < 0);
+
+   const unsigned col = (pages - prev_row_max_pages +
+                         ((1 << col_size_log2) - 1)) >> col_size_log2;
+
+   /* Calculating the index based on the row and column. */
+   const unsigned index = (row * 4) + (col - 1);
+
+   return (index < bufmgr->num_buckets) ?
+          &bufmgr->cache_bucket[index] : NULL;
+}
+
+
+/**
+ * Ask the kernel whether \p bo is still busy and cache the answer in
+ * bo->idle.
+ *
+ * \return the kernel's busy value, or 0 if the query itself failed (in
+ *         which case bo->idle is left untouched).
+ */
+int
+crocus_bo_busy(struct crocus_bo *bo)
+{
+   struct drm_i915_gem_busy query = { .handle = bo->gem_handle };
+
+   if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &query) != 0)
+      return false;
+
+   bo->idle = !query.busy;
+   return query.busy;
+}
+
+/**
+ * Issue the GEM madvise ioctl for \p bo with the given \p state.
+ *
+ * \return the `retained` field after the ioctl.  It is preset to 1, so that
+ *         is also what comes back if the ioctl leaves it unmodified.
+ */
+int
+crocus_bo_madvise(struct crocus_bo *bo, int state)
+{
+   struct drm_i915_gem_madvise args = { 0 };
+
+   args.handle = bo->gem_handle;
+   args.madv = state;
+   args.retained = 1;
+
+   /* The ioctl's own return value is deliberately not inspected. */
+   intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MADVISE, &args);
+
+   return args.retained;
+}
+
+/**
+ * Allocate a zero-filled crocus_bo with its export list and pointer hash
+ * initialised.  Returns NULL if the allocation fails.
+ */
+static struct crocus_bo *
+bo_calloc(void)
+{
+   struct crocus_bo *bo = calloc(1, sizeof(*bo));
+
+   if (bo) {
+      list_inithead(&bo->exports);
+      bo->hash = _mesa_hash_pointer(bo);
+   }
+
+   return bo;
+}
+
+/**
+ * Try to take an idle, still-backed BO out of \p bucket.
+ *
+ * \param bufmgr     not used in this function.
+ * \param bucket     cache bucket to search; NULL yields NULL.
+ * \param alignment  not used in this function.
+ * \param flags      BO_ALLOC_* flags; BO_ALLOC_ZEROED makes the reused BO
+ *                   get cleared through a CPU mapping.
+ * \return a BO removed from the bucket, or NULL if none could be reused.
+ */
+static struct crocus_bo *
+alloc_bo_from_cache(struct crocus_bufmgr *bufmgr,
+                    struct bo_cache_bucket *bucket,
+                    uint32_t alignment,
+                    unsigned flags)
+{
+   if (!bucket)
+      return NULL;
+
+   struct crocus_bo *bo = NULL;
+
+   list_for_each_entry_safe(struct crocus_bo, cur, &bucket->head, head) {
+      /* If the BO currently at the front of the bucket is busy, give up on
+       * the whole bucket and return NULL rather than scanning further.
+       * NOTE(review): this assumes entries behind it are no more likely to
+       * be idle - confirm against how BOs are inserted into the bucket.
+       */
+      if (crocus_bo_busy(cur))
+         return NULL;
+
+      list_del(&cur->head);
+
+      /* Tell the kernel we need this BO.  If it still exists, we're done! */
+      if (crocus_bo_madvise(cur, I915_MADV_WILLNEED)) {
+         bo = cur;
+         break;
+      }
+
+      /* This BO was purged, throw it out and keep looking. */
+      bo_free(cur);
+   }
+
+   if (!bo)
+      return NULL;
+
+   /* Zero the contents if necessary.  If this fails, fall back to
+    * allocating a fresh BO, which will always be zeroed by the kernel.
+    */
+   if (flags & BO_ALLOC_ZEROED) {
+      void *map = crocus_bo_map(NULL, bo, MAP_WRITE | MAP_RAW);
+      if (map) {
+         memset(map, 0, bo->size);
+      } else {
+         bo_free(bo);
+         return NULL;
+      }
+   }
+
+   return bo;
+}
+
+/**
+ * Create a brand-new GEM object of \p bo_size bytes and wrap it in a
+ * crocus_bo (linear tiling, no swizzle, stride 0, marked idle).
+ *
+ * \return the new BO, or NULL if the allocation or either ioctl fails.
+ */
+static struct crocus_bo *
+alloc_fresh_bo(struct crocus_bufmgr *bufmgr, uint64_t bo_size)
+{
+   struct crocus_bo *bo = bo_calloc();
+   if (!bo)
+      return NULL;
+
+   struct drm_i915_gem_create create = { .size = bo_size };
+
+   /* All new BOs we get from the kernel are zeroed, so we don't need to
+    * worry about that here.
+    */
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CREATE, &create) != 0) {
+      /* No GEM handle exists yet, so only the wrapper needs freeing. */
+      free(bo);
+      return NULL;
+   }
+
+   bo->gem_handle = create.handle;
+   bo->bufmgr = bufmgr;
+   bo->size = bo_size;
+   bo->idle = true;
+   bo->tiling_mode = I915_TILING_NONE;
+   bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
+   bo->stride = 0;
+
+   /* Calling set_domain() will allocate pages for the BO outside of the
+    * struct mutex lock in the kernel, which is more efficient than waiting
+    * to create them during the first execbuf that uses the BO.
+    */
+   struct drm_i915_gem_set_domain sd = {
+      .handle = bo->gem_handle,
+      .read_domains = I915_GEM_DOMAIN_CPU,
+      .write_domain = 0,
+   };
+
+   if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd) != 0) {
+      /* The handle is live at this point, so go through bo_free(). */
+      bo_free(bo);
+      return NULL;
+   }
+
+   return bo;
+}
+
+/**
+ * Common BO allocation path: reuse a cached BO when possible, otherwise
+ * create a fresh one, then apply tiling and initialise the bookkeeping.
+ *
+ * \param name         debug name stored in the BO (the pointer is kept, not
+ *                     copied).
+ * \param size         requested size in bytes; rounded up to the bucket
+ *                     size or to a whole number of pages.
+ * \param alignment    forwarded to the cache lookup.
+ * \param flags        BO_ALLOC_* flags.
+ * \param tiling_mode  tiling to set on the BO.
+ * \param stride       stride passed along with the tiling mode.
+ * \return a BO with a reference count of 1, or NULL on failure.
+ */
+static struct crocus_bo *
+bo_alloc_internal(struct crocus_bufmgr *bufmgr,
+                  const char *name,
+                  uint64_t size,
+                  uint32_t alignment,
+                  unsigned flags,
+                  uint32_t tiling_mode,
+                  uint32_t stride)
+{
+   struct crocus_bo *bo;
+   unsigned int page_size = getpagesize();
+   struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
+
+   /* Round the size up to the bucket size, or if we don't have caching
+    * at this size, a multiple of the page size.
+    */
+   uint64_t bo_size =
+      bucket ? bucket->size : MAX2(ALIGN(size, page_size), page_size);
+
+   mtx_lock(&bufmgr->lock);
+
+   /* Get a buffer out of the cache if available.  The lock is only held
+    * for the cache lookup, not for the fresh allocation below.
+    */
+   bo = alloc_bo_from_cache(bufmgr, bucket, alignment, flags);
+
+   mtx_unlock(&bufmgr->lock);
+
+   if (!bo) {
+      bo = alloc_fresh_bo(bufmgr, bo_size);
+      if (!bo)
+         return NULL;
+   }
+
+   if (bo_set_tiling_internal(bo, tiling_mode, stride))
+      goto err_free;
+
+   bo->name = name;
+   p_atomic_set(&bo->refcount, 1);
+   /* Only bucketed sizes can go back into the cache, and only when reuse
+    * is enabled on the buffer manager.
+    */
+   bo->reusable = bucket && bufmgr->bo_reuse;
+   bo->cache_coherent = bufmgr->has_llc;
+   bo->index = -1;
+   bo->kflags = 0;
+
+   if ((flags & BO_ALLOC_COHERENT) && !bo->cache_coherent) {
+      /* NOTE(review): caching = 1 is presumably I915_CACHING_CACHED -
+       * confirm against the i915 uapi header.
+       */
+      struct drm_i915_gem_caching arg = {
+         .handle = bo->gem_handle,
+         .caching = 1,
+      };
+      if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) == 0) {
+         /* A BO whose caching mode was changed is kept out of the cache. */
+         bo->cache_coherent = true;
+         bo->reusable = false;
+      }
+   }
+
+   DBG("bo_create: buf %d (%s) %llub\n", bo->gem_handle,
+       bo->name, (unsigned long long) size);
+
+   return bo;
+
+err_free:
+   bo_free(bo);
+   return NULL;
+}
+
+/**
+ * Allocate a linear BO of at least \p size bytes with default settings
+ * (alignment 1, no allocation flags, no stride).
+ */
+struct crocus_bo *
+crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+                const char *name,
+                uint64_t size)
+{
+   const uint32_t default_alignment = 1;
+   const unsigned no_flags = 0;
+   const uint32_t no_stride = 0;
+
+   return bo_alloc_internal(bufmgr, name, size, default_alignment,
+                            no_flags, I915_TILING_NONE, no_stride);
+}
+
+/**
+ * Allocate a BO with an explicit alignment, tiling mode, pitch and set of
+ * BO_ALLOC_* flags.  Thin wrapper around bo_alloc_internal().
+ */
+struct crocus_bo *
+crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr, const char *name,
+                      uint64_t size, uint32_t alignment,
+                      uint32_t tiling_mode, uint32_t pitch, unsigned flags)
+{
+   /* The internal helper takes the flags before the tiling parameters. */
+   struct crocus_bo *bo = bo_alloc_internal(bufmgr, name, size, alignment,
+                                            flags, tiling_mode, pitch);
+   return bo;
+}
+
+/**
+ * Wrap a range of user memory in a BO via the i915 userptr ioctl.
+ *
+ * \param bufmgr  buffer manager whose DRM fd is used.
+ * \param name    debug name stored in the BO (the pointer is kept, not
+ *                copied).
+ * \param ptr     start of the user memory; also stored as the CPU mapping.
+ * \param size    size of the range in bytes.
+ * \return a BO with a reference count of 1, or NULL if the allocation or
+ *         one of the ioctls fails.
+ */
+struct crocus_bo *
+crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr, const char *name,
+                         void *ptr, size_t size)
+{
+   struct crocus_bo *bo;
+
+   bo = bo_calloc();
+   if (!bo)
+      return NULL;
+
+   struct drm_i915_gem_userptr arg = {
+      .user_ptr = (uintptr_t)ptr,
+      .user_size = size,
+   };
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_USERPTR, &arg))
+      goto err_free;
+   bo->gem_handle = arg.handle;
+
+   /* Check the buffer for validity before we try and use it in a batch */
+   struct drm_i915_gem_set_domain sd = {
+      .handle = bo->gem_handle,
+      .read_domains = I915_GEM_DOMAIN_CPU,
+   };
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd))
+      goto err_close;
+
+   bo->name = name;
+   bo->size = size;
+   bo->map_cpu = ptr;
+
+   bo->bufmgr = bufmgr;
+   bo->kflags = 0;
+
+   /* Nothing in this function assigns bo->gtt_offset, so it is still zero
+    * from bo_calloc() here.  Treating zero as an error would make every
+    * userptr allocation fail, so no such check is made.
+    */
+
+   p_atomic_set(&bo->refcount, 1);
+   bo->userptr = true;
+   bo->cache_coherent = true;
+   bo->index = -1;
+   bo->idle = true;
+
+   return bo;
+
+err_close:
+   {
+      /* Hand the kernel a properly sized and zero-padded argument rather
+       * than a pointer to the bare 32-bit handle.
+       */
+      struct drm_gem_close close_arg = { .handle = bo->gem_handle };
+      intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close_arg);
+   }
+err_free:
+   free(bo);
+   return NULL;
+}
+
+/**
+ * Returns a crocus_bo wrapping the given buffer object handle.
+ *
+ * This can be used when one application needs to pass a buffer object
+ * to another.
+ *
+ * \param bufmgr  buffer manager whose DRM fd opens the name and whose
+ *                name/handle tables are searched and updated.
+ * \param name    debug label; the pointer is stored in the bo, not copied.
+ * \param handle  global name passed to DRM_IOCTL_GEM_OPEN.
+ *
+ * \return a bo already known to one of the tables (looked up through
+ *         find_and_ref_external_bo(), which presumably takes a reference),
+ *         a newly created bo with a refcount of 1, or NULL if the open
+ *         ioctl, the allocation or the tiling query fails.
+ *
+ * bufmgr->lock is held for the whole lookup/creation sequence.
+ */
+struct crocus_bo *
+crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+                               const char *name, unsigned int handle)
+{
+   struct crocus_bo *bo;
+
+   /* At the moment most applications only have a few named bo.
+    * For instance, in a DRI client only the render buffers passed
+    * between X and the client are named.  Start by checking whether
+    * this name has been opened before by looking it up in name_table.
+    */
+   mtx_lock(&bufmgr->lock);
+   bo = find_and_ref_external_bo(bufmgr->name_table, handle);
+   if (bo)
+      goto out;
+
+   struct drm_gem_open open_arg = { .name = handle };
+   int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_OPEN, &open_arg);
+   if (ret != 0) {
+      DBG("Couldn't reference %s handle 0x%08x: %s\n",
+          name, handle, strerror(errno));
+      bo = NULL;
+      goto out;
+   }
+   /* Now see if someone has used a prime handle to get this
+    * object from the kernel before by looking through the list
+    * again for a matching gem_handle
+    */
+   bo = find_and_ref_external_bo(bufmgr->handle_table, open_arg.handle);
+   if (bo)
+      goto out;
+
+   bo = bo_calloc();
+   if (!bo)
+      goto out;
+
+   p_atomic_set(&bo->refcount, 1);
+
+   bo->size = open_arg.size;
+   bo->gtt_offset = 0;
+   bo->bufmgr = bufmgr;
+   bo->gem_handle = open_arg.handle;
+   bo->name = name;
+   bo->global_name = handle;
+   bo->reusable = false;
+   bo->external = true;
+   bo->kflags = 0;
+
+   /* Register the bo under both keys so later imports of the same object,
+    * by name or by GEM handle, find this bo instead of creating a second one.
+    */
+   _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+   _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+
+   struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+   ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling);
+   if (ret != 0)
+      goto err_unref;
+
+   bo->tiling_mode = get_tiling.tiling_mode;
+   bo->swizzle_mode = get_tiling.swizzle_mode;
+   /* XXX stride is unknown */
+   DBG("bo_create_from_handle: %d (%s)\n", handle, bo->name);
+
+out:
+   mtx_unlock(&bufmgr->lock);
+   return bo;
+
+err_unref:
+   /* NOTE(review): the bo is already in both tables here and this function
+    * never sets bo->idle, so bo_free() presumably parks it on the zombie
+    * list (still reachable through the tables) instead of closing it right
+    * away -- confirm this is intended.
+    */
+   bo_free(bo);
+   mtx_unlock(&bufmgr->lock);
+   return NULL;
+}
+
+/**
+ * Releases a buffer object for good: removes it from the bufmgr lookup
+ * tables (external BOs only), closes its GEM handle and frees the
+ * crocus_bo structure itself.
+ *
+ * A failing GEM_CLOSE is only reported through DBG(); the structure is
+ * freed regardless.
+ */
+static void
+bo_close(struct crocus_bo *bo)
+{
+   struct crocus_bufmgr *mgr = bo->bufmgr;
+
+   if (bo->external) {
+      /* Flinked BOs are additionally indexed by their global name. */
+      if (bo->global_name) {
+         struct hash_entry *by_name =
+            _mesa_hash_table_search(mgr->name_table, &bo->global_name);
+         _mesa_hash_table_remove(mgr->name_table, by_name);
+      }
+
+      struct hash_entry *by_handle =
+         _mesa_hash_table_search(mgr->handle_table, &bo->gem_handle);
+      _mesa_hash_table_remove(mgr->handle_table, by_handle);
+   }
+
+   /* Hand the GEM handle back to the kernel. */
+   struct drm_gem_close close_arg = { .handle = bo->gem_handle };
+   if (intel_ioctl(mgr->fd, DRM_IOCTL_GEM_CLOSE, &close_arg) != 0) {
+      DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n",
+          bo->gem_handle, bo->name, strerror(errno));
+   }
+
+   free(bo);
+}
+
+/**
+ * Tears down every mapping this bo owns and then either closes it (when it
+ * is known to be idle) or queues it on the bufmgr zombie list so it can be
+ * closed later.
+ *
+ * The CPU mapping of a userptr bo is the caller's memory and is therefore
+ * left alone.
+ */
+static void
+bo_free(struct crocus_bo *bo)
+{
+   struct crocus_bufmgr *mgr = bo->bufmgr;
+
+   /* Same order as the fields are checked elsewhere: CPU, WC, GTT. */
+   void *const owned_maps[] = {
+      bo->userptr ? NULL : bo->map_cpu,
+      bo->map_wc,
+      bo->map_gtt,
+   };
+
+   for (unsigned m = 0; m < ARRAY_SIZE(owned_maps); m++) {
+      if (!owned_maps[m])
+         continue;
+
+      VG_NOACCESS(owned_maps[m], bo->size);
+      munmap(owned_maps[m], bo->size);
+   }
+
+   if (!bo->idle) {
+      /* Defer closing the GEM BO and returning the VMA for reuse until the
+       * BO is idle.  Just move it to the dead list for now.
+       */
+      list_addtail(&bo->head, &mgr->zombie_list);
+      return;
+   }
+
+   bo_close(bo);
+}
+
+/**
+ * Frees all cached buffers significantly older than @time, then closes
+ * zombie BOs up to the first one that is still busy.
+ *
+ * Does nothing when the bufmgr was already cleaned at this same @time
+ * value; otherwise records @time in bufmgr->time when done.
+ */
+static void
+cleanup_bo_cache(struct crocus_bufmgr *bufmgr, time_t time)
+{
+   if (bufmgr->time == time)
+      return;
+
+   for (int b = 0; b < bufmgr->num_buckets; b++) {
+      struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[b];
+
+      list_for_each_entry_safe(struct crocus_bo, cached, &bucket->head, head) {
+         /* Stop at the first entry freed within the last second. */
+         if (time - cached->free_time <= 1)
+            break;
+
+         list_del(&cached->head);
+         bo_free(cached);
+      }
+   }
+
+   list_for_each_entry_safe(struct crocus_bo, zombie, &bufmgr->zombie_list, head) {
+      /* Stop once we reach a busy BO - all others past this point were
+       * freed more recently so are likely also busy.
+       */
+      const bool still_busy = !zombie->idle && crocus_bo_busy(zombie);
+      if (still_busy)
+         break;
+
+      list_del(&zombie->head);
+      bo_close(zombie);
+   }
+
+   bufmgr->time = time;
+}
+
+/**
+ * Handles a bo whose last reference was just dropped.
+ *
+ * A reusable bo whose size maps to a cache bucket, and for which
+ * crocus_bo_madvise(I915_MADV_DONTNEED) reports success, is stamped with
+ * @time, stripped of its name and appended to that bucket.  Every other bo
+ * goes to bo_free().
+ */
+static void
+bo_unreference_final(struct crocus_bo *bo, time_t time)
+{
+   struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+   DBG("bo_unreference final: %d (%s)\n", bo->gem_handle, bo->name);
+
+   struct bo_cache_bucket *bucket =
+      bo->reusable ? bucket_for_size(bufmgr, bo->size) : NULL;
+
+   /* Not cacheable (or the kernel refused the madvise): release it. */
+   if (!bucket || !crocus_bo_madvise(bo, I915_MADV_DONTNEED)) {
+      bo_free(bo);
+      return;
+   }
+
+   /* Put the buffer into our internal cache for reuse. */
+   bo->free_time = time;
+   bo->name = NULL;
+   list_addtail(&bo->head, &bucket->head);
+}
+
+/**
+ * Drops one reference on @bo.  Passing NULL is allowed and does nothing.
+ *
+ * When the count reaches zero the bo is handed to bo_unreference_final()
+ * and the bufmgr cache is pruned, both under bufmgr->lock and both using
+ * the current CLOCK_MONOTONIC second as the timestamp.
+ */
+void
+crocus_bo_unreference(struct crocus_bo *bo)
+{
+   if (bo == NULL)
+      return;
+
+   assert(p_atomic_read(&bo->refcount) > 0);
+
+   /* NOTE(review): atomic_add_unless() is defined elsewhere; presumably it
+    * decrements without the lock unless the count is 1 and returns non-zero
+    * in that case, so only a potential final unreference takes the slow
+    * path below -- confirm against its definition.
+    */
+   if (atomic_add_unless(&bo->refcount, -1, 1)) {
+      struct crocus_bufmgr *bufmgr = bo->bufmgr;
+      struct timespec time;
+
+      clock_gettime(CLOCK_MONOTONIC, &time);
+
+      mtx_lock(&bufmgr->lock);
+
+      /* Re-check under the lock: the bo is only torn down if this
+       * decrement really takes the count to zero.
+       */
+      if (p_atomic_dec_zero(&bo->refcount)) {
+         bo_unreference_final(bo, time.tv_sec);
+         cleanup_bo_cache(bufmgr, time.tv_sec);
+      }
+
+      mtx_unlock(&bufmgr->lock);
+   }
+}
+
+/**
+ * Waits for rendering on @bo to finish and, when a debug callback is
+ * present and the bo was not already marked idle, reports waits longer
+ * than 0.01ms through perf_debug().
+ *
+ * \param dbg     debug callback; NULL disables the timing entirely.
+ * \param bo      buffer object to wait on.
+ * \param action  text placed at the start of the perf message.
+ */
+static void
+bo_wait_with_stall_warning(struct pipe_debug_callback *dbg,
+                           struct crocus_bo *bo,
+                           const char *action)
+{
+   const bool busy = dbg && !bo->idle;
+
+   /* Common case: nothing to measure, just wait. */
+   if (!unlikely(busy)) {
+      crocus_bo_wait_rendering(bo);
+      return;
+   }
+
+   double elapsed = -get_time();
+   crocus_bo_wait_rendering(bo);
+   elapsed += get_time();
+
+   if (elapsed > 1e-5) /* 0.01ms */ {
+      perf_debug(dbg, "%s a busy \"%s\" BO stalled and took %.03f ms.\n",
+                 action, bo->name, elapsed * 1000);
+   }
+}
+
+/**
+ * Emits, through DBG(), the name of every MAP_* bit set in @flags that this
+ * function knows about, each followed by a space, and then a newline.
+ *
+ * Callers in this file use it to finish a DBG() line describing a mapping.
+ */
+static void
+print_flags(unsigned flags)
+{
+   if (flags & MAP_READ)
+      DBG("READ ");
+   if (flags & MAP_WRITE)
+      DBG("WRITE ");
+   if (flags & MAP_ASYNC)
+      DBG("ASYNC ");
+   if (flags & MAP_PERSISTENT)
+      DBG("PERSISTENT ");
+   if (flags & MAP_COHERENT)
+      DBG("COHERENT ");
+   if (flags & MAP_RAW)
+      DBG("RAW ");
+   DBG("\n");
+}
+
+/**
+ * Maps @bo with DRM_IOCTL_I915_GEM_MMAP, which returns the address of the
+ * mapping directly.
+ *
+ * \param dbg  unused here.
+ * \param bo   buffer object to map in full (bo->size bytes).
+ * \param wc   request I915_MMAP_WC when true, no flags otherwise.
+ *
+ * \return the mapping, or NULL if the ioctl fails.
+ */
+static void *
+crocus_bo_gem_mmap_legacy(struct pipe_debug_callback *dbg,
+                          struct crocus_bo *bo, bool wc)
+{
+   struct drm_i915_gem_mmap mmap_arg = {
+      .handle = bo->gem_handle,
+      .size = bo->size,
+      .flags = wc ? I915_MMAP_WC : 0,
+   };
+
+   if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg) == 0)
+      return (void *) (uintptr_t) mmap_arg.addr_ptr;
+
+   DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+       __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+   return NULL;
+}
+
+/**
+ * Maps @bo in two steps: DRM_IOCTL_I915_GEM_MMAP_OFFSET yields a fake
+ * offset, which is then passed to mmap() on the DRM fd.
+ *
+ * \param dbg  unused here.
+ * \param bo   buffer object to map in full (bo->size bytes).
+ * \param wc   selects I915_MMAP_OFFSET_WC when true, I915_MMAP_OFFSET_WB
+ *             otherwise.
+ *
+ * \return the mapping, or NULL if either step fails.
+ */
+static void *
+crocus_bo_gem_mmap_offset(struct pipe_debug_callback *dbg, struct crocus_bo *bo,
+                          bool wc)
+{
+   const int drm_fd = bo->bufmgr->fd;
+
+   struct drm_i915_gem_mmap_offset mmap_arg = {
+      .handle = bo->gem_handle,
+      .flags = wc ? I915_MMAP_OFFSET_WC : I915_MMAP_OFFSET_WB,
+   };
+
+   /* Step 1: ask the kernel for the fake offset. */
+   if (intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_MMAP_OFFSET, &mmap_arg) != 0) {
+      DBG("%s:%d: Error preparing buffer %d (%s): %s .\n",
+          __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+      return NULL;
+   }
+
+   /* Step 2: map that offset. */
+   void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                    drm_fd, mmap_arg.offset);
+   if (map != MAP_FAILED)
+      return map;
+
+   DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+       __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+   return NULL;
+}
+
+/**
+ * Maps @bo using the mmap-offset path when the bufmgr reports support for
+ * it, and the legacy mmap ioctl otherwise.
+ *
+ * \return the mapping, or NULL on failure.
+ */
+static void *
+crocus_bo_gem_mmap(struct pipe_debug_callback *dbg, struct crocus_bo *bo, bool wc)
+{
+   if (!bo->bufmgr->has_mmap_offset)
+      return crocus_bo_gem_mmap_legacy(dbg, bo, wc);
+
+   return crocus_bo_gem_mmap_offset(dbg, bo, wc);
+}
+
+/**
+ * Returns a cached CPU mapping of @bo, creating it on first use.
+ *
+ * \param dbg    debug callback used for stall reporting; may be NULL.
+ * \param bo     buffer object to map.
+ * \param flags  MAP_* bits; MAP_ASYNC skips the wait for the GPU, and
+ *               MAP_WRITE is only accepted on cache-coherent BOs.
+ *
+ * \return bo->map_cpu, or NULL if a new mapping could not be created.
+ */
+static void *
+crocus_bo_map_cpu(struct pipe_debug_callback *dbg,
+                  struct crocus_bo *bo, unsigned flags)
+{
+   /* We disallow CPU maps for writing to non-coherent buffers, as the
+    * CPU map can become invalidated when a batch is flushed out, which
+    * can happen at unpredictable times.  You should use WC maps instead.
+    */
+   assert(bo->cache_coherent || !(flags & MAP_WRITE));
+
+   if (!bo->map_cpu) {
+      DBG("crocus_bo_map_cpu: %d (%s)\n", bo->gem_handle, bo->name);
+
+      void *map = crocus_bo_gem_mmap(dbg, bo, false);
+      if (!map) {
+         return NULL;
+      }
+
+      VG_DEFINED(map, bo->size);
+
+      /* Publish the mapping; if the compare-exchange reports that map_cpu
+       * was already set (presumably by a racing mapper), drop ours and use
+       * the existing one.
+       */
+      if (p_atomic_cmpxchg(&bo->map_cpu, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         munmap(map, bo->size);
+      }
+   }
+   assert(bo->map_cpu);
+
+   DBG("crocus_bo_map_cpu: %d (%s) -> %p, ", bo->gem_handle, bo->name,
+       bo->map_cpu);
+   print_flags(flags);
+
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(dbg, bo, "CPU mapping");
+   }
+
+   if (!bo->cache_coherent && !bo->bufmgr->has_llc) {
+      /* If we're reusing an existing CPU mapping, the CPU caches may
+       * contain stale data from the last time we read from that mapping.
+       * (With the BO cache, it might even be data from a previous buffer!)
+       * Even if it's a brand new mapping, the kernel may have zeroed the
+       * buffer via CPU writes.
+       *
+       * We need to invalidate those cachelines so that we see the latest
+       * contents, and so long as we only read from the CPU mmap we do not
+       * need to write those cachelines back afterwards.
+       *
+       * On LLC, the empirical evidence suggests that writes from the GPU
+       * that bypass the LLC (i.e. for scanout) do *invalidate* the CPU
+       * cachelines. (Other reads, such as the display engine, bypass the
+       * LLC entirely requiring us to keep dirty pixels for the scanout
+       * out of any cache.)
+       */
+      intel_invalidate_range(bo->map_cpu, bo->size);
+   }
+
+   return bo->map_cpu;
+}
+
+/**
+ * Returns a cached write-combining mapping of @bo, creating it on first
+ * use.
+ *
+ * \param dbg    debug callback used for stall reporting; may be NULL.
+ * \param bo     buffer object to map.
+ * \param flags  MAP_* bits; only MAP_ASYNC changes behavior here (it skips
+ *               the wait for the GPU), the rest are just logged.
+ *
+ * \return bo->map_wc, or NULL if a new mapping could not be created.
+ */
+static void *
+crocus_bo_map_wc(struct pipe_debug_callback *dbg,
+                 struct crocus_bo *bo, unsigned flags)
+{
+   if (!bo->map_wc) {
+      DBG("crocus_bo_map_wc: %d (%s)\n", bo->gem_handle, bo->name);
+
+      void *map = crocus_bo_gem_mmap(dbg, bo, true);
+      if (!map) {
+         return NULL;
+      }
+
+      VG_DEFINED(map, bo->size);
+
+      /* Publish the mapping; if map_wc turns out to be set already
+       * (presumably by a racing mapper), discard the one we just made.
+       */
+      if (p_atomic_cmpxchg(&bo->map_wc, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         munmap(map, bo->size);
+      }
+   }
+   assert(bo->map_wc);
+
+   DBG("crocus_bo_map_wc: %d (%s) -> %p\n", bo->gem_handle, bo->name, bo->map_wc);
+   print_flags(flags);
+
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(dbg, bo, "WC mapping");
+   }
+
+   return bo->map_wc;
+}
+
+/**
+ * Perform an uncached mapping via the GTT.
+ *
+ * Write access through the GTT is not quite fully coherent. On low power
+ * systems especially, like modern Atoms, we can observe reads from RAM before
+ * the write via GTT has landed. A write memory barrier that flushes the Write
+ * Combining Buffer (i.e. sfence/mfence) is not sufficient to order the later
+ * read after the write as the GTT write suffers a small delay through the GTT
+ * indirection. The kernel uses an uncached mmio read to ensure the GTT write
+ * is ordered with reads (either by the GPU, WB or WC) and unconditionally
+ * flushes prior to execbuf submission. However, if we are not informing the
+ * kernel about our GTT writes, it will not flush before earlier access, such
+ * as when using the cmdparser. Similarly, we need to be careful if we should
+ * ever issue a CPU read immediately following a GTT write.
+ *
+ * Telling the kernel about write access also has one more important
+ * side-effect. Upon receiving notification about the write, it cancels any
+ * scanout buffering for FBC/PSR and friends. Later FBC/PSR is then flushed by
+ * either SW_FINISH or DIRTYFB. The presumption is that we never write to the
+ * actual scanout via a mmaping, only to a backbuffer and so all the FBC/PSR
+ * tracking is handled on the buffer exchange instead.
+ *
+ * \param dbg    debug callback used for stall reporting; may be NULL.
+ * \param bo     buffer object to map.
+ * \param flags  MAP_* bits; only MAP_ASYNC changes behavior here (it skips
+ *               the wait for the GPU), the rest are just logged.
+ *
+ * \return bo->map_gtt, or NULL if the offset query or the mmap() fails.
+ */
+static void *
+crocus_bo_map_gtt(struct pipe_debug_callback *dbg,
+                  struct crocus_bo *bo, unsigned flags)
+{
+   struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+   /* If we don't support get/set_tiling, there's no support for GTT mapping
+    * either (it won't do any de-tiling for us).
+    */
+   assert(bufmgr->has_tiling_uapi);
+
+   /* Get a mapping of the buffer if we haven't before. */
+   if (bo->map_gtt == NULL) {
+      DBG("bo_map_gtt: mmap %d (%s)\n", bo->gem_handle, bo->name);
+
+      struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = bo->gem_handle };
+
+      /* Get the fake offset back... */
+      int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg);
+      if (ret != 0) {
+         DBG("%s:%d: Error preparing buffer map %d (%s): %s .\n",
+             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+         return NULL;
+      }
+
+      /* and mmap it. */
+      void *map = mmap(0, bo->size, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, bufmgr->fd, mmap_arg.offset);
+      if (map == MAP_FAILED) {
+         DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
+             __FILE__, __LINE__, bo->gem_handle, bo->name, strerror(errno));
+         return NULL;
+      }
+
+      /* We don't need to use VALGRIND_MALLOCLIKE_BLOCK because Valgrind will
+       * already intercept this mmap call. However, for consistency between
+       * all the mmap paths, we mark the pointer as defined now and mark it
+       * as inaccessible afterwards.
+       */
+      VG_DEFINED(map, bo->size);
+
+      /* Publish the mapping; if map_gtt turns out to be set already
+       * (presumably by a racing mapper), discard the one we just made.
+       */
+      if (p_atomic_cmpxchg(&bo->map_gtt, NULL, map)) {
+         VG_NOACCESS(map, bo->size);
+         munmap(map, bo->size);
+      }
+   }
+   assert(bo->map_gtt);
+
+   DBG("bo_map_gtt: %d (%s) -> %p, ", bo->gem_handle, bo->name, bo->map_gtt);
+   print_flags(flags);
+
+   if (!(flags & MAP_ASYNC)) {
+      bo_wait_with_stall_warning(dbg, bo, "GTT mapping");
+   }
+
+   return bo->map_gtt;
+}
+
+/**
+ * Decides whether a mapping request with the given @flags may be served by
+ * a CPU (cached) mapping of @bo rather than a WC one.
+ *
+ * \return true when a CPU mapping is acceptable.
+ */
+static bool
+can_map_cpu(struct crocus_bo *bo, unsigned flags)
+{
+   const unsigned needs_wc =
+      MAP_PERSISTENT | MAP_COHERENT | MAP_ASYNC | MAP_RAW;
+
+   /* Cache-coherent buffers can always go through the CPU mapping. */
+   if (bo->cache_coherent)
+      return true;
+
+   /* From here on the buffer is not coherent, so writes through a CPU
+    * mapping are never allowed: they could stick in the CPU cache instead
+    * of landing in main memory.
+    */
+   if (flags & MAP_WRITE)
+      return false;
+
+   /* Even if the buffer itself is not cache-coherent (such as a scanout), on
+    * an LLC platform reads always are coherent (as they are performed via the
+    * central system agent).
+    */
+   if (bo->bufmgr->has_llc)
+      return true;
+
+   /* If PERSISTENT or COHERENT are set, the mmapping needs to remain valid
+    * across batch flushes where the kernel will change cache domains of the
+    * bo, invalidating continued access to the CPU mmap on non-LLC device.
+    *
+    * Similarly, ASYNC typically means that the buffer will be accessed via
+    * both the CPU and the GPU simultaneously.  Batches may be executed that
+    * use the BO even while it is mapped.  While OpenGL technically disallows
+    * most drawing while non-persistent mappings are active, we may still use
+    * the GPU for blits or other operations, causing batches to happen at
+    * inconvenient times.
+    *
+    * If RAW is set, we expect the caller to be able to handle a WC buffer
+    * more efficiently than the involuntary clflushes.
+    */
+   return !(flags & needs_wc);
+}
+
+/**
+ * Maps @bo for CPU access, picking the mapping type from the bo's tiling
+ * and the requested @flags.
+ *
+ * Tiled BOs are mapped through the GTT unless MAP_RAW is given.  Otherwise
+ * a CPU or WC mapping is attempted, and if that fails a GTT mapping is
+ * tried instead -- again unless MAP_RAW is set.
+ *
+ * \return the mapping, or NULL if no mapping could be established.
+ */
+void *
+crocus_bo_map(struct pipe_debug_callback *dbg,
+              struct crocus_bo *bo, unsigned flags)
+{
+   const bool raw = flags & MAP_RAW;
+
+   if (!raw && bo->tiling_mode != I915_TILING_NONE)
+      return crocus_bo_map_gtt(dbg, bo, flags);
+
+   void *map = can_map_cpu(bo, flags) ? crocus_bo_map_cpu(dbg, bo, flags)
+                                      : crocus_bo_map_wc(dbg, bo, flags);
+
+   /* We skip MAP_RAW because we want to avoid map_gtt's fence detiling. */
+   if (map || raw)
+      return map;
+
+   /* Allow the attempt to fail by falling back to the GTT where necessary.
+    *
+    * Not every buffer can be mmaped directly using the CPU (or WC), for
+    * example buffers that wrap stolen memory or are imported from other
+    * devices. For those, we have little choice but to use a GTT mmapping.
+    * However, if we use a slow GTT mmapping for reads where we expected fast
+    * access, that order of magnitude difference in throughput will be clearly
+    * expressed by angry users.
+    */
+   perf_debug(dbg, "Fallback GTT mapping for %s with access flags %x\n",
+              bo->name, flags);
+   return crocus_bo_map_gtt(dbg, bo, flags);
+}
+
+/**
+ * Waits for all GPU rendering with the object to have completed.
+ *
+ * Implemented as crocus_bo_wait() with a negative timeout; the result of
+ * that call is not reported to the caller.
+ */
+void
+crocus_bo_wait_rendering(struct crocus_bo *bo)
+{
+   /* We require a kernel recent enough for WAIT_IOCTL support.
+    * See intel_init_bufmgr()
+    */
+   const int64_t wait_forever = -1;
+
+   crocus_bo_wait(bo, wait_forever);
+}
+
+/**
+ * Waits on a BO for the given amount of time.
+ *
+ * @bo: buffer object to wait for
+ * @timeout_ns: amount of time to wait in nanoseconds.
+ *   If value is less than 0, an infinite wait will occur.
+ *
+ * Returns 0 if the wait was successful, i.e. the last batch referencing the
+ * object has completed within the allotted time.  Otherwise a negative errno
+ * value describes the error; of particular interest is -ETIME, reported when
+ * the wait did not yield the desired result in time.
+ *
+ * A bo that is already marked idle and is not external returns 0 without
+ * asking the kernel.  A successful wait marks the bo idle.
+ *
+ * Compared to crocus_bo_wait_rendering, the timeout lets the operation give
+ * up after a certain amount of time.  Another subtle difference is the
+ * internal locking semantics (this variant does not hold the lock for the
+ * duration of the wait), which makes the wait subject to a larger userspace
+ * race window.
+ *
+ * The implementation shall wait until the object is no longer actively
+ * referenced within a batch buffer at the time of the call.  The wait does
+ * not protect against the buffer being re-issued via another thread or an
+ * flinked handle; userspace must prevent that race if such precision is
+ * important.
+ *
+ * Note that some kernels have broken the infinite-wait-for-negative-values
+ * promise; upgrade to the latest stable kernel if this is the case.
+ */
+int
+crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns)
+{
+   /* If we know it's idle, don't bother with the kernel round trip */
+   if (!bo->external && bo->idle)
+      return 0;
+
+   struct drm_i915_gem_wait wait_arg = {
+      .bo_handle = bo->gem_handle,
+      .timeout_ns = timeout_ns,
+   };
+
+   if (intel_ioctl(bo->bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait_arg) != 0)
+      return -errno;
+
+   bo->idle = true;
+   return 0;
+}
+
+/**
+ * Tears down a buffer manager: destroys its lock, frees every cached bo,
+ * closes every zombie bo, destroys both lookup tables, closes the DRM fd
+ * and finally frees the bufmgr itself.
+ */
+static void
+crocus_bufmgr_destroy(struct crocus_bufmgr *bufmgr)
+{
+   mtx_destroy(&bufmgr->lock);
+
+   /* Free any cached buffer objects we were going to reuse */
+   for (int b = 0; b < bufmgr->num_buckets; b++) {
+      struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[b];
+
+      list_for_each_entry_safe(struct crocus_bo, cached, &bucket->head, head) {
+         list_del(&cached->head);
+         bo_free(cached);
+      }
+   }
+
+   /* Close any buffer objects on the dead list. */
+   list_for_each_entry_safe(struct crocus_bo, zombie, &bufmgr->zombie_list, head) {
+      list_del(&zombie->head);
+      bo_close(zombie);
+   }
+
+   /* The tables are destroyed without a per-entry callback. */
+   _mesa_hash_table_destroy(bufmgr->name_table, NULL);
+   _mesa_hash_table_destroy(bufmgr->handle_table, NULL);
+
+   close(bufmgr->fd);
+   free(bufmgr);
+}
+
+/**
+ * Asks the kernel to set the tiling mode and stride of @bo and records the
+ * values the kernel reports back.
+ *
+ * \param bo           buffer object to change.
+ * \param tiling_mode  requested tiling mode.
+ * \param stride       requested stride.
+ *
+ * \return 0 on success (including the no-op case below), or -errno if the
+ *         ioctl fails.
+ */
+static int
+bo_set_tiling_internal(struct crocus_bo *bo, uint32_t tiling_mode,
+                       uint32_t stride)
+{
+   struct crocus_bufmgr *bufmgr = bo->bufmgr;
+   struct drm_i915_gem_set_tiling set_tiling;
+   int ret;
+
+   /* Nothing to do when the bo has no global name and already has the
+    * requested tiling and stride.  BOs with a global name always go to the
+    * kernel.
+    */
+   if (bo->global_name == 0 &&
+       tiling_mode == bo->tiling_mode && stride == bo->stride)
+      return 0;
+
+   memset(&set_tiling, 0, sizeof(set_tiling));
+   do {
+      /* set_tiling is slightly broken and overwrites the
+       * input on the error path, so we have to open code
+       * drm_ioctl.
+       */
+      set_tiling.handle = bo->gem_handle;
+      set_tiling.tiling_mode = tiling_mode;
+      set_tiling.stride = stride;
+
+      ret = ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
+   } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
+   if (ret == -1)
+      return -errno;
+
+   /* Store what the kernel wrote back into the argument struct rather than
+    * what was requested.
+    */
+   bo->tiling_mode = set_tiling.tiling_mode;
+   bo->swizzle_mode = set_tiling.swizzle_mode;
+   bo->stride = set_tiling.stride;
+   return 0;
+}
+
+/**
+ * Reports the tiling and swizzle modes recorded in @bo.
+ *
+ * Only reads the cached fields; no kernel query is made.
+ *
+ * \param bo            buffer object to inspect.
+ * \param tiling_mode   receives bo->tiling_mode.
+ * \param swizzle_mode  receives bo->swizzle_mode.
+ *
+ * \return always 0.
+ */
+int
+crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+                     uint32_t *swizzle_mode)
+{
+   const uint32_t tiling = bo->tiling_mode;
+   const uint32_t swizzle = bo->swizzle_mode;
+
+   *tiling_mode = tiling;
+   *swizzle_mode = swizzle;
+
+   return 0;
+}
+
+/**
+ * Imports a dma-buf file descriptor as a buffer object.
+ *
+ * The fd is converted to a GEM handle on the bufmgr's DRM fd.  If that
+ * handle is already present in the handle table the existing bo is
+ * returned (looked up through find_and_ref_external_bo(), which presumably
+ * takes a reference); otherwise a new external, non-reusable bo named
+ * "prime" is created and registered.
+ *
+ * \param bufmgr    buffer manager to import into.
+ * \param prime_fd  dma-buf file descriptor; not closed by this function.
+ * \param tiling    tiling the caller expects.  When it differs from what
+ *                  the kernel reports and is not above I915_TILING_LAST,
+ *                  it is applied with bo_set_tiling_internal().
+ * \param stride    stride used if the tiling has to be set.
+ *
+ * \return the bo, or NULL if the handle conversion, the allocation, the
+ *         tiling query or the tiling update fails.
+ *
+ * bufmgr->lock is held for the whole call.
+ */
+struct crocus_bo *
+crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr, int prime_fd,
+                        uint32_t tiling, uint32_t stride)
+{
+   uint32_t handle;
+   struct crocus_bo *bo;
+
+   mtx_lock(&bufmgr->lock);
+   int ret = drmPrimeFDToHandle(bufmgr->fd, prime_fd, &handle);
+   if (ret) {
+      DBG("import_dmabuf: failed to obtain handle from fd: %s\n",
+          strerror(errno));
+      mtx_unlock(&bufmgr->lock);
+      return NULL;
+   }
+
+   /*
+    * See if the kernel has already returned this buffer to us. Just as
+    * for named buffers, we must not create two bo's pointing at the same
+    * kernel object
+    */
+   bo = find_and_ref_external_bo(bufmgr->handle_table, handle);
+   if (bo)
+      goto out;
+
+   bo = bo_calloc();
+   if (!bo)
+      goto out;
+
+   p_atomic_set(&bo->refcount, 1);
+
+   /* Determine size of bo.  The fd-to-handle ioctl really should
+    * return the size, but it doesn't.  If we have kernel 3.12 or
+    * later, we can lseek on the prime fd to get the size.  Older
+    * kernels will just fail, in which case bo->size keeps whatever
+    * bo_calloc() left in it.
+    *
+    * lseek() returns off_t; keep the result in that type so sizes of
+    * 2 GiB and above are not truncated on their way into bo->size.
+    */
+   off_t prime_size = lseek(prime_fd, 0, SEEK_END);
+   if (prime_size != (off_t)-1)
+      bo->size = prime_size;
+
+   bo->bufmgr = bufmgr;
+   bo->name = "prime";
+   bo->reusable = false;
+   bo->external = true;
+   bo->kflags = 0;
+   bo->gem_handle = handle;
+   _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
+
+   struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
+      goto err;
+
+   if (get_tiling.tiling_mode == tiling || tiling > I915_TILING_LAST) {
+      /* Kernel and caller agree (or the caller passed an out-of-range
+       * value): just adopt what the kernel reports.
+       */
+      bo->tiling_mode = get_tiling.tiling_mode;
+      bo->swizzle_mode = get_tiling.swizzle_mode;
+      /* XXX stride is unknown */
+   } else {
+      if (bo_set_tiling_internal(bo, tiling, stride)) {
+         goto err;
+      }
+   }
+
+out:
+   mtx_unlock(&bufmgr->lock);
+   return bo;
+
+err:
+   bo_free(bo);
+   mtx_unlock(&bufmgr->lock);
+   return NULL;
+}
+
+/**
+ * Marks @bo as external: registers it in the bufmgr handle table and makes
+ * it ineligible for reuse.  Already-external BOs are left untouched.
+ *
+ * As the name says, this variant does no locking of its own; callers in
+ * this file hold bufmgr->lock around it.
+ */
+static void
+crocus_bo_make_external_locked(struct crocus_bo *bo)
+{
+   if (bo->external)
+      return;
+
+   _mesa_hash_table_insert(bo->bufmgr->handle_table, &bo->gem_handle, bo);
+   bo->external = true;
+   bo->reusable = false;
+}
+
+/**
+ * Marks @bo as external, taking bufmgr->lock around the update.
+ *
+ * A bo that is already external is only sanity-checked (it must not be
+ * reusable) and the lock is not taken.
+ */
+static void
+crocus_bo_make_external(struct crocus_bo *bo)
+{
+   struct crocus_bufmgr *mgr = bo->bufmgr;
+
+   if (!bo->external) {
+      mtx_lock(&mgr->lock);
+      crocus_bo_make_external_locked(bo);
+      mtx_unlock(&mgr->lock);
+      return;
+   }
+
+   assert(!bo->reusable);
+}
+
+/**
+ * Exports @bo as a dma-buf file descriptor.
+ *
+ * The bo is marked external first, whether or not the export succeeds.
+ *
+ * \param bo        buffer object to export.
+ * \param prime_fd  receives the new file descriptor (requested with
+ *                  DRM_CLOEXEC).
+ *
+ * \return 0 on success, -errno if drmPrimeHandleToFD() fails.
+ */
+int
+crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd)
+{
+   struct crocus_bufmgr *mgr = bo->bufmgr;
+
+   crocus_bo_make_external(bo);
+
+   const int ret = drmPrimeHandleToFD(mgr->fd, bo->gem_handle,
+                                      DRM_CLOEXEC, prime_fd);
+
+   return ret != 0 ? -errno : 0;
+}
+
+/**
+ * Marks @bo as external and returns its GEM handle.
+ */
+uint32_t
+crocus_bo_export_gem_handle(struct crocus_bo *bo)
+{
+   crocus_bo_make_external(bo);
+
+   const uint32_t handle = bo->gem_handle;
+   return handle;
+}
+
+/**
+ * Returns the global (flink) name of @bo, creating one on first use.
+ *
+ * \param bo    buffer object to name.
+ * \param name  receives bo->global_name.
+ *
+ * \return 0 on success, -errno if DRM_IOCTL_GEM_FLINK fails (in which case
+ *         *name is not written).
+ */
+int
+crocus_bo_flink(struct crocus_bo *bo, uint32_t *name)
+{
+   struct crocus_bufmgr *bufmgr = bo->bufmgr;
+
+   if (!bo->global_name) {
+      struct drm_gem_flink flink = { .handle = bo->gem_handle };
+
+      /* The ioctl is issued before taking the lock. */
+      if (intel_ioctl(bufmgr->fd, DRM_IOCTL_GEM_FLINK, &flink))
+         return -errno;
+
+      mtx_lock(&bufmgr->lock);
+      /* Re-check under the lock so the bo is marked external and inserted
+       * into name_table only once, even if global_name was set in the
+       * meantime.
+       */
+      if (!bo->global_name) {
+         crocus_bo_make_external_locked(bo);
+         bo->global_name = flink.name;
+         _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
+      }
+      mtx_unlock(&bufmgr->lock);
+   }
+
+   *name = bo->global_name;
+   return 0;
+}
+
+/**
+ * Produces a GEM handle for @bo that is valid on the DRM fd @drm_fd.
+ *
+ * When @drm_fd refers to the same file description as the bufmgr's own fd,
+ * the bo's own handle is returned.  Otherwise the bo is exported as a
+ * dma-buf, imported on @drm_fd, and the resulting handle is remembered in
+ * bo->exports (one entry per drm_fd).
+ *
+ * \param bo          buffer object to export.
+ * \param drm_fd      DRM fd the handle must be valid for.
+ * \param out_handle  receives the handle on success.
+ *
+ * \return 0 on success or a negative errno value: -ENOMEM if the export
+ *         record cannot be allocated, otherwise the error from the dma-buf
+ *         export or import.
+ */
+int
+crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+                                       uint32_t *out_handle)
+{
+   /* Only add the new GEM handle to the list of export if it belongs to a
+    * different GEM device. Otherwise we might close the same buffer multiple
+    * times.
+    */
+   struct crocus_bufmgr *bufmgr = bo->bufmgr;
+   int ret = os_same_file_description(drm_fd, bufmgr->fd);
+   WARN_ONCE(ret < 0,
+             "Kernel has no file descriptor comparison support: %s\n",
+             strerror(errno));
+   if (ret == 0) {
+      *out_handle = crocus_bo_export_gem_handle(bo);
+      return 0;
+   }
+
+   struct bo_export *export = calloc(1, sizeof(*export));
+   if (!export)
+      return -ENOMEM;
+
+   export->drm_fd = drm_fd;
+
+   int dmabuf_fd = -1;
+   int err = crocus_bo_export_dmabuf(bo, &dmabuf_fd);
+   if (err) {
+      free(export);
+      return err;
+   }
+
+   mtx_lock(&bufmgr->lock);
+   err = drmPrimeFDToHandle(drm_fd, dmabuf_fd, &export->gem_handle);
+   /* drmPrimeFDToHandle() reports failure through errno; grab it before
+    * close() gets a chance to overwrite it.
+    */
+   const int import_errno = errno;
+   close(dmabuf_fd);
+   if (err) {
+      mtx_unlock(&bufmgr->lock);
+      free(export);
+      /* Return a negative errno like every other failure path here, but
+       * never 0 for a failed import.
+       */
+      return import_errno ? -import_errno : err;
+   }
+
+   bool found = false;
+   list_for_each_entry(struct bo_export, iter, &bo->exports, link) {
+      if (iter->drm_fd != drm_fd)
+         continue;
+      /* Here we assume that for a given DRM fd, we'll always get back the
+       * same GEM handle for a given buffer.
+       */
+      assert(iter->gem_handle == export->gem_handle);
+      free(export);
+      export = iter;
+      found = true;
+      break;
+   }
+   if (!found)
+      list_addtail(&export->link, &bo->exports);
+
+   mtx_unlock(&bufmgr->lock);
+
+   *out_handle = export->gem_handle;
+
+   return 0;
+}
+
+/**
+ * Appends a reuse-cache bucket for buffers of @size bytes to the bufmgr.
+ *
+ * The trailing asserts check that bucket_for_size() maps @size and
+ * @size - 2048 to the new bucket, and @size + 1 to some other result.
+ */
+static void
+add_bucket(struct crocus_bufmgr *bufmgr, int size)
+{
+   const unsigned int slot = bufmgr->num_buckets;
+
+   assert(slot < ARRAY_SIZE(bufmgr->cache_bucket));
+
+   struct bo_cache_bucket *bucket = &bufmgr->cache_bucket[slot];
+
+   list_inithead(&bucket->head);
+   bucket->size = size;
+   bufmgr->num_buckets++;
+
+   assert(bucket_for_size(bufmgr, size) == bucket);
+   assert(bucket_for_size(bufmgr, size - 2048) == bucket);
+   assert(bucket_for_size(bufmgr, size + 1) != bucket);
+}
+
+/**
+ * Creates the bufmgr's reuse-cache buckets: one, two and three pages, then
+ * every power of two from four pages up to 64MB together with the three
+ * quarter steps above each power of two.
+ */
+static void
+init_cache_buckets(struct crocus_bufmgr *bufmgr)
+{
+   const uint64_t cache_max_size = 64 * 1024 * 1024;
+
+   /* OK, so power of two buckets was too wasteful of memory.
+    * Give 3 other sizes between each power of two, to hopefully
+    * cover things accurately enough.  (The alternative is
+    * probably to just go for exact matching of sizes, and assume
+    * that for things like composited window resize the tiled
+    * width/height alignment and rounding of sizes to pages will
+    * get us useful cache hit rates anyway)
+    */
+   for (int pages = 1; pages <= 3; pages++)
+      add_bucket(bufmgr, PAGE_SIZE * pages);
+
+   /* Initialize the linked lists for BO reuse cache. */
+   for (uint64_t size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) {
+      add_bucket(bufmgr, size);
+
+      for (int quarter = 1; quarter <= 3; quarter++)
+         add_bucket(bufmgr, size + size * quarter / 4);
+   }
+}
+
+/**
+ * Creates a kernel hardware context and marks it non-recoverable.
+ *
+ * \return the new context id, or 0 if context creation fails.  A failure
+ *         to set the RECOVERABLE parameter is ignored.
+ */
+uint32_t
+crocus_create_hw_context(struct crocus_bufmgr *bufmgr)
+{
+   struct drm_i915_gem_context_create create = { 0 };
+
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &create) != 0) {
+      DBG("DRM_IOCTL_I915_GEM_CONTEXT_CREATE failed: %s\n", strerror(errno));
+      return 0;
+   }
+
+   /* Upon declaring a GPU hang, the kernel will zap the guilty context
+    * back to the default logical HW state and attempt to continue on to
+    * our next submitted batchbuffer.  However, our render batches assume
+    * the previous GPU state is preserved, and only emit commands needed
+    * to incrementally change that state.  In particular, we inherit the
+    * STATE_BASE_ADDRESS and PIPELINE_SELECT settings, which are critical.
+    * With default base addresses, our next batches will almost certainly
+    * cause more GPU hangs, leading to repeated hangs until we're banned
+    * or the machine is dead.
+    *
+    * Here we tell the kernel not to attempt to recover our context but
+    * immediately (on the next batchbuffer submission) report that the
+    * context is lost, and we will do the recovery ourselves.  Ideally,
+    * we'll have two lost batches instead of a continual stream of hangs.
+    */
+   struct drm_i915_gem_context_param no_recovery = {
+      .ctx_id = create.ctx_id,
+      .param = I915_CONTEXT_PARAM_RECOVERABLE,
+      .value = false,
+   };
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &no_recovery);
+
+   return create.ctx_id;
+}
+
+/**
+ * Queries the scheduling priority of hardware context @ctx_id.
+ *
+ * The ioctl result is not checked: if the query fails, the zero the
+ * argument struct was initialized with is returned, i.e. default priority.
+ */
+static int
+crocus_hw_context_get_priority(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   struct drm_i915_gem_context_param query = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+   };
+
+   drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &query);
+
+   return query.value;
+}
+
+/**
+ * Sets the scheduling priority of hardware context @ctx_id.
+ *
+ * \return 0 on success, -errno if the SETPARAM ioctl fails.
+ */
+int
+crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+                               uint32_t ctx_id,
+                               int priority)
+{
+   struct drm_i915_gem_context_param request = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_PRIORITY,
+      .value = priority,
+   };
+
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &request))
+      return -errno;
+
+   return 0;
+}
+
+/**
+ * Creates a new hardware context and copies the priority of @ctx_id onto
+ * it.
+ *
+ * \return the new context id, or 0 if creation fails.  A failure to apply
+ *         the priority is not reported.
+ */
+uint32_t
+crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   const uint32_t clone = crocus_create_hw_context(bufmgr);
+   if (!clone)
+      return 0;
+
+   const int priority = crocus_hw_context_get_priority(bufmgr, ctx_id);
+   crocus_hw_context_set_priority(bufmgr, clone, priority);
+
+   return clone;
+}
+
+/**
+ * Destroys hardware context @ctx_id.  An id of 0 is ignored; a failing
+ * ioctl is reported on stderr.
+ */
+void
+crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   if (ctx_id == 0)
+      return;
+
+   struct drm_i915_gem_context_destroy destroy = { .ctx_id = ctx_id };
+
+   if (intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &destroy) != 0) {
+      fprintf(stderr, "DRM_IOCTL_I915_GEM_CONTEXT_DESTROY failed: %s\n",
+              strerror(errno));
+   }
+}
+
+int
+crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset, uint64_t *result)
+{
+   struct drm_i915_reg_read reg_read = { .offset = offset };
+   int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_REG_READ, &reg_read);
+
+   *result = reg_read.val; /* stored even if the ioctl failed; check ret */
+   return ret;
+}
+
+static int
+gem_param(int fd, int name)
+{
+   int v = -1; /* No param uses (yet) the sign bit, reserve it for errors */
+
+   struct drm_i915_getparam gp = { .param = name, .value = &v };
+   if (intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
+      return -1; /* GETPARAM ioctl failed */
+
+   return v; /* value written through gp.value (still -1 if untouched) */
+}
+
+/**
+ * Initializes the GEM buffer manager, which uses the kernel to allocate, map,
+ * and manage buffer objects.  Returns NULL on allocation, lock or fd failure.
+ *
+ * \param fd File descriptor of the opened DRM device.
+ */
+static struct crocus_bufmgr *
+crocus_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+   struct crocus_bufmgr *bufmgr = calloc(1, sizeof(*bufmgr));
+   if (bufmgr == NULL)
+      return NULL;
+
+   if (mtx_init(&bufmgr->lock, mtx_plain) != 0) {
+      free(bufmgr);
+      return NULL;
+   }
+
+   /* Handles to buffer objects belong to the device fd and are not
+    * reference counted by the kernel, so parties sharing one fd (threads
+    * sharing a screen bufmgr, or worse, several libraries) also share
+    * ownership of those handles.  Don't do this!  Ensure each
+    * library/bufmgr has its own device fd to keep namespaces from clashing.
+    */
+   bufmgr->fd = os_dupfd_cloexec(fd);
+   if (bufmgr->fd < 0) {
+      mtx_destroy(&bufmgr->lock);
+      free(bufmgr);
+      return NULL;
+   }
+
+   p_atomic_set(&bufmgr->refcount, 1);
+   list_inithead(&bufmgr->zombie_list);
+
+   bufmgr->has_llc = devinfo->has_llc;
+   bufmgr->has_tiling_uapi = devinfo->has_tiling_uapi;
+   bufmgr->bo_reuse = bo_reuse;
+   bufmgr->has_mmap_offset = gem_param(fd, I915_PARAM_MMAP_GTT_VERSION) >= 4;
+   init_cache_buckets(bufmgr);
+
+   bufmgr->name_table =
+      _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+   bufmgr->handle_table =
+      _mesa_hash_table_create(NULL, key_hash_uint, key_uint_equal);
+
+   return bufmgr;
+}
+
+static struct crocus_bufmgr *
+crocus_bufmgr_ref(struct crocus_bufmgr *bufmgr)
+{
+   p_atomic_inc(&bufmgr->refcount); /* dropped by crocus_bufmgr_unref() */
+   return bufmgr; /* same pointer, handed back for convenience */
+}
+
+void
+crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr)
+{
+   mtx_lock(&global_bufmgr_list_mutex); /* also held by get_for_fd lookups */
+   if (p_atomic_dec_zero(&bufmgr->refcount)) {
+      list_del(&bufmgr->link); /* last reference: leave the global list */
+      crocus_bufmgr_destroy(bufmgr);
+   }
+   mtx_unlock(&global_bufmgr_list_mutex);
+}
+
+/**
+ * Returns the bufmgr already registered for fd's device (with a new
+ * reference), or creates and registers one.  NULL on fstat/create failure.
+ * \param fd File descriptor of the opened DRM device.
+ */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd, bool bo_reuse)
+{
+   struct stat st;
+
+   if (fstat(fd, &st))
+      return NULL;
+
+   struct crocus_bufmgr *bufmgr = NULL;
+
+   mtx_lock(&global_bufmgr_list_mutex);
+   list_for_each_entry(struct crocus_bufmgr, iter_bufmgr, &global_bufmgr_list, link) {
+      struct stat iter_st;
+      if (fstat(iter_bufmgr->fd, &iter_st))
+         continue; /* cannot stat this entry, skip it */
+
+      if (st.st_rdev == iter_st.st_rdev) { /* same device node */
+         assert(iter_bufmgr->bo_reuse == bo_reuse);
+         bufmgr = crocus_bufmgr_ref(iter_bufmgr);
+         goto unlock;
+      }
+   }
+
+   bufmgr = crocus_bufmgr_create(devinfo, fd, bo_reuse);
+   if (bufmgr)
+      list_addtail(&bufmgr->link, &global_bufmgr_list);
+
+ unlock:
+   mtx_unlock(&global_bufmgr_list_mutex);
+
+   return bufmgr;
+}
+
+int
+crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr)
+{
+   return bufmgr->fd; /* the duplicate made in crocus_bufmgr_create() */
+}
diff --git a/src/gallium/drivers/crocus/crocus_bufmgr.h b/src/gallium/drivers/crocus/crocus_bufmgr.h
new file mode 100644
index 00000000000..8bb328fdeae
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_bufmgr.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_BUFMGR_H
+#define CROCUS_BUFMGR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include "util/macros.h"
+#include "util/u_atomic.h"
+#include "util/list.h"
+#include "pipe/p_defines.h"
+
+struct crocus_batch;
+struct intel_device_info;
+struct pipe_debug_callback;
+
+#define CROCUS_BINDER_SIZE (64 * 1024)
+#define CROCUS_MAX_BINDERS 100
+
+struct crocus_bo {
+   /**
+    * Size in bytes of the buffer object.
+    *
+    * The size may be larger than the size originally requested for the
+    * allocation, such as being aligned to page size.
+    */
+   uint64_t size;
+
+   /** Buffer manager context associated with this buffer object */
+   struct crocus_bufmgr *bufmgr;
+
+   /** The GEM handle for this buffer object. */
+   uint32_t gem_handle;
+
+   /**
+    * Virtual address of the buffer inside the PPGTT (Per-Process Graphics
+    * Translation Table).
+    *
+    * Although each hardware context has its own VMA, we assign BO's to the
+    * same address in all contexts, for simplicity.
+    */
+   uint64_t gtt_offset;
+
+   /**
+    * The validation list index for this buffer, or -1 when not in a batch.
+    * Note that a single buffer may be in multiple batches (contexts), and
+    * this is a global field, which refers to the last batch using the BO.
+    * It should not be considered authoritative, but can be used to avoid a
+    * linear walk of the validation list in the common case by guessing that
+    * exec_bos[bo->index] == bo and confirming whether that's the case.
+    *
+    * XXX: this is not ideal now that we have more than one batch per context,
+    * XXX: as the index will flop back and forth between the render index and
+    * XXX: compute index...
+    */
+   unsigned index;
+
+   /**
+    * Boolean of whether the GPU is definitely not accessing the buffer.
+    *
+    * This is only valid when reusable, since non-reusable
+    * buffers are those that have been shared with other
+    * processes, so we don't know their state.
+    */
+   bool idle;
+
+   int refcount; /* incremented atomically by crocus_bo_reference() */
+   const char *name; /* NOTE(review): presumably a debug label - confirm */
+
+   uint64_t kflags; /* NOTE(review): presumably kernel exec flags - confirm */
+
+   /**
+    * Kernel-assigned global name for this object
+    *
+    * List contains both flink named and prime fd'd objects
+    */
+   unsigned global_name;
+
+   /**
+    * Current tiling mode
+    */
+   uint32_t tiling_mode;
+   uint32_t swizzle_mode;
+   uint32_t stride;
+
+   time_t free_time; /* NOTE(review): presumably BO-cache bookkeeping */
+
+   /** Mapped address for the buffer, saved across map/unmap cycles */
+   void *map_cpu;
+   /** GTT virtual address for the buffer, saved across map/unmap cycles */
+   void *map_gtt;
+   /** WC CPU address for the buffer, saved across map/unmap cycles */
+   void *map_wc;
+
+   /** BO cache list */
+   struct list_head head;
+
+   /** List of GEM handle exports of this buffer (bo_export) */
+   struct list_head exports;
+
+   /**
+    * Boolean of whether this buffer can be re-used
+    */
+   bool reusable;
+
+   /**
+    * Boolean of whether this buffer has been shared with an external client.
+    */
+   bool external;
+
+   /**
+    * Boolean of whether this buffer is cache coherent
+    */
+   bool cache_coherent;
+
+   /**
+    * Boolean of whether this buffer points into user memory
+    */
+   bool userptr;
+
+   /** Pre-computed hash using _mesa_hash_pointer for cache tracking sets */
+   uint32_t hash;
+};
+
+#define BO_ALLOC_ZEROED   (1 << 0)
+#define BO_ALLOC_COHERENT (1 << 1)
+
+/**
+ * Allocate a buffer object.
+ *
+ * Buffer objects are not necessarily initially mapped into CPU virtual
+ * address space or graphics device aperture.  They must be mapped
+ * using crocus_bo_map() to be used by the CPU.
+ */
+struct crocus_bo *crocus_bo_alloc(struct crocus_bufmgr *bufmgr,
+                                  const char *name, uint64_t size);
+
+/**
+ * Allocate a tiled buffer object.
+ *
+ * Alignment for tiled objects is set automatically; the 'flags'
+ * argument provides a hint about how the object will be used initially.
+ *
+ * Valid tiling formats are:
+ *  I915_TILING_NONE
+ *  I915_TILING_X
+ *  I915_TILING_Y
+ */
+struct crocus_bo *crocus_bo_alloc_tiled(struct crocus_bufmgr *bufmgr,
+                                        const char *name, uint64_t size,
+                                        uint32_t alignment,
+                                        uint32_t tiling_mode, uint32_t pitch,
+                                        unsigned flags);
+
+struct crocus_bo *crocus_bo_create_userptr(struct crocus_bufmgr *bufmgr,
+                                           const char *name, void *ptr,
+                                           size_t size);
+
+/** Takes a reference on a buffer object; drop with crocus_bo_unreference() */
+static inline void
+crocus_bo_reference(struct crocus_bo *bo)
+{
+   p_atomic_inc(&bo->refcount);
+}
+
+/**
+ * Releases a reference on a buffer object, freeing the data if
+ * no references remain.
+ */
+void crocus_bo_unreference(struct crocus_bo *bo);
+
+#define MAP_READ          PIPE_MAP_READ
+#define MAP_WRITE         PIPE_MAP_WRITE
+#define MAP_ASYNC         PIPE_MAP_UNSYNCHRONIZED
+#define MAP_PERSISTENT    PIPE_MAP_PERSISTENT
+#define MAP_COHERENT      PIPE_MAP_COHERENT
+/* internal */
+#define MAP_INTERNAL_MASK (0xff << 24)
+#define MAP_RAW           (0x01 << 24)
+
+#define MAP_FLAGS         (MAP_READ | MAP_WRITE | MAP_ASYNC | \
+                           MAP_PERSISTENT | MAP_COHERENT | MAP_INTERNAL_MASK)
+
+/**
+ * Maps the buffer into userspace.
+ *
+ * This function will block waiting for any existing execution on the
+ * buffer to complete, first.  The resulting mapping is returned.
+ */
+MUST_CHECK void *crocus_bo_map(struct pipe_debug_callback *dbg,
+                             struct crocus_bo *bo, unsigned flags);
+
+/**
+ * Currently a no-op that always returns 0; mappings are saved on the BO
+ * across map/unmap cycles (see the map_* fields of struct crocus_bo).
+ */
+static inline int crocus_bo_unmap(struct crocus_bo *bo) { return 0; }
+
+/**
+ * Waits for rendering to an object by the GPU to have completed.
+ *
+ * This is not required for any access to the BO by bo_map,
+ * bo_subdata, etc.  It is merely a way for the driver to implement
+ * glFinish.
+ */
+void crocus_bo_wait_rendering(struct crocus_bo *bo);
+
+/**
+ * Unref a buffer manager instance.
+ */
+void crocus_bufmgr_unref(struct crocus_bufmgr *bufmgr);
+
+/**
+ * Get the current tiling (and resulting swizzling) mode for the bo.
+ *
+ * \param bo Buffer to get tiling mode for
+ * \param tiling_mode returned tiling mode
+ * \param swizzle_mode returned swizzling mode
+ */
+int crocus_bo_get_tiling(struct crocus_bo *bo, uint32_t *tiling_mode,
+                         uint32_t *swizzle_mode);
+
+/**
+ * Create a visible name for a buffer which can be used by other apps
+ *
+ * \param bo Buffer to create a name for
+ * \param name Returned name
+ */
+int crocus_bo_flink(struct crocus_bo *bo, uint32_t *name);
+
+/**
+ * Reports bo->external: is this buffer shared with external clients?
+ */
+static inline bool
+crocus_bo_is_external(const struct crocus_bo *bo)
+{
+   return bo->external;
+}
+
+/**
+ * Returns 1 if mapping the buffer for write could cause the process
+ * to block, due to the object being active in the GPU.
+ */
+int crocus_bo_busy(struct crocus_bo *bo);
+
+/**
+ * Specify the volatility of the buffer.
+ * \param bo Buffer whose purgeable status is being set
+ * \param madv The purgeable status
+ *
+ * Use I915_MADV_DONTNEED to mark the buffer as purgeable, and it will be
+ * reclaimed under memory pressure. If you subsequently require the buffer,
+ * then you must pass I915_MADV_WILLNEED to mark the buffer as required.
+ *
+ * Returns 1 if the buffer was retained, or 0 if it was discarded whilst
+ * marked as I915_MADV_DONTNEED.
+ */
+int crocus_bo_madvise(struct crocus_bo *bo, int madv);
+
+/* crocus_bufmgr.c */
+struct crocus_bufmgr *
+crocus_bufmgr_get_for_fd(struct intel_device_info *devinfo, int fd,
+                         bool bo_reuse);
+int crocus_bufmgr_get_fd(struct crocus_bufmgr *bufmgr);
+
+struct crocus_bo *crocus_bo_gem_create_from_name(struct crocus_bufmgr *bufmgr,
+                                                 const char *name,
+                                                 unsigned handle);
+
+int crocus_bo_wait(struct crocus_bo *bo, int64_t timeout_ns);
+
+uint32_t crocus_create_hw_context(struct crocus_bufmgr *bufmgr);
+uint32_t crocus_clone_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
+#define CROCUS_CONTEXT_LOW_PRIORITY    ((I915_CONTEXT_MIN_USER_PRIORITY - 1) / 2)
+#define CROCUS_CONTEXT_MEDIUM_PRIORITY (I915_CONTEXT_DEFAULT_PRIORITY)
+#define CROCUS_CONTEXT_HIGH_PRIORITY   ((I915_CONTEXT_MAX_USER_PRIORITY + 1) / 2)
+
+int crocus_hw_context_set_priority(struct crocus_bufmgr *bufmgr,
+                                   uint32_t ctx_id, int priority);
+
+void crocus_destroy_hw_context(struct crocus_bufmgr *bufmgr, uint32_t ctx_id);
+
+int crocus_bo_export_dmabuf(struct crocus_bo *bo, int *prime_fd);
+struct crocus_bo *crocus_bo_import_dmabuf(struct crocus_bufmgr *bufmgr,
+                                          int prime_fd, uint32_t tiling,
+                                          uint32_t stride);
+
+/**
+ * Exports a bo as a GEM handle into a given DRM file descriptor
+ * \param bo Buffer to export
+ * \param drm_fd File descriptor where the new handle is created
+ * \param out_handle Pointer to store the new handle
+ *
+ * Returns 0 if the buffer was successfully exported, a non zero error code
+ * otherwise.
+ */
+int crocus_bo_export_gem_handle_for_device(struct crocus_bo *bo, int drm_fd,
+                                           uint32_t *out_handle);
+
+uint32_t crocus_bo_export_gem_handle(struct crocus_bo *bo);
+
+int crocus_reg_read(struct crocus_bufmgr *bufmgr, uint32_t offset,
+                    uint64_t *out);
+
+int drm_ioctl(int fd, unsigned long request, void *arg);
+
+#endif /* CROCUS_BUFMGR_H */
diff --git a/src/gallium/drivers/crocus/crocus_clear.c b/src/gallium/drivers/crocus/crocus_clear.c
new file mode 100644
index 00000000000..1c56e23f794
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_clear.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_surface.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/compiler/brw_compiler.h"
+#include "util/format_srgb.h"
+
+static bool
+crocus_is_color_fast_clear_compatible(struct crocus_context *ice,
+                                      enum isl_format format,
+                                      const union isl_color_value color)
+{
+   if (isl_format_has_int_channel(format)) {
+      perf_debug(&ice->dbg, "Integer fast clear not enabled for %s",
+                 isl_format_get_name(format));
+      return false;
+   }
+
+   for (int i = 0; i < 4; i++) {
+      if (!isl_format_has_color_component(format, i)) {
+         continue; /* channel absent from format: its value is irrelevant */
+      }
+
+      if (color.f32[i] != 0.0f && color.f32[i] != 1.0f) {
+         return false; /* only exact 0.0 or 1.0 is accepted per channel */
+      }
+   }
+
+   return true;
+}
+
+static bool
+can_fast_clear_color(struct crocus_context *ice,
+                     struct pipe_resource *p_res,
+                     unsigned level,
+                     const struct pipe_box *box,
+                     bool render_condition_enabled,
+                     enum isl_format format, /* the resource's format */
+                     enum isl_format render_format, /* format rendered with */
+                     union isl_color_value color)
+{
+   struct crocus_resource *res = (void *) p_res;
+
+   if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+      return false;
+
+   if (!isl_aux_usage_has_fast_clears(res->aux.usage))
+      return false;
+
+   /* Check for partial clear (box must span the level's full width/height) */
+   if (box->x > 0 || box->y > 0 ||
+       box->width < minify(p_res->width0, level) ||
+       box->height < minify(p_res->height0, level)) {
+      return false;
+   }
+
+   /* Avoid conditional fast clears to maintain correct tracking of the aux
+    * state (see iris_resource_finish_write for more info). Note that partial
+    * fast clears (if they existed) would not pose a problem with conditional
+    * rendering.
+    */
+   if (render_condition_enabled &&
+       ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      return false;
+   }
+
+   /* We store clear colors as floats or uints as needed.  If there are
+    * texture views in play, the formats will not properly be respected
+    * during resolves because the resolve operations only know about the
+    * resource and not the renderbuffer.
+    */
+   if (isl_format_srgb_to_linear(render_format) !=
+       isl_format_srgb_to_linear(format)) {
+      return false;
+   }
+
+   /* XXX: if (irb->mt->supports_fast_clear)
+    * see intel_miptree_create_for_dri_image()
+    */
+
+   if (!crocus_is_color_fast_clear_compatible(ice, format, color))
+      return false;
+
+   return true;
+}
+
+/* Conforms a clear color to what res's pipe format can actually represent. */
+static union isl_color_value
+convert_fast_clear_color(struct crocus_context *ice,
+                         struct crocus_resource *res,
+                         enum isl_format render_format,
+                         const union isl_color_value color)
+{
+   union isl_color_value override_color = color;
+   struct pipe_resource *p_res = (void *) res;
+   const enum pipe_format format = p_res->format;
+   const struct util_format_description *desc =
+      util_format_description(format);
+   unsigned colormask = util_format_colormask(desc);
+
+   if (util_format_is_intensity(format) ||
+       util_format_is_luminance(format) ||
+       util_format_is_luminance_alpha(format)) {
+      override_color.u32[1] = override_color.u32[0];
+      override_color.u32[2] = override_color.u32[0];
+      if (util_format_is_intensity(format))
+         override_color.u32[3] = override_color.u32[0];
+   } else {
+      for (int chan = 0; chan < 3; chan++) {
+         if (!(colormask & (1 << chan)))
+            override_color.u32[chan] = 0;
+      }
+   }
+
+   if (util_format_is_unorm(format)) {
+      for (int i = 0; i < 4; i++)
+         override_color.f32[i] = CLAMP(override_color.f32[i], 0.0f, 1.0f);
+   } else if (util_format_is_snorm(format)) {
+      for (int i = 0; i < 4; i++)
+         override_color.f32[i] = CLAMP(override_color.f32[i], -1.0f, 1.0f);
+   } else if (util_format_is_pure_uint(format)) {
+      for (int i = 0; i < 4; i++) {
+         unsigned bits = util_format_get_component_bits(
+            format, UTIL_FORMAT_COLORSPACE_RGB, i);
+         if (bits < 32) {
+            uint32_t max = (1u << bits) - 1;
+            override_color.u32[i] = MIN2(override_color.u32[i], max);
+         }
+      }
+   } else if (util_format_is_pure_sint(format)) {
+      for (int i = 0; i < 4; i++) {
+         unsigned bits = util_format_get_component_bits(
+            format, UTIL_FORMAT_COLORSPACE_RGB, i);
+         /* bits == 0 (absent channel) would make the shifts undefined */
+         if (bits > 0 && bits < 32) {
+            int32_t max = (1 << (bits - 1)) - 1;
+            int32_t min = -(1 << (bits - 1));
+            override_color.i32[i] = CLAMP(override_color.i32[i], min, max);
+         }
+      }
+   } else if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+              format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+      /* these packed float formats only store unsigned values */
+      for (int i = 0; i < 4; i++)
+         override_color.f32[i] = MAX2(override_color.f32[i], 0.0f);
+   }
+
+   if (!(colormask & 1 << 3)) {
+      if (util_format_is_pure_integer(format))
+         override_color.u32[3] = 1;
+      else
+         override_color.f32[3] = 1.0f;
+   }
+
+   /* Handle linear to SRGB conversion */
+   if (isl_format_is_srgb(render_format)) {
+      for (int i = 0; i < 3; i++) {
+         override_color.f32[i] =
+            util_format_linear_to_srgb_float(override_color.f32[i]);
+      }
+   }
+   return override_color;
+}
+
+static void
+fast_clear_color(struct crocus_context *ice,
+                 struct crocus_resource *res,
+                 unsigned level,
+                 const struct pipe_box *box,
+                 enum isl_format format, /* format the clear renders with */
+                 union isl_color_value color,
+                 enum blorp_batch_flags blorp_flags)
+{
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = batch->screen;
+   struct pipe_resource *p_res = (void *) res;
+
+   color = convert_fast_clear_color(ice, res, format, color);
+
+   bool color_changed = !!memcmp(&res->aux.clear_color, &color,
+                                 sizeof(color));
+
+   if (color_changed) {
+      /* If we are clearing to a new clear value, we need to resolve fast
+       * clears from other levels/layers first, since we can't have different
+       * levels/layers with different fast clear colors.
+       */
+      for (unsigned res_lvl = 0; res_lvl < res->surf.levels; res_lvl++) {
+         const unsigned level_layers =
+            crocus_get_num_logical_layers(res, res_lvl);
+         for (unsigned layer = 0; layer < level_layers; layer++) {
+            if (res_lvl == level &&
+                layer >= box->z &&
+                layer < box->z + box->depth) {
+               /* We're going to clear this layer anyway.  Leave it alone. */
+               continue;
+            }
+
+            enum isl_aux_state aux_state =
+               crocus_resource_get_aux_state(res, res_lvl, layer);
+
+            if (aux_state != ISL_AUX_STATE_CLEAR &&
+                aux_state != ISL_AUX_STATE_PARTIAL_CLEAR &&
+                aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+               /* This slice doesn't have any fast-cleared bits. */
+               continue;
+            }
+
+            /* If we got here, then the level may have fast-clear bits that use
+             * the old clear value.  We need to do a color resolve to get rid
+             * of their use of the clear color before we can change it.
+             * Fortunately, few applications ever change their clear color at
+             * different levels/layers, so this shouldn't happen often.
+             */
+            crocus_resource_prepare_access(ice, res,
+                                           res_lvl, 1, layer, 1,
+                                           res->aux.usage,
+                                           false);
+            perf_debug(&ice->dbg,
+                       "Resolving resource (%p) level %d, layer %d: color changing from "
+                       "(%0.2f, %0.2f, %0.2f, %0.2f) to "
+                       "(%0.2f, %0.2f, %0.2f, %0.2f)\n",
+                       res, res_lvl, layer,
+                       res->aux.clear_color.f32[0],
+                       res->aux.clear_color.f32[1],
+                       res->aux.clear_color.f32[2],
+                       res->aux.clear_color.f32[3],
+                       color.f32[0], color.f32[1], color.f32[2], color.f32[3]);
+         }
+      }
+   }
+
+   crocus_resource_set_clear_color(ice, res, color);
+
+   /* If the buffer is already in ISL_AUX_STATE_CLEAR, and the color hasn't
+    * changed, the clear is redundant and can be skipped.
+    */
+   const enum isl_aux_state aux_state =
+      crocus_resource_get_aux_state(res, level, box->z);
+   if (!color_changed && box->depth == 1 && aux_state == ISL_AUX_STATE_CLEAR)
+      return;
+
+   /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *    "Any transition from any value in {Clear, Render, Resolve} to a
+    *    different value in {Clear, Render, Resolve} requires end of pipe
+    *    synchronization."
+    *
+    * In other words, fast clear ops are not properly synchronized with
+    * other drawing.  We need to use a PIPE_CONTROL to ensure that the
+    * contents of the previous draw hit the render target before we resolve
+    * and again afterwards to ensure that the resolve is complete before we
+    * do any more regular drawing.
+    */
+   crocus_emit_end_of_pipe_sync(batch,
+                                "fast clear: pre-flush",
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+   /* If we reach this point, we need to fast clear to change the state to
+    * ISL_AUX_STATE_CLEAR, or to update the fast clear color (or both).
+    */
+   blorp_flags |= color_changed ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+   struct blorp_surf surf;
+   crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+                                  p_res, res->aux.usage, level, true);
+
+   /* In newer gens (> 9), the hardware will do a linear -> sRGB conversion of
+    * the clear color during the fast clear, if the surface format is of sRGB
+    * type. We use the linear version of the surface format here to prevent
+    * that from happening, since we already do our own linear -> sRGB
+    * conversion in convert_fast_clear_color().
+    */
+   blorp_fast_clear(&blorp_batch, &surf, isl_format_srgb_to_linear(format),
+                    ISL_SWIZZLE_IDENTITY,
+                    level, box->z, box->depth,
+                    box->x, box->y, box->x + box->width,
+                    box->y + box->height);
+   blorp_batch_finish(&blorp_batch);
+   crocus_emit_end_of_pipe_sync(batch,
+                                "fast clear: post flush",
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+   crocus_resource_set_aux_state(ice, res, level, box->z,
+                                 box->depth, ISL_AUX_STATE_CLEAR);
+   ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+   return;
+}
+
+static void
+clear_color(struct crocus_context *ice,
+            struct pipe_resource *p_res,
+            unsigned level,
+            const struct pipe_box *box,
+            bool render_condition_enabled,
+            enum isl_format format,
+            struct isl_swizzle swizzle,
+            union isl_color_value color)
+{
+   struct crocus_resource *res = (void *) p_res;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = batch->screen;
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   enum blorp_batch_flags blorp_flags = 0;
+
+   if (render_condition_enabled) {
+      if (!crocus_check_conditional_render(ice))
+         return; /* conditional render check failed: do nothing */
+
+      if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+         blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+   }
+
+   if (p_res->target == PIPE_BUFFER)
+      util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+
+   crocus_batch_maybe_flush(batch, 1500);
+
+   bool can_fast_clear = can_fast_clear_color(ice, p_res, level, box,
+                                              render_condition_enabled,
+                                              res->surf.format, format, color);
+   if (can_fast_clear) {
+      fast_clear_color(ice, res, level, box, format, color,
+                       blorp_flags);
+      return; /* handled entirely by the fast-clear path */
+   }
+
+   bool color_write_disable[4] = { false, false, false, false };
+   enum isl_aux_usage aux_usage =
+      crocus_resource_render_aux_usage(ice, res, format,
+                                       false, false);
+
+   crocus_resource_prepare_render(ice, res, level,
+                                  box->z, box->depth, aux_usage);
+
+   struct blorp_surf surf;
+   crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+                                  p_res, aux_usage, level, true);
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+   if (!isl_format_supports_rendering(devinfo, format) &&
+       isl_format_is_rgbx(format))
+      format = isl_format_rgbx_to_rgba(format); /* use the RGBA twin */
+
+   blorp_clear(&blorp_batch, &surf, format, swizzle,
+               level, box->z, box->depth, box->x, box->y,
+               box->x + box->width, box->y + box->height,
+               color, color_write_disable);
+
+   blorp_batch_finish(&blorp_batch);
+   crocus_flush_and_dirty_for_history(ice, batch, res,
+                                      PIPE_CONTROL_RENDER_TARGET_FLUSH,
+                                      "cache history: post color clear");
+
+   crocus_resource_finish_render(ice, res, level,
+                                 box->z, box->depth, aux_usage);
+}
+
+static bool
+can_fast_clear_depth(struct crocus_context *ice,
+                     struct crocus_resource *res,
+                     unsigned level,
+                     const struct pipe_box *box,
+                     bool render_condition_enabled,
+                     float depth) /* not read by this function */
+{
+   struct pipe_resource *p_res = (void *) res;
+   struct pipe_context *ctx = (void *) ice;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (devinfo->ver < 6)
+      return false;
+
+   if (INTEL_DEBUG & DEBUG_NO_FAST_CLEAR)
+      return false;
+
+   /* Check for partial clears (box must span the level's full width/height) */
+   if (box->x > 0 || box->y > 0 ||
+       box->width < u_minify(p_res->width0, level) ||
+       box->height < u_minify(p_res->height0, level)) {
+      return false;
+   }
+
+   /* Avoid conditional fast clears to maintain correct tracking of the aux
+    * state (see iris_resource_finish_write for more info). Note that partial
+    * fast clears would not pose a problem with conditional rendering.
+    */
+   if (render_condition_enabled &&
+       ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      return false;
+   }
+
+   if (!crocus_resource_level_has_hiz(res, level))
+      return false;
+
+   if (res->base.format == PIPE_FORMAT_Z16_UNORM) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+       *
+       *     "[DevSNB+]: Several cases exist where Depth Buffer Clear cannot be
+       *      enabled (the legacy method of clearing must be performed):
+       *
+       *      - DevSNB{W/A}]: When depth buffer format is D16_UNORM and the
+       *        width of the map (LOD0) is not multiple of 16, fast clear
+       *        optimization must be disabled.
+       */
+      if (devinfo->ver == 6 &&
+          (minify(res->surf.phys_level0_sa.width,
+                  level) % 16) != 0)
+         return false;
+   }
+   return true;
+}
+
+static void
+fast_clear_depth(struct crocus_context *ice,
+                 struct crocus_resource *res,
+                 unsigned level,
+                 const struct pipe_box *box,
+                 float depth)
+{
+   struct pipe_resource *p_res = (void *) res;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+   /* Quantize the clear value to what can be stored in the actual depth
+    * buffer.  This makes the following check more accurate because it now
+    * checks if the actual depth bits will match.  It also prevents us from
+    * getting a too-accurate depth value during depth testing or when sampling
+    * with HiZ enabled.
+    */
+   const unsigned nbits = p_res->format == PIPE_FORMAT_Z16_UNORM ? 16 : 24;
+   const uint32_t depth_max = (1 << nbits) - 1;
+   depth = p_res->format == PIPE_FORMAT_Z32_FLOAT ? depth :
+      (unsigned)(depth * depth_max) / (float)depth_max;
+
+   bool update_clear_depth = false;
+
+   /* If we're clearing to a new clear value, then we need to resolve any clear
+    * flags out of the HiZ buffer into the real depth buffer.
+    */
+   if (res->aux.clear_color.f32[0] != depth) {
+      for (unsigned res_level = 0; res_level < res->surf.levels; res_level++) {
+         if (!crocus_resource_level_has_hiz(res, res_level))
+            continue;
+
+         const unsigned level_layers =
+            crocus_get_num_logical_layers(res, res_level);
+         for (unsigned layer = 0; layer < level_layers; layer++) {
+            if (res_level == level &&
+                layer >= box->z &&
+                layer < box->z + box->depth) {
+               /* We're going to clear this layer anyway.  Leave it alone. */
+               continue;
+            }
+
+            enum isl_aux_state aux_state =
+               crocus_resource_get_aux_state(res, res_level, layer);
+
+            if (aux_state != ISL_AUX_STATE_CLEAR &&
+                aux_state != ISL_AUX_STATE_COMPRESSED_CLEAR) {
+               /* This slice doesn't have any fast-cleared bits. */
+               continue;
+            }
+
+            /* If we got here, then the level may have fast-clear bits that
+             * use the old clear value.  We need to do a depth resolve to get
+             * rid of their use of the clear value before we can change it.
+             * Fortunately, few applications ever change their depth clear
+             * value so this shouldn't happen often.
+             */
+            crocus_hiz_exec(ice, batch, res, res_level, layer, 1,
+                            ISL_AUX_OP_FULL_RESOLVE, false);
+            crocus_resource_set_aux_state(ice, res, res_level, layer, 1,
+                                          ISL_AUX_STATE_RESOLVED);
+         }
+      }
+      const union isl_color_value clear_value = { .f32 = {depth, } };
+      crocus_resource_set_clear_color(ice, res, clear_value);
+      update_clear_depth = true;
+   }
+   /* Skip layers already in CLEAR state unless the clear value changed. */
+   for (unsigned l = 0; l < box->depth; l++) {
+      enum isl_aux_state aux_state =
+         crocus_resource_level_has_hiz(res, level) ?
+         crocus_resource_get_aux_state(res, level, box->z + l) :
+         ISL_AUX_STATE_AUX_INVALID;
+      if (update_clear_depth || aux_state != ISL_AUX_STATE_CLEAR) {
+         if (aux_state == ISL_AUX_STATE_CLEAR) {
+            perf_debug(&ice->dbg, "Performing HiZ clear just to update the "
+                       "depth clear value\n");
+         }
+         crocus_hiz_exec(ice, batch, res, level,
+                         box->z + l, 1, ISL_AUX_OP_FAST_CLEAR,
+                         update_clear_depth);
+      }
+   }
+
+   crocus_resource_set_aux_state(ice, res, level, box->z, box->depth,
+                                 ISL_AUX_STATE_CLEAR);
+   ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+}
+
+static void
+clear_depth_stencil(struct crocus_context *ice,
+                    struct pipe_resource *p_res,
+                    unsigned level,
+                    const struct pipe_box *box,
+                    bool render_condition_enabled,
+                    bool clear_depth,
+                    bool clear_stencil,
+                    float depth,
+                    uint8_t stencil)
+{
+   struct crocus_resource *res = (void *) p_res;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = batch->screen;
+   enum blorp_batch_flags blorp_flags = 0;
+
+   if (render_condition_enabled) {
+      if (!crocus_check_conditional_render(ice))
+         return;
+
+      if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT)
+         blorp_flags |= BLORP_BATCH_PREDICATE_ENABLE;
+   }
+
+   crocus_batch_maybe_flush(batch, 1500);
+
+   struct crocus_resource *z_res;
+   struct crocus_resource *stencil_res;
+   struct blorp_surf z_surf;       /* filled only for a slow depth clear */
+   struct blorp_surf stencil_surf; /* filled only when stencil_mask != 0 */
+
+   crocus_get_depth_stencil_resources(&batch->screen->devinfo, p_res, &z_res, &stencil_res);
+   if (z_res && clear_depth &&
+       can_fast_clear_depth(ice, z_res, level, box, render_condition_enabled,
+                            depth)) {
+      fast_clear_depth(ice, z_res, level, box, depth);
+      crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+                                         "cache history: post fast Z clear");
+      clear_depth = false;
+      z_res = NULL;
+   }
+
+   /* At this point, we might have fast cleared the depth buffer. So if there's
+    * no stencil clear pending, return early.
+    */
+   if (!(clear_depth || (clear_stencil && stencil_res))) {
+      return;
+   }
+   /* Slow path: whatever is still pending is cleared through BLORP. */
+   if (clear_depth && z_res) {
+      const enum isl_aux_usage aux_usage =
+         crocus_resource_render_aux_usage(ice, z_res, level, z_res->surf.format,
+                                          false);
+      crocus_resource_prepare_render(ice, z_res, level, box->z, box->depth,
+                                     aux_usage);
+      crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+                                     &z_surf, &z_res->base, aux_usage,
+                                     level, true);
+   }
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, blorp_flags);
+
+   uint8_t stencil_mask = clear_stencil && stencil_res ? 0xff : 0;
+   if (stencil_mask) {
+      crocus_resource_prepare_access(ice, stencil_res, level, 1, box->z,
+                                     box->depth, stencil_res->aux.usage, false);
+      crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev,
+                                     &stencil_surf, &stencil_res->base,
+                                     stencil_res->aux.usage, level, true);
+   }
+
+   blorp_clear_depth_stencil(&blorp_batch, &z_surf, &stencil_surf,
+                             level, box->z, box->depth,
+                             box->x, box->y,
+                             box->x + box->width,
+                             box->y + box->height,
+                             clear_depth && z_res, depth,
+                             stencil_mask, stencil);
+
+   blorp_batch_finish(&blorp_batch);
+   crocus_flush_and_dirty_for_history(ice, batch, res, 0,
+                                      "cache history: post slow ZS clear");
+
+   if (clear_depth && z_res) {
+      crocus_resource_finish_render(ice, z_res, level,
+                                    box->z, box->depth, z_surf.aux_usage);
+   }
+
+   if (stencil_mask) {
+      crocus_resource_finish_write(ice, stencil_res, level, box->z, box->depth,
+                                   stencil_res->aux.usage);
+   }
+}
+
+/**
+ * The pipe->clear() driver hook.
+ *
+ * This clears buffers attached to the current draw framebuffer.
+ */
+static void
+crocus_clear(struct pipe_context *ctx,
+             unsigned buffers,
+             const struct pipe_scissor_state *scissor_state,
+             const union pipe_color_union *p_color,
+             double depth,
+             unsigned stencil)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   assert(buffers != 0);
+
+   struct pipe_box box = {
+      .width = cso_fb->width,
+      .height = cso_fb->height,
+   };
+
+   if (scissor_state) {
+      box.x = scissor_state->minx;
+      box.y = scissor_state->miny;
+      box.width = MIN2(box.width, scissor_state->maxx - scissor_state->minx);
+      box.height = MIN2(box.height, scissor_state->maxy - scissor_state->miny);
+   }
+
+   if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+      if (devinfo->ver < 6) { /* pre-gfx6 goes through u_blitter */
+         crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE, true);
+         util_blitter_clear(ice->blitter, cso_fb->width, cso_fb->height,
+                            util_framebuffer_get_num_layers(cso_fb),
+                            buffers & PIPE_CLEAR_DEPTHSTENCIL, p_color, depth, stencil, false);
+      } else {
+         struct pipe_surface *psurf = cso_fb->zsbuf;
+         box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+         box.z = psurf->u.tex.first_layer;
+
+         clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box, true,
+                             buffers & PIPE_CLEAR_DEPTH,
+                             buffers & PIPE_CLEAR_STENCIL,
+                             depth, stencil);
+      }
+      buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+   }
+
+   if (buffers & PIPE_CLEAR_COLOR) {
+      /* pipe_color_union and isl_color_value are interchangeable */
+      union isl_color_value *color = (void *) p_color;
+
+      for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+         if (buffers & (PIPE_CLEAR_COLOR0 << i)) {
+            struct pipe_surface *psurf = cso_fb->cbufs[i];
+            struct crocus_surface *isurf = (void *) psurf;
+            box.depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1;
+            box.z = psurf->u.tex.first_layer;
+
+            clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+                        true, isurf->view.format, isurf->view.swizzle,
+                        *color);
+         }
+      }
+   }
+}
+
+/**
+ * The pipe->clear_texture() driver hook.
+ *
+ * This clears the given texture resource.
+ */
+static void
+crocus_clear_texture(struct pipe_context *ctx,
+                     struct pipe_resource *p_res,
+                     unsigned level,
+                     const struct pipe_box *box,
+                     const void *data)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_resource *res = (void *) p_res;
+
+   if (devinfo->ver < 6) {
+      util_clear_texture(ctx, p_res,
+                         level, box, data);
+      return;
+   }
+
+   if (crocus_resource_unfinished_aux_import(res))
+      crocus_resource_finish_aux_import(ctx->screen, res);
+
+   if (util_format_is_depth_or_stencil(p_res->format)) {
+      const struct util_format_unpack_description *fmt_unpack =
+         util_format_unpack_description(p_res->format);
+
+      float depth = 0.0;
+      uint8_t stencil = 0;
+
+      if (fmt_unpack->unpack_z_float)
+         fmt_unpack->unpack_z_float(&depth, 0, data, 0, 1, 1);
+
+      if (fmt_unpack->unpack_s_8uint)
+         fmt_unpack->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+
+      clear_depth_stencil(ice, p_res, level, box, true, true, true,
+                          depth, stencil);
+   } else {
+      /* Clear with the surface's own format when it is renderable. */
+      union isl_color_value color;
+      enum isl_format format = res->surf.format;
+
+      if (!isl_format_supports_rendering(devinfo, format)) {
+         const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+         // XXX: actually just get_copy_format_for_bpb from BLORP
+         // XXX: don't cut and paste this
+         switch (fmtl->bpb) {
+         case 8:   format = ISL_FORMAT_R8_UINT;           break;
+         case 16:  format = ISL_FORMAT_R8G8_UINT;         break;
+         case 24:  format = ISL_FORMAT_R8G8B8_UINT;       break;
+         case 32:  format = ISL_FORMAT_R8G8B8A8_UINT;     break;
+         case 48:  format = ISL_FORMAT_R16G16B16_UINT;    break;
+         case 64:  format = ISL_FORMAT_R16G16B16A16_UINT; break;
+         case 96:  format = ISL_FORMAT_R32G32B32_UINT;    break;
+         case 128: format = ISL_FORMAT_R32G32B32A32_UINT; break;
+         default:
+            unreachable("Unknown format bpb");
+         }
+
+         /* No aux surfaces for non-renderable surfaces */
+         assert(res->aux.usage == ISL_AUX_USAGE_NONE);
+      }
+
+      isl_color_value_unpack(&color, format, data);
+
+      clear_color(ice, p_res, level, box, true, format,
+                  ISL_SWIZZLE_IDENTITY, color);
+   }
+}
+
+/**
+ * The pipe->clear_render_target() driver hook.
+ *
+ * Clears the given rectangle on every layer of the render target surface.
+ */
+static void
+crocus_clear_render_target(struct pipe_context *ctx,
+                           struct pipe_surface *psurf,
+                           const union pipe_color_union *p_color,
+                           unsigned dst_x, unsigned dst_y,
+                           unsigned width, unsigned height,
+                           bool render_condition_enabled)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_surface *isurf = (void *) psurf;
+   struct pipe_box box = {
+      .x = dst_x,
+      .y = dst_y,
+      .z = psurf->u.tex.first_layer,
+      .width = width,
+      .height = height,
+      .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+   };
+
+   /* pipe_color_union and isl_color_value are interchangeable */
+   union isl_color_value *color = (void *) p_color;
+
+   clear_color(ice, psurf->texture, psurf->u.tex.level, &box,
+               render_condition_enabled,
+               isurf->view.format, isurf->view.swizzle, *color);
+}
+
+/**
+ * The pipe->clear_depth_stencil() driver hook.
+ *
+ * Currently a stub: it returns at once and the body below is compiled out.
+ */
+static void
+crocus_clear_depth_stencil(struct pipe_context *ctx,
+                           struct pipe_surface *psurf,
+                           unsigned flags,
+                           double depth,
+                           unsigned stencil,
+                           unsigned dst_x, unsigned dst_y,
+                           unsigned width, unsigned height,
+                           bool render_condition_enabled)
+{
+   return; /* NOTE(review): no clear is performed; all arguments are ignored */
+#if 0
+   struct crocus_context *ice = (void *) ctx;
+   struct pipe_box box = {
+      .x = dst_x,
+      .y = dst_y,
+      .z = psurf->u.tex.first_layer,
+      .width = width,
+      .height = height,
+      .depth = psurf->u.tex.last_layer - psurf->u.tex.first_layer + 1
+   };
+   uint32_t blit_flags = 0;
+
+   assert(util_format_is_depth_or_stencil(psurf->texture->format));
+
+   crocus_blitter_begin(ice, CROCUS_SAVE_FRAGMENT_STATE);
+   util_blitter_clear(ice->blitter, width, height,
+                      1, flags, NULL, depth, stencil, render_condition_enabled);
+#if 0
+   clear_depth_stencil(ice, psurf->texture, psurf->u.tex.level, &box,
+                       render_condition_enabled,
+                       flags & PIPE_CLEAR_DEPTH, flags & PIPE_CLEAR_STENCIL,
+                       depth, stencil);
+#endif
+#endif
+}
+
+void
+crocus_init_clear_functions(struct pipe_context *ctx)
+{
+   ctx->clear = crocus_clear;
+   ctx->clear_texture = crocus_clear_texture;
+   ctx->clear_render_target = crocus_clear_render_target;
+   ctx->clear_depth_stencil = crocus_clear_depth_stencil; /* no-op stub today */
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.c b/src/gallium/drivers/crocus/crocus_context.c
new file mode 100644
index 00000000000..cd8a54d6d34
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <time.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_upload_mgr.h"
+#include "drm-uapi/i915_drm.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "common/intel_defines.h"
+#include "common/intel_sample_positions.h"
+
+/**
+ * The pipe->set_debug_callback() hook: copy *cb, or zero ice->dbg if NULL.
+ */
+static void
+crocus_set_debug_callback(struct pipe_context *ctx,
+                          const struct pipe_debug_callback *cb)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+
+   if (cb)
+      ice->dbg = *cb;
+   else
+      memset(&ice->dbg, 0, sizeof(ice->dbg));
+}
+
+static bool
+crocus_init_identifier_bo(struct crocus_context *ice)
+{
+   void *bo_map;
+
+   bo_map = crocus_bo_map(NULL, ice->workaround_bo, MAP_READ | MAP_WRITE);
+   if (!bo_map)
+      return false;
+   /* Ask the kernel to include this BO in GPU error-state captures. */
+   ice->workaround_bo->kflags |= EXEC_OBJECT_CAPTURE;
+   ice->workaround_offset = ALIGN(
+      intel_debug_write_identifiers(bo_map, 4096, "Crocus") + 8, 8);
+
+   crocus_bo_unmap(ice->workaround_bo);
+
+   return true;
+}
+
+/**
+ * Called from the batch module when it detects a GPU hang.
+ *
+ * In this case, we've lost our GEM context, and can't rely on any existing
+ * state on the GPU.  We must mark everything dirty and wipe away any saved
+ * assumptions about the last known state of the GPU.
+ */
+void
+crocus_lost_context_state(struct crocus_batch *batch)
+{
+   /* The batch module doesn't have a crocus_context, because we want to
+    * avoid introducing lots of layering violations.  Unfortunately, here
+    * we do need to inform the context of batch catastrophe.  We know the
+    * batch is one of our context's, so hackily claw our way back.
+    */
+   struct crocus_context *ice = batch->ice;
+   struct crocus_screen *screen = batch->screen;
+   if (batch->name == CROCUS_BATCH_RENDER) {
+      screen->vtbl.init_render_context(batch);
+   } else if (batch->name == CROCUS_BATCH_COMPUTE) {
+      screen->vtbl.init_compute_context(batch);
+   } else {
+      unreachable("unhandled batch reset");
+   }
+   /* Set every dirty bit and drop cached assumptions about GPU state. */
+   ice->state.dirty = ~0ull;
+   memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+   batch->state_base_address_emitted = false;
+   screen->vtbl.lost_genx_state(ice, batch);
+}
+
+static enum pipe_reset_status
+crocus_get_device_reset_status(struct pipe_context *ctx)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+
+   enum pipe_reset_status worst_reset = PIPE_NO_RESET;
+
+   /* Check the reset status of each batch's hardware context, and take the
+    * worst status (if one was guilty, proclaim guilt).
+    */
+   for (int i = 0; i < ice->batch_count; i++) {
+      /* This will also recreate the hardware contexts as necessary, so any
+       * future queries will show no resets.  We only want to report once.
+       */
+      enum pipe_reset_status batch_reset =
+         crocus_batch_check_for_reset(&ice->batches[i]);
+
+      if (batch_reset == PIPE_NO_RESET)
+         continue;
+
+      if (worst_reset == PIPE_NO_RESET) {
+         worst_reset = batch_reset;
+      } else {
+         /* GUILTY < INNOCENT < UNKNOWN */
+         worst_reset = MIN2(worst_reset, batch_reset);
+      }
+   }
+   /* Report the worst status to the registered reset callback, if any. */
+   if (worst_reset != PIPE_NO_RESET && ice->reset.reset)
+      ice->reset.reset(ice->reset.data, worst_reset);
+
+   return worst_reset;
+}
+
+static void
+crocus_set_device_reset_callback(struct pipe_context *ctx,
+                                 const struct pipe_device_reset_callback *cb)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   /* Store a copy of *cb; a NULL cb zeroes the stored callback instead. */
+   if (cb)
+      ice->reset = *cb;
+   else
+      memset(&ice->reset, 0, sizeof(ice->reset));
+}
+
+static void
+crocus_get_sample_position(struct pipe_context *ctx,
+                           unsigned sample_count,
+                           unsigned sample_index,
+                           float *out_value)
+{
+   union {
+      struct {
+         float x[16];
+         float y[16];
+      } a;
+      struct {
+         float  _0XOffset,  _1XOffset,  _2XOffset,  _3XOffset,
+                _4XOffset,  _5XOffset,  _6XOffset,  _7XOffset,
+                _8XOffset,  _9XOffset, _10XOffset, _11XOffset,
+               _12XOffset, _13XOffset, _14XOffset, _15XOffset;
+         float  _0YOffset,  _1YOffset,  _2YOffset,  _3YOffset,
+                _4YOffset,  _5YOffset,  _6YOffset,  _7YOffset,
+                _8YOffset,  _9YOffset, _10YOffset, _11YOffset,
+               _12YOffset, _13YOffset, _14YOffset, _15YOffset;
+      } v;
+   } u;
+   switch (sample_count) {
+   case 1:  INTEL_SAMPLE_POS_1X(u.v._);  break;
+   case 2:  INTEL_SAMPLE_POS_2X(u.v._);  break;
+   case 4:  INTEL_SAMPLE_POS_4X(u.v._);  break;
+   case 8:  INTEL_SAMPLE_POS_8X(u.v._);  break;
+   case 16: INTEL_SAMPLE_POS_16X(u.v._); break;
+   default: unreachable("invalid sample count");
+   }
+   /* u.a.x[] / u.a.y[] overlay the named X/Y offset fields of u.v. */
+   out_value[0] = u.a.x[sample_index];
+   out_value[1] = u.a.y[sample_index];
+}
+
+/**
+ * Destroy a context, freeing any associated memory.
+ */
+static void
+crocus_destroy_context(struct pipe_context *ctx)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   if (ctx->stream_uploader)
+      u_upload_destroy(ctx->stream_uploader);
+
+   if (ice->blitter)
+      util_blitter_destroy(ice->blitter);
+   screen->vtbl.destroy_state(ice);
+   crocus_destroy_program_cache(ice);
+   u_upload_destroy(ice->query_buffer_uploader);
+
+   crocus_bo_unreference(ice->workaround_bo);
+
+   slab_destroy_child(&ice->transfer_pool);
+
+   crocus_batch_free(&ice->batches[CROCUS_BATCH_RENDER]);
+   if (ice->batches[CROCUS_BATCH_COMPUTE].ice)
+      crocus_batch_free(&ice->batches[CROCUS_BATCH_COMPUTE]);
+   /* ice itself came from rzalloc() in crocus_create_context(). */
+   ralloc_free(ice);
+}
+
+#define genX_call(devinfo, func, ...) /* gfxNN_func */  \
+   do { switch ((devinfo)->verx10) {                    \
+   case 75:                                             \
+      gfx75_##func(__VA_ARGS__);                        \
+      break;                                            \
+   case 70:                                             \
+      gfx7_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 60:                                             \
+      gfx6_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 50:                                             \
+      gfx5_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 45:                                             \
+      gfx45_##func(__VA_ARGS__);                        \
+      break;                                            \
+   case 40:                                             \
+      gfx4_##func(__VA_ARGS__);                         \
+      break;                                            \
+   default:                                             \
+      unreachable("Unknown hardware generation");       \
+   } } while (0)
+
+/**
+ * Create a context.
+ *
+ * This is where each context begins.
+ */
+struct pipe_context *
+crocus_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
+{
+   struct crocus_screen *screen = (struct crocus_screen*)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_context *ice = rzalloc(NULL, struct crocus_context);
+
+   if (!ice)
+      return NULL;
+
+   struct pipe_context *ctx = &ice->ctx;
+
+   ctx->screen = pscreen;
+   ctx->priv = priv;
+
+   ctx->stream_uploader = u_upload_create_default(ctx);
+   if (!ctx->stream_uploader) {
+      ralloc_free(ice); /* ice is ralloc memory; plain free() is invalid */
+      return NULL;
+   }
+   ctx->const_uploader = ctx->stream_uploader;
+
+   ctx->destroy = crocus_destroy_context;
+   ctx->set_debug_callback = crocus_set_debug_callback;
+   ctx->set_device_reset_callback = crocus_set_device_reset_callback;
+   ctx->get_device_reset_status = crocus_get_device_reset_status;
+   ctx->get_sample_position = crocus_get_sample_position;
+
+   ice->shaders.urb_size = devinfo->urb.size;
+
+   crocus_init_context_fence_functions(ctx);
+   crocus_init_blit_functions(ctx);
+   crocus_init_clear_functions(ctx);
+   crocus_init_program_functions(ctx);
+   crocus_init_resource_functions(ctx);
+   crocus_init_flush_functions(ctx);
+
+   crocus_init_program_cache(ice);
+
+   slab_create_child(&ice->transfer_pool, &screen->transfer_pool);
+
+   ice->query_buffer_uploader =
+      u_upload_create(ctx, 4096, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING,
+                      0);
+
+   ice->workaround_bo =
+      crocus_bo_alloc(screen->bufmgr, "workaround", 4096);
+   if (!ice->workaround_bo)
+      return NULL; /* NOTE(review): leaks ice and everything set up above */
+
+   if (!crocus_init_identifier_bo(ice))
+      return NULL; /* NOTE(review): also leaks workaround_bo */
+
+   genX_call(devinfo, init_state, ice);
+   genX_call(devinfo, init_blorp, ice);
+   genX_call(devinfo, init_query, ice);
+
+   ice->blitter = util_blitter_create(&ice->ctx);
+   if (ice->blitter == NULL)
+      return NULL; /* NOTE(review): leaks ice and everything set up above */
+   int priority = 0;
+   if (flags & PIPE_CONTEXT_HIGH_PRIORITY)
+      priority = INTEL_CONTEXT_HIGH_PRIORITY;
+   if (flags & PIPE_CONTEXT_LOW_PRIORITY)
+      priority = INTEL_CONTEXT_LOW_PRIORITY;
+
+   ice->batch_count = devinfo->ver >= 7 ? CROCUS_BATCH_COUNT : 1;
+   for (int i = 0; i < ice->batch_count; i++) {
+      crocus_init_batch(ice, (enum crocus_batch_name) i,
+                        priority);
+   }
+
+   ice->urb.size = devinfo->urb.size;
+   screen->vtbl.init_render_context(&ice->batches[CROCUS_BATCH_RENDER]);
+   if (ice->batch_count > 1)
+      screen->vtbl.init_compute_context(&ice->batches[CROCUS_BATCH_COMPUTE]);
+
+   return ctx;
+}
+
+bool
+crocus_sw_check_cond_render(struct crocus_context *ice)
+{
+   struct crocus_query *q = ice->condition.query;
+   union pipe_query_result result;
+
+   bool wait = ice->condition.mode == PIPE_RENDER_COND_WAIT ||
+      ice->condition.mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+   if (!q)
+      return true; /* no condition query is set */
+
+   bool ret = ice->ctx.get_query_result(&ice->ctx, (void *)q, wait, &result);
+   if (!ret)
+      return true; /* get_query_result() returned false */
+   /* condition set => true iff result is zero; otherwise iff it is non-zero. */
+   return ice->condition.condition ? result.u64 == 0 : result.u64 != 0;
+}
diff --git a/src/gallium/drivers/crocus/crocus_context.h b/src/gallium/drivers/crocus/crocus_context.h
new file mode 100644
index 00000000000..8d6e43d80f6
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_context.h
@@ -0,0 +1,955 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_CONTEXT_H
+#define CROCUS_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "intel/blorp/blorp.h"
+#include "intel/dev/intel_debug.h"
+#include "intel/compiler/brw_compiler.h"
+#include "crocus_batch.h"
+#include "crocus_fence.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "util/u_blitter.h"
+
+struct crocus_bo;
+struct crocus_context;
+struct blorp_batch;
+struct blorp_params;
+
+#define CROCUS_MAX_TEXTURE_BUFFER_SIZE (1 << 27)
+#define CROCUS_MAX_TEXTURE_SAMPLERS 32
+/* CROCUS_MAX_ABOS and CROCUS_MAX_SSBOS must be the same. */
+#define CROCUS_MAX_ABOS 16
+#define CROCUS_MAX_SSBOS 16
+#define CROCUS_MAX_VIEWPORTS 16
+#define CROCUS_MAX_CLIP_PLANES 8
+
+enum crocus_param_domain { /* domain tag that BRW_PARAM() stores from bit 24 upward */
+   BRW_PARAM_DOMAIN_BUILTIN = 0,
+   BRW_PARAM_DOMAIN_IMAGE, /* payload is packed by BRW_PARAM_IMAGE(idx, offset) */
+};
+
+enum {
+   DRI_CONF_BO_REUSE_DISABLED,
+   DRI_CONF_BO_REUSE_ALL
+};
+
+#define BRW_PARAM(domain, val)   (BRW_PARAM_DOMAIN_##domain << 24 | (val)) /* val is not masked here */
+#define BRW_PARAM_DOMAIN(param)  ((uint32_t)(param) >> 24) /* bits 31:24 */
+#define BRW_PARAM_VALUE(param)   ((uint32_t)(param) & 0x00ffffff) /* bits 23:0 */
+#define BRW_PARAM_IMAGE(idx, offset) BRW_PARAM(IMAGE, ((idx) << 8) | (offset)) /* idx above bit 8, offset below */
+#define BRW_PARAM_IMAGE_IDX(value)   (BRW_PARAM_VALUE(value) >> 8)
+#define BRW_PARAM_IMAGE_OFFSET(value)(BRW_PARAM_VALUE(value) & 0xf) /* NOTE(review): reads 4 of the 8 bits below idx; confirm offsets stay < 16 */
+
+/**
+ * Dirty flags.  When state changes, we flag some combination of these
+ * to indicate that particular GPU commands need to be re-emitted.
+ *
+ * Each bit typically corresponds to a single 3DSTATE_* command packet, but
+ * in rare cases they map to a group of related packets that need to be
+ * emitted together.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_DIRTY_COLOR_CALC_STATE         (1ull <<  0)
+#define CROCUS_DIRTY_POLYGON_STIPPLE          (1ull <<  1)
+#define CROCUS_DIRTY_CC_VIEWPORT              (1ull <<  2)
+#define CROCUS_DIRTY_SF_CL_VIEWPORT           (1ull <<  3)
+#define CROCUS_DIRTY_RASTER                   (1ull <<  4)
+#define CROCUS_DIRTY_CLIP                     (1ull <<  5)
+#define CROCUS_DIRTY_LINE_STIPPLE             (1ull <<  6)
+#define CROCUS_DIRTY_VERTEX_ELEMENTS          (1ull <<  7)
+#define CROCUS_DIRTY_VERTEX_BUFFERS           (1ull <<  8)
+#define CROCUS_DIRTY_DRAWING_RECTANGLE        (1ull <<  9)
+#define CROCUS_DIRTY_GEN6_URB                 (1ull << 10)
+#define CROCUS_DIRTY_DEPTH_BUFFER             (1ull << 11)
+#define CROCUS_DIRTY_WM                       (1ull << 12)
+#define CROCUS_DIRTY_SO_DECL_LIST             (1ull << 13)
+#define CROCUS_DIRTY_STREAMOUT                (1ull << 14)
+#define CROCUS_DIRTY_GEN4_CONSTANT_COLOR      (1ull << 15)
+#define CROCUS_DIRTY_GEN4_CURBE               (1ull << 16)
+#define CROCUS_DIRTY_GEN4_URB_FENCE           (1ull << 17)
+#define CROCUS_DIRTY_GEN5_PIPELINED_POINTERS  (1ull << 18)
+#define CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS  (1ull << 19)
+#define CROCUS_DIRTY_GEN6_BLEND_STATE         (1ull << 20)
+#define CROCUS_DIRTY_GEN6_SCISSOR_RECT        (1ull << 21)
+#define CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL    (1ull << 22)
+#define CROCUS_DIRTY_GEN6_MULTISAMPLE         (1ull << 23)
+#define CROCUS_DIRTY_GEN6_SAMPLE_MASK         (1ull << 24)
+#define CROCUS_DIRTY_GEN7_SBE                 (1ull << 25)
+#define CROCUS_DIRTY_GEN7_L3_CONFIG           (1ull << 26)
+#define CROCUS_DIRTY_GEN7_SO_BUFFERS          (1ull << 27)
+#define CROCUS_DIRTY_GEN75_VF                 (1ull << 28)
+#define CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES  (1ull << 29)
+#define CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES (1ull << 30)
+#define CROCUS_DIRTY_VF_STATISTICS            (1ull << 31)
+#define CROCUS_DIRTY_GEN4_CLIP_PROG           (1ull << 32)
+#define CROCUS_DIRTY_GEN4_SF_PROG             (1ull << 33)
+#define CROCUS_DIRTY_GEN4_FF_GS_PROG          (1ull << 34)
+#define CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS (1ull << 35)
+#define CROCUS_DIRTY_GEN6_SVBI                (1ull << 36)
+
+#define CROCUS_ALL_DIRTY_FOR_COMPUTE (CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES)
+
+#define CROCUS_ALL_DIRTY_FOR_RENDER (~CROCUS_ALL_DIRTY_FOR_COMPUTE)
+
+/**
+ * Per-stage dirty flags.  When state changes, we flag some combination of
+ * these to indicate that particular GPU commands need to be re-emitted.
+ * Unlike the CROCUS_DIRTY_* flags, these are shader stage-specific and can be
+ * indexed by shifting the mask by the shader stage index.
+ *
+ * See crocus_upload_render_state().
+ */
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS        (1ull << 0)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS       (1ull << 1)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES       (1ull << 2)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS        (1ull << 3)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS        (1ull << 4)
+#define CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS        (1ull << 5)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_VS            (1ull << 6)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TCS           (1ull << 7)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_TES           (1ull << 8)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_GS            (1ull << 9)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_FS            (1ull << 10)
+#define CROCUS_STAGE_DIRTY_UNCOMPILED_CS            (1ull << 11)
+#define CROCUS_STAGE_DIRTY_VS                       (1ull << 12)
+#define CROCUS_STAGE_DIRTY_TCS                      (1ull << 13)
+#define CROCUS_STAGE_DIRTY_TES                      (1ull << 14)
+#define CROCUS_STAGE_DIRTY_GS                       (1ull << 15)
+#define CROCUS_STAGE_DIRTY_FS                       (1ull << 16)
+#define CROCUS_STAGE_DIRTY_CS                       (1ull << 17)
+#define CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS      18
+#define CROCUS_STAGE_DIRTY_CONSTANTS_VS             (1ull << 18)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TCS            (1ull << 19)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_TES            (1ull << 20)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_GS             (1ull << 21)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_FS             (1ull << 22)
+#define CROCUS_STAGE_DIRTY_CONSTANTS_CS             (1ull << 23)
+#define CROCUS_STAGE_DIRTY_BINDINGS_VS              (1ull << 24)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TCS             (1ull << 25)
+#define CROCUS_STAGE_DIRTY_BINDINGS_TES             (1ull << 26)
+#define CROCUS_STAGE_DIRTY_BINDINGS_GS              (1ull << 27)
+#define CROCUS_STAGE_DIRTY_BINDINGS_FS              (1ull << 28)
+#define CROCUS_STAGE_DIRTY_BINDINGS_CS              (1ull << 29)
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE (CROCUS_STAGE_DIRTY_CS | \
+                                          CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS | \
+                                          CROCUS_STAGE_DIRTY_UNCOMPILED_CS |    \
+                                          CROCUS_STAGE_DIRTY_CONSTANTS_CS |     \
+                                          CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_ALL_STAGE_DIRTY_FOR_RENDER (~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE)
+
+#define CROCUS_ALL_STAGE_DIRTY_BINDINGS (CROCUS_STAGE_DIRTY_BINDINGS_VS  | \
+                                       CROCUS_STAGE_DIRTY_BINDINGS_TCS | \
+                                       CROCUS_STAGE_DIRTY_BINDINGS_TES | \
+                                       CROCUS_STAGE_DIRTY_BINDINGS_GS  | \
+                                       CROCUS_STAGE_DIRTY_BINDINGS_FS  | \
+                                       CROCUS_STAGE_DIRTY_BINDINGS_CS)
+
+#define CROCUS_RENDER_STAGE_DIRTY_CONSTANTS (CROCUS_STAGE_DIRTY_CONSTANTS_VS  | \
+                                             CROCUS_STAGE_DIRTY_CONSTANTS_TCS | \
+                                             CROCUS_STAGE_DIRTY_CONSTANTS_TES | \
+                                             CROCUS_STAGE_DIRTY_CONSTANTS_GS  | \
+                                             CROCUS_STAGE_DIRTY_CONSTANTS_FS)
+
+/**
+ * Non-orthogonal state (NOS) dependency flags.
+ *
+ * Shader programs may depend on non-orthogonal state.  These flags are
+ * used to indicate that a shader's key depends on the state provided by
+ * a certain Gallium CSO.  Changing any CSOs marked as a dependency will
+ * cause the driver to re-compute the shader key, possibly triggering a
+ * shader recompile.
+ */
+enum crocus_nos_dep { /* bit positions: masks are built as (1 << CROCUS_NOS_*) */
+   CROCUS_NOS_FRAMEBUFFER,
+   CROCUS_NOS_DEPTH_STENCIL_ALPHA,
+   CROCUS_NOS_RASTERIZER,
+   CROCUS_NOS_BLEND,
+   CROCUS_NOS_LAST_VUE_MAP,
+   CROCUS_NOS_TEXTURES,
+   CROCUS_NOS_VERTEX_ELEMENTS,
+   CROCUS_NOS_COUNT, /* entry count; dimensions state.stage_dirty_for_nos[] */
+};
+
+struct crocus_depth_stencil_alpha_state;
+
+/**
+ * Cache IDs for the in-memory program cache (ice->shaders.cache).
+ */
+enum crocus_program_cache_id {
+   CROCUS_CACHE_VS  = MESA_SHADER_VERTEX, /* the first six IDs equal the MESA_SHADER_* stage value */
+   CROCUS_CACHE_TCS = MESA_SHADER_TESS_CTRL,
+   CROCUS_CACHE_TES = MESA_SHADER_TESS_EVAL,
+   CROCUS_CACHE_GS  = MESA_SHADER_GEOMETRY,
+   CROCUS_CACHE_FS  = MESA_SHADER_FRAGMENT,
+   CROCUS_CACHE_CS  = MESA_SHADER_COMPUTE,
+   CROCUS_CACHE_BLORP, /* implicit values: numbering continues after MESA_SHADER_COMPUTE */
+   CROCUS_CACHE_SF,    /* presumably the gen 4/5 shaders.sf_prog; confirm in the program cache */
+   CROCUS_CACHE_CLIP,  /* presumably the gen 4/5 shaders.clip_prog; confirm */
+   CROCUS_CACHE_FF_GS, /* presumably shaders.ff_gs_prog; confirm */
+};
+
+/** @{
+ *
+ * Defines for PIPE_CONTROL operations, which trigger cache flushes,
+ * synchronization, pipelined memory writes, and so on.
+ *
+ * The bits here are not the actual hardware values.  The actual fields
+ * move between various generations, so we just have flags for each
+ * potential operation, and use genxml to encode the actual packet.
+ */
+enum pipe_control_flags
+{
+   PIPE_CONTROL_FLUSH_LLC                       = (1 << 1), /* bit 0 is not assigned */
+   PIPE_CONTROL_LRI_POST_SYNC_OP                = (1 << 2),
+   PIPE_CONTROL_STORE_DATA_INDEX                = (1 << 3),
+   PIPE_CONTROL_CS_STALL                        = (1 << 4),
+   PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET     = (1 << 5),
+   PIPE_CONTROL_SYNC_GFDT                       = (1 << 6),
+   PIPE_CONTROL_TLB_INVALIDATE                  = (1 << 7),
+   PIPE_CONTROL_MEDIA_STATE_CLEAR               = (1 << 8),
+   PIPE_CONTROL_WRITE_IMMEDIATE                 = (1 << 9),
+   PIPE_CONTROL_WRITE_DEPTH_COUNT               = (1 << 10),
+   PIPE_CONTROL_WRITE_TIMESTAMP                 = (1 << 11),
+   PIPE_CONTROL_DEPTH_STALL                     = (1 << 12),
+   PIPE_CONTROL_RENDER_TARGET_FLUSH             = (1 << 13),
+   PIPE_CONTROL_INSTRUCTION_INVALIDATE          = (1 << 14),
+   PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE        = (1 << 15),
+   PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE = (1 << 16),
+   PIPE_CONTROL_NOTIFY_ENABLE                   = (1 << 17),
+   PIPE_CONTROL_FLUSH_ENABLE                    = (1 << 18),
+   PIPE_CONTROL_DATA_CACHE_FLUSH                = (1 << 19),
+   PIPE_CONTROL_VF_CACHE_INVALIDATE             = (1 << 20),
+   PIPE_CONTROL_CONST_CACHE_INVALIDATE          = (1 << 21),
+   PIPE_CONTROL_STATE_CACHE_INVALIDATE          = (1 << 22),
+   PIPE_CONTROL_STALL_AT_SCOREBOARD             = (1 << 23),
+   PIPE_CONTROL_DEPTH_CACHE_FLUSH               = (1 << 24),
+   PIPE_CONTROL_TILE_CACHE_FLUSH                = (1 << 25), /* highest bit in use */
+};
+
+#define PIPE_CONTROL_CACHE_FLUSH_BITS           \
+   (PIPE_CONTROL_DEPTH_CACHE_FLUSH |            \
+    PIPE_CONTROL_DATA_CACHE_FLUSH |             \
+    PIPE_CONTROL_RENDER_TARGET_FLUSH)
+
+#define PIPE_CONTROL_CACHE_INVALIDATE_BITS      \
+   (PIPE_CONTROL_STATE_CACHE_INVALIDATE |       \
+    PIPE_CONTROL_CONST_CACHE_INVALIDATE |       \
+    PIPE_CONTROL_VF_CACHE_INVALIDATE |          \
+    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |     \
+    PIPE_CONTROL_INSTRUCTION_INVALIDATE)
+
+enum crocus_predicate_state {
+   /* The first two states are used if we can determine whether to draw
+    * without having to look at the values in the query object buffer. This
+    * will happen if there is no conditional render in progress, if the query
+    * object is already completed or if something else has already added
+    * samples to the preliminary result.
+    */
+   CROCUS_PREDICATE_STATE_RENDER,
+   CROCUS_PREDICATE_STATE_DONT_RENDER,
+
+   /* In this case whether to draw or not depends on the result of an
+    * MI_PREDICATE command so the predicate enable bit needs to be checked.
+    */
+   CROCUS_PREDICATE_STATE_USE_BIT,
+   /* In this case, either MI_PREDICATE doesn't exist or we lack the
+    * necessary kernel features to use it.  Stall for the query result.
+    */
+   CROCUS_PREDICATE_STATE_STALL_FOR_QUERY,
+};
+
+/** @} */
+
+/**
+ * An uncompiled, API-facing shader.  This is the Gallium CSO for shaders.
+ * It primarily contains the NIR for the shader.
+ *
+ * Each API-facing shader can be compiled into multiple shader variants,
+ * based on non-orthogonal state dependencies, recorded in the shader key.
+ *
+ * See crocus_compiled_shader, which represents a compiled shader variant.
+ */
+struct crocus_uncompiled_shader {
+   struct nir_shader *nir;
+
+   struct pipe_stream_output_info stream_output;
+
+   /* A SHA1 of the serialized NIR for the disk cache. */
+   unsigned char nir_sha1[20]; /* 20 bytes = one SHA-1 digest */
+
+   unsigned program_id;
+
+   /** Bitfield of (1 << CROCUS_NOS_*) flags. */
+   unsigned nos; /* bit indices come from enum crocus_nos_dep */
+
+   /** Have any shader variants been compiled yet? */
+   bool compiled_once;
+
+   /** Should we use ALT mode for math?  Useful for ARB programs. */
+   bool use_alt_mode;
+
+   bool needs_edge_flag; /* NOTE(review): presumably feeds state.vs_needs_edge_flag; confirm */
+
+   /** Constant data scraped from the shader by nir_opt_large_constants */
+   struct pipe_resource *const_data;
+
+   /** Surface state for const_data */
+   struct crocus_state_ref const_data_state;
+};
+
+enum crocus_surface_group { /* index into the per-group arrays of crocus_binding_table */
+   CROCUS_SURFACE_GROUP_RENDER_TARGET,
+   CROCUS_SURFACE_GROUP_RENDER_TARGET_READ,
+   CROCUS_SURFACE_GROUP_SOL,
+   CROCUS_SURFACE_GROUP_CS_WORK_GROUPS,
+   CROCUS_SURFACE_GROUP_TEXTURE,
+   CROCUS_SURFACE_GROUP_TEXTURE_GATHER,
+   CROCUS_SURFACE_GROUP_IMAGE,
+   CROCUS_SURFACE_GROUP_UBO,
+   CROCUS_SURFACE_GROUP_SSBO,
+
+   CROCUS_SURFACE_GROUP_COUNT, /* number of groups, not a group itself */
+};
+
+enum {
+   /* Invalid value for a binding table index. */
+   CROCUS_SURFACE_NOT_USED = 0xa0a0a0a0,
+};
+
+struct crocus_binding_table {
+   uint32_t size_bytes; /* presumably the total table size in bytes; confirm at the fill site */
+
+   /** Number of surfaces in each group, before compacting. */
+   uint32_t sizes[CROCUS_SURFACE_GROUP_COUNT];
+
+   /** Initial offset of each group. */
+   uint32_t offsets[CROCUS_SURFACE_GROUP_COUNT];
+
+   /** Mask of surfaces used in each group. */
+   uint64_t used_mask[CROCUS_SURFACE_GROUP_COUNT]; /* 64 bits wide, so at most 64 surfaces can be flagged per group */
+};
+
+/**
+ * A compiled shader variant, containing a pointer to the GPU assembly,
+ * as well as program data and other packets needed by state upload.
+ *
+ * There can be several crocus_compiled_shader variants per API-level shader
+ * (crocus_uncompiled_shader), due to state-based recompiles (brw_*_prog_key).
+ */
+struct crocus_compiled_shader {
+   /** Reference to the uploaded assembly. */
+   uint32_t offset;
+
+   /* asm size in map */
+   uint32_t map_size;
+
+   /** The program data (owned by the program cache hash table) */
+   struct brw_stage_prog_data *prog_data;
+   uint32_t prog_data_size;
+
+   /** A list of system values to be uploaded as uniforms. */
+   enum brw_param_builtin *system_values;
+   unsigned num_system_values;
+
+   /** Number of constbufs expected by the shader. */
+   unsigned num_cbufs;
+
+   /**
+    * Derived 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets
+    * (the VUE-based information for transform feedback outputs).
+    */
+   uint32_t *streamout;
+
+   struct crocus_binding_table bt;
+
+   uint32_t bind_bo_offset;
+   uint32_t surf_offset[128];//TODO
+};
+
+/**
+ * API context state that is replicated per shader stage.
+ */
+struct crocus_shader_state {
+   /** Uniform Buffers */
+   struct pipe_constant_buffer constbufs[PIPE_MAX_CONSTANT_BUFFERS];
+
+   bool sysvals_need_upload;
+
+   /** Shader Storage Buffers */
+   struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
+
+   /** Shader Storage Images (image load store) */
+   struct crocus_image_view image[PIPE_MAX_SHADER_IMAGES];
+
+   struct crocus_sampler_state *samplers[CROCUS_MAX_TEXTURE_SAMPLERS];
+   struct crocus_sampler_view *textures[CROCUS_MAX_TEXTURE_SAMPLERS];
+
+   /** Bitfield of which constant buffers are bound (non-null). */
+   uint32_t bound_cbufs;
+
+   /** Bitfield of which image views are bound (non-null). */
+   uint32_t bound_image_views;
+
+   /** Bitfield of which sampler views are bound (non-null). */
+   uint32_t bound_sampler_views;
+
+   /** Bitfield of which shader storage buffers are bound (non-null). */
+   uint32_t bound_ssbos;
+
+   /** Bitfield of which shader storage buffers are writable. */
+   uint32_t writable_ssbos;
+
+   uint32_t sampler_offset;
+};
+
+/**
+ * The API context (derived from pipe_context).
+ *
+ * Most driver state is tracked here.
+ */
+struct crocus_context {
+   struct pipe_context ctx;
+
+   /** A debug callback for KHR_debug output. */
+   struct pipe_debug_callback dbg;
+
+   /** A device reset status callback for notifying that the GPU is hosed. */
+   struct pipe_device_reset_callback reset;
+
+   /** Slab allocator for crocus_transfer_map objects. */
+   struct slab_child_pool transfer_pool;
+
+   struct blorp_context blorp;
+
+   int batch_count; /* entries of batches[] in use: CROCUS_BATCH_COUNT when devinfo->ver >= 7, else 1 */
+   struct crocus_batch batches[CROCUS_BATCH_COUNT];
+
+   struct u_upload_mgr *query_buffer_uploader; /* made by u_upload_create() during context setup */
+
+   struct blitter_context *blitter; /* made by util_blitter_create() during context setup */
+
+   struct {
+      struct {
+         /**
+          * Either the value of BaseVertex for indexed draw calls or the value
+          * of the argument <first> for non-indexed draw calls.
+          */
+         int firstvertex;
+         int baseinstance;
+      } params;
+
+      /**
+       * Are the above values the ones stored in the draw_params buffer?
+       * If so, we can compare them against new values to see if anything
+       * changed.  If not, we need to assume they changed.
+       */
+      bool params_valid;
+
+      /**
+       * Resource and offset that stores draw_parameters from the indirect
+       * buffer or to the buffer that stores the previous values for non
+       * indirect draws.
+       */
+      struct crocus_state_ref draw_params;
+
+      struct {
+         /**
+          * The value of DrawID. This always comes in from its own vertex
+          * buffer since it's not part of the indirect draw parameters.
+          */
+         int drawid;
+
+         /**
+          * Stores if an indexed or non-indexed draw (~0/0). Useful to
+          * calculate BaseVertex as an AND of firstvertex and is_indexed_draw.
+          */
+         int is_indexed_draw;
+      } derived_params;
+
+      /**
+       * Resource and offset used for GL_ARB_shader_draw_parameters which
+       * contains parameters that are not present in the indirect buffer as
+       * drawid and is_indexed_draw. They will go in their own vertex element.
+       */
+      struct crocus_state_ref derived_draw_params;
+   } draw;
+
+   struct {
+      struct crocus_uncompiled_shader *uncompiled[MESA_SHADER_STAGES];
+      struct crocus_compiled_shader *prog[MESA_SHADER_STAGES];
+      struct brw_vue_map *last_vue_map;
+
+      struct crocus_bo *cache_bo;
+      uint32_t cache_next_offset;
+      void *cache_bo_map;
+      struct hash_table *cache;
+
+      unsigned urb_size;
+
+      /* gen 4/5 clip/sf progs */
+      struct crocus_compiled_shader *clip_prog;
+      struct crocus_compiled_shader *sf_prog;
+      /* gen4/5 prims, gen6 streamout */
+      struct crocus_compiled_shader *ff_gs_prog;
+      uint32_t clip_offset;
+      uint32_t sf_offset;
+      uint32_t wm_offset;
+      uint32_t vs_offset;
+      uint32_t gs_offset;
+      uint32_t cc_offset;
+
+      /** Is a GS or TES outputting points or lines? */
+      bool output_topology_is_points_or_lines;
+
+      /* Track last VS URB entry size */
+      unsigned last_vs_entry_size;
+
+      /**
+       * Scratch buffers for various sizes and stages.
+       *
+       * Indexed by the "Per-Thread Scratch Space" field's 4-bit encoding,
+       * and shader stage.
+       */
+      struct crocus_bo *scratch_bos[1 << 4][MESA_SHADER_STAGES];
+   } shaders;
+
+   struct {
+      struct crocus_query *query;
+      bool condition;
+      enum pipe_render_cond_flag mode;
+   } condition; /* all three fields are read by crocus_sw_check_cond_render() */
+
+   struct intel_perf_context *perf_ctx;
+
+   struct {
+      uint64_t dirty;
+      uint64_t stage_dirty;
+      uint64_t stage_dirty_for_nos[CROCUS_NOS_COUNT];
+
+      unsigned num_viewports;
+      unsigned sample_mask;
+      struct crocus_blend_state *cso_blend;
+      struct crocus_rasterizer_state *cso_rast;
+      struct crocus_depth_stencil_alpha_state *cso_zsa;
+      struct crocus_vertex_element_state *cso_vertex_elements;
+      struct pipe_blend_color blend_color;
+      struct pipe_poly_stipple poly_stipple;
+      struct pipe_viewport_state viewports[CROCUS_MAX_VIEWPORTS];
+      struct pipe_scissor_state scissors[CROCUS_MAX_VIEWPORTS];
+      struct pipe_stencil_ref stencil_ref;
+      struct pipe_framebuffer_state framebuffer;
+      struct pipe_clip_state clip_planes;
+
+      float default_outer_level[4];
+      float default_inner_level[2];
+
+      /** Bitfield of which vertex buffers are bound (non-null). */
+      uint32_t bound_vertex_buffers;
+      struct pipe_vertex_buffer vertex_buffers[16];
+      uint32_t vb_end[16];
+
+      bool primitive_restart;
+      unsigned cut_index;
+      enum pipe_prim_type prim_mode:8;
+      bool prim_is_points_or_lines;
+      uint8_t vertices_per_patch;
+
+      bool window_space_position;
+
+      /** The last compute group size */
+      uint32_t last_block[3];
+
+      /** The last compute grid size */
+      uint32_t last_grid[3];
+      /** Reference to the BO containing the compute grid size */
+      struct crocus_state_ref grid_size;
+
+      /**
+       * Array of aux usages for drawing, altered to account for any
+       * self-dependencies from resources bound for sampling and rendering.
+       */
+      enum isl_aux_usage draw_aux_usage[BRW_MAX_DRAW_BUFFERS];
+
+      /** Aux usage of the fb's depth buffer (which may or may not exist). */
+      enum isl_aux_usage hiz_usage;
+
+      /** Bitfield of whether color blending is enabled for RT[i] */
+      uint8_t blend_enables;
+
+      /** Are depth writes enabled?  (Depth buffer may or may not exist.) */
+      bool depth_writes_enabled;
+
+      /** Are stencil writes enabled?  (Stencil buffer may or may not exist.) */
+      bool stencil_writes_enabled;
+
+      /** GenX-specific current state */
+      struct crocus_genx_state *genx;
+
+      struct crocus_shader_state shaders[MESA_SHADER_STAGES];
+
+      /** Does the vertex shader use shader draw parameters? */
+      bool vs_uses_draw_params;
+      bool vs_uses_derived_draw_params;
+      bool vs_needs_sgvs_element;
+      bool vs_uses_vertexid;
+      bool vs_uses_instanceid;
+
+      /** Does the vertex shader use the edge flag? */
+      bool vs_needs_edge_flag;
+
+      struct pipe_stream_output_target *so_target[PIPE_MAX_SO_BUFFERS];
+      bool streamout_active;
+      int so_targets;
+
+      bool statistics_counters_enabled;
+
+      /** Current conditional rendering mode */
+      enum crocus_predicate_state predicate;
+      bool predicate_supported;
+
+      /**
+       * Query BO with a MI_PREDICATE_RESULT snapshot calculated on the
+       * render context that needs to be uploaded to the compute context.
+       */
+      struct crocus_bo *compute_predicate;
+
+      /** Is a PIPE_QUERY_PRIMITIVES_GENERATED query active? */
+      bool prims_generated_query_active;
+
+      /** 3DSTATE_STREAMOUT and 3DSTATE_SO_DECL_LIST packets */
+      uint32_t *streamout;
+
+      /**
+       * Resources containing streamed state which our render context
+       * currently points to.  Used to re-add these to the validation
+       * list when we start a new batch and haven't resubmitted commands.
+       */
+      struct {
+         struct pipe_resource *res;
+         uint32_t offset;
+         uint32_t size;
+         uint32_t index_size;
+         bool prim_restart;
+      } index_buffer;
+
+      uint32_t sf_vp_address;
+      uint32_t clip_vp_address;
+      uint32_t cc_vp_address;
+
+      uint32_t stats_wm;
+      float global_depth_offset_clamp;
+
+      uint32_t last_xfb_verts_per_prim;
+      uint64_t svbi;
+   } state;
+
+   /* BRW_NEW_URB_ALLOCATIONS:
+    */
+   struct {
+      uint32_t vsize;                /* vertex size plus header in urb registers */
+      uint32_t gsize;                /* GS output size in urb registers */
+      uint32_t hsize;             /* Tessellation control output size in urb registers */
+      uint32_t dsize;             /* Tessellation evaluation output size in urb registers */
+      uint32_t csize;                /* constant buffer size in urb registers */
+      uint32_t sfsize;                /* setup data size in urb registers */
+
+      bool constrained;
+
+      uint32_t nr_vs_entries;
+      uint32_t nr_hs_entries;
+      uint32_t nr_ds_entries;
+      uint32_t nr_gs_entries;
+      uint32_t nr_clip_entries;
+      uint32_t nr_sf_entries;
+      uint32_t nr_cs_entries;
+
+      uint32_t vs_start;
+      uint32_t hs_start;
+      uint32_t ds_start;
+      uint32_t gs_start;
+      uint32_t clip_start;
+      uint32_t sf_start;
+      uint32_t cs_start;
+      /**
+       * URB size in the current configuration.  The units this is expressed
+       * in are somewhat inconsistent, see intel_device_info::urb::size.
+       *
+       * FINISHME: Represent the URB size consistently in KB on all platforms.
+       */
+      uint32_t size;
+
+      /* True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the GS.
+       */
+      bool gs_present;
+
+      /* True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the HS and DS.
+       */
+      bool tess_present;
+   } urb;
+
+   /* GEN4/5 curbe */
+   struct {
+      unsigned wm_start;
+      unsigned wm_size;
+      unsigned clip_start;
+      unsigned clip_size;
+      unsigned vs_start;
+      unsigned vs_size;
+      unsigned total_size;
+
+      struct crocus_resource *curbe_res;
+      unsigned curbe_offset;
+   } curbe;
+
+   /**
+    * A buffer containing a marker + description of the driver. This buffer is
+    * added to all execbufs syscalls so that we can identify the driver that
+    * generated a hang by looking at the content of the buffer in the error
+    * state. It is also used for hardware workarounds that require scratch
+    * writes or reads from some unimportant memory. To avoid overwriting the
+    * debug data, use the workaround_offset field for workarounds.
+    */
+   struct crocus_bo *workaround_bo;
+   unsigned workaround_offset;
+};
+
+#define perf_debug(dbg, ...) do { /* dbg and the arguments may be expanded twice */ \
+   if (INTEL_DEBUG & DEBUG_PERF)   /* DEBUG_PERF set: print via dbg_printf */ \
+      dbg_printf(__VA_ARGS__);                         \
+   if (unlikely(dbg))              /* dbg set: also send as a PERF_INFO message */ \
+      pipe_debug_message(dbg, PERF_INFO, __VA_ARGS__); \
+} while(0)
+
+
+struct pipe_context *
+crocus_create_context(struct pipe_screen *screen, void *priv, unsigned flags);
+
+void crocus_lost_context_state(struct crocus_batch *batch);
+
+void crocus_init_blit_functions(struct pipe_context *ctx);
+void crocus_init_clear_functions(struct pipe_context *ctx);
+void crocus_init_program_functions(struct pipe_context *ctx);
+void crocus_init_resource_functions(struct pipe_context *ctx);
+bool crocus_update_compiled_shaders(struct crocus_context *ice);
+void crocus_update_compiled_compute_shader(struct crocus_context *ice);
+void crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+                                      unsigned threads, uint32_t *dst);
+
+
+/* crocus_blit.c */
+enum crocus_blitter_op /* each value is a distinct bit; presumably OR'ed into crocus_blitter_begin()'s op */
+{
+   CROCUS_SAVE_TEXTURES      = 1,
+   CROCUS_SAVE_FRAMEBUFFER   = 2,
+   CROCUS_SAVE_FRAGMENT_STATE = 4,
+   CROCUS_DISABLE_RENDER_COND = 8,
+};
+void crocus_blitter_begin(struct crocus_context *ice, enum crocus_blitter_op op, bool render_cond);
+
+void crocus_blorp_surf_for_resource(struct crocus_vtable *vtbl,
+                                    struct isl_device *isl_dev,
+                                    struct blorp_surf *surf,
+                                    struct pipe_resource *p_res,
+                                    enum isl_aux_usage aux_usage,
+                                    unsigned level,
+                                    bool is_render_target);
+void crocus_copy_region(struct blorp_context *blorp,
+                        struct crocus_batch *batch,
+                        struct pipe_resource *dst,
+                        unsigned dst_level,
+                        unsigned dstx, unsigned dsty, unsigned dstz,
+                        struct pipe_resource *src,
+                        unsigned src_level,
+                        const struct pipe_box *src_box);
+
+/* crocus_draw.c */
+void crocus_draw_vbo(struct pipe_context *ctx,
+                     const struct pipe_draw_info *info,
+                     unsigned drawid_offset,
+                     const struct pipe_draw_indirect_info *indirect,
+                     const struct pipe_draw_start_count_bias *draws,
+                     unsigned num_draws);
+void crocus_launch_grid(struct pipe_context *, const struct pipe_grid_info *);
+
+/* crocus_pipe_control.c */
+
+void crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+                                    const char *reason, uint32_t flags);
+void crocus_emit_pipe_control_write(struct crocus_batch *batch,
+                                    const char *reason, uint32_t flags,
+                                    struct crocus_bo *bo, uint32_t offset,
+                                    uint64_t imm);
+void crocus_emit_mi_flush(struct crocus_batch *batch);
+void crocus_emit_depth_stall_flushes(struct crocus_batch *batch);
+void crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch);
+void crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+                                  const char *reason, uint32_t flags);
+void crocus_flush_all_caches(struct crocus_batch *batch);
+
+#define crocus_handle_always_flush_cache(batch)                 \
+   if (unlikely(batch->screen->driconf.always_flush_cache))     \
+      crocus_flush_all_caches(batch);
+
+void crocus_init_flush_functions(struct pipe_context *ctx);
+
+/* crocus_program.c */
+const struct shader_info *crocus_get_shader_info(const struct crocus_context *ice,
+                                                 gl_shader_stage stage);
+struct crocus_bo *crocus_get_scratch_space(struct crocus_context *ice,
+                                           unsigned per_thread_scratch,
+                                           gl_shader_stage stage);
+uint32_t crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+                                   enum crocus_surface_group group,
+                                   uint32_t index);
+uint32_t crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+                                   enum crocus_surface_group group,
+                                   uint32_t bti);
+
+/* crocus_disk_cache.c */
+
+/* Serialize a compiled shader (prog data, assembly read from map at the
+ * shader's offset, system values, params and binding table) and put it in
+ * the disk cache.  Does nothing when cache is NULL.
+ */
+void crocus_disk_cache_store(struct disk_cache *cache,
+                             const struct crocus_uncompiled_shader *ish,
+                             const struct crocus_compiled_shader *shader,
+                             void *map,
+                             const void *prog_key,
+                             uint32_t prog_key_size);
+/* Look the shader up in the screen's disk cache.  On a hit the shader is
+ * uploaded to the in-memory program cache and returned; otherwise NULL.
+ */
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+                           const struct crocus_uncompiled_shader *ish,
+                           const void *prog_key,
+                           uint32_t prog_key_size);
+
+/* crocus_program_cache.c */
+
+void crocus_init_program_cache(struct crocus_context *ice);
+void crocus_destroy_program_cache(struct crocus_context *ice);
+void crocus_print_program_cache(struct crocus_context *ice);
+struct crocus_compiled_shader *crocus_find_cached_shader(struct crocus_context *ice,
+                                                         enum crocus_program_cache_id,
+                                                         uint32_t key_size,
+                                                         const void *key);
+struct crocus_compiled_shader *crocus_upload_shader(struct crocus_context *ice,
+                                                    enum crocus_program_cache_id,
+                                                    uint32_t key_size,
+                                                    const void *key,
+                                                    const void *assembly,
+                                                    uint32_t asm_size,
+                                                    struct brw_stage_prog_data *,
+                                                    uint32_t prog_data_size,
+                                                    uint32_t *streamout,
+                                                    enum brw_param_builtin *sysv,
+                                                    unsigned num_system_values,
+                                                    unsigned num_cbufs,
+                                                    const struct crocus_binding_table *bt);
+const void *crocus_find_previous_compile(const struct crocus_context *ice,
+                                         enum crocus_program_cache_id cache_id,
+                                         unsigned program_string_id);
+bool crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch,
+                                const void *key,
+                                uint32_t key_size,
+                                uint32_t *kernel_out,
+                                void *prog_data_out);
+bool crocus_blorp_upload_shader(struct blorp_batch *blorp_batch,
+                                uint32_t stage,
+                                const void *key, uint32_t key_size,
+                                const void *kernel, uint32_t kernel_size,
+                                const struct brw_stage_prog_data *prog_data,
+                                uint32_t prog_data_size,
+                                uint32_t *kernel_out,
+                                void *prog_data_out);
+
+/* crocus_resolve.c */
+
+void crocus_predraw_resolve_inputs(struct crocus_context *ice,
+                                   struct crocus_batch *batch,
+                                   bool *draw_aux_buffer_disabled,
+                                   gl_shader_stage stage,
+                                   bool consider_framebuffer);
+void crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+                                        struct crocus_batch *batch,
+                                        bool *draw_aux_buffer_disabled);
+void crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+                                             struct crocus_batch *batch);
+void crocus_cache_sets_clear(struct crocus_batch *batch);
+void crocus_flush_depth_and_render_caches(struct crocus_batch *batch);
+void crocus_cache_flush_for_read(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_cache_flush_for_render(struct crocus_batch *batch,
+                                   struct crocus_bo *bo,
+                                   enum isl_format format,
+                                   enum isl_aux_usage aux_usage);
+void crocus_render_cache_add_bo(struct crocus_batch *batch,
+                                struct crocus_bo *bo,
+                                enum isl_format format,
+                                enum isl_aux_usage aux_usage);
+void crocus_cache_flush_for_depth(struct crocus_batch *batch, struct crocus_bo *bo);
+void crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo);
+int crocus_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+                                 struct pipe_driver_query_info *info);
+int crocus_get_driver_query_group_info(struct pipe_screen *pscreen,
+                                       unsigned index,
+                                       struct pipe_driver_query_group_info *info);
+
+struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ctx);
+
+bool crocus_sw_check_cond_render(struct crocus_context *ice);
+/* Decide whether a draw should go ahead under the current predicate state.
+ * A predicate that must stall for a query is resolved by
+ * crocus_sw_check_cond_render(); DONT_RENDER rejects the draw; every other
+ * state lets it through.
+ */
+static inline bool crocus_check_conditional_render(struct crocus_context *ice)
+{
+   switch (ice->state.predicate) {
+   case CROCUS_PREDICATE_STATE_STALL_FOR_QUERY:
+      return crocus_sw_check_cond_render(ice);
+   case CROCUS_PREDICATE_STATE_DONT_RENDER:
+      return false;
+   default:
+      return true;
+   }
+}
+
+/* Declare the per-generation entry points.  If the including file already
+ * defines genX, crocus_genx_protos.h is included once with that definition.
+ * Otherwise it is included repeatedly, redefining genX each time so that the
+ * prototypes are declared with the gfx4_, gfx45_, gfx5_, gfx6_, gfx7_ and
+ * gfx75_ prefixes; genX is left undefined afterwards.
+ */
+#ifdef genX
+#  include "crocus_genx_protos.h"
+#else
+#  define genX(x) gfx4_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#  define genX(x) gfx45_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#  define genX(x) gfx5_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#  define genX(x) gfx6_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#  define genX(x) gfx7_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#  define genX(x) gfx75_##x
+#  include "crocus_genx_protos.h"
+#  undef genX
+#endif
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_defines.h b/src/gallium/drivers/crocus/crocus_defines.h
new file mode 100644
index 00000000000..a634d0746b0
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_defines.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_DEFINES_H
+#define CROCUS_DEFINES_H
+
+/**
+ * @file crocus_defines.h
+ *
+ * Random hardware #defines that we're not using GENXML for.
+ */
+
+/* MI_PREDICATE value and its field encodings.  The LOADOP, COMBINEOP and
+ * COMPAREOP values are already shifted into position (starting at bits 6,
+ * 3 and 0 respectively).
+ * NOTE(review): presumably these are OR'd together into a single command
+ * dword -- confirm against the emitting code.
+ */
+#define MI_PREDICATE                         (0xC << 23)
+# define MI_PREDICATE_LOADOP_KEEP            (0 << 6)
+# define MI_PREDICATE_LOADOP_LOAD            (2 << 6)
+# define MI_PREDICATE_LOADOP_LOADINV         (3 << 6)
+# define MI_PREDICATE_COMBINEOP_SET          (0 << 3)
+# define MI_PREDICATE_COMBINEOP_AND          (1 << 3)
+# define MI_PREDICATE_COMBINEOP_OR           (2 << 3)
+# define MI_PREDICATE_COMBINEOP_XOR          (3 << 3)
+# define MI_PREDICATE_COMPAREOP_TRUE         (0 << 0)
+# define MI_PREDICATE_COMPAREOP_FALSE        (1 << 0)
+# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL   (2 << 0)
+# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL (3 << 0)
+
+/* Predicate registers */
+#define MI_PREDICATE_SRC0                    0x2400
+#define MI_PREDICATE_SRC1                    0x2408
+#define MI_PREDICATE_DATA                    0x2410
+#define MI_PREDICATE_RESULT                  0x2418
+#define MI_PREDICATE_RESULT_1                0x241C
+#define MI_PREDICATE_RESULT_2                0x2214
+
+/* Offset of register slot n in a bank with an 8-byte stride that starts at
+ * 0x2600.
+ */
+#define CS_GPR(n) (0x2600 + (n) * 8)
+
+/* The number of bits in our TIMESTAMP queries. */
+#define TIMESTAMP_BITS 36
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_disk_cache.c b/src/gallium/drivers/crocus/crocus_disk_cache.c
new file mode 100644
index 00000000000..c84d043fbc8
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_disk_cache.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_disk_cache.c
+ *
+ * Functions for interacting with the on-disk shader cache.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+
+#include "compiler/nir/nir.h"
+#include "util/blob.h"
+#include "util/build_id.h"
+#include "util/disk_cache.h"
+#include "util/mesa-sha1.h"
+
+#include "crocus_context.h"
+
+static bool debug = false;
+
+/**
+ * Compute a disk cache key for the given uncompiled shader and NOS key.
+ *
+ * The hashed data is the shader's NIR SHA-1 followed by prog_key_size bytes
+ * of a copy of the program key whose program_string_id has been zeroed.
+ * The result is written to cache_key.
+ *
+ * prog_key_size must not exceed sizeof(union brw_any_prog_key), since the
+ * key is copied into a union of that type.
+ */
+static void
+crocus_disk_cache_compute_key(struct disk_cache *cache,
+                              const struct crocus_uncompiled_shader *ish,
+                              const void *orig_prog_key,
+                              uint32_t prog_key_size,
+                              cache_key cache_key)
+{
+   /* Create a copy of the program key with program_string_id zeroed out.
+    * It's essentially random data which we don't want to include in our
+    * hashing and comparisons.  We'll set a proper value on a cache hit.
+    */
+   union brw_any_prog_key prog_key;
+
+   /* Guard the fixed-size stack copies below against an oversized key. */
+   assert(prog_key_size <= sizeof(prog_key));
+
+   memcpy(&prog_key, orig_prog_key, prog_key_size);
+   prog_key.base.program_string_id = 0;
+
+   /* The buffer is sized for the largest possible key; only data_size bytes
+    * of it are filled in and hashed.
+    */
+   uint8_t data[sizeof(prog_key) + sizeof(ish->nir_sha1)];
+   uint32_t data_size = prog_key_size + sizeof(ish->nir_sha1);
+
+   memcpy(data, ish->nir_sha1, sizeof(ish->nir_sha1));
+   memcpy(data + sizeof(ish->nir_sha1), &prog_key, prog_key_size);
+
+   disk_cache_compute_key(cache, data, data_size, cache_key);
+}
+
+/**
+ * Write a compiled shader to the disk cache.
+ *
+ * Meant for shaders that have just been compiled: nothing here checks
+ * whether the same shader was already stored.  A NULL cache is a no-op, and
+ * the whole body is compiled out without ENABLE_SHADER_CACHE.
+ */
+void
+crocus_disk_cache_store(struct disk_cache *cache,
+                        const struct crocus_uncompiled_shader *ish,
+                        const struct crocus_compiled_shader *shader,
+                        void *map,
+                        const void *prog_key,
+                        uint32_t prog_key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+   if (cache == NULL)
+      return;
+
+   const struct brw_stage_prog_data *prog_data = shader->prog_data;
+   const gl_shader_stage stage = ish->nir->info.stage;
+
+   cache_key key;
+   crocus_disk_cache_compute_key(cache, ish, prog_key, prog_key_size, key);
+
+   if (debug) {
+      char key_hex[41];
+      _mesa_sha1_format(key_hex, key);
+      fprintf(stderr, "[mesa disk cache] storing %s\n", key_hex);
+   }
+
+   /* Blob layout, in write order:
+    *
+    * 1. Prog data (first, because it carries the assembly size)
+    * 2. Assembly code
+    * 3. Number of entries in the system value array
+    * 4. System value array
+    * 5. Legacy param array (only used for compute workgroup ID)
+    * 6. Binding table
+    */
+   struct blob blob;
+   blob_init(&blob);
+
+   blob_write_bytes(&blob, prog_data, brw_prog_data_size(stage));
+   blob_write_bytes(&blob, map + shader->offset, prog_data->program_size);
+   blob_write_bytes(&blob, &shader->num_system_values, sizeof(unsigned));
+   blob_write_bytes(&blob, shader->system_values,
+                    shader->num_system_values * sizeof(enum brw_param_builtin));
+   blob_write_bytes(&blob, prog_data->param,
+                    prog_data->nr_params * sizeof(uint32_t));
+   blob_write_bytes(&blob, &shader->bt, sizeof(shader->bt));
+
+   disk_cache_put(cache, key, blob.data, blob.size, NULL);
+   blob_finish(&blob);
+#endif
+}
+
+/**
+ * Search for a compiled shader in the disk cache.  If found, upload it
+ * to the in-memory program cache so we can use it.
+ *
+ * Returns the uploaded shader on a hit.  Returns NULL when the screen has
+ * no disk cache, when the key is not in the cache, or when the driver is
+ * built without ENABLE_SHADER_CACHE.
+ */
+struct crocus_compiled_shader *
+crocus_disk_cache_retrieve(struct crocus_context *ice,
+                           const struct crocus_uncompiled_shader *ish,
+                           const void *prog_key,
+                           uint32_t key_size)
+{
+#ifdef ENABLE_SHADER_CACHE
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   struct disk_cache *cache = screen->disk_cache;
+   gl_shader_stage stage = ish->nir->info.stage;
+
+   if (!cache)
+      return NULL;
+
+   cache_key cache_key;
+   crocus_disk_cache_compute_key(cache, ish, prog_key, key_size, cache_key);
+
+   if (debug) {
+      char sha1[41];
+      _mesa_sha1_format(sha1, cache_key);
+      fprintf(stderr, "[mesa disk cache] retrieving %s: ", sha1);
+   }
+
+   size_t size;
+   void *buffer = disk_cache_get(screen->disk_cache, cache_key, &size);
+
+   if (debug)
+      fprintf(stderr, "%s\n", buffer ? "found" : "missing");
+
+   if (!buffer)
+      return NULL;
+
+   const uint32_t prog_data_size = brw_prog_data_size(stage);
+
+   struct brw_stage_prog_data *prog_data = ralloc_size(NULL, prog_data_size);
+   const void *assembly;
+   uint32_t num_system_values;
+   uint32_t *system_values = NULL;
+   uint32_t *so_decls = NULL;
+
+   /* Read the fields back in the same order crocus_disk_cache_store()
+    * wrote them: prog data, assembly, system value count and array, then
+    * (further down) the param array and the binding table.
+    */
+   struct blob_reader blob;
+   blob_reader_init(&blob, buffer, size);
+   blob_copy_bytes(&blob, prog_data, prog_data_size);
+   assembly = blob_read_bytes(&blob, prog_data->program_size);
+   num_system_values = blob_read_uint32(&blob);
+   if (num_system_values) {
+      system_values =
+         ralloc_array(NULL, enum brw_param_builtin, num_system_values);
+      blob_copy_bytes(&blob, system_values,
+                      num_system_values * sizeof(enum brw_param_builtin));
+   }
+
+   /* prog_data was copied byte-for-byte, so the pointer members hold
+    * whatever values were serialized; reset them before reloading param.
+    */
+   prog_data->param = NULL;
+   prog_data->pull_param = NULL;
+   assert(prog_data->nr_pull_params == 0);
+
+   if (prog_data->nr_params) {
+      prog_data->param = ralloc_array(NULL, uint32_t, prog_data->nr_params);
+      blob_copy_bytes(&blob, prog_data->param,
+                      prog_data->nr_params * sizeof(uint32_t));
+   }
+
+   struct crocus_binding_table bt;
+   blob_copy_bytes(&blob, &bt, sizeof(bt));
+
+   /* The stream-output declaration list is not stored in the blob; it is
+    * rebuilt here for VS/TES/GS when devinfo.ver > 6.
+    */
+   if ((stage == MESA_SHADER_VERTEX ||
+        stage == MESA_SHADER_TESS_EVAL ||
+        stage == MESA_SHADER_GEOMETRY) && screen->devinfo.ver > 6) {
+      struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+      so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+                                                  &vue_prog_data->vue_map);
+   }
+
+   /* System values and uniforms are stored in constant buffer 0, the
+    * user-facing UBOs are indexed by one.  So if any constant buffer is
+    * needed, the constant buffer 0 will be needed, so account for it.
+    */
+   unsigned num_cbufs = ish->nir->info.num_ubos;
+
+   if (num_cbufs || ish->nir->num_uniforms)
+      num_cbufs++;
+
+   if (num_system_values)
+      num_cbufs++;
+
+   /* Upload our newly read shader to the in-memory program cache and
+    * return it to the caller.
+    */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, stage, key_size, prog_key, assembly,
+                           prog_data->program_size,
+                           prog_data, prog_data_size, so_decls, system_values,
+                           num_system_values, num_cbufs, &bt);
+
+   /* NOTE(review): assembly presumably still points into buffer, so the
+    * buffer is released only after the upload -- confirm against util/blob.
+    */
+   free(buffer);
+
+   return shader;
+#else
+   return NULL;
+#endif
+}
+
+/**
+ * Create the screen's on-disk shader cache.
+ *
+ * The cache is identified by a "crocus_<pci id>" renderer name, the hex
+ * form of the driver's build-id SHA-1, and the compiler's config value.
+ * Nothing is created when the disk-cache-disable debug flag is set or when
+ * the driver is built without ENABLE_SHADER_CACHE.
+ */
+void
+crocus_disk_cache_init(struct crocus_screen *screen)
+{
+#ifdef ENABLE_SHADER_CACHE
+   if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK)
+      return;
+
+   /* Room for the printed name, its NUL, and one spare byte; the spare byte
+    * lets the assert below detect a name longer than expected.
+    */
+   char renderer_name[13];
+   UNUSED int written = snprintf(renderer_name, sizeof(renderer_name),
+                                 "crocus_%04x", screen->pci_id);
+   assert(written == sizeof(renderer_name) - 2);
+
+   const struct build_id_note *build_note =
+      build_id_find_nhdr_for_addr(crocus_disk_cache_init);
+   assert(build_note && build_id_length(build_note) == 20); /* sha1 */
+
+   const uint8_t *build_sha1 = build_id_data(build_note);
+   assert(build_sha1);
+
+   char build_sha1_hex[41];
+   _mesa_sha1_format(build_sha1_hex, build_sha1);
+
+   const uint64_t compiler_flags =
+      brw_get_compiler_config_value(screen->compiler);
+
+   screen->disk_cache =
+      disk_cache_create(renderer_name, build_sha1_hex, compiler_flags);
+#endif
+}
diff --git a/src/gallium/drivers/crocus/crocus_draw.c b/src/gallium/drivers/crocus/crocus_draw.c
new file mode 100644
index 00000000000..119c5571ae1
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_draw.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_draw.c
+ *
+ * The main driver hooks for drawing and launching compute shaders.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_draw.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu_defines.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "util/u_prim_restart.h"
+#include "indices/u_primconvert.h"
+#include "util/u_prim.h"
+
+/* True for the point and non-adjacency line primitive modes. */
+static bool
+prim_is_points_or_lines(enum pipe_prim_type mode)
+{
+   switch (mode) {
+   case PIPE_PRIM_POINTS:
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_LINE_LOOP:
+   case PIPE_PRIM_LINE_STRIP:
+      return true;
+   default:
+      /* Adjacency modes are deliberately not listed - they can only be used
+       * with geometry shaders, and we don't care about this info when GS is
+       * on.
+       */
+      return false;
+   }
+}
+
+/* True when the draw's restart index is the all-ones value for its index
+ * size (1, 2 or 4 bytes).  Any other index size is treated as unreachable.
+ */
+static bool
+can_cut_index_handle_restart_index(struct crocus_context *ice,
+                                   const struct pipe_draw_info *draw)
+{
+   if (draw->index_size == 1)
+      return draw->restart_index == 0xff;
+
+   if (draw->index_size == 2)
+      return draw->restart_index == 0xffff;
+
+   if (draw->index_size == 4)
+      return draw->restart_index == 0xffffffff;
+
+   unreachable("illegal index size\n");
+
+   return false;
+}
+
+/* Report whether primitive restart can be used directly for this draw.
+ * Haswell always qualifies.  Other devices need an all-ones restart index
+ * and one of the primitive modes listed below.
+ */
+static bool
+can_cut_index_handle_prim(struct crocus_context *ice,
+                          const struct pipe_draw_info *draw)
+{
+   const struct crocus_screen *screen =
+      (struct crocus_screen *)ice->ctx.screen;
+
+   /* Haswell can do it all. */
+   if (screen->devinfo.is_haswell)
+      return true;
+
+   if (!can_cut_index_handle_restart_index(ice, draw))
+      return false;
+
+   switch (draw->mode) {
+   case PIPE_PRIM_POINTS:
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_LINE_STRIP:
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_LINES_ADJACENCY:
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Record the current primitive mode and restart information, flagging
+ * related packets as dirty if necessary.
+ *
+ * This must be called before updating compiled shaders, because the patch
+ * information informs the TCS key.
+ *
+ * info supplies the primitive mode, patch size and restart settings; draw
+ * is only consulted for its vertex count in the pre-Gen6 quad handling.
+ */
+static void
+crocus_update_draw_info(struct crocus_context *ice,
+                        const struct pipe_draw_info *info,
+                        const struct pipe_draw_start_count_bias *draw)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   enum pipe_prim_type mode = info->mode;
+
+   if (screen->devinfo.ver < 6) {
+      /* Slight optimization to avoid the GS program when not needed:
+       */
+      struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+      if (mode == PIPE_PRIM_QUAD_STRIP && !rs_state->flatshade &&
+          rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+          rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+         mode = PIPE_PRIM_TRIANGLE_STRIP;
+      if (mode == PIPE_PRIM_QUADS &&
+          draw->count == 4 &&
+          !rs_state->flatshade &&
+          rs_state->fill_front == PIPE_POLYGON_MODE_FILL &&
+          rs_state->fill_back == PIPE_POLYGON_MODE_FILL)
+         mode = PIPE_PRIM_TRIANGLE_FAN;
+   }
+
+   if (ice->state.prim_mode != mode) {
+      ice->state.prim_mode = mode;
+
+      if (screen->devinfo.ver < 6)
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+      if (screen->devinfo.ver <= 6)
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+      if (screen->devinfo.ver >= 7)
+         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+
+      /* For XY Clip enables */
+      bool points_or_lines = prim_is_points_or_lines(mode);
+      if (points_or_lines != ice->state.prim_is_points_or_lines) {
+         ice->state.prim_is_points_or_lines = points_or_lines;
+         ice->state.dirty |= CROCUS_DIRTY_CLIP;
+      }
+   }
+
+   /* Patch size is tracked from the original info->mode, not the possibly
+    * rewritten local mode.
+    */
+   if (info->mode == PIPE_PRIM_PATCHES &&
+       ice->state.vertices_per_patch != info->vertices_per_patch) {
+      ice->state.vertices_per_patch = info->vertices_per_patch;
+
+      /* This is needed for key->input_vertices */
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_TCS;
+
+      /* Flag constants dirty for gl_PatchVerticesIn if needed. */
+      const struct shader_info *tcs_info =
+         crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+      if (tcs_info &&
+          BITSET_TEST(tcs_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+         ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+         ice->state.shaders[MESA_SHADER_TESS_CTRL].sysvals_need_upload = true;
+      }
+   }
+
+   /* Track restart_index changes only while primitive restart is enabled.
+    * When it is disabled, cut_index falls back to the value already
+    * tracked, so info->restart_index is never consulted.
+    */
+   const unsigned cut_index = info->primitive_restart ? info->restart_index :
+                                                        ice->state.cut_index;
+   if (ice->state.primitive_restart != info->primitive_restart ||
+       ice->state.cut_index != cut_index) {
+      if (screen->devinfo.is_haswell)
+         ice->state.dirty |= CROCUS_DIRTY_GEN75_VF;
+      ice->state.primitive_restart = info->primitive_restart;
+      /* Store the value that was compared above, so a disabled-restart
+       * draw leaves the tracked cut index untouched.
+       */
+      ice->state.cut_index = cut_index;
+   }
+}
+
+/**
+ * Update shader draw parameters, flagging VF packets as dirty if necessary.
+ *
+ * Two independent parameter sets are handled, each only when the vertex
+ * shader uses it: the draw params (firstvertex, baseinstance) and the
+ * derived draw params (drawid, is_indexed_draw).  Whenever either set is
+ * re-pointed or re-uploaded, the vertex buffer and vertex element state is
+ * marked dirty.
+ */
+static void
+crocus_update_draw_parameters(struct crocus_context *ice,
+                              const struct pipe_draw_info *info,
+                              unsigned drawid_offset,
+                              const struct pipe_draw_indirect_info *indirect,
+                              const struct pipe_draw_start_count_bias *draw)
+{
+   bool changed = false;
+
+   if (ice->state.vs_uses_draw_params) {
+      struct crocus_state_ref *draw_params = &ice->draw.draw_params;
+
+      if (indirect && indirect->buffer) {
+         /* Indirect draw: reference the indirect buffer itself instead of
+          * uploading a copy.
+          * NOTE(review): 12 / 8 is presumably the byte offset of the
+          * base-vertex / first-vertex field in the indexed / non-indexed
+          * indirect command -- confirm against the indirect draw layout.
+          */
+         pipe_resource_reference(&draw_params->res, indirect->buffer);
+         draw_params->offset =
+            indirect->offset + (info->index_size ? 12 : 8);
+
+         changed = true;
+         /* The CPU-side copy no longer matches what the shader will read. */
+         ice->draw.params_valid = false;
+      } else {
+         int firstvertex = info->index_size ? draw->index_bias : draw->start;
+
+         /* Re-upload only when the cached values are stale or different. */
+         if (!ice->draw.params_valid ||
+             ice->draw.params.firstvertex != firstvertex ||
+             ice->draw.params.baseinstance != info->start_instance) {
+
+            changed = true;
+            ice->draw.params.firstvertex = firstvertex;
+            ice->draw.params.baseinstance = info->start_instance;
+            ice->draw.params_valid = true;
+
+            u_upload_data(ice->ctx.stream_uploader, 0,
+                          sizeof(ice->draw.params), 4, &ice->draw.params,
+                          &draw_params->offset, &draw_params->res);
+         }
+      }
+   }
+
+   if (ice->state.vs_uses_derived_draw_params) {
+      struct crocus_state_ref *derived_params = &ice->draw.derived_draw_params;
+      /* All bits set (-1) for indexed draws, 0 otherwise. */
+      int is_indexed_draw = info->index_size ? -1 : 0;
+
+      if (ice->draw.derived_params.drawid != drawid_offset ||
+          ice->draw.derived_params.is_indexed_draw != is_indexed_draw) {
+
+         changed = true;
+         ice->draw.derived_params.drawid = drawid_offset;
+         ice->draw.derived_params.is_indexed_draw = is_indexed_draw;
+
+         u_upload_data(ice->ctx.stream_uploader, 0,
+                       sizeof(ice->draw.derived_params), 4,
+                       &ice->draw.derived_params, &derived_params->offset,
+                       &derived_params->res);
+      }
+   }
+
+   if (changed) {
+      ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+                          CROCUS_DIRTY_VERTEX_ELEMENTS;
+   }
+}
+
+/**
+ * Emit an indirect draw as indirect.draw_count separate draws on the render
+ * batch.
+ *
+ * Works on local copies of the draw info and indirect info; the copy of the
+ * indirect info has its offset advanced by the stride after every draw.
+ * The dirty and stage_dirty masks are cleared of render bits after each
+ * draw and restored to their entry values before returning.
+ */
+static void
+crocus_indirect_draw_vbo(struct crocus_context *ice,
+                         const struct pipe_draw_info *dinfo,
+                         unsigned drawid_offset,
+                         const struct pipe_draw_indirect_info *dindirect,
+                         const struct pipe_draw_start_count_bias *draws)
+{
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = batch->screen;
+   struct pipe_draw_info info = *dinfo;
+   struct pipe_draw_indirect_info indirect = *dindirect;
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   /* On Haswell, with an indirect draw count and a bit-based predicate,
+    * stash the predicate result in GPR15 for the duration of the draws.
+    */
+   if (devinfo->is_haswell && indirect.indirect_draw_count &&
+       ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      /* Upload MI_PREDICATE_RESULT to GPR15.*/
+      screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
+   }
+
+   uint64_t orig_dirty = ice->state.dirty;
+   uint64_t orig_stage_dirty = ice->state.stage_dirty;
+
+   for (int i = 0; i < indirect.draw_count; i++) {
+      crocus_batch_maybe_flush(batch, 1500);
+      crocus_require_statebuffer_space(batch, 2400);
+
+      crocus_update_draw_parameters(ice, &info, drawid_offset + i, &indirect, draws);
+
+      screen->vtbl.upload_render_state(ice, batch, &info, drawid_offset + i, &indirect, draws);
+
+      /* State emitted for this draw need not be re-emitted for the next. */
+      ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+      ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+
+      /* Step the local copy to the next indirect record. */
+      indirect.offset += indirect.stride;
+   }
+
+   if (devinfo->is_haswell && indirect.indirect_draw_count &&
+       ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+      /* Restore MI_PREDICATE_RESULT. */
+      screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
+   }
+
+   /* Put this back for post-draw resolves, we'll clear it again after. */
+   ice->state.dirty = orig_dirty;
+   ice->state.stage_dirty = orig_stage_dirty;
+}
+
+/**
+ * Emit one draw on the render batch: make sure there is batch and state
+ * buffer space, refresh the shader draw parameters, then hand the draw to
+ * the screen's upload_render_state hook.
+ */
+static void
+crocus_simple_draw_vbo(struct crocus_context *ice,
+                       const struct pipe_draw_info *draw,
+                       unsigned drawid_offset,
+                       const struct pipe_draw_indirect_info *indirect,
+                       const struct pipe_draw_start_count_bias *sc)
+{
+   struct crocus_batch *render_batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *render_screen = render_batch->screen;
+
+   crocus_batch_maybe_flush(render_batch, 1500);
+   crocus_require_statebuffer_space(render_batch, 2400);
+
+   crocus_update_draw_parameters(ice, draw, drawid_offset, indirect, sc);
+
+   render_screen->vtbl.upload_render_state(ice, render_batch, draw,
+                                           drawid_offset, indirect, sc);
+}
+
+/**
+ * Handle a draw whose vertex count comes from a stream-output target by
+ * turning it into a direct draw.
+ *
+ * The count is obtained from the screen's get_so_offset hook and the draw
+ * is re-issued through ctx->draw_vbo with no indirect info, starting at
+ * vertex 0.  indirect must be non-NULL; crocus_draw_vbo only calls this
+ * when indirect->count_from_stream_output is set.
+ */
+static void
+crocus_draw_vbo_get_vertex_count(struct pipe_context *ctx,
+                                 const struct pipe_draw_info *info_in,
+                                 unsigned drawid_offset,
+                                 const struct pipe_draw_indirect_info *indirect)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   struct pipe_draw_info info = *info_in;
+
+   uint32_t val = screen->vtbl.get_so_offset(indirect->count_from_stream_output);
+
+   /* Designated initializers zero every member not named here, so
+    * index_bias is no longer passed down uninitialized.
+    */
+   struct pipe_draw_start_count_bias draw = {
+      .start = 0,
+      .count = val,
+   };
+
+   ctx->draw_vbo(ctx, &info, drawid_offset, NULL, &draw, 1);
+}
+
+/**
+ * The pipe->draw_vbo() driver hook.  Performs a draw on the GPU.
+ *
+ * Requests with more than one entry in \p draws are handed to
+ * util_draw_multi(); everything below only looks at draws[0].
+ *
+ * The draw is dropped without emitting anything when:
+ *  - it is a direct draw with a zero vertex count or zero instance count,
+ *  - crocus_check_conditional_render() returns false,
+ *  - quad trimming on pre-Gen6 leaves nothing to draw, or
+ *  - crocus_update_compiled_shaders() fails.
+ *
+ * Primitive restart that can_cut_index_handle_prim() rejects is routed
+ * through util_draw_vbo_without_prim_restart(), and on non-Haswell parts a
+ * draw whose count comes from stream output is re-issued by
+ * crocus_draw_vbo_get_vertex_count().
+ *
+ * crocus_handle_always_flush_cache() is called on both sides of the draw
+ * emission, and the render dirty bits are cleared once the draw has been
+ * emitted.
+ */
+void
+crocus_draw_vbo(struct pipe_context *ctx,
+                const struct pipe_draw_info *info,
+                unsigned drawid_offset,
+                const struct pipe_draw_indirect_info *indirect,
+                const struct pipe_draw_start_count_bias *draws,
+                unsigned num_draws)
+{
+   if (num_draws > 1) {
+      util_draw_multi(ctx, info, drawid_offset, indirect, draws, num_draws);
+      return;
+   }
+
+   if (!indirect && (!draws[0].count || !info->instance_count))
+      return;
+
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_screen *screen = (struct crocus_screen*)ice->ctx.screen;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+   if (!crocus_check_conditional_render(ice))
+      return;
+
+   if (info->primitive_restart && !can_cut_index_handle_prim(ice, info)) {
+      util_draw_vbo_without_prim_restart(ctx, info, drawid_offset,
+                                         indirect, draws);
+      return;
+   }
+
+   if (indirect && indirect->count_from_stream_output &&
+       !screen->devinfo.is_haswell) {
+      crocus_draw_vbo_get_vertex_count(ctx, info, drawid_offset, indirect);
+      return;
+   }
+
+   /**
+    * The hardware is capable of removing dangling vertices on its own; however,
+    * prior to Gen6, we sometimes convert quads into trifans (and quad strips
+    * into tristrips), since pre-Gen6 hardware requires a GS to render quads.
+    * This function manually trims dangling vertices from a draw call involving
+    * quads so that those dangling vertices won't get drawn when we convert to
+    * trifans/tristrips.
+    *
+    * Note that the trimmed count is written back through \p draws (the const
+    * qualifier is cast away), and that the draw is skipped entirely when
+    * u_trim_pipe_prim() returns false.
+    */
+   if (screen->devinfo.ver < 6) {
+      if (info->mode == PIPE_PRIM_QUADS || info->mode == PIPE_PRIM_QUAD_STRIP) {
+         bool trim = u_trim_pipe_prim(info->mode, (unsigned *)&draws[0].count);
+         if (!trim)
+            return;
+      }
+   }
+
+   /* We can't safely re-emit 3DSTATE_SO_BUFFERS because it may zero the
+    * write offsets, changing the behavior.  So with DEBUG_REEMIT every
+    * render dirty bit except CROCUS_DIRTY_GEN7_SO_BUFFERS is forced on.
+    */
+   if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER & ~CROCUS_DIRTY_GEN7_SO_BUFFERS;
+      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+   }
+
+   /* Emit Sandybridge workaround flushes on every primitive, for safety. */
+   if (screen->devinfo.ver == 6)
+      crocus_emit_post_sync_nonzero_flush(batch);
+
+   crocus_update_draw_info(ice, info, draws);
+
+   if (!crocus_update_compiled_shaders(ice))
+      return;
+
+   if (ice->state.dirty & CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES) {
+      bool draw_aux_buffer_disabled[BRW_MAX_DRAW_BUFFERS] = { };
+      for (gl_shader_stage stage = 0; stage < MESA_SHADER_COMPUTE; stage++) {
+         if (ice->shaders.prog[stage])
+            crocus_predraw_resolve_inputs(ice, batch, draw_aux_buffer_disabled,
+                                          stage, true);
+      }
+      crocus_predraw_resolve_framebuffer(ice, batch, draw_aux_buffer_disabled);
+   }
+
+   crocus_handle_always_flush_cache(batch);
+
+   if (indirect && indirect->buffer)
+      crocus_indirect_draw_vbo(ice, info, drawid_offset, indirect, draws);
+   else
+      crocus_simple_draw_vbo(ice, info, drawid_offset, indirect, draws);
+
+   crocus_handle_always_flush_cache(batch);
+
+   crocus_postdraw_update_resolve_tracking(ice, batch);
+
+   ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_RENDER;
+   ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+}
+
+/**
+ * Keep ice->state.grid_size pointing at a buffer holding the grid
+ * dimensions for the upcoming compute dispatch.
+ *
+ * Indirect launches reference the caller's indirect buffer directly; direct
+ * launches upload \p grid->grid through the constant uploader, but only
+ * when it differs from the last uploaded value.  If the bound compute
+ * shader uses the work-group surface, the CS bindings are flagged dirty.
+ */
+static void
+crocus_update_grid_size_resource(struct crocus_context *ice,
+                                 const struct pipe_grid_info *grid)
+{
+   const struct crocus_compiled_shader *cs = ice->shaders.prog[MESA_SHADER_COMPUTE];
+   const bool wants_grid_surface =
+      cs->bt.used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS];
+   struct crocus_state_ref *size_ref = &ice->state.grid_size;
+
+   if (grid->indirect) {
+      pipe_resource_reference(&size_ref->res, grid->indirect);
+      size_ref->offset = grid->indirect_offset;
+
+      /* Forget the cached size so that a later direct launch always
+       * uploads its dimensions again.
+       */
+      memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid));
+   } else {
+      const bool size_changed =
+         memcmp(ice->state.last_grid, grid->grid, sizeof(grid->grid)) != 0;
+
+      if (size_changed) {
+         memcpy(ice->state.last_grid, grid->grid, sizeof(grid->grid));
+         u_upload_data(ice->ctx.const_uploader, 0, sizeof(grid->grid), 4,
+                       grid->grid, &size_ref->offset, &size_ref->res);
+      }
+   }
+
+   /* Only shaders that read the grid size through a surface need their
+    * binding table refreshed.
+    */
+   if (wants_grid_surface)
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_CS;
+}
+
+/**
+ * Launch a compute grid described by \p grid.
+ *
+ * Commands are recorded on the compute batch, except for input resolves,
+ * which are issued on the render batch.  Nothing is emitted when
+ * crocus_check_conditional_render() returns false.
+ *
+ * The block size is compared against the last one seen so that the CS
+ * constants/sysvals are only flagged for re-upload when it changes.  A
+ * pending compute predicate is emitted once and then cleared.  The compute
+ * dirty bits are cleared after the screen's upload_compute_state() hook
+ * has run.
+ */
+void
+crocus_launch_grid(struct pipe_context *ctx, const struct pipe_grid_info *grid)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_COMPUTE];
+   struct crocus_screen *screen = batch->screen;
+
+   if (!crocus_check_conditional_render(ice))
+      return;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_REEMIT)) {
+      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
+      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+   }
+
+   /* We can't do resolves on the compute engine, so awkwardly, we have to
+    * do them on the render batch...
+    */
+   if (ice->state.dirty & CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES) {
+      crocus_predraw_resolve_inputs(ice, &ice->batches[CROCUS_BATCH_RENDER], NULL,
+                                    MESA_SHADER_COMPUTE, false);
+   }
+
+   crocus_batch_maybe_flush(batch, 1500);
+   crocus_require_statebuffer_space(batch, 2500);
+   crocus_update_compiled_compute_shader(ice);
+
+   if (memcmp(ice->state.last_block, grid->block, sizeof(grid->block)) != 0) {
+      memcpy(ice->state.last_block, grid->block, sizeof(grid->block));
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+      ice->state.shaders[MESA_SHADER_COMPUTE].sysvals_need_upload = true;
+   }
+
+   crocus_update_grid_size_resource(ice, grid);
+
+   if (ice->state.compute_predicate) {
+      screen->vtbl.emit_compute_predicate(batch);
+      ice->state.compute_predicate = NULL;
+   }
+
+   crocus_handle_always_flush_cache(batch);
+
+   screen->vtbl.upload_compute_state(ice, batch, grid);
+
+   crocus_handle_always_flush_cache(batch);
+
+   ice->state.dirty &= ~CROCUS_ALL_DIRTY_FOR_COMPUTE;
+   ice->state.stage_dirty &= ~CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+
+   /* Note: since compute shaders can't access the framebuffer, there's
+    * no need to call crocus_postdraw_update_resolve_tracking.
+    */
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.c b/src/gallium/drivers/crocus/crocus_fence.c
new file mode 100644
index 00000000000..fdff24b2dd4
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_fence.c
+ *
+ * Fences for driver and IPC serialisation, scheduling and synchronisation.
+ */
+
+#include "util/u_inlines.h"
+#include "intel/common/intel_gem.h"
+
+#include "crocus_batch.h"
+#include "crocus_bufmgr.h"
+#include "crocus_context.h"
+#include "crocus_fence.h"
+#include "crocus_screen.h"
+
+/**
+ * Ask the kernel for a new DRM sync object on \p fd.
+ *
+ * \p flags is passed through as drm_syncobj_create::flags.  The ioctl
+ * result is not checked: the returned handle is whatever is left in the
+ * zero-initialised argument struct after the call.
+ */
+static uint32_t
+gem_syncobj_create(int fd, uint32_t flags)
+{
+   struct drm_syncobj_create create = { 0 };
+
+   create.flags = flags;
+   intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_CREATE, &create);
+
+   return create.handle;
+}
+
+/**
+ * Release the DRM sync object \p handle on \p fd.
+ *
+ * The ioctl result is ignored.
+ */
+static void
+gem_syncobj_destroy(int fd, uint32_t handle)
+{
+   struct drm_syncobj_destroy destroy = { 0 };
+
+   destroy.handle = handle;
+   intel_ioctl(fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy);
+}
+
+/**
+ * Make a new sync-point.
+ *
+ * Allocates a reference-counted wrapper around a freshly created DRM sync
+ * object.  Returns NULL if the wrapper cannot be allocated; otherwise the
+ * caller owns the single initial reference.
+ */
+struct crocus_syncobj *
+crocus_create_syncobj(struct crocus_screen *screen)
+{
+   struct crocus_syncobj *obj = malloc(sizeof(*obj));
+
+   if (obj != NULL) {
+      obj->handle = gem_syncobj_create(screen->fd, 0);
+      assert(obj->handle);
+
+      pipe_reference_init(&obj->ref, 1);
+   }
+
+   return obj;
+}
+
+/**
+ * Destroy the kernel sync object behind \p syncobj and free the wrapper.
+ *
+ * Called by crocus_syncobj_reference() when the last reference goes away.
+ */
+void
+crocus_syncobj_destroy(struct crocus_screen *screen,
+                       struct crocus_syncobj *syncobj)
+{
+   const int drm_fd = screen->fd;
+
+   gem_syncobj_destroy(drm_fd, syncobj->handle);
+   free(syncobj);
+}
+
+/**
+ * Add a sync-point to the batch, with the given flags.
+ *
+ * One entry is appended to batch->exec_fences and one to batch->syncobjs,
+ * keeping the two arrays the same length.  The batch takes its own
+ * reference on \p syncobj.
+ *
+ * \p flags   One of I915_EXEC_FENCE_WAIT or I915_EXEC_FENCE_SIGNAL.
+ */
+void
+crocus_batch_add_syncobj(struct crocus_batch *batch,
+                         struct crocus_syncobj *syncobj, unsigned flags)
+{
+   const struct drm_i915_gem_exec_fence entry = {
+      .handle = syncobj->handle,
+      .flags = flags,
+   };
+
+   struct drm_i915_gem_exec_fence *exec_slot =
+      util_dynarray_grow(&batch->exec_fences, struct drm_i915_gem_exec_fence, 1);
+   *exec_slot = entry;
+
+   struct crocus_syncobj **ref_slot =
+      util_dynarray_grow(&batch->syncobjs, struct crocus_syncobj *, 1);
+
+   /* The freshly grown slot must be cleared first, so that taking the
+    * reference does not try to release whatever garbage it held.
+    */
+   *ref_slot = NULL;
+   crocus_syncobj_reference(batch->screen, ref_slot, syncobj);
+}
+
+/**
+ * Walk through a batch's dependencies (any I915_EXEC_FENCE_WAIT syncobjs)
+ * and unreference any which have already passed.
+ *
+ * Sometimes the compute batch is seldom used, and accumulates references
+ * to stale render batches that are no longer of interest, so we can free
+ * those up.
+ *
+ * batch->syncobjs and batch->exec_fences are parallel arrays.  An entry is
+ * removed from both by popping the last element and moving it into the
+ * freed slot, which is why the walk runs from the back towards the front.
+ */
+static void
+clear_stale_syncobjs(struct crocus_batch *batch)
+{
+   struct crocus_screen *screen = batch->screen;
+
+   int n = util_dynarray_num_elements(&batch->syncobjs, struct crocus_syncobj *);
+
+   assert(n == util_dynarray_num_elements(&batch->exec_fences,
+                                          struct drm_i915_gem_exec_fence));
+
+   /* Skip the first syncobj (index 0), as it's the signalling one.  Every
+    * other entry, index 1 included, is a candidate for removal.
+    */
+   for (int i = n - 1; i > 0; i--) {
+      struct crocus_syncobj **syncobj =
+         util_dynarray_element(&batch->syncobjs, struct crocus_syncobj *, i);
+      struct drm_i915_gem_exec_fence *fence =
+         util_dynarray_element(&batch->exec_fences,
+                               struct drm_i915_gem_exec_fence, i);
+      assert(fence->flags & I915_EXEC_FENCE_WAIT);
+
+      /* crocus_wait_syncobj() returns true when the zero-timeout wait did
+       * not succeed, i.e. the sync object is still pending: keep it.
+       */
+      if (crocus_wait_syncobj(&screen->base, *syncobj, 0))
+         continue;
+
+      /* This sync object has already passed, there's no need to continue
+       * marking it as a dependency; we can stop holding on to the reference.
+       */
+      crocus_syncobj_reference(screen, syncobj, NULL);
+
+      /* Remove it from the lists; move the last element here. */
+      struct crocus_syncobj **nth_syncobj =
+         util_dynarray_pop_ptr(&batch->syncobjs, struct crocus_syncobj *);
+      struct drm_i915_gem_exec_fence *nth_fence =
+         util_dynarray_pop_ptr(&batch->exec_fences,
+                               struct drm_i915_gem_exec_fence);
+
+      /* Nothing to move when the removed entry was itself the last one. */
+      if (syncobj != nth_syncobj) {
+         *syncobj = *nth_syncobj;
+         memcpy(fence, nth_fence, sizeof(*fence));
+      }
+   }
+}
+
+/* ------------------------------------------------------------------- */
+/**
+ * Driver-side definition of struct pipe_fence_handle.
+ *
+ * Reference counted through \c ref; crocus_fence_reference() calls
+ * crocus_fence_destroy() when the last reference is dropped.
+ */
+struct pipe_fence_handle {
+   struct pipe_reference ref;
+   /**
+    * Context that created this fence with PIPE_FLUSH_DEFERRED, or NULL.
+    * crocus_fence_finish() resets it to NULL once it has been called with
+    * that same context.
+    */
+   struct pipe_context *unflushed_ctx;
+   /**
+    * One slot per batch.  A NULL slot is treated as already signaled by
+    * crocus_fine_fence_signaled().
+    */
+   struct crocus_fine_fence *fine[CROCUS_BATCH_COUNT];
+};
+
+/**
+ * Drop every fine-fence reference held by \p fence, then free it.
+ */
+static void
+crocus_fence_destroy(struct pipe_screen *p_screen,
+                     struct pipe_fence_handle *fence)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+   struct crocus_fine_fence **slot = fence->fine;
+   struct crocus_fine_fence **const end = slot + ARRAY_SIZE(fence->fine);
+
+   for (; slot != end; slot++)
+      crocus_fine_fence_reference(screen, slot, NULL);
+
+   free(fence);
+}
+
+/**
+ * The pipe_screen->fence_reference() hook.
+ *
+ * Makes \p *dst refer to \p src, destroying the previous fence if
+ * pipe_reference() reports that its last reference was just released.
+ */
+static void
+crocus_fence_reference(struct pipe_screen *p_screen,
+                       struct pipe_fence_handle **dst,
+                       struct pipe_fence_handle *src)
+{
+   struct pipe_fence_handle *old = *dst;
+
+   if (pipe_reference(&old->ref, &src->ref))
+      crocus_fence_destroy(p_screen, old);
+
+   *dst = src;
+}
+
+/**
+ * Issue DRM_IOCTL_SYNCOBJ_WAIT on a single sync object.
+ *
+ * \p timeout_nsec is passed through as drm_syncobj_wait::timeout_nsec.
+ *
+ * Beware of the return convention: the value is the ioctl result converted
+ * to bool, so it is false when the ioctl returned 0 and true when it
+ * returned non-zero.  A NULL \p syncobj yields false without calling into
+ * the kernel.
+ */
+bool
+crocus_wait_syncobj(struct pipe_screen *p_screen,
+                    struct crocus_syncobj *syncobj, int64_t timeout_nsec)
+{
+   if (syncobj == NULL)
+      return false;
+
+   struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+   struct drm_syncobj_wait wait = {
+      .handles = (uintptr_t)&syncobj->handle,
+      .count_handles = 1,
+      .timeout_nsec = timeout_nsec,
+   };
+
+   const int ret = intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait);
+
+   return ret != 0;
+}
+
+/**
+ * The pipe_context->flush() hook.
+ *
+ * Unless PIPE_FLUSH_DEFERRED is set, every batch of the context is
+ * flushed.  When \p out_fence is non-NULL a new fence is built with one
+ * fine fence per batch and stored there, replacing (and unreferencing)
+ * whatever \p *out_fence held before.  If the fence cannot be allocated,
+ * \p *out_fence is left untouched.
+ */
+static void
+crocus_fence_flush(struct pipe_context *ctx,
+                   struct pipe_fence_handle **out_fence, unsigned flags)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_screen *screen = (void *)ctx->screen;
+   const bool is_deferred = (flags & PIPE_FLUSH_DEFERRED) != 0;
+
+   if (!is_deferred) {
+      for (unsigned b = 0; b < ice->batch_count; b++)
+         crocus_batch_flush(&ice->batches[b]);
+   }
+
+   if (out_fence == NULL)
+      return;
+
+   struct pipe_fence_handle *new_fence = calloc(1, sizeof(*new_fence));
+   if (new_fence == NULL)
+      return;
+
+   pipe_reference_init(&new_fence->ref, 1);
+
+   /* Remember the owning context only for deferred fences. */
+   new_fence->unflushed_ctx = is_deferred ? ctx : NULL;
+
+   for (unsigned b = 0; b < ice->batch_count; b++) {
+      struct crocus_batch *batch = &ice->batches[b];
+      struct crocus_fine_fence **slot = &new_fence->fine[b];
+
+      if (is_deferred && crocus_batch_bytes_used(batch) > 0) {
+         /* Work is still queued: drop a new fine fence into the batch and
+          * hand our temporary reference over to the slot.
+          */
+         struct crocus_fine_fence *queued =
+            crocus_fine_fence_new(batch, CROCUS_FENCE_BOTTOM_OF_PIPE);
+         crocus_fine_fence_reference(screen, slot, queued);
+         crocus_fine_fence_reference(screen, &queued, NULL);
+      } else if (!crocus_fine_fence_signaled(batch->last_fence)) {
+         /* Nothing is queued on this batch (perhaps we just flushed, or
+          * all the commands are on the other batch), so wait on the last
+          * fence of this engine.  If it has already signaled, the slot
+          * simply stays empty.
+          */
+         crocus_fine_fence_reference(screen, slot, batch->last_fence);
+      }
+   }
+
+   crocus_fence_reference(ctx->screen, out_fence, NULL);
+   *out_fence = new_fence;
+}
+
+/**
+ * The pipe_context->fence_server_sync() hook.
+ *
+ * Makes future work on every batch of \p ctx wait for each fine fence of
+ * \p fence that has not signaled yet.
+ */
+static void
+crocus_fence_await(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+   /* Unflushed fences from the same context are no-ops. */
+   if (ctx != NULL && fence->unflushed_ctx == ctx)
+      return;
+
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+
+   for (unsigned slot = 0; slot < ARRAY_SIZE(fence->fine); slot++) {
+      struct crocus_fine_fence *pending = fence->fine[slot];
+
+      if (!crocus_fine_fence_signaled(pending)) {
+         for (unsigned b = 0; b < ice->batch_count; b++) {
+            struct crocus_batch *batch = &ice->batches[b];
+
+            /* Only work submitted from now on has to wait for the fence.
+             * Flush what is already queued so it can start sooner.
+             */
+            crocus_batch_flush(batch);
+
+            /* Reap dependencies that have already passed before adding
+             * another one.
+             */
+            clear_stale_syncobjs(batch);
+
+            crocus_batch_add_syncobj(batch, pending->syncobj,
+                                     I915_EXEC_FENCE_WAIT);
+         }
+      }
+   }
+}
+
+/* Time unit conversion factors, each built on the previous one. */
+#define MSEC_PER_SEC (1000)
+#define USEC_PER_SEC (1000 * MSEC_PER_SEC)
+#define NSEC_PER_SEC (1000 * USEC_PER_SEC)
+
+/**
+ * Read CLOCK_MONOTONIC and return it as a nanosecond count.
+ */
+static uint64_t
+gettime_ns(void)
+{
+   struct timespec now;
+
+   clock_gettime(CLOCK_MONOTONIC, &now);
+
+   const uint64_t whole_seconds_ns = (uint64_t)now.tv_sec * NSEC_PER_SEC;
+
+   return whole_seconds_ns + now.tv_nsec;
+}
+
+/**
+ * Turn a relative timeout in nanoseconds into an absolute time based on
+ * gettime_ns().
+ *
+ * A timeout of 0 is returned as 0.  Otherwise the timeout is clamped so
+ * that the resulting absolute time does not exceed INT64_MAX.
+ */
+static uint64_t
+rel2abs(uint64_t timeout)
+{
+   if (timeout == 0)
+      return 0;
+
+   const uint64_t now = gettime_ns();
+   const uint64_t headroom = (uint64_t)INT64_MAX - now;
+
+   return now + MIN2(headroom, timeout);
+}
+
+/**
+ * The pipe_screen->fence_finish() hook.
+ *
+ * Waits up to \p timeout nanoseconds (relative) for every fine fence of
+ * \p fence that has not signaled yet.  Returns true when nothing is left
+ * to wait for or when the DRM_IOCTL_SYNCOBJ_WAIT ioctl returns 0.
+ */
+static bool
+crocus_fence_finish(struct pipe_screen *p_screen, struct pipe_context *ctx,
+                    struct pipe_fence_handle *fence, uint64_t timeout)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+
+   /* A fence created with PIPE_FLUSH_DEFERRED may refer to work that has
+    * not been submitted yet.  If a fine fence still uses the signalling
+    * syncobj of the current batch, that batch has not been flushed and we
+    * do so now.
+    *
+    * The Gallium docs mention that a flush will occur if \p ctx matches
+    * the context the fence was created with.  \p ctx may be NULL, hence
+    * the explicit check.
+    */
+   if (ctx != NULL && fence->unflushed_ctx == ctx) {
+      for (unsigned b = 0; b < ice->batch_count; b++) {
+         struct crocus_batch *batch = &ice->batches[b];
+         struct crocus_fine_fence *fine = fence->fine[b];
+
+         if (!crocus_fine_fence_signaled(fine) &&
+             fine->syncobj == crocus_batch_get_signal_syncobj(batch))
+            crocus_batch_flush(batch);
+      }
+
+      /* The fence is no longer deferred. */
+      fence->unflushed_ctx = NULL;
+   }
+
+   /* Gather the syncobj handles we still have to wait on. */
+   uint32_t pending[ARRAY_SIZE(fence->fine)];
+   unsigned int num_pending = 0;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+      struct crocus_fine_fence *fine = fence->fine[i];
+
+      if (!crocus_fine_fence_signaled(fine))
+         pending[num_pending++] = fine->syncobj->handle;
+   }
+
+   if (num_pending == 0)
+      return true;
+
+   uint32_t wait_flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL;
+
+   if (fence->unflushed_ctx) {
+      /* This fence had a deferred flush from another context.  We can't
+       * safely flush it here, because the context might be bound to a
+       * different thread, and poking at its internals wouldn't be safe.
+       *
+       * Instead, use the WAIT_FOR_SUBMIT flag to block and hope that
+       * another thread submits the work.
+       */
+      wait_flags |= DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
+   }
+
+   struct drm_syncobj_wait args = {
+      .handles = (uintptr_t)pending,
+      .count_handles = num_pending,
+      .timeout_nsec = rel2abs(timeout),
+      .flags = wait_flags,
+   };
+
+   const int ret = intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
+
+   return ret == 0;
+}
+
+#ifndef SYNC_IOC_MAGIC
+/* Duplicated from linux/sync_file.h to avoid a build-time dependency
+ * on new (v4.7) kernel headers.  Once distros are mostly shipping
+ * something newer than v4.7, drop this and #include <linux/sync_file.h>
+ * instead.
+ *
+ * Only compiled when SYNC_IOC_MAGIC is not already defined.  In this file
+ * the struct is the argument of the SYNC_IOC_MERGE ioctl issued by
+ * sync_merge_fd(): fd2 carries the second fence fd, and the merged fd is
+ * read back from the fence member.
+ */
+struct sync_merge_data {
+   char name[32];
+   __s32 fd2;
+   __s32 fence;
+   __u32 flags;
+   __u32 pad;
+};
+
+#define SYNC_IOC_MAGIC '>'
+#define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data)
+#endif
+
+/**
+ * Merge two sync-file fds into one.
+ *
+ * If either fd is -1 the other one is returned untouched.  Otherwise both
+ * inputs are closed and the fd produced by SYNC_IOC_MERGE is returned; the
+ * ioctl result is not checked, so -1 comes back if it filled nothing in.
+ */
+static int
+sync_merge_fd(int sync_fd, int new_fd)
+{
+   const bool have_old = sync_fd != -1;
+   const bool have_new = new_fd != -1;
+
+   if (!have_old)
+      return new_fd;
+
+   if (!have_new)
+      return sync_fd;
+
+   struct sync_merge_data merge = {
+      .name = "crocus fence",
+      .fd2 = new_fd,
+      .fence = -1,
+   };
+
+   intel_ioctl(sync_fd, SYNC_IOC_MERGE, &merge);
+
+   /* Ownership of both inputs ends here, whatever the ioctl did. */
+   close(new_fd);
+   close(sync_fd);
+
+   return merge.fence;
+}
+
+/**
+ * The pipe_screen->fence_get_fd() hook.
+ *
+ * Exports every unsignaled fine fence of \p fence as a sync file and
+ * merges them into a single fd.  Returns -1 for deferred fences.  When no
+ * fine fence needed exporting, a sync file for a dummy, already-signaled
+ * sync object is returned instead.
+ */
+static int
+crocus_fence_get_fd(struct pipe_screen *p_screen,
+                    struct pipe_fence_handle *fence)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)p_screen;
+
+   /* Deferred fences aren't supported. */
+   if (fence->unflushed_ctx)
+      return -1;
+
+   int merged_fd = -1;
+
+   for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+      struct crocus_fine_fence *fine = fence->fine[i];
+
+      if (crocus_fine_fence_signaled(fine))
+         continue;
+
+      struct drm_syncobj_handle export_args = {
+         .handle = fine->syncobj->handle,
+         .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+         .fd = -1,
+      };
+
+      intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &export_args);
+      merged_fd = sync_merge_fd(merged_fd, export_args.fd);
+   }
+
+   if (merged_fd != -1)
+      return merged_fd;
+
+   /* Our fence has no syncobj's recorded.  This means that all of the
+    * batches had already completed, their syncobj's had been signalled,
+    * and so we didn't bother to record them.  But we're being asked to
+    * export such a fence.  So export a dummy already-signalled syncobj.
+    */
+   struct drm_syncobj_handle dummy_args = {
+      .flags = DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE,
+      .fd = -1,
+   };
+
+   dummy_args.handle =
+      gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+   intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &dummy_args);
+   gem_syncobj_destroy(screen->fd, dummy_args.handle);
+
+   return dummy_args.fd;
+}
+
+/**
+ * The pipe_context->create_fence_fd() hook: wrap an external fd in a fence.
+ *
+ * \param ctx   context the fence is created for
+ * \param out   receives the new fence, or NULL on any failure
+ * \param fd    sync file (PIPE_FD_TYPE_NATIVE_SYNC) or syncobj fd
+ *              (PIPE_FD_TYPE_SYNCOBJ) to import
+ * \param type  kind of \p fd
+ *
+ * For a native sync file, a signaled sync object is created first and the
+ * sync file is imported into it.  On every failure path after a handle has
+ * been obtained, the kernel sync object is destroyed again so it does not
+ * leak.
+ */
+static void
+crocus_fence_create_fd(struct pipe_context *ctx, struct pipe_fence_handle **out,
+                       int fd, enum pipe_fd_type type)
+{
+   assert(type == PIPE_FD_TYPE_NATIVE_SYNC || type == PIPE_FD_TYPE_SYNCOBJ);
+
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   struct drm_syncobj_handle args = {
+      .fd = fd,
+   };
+
+   if (type == PIPE_FD_TYPE_NATIVE_SYNC) {
+      args.flags = DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE;
+      args.handle = gem_syncobj_create(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED);
+   }
+
+   if (intel_ioctl(screen->fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &args) == -1) {
+      fprintf(stderr, "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE failed: %s\n",
+              strerror(errno));
+      if (type == PIPE_FD_TYPE_NATIVE_SYNC)
+         gem_syncobj_destroy(screen->fd, args.handle);
+      *out = NULL;
+      return;
+   }
+
+   /* From here on args.handle names a live kernel sync object that we own.
+    * Allocate all three wrappers up front so there is a single failure path
+    * which releases both the host memory and the kernel handle.
+    */
+   struct crocus_syncobj *syncobj = malloc(sizeof(*syncobj));
+   struct crocus_fine_fence *fine = calloc(1, sizeof(*fine));
+   struct pipe_fence_handle *fence = calloc(1, sizeof(*fence));
+
+   if (!syncobj || !fine || !fence) {
+      free(fence);
+      free(fine);
+      free(syncobj);
+      gem_syncobj_destroy(screen->fd, args.handle);
+      *out = NULL;
+      return;
+   }
+
+   syncobj->handle = args.handle;
+   pipe_reference_init(&syncobj->ref, 1);
+
+   static const uint32_t zero = 0;
+
+   /* Fences work in terms of crocus_fine_fence, but we don't actually have a
+    * seqno for an imported fence.  So, create a fake one which always
+    * returns as 'not signaled' so we fall back to using the sync object.
+    */
+   fine->seqno = UINT32_MAX;
+   fine->map = &zero;
+   fine->syncobj = syncobj;
+   fine->flags = CROCUS_FENCE_END;
+   pipe_reference_init(&fine->reference, 1);
+
+   pipe_reference_init(&fence->ref, 1);
+   fence->fine[0] = fine;
+
+   *out = fence;
+}
+
+/**
+ * The pipe_context->fence_server_signal() hook.
+ *
+ * Attaches the syncobj of every unsignaled fine fence of \p fence to every
+ * batch of \p ctx with I915_EXEC_FENCE_SIGNAL, and marks those batches as
+ * containing a fence signal.  Does nothing when \p fence is a deferred
+ * fence of this very context.
+ */
+static void
+crocus_fence_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
+{
+   if (fence->unflushed_ctx == ctx)
+      return;
+
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+
+   for (unsigned b = 0; b < ice->batch_count; b++) {
+      struct crocus_batch *batch = &ice->batches[b];
+
+      for (unsigned i = 0; i < ARRAY_SIZE(fence->fine); i++) {
+         struct crocus_fine_fence *fine = fence->fine[i];
+
+         /* Fine fences that already signaled need nothing from us. */
+         if (!crocus_fine_fence_signaled(fine)) {
+            batch->contains_fence_signal = true;
+            crocus_batch_add_syncobj(batch, fine->syncobj,
+                                     I915_EXEC_FENCE_SIGNAL);
+         }
+      }
+   }
+}
+
+/**
+ * Install the fence-related pipe_screen hooks implemented in this file.
+ */
+void
+crocus_init_screen_fence_functions(struct pipe_screen *screen)
+{
+   screen->fence_get_fd = crocus_fence_get_fd;
+   screen->fence_finish = crocus_fence_finish;
+   screen->fence_reference = crocus_fence_reference;
+}
+
+/**
+ * Install the fence-related pipe_context hooks implemented in this file.
+ */
+void
+crocus_init_context_fence_functions(struct pipe_context *ctx)
+{
+   ctx->fence_server_signal = crocus_fence_signal;
+   ctx->fence_server_sync = crocus_fence_await;
+   ctx->create_fence_fd = crocus_fence_create_fd;
+   ctx->flush = crocus_fence_flush;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fence.h b/src/gallium/drivers/crocus/crocus_fence.h
new file mode 100644
index 00000000000..ef2eff5259b
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fence.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FENCE_H
+#define CROCUS_FENCE_H
+
+#include "util/u_inlines.h"
+
+struct pipe_screen;
+struct crocus_screen;
+struct crocus_batch;
+
+struct crocus_syncobj {
+   struct pipe_reference ref; /**< Reference count. */
+   uint32_t handle; /**< DRM sync object handle given to the kernel. */
+};
+
+void crocus_init_context_fence_functions(struct pipe_context *ctx);
+void crocus_init_screen_fence_functions(struct pipe_screen *screen);
+
+struct crocus_syncobj *crocus_create_syncobj(struct crocus_screen *screen);
+void crocus_syncobj_destroy(struct crocus_screen *, struct crocus_syncobj *);
+void crocus_batch_add_syncobj(struct crocus_batch *batch,
+                              struct crocus_syncobj *syncobj,
+                              unsigned flags);
+bool crocus_wait_syncobj(struct pipe_screen *screen,
+                         struct crocus_syncobj *syncobj,
+                         int64_t timeout_nsec);
+/**
+ * Make \p *dst refer to \p src, destroying the sync object previously held
+ * in \p *dst if pipe_reference() reports its last reference was released.
+ */
+static inline void
+crocus_syncobj_reference(struct crocus_screen *screen,
+                         struct crocus_syncobj **dst,
+                         struct crocus_syncobj *src)
+{
+   struct crocus_syncobj *old = *dst;
+
+   if (pipe_reference(&old->ref, &src->ref))
+      crocus_syncobj_destroy(screen, old);
+
+   *dst = src;
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.c b/src/gallium/drivers/crocus/crocus_fine_fence.c
new file mode 100644
index 00000000000..9bb8a9673e3
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.c
@@ -0,0 +1,85 @@
+#include "crocus_context.h"
+#include "crocus_fine_fence.h"
+#include "util/u_upload_mgr.h"
+
+/**
+ * Allocate a fresh, zeroed seqno slot for \p batch from its fine-fence
+ * uploader and advance the next sequence number by one.
+ */
+static void
+crocus_fine_fence_reset(struct crocus_batch *batch)
+{
+   const unsigned slot_size = sizeof(uint64_t);
+
+   u_upload_alloc(batch->fine_fences.uploader, 0, slot_size, slot_size,
+                  &batch->fine_fences.ref.offset,
+                  &batch->fine_fences.ref.res,
+                  (void **)&batch->fine_fences.map);
+
+   WRITE_ONCE(*batch->fine_fences.map, 0);
+   batch->fine_fences.next++;
+}
+
+/**
+ * Initialise the fine-fence state of \p batch.
+ *
+ * A seqno slot is only allocated when batch_has_fine_fence() says the
+ * batch uses fine fences.
+ */
+void
+crocus_fine_fence_init(struct crocus_batch *batch)
+{
+   batch->fine_fences.next = 0;
+   batch->fine_fences.ref.res = NULL;
+
+   if (!batch_has_fine_fence(batch))
+      return;
+
+   crocus_fine_fence_reset(batch);
+}
+
+/**
+ * Hand out the next sequence number of \p batch.
+ *
+ * Batches without fine fences always get UINT32_MAX.  When the counter
+ * wraps around to 0, a new seqno slot is allocated.
+ */
+static uint32_t
+crocus_fine_fence_next(struct crocus_batch *batch)
+{
+   if (!batch_has_fine_fence(batch))
+      return UINT32_MAX;
+
+   uint32_t seqno = batch->fine_fences.next;
+
+   batch->fine_fences.next++;
+
+   const bool wrapped = batch->fine_fences.next == 0;
+   if (wrapped)
+      crocus_fine_fence_reset(batch);
+
+   return seqno;
+}
+
+/**
+ * Release the sync object and seqno buffer references held by \p fine and
+ * free it.
+ */
+void
+crocus_fine_fence_destroy(struct crocus_screen *screen,
+                          struct crocus_fine_fence *fine)
+{
+   struct crocus_syncobj **syncobj_ref = &fine->syncobj;
+   struct pipe_resource **buffer_ref = &fine->ref.res;
+
+   crocus_syncobj_reference(screen, syncobj_ref, NULL);
+   pipe_resource_reference(buffer_ref, NULL);
+
+   free(fine);
+}
+
+/**
+ * Create a fine fence for the current position in \p batch.
+ *
+ * The fence always references the batch's signalling sync object.  When
+ * the batch uses fine fences, it additionally references the seqno buffer
+ * and a PIPE_CONTROL is emitted that writes the new seqno into it; \p flags
+ * (see CROCUS_FENCE_*) selects which kind of flush accompanies the write.
+ *
+ * Returns NULL if the fence cannot be allocated.
+ */
+struct crocus_fine_fence *
+crocus_fine_fence_new(struct crocus_batch *batch, unsigned flags)
+{
+   struct crocus_fine_fence *fence = calloc(1, sizeof(*fence));
+
+   if (fence == NULL)
+      return NULL;
+
+   pipe_reference_init(&fence->reference, 1);
+
+   fence->seqno = crocus_fine_fence_next(batch);
+
+   crocus_syncobj_reference(batch->screen, &fence->syncobj,
+                            crocus_batch_get_signal_syncobj(batch));
+
+   if (batch_has_fine_fence(batch)) {
+      pipe_resource_reference(&fence->ref.res, batch->fine_fences.ref.res);
+      fence->ref.offset = batch->fine_fences.ref.offset;
+      fence->map = batch->fine_fences.map;
+      fence->flags = flags;
+
+      /* Both variants write the seqno; they differ in what is stalled or
+       * flushed alongside the write.
+       */
+      unsigned pc = PIPE_CONTROL_WRITE_IMMEDIATE;
+
+      if (flags & CROCUS_FENCE_TOP_OF_PIPE) {
+         pc |= PIPE_CONTROL_CS_STALL;
+      } else {
+         pc |= PIPE_CONTROL_RENDER_TARGET_FLUSH |
+               PIPE_CONTROL_TILE_CACHE_FLUSH |
+               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+               PIPE_CONTROL_DATA_CACHE_FLUSH;
+      }
+
+      crocus_emit_pipe_control_write(batch, "fence: fine", pc,
+                                     crocus_resource_bo(fence->ref.res),
+                                     fence->ref.offset,
+                                     fence->seqno);
+   }
+
+   return fence;
+}
diff --git a/src/gallium/drivers/crocus/crocus_fine_fence.h b/src/gallium/drivers/crocus/crocus_fine_fence.h
new file mode 100644
index 00000000000..ad6f02a945a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_fine_fence.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2020 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_FINE_FENCE_DOT_H
+#define CROCUS_FINE_FENCE_DOT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "crocus_screen.h"
+#include "crocus_resource.h"
+
+/**
+ * A lightweight sequence number fence.
+ *
+ * We emit PIPE_CONTROLs inside a batch (possibly in the middle)
+ * which update a monotonically increasing, 32-bit counter.  We
+ * can then check if that moment has passed by either:
+ *
+ * 1. Checking on the CPU by snooping on the DWord via a coherent map
+ *
+ * 2. Blocking on the GPU with MI_SEMAPHORE_WAIT from a second batch
+ *    (relying on mid-batch preemption to switch GPU execution to the
+ *    batch that writes it).
+ */
+struct crocus_fine_fence {
+   struct pipe_reference reference;
+
+   /** Buffer where the seqno lives */
+   struct crocus_state_ref ref;
+
+   /**
+    * Coherent CPU map of the buffer containing the seqno DWord.
+    *
+    * When this is NULL, crocus_fine_fence_signaled() reports the fence as
+    * not signaled.
+    */
+   const uint32_t *map;
+
+   /**
+    * A drm_syncobj which will be signaled at the end of the batch that
+    * writes this seqno.  This can be used to block until the seqno has
+    * definitely passed (but may wait longer than necessary).
+    */
+   struct crocus_syncobj *syncobj;
+
+#define CROCUS_FENCE_BOTTOM_OF_PIPE 0x0 /**< Written by bottom-of-pipe flush */
+#define CROCUS_FENCE_TOP_OF_PIPE    0x1 /**< Written by top-of-pipe flush */
+#define CROCUS_FENCE_END            0x2 /**< Written at the end of a batch */
+
+   /** Information about the type of flush involved (see CROCUS_FENCE_*) */
+   uint32_t flags;
+
+   /**
+    * Sequence number expected to be written by the flush we inserted
+    * when creating this fence.  The crocus_fine_fence is 'signaled' when *@map
+    * (written by the flush on the GPU) is greater-than-or-equal to @seqno.
+    */
+   uint32_t seqno;
+};
+
+void crocus_fine_fence_init(struct crocus_batch *batch);
+
+struct crocus_fine_fence *crocus_fine_fence_new(struct crocus_batch *batch,
+                                                unsigned flags);
+
+void crocus_fine_fence_destroy(struct crocus_screen *screen,
+                               struct crocus_fine_fence *sq);
+
+/**
+ * Make \p *dst refer to \p src, destroying the fine fence previously held
+ * in \p *dst if pipe_reference() reports its last reference was released.
+ */
+static inline void
+crocus_fine_fence_reference(struct crocus_screen *screen,
+                            struct crocus_fine_fence **dst,
+                            struct crocus_fine_fence *src)
+{
+   struct crocus_fine_fence *old = *dst;
+
+   if (pipe_reference(&old->reference, &src->reference))
+      crocus_fine_fence_destroy(screen, old);
+
+   *dst = src;
+}
+
+/**
+ * Return true if this seqno has passed.
+ *
+ * NULL is considered signaled.  A fence without a CPU map is considered
+ * not signaled.
+ */
+static inline bool
+crocus_fine_fence_signaled(const struct crocus_fine_fence *sq)
+{
+   if (sq == NULL)
+      return true;
+
+   if (sq->map == NULL)
+      return false;
+
+   return READ_ONCE(*sq->map) >= sq->seqno;
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_formats.c b/src/gallium/drivers/crocus/crocus_formats.c
new file mode 100644
index 00000000000..31762643bdc
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_formats.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_formats.c
+ *
+ * Converts Gallium formats (PIPE_FORMAT_*) to hardware ones (ISL_FORMAT_*).
+ * Provides information about which formats support what features.
+ */
+
+#include "util/bitscan.h"
+#include "util/macros.h"
+#include "util/format/u_format.h"
+
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+/* Map a PIPE_FORMAT_* to its ISL_FORMAT_*, or ISL_FORMAT_UNSUPPORTED. */
+static enum isl_format
+crocus_isl_format_for_pipe_format(enum pipe_format pf)
+{
+   static const enum isl_format table[PIPE_FORMAT_COUNT] = {
+      [0 ... PIPE_FORMAT_COUNT-1] = ISL_FORMAT_UNSUPPORTED, /* default entry */
+
+      [PIPE_FORMAT_B8G8R8A8_UNORM]          = ISL_FORMAT_B8G8R8A8_UNORM,
+      [PIPE_FORMAT_B8G8R8X8_UNORM]          = ISL_FORMAT_B8G8R8X8_UNORM,
+      [PIPE_FORMAT_B5G5R5A1_UNORM]          = ISL_FORMAT_B5G5R5A1_UNORM,
+      [PIPE_FORMAT_B4G4R4A4_UNORM]          = ISL_FORMAT_B4G4R4A4_UNORM,
+      [PIPE_FORMAT_B5G6R5_UNORM]            = ISL_FORMAT_B5G6R5_UNORM,
+      [PIPE_FORMAT_R10G10B10A2_UNORM]       = ISL_FORMAT_R10G10B10A2_UNORM,
+
+      [PIPE_FORMAT_Z16_UNORM]               = ISL_FORMAT_R16_UNORM,
+      [PIPE_FORMAT_Z32_UNORM]               = ISL_FORMAT_R32_UNORM,
+      [PIPE_FORMAT_Z32_FLOAT]               = ISL_FORMAT_R32_FLOAT,
+
+      /* We translate the combined depth/stencil formats to depth only here */
+      [PIPE_FORMAT_Z24_UNORM_S8_UINT]       = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+      [PIPE_FORMAT_Z24X8_UNORM]             = ISL_FORMAT_R24_UNORM_X8_TYPELESS,
+      [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT]    = ISL_FORMAT_R32_FLOAT,
+
+      [PIPE_FORMAT_S8_UINT]                 = ISL_FORMAT_R8_UINT,
+      [PIPE_FORMAT_X24S8_UINT]              = ISL_FORMAT_R8_UINT,
+      [PIPE_FORMAT_X32_S8X24_UINT]          = ISL_FORMAT_R8_UINT,
+
+      [PIPE_FORMAT_R64_FLOAT]               = ISL_FORMAT_R64_FLOAT,
+      [PIPE_FORMAT_R64G64_FLOAT]            = ISL_FORMAT_R64G64_FLOAT,
+      [PIPE_FORMAT_R64G64B64_FLOAT]         = ISL_FORMAT_R64G64B64_FLOAT,
+      [PIPE_FORMAT_R64G64B64A64_FLOAT]      = ISL_FORMAT_R64G64B64A64_FLOAT,
+      [PIPE_FORMAT_R32_FLOAT]               = ISL_FORMAT_R32_FLOAT,
+      [PIPE_FORMAT_R32G32_FLOAT]            = ISL_FORMAT_R32G32_FLOAT,
+      [PIPE_FORMAT_R32G32B32_FLOAT]         = ISL_FORMAT_R32G32B32_FLOAT,
+      [PIPE_FORMAT_R32G32B32A32_FLOAT]      = ISL_FORMAT_R32G32B32A32_FLOAT,
+      [PIPE_FORMAT_R32_UNORM]               = ISL_FORMAT_R32_UNORM,
+      [PIPE_FORMAT_R32G32_UNORM]            = ISL_FORMAT_R32G32_UNORM,
+      [PIPE_FORMAT_R32G32B32_UNORM]         = ISL_FORMAT_R32G32B32_UNORM,
+      [PIPE_FORMAT_R32G32B32A32_UNORM]      = ISL_FORMAT_R32G32B32A32_UNORM,
+      [PIPE_FORMAT_R32_USCALED]             = ISL_FORMAT_R32_USCALED,
+      [PIPE_FORMAT_R32G32_USCALED]          = ISL_FORMAT_R32G32_USCALED,
+      [PIPE_FORMAT_R32G32B32_USCALED]       = ISL_FORMAT_R32G32B32_USCALED,
+      [PIPE_FORMAT_R32G32B32A32_USCALED]    = ISL_FORMAT_R32G32B32A32_USCALED,
+      [PIPE_FORMAT_R32_SNORM]               = ISL_FORMAT_R32_SNORM,
+      [PIPE_FORMAT_R32G32_SNORM]            = ISL_FORMAT_R32G32_SNORM,
+      [PIPE_FORMAT_R32G32B32_SNORM]         = ISL_FORMAT_R32G32B32_SNORM,
+      [PIPE_FORMAT_R32G32B32A32_SNORM]      = ISL_FORMAT_R32G32B32A32_SNORM,
+      [PIPE_FORMAT_R32_SSCALED]             = ISL_FORMAT_R32_SSCALED,
+      [PIPE_FORMAT_R32G32_SSCALED]          = ISL_FORMAT_R32G32_SSCALED,
+      [PIPE_FORMAT_R32G32B32_SSCALED]       = ISL_FORMAT_R32G32B32_SSCALED,
+      [PIPE_FORMAT_R32G32B32A32_SSCALED]    = ISL_FORMAT_R32G32B32A32_SSCALED,
+      [PIPE_FORMAT_R16_UNORM]               = ISL_FORMAT_R16_UNORM,
+      [PIPE_FORMAT_R16G16_UNORM]            = ISL_FORMAT_R16G16_UNORM,
+      [PIPE_FORMAT_R16G16B16_UNORM]         = ISL_FORMAT_R16G16B16_UNORM,
+      [PIPE_FORMAT_R16G16B16A16_UNORM]      = ISL_FORMAT_R16G16B16A16_UNORM,
+      [PIPE_FORMAT_R16_USCALED]             = ISL_FORMAT_R16_USCALED,
+      [PIPE_FORMAT_R16G16_USCALED]          = ISL_FORMAT_R16G16_USCALED,
+      [PIPE_FORMAT_R16G16B16_USCALED]       = ISL_FORMAT_R16G16B16_USCALED,
+      [PIPE_FORMAT_R16G16B16A16_USCALED]    = ISL_FORMAT_R16G16B16A16_USCALED,
+      [PIPE_FORMAT_R16_SNORM]               = ISL_FORMAT_R16_SNORM,
+      [PIPE_FORMAT_R16G16_SNORM]            = ISL_FORMAT_R16G16_SNORM,
+      [PIPE_FORMAT_R16G16B16_SNORM]         = ISL_FORMAT_R16G16B16_SNORM,
+      [PIPE_FORMAT_R16G16B16A16_SNORM]      = ISL_FORMAT_R16G16B16A16_SNORM,
+      [PIPE_FORMAT_R16_SSCALED]             = ISL_FORMAT_R16_SSCALED,
+      [PIPE_FORMAT_R16G16_SSCALED]          = ISL_FORMAT_R16G16_SSCALED,
+      [PIPE_FORMAT_R16G16B16_SSCALED]       = ISL_FORMAT_R16G16B16_SSCALED,
+      [PIPE_FORMAT_R16G16B16A16_SSCALED]    = ISL_FORMAT_R16G16B16A16_SSCALED,
+      [PIPE_FORMAT_R8_UNORM]                = ISL_FORMAT_R8_UNORM,
+      [PIPE_FORMAT_R8G8_UNORM]              = ISL_FORMAT_R8G8_UNORM,
+      [PIPE_FORMAT_R8G8B8_UNORM]            = ISL_FORMAT_R8G8B8_UNORM,
+      [PIPE_FORMAT_R8G8B8A8_UNORM]          = ISL_FORMAT_R8G8B8A8_UNORM,
+      [PIPE_FORMAT_R8_USCALED]              = ISL_FORMAT_R8_USCALED,
+      [PIPE_FORMAT_R8G8_USCALED]            = ISL_FORMAT_R8G8_USCALED,
+      [PIPE_FORMAT_R8G8B8_USCALED]          = ISL_FORMAT_R8G8B8_USCALED,
+      [PIPE_FORMAT_R8G8B8A8_USCALED]        = ISL_FORMAT_R8G8B8A8_USCALED,
+      [PIPE_FORMAT_R8_SNORM]                = ISL_FORMAT_R8_SNORM,
+      [PIPE_FORMAT_R8G8_SNORM]              = ISL_FORMAT_R8G8_SNORM,
+      [PIPE_FORMAT_R8G8B8_SNORM]            = ISL_FORMAT_R8G8B8_SNORM,
+      [PIPE_FORMAT_R8G8B8A8_SNORM]          = ISL_FORMAT_R8G8B8A8_SNORM,
+      [PIPE_FORMAT_R8_SSCALED]              = ISL_FORMAT_R8_SSCALED,
+      [PIPE_FORMAT_R8G8_SSCALED]            = ISL_FORMAT_R8G8_SSCALED,
+      [PIPE_FORMAT_R8G8B8_SSCALED]          = ISL_FORMAT_R8G8B8_SSCALED,
+      [PIPE_FORMAT_R8G8B8A8_SSCALED]        = ISL_FORMAT_R8G8B8A8_SSCALED,
+      [PIPE_FORMAT_R32_FIXED]               = ISL_FORMAT_R32_SFIXED,
+      [PIPE_FORMAT_R32G32_FIXED]            = ISL_FORMAT_R32G32_SFIXED,
+      [PIPE_FORMAT_R32G32B32_FIXED]         = ISL_FORMAT_R32G32B32_SFIXED,
+      [PIPE_FORMAT_R32G32B32A32_FIXED]      = ISL_FORMAT_R32G32B32A32_SFIXED,
+      [PIPE_FORMAT_R16_FLOAT]               = ISL_FORMAT_R16_FLOAT,
+      [PIPE_FORMAT_R16G16_FLOAT]            = ISL_FORMAT_R16G16_FLOAT,
+      [PIPE_FORMAT_R16G16B16_FLOAT]         = ISL_FORMAT_R16G16B16_FLOAT,
+      [PIPE_FORMAT_R16G16B16A16_FLOAT]      = ISL_FORMAT_R16G16B16A16_FLOAT,
+
+      [PIPE_FORMAT_R8G8B8_SRGB]             = ISL_FORMAT_R8G8B8_UNORM_SRGB,
+      [PIPE_FORMAT_B8G8R8A8_SRGB]           = ISL_FORMAT_B8G8R8A8_UNORM_SRGB,
+      [PIPE_FORMAT_B8G8R8X8_SRGB]           = ISL_FORMAT_B8G8R8X8_UNORM_SRGB,
+      [PIPE_FORMAT_R8G8B8A8_SRGB]           = ISL_FORMAT_R8G8B8A8_UNORM_SRGB,
+
+      [PIPE_FORMAT_DXT1_RGB]                = ISL_FORMAT_BC1_UNORM,
+      [PIPE_FORMAT_DXT1_RGBA]               = ISL_FORMAT_BC1_UNORM,
+      [PIPE_FORMAT_DXT3_RGBA]               = ISL_FORMAT_BC2_UNORM,
+      [PIPE_FORMAT_DXT5_RGBA]               = ISL_FORMAT_BC3_UNORM,
+
+      [PIPE_FORMAT_DXT1_SRGB]               = ISL_FORMAT_BC1_UNORM_SRGB,
+      [PIPE_FORMAT_DXT1_SRGBA]              = ISL_FORMAT_BC1_UNORM_SRGB,
+      [PIPE_FORMAT_DXT3_SRGBA]              = ISL_FORMAT_BC2_UNORM_SRGB,
+      [PIPE_FORMAT_DXT5_SRGBA]              = ISL_FORMAT_BC3_UNORM_SRGB,
+
+      [PIPE_FORMAT_RGTC1_UNORM]             = ISL_FORMAT_BC4_UNORM,
+      [PIPE_FORMAT_RGTC1_SNORM]             = ISL_FORMAT_BC4_SNORM,
+      [PIPE_FORMAT_RGTC2_UNORM]             = ISL_FORMAT_BC5_UNORM,
+      [PIPE_FORMAT_RGTC2_SNORM]             = ISL_FORMAT_BC5_SNORM,
+
+      [PIPE_FORMAT_R10G10B10A2_USCALED]     = ISL_FORMAT_R10G10B10A2_USCALED,
+      [PIPE_FORMAT_R11G11B10_FLOAT]         = ISL_FORMAT_R11G11B10_FLOAT,
+      [PIPE_FORMAT_R9G9B9E5_FLOAT]          = ISL_FORMAT_R9G9B9E5_SHAREDEXP,
+      [PIPE_FORMAT_R1_UNORM]                = ISL_FORMAT_R1_UNORM,
+      [PIPE_FORMAT_R10G10B10X2_USCALED]     = ISL_FORMAT_R10G10B10X2_USCALED,
+      [PIPE_FORMAT_B10G10R10A2_UNORM]       = ISL_FORMAT_B10G10R10A2_UNORM,
+      [PIPE_FORMAT_R8G8B8X8_UNORM]          = ISL_FORMAT_R8G8B8X8_UNORM,
+
+      [PIPE_FORMAT_I8_UNORM]                = ISL_FORMAT_R8_UNORM,
+      [PIPE_FORMAT_I16_UNORM]               = ISL_FORMAT_R16_UNORM,
+      [PIPE_FORMAT_I8_SNORM]                = ISL_FORMAT_R8_SNORM,
+      [PIPE_FORMAT_I16_SNORM]               = ISL_FORMAT_R16_SNORM,
+      [PIPE_FORMAT_I16_FLOAT]               = ISL_FORMAT_R16_FLOAT,
+      [PIPE_FORMAT_I32_FLOAT]               = ISL_FORMAT_R32_FLOAT,
+
+      [PIPE_FORMAT_L8_UINT]                 = ISL_FORMAT_L8_UINT,
+      [PIPE_FORMAT_L8_UNORM]                = ISL_FORMAT_L8_UNORM,
+      [PIPE_FORMAT_L8_SNORM]                = ISL_FORMAT_R8_SNORM,
+      [PIPE_FORMAT_L8_SINT]                 = ISL_FORMAT_L8_SINT,
+      [PIPE_FORMAT_L16_UNORM]               = ISL_FORMAT_L16_UNORM,
+      [PIPE_FORMAT_L16_SNORM]               = ISL_FORMAT_R16_SNORM,
+      [PIPE_FORMAT_L16_FLOAT]               = ISL_FORMAT_L16_FLOAT,
+      [PIPE_FORMAT_L32_FLOAT]               = ISL_FORMAT_L32_FLOAT,
+
+      [PIPE_FORMAT_A8_UNORM]                = ISL_FORMAT_A8_UNORM,
+      [PIPE_FORMAT_A16_UNORM]               = ISL_FORMAT_A16_UNORM,
+      [PIPE_FORMAT_A16_FLOAT]               = ISL_FORMAT_A16_FLOAT,
+      [PIPE_FORMAT_A32_FLOAT]               = ISL_FORMAT_A32_FLOAT,
+
+      [PIPE_FORMAT_L8A8_UNORM]              = ISL_FORMAT_L8A8_UNORM,
+      [PIPE_FORMAT_L16A16_UNORM]            = ISL_FORMAT_L16A16_UNORM,
+      [PIPE_FORMAT_L16A16_FLOAT]            = ISL_FORMAT_L16A16_FLOAT,
+      [PIPE_FORMAT_L32A32_FLOAT]            = ISL_FORMAT_L32A32_FLOAT,
+
+      /* Sadly, we have to use luminance[-alpha] formats for sRGB decoding. */
+      [PIPE_FORMAT_R8_SRGB]                 = ISL_FORMAT_L8_UNORM_SRGB,
+      [PIPE_FORMAT_L8_SRGB]                 = ISL_FORMAT_L8_UNORM_SRGB,
+      [PIPE_FORMAT_L8A8_SRGB]               = ISL_FORMAT_L8A8_UNORM_SRGB,
+
+      [PIPE_FORMAT_R10G10B10A2_SSCALED]     = ISL_FORMAT_R10G10B10A2_SSCALED,
+      [PIPE_FORMAT_R10G10B10A2_SNORM]       = ISL_FORMAT_R10G10B10A2_SNORM,
+
+      [PIPE_FORMAT_B10G10R10A2_USCALED]     = ISL_FORMAT_B10G10R10A2_USCALED,
+      [PIPE_FORMAT_B10G10R10A2_SSCALED]     = ISL_FORMAT_B10G10R10A2_SSCALED,
+      [PIPE_FORMAT_B10G10R10A2_SNORM]       = ISL_FORMAT_B10G10R10A2_SNORM,
+
+      [PIPE_FORMAT_R8_UINT]                 = ISL_FORMAT_R8_UINT,
+      [PIPE_FORMAT_R8G8_UINT]               = ISL_FORMAT_R8G8_UINT,
+      [PIPE_FORMAT_R8G8B8_UINT]             = ISL_FORMAT_R8G8B8_UINT,
+      [PIPE_FORMAT_R8G8B8A8_UINT]           = ISL_FORMAT_R8G8B8A8_UINT,
+
+      [PIPE_FORMAT_R8_SINT]                 = ISL_FORMAT_R8_SINT,
+      [PIPE_FORMAT_R8G8_SINT]               = ISL_FORMAT_R8G8_SINT,
+      [PIPE_FORMAT_R8G8B8_SINT]             = ISL_FORMAT_R8G8B8_SINT,
+      [PIPE_FORMAT_R8G8B8A8_SINT]           = ISL_FORMAT_R8G8B8A8_SINT,
+
+      [PIPE_FORMAT_R16_UINT]                = ISL_FORMAT_R16_UINT,
+      [PIPE_FORMAT_R16G16_UINT]             = ISL_FORMAT_R16G16_UINT,
+      [PIPE_FORMAT_R16G16B16_UINT]          = ISL_FORMAT_R16G16B16_UINT,
+      [PIPE_FORMAT_R16G16B16A16_UINT]       = ISL_FORMAT_R16G16B16A16_UINT,
+
+      [PIPE_FORMAT_R16_SINT]                = ISL_FORMAT_R16_SINT,
+      [PIPE_FORMAT_R16G16_SINT]             = ISL_FORMAT_R16G16_SINT,
+      [PIPE_FORMAT_R16G16B16_SINT]          = ISL_FORMAT_R16G16B16_SINT,
+      [PIPE_FORMAT_R16G16B16A16_SINT]       = ISL_FORMAT_R16G16B16A16_SINT,
+
+      [PIPE_FORMAT_R32_UINT]                = ISL_FORMAT_R32_UINT,
+      [PIPE_FORMAT_R32G32_UINT]             = ISL_FORMAT_R32G32_UINT,
+      [PIPE_FORMAT_R32G32B32_UINT]          = ISL_FORMAT_R32G32B32_UINT,
+      [PIPE_FORMAT_R32G32B32A32_UINT]       = ISL_FORMAT_R32G32B32A32_UINT,
+
+      [PIPE_FORMAT_R32_SINT]                = ISL_FORMAT_R32_SINT,
+      [PIPE_FORMAT_R32G32_SINT]             = ISL_FORMAT_R32G32_SINT,
+      [PIPE_FORMAT_R32G32B32_SINT]          = ISL_FORMAT_R32G32B32_SINT,
+      [PIPE_FORMAT_R32G32B32A32_SINT]       = ISL_FORMAT_R32G32B32A32_SINT,
+
+      [PIPE_FORMAT_B10G10R10A2_UINT]        = ISL_FORMAT_B10G10R10A2_UINT,
+
+      [PIPE_FORMAT_ETC1_RGB8]               = ISL_FORMAT_ETC1_RGB8,
+
+      [PIPE_FORMAT_R8G8B8X8_SRGB]           = ISL_FORMAT_R8G8B8X8_UNORM_SRGB,
+      [PIPE_FORMAT_B10G10R10X2_UNORM]       = ISL_FORMAT_B10G10R10X2_UNORM,
+      [PIPE_FORMAT_R16G16B16X16_UNORM]      = ISL_FORMAT_R16G16B16X16_UNORM,
+      [PIPE_FORMAT_R16G16B16X16_FLOAT]      = ISL_FORMAT_R16G16B16X16_FLOAT,
+      [PIPE_FORMAT_R32G32B32X32_FLOAT]      = ISL_FORMAT_R32G32B32X32_FLOAT,
+
+      [PIPE_FORMAT_R10G10B10A2_UINT]        = ISL_FORMAT_R10G10B10A2_UINT,
+
+      [PIPE_FORMAT_B5G6R5_SRGB]             = ISL_FORMAT_B5G6R5_UNORM_SRGB,
+
+      [PIPE_FORMAT_BPTC_RGBA_UNORM]         = ISL_FORMAT_BC7_UNORM,
+      [PIPE_FORMAT_BPTC_SRGBA]              = ISL_FORMAT_BC7_UNORM_SRGB,
+      [PIPE_FORMAT_BPTC_RGB_FLOAT]          = ISL_FORMAT_BC6H_SF16,
+      [PIPE_FORMAT_BPTC_RGB_UFLOAT]         = ISL_FORMAT_BC6H_UF16,
+
+      [PIPE_FORMAT_ETC2_RGB8]               = ISL_FORMAT_ETC2_RGB8,
+      [PIPE_FORMAT_ETC2_SRGB8]              = ISL_FORMAT_ETC2_SRGB8,
+      [PIPE_FORMAT_ETC2_RGB8A1]             = ISL_FORMAT_ETC2_RGB8_PTA,
+      [PIPE_FORMAT_ETC2_SRGB8A1]            = ISL_FORMAT_ETC2_SRGB8_PTA,
+      [PIPE_FORMAT_ETC2_RGBA8]              = ISL_FORMAT_ETC2_EAC_RGBA8,
+      [PIPE_FORMAT_ETC2_SRGBA8]             = ISL_FORMAT_ETC2_EAC_SRGB8_A8,
+      [PIPE_FORMAT_ETC2_R11_UNORM]          = ISL_FORMAT_EAC_R11,
+      [PIPE_FORMAT_ETC2_R11_SNORM]          = ISL_FORMAT_EAC_SIGNED_R11,
+      [PIPE_FORMAT_ETC2_RG11_UNORM]         = ISL_FORMAT_EAC_RG11,
+      [PIPE_FORMAT_ETC2_RG11_SNORM]         = ISL_FORMAT_EAC_SIGNED_RG11,
+
+      [PIPE_FORMAT_FXT1_RGB]                = ISL_FORMAT_FXT1,
+      [PIPE_FORMAT_FXT1_RGBA]               = ISL_FORMAT_FXT1,
+
+      [PIPE_FORMAT_ASTC_4x4]                = ISL_FORMAT_ASTC_LDR_2D_4X4_FLT16,
+      [PIPE_FORMAT_ASTC_5x4]                = ISL_FORMAT_ASTC_LDR_2D_5X4_FLT16,
+      [PIPE_FORMAT_ASTC_5x5]                = ISL_FORMAT_ASTC_LDR_2D_5X5_FLT16,
+      [PIPE_FORMAT_ASTC_6x5]                = ISL_FORMAT_ASTC_LDR_2D_6X5_FLT16,
+      [PIPE_FORMAT_ASTC_6x6]                = ISL_FORMAT_ASTC_LDR_2D_6X6_FLT16,
+      [PIPE_FORMAT_ASTC_8x5]                = ISL_FORMAT_ASTC_LDR_2D_8X5_FLT16,
+      [PIPE_FORMAT_ASTC_8x6]                = ISL_FORMAT_ASTC_LDR_2D_8X6_FLT16,
+      [PIPE_FORMAT_ASTC_8x8]                = ISL_FORMAT_ASTC_LDR_2D_8X8_FLT16,
+      [PIPE_FORMAT_ASTC_10x5]               = ISL_FORMAT_ASTC_LDR_2D_10X5_FLT16,
+      [PIPE_FORMAT_ASTC_10x6]               = ISL_FORMAT_ASTC_LDR_2D_10X6_FLT16,
+      [PIPE_FORMAT_ASTC_10x8]               = ISL_FORMAT_ASTC_LDR_2D_10X8_FLT16,
+      [PIPE_FORMAT_ASTC_10x10]              = ISL_FORMAT_ASTC_LDR_2D_10X10_FLT16,
+      [PIPE_FORMAT_ASTC_12x10]              = ISL_FORMAT_ASTC_LDR_2D_12X10_FLT16,
+      [PIPE_FORMAT_ASTC_12x12]              = ISL_FORMAT_ASTC_LDR_2D_12X12_FLT16,
+
+      [PIPE_FORMAT_ASTC_4x4_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_4X4_U8SRGB,
+      [PIPE_FORMAT_ASTC_5x4_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_5X4_U8SRGB,
+      [PIPE_FORMAT_ASTC_5x5_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_5X5_U8SRGB,
+      [PIPE_FORMAT_ASTC_6x5_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_6X5_U8SRGB,
+      [PIPE_FORMAT_ASTC_6x6_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_6X6_U8SRGB,
+      [PIPE_FORMAT_ASTC_8x5_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_8X5_U8SRGB,
+      [PIPE_FORMAT_ASTC_8x6_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_8X6_U8SRGB,
+      [PIPE_FORMAT_ASTC_8x8_SRGB]           = ISL_FORMAT_ASTC_LDR_2D_8X8_U8SRGB,
+      [PIPE_FORMAT_ASTC_10x5_SRGB]          = ISL_FORMAT_ASTC_LDR_2D_10X5_U8SRGB,
+      [PIPE_FORMAT_ASTC_10x6_SRGB]          = ISL_FORMAT_ASTC_LDR_2D_10X6_U8SRGB,
+      [PIPE_FORMAT_ASTC_10x8_SRGB]          = ISL_FORMAT_ASTC_LDR_2D_10X8_U8SRGB,
+      [PIPE_FORMAT_ASTC_10x10_SRGB]         = ISL_FORMAT_ASTC_LDR_2D_10X10_U8SRGB,
+      [PIPE_FORMAT_ASTC_12x10_SRGB]         = ISL_FORMAT_ASTC_LDR_2D_12X10_U8SRGB,
+      [PIPE_FORMAT_ASTC_12x12_SRGB]         = ISL_FORMAT_ASTC_LDR_2D_12X12_U8SRGB,
+
+      [PIPE_FORMAT_A1B5G5R5_UNORM]          = ISL_FORMAT_A1B5G5R5_UNORM,
+
+      /* We support these so that we know the API expects no alpha channel.
+       * Otherwise, the state tracker would just give us a format with alpha
+       * and we wouldn't know to override the swizzle to 1. */
+      [PIPE_FORMAT_R16G16B16X16_UINT]       = ISL_FORMAT_R16G16B16A16_UINT,
+      [PIPE_FORMAT_R16G16B16X16_SINT]       = ISL_FORMAT_R16G16B16A16_SINT,
+      [PIPE_FORMAT_R32G32B32X32_UINT]       = ISL_FORMAT_R32G32B32A32_UINT,
+      [PIPE_FORMAT_R32G32B32X32_SINT]       = ISL_FORMAT_R32G32B32A32_SINT,
+      [PIPE_FORMAT_R10G10B10X2_SNORM]       = ISL_FORMAT_R10G10B10A2_SNORM,
+   };
+   assert(pf < PIPE_FORMAT_COUNT); /* pf indexes table[] directly */
+   return table[pf];
+}
+
+/* Render-target stand-ins: A/I/L/LA pipe formats map to R/RG ISL formats. */
+static enum isl_format
+get_render_format(enum pipe_format pformat, enum isl_format def_format)
+{
+   switch (pformat) {
+   case PIPE_FORMAT_A16_UNORM:            return ISL_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_A16_FLOAT:            return ISL_FORMAT_R16_FLOAT;
+   case PIPE_FORMAT_A32_FLOAT:            return ISL_FORMAT_R32_FLOAT;
+
+   case PIPE_FORMAT_I8_UNORM:             return ISL_FORMAT_R8_UNORM;
+   case PIPE_FORMAT_I16_UNORM:            return ISL_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_I16_FLOAT:            return ISL_FORMAT_R16_FLOAT;
+   case PIPE_FORMAT_I32_FLOAT:            return ISL_FORMAT_R32_FLOAT;
+
+   case PIPE_FORMAT_L8_UNORM:             return ISL_FORMAT_R8_UNORM;
+   case PIPE_FORMAT_L8_UINT:              return ISL_FORMAT_R8_UINT;
+   case PIPE_FORMAT_L8_SINT:              return ISL_FORMAT_R8_SINT;
+   case PIPE_FORMAT_L16_UNORM:            return ISL_FORMAT_R16_UNORM;
+   case PIPE_FORMAT_L16_FLOAT:            return ISL_FORMAT_R16_FLOAT;
+   case PIPE_FORMAT_L32_FLOAT:            return ISL_FORMAT_R32_FLOAT;
+
+   case PIPE_FORMAT_L8A8_UNORM:           return ISL_FORMAT_R8G8_UNORM;
+   case PIPE_FORMAT_L16A16_UNORM:         return ISL_FORMAT_R16G16_UNORM;
+   case PIPE_FORMAT_L16A16_FLOAT:         return ISL_FORMAT_R16G16_FLOAT;
+   case PIPE_FORMAT_L32A32_FLOAT:         return ISL_FORMAT_R32G32_FLOAT;
+   default: /* everything else keeps the format the caller passed in */
+      return def_format;
+   }
+}
+
+/* Resolve @pformat to the ISL format and swizzle to use for @usage. */
+struct crocus_format_info
+crocus_format_for_usage(const struct intel_device_info *devinfo,
+                        enum pipe_format pformat,
+                        isl_surf_usage_flags_t usage)
+{
+   struct crocus_format_info info = { crocus_isl_format_for_pipe_format(pformat),
+                                      { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W } };
+
+   if (info.fmt == ISL_FORMAT_UNSUPPORTED)
+      return info; /* no hardware mapping; swizzle left as identity */
+
+   if (pformat == PIPE_FORMAT_A8_UNORM) {
+      info.fmt = ISL_FORMAT_A8_UNORM; /* already the table's value; a no-op */
+   }
+
+   if (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)
+      info.fmt = get_render_format(pformat, info.fmt);
+   if (devinfo->ver < 6) { /* pre-gfx6 depth/stencil format overrides */
+      if (pformat == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+         info.fmt = ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS;
+      if (pformat == PIPE_FORMAT_X32_S8X24_UINT)
+         info.fmt = ISL_FORMAT_X32_TYPELESS_G8X24_UINT;
+      if (pformat == PIPE_FORMAT_X24S8_UINT)
+         info.fmt = ISL_FORMAT_X24_TYPELESS_G8_UINT;
+   }
+
+   const struct isl_format_layout *fmtl = isl_format_get_layout(info.fmt);
+
+   if (util_format_is_snorm(pformat)) {
+      if (util_format_is_intensity(pformat)) {
+         info.swizzles[0] = PIPE_SWIZZLE_X;
+         info.swizzles[1] = PIPE_SWIZZLE_X;
+         info.swizzles[2] = PIPE_SWIZZLE_X;
+         info.swizzles[3] = PIPE_SWIZZLE_X;
+      } else if (util_format_is_luminance(pformat)) {
+         info.swizzles[0] = PIPE_SWIZZLE_X;
+         info.swizzles[1] = PIPE_SWIZZLE_X;
+         info.swizzles[2] = PIPE_SWIZZLE_X;
+         info.swizzles[3] = PIPE_SWIZZLE_1;
+      } else if (util_format_is_luminance_alpha(pformat)) {
+         info.swizzles[0] = PIPE_SWIZZLE_X;
+         info.swizzles[1] = PIPE_SWIZZLE_X;
+         info.swizzles[2] = PIPE_SWIZZLE_X;
+         info.swizzles[3] = PIPE_SWIZZLE_Y;
+      } else if (util_format_is_alpha(pformat)) {
+         info.swizzles[0] = PIPE_SWIZZLE_0;
+         info.swizzles[1] = PIPE_SWIZZLE_0;
+         info.swizzles[2] = PIPE_SWIZZLE_0;
+         info.swizzles[3] = PIPE_SWIZZLE_X;
+      }
+   }
+
+   /* When faking RGBX pipe formats with RGBA ISL formats, override alpha. */
+   if (!util_format_has_alpha(pformat) && fmtl->channels.a.type != ISL_VOID) {
+      info.swizzles[0] = PIPE_SWIZZLE_X;
+      info.swizzles[1] = PIPE_SWIZZLE_Y;
+      info.swizzles[2] = PIPE_SWIZZLE_Z;
+      info.swizzles[3] = PIPE_SWIZZLE_1;
+   }
+
+   /* We choose RGBA over RGBX for rendering because the hardware doesn't
+    * support rendering to RGBX.  However, when this internal override is used
+    * on Gen9+, fast clears don't work correctly.  i965 fixes this by pretending
+    * to not support RGBX formats, and the higher layers of Mesa pick the RGBA
+    * format instead.  Gallium doesn't work that way, and might choose a
+    * different format, like BGRX instead of RGBX, which will also cause
+    * problems when sampling from a surface fast cleared as RGBX.  So we always
+    * choose RGBA instead of RGBX explicitly here.
+    * NOTE(review): Gen9+ is outside this driver's gfx 4-7 range - stale text?
+    */
+   if (isl_format_is_rgbx(info.fmt) &&
+       !isl_format_supports_rendering(devinfo, info.fmt) &&
+       (usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)) {
+      info.fmt = isl_format_rgbx_to_rgba(info.fmt);
+      info.swizzles[0] = PIPE_SWIZZLE_X;
+      info.swizzles[1] = PIPE_SWIZZLE_Y;
+      info.swizzles[2] = PIPE_SWIZZLE_Z;
+      info.swizzles[3] = PIPE_SWIZZLE_1;
+   }
+
+   return info;
+}
+
+/**
+ * The pscreen->is_format_supported() driver hook.
+ *
+ * Returns true only if @pformat passes every handled PIPE_BIND_* check set
+ * in @usage at @sample_count.  @storage_sample_count is not consulted.
+ */
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+                           enum pipe_format pformat,
+                           enum pipe_texture_target target,
+                           unsigned sample_count, unsigned storage_sample_count,
+                           unsigned usage)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (!util_is_power_of_two_or_zero(sample_count))
+      return false;
+   if (devinfo->ver >= 7) {
+      if (sample_count > 8 || sample_count == 2)
+         return false;
+   } else if (devinfo->ver == 6) {
+      if (sample_count > 4 || sample_count == 2)
+         return false;
+   } else if (sample_count > 1) {
+      return false;
+   }
+
+   if (pformat == PIPE_FORMAT_NONE)
+      return true;
+
+   enum isl_format format = crocus_isl_format_for_pipe_format(pformat);
+
+   if (format == ISL_FORMAT_UNSUPPORTED)
+      return false;
+
+   /* no stencil texturing prior to haswell, so refuse for any usage */
+   if (!devinfo->is_haswell) {
+      if (pformat == PIPE_FORMAT_S8_UINT ||
+          pformat == PIPE_FORMAT_X24S8_UINT ||
+          pformat == PIPE_FORMAT_S8X24_UINT ||
+          pformat == PIPE_FORMAT_X32_S8X24_UINT)
+         return false;
+   }
+
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+   const bool is_integer = isl_format_has_int_channel(format);
+   bool supported = true;
+
+   if (sample_count > 1)
+      supported &= isl_format_supports_multisampling(devinfo, format);
+
+   if (usage & PIPE_BIND_DEPTH_STENCIL) {
+      supported &= format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS ||
+                   format == ISL_FORMAT_R32_FLOAT ||
+                   format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+                   format == ISL_FORMAT_R16_UNORM ||
+                   format == ISL_FORMAT_R8_UINT;
+   }
+
+   if (usage & PIPE_BIND_RENDER_TARGET) {
+      /* Alpha and luminance-alpha formats other than A8_UNORM are not
+       * renderable.
+       *
+       * For BLORP, we can apply the swizzle in the shader.  But for
+       * general rendering, this would mean recompiling the shader, which
+       * we'd like to avoid doing.  So we mark these formats non-renderable.
+       *
+       * We do support A8_UNORM as it's required and is renderable.
+       */
+      if (pformat != PIPE_FORMAT_A8_UNORM &&
+          (util_format_is_alpha(pformat) ||
+           util_format_is_luminance_alpha(pformat)))
+         supported = false;
+
+      enum isl_format rt_format = format;
+
+      if (isl_format_is_rgbx(format) &&
+          !isl_format_supports_rendering(devinfo, format))
+         rt_format = isl_format_rgbx_to_rgba(format);
+
+      supported &= isl_format_supports_rendering(devinfo, rt_format);
+
+      if (!is_integer)
+         supported &= isl_format_supports_alpha_blending(devinfo, rt_format);
+   }
+
+   if (usage & PIPE_BIND_SHADER_IMAGE) {
+      /* Dataport doesn't support compression, and we can't resolve an MCS
+       * compressed surface.  (Buffer images may have sample count of 0.)
+       */
+      supported &= sample_count == 0;
+
+      supported &= isl_format_supports_typed_writes(devinfo, format);
+      supported &= isl_has_matching_typed_storage_image_format(devinfo, format);
+   }
+
+   if (usage & PIPE_BIND_SAMPLER_VIEW) {
+      supported &= isl_format_supports_sampling(devinfo, format);
+      bool ignore_filtering = false;
+
+      if (is_integer)
+         ignore_filtering = true;
+
+      /* I said them, but I lied them: skip the filtering check on pre-gfx5. */
+      if (devinfo->ver < 5 && (format == ISL_FORMAT_R32G32B32A32_FLOAT ||
+                               format == ISL_FORMAT_R24_UNORM_X8_TYPELESS ||
+                               format == ISL_FORMAT_R32_FLOAT ||
+                               format == ISL_FORMAT_R32_FLOAT_X8X24_TYPELESS))
+         ignore_filtering = true;
+      if (!ignore_filtering)
+         supported &= isl_format_supports_filtering(devinfo, format);
+
+      /* Don't advertise 3-component RGB formats for non-buffer textures.
+       * This ensures that they are renderable from an API perspective since
+       * the state tracker will fall back to RGBA or RGBX, which are
+       * renderable.  We want to render internally for copies and blits,
+       * even if the application doesn't.
+       *
+       * Buffer textures don't need to be renderable, so we support real RGB.
+       * This is useful for PBO upload, and 32-bit RGB support is mandatory.
+       */
+      if (target != PIPE_BUFFER)
+         supported &= fmtl->bpb != 24 && fmtl->bpb != 48 && fmtl->bpb != 96;
+   }
+
+   if (usage & PIPE_BIND_VERTEX_BUFFER) {
+      /* Keep the vertex-fetch verdict local so the workaround below cannot
+       * re-enable a format that an earlier check already rejected. */
+      bool vf_ok = isl_format_supports_vertex_fetch(devinfo, format);
+      if (!devinfo->is_haswell) {
+         /* W/A: Pre-Haswell, the hardware doesn't really support the formats
+          * we'd like to use here, so upload everything as UINT and fix it in
+          * the shader
+          */
+         if (format == ISL_FORMAT_R10G10B10A2_UNORM ||
+             format == ISL_FORMAT_B10G10R10A2_UNORM ||
+             format == ISL_FORMAT_R10G10B10A2_SNORM ||
+             format == ISL_FORMAT_B10G10R10A2_SNORM ||
+             format == ISL_FORMAT_R10G10B10A2_USCALED ||
+             format == ISL_FORMAT_B10G10R10A2_USCALED ||
+             format == ISL_FORMAT_R10G10B10A2_SSCALED ||
+             format == ISL_FORMAT_B10G10R10A2_SSCALED ||
+             format == ISL_FORMAT_R8G8B8_SINT ||
+             format == ISL_FORMAT_R8G8B8_UINT ||
+             format == ISL_FORMAT_R16G16B16_SINT ||
+             format == ISL_FORMAT_R16G16B16_UINT)
+            vf_ok = true;
+      }
+      supported &= vf_ok;
+   }
+
+   if (usage & PIPE_BIND_INDEX_BUFFER) {
+      supported &= format == ISL_FORMAT_R8_UINT ||
+                   format == ISL_FORMAT_R16_UINT ||
+                   format == ISL_FORMAT_R32_UINT;
+   }
+
+   return supported;
+}
diff --git a/src/gallium/drivers/crocus/crocus_genx_macros.h b/src/gallium/drivers/crocus/crocus_genx_macros.h
new file mode 100644
index 00000000000..a0309513ed2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_macros.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Macro and function definitions needed in order to use genxml.
+ *
+ * This should only be included in sources compiled per-generation.
+ */
+
+#include "crocus_batch.h"
+
+#include "genxml/gen_macros.h"
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+#define __gen_combine_address crocus_combine_address
+
+static inline void *
+__gen_get_batch_dwords(struct crocus_batch *batch, unsigned dwords)
+{
+   return crocus_get_command_space(batch, dwords * sizeof(uint32_t));
+}
+
+static inline struct crocus_address
+__gen_address_offset(struct crocus_address addr, uint64_t offset)
+{
+   addr.offset += offset;
+   return addr;
+}
+
+/* Resolve \p addr + \p delta for the dword at \p location, relocating if it has a BO. */
+static uint64_t
+__gen_combine_address(struct crocus_batch *batch, void *location,
+                      struct crocus_address addr, uint32_t delta)
+{
+   /* Without a BO the value is returned as-is: no relocation is recorded. */
+   if (addr.bo == NULL)
+      return addr.offset + delta;
+
+   /* Each offset is computed against the buffer expected to hold \p location,
+    * instead of unconditionally against the command buffer first. */
+   if (GFX_VER < 6 && crocus_ptr_in_state_buffer(batch, location)) {
+      uint32_t state_offset = (char *) location - (char *) batch->state.map;
+      return crocus_state_reloc(batch, state_offset, addr.bo,
+                                addr.offset + delta,
+                                addr.reloc_flags);
+   }
+
+   assert(!crocus_ptr_in_state_buffer(batch, location));
+   uint32_t cmd_offset = (char *) location - (char *) batch->command.map;
+   return crocus_command_reloc(batch, cmd_offset, addr.bo,
+                               addr.offset + delta,
+                               addr.reloc_flags);
+}
+
+#define __gen_address_type struct crocus_address
+#define __gen_user_data struct crocus_batch
+
+#define __genxml_cmd_length(cmd) cmd ## _length
+#define __genxml_cmd_length_bias(cmd) cmd ## _length_bias
+#define __genxml_cmd_header(cmd) cmd ## _header
+#define __genxml_cmd_pack(cmd) cmd ## _pack
+#define __genxml_reg_num(cmd) cmd ## _num
+
+#include "genxml/genX_pack.h"
+#include "genxml/gen_macros.h"
+#include "genxml/genX_bits.h"
+
+/* CS_GPR(15) is reserved for combining conditional rendering predicates
+ * with GL_ARB_indirect_parameters draw number predicates.
+ */
+#define MI_BUILDER_NUM_ALLOC_GPRS 15
+#include "common/mi_builder.h"
+
+#define _crocus_pack_command(batch, cmd, dst, name)                 \
+   for (struct cmd name = { __genxml_cmd_header(cmd) },           \
+        *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+        ({ __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name);    \
+           _dst = NULL;                                           \
+           }))
+
+#define crocus_pack_command(cmd, dst, name) \
+   _crocus_pack_command(NULL, cmd, dst, name)
+
+#define _crocus_pack_state(batch, cmd, dst, name)                   \
+   for (struct cmd name = {},                                     \
+        *_dst = (void *)(dst); __builtin_expect(_dst != NULL, 1); \
+        __genxml_cmd_pack(cmd)(batch, (void *)_dst, &name),       \
+        _dst = NULL)
+
+#define crocus_pack_state(cmd, dst, name)                           \
+   _crocus_pack_state(NULL, cmd, dst, name)
+
+#define crocus_emit_cmd(batch, cmd, name) \
+   _crocus_pack_command(batch, cmd, __gen_get_batch_dwords(batch, __genxml_cmd_length(cmd)), name)
+
+#define crocus_emit_merge(batch, dwords0, dwords1, num_dwords)    \
+   do {                                                         \
+      uint32_t *dw = __gen_get_batch_dwords(batch, num_dwords); \
+      for (uint32_t i = 0; i < (num_dwords); i++)               \
+         dw[i] = (dwords0)[i] | (dwords1)[i];                   \
+      VG(VALGRIND_CHECK_MEM_IS_DEFINED(dw, (num_dwords) * sizeof(*dw))); /* length in bytes */ \
+   } while (0)
+
+#define crocus_emit_reg(batch, reg, name)                                 \
+   for (struct reg name = {}, *_cont = (struct reg *)1; _cont != NULL;  \
+        ({                                                              \
+            uint32_t _dw[__genxml_cmd_length(reg)];                     \
+            __genxml_cmd_pack(reg)(NULL, _dw, &name);                   \
+            for (unsigned i = 0; i < __genxml_cmd_length(reg); i++) {   \
+               crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {  \
+                  lri.RegisterOffset   = __genxml_reg_num(reg);         \
+                  lri.DataDWord        = _dw[i];                        \
+               }                                                        \
+            }                                                           \
+           _cont = NULL;                                                \
+         }))
+
+
+/**
+ * crocus_address constructor helpers:
+ *
+ * When using these to construct a CSO, pass NULL for \p bo, and manually
+ * pin the BO later.  Otherwise, genxml's address handling will add the
+ * BO to the current batch's validation list at CSO creation time, rather
+ * than at draw time as desired.
+ */
+
+UNUSED static struct crocus_address
+ro_bo(struct crocus_bo *bo, uint64_t offset)
+{
+   return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT };
+}
+
+UNUSED static struct crocus_address
+rw_bo(struct crocus_bo *bo, uint64_t offset)
+{
+   return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_32BIT | RELOC_WRITE };
+}
+
+UNUSED static struct crocus_address
+ggtt_bo(struct crocus_bo *bo, uint64_t offset)
+{
+   return (struct crocus_address) { .bo = bo, .offset = offset, .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT };
+}
diff --git a/src/gallium/drivers/crocus/crocus_genx_protos.h b/src/gallium/drivers/crocus/crocus_genx_protos.h
new file mode 100644
index 00000000000..ba6798f991e
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_genx_protos.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* GenX-specific function declarations.
+ *
+ * Don't include this directly, it will be included by crocus_context.h.
+ *
+ * NOTE: This header can be included multiple times, from the same file.
+ */
+
+/* crocus_state.c */
+void genX(init_state)(struct crocus_context *ice);
+void genX(init_screen_state)(struct crocus_screen *screen);
+void genX(upload_urb)(struct crocus_batch *batch,
+                      unsigned vs_size,
+                      bool gs_present,
+                      unsigned gs_size);
+void genX(emit_hashing_mode)(struct crocus_context *ice,
+                             struct crocus_batch *batch,
+                             unsigned width, unsigned height,
+                             unsigned scale);
+
+/* crocus_blorp.c */
+void genX(init_blorp)(struct crocus_context *ice);
+
+/* crocus_query.c */
+void genX(init_query)(struct crocus_context *ice);
+void genX(init_screen_query)(struct crocus_screen *screen);
+void genX(math_add32_gpr0)(struct crocus_context *ice,
+                           struct crocus_batch *batch,
+                           uint32_t x);
+void genX(math_div32_gpr0)(struct crocus_context *ice,
+                           struct crocus_batch *batch,
+                           uint32_t D);
+
+/* crocus_blt.c */
+void genX(init_blt)(struct crocus_screen *screen);
diff --git a/src/gallium/drivers/crocus/crocus_monitor.c b/src/gallium/drivers/crocus/crocus_monitor.c
new file mode 100644
index 00000000000..c0465f22875
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "crocus_monitor.h"
+
+#include <xf86drm.h>
+
+#include "crocus_screen.h"
+#include "crocus_context.h"
+
+#include "perf/intel_perf.h"
+#include "perf/intel_perf_query.h"
+#include "perf/intel_perf_regs.h"
+
+struct crocus_monitor_object {
+   int num_active_counters;
+   int *active_counters;
+
+   size_t result_size;
+   unsigned char *result_buffer;
+
+   struct intel_perf_query_object *query;
+};
+
+/* Fill \p info for metric \p index and return 1; with a NULL \p info, return the metric count. */
+int
+crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+                        struct pipe_driver_query_info *info)
+{
+   const struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   assert(screen->monitor_cfg);
+   if (!screen->monitor_cfg)
+      return 0;
+
+   const struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+
+   if (!info)
+      return monitor_cfg->num_counters;
+   if (index >= (unsigned)monitor_cfg->num_counters)
+      return 0; /* unknown metric: counters[] must not be indexed with it */
+
+   const struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+   const int group = monitor_cfg->counters[index].group;
+   const int counter_index = monitor_cfg->counters[index].counter;
+   struct intel_perf_query_counter *counter =
+      &perf_cfg->queries[group].counters[counter_index];
+
+   info->group_id = group;
+   info->name = counter->name;
+   info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index;
+
+   info->result_type = counter->type == INTEL_PERF_COUNTER_TYPE_THROUGHPUT ?
+      PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE :
+      PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE;
+   switch (counter->data_type) {
+   case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+   case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+      info->type = PIPE_DRIVER_QUERY_TYPE_UINT;
+      info->max_value.u32 = 0;
+      break;
+   case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+      info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+      info->max_value.u64 = 0;
+      break;
+   case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+   case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
+      info->type = PIPE_DRIVER_QUERY_TYPE_FLOAT;
+      info->max_value.u64 = -1;
+      break;
+   default:
+      assert(false);
+      break;
+   }
+
+   /* indicates that this is an OA query, not a pipeline statistics query */
+   info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+   return 1;
+}
+
+typedef void (*bo_unreference_t)(void *);
+typedef void *(*bo_map_t)(void *, void *, unsigned flags);
+typedef void (*bo_unmap_t)(void *);
+typedef void (*emit_mi_report_t)(void *, void *, uint32_t, uint32_t);
+typedef void (*emit_mi_flush_t)(void *);
+typedef void (*capture_frequency_stat_register_t)(void *, void *,
+                                                  uint32_t );
+typedef void (*store_register_mem64_t)(void *ctx, void *bo,
+                                       uint32_t reg, uint32_t offset);
+typedef bool (*batch_references_t)(void *batch, void *bo);
+typedef void (*bo_wait_rendering_t)(void *bo);
+typedef int (*bo_busy_t)(void *bo);
+
+static void *
+crocus_oa_bo_alloc(void *bufmgr, const char *name, uint64_t size)
+{
+   return crocus_bo_alloc(bufmgr, name, size);
+}
+
+#if 0
+static void
+crocus_monitor_emit_mi_flush(struct crocus_context *ice)
+{
+   const int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                     PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                     PIPE_CONTROL_DATA_CACHE_FLUSH |
+                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                     PIPE_CONTROL_VF_CACHE_INVALIDATE |
+                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                     PIPE_CONTROL_CS_STALL;
+   crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
+                                  "OA metrics", flags);
+}
+#endif
+
+static void
+crocus_monitor_emit_mi_report_perf_count(void *c,
+                                         void *bo,
+                                         uint32_t offset_in_bytes,
+                                         uint32_t report_id)
+{
+   /* Forward to the screen's vtable hook, targeting the render batch. */
+   struct crocus_context *ice = c;
+   struct crocus_batch *render = &ice->batches[CROCUS_BATCH_RENDER];
+   render->screen->vtbl.emit_mi_report_perf_count(render, bo, offset_in_bytes, report_id);
+}
+
+static void
+crocus_monitor_batchbuffer_flush(void *c, const char *file, int line)
+{
+   struct crocus_context *ice = c;
+   _crocus_batch_flush(&ice->batches[CROCUS_BATCH_RENDER], file, line); /* caller's site, not ours */
+}
+
+#if 0
+static void
+crocus_monitor_capture_frequency_stat_register(void *ctx,
+                                               void *bo,
+                                               uint32_t bo_offset)
+{
+   struct crocus_context *ice = ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   ice->vtbl.store_register_mem32(batch, GEN9_RPSTAT0, bo, bo_offset, false);
+}
+
+static void
+crocus_monitor_store_register_mem64(void *ctx, void *bo,
+                                    uint32_t reg, uint32_t offset)
+{
+   struct crocus_context *ice = ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   ice->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+}
+#endif
+
+/* Build screen->monitor_cfg: the perf config plus a de-duplicated list of its counters. */
+static bool
+crocus_monitor_init_metrics(struct crocus_screen *screen)
+{
+   struct crocus_monitor_config *monitor_cfg =
+      rzalloc(screen, struct crocus_monitor_config);
+   struct intel_perf_config *perf_cfg = NULL;
+   if (unlikely(!monitor_cfg))
+      goto allocation_error;
+   perf_cfg = intel_perf_new(monitor_cfg);
+   if (unlikely(!perf_cfg))
+      goto allocation_error;
+
+   monitor_cfg->perf_cfg = perf_cfg;
+
+   perf_cfg->vtbl.bo_alloc = crocus_oa_bo_alloc;
+   perf_cfg->vtbl.bo_unreference = (bo_unreference_t)crocus_bo_unreference;
+   perf_cfg->vtbl.bo_map = (bo_map_t)crocus_bo_map;
+   perf_cfg->vtbl.bo_unmap = (bo_unmap_t)crocus_bo_unmap;
+
+   perf_cfg->vtbl.emit_mi_report_perf_count =
+      (emit_mi_report_t)crocus_monitor_emit_mi_report_perf_count;
+   perf_cfg->vtbl.batchbuffer_flush = crocus_monitor_batchbuffer_flush;
+   perf_cfg->vtbl.batch_references = (batch_references_t)crocus_batch_references;
+   perf_cfg->vtbl.bo_wait_rendering =
+      (bo_wait_rendering_t)crocus_bo_wait_rendering;
+   perf_cfg->vtbl.bo_busy = (bo_busy_t)crocus_bo_busy;
+
+   intel_perf_init_metrics(perf_cfg, &screen->devinfo, screen->fd, false, false);
+   screen->monitor_cfg = monitor_cfg;
+
+   /* a gallium "group" is equivalent to a gen "query"
+    * a gallium "query" is equivalent to a gen "query_counter"
+    *
+    * Each gen_query supports a specific number of query_counters.  To
+    * allocate the array of crocus_monitor_counter, we need an upper bound
+    * (ignoring duplicate query_counters).
+    */
+   int gen_query_counters_count = 0;
+   for (int gen_query_id = 0;
+        gen_query_id < perf_cfg->n_queries;
+        ++gen_query_id) {
+      gen_query_counters_count += perf_cfg->queries[gen_query_id].n_counters;
+   }
+
+   monitor_cfg->counters = rzalloc_size(monitor_cfg,
+                                        sizeof(struct crocus_monitor_counter) *
+                                        gen_query_counters_count);
+   if (unlikely(!monitor_cfg->counters))
+      goto allocation_error;
+
+   int crocus_monitor_id = 0;
+   for (int group = 0; group < perf_cfg->n_queries; ++group) {
+      for (int counter = 0;
+           counter < perf_cfg->queries[group].n_counters;
+           ++counter) {
+         /* Check previously identified metrics to filter out duplicates. The
+          * user is not helped by having the same metric available in several
+          * groups. (n^2 algorithm).
+          */
+         bool duplicate = false;
+         for (int existing_group = 0;
+              existing_group < group && !duplicate;
+              ++existing_group) {
+            for (int existing_counter = 0;
+                 existing_counter < perf_cfg->queries[existing_group].n_counters && !duplicate;
+                 ++existing_counter) {
+               const char *current_name =
+                  perf_cfg->queries[group].counters[counter].name;
+               const char *existing_name =
+                  perf_cfg->queries[existing_group].counters[existing_counter].name;
+               if (strcmp(current_name, existing_name) == 0)
+                  duplicate = true;
+            }
+         }
+         if (duplicate)
+            continue;
+         monitor_cfg->counters[crocus_monitor_id].group = group;
+         monitor_cfg->counters[crocus_monitor_id].counter = counter;
+         ++crocus_monitor_id;
+      }
+   }
+   monitor_cfg->num_counters = crocus_monitor_id;
+   return monitor_cfg->num_counters;
+
+allocation_error:
+   /* perf_cfg and counters were allocated against monitor_cfg with ralloc, so
+    * free the whole tree at once; also clear the screen's now-stale pointer. */
+   screen->monitor_cfg = NULL;
+   ralloc_free(monitor_cfg);
+   return false;
+}
+
+/* Report one perf metric group to gallium.  Returns the number of groups
+ * when \p info is NULL, 1 when \p info was filled in, and 0 otherwise.
+ */
+int
+crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+                              unsigned group_index,
+                              struct pipe_driver_query_group_info *info)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+
+   /* Metrics are set up lazily, the first time a group is asked about. */
+   if (!screen->monitor_cfg && !crocus_monitor_init_metrics(screen))
+      return 0;
+
+   const struct intel_perf_config *perf = screen->monitor_cfg->perf_cfg;
+
+   /* A NULL info is a request for the number of groups. */
+   if (info == NULL)
+      return perf->n_queries;
+
+   /* Reject group indices past the last perf query. */
+   if (group_index >= perf->n_queries)
+      return 0;
+
+   struct intel_perf_query_info *group = &perf->queries[group_index];
+
+   info->name = group->name;
+   info->max_active_queries = group->n_counters;
+   info->num_queries = group->n_counters;
+
+   return 1;
+}
+
+static void
+crocus_init_monitor_ctx(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+
+   /* Allocate the per-context perf state; on failure ice->perf_ctx stays
+    * NULL and there is nothing to initialize.
+    */
+   struct intel_perf_context *new_ctx = intel_perf_new_context(ice);
+   ice->perf_ctx = new_ctx;
+   if (unlikely(new_ctx == NULL))
+      return;
+
+   /* The crocus context is passed twice, as two separate perf arguments. */
+   intel_perf_init_context(new_ctx, screen->monitor_cfg->perf_cfg,
+                           ice, ice,
+                           screen->bufmgr,
+                           &screen->devinfo,
+                           ice->batches[CROCUS_BATCH_RENDER].hw_ctx_id,
+                           screen->fd);
+}
+
+/* entry point for GenPerfMonitorsAMD; returns NULL on any failure */
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+                             unsigned num_queries,
+                             unsigned *query_types)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+   struct crocus_monitor_config *monitor_cfg = screen->monitor_cfg;
+   struct intel_perf_config *perf_cfg = monitor_cfg->perf_cfg;
+   struct intel_perf_query_object *query_obj = NULL;
+
+   /* initialize perf context if this has not already been done.  This
+    * function is the first entry point that carries the gl context.
+    */
+   if (ice->perf_ctx == NULL)
+      crocus_init_monitor_ctx(ice);
+   struct intel_perf_context *perf_ctx = ice->perf_ctx;
+   if (unlikely(!perf_ctx))
+      return NULL; /* context setup failed; nothing is allocated yet */
+
+   assert(num_queries > 0);
+   int query_index = query_types[0] - PIPE_QUERY_DRIVER_SPECIFIC;
+   assert(query_index < monitor_cfg->num_counters); /* valid: [0, num_counters) */
+   const int group = monitor_cfg->counters[query_index].group;
+
+   struct crocus_monitor_object *monitor = calloc(1, sizeof(*monitor));
+   if (unlikely(!monitor))
+      goto allocation_failure;
+
+   monitor->num_active_counters = num_queries;
+   monitor->active_counters = calloc(num_queries, sizeof(int));
+   if (unlikely(!monitor->active_counters))
+      goto allocation_failure;
+
+   for (unsigned i = 0; i < num_queries; ++i) {
+      unsigned current_query = query_types[i];
+      unsigned current_query_index = current_query - PIPE_QUERY_DRIVER_SPECIFIC;
+
+      /* all queries must be in the same group */
+      assert(current_query_index < monitor_cfg->num_counters);
+      assert(monitor_cfg->counters[current_query_index].group == group);
+      monitor->active_counters[i] =
+         monitor_cfg->counters[current_query_index].counter;
+   }
+
+   /* create the intel_perf_query */
+   query_obj = intel_perf_new_query(perf_ctx, group);
+   if (unlikely(!query_obj))
+      goto allocation_failure;
+
+   monitor->query = query_obj;
+   monitor->result_size = perf_cfg->queries[group].data_size;
+   monitor->result_buffer = calloc(1, monitor->result_size);
+   if (unlikely(!monitor->result_buffer))
+      goto allocation_failure;
+
+   return monitor;
+
+allocation_failure:
+   if (monitor) {
+      free(monitor->active_counters);
+      free(monitor->result_buffer);
+   }
+   free(query_obj);
+   free(monitor);
+   return NULL;
+}
+
+void
+crocus_destroy_monitor_object(struct pipe_context *ctx,
+                              struct crocus_monitor_object *monitor)
+{
+   /* Drop the perf query first, then the monitor's own allocations. */
+   intel_perf_delete_query(((struct crocus_context *)ctx)->perf_ctx, monitor->query);
+
+   free(monitor->result_buffer);
+   monitor->result_buffer = NULL;
+   free(monitor->active_counters);
+   monitor->active_counters = NULL;
+   free(monitor);
+}
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+                     struct crocus_monitor_object *monitor)
+{
+   /* Start the monitor's perf query and pass the perf layer's result back. */
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   return intel_perf_begin_query(ice->perf_ctx, monitor->query);
+}
+
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+                   struct crocus_monitor_object *monitor)
+{
+   /* The perf call's outcome is not consulted; success is always reported. */
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   intel_perf_end_query(ice->perf_ctx, monitor->query);
+   return true;
+}
+
+/* Fetch the monitor's counter values into \p result, one entry per active
+ * counter.  Returns false when no complete result is available. */
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+                          struct crocus_monitor_object *monitor,
+                          bool wait,
+                          union pipe_numeric_type_union *result)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct intel_perf_context *perf_ctx = ice->perf_ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+   /* If the query is not ready, either bail out or block, per \p wait. */
+   if (!intel_perf_is_query_ready(perf_ctx, monitor->query, batch)) {
+      if (!wait)
+         return false;
+      intel_perf_wait_query(perf_ctx, monitor->query, batch);
+   }
+
+   assert(intel_perf_is_query_ready(perf_ctx, monitor->query, batch));
+
+   /* Read the raw data; anything but a full-size read is a failure. */
+   unsigned bytes_written;
+   intel_perf_get_query_data(perf_ctx, monitor->query, batch,
+                             monitor->result_size,
+                             (unsigned*) monitor->result_buffer,
+                             &bytes_written);
+   if (bytes_written != monitor->result_size)
+      return false;
+
+   /* copy metrics into the batch result */
+   for (int i = 0; i < monitor->num_active_counters; ++i) {
+      const struct intel_perf_query_info *info =
+         intel_perf_query_info(monitor->query);
+      const struct intel_perf_query_counter *counter =
+         &info->counters[monitor->active_counters[i]];
+      const unsigned char *src = monitor->result_buffer + counter->offset;
+
+      assert(intel_perf_query_counter_get_size(counter));
+      switch (counter->data_type) {
+      case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
+         result[i].u64 = *(const uint64_t *)src;
+         break;
+      case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
+         result[i].f = *(const float *)src;
+         break;
+      case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
+      case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
+         result[i].u64 = *(const uint32_t *)src;
+         break;
+      case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
+         result[i].f = *(const double *)src;
+         break;
+      default:
+         unreachable("unexpected counter data type");
+      }
+   }
+   return true;
+}
diff --git a/src/gallium/drivers/crocus/crocus_monitor.h b/src/gallium/drivers/crocus/crocus_monitor.h
new file mode 100644
index 00000000000..3335c8860e2
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_monitor.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_MONITOR_H
+#define CROCUS_MONITOR_H
+
+#include "pipe/p_screen.h"
+
+struct crocus_monitor_counter {
+   int group;
+   int counter;
+};
+
+struct crocus_monitor_config {
+   struct intel_perf_config *perf_cfg;
+
+   /* gallium requires an index for each counter */
+   int num_counters;
+   struct crocus_monitor_counter *counters;
+};
+
+int crocus_get_monitor_info(struct pipe_screen *pscreen, unsigned index,
+                            struct pipe_driver_query_info *info);
+int crocus_get_monitor_group_info(struct pipe_screen *pscreen,
+                                  unsigned index,
+                                  struct pipe_driver_query_group_info *info);
+
+struct crocus_context;
+struct crocus_screen;
+
+struct crocus_monitor_object *
+crocus_create_monitor_object(struct crocus_context *ice,
+                             unsigned num_queries,
+                             unsigned *query_types);
+
+struct pipe_query;
+void crocus_destroy_monitor_object(struct pipe_context *ctx,
+                                   struct crocus_monitor_object *monitor);
+
+bool
+crocus_begin_monitor(struct pipe_context *ctx,
+                     struct crocus_monitor_object *monitor);
+bool
+crocus_end_monitor(struct pipe_context *ctx,
+                   struct crocus_monitor_object *monitor);
+
+bool
+crocus_get_monitor_result(struct pipe_context *ctx,
+                          struct crocus_monitor_object *monitor,
+                          bool wait,
+                          union pipe_numeric_type_union *result);
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_pipe.h b/src/gallium/drivers/crocus/crocus_pipe.h
new file mode 100644
index 00000000000..71b12d08e16
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_PIPE_H
+#define CROCUS_PIPE_H
+
+#include "pipe/p_defines.h"
+#include "compiler/shader_enums.h"
+
+static inline gl_shader_stage
+stage_from_pipe(enum pipe_shader_type pstage)
+{
+   switch (pstage) { /* gallium shader type -> Mesa shader stage */
+   case PIPE_SHADER_VERTEX:    return MESA_SHADER_VERTEX;
+   case PIPE_SHADER_TESS_CTRL: return MESA_SHADER_TESS_CTRL;
+   case PIPE_SHADER_TESS_EVAL: return MESA_SHADER_TESS_EVAL;
+   case PIPE_SHADER_GEOMETRY:  return MESA_SHADER_GEOMETRY;
+   case PIPE_SHADER_FRAGMENT:  return MESA_SHADER_FRAGMENT;
+   case PIPE_SHADER_COMPUTE:   return MESA_SHADER_COMPUTE;
+   default:                    return (gl_shader_stage) 0; /* unlisted -> stage 0 */
+   }
+}
+
+static inline enum pipe_shader_type
+stage_to_pipe(gl_shader_stage stage)
+{
+   switch (stage) { /* Mesa shader stage -> gallium shader type */
+   case MESA_SHADER_VERTEX:    return PIPE_SHADER_VERTEX;
+   case MESA_SHADER_TESS_CTRL: return PIPE_SHADER_TESS_CTRL;
+   case MESA_SHADER_TESS_EVAL: return PIPE_SHADER_TESS_EVAL;
+   case MESA_SHADER_GEOMETRY:  return PIPE_SHADER_GEOMETRY;
+   case MESA_SHADER_FRAGMENT:  return PIPE_SHADER_FRAGMENT;
+   case MESA_SHADER_COMPUTE:   return PIPE_SHADER_COMPUTE;
+   default:                    return (enum pipe_shader_type) 0; /* unlisted -> type 0 */
+   }
+}
+
+/**
+ * Translate a gallium swizzle (e.g. PIPE_SWIZZLE_X) into the hardware's
+ * "Shader Channel Select" encoding (e.g. SCS_RED).  The two line up as
+ *
+ * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
+ *         0          1          2          3             4            5
+ *         4          5          6          7             0            1
+ *   SCS_RED, SCS_GREEN,  SCS_BLUE, SCS_ALPHA,     SCS_ZERO,     SCS_ONE
+ *
+ * so the conversion is "add 4, modulo 8", done here by masking with 7.
+ */
+static inline enum isl_channel_select
+pipe_swizzle_to_isl_channel(enum pipe_swizzle swizzle)
+{
+   return (enum isl_channel_select) (((unsigned) swizzle + 4u) & 7u);
+}
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_pipe_control.c b/src/gallium/drivers/crocus/crocus_pipe_control.c
new file mode 100644
index 00000000000..7a9625c61ed
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_pipe_control.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_pipe_control.c
+ *
+ * PIPE_CONTROL is the main flushing and synchronization primitive on Intel
+ * GPUs.  It can invalidate caches, stall until rendering reaches various
+ * stages of completion, write to memory, and other things.  In a way, it's
+ * a swiss army knife command - it has all kinds of capabilities, but some
+ * significant limitations as well.
+ *
+ * Unfortunately, it's notoriously complicated and difficult to use.  Many
+ * sub-commands can't be used together.  Some are meant to be used at the
+ * top of the pipeline (invalidating caches before drawing), while some are
+ * meant to be used at the end (stalling or flushing after drawing).
+ *
+ * Also, there's a list of restrictions a mile long, which vary by generation.
+ * Do this before doing that, or suffer the consequences (usually a GPU hang).
+ *
+ * This file contains helpers for emitting them safely.  You can simply call
+ * crocus_emit_pipe_control_flush() with the desired operations (as logical
+ * PIPE_CONTROL_* bits), and it will take care of splitting it into multiple
+ * PIPE_CONTROL commands as necessary.  The per-generation workarounds are
+ * applied in crocus_emit_raw_pipe_control() in crocus_state.c.
+ */
+
+#include "crocus_context.h"
+#include "util/hash_table.h"
+#include "util/set.h"
+
+/**
+ * Emit a PIPE_CONTROL with various flushing flags.
+ *
+ * The caller is responsible for deciding what flags are appropriate for the
+ * given generation.
+ */
+void
+crocus_emit_pipe_control_flush(struct crocus_batch *batch,
+                               const char *reason,
+                               uint32_t flags)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   const uint32_t flush_bits = flags & PIPE_CONTROL_CACHE_FLUSH_BITS;
+   const bool invalidates = (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS) != 0;
+
+   if (devinfo->ver >= 6 && flush_bits != 0 && invalidates) {
+      /* Gen6+: flushing R/W caches and invalidating R/O caches in a single
+       * PIPE_CONTROL is inherently racy whenever the flushed data is meant
+       * to become visible through one of the invalidated caches.  Emit two
+       * commands instead: an end-of-pipe sync carrying only the flush bits
+       * (a full stall, so the R/W caches are coherent with memory), then
+       * the remaining bits below.
+       *
+       * Pre-Gen6 hardware seems to perform its (implicit) R/O invalidation
+       * at the bottom of the pipeline together with any write cache flush,
+       * so no split is needed there.
+       */
+      crocus_emit_end_of_pipe_sync(batch, reason, flush_bits);
+      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
+   }
+
+   /* Emit whatever remains (the untouched flags when no split happened). */
+   batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0);
+}
+
+/**
+ * Emit a PIPE_CONTROL that writes to a buffer object.
+ *
+ * \p flags should contain one of the following items:
+ *  - PIPE_CONTROL_WRITE_IMMEDIATE
+ *  - PIPE_CONTROL_WRITE_TIMESTAMP
+ *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
+ */
+void
+crocus_emit_pipe_control_write(struct crocus_batch *batch, const char *reason,
+                               uint32_t flags, struct crocus_bo *bo,
+                               uint32_t offset, uint64_t imm)
+{
+   /* Thin forwarder: hand every argument to the screen's raw emitter hook. */
+   batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm);
+}
+
+/**
+ * Restriction [DevSNB, DevIVB]:
+ *
+ * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
+ * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
+ * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
+ * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
+ * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
+ * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
+ * unless SW can otherwise guarantee that the pipeline from WM onwards is
+ * already flushed (e.g., via a preceding MI_FLUSH).
+ */
+void
+crocus_emit_depth_stall_flushes(struct crocus_batch *batch)
+{
+   /* Required order: depth stall, depth cache flush, depth stall. */
+   const uint32_t sequence[3] = { PIPE_CONTROL_DEPTH_STALL,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH,
+                                  PIPE_CONTROL_DEPTH_STALL };
+   assert(batch->screen->devinfo.ver >= 6);
+   for (unsigned step = 0; step < 3; step++)
+      crocus_emit_pipe_control_flush(batch, "depth stall", sequence[step]);
+}
+
+/*
+ * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
+ *
+ *  Write synchronization is a special case of end-of-pipe
+ *  synchronization that requires that the render cache and/or depth
+ *  related caches are flushed to memory, where the data will become
+ *  globally visible. This type of synchronization is required prior to
+ *  SW (CPU) actually reading the result data from memory, or initiating
+ *  an operation that will use as a read surface (such as a texture
+ *  surface) a previous render target and/or depth/stencil buffer
+ *
+ * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
+ *
+ *  Exercising the write cache flush bits (Render Target Cache Flush
+ *  Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
+ *  ensures the write caches are flushed and doesn't guarantee the data
+ *  is globally visible.
+ *
+ *  SW can track the completion of the end-of-pipe-synchronization by
+ *  using "Notify Enable" and "PostSync Operation - Write Immediate
+ *  Data" in the PIPE_CONTROL command.
+ */
+void
+crocus_emit_end_of_pipe_sync(struct crocus_batch *batch,
+                             const char *reason, uint32_t flags)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   /* On gen4-5, a regular pipe control seems to suffice. */
+   if (devinfo->ver < 6) {
+      crocus_emit_pipe_control_flush(batch, reason, flags);
+      return;
+   }
+   /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
+    *
+    *    "The most common action to perform upon reaching a synchronization
+    *    point is to write a value out to memory. An immediate value
+    *    (included with the synchronization command) may be written."
+    *
+    * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
+    *
+    *    "In case the data flushed out by the render engine is to be read
+    *    back in to the render engine in coherent manner, then the render
+    *    engine has to wait for the fence completion before accessing the
+    *    flushed data. This can be achieved by following means on various
+    *    products: PIPE_CONTROL command with CS Stall and the required
+    *    write caches flushed with Post-Sync-Operation as Write Immediate
+    *    Data.
+    *
+    *    Example:
+    *       - Workload-1 (3D/GPGPU/MEDIA)
+    *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate
+    *         Data, Required Write Cache Flush bits set)
+    *       - Workload-2 (Can use the data produce or output by Workload-1)"
+    */
+   const uint32_t sync_flags =
+      flags | PIPE_CONTROL_CS_STALL | PIPE_CONTROL_WRITE_IMMEDIATE;
+   crocus_emit_pipe_control_write(batch, reason, sync_flags,
+                                  batch->ice->workaround_bo,
+                                  batch->ice->workaround_offset, 0);
+
+   if (devinfo->is_haswell) {
+#define GEN7_3DPRIM_START_INSTANCE      0x243C
+      batch->screen->vtbl.load_register_mem32(batch, GEN7_3DPRIM_START_INSTANCE,
+                                              batch->ice->workaround_bo,
+                                              batch->ice->workaround_offset);
+   }
+}
+
+/* Emit a pipelined flush to either flush render and texture cache for
+ * reading from a FBO-drawn texture, or flush so that frontbuffer
+ * render appears on the screen in DRI1.
+ *
+ * This is also used for the always_flush_cache driconf debug option.
+ */
+void
+crocus_emit_mi_flush(struct crocus_batch *batch)
+{
+   /* Gen4-5 only flush the render target; gen6+ add the bits below. */
+   const bool gen6_plus = batch->screen->devinfo.ver >= 6;
+   const int gen6_bits = PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                         PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                         PIPE_CONTROL_DATA_CACHE_FLUSH |
+                         PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                         PIPE_CONTROL_VF_CACHE_INVALIDATE |
+                         PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                         PIPE_CONTROL_CS_STALL;
+
+   crocus_emit_pipe_control_flush(batch, "mi flush", PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  (gen6_plus ? gen6_bits : 0));
+}
+
+/**
+ * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
+ * implementing two workarounds on gen6.  From section 1.4.7.1
+ * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
+ *
+ * [DevSNB-C+{W/A}] Before any depth stall flush (including those
+ * produced by non-pipelined state commands), software needs to first
+ * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
+ * 0.
+ *
+ * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
+ * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
+ *
+ * And the workaround for these two requires this workaround first:
+ *
+ * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
+ * BEFORE the pipe-control with a post-sync op and no write-cache
+ * flushes.
+ *
+ * And this last workaround is tricky because of the requirements on
+ * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
+ * volume 2 part 1:
+ *
+ *     "1 of the following must also be set:
+ *      - Render Target Cache Flush Enable ([12] of DW1)
+ *      - Depth Cache Flush Enable ([0] of DW1)
+ *      - Stall at Pixel Scoreboard ([1] of DW1)
+ *      - Depth Stall ([13] of DW1)
+ *      - Post-Sync Operation ([13] of DW1)
+ *      - Notify Enable ([8] of DW1)"
+ *
+ * The cache flushes require the workaround flush that triggered this
+ * one, so we can't use it.  Depth stall would trigger the same.
+ * Post-sync nonzero is what triggered this second workaround, so we
+ * can't use that one either.  Notify enable is IRQs, which aren't
+ * really our business.  That leaves only stall at scoreboard.
+ */
+void
+crocus_emit_post_sync_nonzero_flush(struct crocus_batch *batch)
+{
+   const char *reason = "nonzero";
+   /* CS stall first, paired with stall-at-scoreboard as its companion bit. */
+   crocus_emit_pipe_control_flush(batch, reason, PIPE_CONTROL_CS_STALL |
+                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);
+   /* Then the PIPE_CONTROL with a non-zero post-sync op (immediate write). */
+   crocus_emit_pipe_control_write(batch, reason, PIPE_CONTROL_WRITE_IMMEDIATE,
+                                  batch->ice->workaround_bo,
+                                  batch->ice->workaround_offset, 0);
+}
+
+/**
+ * Flush and invalidate all caches (for debugging purposes).
+ */
+void
+crocus_flush_all_caches(struct crocus_batch *batch)
+{
+   const uint32_t every_cache_bit = PIPE_CONTROL_CS_STALL |
+      PIPE_CONTROL_DATA_CACHE_FLUSH |
+      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+      PIPE_CONTROL_RENDER_TARGET_FLUSH |
+      PIPE_CONTROL_VF_CACHE_INVALIDATE |
+      PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+      PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+      PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+      PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+   crocus_emit_pipe_control_flush(batch, "debug: flush all caches", every_cache_bit);
+}
+
+static void
+crocus_texture_barrier(struct pipe_context *ctx, unsigned flags)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_batch *render = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_batch *compute = &ice->batches[CROCUS_BATCH_COMPUTE];
+
+   /* Below gen6 a single flush on the render batch is all that is emitted. */
+   if (render->screen->devinfo.ver < 6) {
+      crocus_emit_mi_flush(render);
+      return;
+   }
+
+   /* Batches without draws are skipped; the others get a stalling flush
+    * followed by a separate texture cache invalidate.
+    */
+   if (render->contains_draw) {
+      uint32_t flush_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH | PIPE_CONTROL_CS_STALL;
+      if (flags == 1)
+         flush_bits |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+
+      crocus_batch_maybe_flush(render, 48);
+      crocus_emit_pipe_control_flush(render, "API: texture barrier (1/2)", flush_bits);
+      crocus_emit_pipe_control_flush(render, "API: texture barrier (2/2)",
+                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+   }
+
+   if (compute->contains_draw) {
+      crocus_batch_maybe_flush(compute, 48);
+      crocus_emit_pipe_control_flush(compute, "API: texture barrier (1/2)",
+                                     PIPE_CONTROL_CS_STALL);
+      crocus_emit_pipe_control_flush(compute, "API: texture barrier (2/2)",
+                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
+   }
+}
+
+static void
+crocus_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+   struct crocus_context *ice = (void *) ctx;
+   const struct intel_device_info *devinfo = &ice->batches[0].screen->devinfo;
+   const unsigned vertex_fetch_barriers = PIPE_BARRIER_VERTEX_BUFFER |
+                                          PIPE_BARRIER_INDEX_BUFFER |
+                                          PIPE_BARRIER_INDIRECT_BUFFER;
+   const unsigned sampled_barriers = PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER;
+
+   /* Every barrier flushes the data cache and stalls the command streamer. */
+   unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL;
+   assert(devinfo->ver == 7);
+
+   if (flags & vertex_fetch_barriers)
+      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+
+   if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+              PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+
+   if (flags & sampled_barriers)
+      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+              PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+   /* IVB routes typed surface messages through the render cache, so that
+    * cache has to be flushed as well on non-Haswell parts.
+    */
+   if (!devinfo->is_haswell)
+      bits |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+   for (int i = 0; i < ice->batch_count; i++) {
+      struct crocus_batch *batch = &ice->batches[i];
+      if (!batch->contains_draw)
+         continue;
+      crocus_batch_maybe_flush(batch, 24);
+      crocus_emit_pipe_control_flush(batch, "API: memory barrier", bits);
+   }
+}
+
+void
+crocus_init_flush_functions(struct pipe_context *ctx)
+{
+   ctx->texture_barrier = crocus_texture_barrier; /* pipe_context::texture_barrier */
+   ctx->memory_barrier = crocus_memory_barrier;   /* pipe_context::memory_barrier */
+}
diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c
new file mode 100644
index 00000000000..fb8216b71ab
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program.c
@@ -0,0 +1,3171 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program.c
+ *
+ * This file contains the driver interface for compiling shaders.
+ *
+ * See crocus_program_cache.c for the in-memory program cache where the
+ * compiled shaders are stored.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "util/debug.h"
+#include "util/u_prim.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "nir/tgsi_to_nir.h"
+
+#define KEY_INIT_NO_ID()                              \
+   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
+   .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688, /* 0|1<<3|2<<6|3<<9 */  \
+   .base.tex.compressed_multisample_layout_mask = ~0
+#define KEY_INIT() .base.program_string_id = ish->program_id, KEY_INIT_NO_ID() /* needs `ish` in scope */
+
+static void
+crocus_sanitize_tex_key(struct brw_sampler_prog_key_data *key)
+{
+   for (unsigned sampler = MAX_SAMPLERS; sampler-- > 0; ) { /* every slot */
+      key->gfx6_gather_wa[sampler] = 0;
+      key->swizzles[sampler] = SWIZZLE_NOOP;
+   }
+   key->gather_channel_quirk_mask = 0; /* no sampler flagged for the quirk */
+}
+
+static uint32_t
+crocus_get_texture_swizzle(const struct crocus_context *ice,
+                           const struct crocus_sampler_view *t)
+{
+   /* Pack t->swizzle[0..3] at 3 bits apiece, element 0 in the low bits. */
+   uint32_t packed = 0;
+
+   for (int chan = 3; chan >= 0; chan--)
+      packed |= t->swizzle[chan] << (chan * 3);
+   return packed;
+}
+
+static inline bool can_push_ubo(const struct intel_device_info *devinfo)
+{
+   const bool is_snb = devinfo->ver == 6; /* push works everywhere but SNB for now */
+   return !is_snb;
+}
+
+static uint8_t
+gfx6_gather_workaround(enum pipe_format pformat)
+{
+   uint8_t wa = 0;
+   /* PIPE_FORMAT_R32_SINT and PIPE_FORMAT_R32_UINT have format overrides in
+    * the surface state too, but they need no shader w/a, hence wa stays 0.
+    */
+   switch (pformat) {
+   case PIPE_FORMAT_R8_SINT:  wa = WA_SIGN | WA_8BIT;  break;
+   case PIPE_FORMAT_R8_UINT:  wa = WA_8BIT;            break;
+   case PIPE_FORMAT_R16_SINT: wa = WA_SIGN | WA_16BIT; break;
+   case PIPE_FORMAT_R16_UINT: wa = WA_16BIT;           break;
+   default:                                            break;
+   }
+   return wa;
+}
+
+static const unsigned crocus_gfx6_swizzle_for_offset[4] = { /* indexed by start_component */
+   BRW_SWIZZLE4(0, 1, 2, 3), /* start_component 0 */
+   BRW_SWIZZLE4(1, 2, 3, 3), /* start_component 1 */
+   BRW_SWIZZLE4(2, 3, 3, 3), /* start_component 2 */
+   BRW_SWIZZLE4(3, 3, 3, 3)  /* start_component 3 */
+};
+
+static void
+gfx6_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+                  struct brw_gs_prog_data *gs_prog_data)
+{
+   const unsigned num_outputs = so_info->num_outputs;
+   /* transform_feedback_bindings[] stores VUE slots in unsigned chars, so
+    * the slot count has to fit in 8 bits.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
+
+   /* One binding table entry per component was set aside for transform
+    * feedback, so the output count should never exceed that reservation.
+    */
+   assert(num_outputs <= BRW_MAX_SOL_BINDINGS);
+
+   gs_prog_data->num_transform_feedback_bindings = num_outputs;
+   for (unsigned i = 0; i < num_outputs; i++) {
+      const struct pipe_stream_output *out = &so_info->output[i];
+      gs_prog_data->transform_feedback_bindings[i] = out->register_index;
+      gs_prog_data->transform_feedback_swizzles[i] =
+         crocus_gfx6_swizzle_for_offset[out->start_component];
+   }
+}
+
+static void
+gfx6_ff_gs_xfb_setup(const struct pipe_stream_output_info *so_info,
+                     struct brw_ff_gs_prog_key *key)
+{
+   key->num_transform_feedback_bindings = so_info->num_outputs;
+   for (unsigned i = 0; i < so_info->num_outputs; i++) { /* one entry per output */
+      const struct pipe_stream_output *out = &so_info->output[i];
+      key->transform_feedback_bindings[i] = out->register_index;
+      key->transform_feedback_swizzles[i] =
+         crocus_gfx6_swizzle_for_offset[out->start_component];
+   }
+}
+
+/* Fill in the sampler-dependent part of a shader key: per-sampler swizzles
+ * plus the gen6/gen7 texture-gather workaround fields, for every sampler
+ * set in the shader's textures_used[0] mask.
+ */
+static void
+crocus_populate_sampler_prog_key_data(struct crocus_context *ice,
+                                      const struct intel_device_info *devinfo,
+                                      gl_shader_stage stage,
+                                      struct crocus_uncompiled_shader *ish,
+                                      bool uses_texture_gather,
+                                      struct brw_sampler_prog_key_data *key)
+{
+   uint32_t mask = ish->nir->info.textures_used[0];
+   while (mask) {
+      const int s = u_bit_scan(&mask);
+      struct crocus_sampler_view *texture = ice->state.shaders[stage].textures[s];
+      key->swizzles[s] = SWIZZLE_NOOP;
+      key->scale_factors[s] = 0.0f;
+
+      /* Unbound samplers and buffer textures keep the defaults above. */
+      if (!texture)
+         continue;
+      if (texture->base.target == PIPE_BUFFER)
+         continue;
+      if (!devinfo->is_haswell)
+         key->swizzles[s] = crocus_get_texture_swizzle(ice, texture);
+
+      /* gather4 for RG32* is broken in multiple ways on Gen7. */
+      if (devinfo->ver == 7 && uses_texture_gather) {
+         switch (texture->base.format) {
+         case PIPE_FORMAT_R32G32_UINT:
+         case PIPE_FORMAT_R32G32_SINT: {
+            /* We have to override the format to R32G32_FLOAT_LD.
+             * This means that SCS_ALPHA and SCS_ONE will return 0x3f8
+             * (1.0) rather than integer 1.  This needs shader hacks.
+             *
+             * Rewrite to ONE every channel of this sampler's swizzle that
+             * selects W or ONE.  Channel i is the 3-bit field at bit 3*i of
+             * key->swizzles[s]; the array index is the sampler, not i.
+             */
+            const unsigned src_swizzle = key->swizzles[s];
+            for (int i = 0; i < 4; i++) {
+               unsigned src_comp = GET_SWZ(src_swizzle, i);
+               if (src_comp == SWIZZLE_ONE || src_comp == SWIZZLE_W) {
+                  key->swizzles[s] &= ~(0x7 << (3 * i));
+                  key->swizzles[s] |= SWIZZLE_ONE << (3 * i);
+               }
+            }
+         }
+         FALLTHROUGH;
+         case PIPE_FORMAT_R32G32_FLOAT:
+            /* The channel select for green doesn't work - we have to
+             * request blue.  Haswell can use SCS for this, but Ivybridge
+             * needs a shader workaround.
+             */
+            if (!devinfo->is_haswell)
+               key->gather_channel_quirk_mask |= 1 << s;
+            break;
+         default:
+            break;
+         }
+      }
+      if (devinfo->ver == 6 && uses_texture_gather)
+         key->gfx6_gather_wa[s] = gfx6_gather_workaround(texture->base.format);
+   }
+}
+
+static void
+crocus_lower_swizzles(struct nir_shader *nir,
+                      const struct brw_sampler_prog_key_data *key_tex)
+{
+   struct nir_lower_tex_options opts = { 0 };
+
+   /* Collect a lowering request for each used sampler with a real swizzle. */
+   for (uint32_t todo = nir->info.textures_used[0]; todo != 0; ) {
+      const int s = u_bit_scan(&todo);
+      const unsigned swz = key_tex->swizzles[s];
+
+      if (swz != SWIZZLE_NOOP) {
+         opts.swizzle_result |= (1 << s);
+         for (unsigned c = 0; c < 4; c++)
+            opts.swizzles[s][c] = GET_SWZ(swz, c);
+      }
+   }
+   if (opts.swizzle_result)
+      nir_lower_tex(nir, &opts);
+}
+
+static unsigned
+get_new_program_id(struct crocus_screen *screen)
+{
+   const unsigned fresh_id = p_atomic_inc_return(&screen->program_id); /* atomic bump */
+   return fresh_id;
+}
+
+static nir_ssa_def *
+get_aoa_deref_offset(nir_builder *b,
+                     nir_deref_instr *deref,
+                     unsigned elem_size)
+{
+   unsigned array_size = elem_size;
+   nir_ssa_def *offset = nir_imm_int(b, 0);
+   /* Walk up the deref chain to the variable, adding index * stride per array level. */
+   while (deref->deref_type != nir_deref_type_var) {
+      assert(deref->deref_type == nir_deref_type_array);
+
+      /* This level's element size is the previous level's array size */
+      nir_ssa_def *index = nir_ssa_for_src(b, deref->arr.index, 1);
+      assert(deref->arr.index.ssa);
+      offset = nir_iadd(b, offset,
+                        nir_imul(b, index, nir_imm_int(b, array_size)));
+      /* Step to the parent deref and scale the stride by that array's length. */
+      deref = nir_deref_instr_parent(deref);
+      assert(glsl_type_is_array(deref->type));
+      array_size *= glsl_get_length(deref->type);
+   }
+
+   /* Accessing an invalid surface index with the dataport can result in a
+    * hang.  According to the spec "if the index used to select an individual
+    * element is negative or greater than or equal to the size of the array,
+    * the results of the operation are undefined but may not lead to
+    * termination" -- which is one of the possible outcomes of the hang.
+    * Clamp the index to prevent access outside of the array bounds.
+    */
+   return nir_umin(b, offset, nir_imm_int(b, array_size - elem_size)); /* clamp to last element's offset */
+}
+
+static void
+crocus_lower_storage_image_derefs(nir_shader *nir)
+{
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   /* Only the shader's entrypoint implementation is walked. */
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   /* Visit every instruction; only the image_deref_* intrinsics below are touched. */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_image_deref_load:
+         case nir_intrinsic_image_deref_store:
+         case nir_intrinsic_image_deref_atomic_add:
+         case nir_intrinsic_image_deref_atomic_imin:
+         case nir_intrinsic_image_deref_atomic_umin:
+         case nir_intrinsic_image_deref_atomic_imax:
+         case nir_intrinsic_image_deref_atomic_umax:
+         case nir_intrinsic_image_deref_atomic_and:
+         case nir_intrinsic_image_deref_atomic_or:
+         case nir_intrinsic_image_deref_atomic_xor:
+         case nir_intrinsic_image_deref_atomic_exchange:
+         case nir_intrinsic_image_deref_atomic_comp_swap:
+         case nir_intrinsic_image_deref_size:
+         case nir_intrinsic_image_deref_samples:
+         case nir_intrinsic_image_deref_load_raw_intel:
+         case nir_intrinsic_image_deref_store_raw_intel: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+            /* index = variable's driver_location + clamped array-of-arrays offset */
+            b.cursor = nir_before_instr(&intrin->instr);
+            nir_ssa_def *index =
+               nir_iadd(&b, nir_imm_int(&b, var->data.driver_location),
+                        get_aoa_deref_offset(&b, deref, 1));
+            nir_rewrite_image_intrinsic(intrin, index, false); /* presumably swaps the deref for the index -- confirm */
+            break;
+         }
+
+         default:
+            break;
+         }
+      }
+   }
+}
+
+// XXX: need unify_interfaces() at link time...
+
+/**
+ * Undo nir_lower_passthrough_edgeflags: demote the VARYING_SLOT_EDGE output
+ * to a shader temp and clear the matching outputs_written/inputs_read bits.
+ */
+static bool
+crocus_fix_edge_flags(nir_shader *nir)
+{
+   if (nir->info.stage != MESA_SHADER_VERTEX) {
+      nir_shader_preserve_all_metadata(nir);
+      return false;
+   }
+
+   nir_variable *var = nir_find_variable_with_location(nir, nir_var_shader_out,
+                                                       VARYING_SLOT_EDGE);
+   if (!var) {
+      nir_shader_preserve_all_metadata(nir);
+      return false;
+   }
+
+   var->data.mode = nir_var_shader_temp;
+   nir->info.outputs_written &= ~VARYING_BIT_EDGE;
+   nir->info.inputs_read &= ~VERT_BIT_EDGEFLAG;
+   nir_fixup_deref_modes(nir);
+
+   nir_foreach_function(f, nir) {
+      /* No impl means no metadata; never hand NULL to nir_metadata_preserve. */
+      if (!f->impl)
+         continue;
+      nir_metadata_preserve(f->impl, nir_metadata_block_index |
+                            nir_metadata_dominance |
+                            nir_metadata_live_ssa_defs |
+                            nir_metadata_loop_analysis);
+   }
+
+   return true;
+}
+
+/**
+ * Rewrite the register indices of a stream output description in place.
+ *
+ * Core Gallium stores output->register_index as a dense "slot" number: the
+ * Nth set bit of outputs_written is slot N.  Our own layout comes from the
+ * VUE map, which only exists once a specific shader variant is compiled, so
+ * each slot is translated back to its VARYING_SLOT_* value and that is what
+ * gets stored in register_index.
+ *
+ * VARYING_SLOT_{LAYER,VIEWPORT,PSIZ} are additionally redirected to the
+ * Y/Z/W components of the VUE header.  See brw_vue_map.c for the layout.
+ */
+static void
+update_so_info(struct pipe_stream_output_info *so_info,
+               uint64_t outputs_written)
+{
+   /* slot_to_varying[N] is the bit position of the Nth set bit. */
+   uint8_t slot_to_varying[64] = {};
+   unsigned num_slots = 0;
+
+   for (uint64_t pending = outputs_written; pending != 0; )
+      slot_to_varying[num_slots++] = u_bit_scan64(&pending);
+
+   for (unsigned n = 0; n < so_info->num_outputs; n++) {
+      struct pipe_stream_output *out = &so_info->output[n];
+
+      /* Gallium's condensed slot -> real VARYING_SLOT_* enum */
+      out->register_index = slot_to_varying[out->register_index];
+
+      /* Three scalars share the VUE header, all addressed through
+       * VARYING_SLOT_PSIZ:
+       *   .y = gl_Layer, .z = gl_ViewportIndex, .w = gl_PointSize
+       */
+      const bool is_layer = out->register_index == VARYING_SLOT_LAYER;
+      const bool is_viewport = out->register_index == VARYING_SLOT_VIEWPORT;
+      const bool is_psiz = out->register_index == VARYING_SLOT_PSIZ;
+
+      if (!is_layer && !is_viewport && !is_psiz)
+         continue;
+
+      assert(out->num_components == 1);
+      out->start_component = is_layer ? 1 : (is_viewport ? 2 : 3);
+      out->register_index = VARYING_SLOT_PSIZ;
+   }
+}
+
+/**
+ * Fill one vec4 worth of image system values.
+ *
+ * The first n entries of sysvals reference consecutive dwords of image idx's
+ * parameter block, starting at byte offset `offset`; the remaining entries
+ * up to four are set to BRW_PARAM_BUILTIN_ZERO.
+ */
+static void
+setup_vec4_image_sysval(uint32_t *sysvals, uint32_t idx,
+                        unsigned offset, unsigned n)
+{
+   assert(offset % sizeof(uint32_t) == 0);
+
+   unsigned comp = 0;
+
+   /* Live components: one BRW_PARAM_IMAGE reference per dword. */
+   for (; comp < n; comp++)
+      sysvals[comp] = BRW_PARAM_IMAGE(idx, offset / sizeof(uint32_t) + comp);
+
+   /* Pad whatever is left of the vec4. */
+   for (; comp < 4; comp++)
+      sysvals[comp] = BRW_PARAM_BUILTIN_ZERO;
+}
+
+/**
+ * Lower system value and load_constant intrinsics to load_ubo, collecting
+ * the builtins needed in *out_system_values (ralloc'd from mem_ctx, or NULL)
+ * and reporting the resulting constant buffer count in *out_num_cbufs.
+ */
+static void
+crocus_setup_uniforms(const struct brw_compiler *compiler,
+                      void *mem_ctx,
+                      nir_shader *nir,
+                      struct brw_stage_prog_data *prog_data,
+                      enum brw_param_builtin **out_system_values,
+                      unsigned *out_num_system_values,
+                      unsigned *out_num_cbufs)
+{
+   UNUSED const struct intel_device_info *devinfo = compiler->devinfo;
+
+   const unsigned CROCUS_MAX_SYSTEM_VALUES =
+      PIPE_MAX_SHADER_IMAGES * BRW_IMAGE_PARAM_SIZE;
+   enum brw_param_builtin *system_values =
+      rzalloc_array(mem_ctx, enum brw_param_builtin, CROCUS_MAX_SYSTEM_VALUES);
+   unsigned num_system_values = 0;
+
+   unsigned patch_vert_idx = -1; /* -1 == no slot allocated yet */
+   unsigned ucp_idx[CROCUS_MAX_CLIP_PLANES];
+   unsigned img_idx[PIPE_MAX_SHADER_IMAGES];
+   unsigned variable_group_size_idx = -1;
+   memset(ucp_idx, -1, sizeof(ucp_idx));
+   memset(img_idx, -1, sizeof(img_idx));
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   /* temp_*_ubo_name are placeholder UBO indices, patched further down. */
+   b.cursor = nir_before_block(nir_start_block(impl));
+   nir_ssa_def *temp_ubo_name = nir_ssa_undef(&b, 1, 32);
+   nir_ssa_def *temp_const_ubo_name = NULL;
+
+   /* Turn system value intrinsics into uniforms */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         nir_ssa_def *offset;
+
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_constant: {
+            /* This one is special because it reads from the shader constant
+             * data and not cbuf0 which gallium uploads for us.
+             */
+            b.cursor = nir_before_instr(instr);
+            nir_ssa_def *offset =
+               nir_iadd_imm(&b, nir_ssa_for_src(&b, intrin->src[0], 1),
+                            nir_intrinsic_base(intrin));
+
+            if (temp_const_ubo_name == NULL)
+               temp_const_ubo_name = nir_imm_int(&b, 0);
+
+            nir_intrinsic_instr *load_ubo =
+               nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ubo);
+            load_ubo->num_components = intrin->num_components;
+            load_ubo->src[0] = nir_src_for_ssa(temp_const_ubo_name);
+            load_ubo->src[1] = nir_src_for_ssa(offset);
+            nir_intrinsic_set_align(load_ubo, 4, 0);
+            nir_intrinsic_set_range_base(load_ubo, 0);
+            nir_intrinsic_set_range(load_ubo, ~0);
+            nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest,
+                              intrin->dest.ssa.num_components,
+                              intrin->dest.ssa.bit_size,
+                              intrin->dest.ssa.name);
+            nir_builder_instr_insert(&b, &load_ubo->instr);
+
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     &load_ubo->dest.ssa);
+            nir_instr_remove(&intrin->instr);
+            continue;
+         }
+         case nir_intrinsic_load_user_clip_plane: {
+            unsigned ucp = nir_intrinsic_ucp_id(intrin);
+
+            if (ucp_idx[ucp] == -1) {
+               ucp_idx[ucp] = num_system_values;
+               num_system_values += 4;
+            }
+
+            for (int i = 0; i < 4; i++) {
+               system_values[ucp_idx[ucp] + i] =
+                  BRW_PARAM_BUILTIN_CLIP_PLANE(ucp, i);
+            }
+
+            b.cursor = nir_before_instr(instr);
+            offset = nir_imm_int(&b, ucp_idx[ucp] * sizeof(uint32_t));
+            break;
+         }
+         case nir_intrinsic_load_patch_vertices_in:
+            if (patch_vert_idx == -1)
+               patch_vert_idx = num_system_values++;
+
+            system_values[patch_vert_idx] =
+               BRW_PARAM_BUILTIN_PATCH_VERTICES_IN;
+
+            b.cursor = nir_before_instr(instr);
+            offset = nir_imm_int(&b, patch_vert_idx * sizeof(uint32_t));
+            break;
+         case nir_intrinsic_image_deref_load_param_intel: {
+            assert(devinfo->ver < 9);
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+
+            if (img_idx[var->data.binding] == -1) {
+               /* GL only allows arrays of arrays of images. */
+               assert(glsl_type_is_image(glsl_without_array(var->type)));
+               unsigned num_images = MAX2(1, glsl_get_aoa_size(var->type));
+
+               for (int i = 0; i < num_images; i++) {
+                  const unsigned img = var->data.binding + i;
+
+                  img_idx[img] = num_system_values;
+                  num_system_values += BRW_IMAGE_PARAM_SIZE;
+
+                  uint32_t *img_sv = &system_values[img_idx[img]];
+
+                  setup_vec4_image_sysval(
+                     img_sv + BRW_IMAGE_PARAM_OFFSET_OFFSET, img,
+                     offsetof(struct brw_image_param, offset), 2);
+                  setup_vec4_image_sysval(
+                     img_sv + BRW_IMAGE_PARAM_SIZE_OFFSET, img,
+                     offsetof(struct brw_image_param, size), 3);
+                  setup_vec4_image_sysval(
+                     img_sv + BRW_IMAGE_PARAM_STRIDE_OFFSET, img,
+                     offsetof(struct brw_image_param, stride), 4);
+                  setup_vec4_image_sysval(
+                     img_sv + BRW_IMAGE_PARAM_TILING_OFFSET, img,
+                     offsetof(struct brw_image_param, tiling), 3);
+                  setup_vec4_image_sysval(
+                     img_sv + BRW_IMAGE_PARAM_SWIZZLING_OFFSET, img,
+                     offsetof(struct brw_image_param, swizzling), 2);
+               }
+            }
+
+            b.cursor = nir_before_instr(instr);
+            offset = nir_iadd(&b,
+                              get_aoa_deref_offset(&b, deref, BRW_IMAGE_PARAM_SIZE * 4),
+                              nir_imm_int(&b, img_idx[var->data.binding] * 4 +
+                                          nir_intrinsic_base(intrin) * 16));
+            break;
+         }
+         case nir_intrinsic_load_workgroup_size: {
+            assert(nir->info.workgroup_size_variable);
+            if (variable_group_size_idx == -1) {
+               variable_group_size_idx = num_system_values;
+               num_system_values += 3;
+               for (int i = 0; i < 3; i++) {
+                  system_values[variable_group_size_idx + i] =
+                     BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i;
+               }
+            }
+
+            b.cursor = nir_before_instr(instr);
+            offset = nir_imm_int(&b,
+                                 variable_group_size_idx * sizeof(uint32_t));
+            break;
+         }
+         default:
+            continue;
+         }
+
+         unsigned comps = nir_intrinsic_dest_components(intrin);
+
+         nir_intrinsic_instr *load =
+            nir_intrinsic_instr_create(nir, nir_intrinsic_load_ubo);
+         load->num_components = comps;
+         load->src[0] = nir_src_for_ssa(temp_ubo_name);
+         load->src[1] = nir_src_for_ssa(offset);
+         nir_intrinsic_set_align(load, 4, 0);
+         nir_intrinsic_set_range_base(load, 0);
+         nir_intrinsic_set_range(load, ~0);
+         nir_ssa_dest_init(&load->instr, &load->dest, comps, 32, NULL);
+         nir_builder_instr_insert(&b, &load->instr);
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  &load->dest.ssa);
+         nir_instr_remove(instr);
+      }
+   }
+
+   nir_validate_shader(nir, "before remapping");
+
+   /* Uniforms are stored in constant buffer 0, the
+    * user-facing UBOs are indexed by one.  So if any constant buffer is
+    * needed, the constant buffer 0 will be needed, so account for it.
+    */
+   unsigned num_cbufs = nir->info.num_ubos;
+   if (num_cbufs || nir->num_uniforms)
+      num_cbufs++;
+
+   /* Place the new params in a new cbuf. */
+   if (num_system_values > 0) {
+      unsigned sysval_cbuf_index = num_cbufs;
+      num_cbufs++;
+
+      system_values = reralloc(mem_ctx, system_values, enum brw_param_builtin,
+                               num_system_values);
+
+      nir_foreach_block(block, impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+
+            if (load->intrinsic != nir_intrinsic_load_ubo)
+               continue;
+
+            b.cursor = nir_before_instr(instr);
+
+            assert(load->src[0].is_ssa);
+
+            if (load->src[0].ssa == temp_ubo_name) {
+               nir_ssa_def *imm = nir_imm_int(&b, sysval_cbuf_index);
+               nir_instr_rewrite_src(instr, &load->src[0],
+                                     nir_src_for_ssa(imm));
+            }
+         }
+      }
+
+      /* We need to fold the new iadds for brw_nir_analyze_ubo_ranges */
+      nir_opt_constant_folding(nir);
+   } else {
+      ralloc_free(system_values);
+      system_values = NULL;
+   }
+
+   assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS);
+   nir_validate_shader(nir, "after remap");
+
+   /* We don't use params[] but gallium leaves num_uniforms set.  We used it
+    * above to detect whether cbuf0 exists, but we don't need it anymore once
+    * we get here.  Zero it out so that the back-end doesn't get confused
+    * when num_uniforms doesn't match nr_params * 4.
+    */
+   nir->num_uniforms = 0;
+
+   /* Constant loads (if any) need to go at the end of the constant buffers so
+    * we need to know num_cbufs before we can lower to them.
+    */
+   if (temp_const_ubo_name != NULL) {
+      nir_load_const_instr *const_ubo_index =
+         nir_instr_as_load_const(temp_const_ubo_name->parent_instr);
+      assert(const_ubo_index->def.bit_size == 32);
+      const_ubo_index->value[0].u32 = num_cbufs;
+   }
+
+   *out_system_values = system_values;
+   *out_num_system_values = num_system_values;
+   *out_num_cbufs = num_cbufs;
+}
+
+static const char *surface_group_names[] = { /* by CROCUS_SURFACE_GROUP_* */
+   [CROCUS_SURFACE_GROUP_RENDER_TARGET]      = "render target",
+   [CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = "non-coherent render target read",
+   [CROCUS_SURFACE_GROUP_SOL]                = "streamout",
+   [CROCUS_SURFACE_GROUP_CS_WORK_GROUPS]     = "CS work groups",
+   [CROCUS_SURFACE_GROUP_TEXTURE]            = "texture",
+   [CROCUS_SURFACE_GROUP_TEXTURE_GATHER]     = "texture gather",
+   [CROCUS_SURFACE_GROUP_UBO]                = "ubo",
+   [CROCUS_SURFACE_GROUP_SSBO]               = "ssbo",
+   [CROCUS_SURFACE_GROUP_IMAGE]              = "image",
+};
+
+/**
+ * Dump a human-readable description of a binding table to fp.
+ *
+ * A header line reports the entry count (and, when they differ, the count
+ * before and after compaction); it is followed by one line per used surface,
+ * numbered consecutively across the surface groups.
+ */
+static void
+crocus_print_binding_table(FILE *fp, const char *name,
+                           const struct crocus_binding_table *bt)
+{
+   STATIC_ASSERT(ARRAY_SIZE(surface_group_names) == CROCUS_SURFACE_GROUP_COUNT);
+
+   uint32_t num_declared = 0;
+   uint32_t num_used = 0;
+
+   for (int group = 0; group < CROCUS_SURFACE_GROUP_COUNT; group++) {
+      const uint32_t group_size = bt->sizes[group];
+
+      /* Empty groups contribute to neither counter. */
+      if (group_size == 0)
+         continue;
+
+      num_declared += group_size;
+      num_used += util_bitcount64(bt->used_mask[group]);
+   }
+
+   if (num_declared == 0) {
+      fprintf(fp, "Binding table for %s is empty\n\n", name);
+      return;
+   }
+
+   if (num_declared == num_used) {
+      fprintf(fp, "Binding table for %s (%u entries)\n", name, num_declared);
+   } else {
+      fprintf(fp, "Binding table for %s "
+              "(compacted to %u entries from %u entries)\n",
+              name, num_used, num_declared);
+   }
+
+   uint32_t bti = 0;
+   for (int group = 0; group < CROCUS_SURFACE_GROUP_COUNT; group++) {
+      for (uint64_t pending = bt->used_mask[group]; pending != 0; bti++) {
+         const int index = u_bit_scan64(&pending);
+         fprintf(fp, "  [%u] %s #%d\n", bti, surface_group_names[group], index);
+      }
+   }
+   fprintf(fp, "\n");
+}
+
+enum {
+   /* Max elements in a surface group (used_mask[] is a 64-bit bitfield). */
+   SURFACE_GROUP_MAX_ELEMENTS = 64,
+};
+
+/**
+ * Translate a <group, index> pair into a binding table index.
+ *
+ * For example: <UBO, 5> => binding table index 12
+ *
+ * Returns CROCUS_SURFACE_NOT_USED when the surface is not marked in the
+ * group's used mask.
+ */
+uint32_t
+crocus_group_index_to_bti(const struct crocus_binding_table *bt,
+                          enum crocus_surface_group group, uint32_t index)
+{
+   assert(index < bt->sizes[group]);
+
+   const uint64_t used = bt->used_mask[group];
+   const uint64_t this_bit = 1ull << index;
+
+   if ((used & this_bit) == 0)
+      return CROCUS_SURFACE_NOT_USED;
+
+   /* The entry's position within the group is the number of used surfaces
+    * that precede it.
+    */
+   const uint64_t preceding = used & (this_bit - 1);
+   return bt->offsets[group] + util_bitcount64(preceding);
+}
+
+/**
+ * Translate a binding table index back into an index within its group.
+ *
+ * For example: binding table index 12 => <UBO, 5>
+ *
+ * Returns CROCUS_SURFACE_NOT_USED when bti lies past the group's last used
+ * surface.
+ */
+uint32_t
+crocus_bti_to_group_index(const struct crocus_binding_table *bt,
+                          enum crocus_surface_group group, uint32_t bti)
+{
+   assert(bti >= bt->offsets[group]);
+
+   /* Walk the used surfaces in ascending order until we have skipped as
+    * many as bti sits past the start of the group.
+    */
+   uint32_t to_skip = bti - bt->offsets[group];
+   for (uint64_t pending = bt->used_mask[group]; pending != 0; to_skip--) {
+      const int surface = u_bit_scan64(&pending);
+      if (to_skip == 0)
+         return surface;
+   }
+
+   return CROCUS_SURFACE_NOT_USED;
+}
+
+/**
+ * Replace a surface index source of instr with a binding table index.
+ *
+ * Constant sources are translated through crocus_group_index_to_bti();
+ * non-constant sources get the group's base offset added to them.
+ */
+static void
+rewrite_src_with_bti(nir_builder *b, struct crocus_binding_table *bt,
+                     nir_instr *instr, nir_src *src,
+                     enum crocus_surface_group group)
+{
+   assert(bt->sizes[group] > 0);
+
+   b->cursor = nir_before_instr(instr);
+
+   nir_ssa_def *new_index;
+   if (!nir_src_is_const(*src)) {
+      /* An indirect access keeps every surface of the group in the table,
+       * so adding the base of the group is all that is needed.
+       */
+      assert(bt->used_mask[group] == BITFIELD64_MASK(bt->sizes[group]));
+      new_index = nir_iadd_imm(b, src->ssa, bt->offsets[group]);
+   } else {
+      const uint32_t group_index = nir_src_as_uint(*src);
+      const uint32_t table_index =
+         crocus_group_index_to_bti(bt, group, group_index);
+      new_index = nir_imm_intN_t(b, table_index, src->ssa->bit_size);
+   }
+
+   nir_instr_rewrite_src(instr, src, nir_src_for_ssa(new_index));
+}
+
+/**
+ * Record in bt->used_mask[group] the surface(s) referenced by src.
+ *
+ * A constant source marks exactly one surface; any other source marks the
+ * whole group.
+ */
+static void
+mark_used_with_src(struct crocus_binding_table *bt, nir_src *src,
+                   enum crocus_surface_group group)
+{
+   assert(bt->sizes[group] > 0);
+
+   if (!nir_src_is_const(*src)) {
+      /* Indirect usage: every surface of the group has to stay. */
+      bt->used_mask[group] = BITFIELD64_MASK(bt->sizes[group]);
+      return;
+   }
+
+   const uint64_t surface = nir_src_as_uint(*src);
+   assert(surface < bt->sizes[group]);
+   bt->used_mask[group] |= 1ull << surface;
+}
+
+/**
+ * Report whether INTEL_DISABLE_COMPACT_BINDING_TABLE is set.
+ *
+ * The environment is consulted on the first call only; the answer is cached
+ * in a function-local static afterwards.
+ */
+static bool
+skip_compacting_binding_tables(void)
+{
+   static int cached = -1;
+
+   if (cached >= 0)
+      return cached;
+
+   cached = env_var_as_boolean("INTEL_DISABLE_COMPACT_BINDING_TABLE", false);
+   return cached;
+}
+
+/**
+ * Fill *bt (sizes, used masks, offsets) and rewrite NIR surface indices to BTIs.
+ */
+static void
+crocus_setup_binding_table(const struct intel_device_info *devinfo,
+                           struct nir_shader *nir,
+                           struct crocus_binding_table *bt,
+                           unsigned num_render_targets,
+                           unsigned num_system_values,
+                           unsigned num_cbufs,
+                           const struct brw_sampler_prog_key_data *key)
+{
+   const struct shader_info *info = &nir->info;
+
+   memset(bt, 0, sizeof(*bt));
+
+   /* Set the sizes for each surface group.  For some groups, we already know
+    * upfront how many will be used, so mark them.
+    */
+   if (info->stage == MESA_SHADER_FRAGMENT) {
+      bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET] = num_render_targets;
+      /* All render targets used. */
+      bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET] =
+         BITFIELD64_MASK(num_render_targets);
+
+      /* Setup render target read surface group in order to support non-coherent
+       * framebuffer fetch on Gfx7
+       */
+      if (devinfo->ver >= 6 && info->outputs_read) {
+         bt->sizes[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] = num_render_targets;
+         bt->used_mask[CROCUS_SURFACE_GROUP_RENDER_TARGET_READ] =
+            BITFIELD64_MASK(num_render_targets);
+      }
+   } else if (info->stage == MESA_SHADER_COMPUTE) {
+      bt->sizes[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+   } else if (info->stage == MESA_SHADER_GEOMETRY) {
+      /* In gfx6 we reserve the first BRW_MAX_SOL_BINDINGS entries for transform
+       * feedback surfaces.
+       */
+      if (devinfo->ver == 6) {
+         bt->sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+         bt->used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+      }
+   }
+
+   bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE] = BITSET_LAST_BIT(info->textures_used);
+   bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE] = info->textures_used[0];
+
+   if (info->uses_texture_gather) {
+      bt->sizes[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = BITSET_LAST_BIT(info->textures_used);
+      bt->used_mask[CROCUS_SURFACE_GROUP_TEXTURE_GATHER] = info->textures_used[0];
+   }
+
+   bt->sizes[CROCUS_SURFACE_GROUP_IMAGE] = info->num_images;
+
+   /* Allocate an extra slot in the UBO section for NIR constants.
+    * Binding table compaction will remove it if unnecessary.
+    *
+    * We don't include them in crocus_compiled_shader::num_cbufs because
+    * they are uploaded separately from shs->constbufs[], but from a shader
+    * point of view, they're another UBO (at the end of the section).
+    */
+   bt->sizes[CROCUS_SURFACE_GROUP_UBO] = num_cbufs + 1;
+
+   bt->sizes[CROCUS_SURFACE_GROUP_SSBO] = info->num_ssbos;
+
+   for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+      assert(bt->sizes[i] <= SURFACE_GROUP_MAX_ELEMENTS);
+
+   /* Mark surfaces used for the cases we don't have the information available
+    * upfront.
+    */
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   nir_foreach_block (block, impl) {
+      nir_foreach_instr (instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_num_workgroups:
+            bt->used_mask[CROCUS_SURFACE_GROUP_CS_WORK_GROUPS] = 1;
+            break;
+
+         case nir_intrinsic_load_output:
+            if (devinfo->ver >= 6) {
+               mark_used_with_src(bt, &intrin->src[0],
+                                  CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+            }
+            break;
+
+         case nir_intrinsic_image_size:
+         case nir_intrinsic_image_load:
+         case nir_intrinsic_image_store:
+         case nir_intrinsic_image_atomic_add:
+         case nir_intrinsic_image_atomic_imin:
+         case nir_intrinsic_image_atomic_umin:
+         case nir_intrinsic_image_atomic_imax:
+         case nir_intrinsic_image_atomic_umax:
+         case nir_intrinsic_image_atomic_and:
+         case nir_intrinsic_image_atomic_or:
+         case nir_intrinsic_image_atomic_xor:
+         case nir_intrinsic_image_atomic_exchange:
+         case nir_intrinsic_image_atomic_comp_swap:
+         case nir_intrinsic_image_load_raw_intel:
+         case nir_intrinsic_image_store_raw_intel:
+            mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_IMAGE);
+            break;
+
+         case nir_intrinsic_load_ubo:
+            mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_UBO);
+            break;
+
+         case nir_intrinsic_store_ssbo:
+            mark_used_with_src(bt, &intrin->src[1], CROCUS_SURFACE_GROUP_SSBO);
+            break;
+
+         case nir_intrinsic_get_ssbo_size:
+         case nir_intrinsic_ssbo_atomic_add:
+         case nir_intrinsic_ssbo_atomic_imin:
+         case nir_intrinsic_ssbo_atomic_umin:
+         case nir_intrinsic_ssbo_atomic_imax:
+         case nir_intrinsic_ssbo_atomic_umax:
+         case nir_intrinsic_ssbo_atomic_and:
+         case nir_intrinsic_ssbo_atomic_or:
+         case nir_intrinsic_ssbo_atomic_xor:
+         case nir_intrinsic_ssbo_atomic_exchange:
+         case nir_intrinsic_ssbo_atomic_comp_swap:
+         case nir_intrinsic_ssbo_atomic_fmin:
+         case nir_intrinsic_ssbo_atomic_fmax:
+         case nir_intrinsic_ssbo_atomic_fcomp_swap:
+         case nir_intrinsic_load_ssbo:
+            mark_used_with_src(bt, &intrin->src[0], CROCUS_SURFACE_GROUP_SSBO);
+            break;
+
+         default:
+            break;
+         }
+      }
+   }
+
+   /* When compaction is disabled, we just mark everything as used. */
+   if (unlikely(skip_compacting_binding_tables())) {
+      for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++)
+         bt->used_mask[i] = BITFIELD64_MASK(bt->sizes[i]);
+   }
+
+   /* Calculate the offsets and the binding table size based on the used
+    * surfaces.  After this point, the functions to go between "group indices"
+    * and binding table indices can be used.
+    */
+   uint32_t next = 0;
+   for (int i = 0; i < CROCUS_SURFACE_GROUP_COUNT; i++) {
+      if (bt->used_mask[i] != 0) {
+         bt->offsets[i] = next;
+         next += util_bitcount64(bt->used_mask[i]);
+      }
+   }
+   bt->size_bytes = next * 4;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_BT)) {
+      crocus_print_binding_table(stderr, gl_shader_stage_name(info->stage), bt);
+   }
+
+   /* Apply the binding table indices.  The backend compiler is not expected
+    * to change those, as we haven't set any of the *_start entries in brw
+    * binding_table.
+    */
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block (block, impl) {
+      nir_foreach_instr (instr, block) {
+         if (instr->type == nir_instr_type_tex) {
+            nir_tex_instr *tex = nir_instr_as_tex(instr);
+            bool is_gather = tex->op == nir_texop_tg4;
+
+            /* rewrite the tg4 component from green to blue before replacing the
+               texture index */
+            if (devinfo->ver == 7 && !devinfo->is_haswell) {
+               if (tex->component == 1)
+                  if (key->gather_channel_quirk_mask & (1 << tex->texture_index))
+                     tex->component = 2;
+            }
+            /* Gfx6 gather WA: scale result to integer, sign-extend if WA_SIGN. */
+            if (is_gather && devinfo->ver == 6 && key->gfx6_gather_wa[tex->texture_index]) {
+               b.cursor = nir_after_instr(instr);
+               enum gfx6_gather_sampler_wa wa = key->gfx6_gather_wa[tex->texture_index];
+               int width = (wa & WA_8BIT) ? 8 : 16;
+
+               nir_ssa_def *val = nir_fmul_imm(&b, &tex->dest.ssa, (1 << width) - 1);
+               val = nir_f2u32(&b, val);
+               if (wa & WA_SIGN) {
+                  val = nir_ishl(&b, val, nir_imm_int(&b, 32 - width));
+                  val = nir_ishr(&b, val, nir_imm_int(&b, 32 - width));
+               }
+               nir_ssa_def_rewrite_uses_after(&tex->dest.ssa, val, val->parent_instr);
+            }
+
+            tex->texture_index =
+               crocus_group_index_to_bti(bt, is_gather ? CROCUS_SURFACE_GROUP_TEXTURE_GATHER : CROCUS_SURFACE_GROUP_TEXTURE,
+                                         tex->texture_index);
+            continue;
+         }
+
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_image_size:
+         case nir_intrinsic_image_load:
+         case nir_intrinsic_image_store:
+         case nir_intrinsic_image_atomic_add:
+         case nir_intrinsic_image_atomic_imin:
+         case nir_intrinsic_image_atomic_umin:
+         case nir_intrinsic_image_atomic_imax:
+         case nir_intrinsic_image_atomic_umax:
+         case nir_intrinsic_image_atomic_and:
+         case nir_intrinsic_image_atomic_or:
+         case nir_intrinsic_image_atomic_xor:
+         case nir_intrinsic_image_atomic_exchange:
+         case nir_intrinsic_image_atomic_comp_swap:
+         case nir_intrinsic_image_load_raw_intel:
+         case nir_intrinsic_image_store_raw_intel:
+            rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+                                 CROCUS_SURFACE_GROUP_IMAGE);
+            break;
+
+         case nir_intrinsic_load_ubo:
+            rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+                                 CROCUS_SURFACE_GROUP_UBO);
+            break;
+
+         case nir_intrinsic_store_ssbo:
+            rewrite_src_with_bti(&b, bt, instr, &intrin->src[1],
+                                 CROCUS_SURFACE_GROUP_SSBO);
+            break;
+
+         case nir_intrinsic_load_output:
+            if (devinfo->ver >= 6) {
+               rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+                                    CROCUS_SURFACE_GROUP_RENDER_TARGET_READ);
+            }
+            break;
+
+         case nir_intrinsic_get_ssbo_size:
+         case nir_intrinsic_ssbo_atomic_add:
+         case nir_intrinsic_ssbo_atomic_imin:
+         case nir_intrinsic_ssbo_atomic_umin:
+         case nir_intrinsic_ssbo_atomic_imax:
+         case nir_intrinsic_ssbo_atomic_umax:
+         case nir_intrinsic_ssbo_atomic_and:
+         case nir_intrinsic_ssbo_atomic_or:
+         case nir_intrinsic_ssbo_atomic_xor:
+         case nir_intrinsic_ssbo_atomic_exchange:
+         case nir_intrinsic_ssbo_atomic_comp_swap:
+         case nir_intrinsic_ssbo_atomic_fmin:
+         case nir_intrinsic_ssbo_atomic_fmax:
+         case nir_intrinsic_ssbo_atomic_fcomp_swap:
+         case nir_intrinsic_load_ssbo:
+            rewrite_src_with_bti(&b, bt, instr, &intrin->src[0],
+                                 CROCUS_SURFACE_GROUP_SSBO);
+            break;
+
+         default:
+            break;
+         }
+      }
+   }
+}
+
+/**
+ * Log that a shader is being recompiled and let the compiler report how the
+ * new key differs from the previously compiled one.
+ *
+ * Does nothing when info is NULL.
+ */
+static void
+crocus_debug_recompile(struct crocus_context *ice,
+                       struct shader_info *info,
+                       const struct brw_base_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+
+   if (info == NULL)
+      return;
+
+   const char *prog_name = info->name ? info->name : "(no identifier)";
+   const char *prog_label = info->label ? info->label : "";
+
+   compiler->shader_perf_log(&ice->dbg,
+                             "Recompiling %s shader for program %s: %s\n",
+                             _mesa_shader_stage_to_string(info->stage),
+                             prog_name, prog_label);
+
+   const void *previous_key =
+      crocus_find_previous_compile(ice, info->stage, key->program_string_id);
+
+   brw_debug_key_recompile(compiler, &ice->dbg, info->stage, previous_key, key);
+}
+
+/**
+ * Return the last enabled geometry-processing stage.
+ *
+ * This stage is the one which will feed stream output and the rasterizer:
+ * the geometry shader if one is bound, otherwise the tessellation evaluation
+ * shader, otherwise the vertex shader.
+ */
+static gl_shader_stage
+last_vue_stage(struct crocus_context *ice)
+{
+   /* Candidates ordered from latest to earliest pipeline position. */
+   static const gl_shader_stage candidates[] = {
+      MESA_SHADER_GEOMETRY,
+      MESA_SHADER_TESS_EVAL,
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(candidates); i++) {
+      if (ice->shaders.uncompiled[candidates[i]])
+         return candidates[i];
+   }
+
+   return MESA_SHADER_VERTEX;
+}
+
+/**
+ * Compute the set of varying slots a vertex shader variant writes.
+ *
+ * Starts from user_varyings and adds the slots the program key forces on.
+ */
+static GLbitfield64
+crocus_vs_outputs_written(struct crocus_context *ice,
+                          const struct brw_vs_prog_key *key,
+                          GLbitfield64 user_varyings)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const bool pre_gfx6 = screen->devinfo.ver < 6;
+   GLbitfield64 slots = user_varyings;
+
+   if (pre_gfx6) {
+      if (key->copy_edgeflag)
+         slots |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
+
+      /* Reserve dummy VUE slots for the SF to drop replaced point sprite
+       * coordinates into.  They cost URB space we would rather not spend,
+       * but without them the SF would not see nicely aligned pairs of input
+       * and output coords, which would be a pain to handle.
+       */
+      for (unsigned coord = 0; coord < 8; coord++) {
+         if (key->point_coord_replace & (1 << coord))
+            slots |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + coord);
+      }
+
+      /* A written back color needs a slot for its front color as well. */
+      if (slots & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+         slots |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+      if (slots & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+         slots |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+   }
+
+   /* Legacy clipping needs the clip distance slots populated whenever
+    * clipping is enabled, even if the shader never writes gl_ClipDistance.
+    */
+   if (key->nr_userclip_plane_consts > 0) {
+      slots |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0) |
+               BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+   }
+
+   return slots;
+}
+
+/*
+ * gen4/5 require the clip shader to be given an edgeflag.  When none comes
+ * from the user, append a store of a default one at the end of the shader.
+ *
+ * The default is always 1.0.
+ */
+static void
+crocus_lower_default_edgeflags(struct nir_shader *nir)
+{
+   nir_function_impl *entrypoint = nir_shader_get_entrypoint(nir);
+
+   nir_builder builder;
+   nir_builder_init(&builder, entrypoint);
+   builder.cursor = nir_after_cf_list(&builder.impl->body);
+
+   nir_variable *edgeflag = nir_variable_create(nir, nir_var_shader_out,
+                                                glsl_float_type(),
+                                                "edgeflag");
+   edgeflag->data.location = VARYING_SLOT_EDGE;
+
+   nir_ssa_def *one = nir_imm_float(&builder, 1.0);
+   nir_store_var(&builder, edgeflag, one, 0x1);
+}
+
+/**
+ * Compile a vertex shader, and upload the assembly.
+ *
+ * Works on a private clone of ish->nir (allocated out of a temporary ralloc
+ * context that is freed before returning), applies the key-dependent NIR
+ * lowering, runs the backend compiler and uploads the result into the
+ * program cache under the caller's key.
+ *
+ * Returns the uploaded shader variant, or NULL if brw_compile_vs() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_vs(struct crocus_context *ice,
+                  struct crocus_uncompiled_shader *ish,
+                  const struct brw_vs_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_vs_prog_data *vs_prog_data =
+      rzalloc(mem_ctx, struct brw_vs_prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &vs_prog_data->base;
+   struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+   enum brw_param_builtin *system_values;
+   unsigned num_system_values;
+   unsigned num_cbufs;
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+   /* Lower user clip planes in NIR, one mask bit per enabled plane, then
+    * clean up the resulting IO and refresh the shader info.
+    */
+   if (key->nr_userclip_plane_consts) {
+      nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+      nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+                        false, NULL);
+      nir_lower_io_to_temporaries(nir, impl, true, false);
+      nir_lower_global_vars_to_local(nir);
+      nir_lower_vars_to_ssa(nir);
+      nir_shader_gather_info(nir, impl);
+   }
+
+   prog_data->use_alt_mode = ish->use_alt_mode;
+
+   crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                         &num_system_values, &num_cbufs);
+
+   crocus_lower_swizzles(nir, &key->base.tex);
+
+   /* On ver <= 5, add a default edge flag output when the shader does not
+    * read the edge flag vertex attribute.
+    */
+   if (devinfo->ver <= 5 &&
+       !(nir->info.inputs_read & BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG)))
+      crocus_lower_default_edgeflags(nir);
+
+   struct crocus_binding_table bt;
+   crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+                              num_system_values, num_cbufs, &key->base.tex);
+
+   if (can_push_ubo(devinfo))
+      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+   /* The VUE map is built from the shader's own outputs plus the extra
+    * slots the key demands.
+    */
+   uint64_t outputs_written =
+      crocus_vs_outputs_written(ice, key, nir->info.outputs_written);
+   brw_compute_vue_map(devinfo,
+                       &vue_prog_data->vue_map, outputs_written,
+                       nir->info.separate_shader, /* pos slots */ 1);
+
+   /* Don't tell the backend about our clip plane constants, we've already
+    * lowered them in NIR and we don't want it doing it again.
+    */
+   struct brw_vs_prog_key key_no_ucp = *key;
+   key_no_ucp.nr_userclip_plane_consts = 0;
+   key_no_ucp.copy_edgeflag = false;
+   crocus_sanitize_tex_key(&key_no_ucp.base.tex);
+
+   struct brw_compile_vs_params params = {
+      .nir = nir,
+      .key = &key_no_ucp,
+      .prog_data = vs_prog_data,
+      .edgeflag_is_last = devinfo->ver < 6,
+      .log_data = &ice->dbg,
+   };
+   const unsigned *program =
+      brw_compile_vs(compiler, mem_ctx, &params);
+   if (program == NULL) {
+      dbg_printf("Failed to compile vertex shader: %s\n", params.error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer: report failure as NULL. */
+      return NULL;
+   }
+
+   /* Anything after the first compile of this shader is a recompile. */
+   if (ish->compiled_once) {
+      crocus_debug_recompile(ice, &nir->info, &key->base);
+   } else {
+      ish->compiled_once = true;
+   }
+
+   /* Stream output declarations are only built on ver > 6. */
+   uint32_t *so_decls = NULL;
+   if (devinfo->ver > 6)
+      so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+                                                  &vue_prog_data->vue_map);
+
+   /* The variant is cached under the caller's original key, not the
+    * modified copy that was handed to the backend.
+    */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_VS, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*vs_prog_data), so_decls,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                           ice->shaders.cache_bo_map,
+                           key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Update the current vertex shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ *
+ * Lookup order is: in-memory program cache, then the disk cache, then a
+ * fresh compile.  When the bound variant changes, the VS stage state is
+ * flagged dirty and the cached per-draw-parameter usage flags in
+ * ice->state are refreshed from the new variant's prog_data.
+ */
+static void
+crocus_update_compiled_vs(struct crocus_context *ice)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   /* NOTE(review): KEY_INIT() is a macro defined outside this chunk; it
+    * presumably refers to locals of this function by name - confirm before
+    * renaming anything here.
+    */
+   struct brw_vs_prog_key key = { KEY_INIT() };
+
+   /* Sampler state only goes into the key when this shader has declared a
+    * dependency on it via the CROCUS_NOS_TEXTURES bit.
+    */
+   if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+      crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_VERTEX, ish,
+                                            ish->nir->info.uses_texture_gather, &key.base.tex);
+   screen->vtbl.populate_vs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+   struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_VS];
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_VS, sizeof(key), &key);
+
+   if (!shader)
+      shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+   if (!shader)
+      shader = crocus_compile_vs(ice, ish, &key);
+
+   if (old != shader) {
+      ice->shaders.prog[CROCUS_CACHE_VS] = shader;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS |
+                                CROCUS_STAGE_DIRTY_BINDINGS_VS |
+                                CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+      shs->sysvals_need_upload = true;
+
+      /* NOTE(review): shader is dereferenced here without a NULL check,
+       * but crocus_compile_vs() returns a null pointer when the backend
+       * compile fails - confirm whether that path needs handling.
+       */
+      const struct brw_vs_prog_data *vs_prog_data =
+         (void *) shader->prog_data;
+      const bool uses_draw_params = vs_prog_data->uses_firstvertex ||
+                                    vs_prog_data->uses_baseinstance;
+      const bool uses_derived_draw_params = vs_prog_data->uses_drawid ||
+                                            vs_prog_data->uses_is_indexed_draw;
+      const bool needs_sgvs_element = uses_draw_params ||
+                                      vs_prog_data->uses_instanceid ||
+                                      vs_prog_data->uses_vertexid;
+
+      /* Vertex buffers/elements are only re-flagged when one of the cached
+       * usage flags actually differs from the new variant.
+       */
+      if (ice->state.vs_uses_draw_params != uses_draw_params ||
+          ice->state.vs_uses_derived_draw_params != uses_derived_draw_params ||
+          ice->state.vs_needs_edge_flag != ish->needs_edge_flag ||
+          ice->state.vs_uses_vertexid != vs_prog_data->uses_vertexid ||
+          ice->state.vs_uses_instanceid != vs_prog_data->uses_instanceid) {
+         ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS |
+                             CROCUS_DIRTY_VERTEX_ELEMENTS;
+      }
+      /* Remember the new variant's usage flags for the next comparison. */
+      ice->state.vs_uses_draw_params = uses_draw_params;
+      ice->state.vs_uses_derived_draw_params = uses_derived_draw_params;
+      ice->state.vs_needs_sgvs_element = needs_sgvs_element;
+      ice->state.vs_needs_edge_flag = ish->needs_edge_flag;
+      ice->state.vs_uses_vertexid = vs_prog_data->uses_vertexid;
+      ice->state.vs_uses_instanceid = vs_prog_data->uses_instanceid;
+   }
+}
+
+/**
+ * Look up the shader_info of the uncompiled shader bound to a stage.
+ *
+ * Returns a pointer to the info embedded in that shader's NIR, or NULL
+ * when nothing is bound to the stage (i.e. the stage is disabled).
+ */
+const struct shader_info *
+crocus_get_shader_info(const struct crocus_context *ice, gl_shader_stage stage)
+{
+   const struct crocus_uncompiled_shader *bound =
+      ice->shaders.uncompiled[stage];
+
+   return bound ? &bound->nir->info : NULL;
+}
+
+/**
+ * Compute the union of TCS output slots and TES input slots.
+ *
+ * The TCS and TES have to agree on one URB entry layout: the data for all
+ * patch vertices lives in a single URB entry (unlike the GS, which has one
+ * entry per input vertex), so per-vertex array indexing needs a stride.
+ *
+ * SSO requires locations to match but not the number of outputs/inputs
+ * (the TCS often has extra outputs), so the two sets are unified here on
+ * the fly.  The TES slots are always included; the TCS slots are OR'd in
+ * only when a TCS is bound.
+ */
+static void
+get_unified_tess_slots(const struct crocus_context *ice,
+                       uint64_t *per_vertex_slots,
+                       uint32_t *per_patch_slots)
+{
+   const struct shader_info *tcs =
+      crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+   const struct shader_info *tes =
+      crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+
+   /* Start from what the evaluation shader consumes. */
+   uint64_t vertex_mask = tes->inputs_read;
+   uint32_t patch_mask = tes->patch_inputs_read;
+
+   /* Add whatever a bound control shader produces. */
+   if (tcs != NULL) {
+      vertex_mask |= tcs->outputs_written;
+      patch_mask |= tcs->patch_outputs_written;
+   }
+
+   *per_vertex_slots = vertex_mask;
+   *per_patch_slots = patch_mask;
+}
+
+/**
+ * Compile a tessellation control shader, and upload the assembly.
+ *
+ * ish may be NULL.  In that case a passthrough TCS is generated with
+ * brw_nir_create_passthrough_tcs(), and the system value table and binding
+ * table are filled in by hand instead of being derived from user NIR.
+ * Disk cache storage and recompile tracking are skipped for that case.
+ *
+ * Returns the uploaded shader variant, or NULL if brw_compile_tcs() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tcs(struct crocus_context *ice,
+                   struct crocus_uncompiled_shader *ish,
+                   const struct brw_tcs_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   const struct nir_shader_compiler_options *options =
+      compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].NirOptions;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_tcs_prog_data *tcs_prog_data =
+      rzalloc(mem_ctx, struct brw_tcs_prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
+   struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   enum brw_param_builtin *system_values = NULL;
+   unsigned num_system_values = 0;
+   unsigned num_cbufs = 0;
+
+   nir_shader *nir;
+
+   struct crocus_binding_table bt;
+
+   if (ish) {
+      /* User-supplied TCS: compile a private clone of its NIR. */
+      nir = nir_shader_clone(mem_ctx, ish->nir);
+
+      crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                            &num_system_values, &num_cbufs);
+
+      crocus_lower_swizzles(nir, &key->base.tex);
+      crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+                                 num_system_values, num_cbufs, &key->base.tex);
+      if (can_push_ubo(devinfo))
+         brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+   } else {
+      nir = brw_nir_create_passthrough_tcs(mem_ctx, compiler, options, key);
+
+      /* Reserve space for passing the default tess levels as constants. */
+      num_cbufs = 1;
+      num_system_values = 8;
+      system_values =
+         rzalloc_array(mem_ctx, enum brw_param_builtin, num_system_values);
+      prog_data->param = rzalloc_array(mem_ctx, uint32_t, num_system_values);
+      prog_data->nr_params = num_system_values;
+
+      /* Outer levels are placed downwards from index 7; the position of
+       * the inner levels depends on the TES primitive mode.  Entries not
+       * assigned here keep the zero value from rzalloc_array().
+       */
+      if (key->tes_primitive_mode == GL_QUADS) {
+         for (int i = 0; i < 4; i++)
+            system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+         system_values[3] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+         system_values[2] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y;
+      } else if (key->tes_primitive_mode == GL_TRIANGLES) {
+         for (int i = 0; i < 3; i++)
+            system_values[7 - i] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X + i;
+
+         system_values[4] = BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X;
+      } else {
+         assert(key->tes_primitive_mode == GL_ISOLINES);
+         system_values[7] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y;
+         system_values[6] = BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
+      }
+
+      /* Manually setup the TCS binding table. */
+      memset(&bt, 0, sizeof(bt));
+      bt.sizes[CROCUS_SURFACE_GROUP_UBO] = 1;
+      bt.used_mask[CROCUS_SURFACE_GROUP_UBO] = 1;
+      bt.size_bytes = 4;
+
+      prog_data->ubo_ranges[0].length = 1;
+   }
+
+   /* The backend gets a copy of the key with sanitized texture state. */
+   struct brw_tcs_prog_key key_clean = *key;
+   crocus_sanitize_tex_key(&key_clean.base.tex);
+   char *error_str = NULL;
+   const unsigned *program =
+      brw_compile_tcs(compiler, &ice->dbg, mem_ctx, &key_clean, tcs_prog_data, nir,
+                      -1, NULL, &error_str);
+   if (program == NULL) {
+      dbg_printf("Failed to compile control shader: %s\n", error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer: report failure as NULL. */
+      return NULL;
+   }
+
+   /* Recompile tracking only applies to user-supplied shaders. */
+   if (ish) {
+      if (ish->compiled_once) {
+         crocus_debug_recompile(ice, &nir->info, &key->base);
+      } else {
+         ish->compiled_once = true;
+      }
+   }
+
+   /* The variant is cached under the caller's original key. */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_TCS, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*tcs_prog_data), NULL,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   if (ish)
+      crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                              ice->shaders.cache_bo_map,
+                              key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Update the current tessellation control shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ *
+ * The uncompiled TCS may be NULL; the key then uses program id 0, the disk
+ * cache is not consulted, and crocus_compile_tcs() is called with a NULL
+ * shader.  The TES shader info is dereferenced unconditionally, so a TES
+ * must be bound when this runs.
+ */
+static void
+crocus_update_compiled_tcs(struct crocus_context *ice)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+   struct crocus_uncompiled_shader *tcs =
+      ice->shaders.uncompiled[MESA_SHADER_TESS_CTRL];
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   const struct shader_info *tes_info =
+      crocus_get_shader_info(ice, MESA_SHADER_TESS_EVAL);
+   /* Part of the key comes from the TES (primitive mode, spacing) and part
+    * from context state (vertices per patch).
+    */
+   struct brw_tcs_prog_key key = {
+      KEY_INIT_NO_ID(),
+      .base.program_string_id = tcs ? tcs->program_id : 0,
+      .tes_primitive_mode = tes_info->tess.primitive_mode,
+      .input_vertices = ice->state.vertices_per_patch,
+      .quads_workaround = tes_info->tess.primitive_mode == GL_QUADS &&
+                          tes_info->tess.spacing == TESS_SPACING_EQUAL,
+   };
+
+   /* Sampler state only goes into the key for a bound TCS that has the
+    * CROCUS_NOS_TEXTURES dependency bit set.
+    */
+   if (tcs && tcs->nos & (1ull << CROCUS_NOS_TEXTURES))
+      crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_CTRL, tcs,
+                                            tcs->nir->info.uses_texture_gather, &key.base.tex);
+   /* The output slot masks are the TCS/TES union, not just the TCS's own. */
+   get_unified_tess_slots(ice, &key.outputs_written,
+                          &key.patch_outputs_written);
+   screen->vtbl.populate_tcs_key(ice, &key);
+
+   struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TCS];
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_TCS, sizeof(key), &key);
+
+   /* The disk cache is only tried when there is an uncompiled TCS. */
+   if (tcs && !shader)
+      shader = crocus_disk_cache_retrieve(ice, tcs, &key, sizeof(key));
+
+   if (!shader)
+      shader = crocus_compile_tcs(ice, tcs, &key);
+
+   if (old != shader) {
+      ice->shaders.prog[CROCUS_CACHE_TCS] = shader;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TCS |
+                                CROCUS_STAGE_DIRTY_BINDINGS_TCS |
+                                CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+      shs->sysvals_need_upload = true;
+   }
+}
+
+/**
+ * Compile a tessellation evaluation shader, and upload the assembly.
+ *
+ * Works on a private clone of ish->nir (allocated out of a temporary ralloc
+ * context that is freed before returning).  The input VUE map is built
+ * from the slot masks carried in the key.
+ *
+ * Returns the uploaded shader variant, or NULL if brw_compile_tes() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_tes(struct crocus_context *ice,
+                   struct crocus_uncompiled_shader *ish,
+                   const struct brw_tes_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_tes_prog_data *tes_prog_data =
+      rzalloc(mem_ctx, struct brw_tes_prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
+   struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+   enum brw_param_builtin *system_values;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   unsigned num_system_values;
+   unsigned num_cbufs;
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+   /* Lower user clip planes in NIR, one mask bit per enabled plane, then
+    * clean up the resulting IO and refresh the shader info.
+    */
+   if (key->nr_userclip_plane_consts) {
+      nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+      nir_lower_clip_vs(nir, (1 << key->nr_userclip_plane_consts) - 1, true,
+                        false, NULL);
+      nir_lower_io_to_temporaries(nir, impl, true, false);
+      nir_lower_global_vars_to_local(nir);
+      nir_lower_vars_to_ssa(nir);
+      nir_shader_gather_info(nir, impl);
+   }
+
+   crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                         &num_system_values, &num_cbufs);
+   crocus_lower_swizzles(nir, &key->base.tex);
+   struct crocus_binding_table bt;
+   crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+                              num_system_values, num_cbufs, &key->base.tex);
+
+   if (can_push_ubo(devinfo))
+      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+   /* The input layout comes from the key's slot masks rather than from
+    * this shader's own NIR info.
+    */
+   struct brw_vue_map input_vue_map;
+   brw_compute_tess_vue_map(&input_vue_map, key->inputs_read,
+                            key->patch_inputs_read);
+
+   /* The backend gets a copy of the key with sanitized texture state. */
+   struct brw_tes_prog_key key_clean = *key;
+   crocus_sanitize_tex_key(&key_clean.base.tex);
+   char *error_str = NULL;
+   const unsigned *program =
+      brw_compile_tes(compiler, &ice->dbg, mem_ctx, &key_clean, &input_vue_map,
+                      tes_prog_data, nir, -1, NULL, &error_str);
+   if (program == NULL) {
+      dbg_printf("Failed to compile evaluation shader: %s\n", error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer: report failure as NULL. */
+      return NULL;
+   }
+
+   /* Anything after the first compile of this shader is a recompile. */
+   if (ish->compiled_once) {
+      crocus_debug_recompile(ice, &nir->info, &key->base);
+   } else {
+      ish->compiled_once = true;
+   }
+
+   /* Stream output declarations are only built on ver > 6. */
+   uint32_t *so_decls = NULL;
+   if (devinfo->ver > 6)
+      so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+                                                  &vue_prog_data->vue_map);
+
+   /* The variant is cached under the caller's original key. */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_TES, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*tes_prog_data), so_decls,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                           ice->shaders.cache_bo_map,
+                           key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Update the current tessellation evaluation shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ *
+ * The uncompiled TES is dereferenced unconditionally, so a TES must be
+ * bound when this runs.
+ */
+static void
+crocus_update_compiled_tes(struct crocus_context *ice)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+   /* NOTE(review): KEY_INIT() is a macro defined outside this chunk; it
+    * presumably refers to locals of this function by name - confirm before
+    * renaming anything here.
+    */
+   struct brw_tes_prog_key key = { KEY_INIT() };
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   /* Sampler state only goes into the key when this shader has the
+    * CROCUS_NOS_TEXTURES dependency bit set.
+    */
+   if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+      crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_TESS_EVAL, ish,
+                                            ish->nir->info.uses_texture_gather, &key.base.tex);
+   /* The input slot masks are the TCS/TES union, not just the TES's own. */
+   get_unified_tess_slots(ice, &key.inputs_read, &key.patch_inputs_read);
+   screen->vtbl.populate_tes_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+   struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_TES];
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_TES, sizeof(key), &key);
+
+   if (!shader)
+      shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+   if (!shader)
+      shader = crocus_compile_tes(ice, ish, &key);
+
+   if (old != shader) {
+      ice->shaders.prog[CROCUS_CACHE_TES] = shader;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_TES |
+                                CROCUS_STAGE_DIRTY_BINDINGS_TES |
+                                CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+      shs->sysvals_need_upload = true;
+   }
+
+   /* A TES that reads SYSTEM_VALUE_VERTICES_IN gets its constants and
+    * system values re-flagged on every call, whether or not the variant
+    * changed.
+    */
+   /* TODO: Could compare and avoid flagging this. */
+   const struct shader_info *tes_info = &ish->nir->info;
+   if (BITSET_TEST(tes_info->system_values_read, SYSTEM_VALUE_VERTICES_IN)) {
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+      ice->state.shaders[MESA_SHADER_TESS_EVAL].sysvals_need_upload = true;
+   }
+}
+
+/**
+ * Compile a geometry shader, and upload the assembly.
+ *
+ * Works on a private clone of ish->nir (allocated out of a temporary ralloc
+ * context that is freed before returning).
+ *
+ * Returns the uploaded shader variant, or NULL if brw_compile_gs() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_gs(struct crocus_context *ice,
+                  struct crocus_uncompiled_shader *ish,
+                  const struct brw_gs_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_gs_prog_data *gs_prog_data =
+      rzalloc(mem_ctx, struct brw_gs_prog_data);
+   struct brw_vue_prog_data *vue_prog_data = &gs_prog_data->base;
+   struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+   enum brw_param_builtin *system_values;
+   unsigned num_system_values;
+   unsigned num_cbufs;
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+   /* Lower user clip planes in NIR (GS variant of the pass), one mask bit
+    * per enabled plane, then clean up the IO and refresh the shader info.
+    */
+   if (key->nr_userclip_plane_consts) {
+      nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+      nir_lower_clip_gs(nir, (1 << key->nr_userclip_plane_consts) - 1, false,
+                        NULL);
+      nir_lower_io_to_temporaries(nir, impl, true, false);
+      nir_lower_global_vars_to_local(nir);
+      nir_lower_vars_to_ssa(nir);
+      nir_shader_gather_info(nir, impl);
+   }
+
+   crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                         &num_system_values, &num_cbufs);
+   crocus_lower_swizzles(nir, &key->base.tex);
+   struct crocus_binding_table bt;
+   crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+                              num_system_values, num_cbufs, &key->base.tex);
+
+   if (can_push_ubo(devinfo))
+      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+   brw_compute_vue_map(devinfo,
+                       &vue_prog_data->vue_map, nir->info.outputs_written,
+                       nir->info.separate_shader, /* pos slots */ 1);
+
+   /* On ver == 6 the transform feedback setup is done on the GS
+    * prog_data, before the backend compile.
+    */
+   if (devinfo->ver == 6)
+      gfx6_gs_xfb_setup(&ish->stream_output, gs_prog_data);
+
+   /* The backend gets a copy of the key with sanitized texture state. */
+   struct brw_gs_prog_key key_clean = *key;
+   crocus_sanitize_tex_key(&key_clean.base.tex);
+
+   char *error_str = NULL;
+   const unsigned *program =
+      brw_compile_gs(compiler, &ice->dbg, mem_ctx, &key_clean, gs_prog_data, nir,
+                     -1, NULL, &error_str);
+   if (program == NULL) {
+      dbg_printf("Failed to compile geometry shader: %s\n", error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer: report failure as NULL. */
+      return NULL;
+   }
+
+   /* Anything after the first compile of this shader is a recompile. */
+   if (ish->compiled_once) {
+      crocus_debug_recompile(ice, &nir->info, &key->base);
+   } else {
+      ish->compiled_once = true;
+   }
+
+   /* Stream output declarations are only built on ver > 6. */
+   uint32_t *so_decls = NULL;
+   if (devinfo->ver > 6)
+      so_decls = screen->vtbl.create_so_decl_list(&ish->stream_output,
+                                                  &vue_prog_data->vue_map);
+
+   /* The variant is cached under the caller's original key. */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_GS, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*gs_prog_data), so_decls,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                           ice->shaders.cache_bo_map,
+                           key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Update the current geometry shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ *
+ * With no geometry shader bound, the selected variant is NULL; if a
+ * variant was previously bound, that still counts as a change and the GS
+ * stage is flagged dirty.
+ */
+static void
+crocus_update_compiled_gs(struct crocus_context *ice)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_GEOMETRY];
+   struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_GS];
+   struct crocus_compiled_shader *shader = NULL;
+
+   if (ish) {
+      struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+      const struct intel_device_info *devinfo = &screen->devinfo;
+      /* NOTE(review): KEY_INIT() is a macro defined outside this chunk; it
+       * presumably refers to locals of this function by name - confirm
+       * before renaming anything here.
+       */
+      struct brw_gs_prog_key key = { KEY_INIT() };
+
+      /* Sampler state only goes into the key when this shader has the
+       * CROCUS_NOS_TEXTURES dependency bit set.
+       */
+      if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+         crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_GEOMETRY, ish,
+                                               ish->nir->info.uses_texture_gather, &key.base.tex);
+      screen->vtbl.populate_gs_key(ice, &ish->nir->info, last_vue_stage(ice), &key);
+
+      /* Lookup order: program cache, disk cache, fresh compile. */
+      shader =
+         crocus_find_cached_shader(ice, CROCUS_CACHE_GS, sizeof(key), &key);
+
+      if (!shader)
+         shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+      if (!shader)
+         shader = crocus_compile_gs(ice, ish, &key);
+   }
+
+   if (old != shader) {
+      ice->shaders.prog[CROCUS_CACHE_GS] = shader;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS |
+                                CROCUS_STAGE_DIRTY_BINDINGS_GS |
+                                CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+      shs->sysvals_need_upload = true;
+   }
+}
+
+/**
+ * Compile a fragment (pixel) shader, and upload the assembly.
+ *
+ * Works on a private clone of ish->nir (allocated out of a temporary ralloc
+ * context that is freed before returning).  vue_map is passed straight
+ * through to the backend compiler.
+ *
+ * Returns the uploaded shader variant, or NULL if brw_compile_fs() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_fs(struct crocus_context *ice,
+                  struct crocus_uncompiled_shader *ish,
+                  const struct brw_wm_prog_key *key,
+                  struct brw_vue_map *vue_map)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_wm_prog_data *fs_prog_data =
+      rzalloc(mem_ctx, struct brw_wm_prog_data);
+   struct brw_stage_prog_data *prog_data = &fs_prog_data->base;
+   enum brw_param_builtin *system_values;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   unsigned num_system_values;
+   unsigned num_cbufs;
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+   prog_data->use_alt_mode = ish->use_alt_mode;
+
+   crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                         &num_system_values, &num_cbufs);
+
+   /* Lower output variables to load_output intrinsics before setting up
+    * binding tables, so crocus_setup_binding_table can map any load_output
+    * intrinsics to CROCUS_SURFACE_GROUP_RENDER_TARGET_READ on Gen8 for
+    * non-coherent framebuffer fetches.
+    */
+   brw_nir_lower_fs_outputs(nir);
+
+   /* lower swizzles before binding table */
+   crocus_lower_swizzles(nir, &key->base.tex);
+   int null_rts = 1;
+
+   /* At least one render target entry is always requested, even when the
+    * key has no color regions.
+    */
+   struct crocus_binding_table bt;
+   crocus_setup_binding_table(devinfo, nir, &bt,
+                              MAX2(key->nr_color_regions, null_rts),
+                              num_system_values, num_cbufs,
+                              &key->base.tex);
+
+   if (can_push_ubo(devinfo))
+      brw_nir_analyze_ubo_ranges(compiler, nir, NULL, prog_data->ubo_ranges);
+
+   /* The backend gets a copy of the key with sanitized texture state. */
+   struct brw_wm_prog_key key_clean = *key;
+   crocus_sanitize_tex_key(&key_clean.base.tex);
+
+   struct brw_compile_fs_params params = {
+      .nir = nir,
+      .key = &key_clean,
+      .prog_data = fs_prog_data,
+
+      .allow_spilling = true,
+      .vue_map = vue_map,
+
+      .log_data = &ice->dbg,
+   };
+   const unsigned *program =
+      brw_compile_fs(compiler, mem_ctx, &params);
+   if (program == NULL) {
+      dbg_printf("Failed to compile fragment shader: %s\n", params.error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer: report failure as NULL. */
+      return NULL;
+   }
+
+   /* Anything after the first compile of this shader is a recompile. */
+   if (ish->compiled_once) {
+      crocus_debug_recompile(ice, &nir->info, &key->base);
+   } else {
+      ish->compiled_once = true;
+   }
+
+   /* The variant is cached under the caller's original key. */
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_FS, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*fs_prog_data), NULL,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                           ice->shaders.cache_bo_map,
+                           key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Update the current fragment shader variant.
+ *
+ * Fill out the key, look in the cache, compile and bind if needed.
+ *
+ * The uncompiled FS is dereferenced unconditionally, so a fragment shader
+ * must be bound when this runs.  Which non-stage dirty bits get flagged on
+ * a variant change depends on the hardware version.
+ */
+static void
+crocus_update_compiled_fs(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+   /* NOTE(review): KEY_INIT() is a macro defined outside this chunk; it
+    * presumably refers to locals of this function by name - confirm before
+    * renaming anything here.
+    */
+   struct brw_wm_prog_key key = { KEY_INIT() };
+
+   /* Sampler state only goes into the key when this shader has the
+    * CROCUS_NOS_TEXTURES dependency bit set.
+    */
+   if (ish->nos & (1ull << CROCUS_NOS_TEXTURES))
+      crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_FRAGMENT, ish,
+                                            ish->nir->info.uses_texture_gather, &key.base.tex);
+   screen->vtbl.populate_fs_key(ice, &ish->nir->info, &key);
+
+   /* The valid input slots of the last VUE map only enter the key when the
+    * shader has the CROCUS_NOS_LAST_VUE_MAP dependency bit set.
+    */
+   if (ish->nos & (1ull << CROCUS_NOS_LAST_VUE_MAP))
+      key.input_slots_valid = ice->shaders.last_vue_map->slots_valid;
+
+   struct crocus_compiled_shader *old = ice->shaders.prog[CROCUS_CACHE_FS];
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_FS, sizeof(key), &key);
+
+   if (!shader)
+      shader = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+
+   if (!shader)
+      shader = crocus_compile_fs(ice, ish, &key, ice->shaders.last_vue_map);
+
+   if (old != shader) {
+      // XXX: only need to flag CLIP if barycentric has NONPERSPECTIVE
+      // toggles.  might be able to avoid flagging SBE too.
+      ice->shaders.prog[CROCUS_CACHE_FS] = shader;
+      ice->state.dirty |= CROCUS_DIRTY_WM;
+      /* gen4 clip/sf rely on fs prog_data */
+      if (devinfo->ver < 6)
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+      else
+         ice->state.dirty |= CROCUS_DIRTY_CLIP;
+      /* ver == 6 additionally re-flags raster state, ver >= 7 the SBE. */
+      if (devinfo->ver == 6)
+         ice->state.dirty |= CROCUS_DIRTY_RASTER;
+      if (devinfo->ver >= 7)
+         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS |
+                                CROCUS_STAGE_DIRTY_BINDINGS_FS |
+                                CROCUS_STAGE_DIRTY_CONSTANTS_FS;
+      shs->sysvals_need_upload = true;
+   }
+}
+
+/**
+ * Update the last enabled stage's VUE map.
+ *
+ * When the shader feeding the rasterizer's output interface changes, we
+ * need to re-emit various packets.
+ *
+ * prog_data is reinterpreted as a brw_vue_prog_data.  Its VUE map is
+ * compared against the previously recorded one (which may be NULL), the
+ * affected state is flagged dirty, and the new map is then recorded in
+ * ice->shaders.last_vue_map.
+ */
+static void
+update_last_vue_map(struct crocus_context *ice,
+                    struct brw_stage_prog_data *prog_data)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
+   struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
+   struct brw_vue_map *old_map = ice->shaders.last_vue_map;
+   /* With no previous map, every slot valid in the new map counts as
+    * changed.
+    */
+   const uint64_t changed_slots =
+      (old_map ? old_map->slots_valid : 0ull) ^ vue_map->slots_valid;
+
+   /* The viewport slot appearing or disappearing switches between a
+    * single viewport and CROCUS_MAX_VIEWPORTS.
+    */
+   if (changed_slots & VARYING_BIT_VIEWPORT) {
+      ice->state.num_viewports =
+         (vue_map->slots_valid & VARYING_BIT_VIEWPORT) ? CROCUS_MAX_VIEWPORTS : 1;
+      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT |
+                          CROCUS_DIRTY_CC_VIEWPORT;
+      if (devinfo->ver < 6)
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
+
+      if (devinfo->ver <= 6)
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+
+      if (devinfo->ver >= 6)
+         ice->state.dirty |= CROCUS_DIRTY_CLIP |
+                             CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS |
+         ice->state.stage_dirty_for_nos[CROCUS_NOS_LAST_VUE_MAP];
+   }
+
+   /* Any slot change, or a change of the map's 'separate' flag, re-flags
+    * the SBE and the uncompiled fragment shader.
+    */
+   if (changed_slots || (old_map && old_map->separate != vue_map->separate)) {
+      ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_FS;
+   }
+
+   ice->shaders.last_vue_map = vue_map;
+}
+
+/**
+ * Flag a stage's binding table dirty when its pull-constant descriptors
+ * may have changed.
+ *
+ * Nothing happens unless a compiled shader is bound to the stage and its
+ * prog_data has has_ubo_pull set.  Otherwise the bindings are flagged if
+ * the shader has system values waiting to be uploaded, or if any bound
+ * constant buffer slot has a buffer attached.
+ */
+static void
+crocus_update_pull_constant_descriptors(struct crocus_context *ice,
+                                        gl_shader_stage stage)
+{
+   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+   if (shader == NULL || !shader->prog_data->has_ubo_pull)
+      return;
+
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+   /* A pending system-value upload counts as a new descriptor. */
+   bool need_rebind =
+      shs->sysvals_need_upload && shader->num_system_values > 0;
+
+   /* So does any bound slot that is backed by a buffer; one hit is enough. */
+   unsigned remaining = shs->bound_cbufs;
+   while (!need_rebind && remaining) {
+      const int slot = u_bit_scan(&remaining);
+      if (shs->constbufs[slot].buffer)
+         need_rebind = true;
+   }
+
+   if (!need_rebind)
+      return;
+
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+}
+
+/**
+ * Return the prog_data of the shader bound to a stage, viewed as VUE
+ * prog_data, or NULL when no compiled shader is bound to that stage.
+ */
+static struct brw_vue_prog_data *
+get_vue_prog_data(struct crocus_context *ice, gl_shader_stage stage)
+{
+   struct crocus_compiled_shader *bound = ice->shaders.prog[stage];
+
+   return bound ? (void *) bound->prog_data : NULL;
+}
+
+/**
+ * Compile the clip program described by key and upload it into the
+ * CROCUS_CACHE_CLIP program cache.
+ *
+ * The program is built against ice->shaders.last_vue_map.  All temporary
+ * allocations live in a private ralloc context that is freed before
+ * returning.
+ *
+ * Returns the result of crocus_upload_shader(), or NULL when
+ * brw_compile_clip() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_clip(struct crocus_context *ice, struct brw_clip_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx;
+   unsigned program_size;
+   mem_ctx = ralloc_context(NULL);
+
+   struct brw_clip_prog_data *clip_prog_data =
+      rzalloc(mem_ctx, struct brw_clip_prog_data);
+
+   const unsigned *program = brw_compile_clip(compiler, mem_ctx, key, clip_prog_data,
+                                              ice->shaders.last_vue_map, &program_size);
+
+   if (program == NULL) {
+      dbg_printf("failed to compile clip shader\n");
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer, so report failure as NULL. */
+      return NULL;
+   }
+
+   /* The clip program is uploaded with an all-zero binding table. */
+   struct crocus_binding_table bt;
+   memset(&bt, 0, sizeof(bt));
+
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_CLIP, sizeof(*key), key, program,
+                           program_size,
+                           (struct brw_stage_prog_data *)clip_prog_data, sizeof(*clip_prog_data),
+                           NULL, NULL, 0, 0, &bt);
+   ralloc_free(mem_ctx);
+   return shader;
+}
+/**
+ * Select the clip program matching the current state.
+ *
+ * Builds a brw_clip_prog_key from the bound fragment shader's prog_data
+ * (when one is bound), the reduced primitive type, the last VUE map and
+ * the rasterizer state, then looks the program up in the CROCUS_CACHE_CLIP
+ * cache and compiles it on a miss.  CROCUS_DIRTY_CLIP is flagged when the
+ * selected program differs from ice->shaders.clip_prog.
+ *
+ * The key is zeroed with memset before being filled because it is looked
+ * up by its raw bytes (sizeof(key)).
+ */
+static void
+crocus_update_compiled_clip(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   struct brw_clip_prog_key key;
+   struct crocus_compiled_shader *old = ice->shaders.clip_prog;
+   memset(&key, 0, sizeof(key));
+
+   /* Only look at the fragment shader's prog_data when a compiled fragment
+    * shader is actually bound; the slot can be NULL (e.g. if compilation
+    * failed), and the check below is meant to cover that case.
+    */
+   struct crocus_compiled_shader *fs = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+   const struct brw_wm_prog_data *wm_prog_data =
+      fs ? brw_wm_prog_data(fs->prog_data) : NULL;
+   if (wm_prog_data) {
+      key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+      key.contains_noperspective_varying =
+         wm_prog_data->contains_noperspective_varying;
+      memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+   }
+
+   key.primitive = u_reduced_prim(ice->state.prim_mode);
+   key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+   struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+   key.pv_first = rs_state->flatshade_first;
+
+   /* Number of user clip planes = index of the highest enabled plane + 1. */
+   if (rs_state->clip_plane_enable)
+      key.nr_userclip = util_logbase2(rs_state->clip_plane_enable) + 1;
+
+   if (screen->devinfo.ver == 5)
+      key.clip_mode = BRW_CLIP_MODE_KERNEL_CLIP;
+   else
+      key.clip_mode = BRW_CLIP_MODE_NORMAL;
+
+   if (key.primitive == PIPE_PRIM_TRIANGLES) {
+      if (rs_state->cull_face == PIPE_FACE_FRONT_AND_BACK)
+         key.clip_mode = BRW_CLIP_MODE_REJECT_ALL;
+      else {
+         /* A culled face keeps the CULL fill mode and a zero offset. */
+         uint32_t fill_front = BRW_CLIP_FILL_MODE_CULL;
+         uint32_t fill_back = BRW_CLIP_FILL_MODE_CULL;
+         uint32_t offset_front = 0;
+         uint32_t offset_back = 0;
+
+         if (!(rs_state->cull_face & PIPE_FACE_FRONT)) {
+            switch (rs_state->fill_front) {
+            case PIPE_POLYGON_MODE_FILL:
+               fill_front = BRW_CLIP_FILL_MODE_FILL;
+               offset_front = 0;
+               break;
+            case PIPE_POLYGON_MODE_LINE:
+               fill_front = BRW_CLIP_FILL_MODE_LINE;
+               offset_front = rs_state->offset_line;
+               break;
+            case PIPE_POLYGON_MODE_POINT:
+               fill_front = BRW_CLIP_FILL_MODE_POINT;
+               offset_front = rs_state->offset_point;
+               break;
+            }
+         }
+
+         if (!(rs_state->cull_face & PIPE_FACE_BACK)) {
+            switch (rs_state->fill_back) {
+            case PIPE_POLYGON_MODE_FILL:
+               fill_back = BRW_CLIP_FILL_MODE_FILL;
+               offset_back = 0;
+               break;
+            case PIPE_POLYGON_MODE_LINE:
+               fill_back = BRW_CLIP_FILL_MODE_LINE;
+               offset_back = rs_state->offset_line;
+               break;
+            case PIPE_POLYGON_MODE_POINT:
+               fill_back = BRW_CLIP_FILL_MODE_POINT;
+               offset_back = rs_state->offset_point;
+               break;
+            }
+         }
+
+         if (rs_state->fill_back != PIPE_POLYGON_MODE_FILL ||
+             rs_state->fill_front != PIPE_POLYGON_MODE_FILL) {
+            key.do_unfilled = 1;
+
+            /* Most cases the fixed function units will handle.  Cases where
+             * one or more polygon faces are unfilled will require help:
+             */
+            key.clip_mode = BRW_CLIP_MODE_CLIP_NON_REJECTED;
+
+            if (offset_back || offset_front) {
+               /* Scale the polygon offset parameters by the depth format's
+                * minimum resolvable difference; it stays 0 without a
+                * depth/stencil buffer.
+                */
+               double mrd = 0.0;
+               if (ice->state.framebuffer.zsbuf)
+                  mrd = util_get_depth_format_mrd(util_format_description(ice->state.framebuffer.zsbuf->format));
+               key.offset_units = rs_state->offset_units * mrd * 2;
+               key.offset_factor = rs_state->offset_scale * mrd;
+               key.offset_clamp = rs_state->offset_clamp * mrd;
+            }
+
+            /* Map front/back onto the key's ccw/cw fields according to
+             * the winding state.
+             */
+            if (!(rs_state->front_ccw ^ rs_state->bottom_edge_rule)) {
+               key.fill_ccw = fill_front;
+               key.fill_cw = fill_back;
+               key.offset_ccw = offset_front;
+               key.offset_cw = offset_back;
+               if (rs_state->light_twoside &&
+                   key.fill_cw != BRW_CLIP_FILL_MODE_CULL)
+                  key.copy_bfc_cw = 1;
+            } else {
+               key.fill_cw = fill_front;
+               key.fill_ccw = fill_back;
+               key.offset_cw = offset_front;
+               key.offset_ccw = offset_back;
+               if (rs_state->light_twoside &&
+                   key.fill_ccw != BRW_CLIP_FILL_MODE_CULL)
+                  key.copy_bfc_ccw = 1;
+            }
+         }
+      }
+   }
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_CLIP, sizeof(key), &key);
+
+   if (!shader)
+      shader = crocus_compile_clip(ice, &key);
+
+   if (old != shader) {
+      ice->state.dirty |= CROCUS_DIRTY_CLIP;
+      ice->shaders.clip_prog = shader;
+   }
+}
+
+/**
+ * Compile the SF (strips-and-fans) program described by key and upload it
+ * into the CROCUS_CACHE_SF program cache.
+ *
+ * The program is built against ice->shaders.last_vue_map.  All temporary
+ * allocations live in a private ralloc context that is freed before
+ * returning.
+ *
+ * Returns the result of crocus_upload_shader(), or NULL when
+ * brw_compile_sf() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_sf(struct crocus_context *ice, struct brw_sf_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx;
+   unsigned program_size;
+   mem_ctx = ralloc_context(NULL);
+
+   struct brw_sf_prog_data *sf_prog_data =
+      rzalloc(mem_ctx, struct brw_sf_prog_data);
+
+   const unsigned *program = brw_compile_sf(compiler, mem_ctx, key, sf_prog_data,
+                                            ice->shaders.last_vue_map, &program_size);
+
+   if (program == NULL) {
+      dbg_printf("failed to compile sf shader\n");
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer, so report failure as NULL. */
+      return NULL;
+   }
+
+   /* The SF program is uploaded with an all-zero binding table. */
+   struct crocus_binding_table bt;
+   memset(&bt, 0, sizeof(bt));
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_SF, sizeof(*key), key, program,
+                           program_size,
+                           (struct brw_stage_prog_data *)sf_prog_data, sizeof(*sf_prog_data),
+                           NULL, NULL, 0, 0, &bt);
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Select the SF program matching the current state.
+ *
+ * Builds a brw_sf_prog_key from the last VUE map, the reduced primitive
+ * type, the rasterizer state and the bound fragment shader's prog_data
+ * (when one is bound), then looks the program up in the CROCUS_CACHE_SF
+ * cache and compiles it on a miss.  CROCUS_DIRTY_RASTER is flagged when
+ * the selected program differs from ice->shaders.sf_prog.
+ *
+ * The key is zeroed with memset before being filled because it is looked
+ * up by its raw bytes (sizeof(key)).
+ */
+static void
+crocus_update_compiled_sf(struct crocus_context *ice)
+{
+   struct brw_sf_prog_key key;
+   struct crocus_compiled_shader *old = ice->shaders.sf_prog;
+   memset(&key, 0, sizeof(key));
+
+   key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+   switch (u_reduced_prim(ice->state.prim_mode)) {
+   case GL_TRIANGLES:
+   default:
+      /* An edge-flag slot in the VUE selects the unfilled-triangle path. */
+      if (key.attrs & BITFIELD64_BIT(VARYING_SLOT_EDGE))
+         key.primitive = BRW_SF_PRIM_UNFILLED_TRIS;
+      else
+         key.primitive = BRW_SF_PRIM_TRIANGLES;
+      break;
+   case GL_LINES:
+      key.primitive = BRW_SF_PRIM_LINES;
+      break;
+   case GL_POINTS:
+      key.primitive = BRW_SF_PRIM_POINTS;
+      break;
+   }
+
+   struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+   key.userclip_active = rs_state->clip_plane_enable != 0;
+
+   /* Only look at the fragment shader's prog_data when a compiled fragment
+    * shader is actually bound; the slot can be NULL (e.g. if compilation
+    * failed), and the checks below are meant to cover that case.
+    */
+   struct crocus_compiled_shader *fs = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+   const struct brw_wm_prog_data *wm_prog_data =
+      fs ? brw_wm_prog_data(fs->prog_data) : NULL;
+   if (wm_prog_data) {
+      key.contains_flat_varying = wm_prog_data->contains_flat_varying;
+      memcpy(key.interp_mode, wm_prog_data->interp_mode, sizeof(key.interp_mode));
+   }
+
+   key.do_twoside_color = rs_state->light_twoside;
+
+   key.do_point_sprite = rs_state->point_quad_rasterization;
+   if (key.do_point_sprite) {
+      /* Low 8 bits: per-coordinate replace mask; bit 8 requests the point
+       * coordinate, as does a fragment shader that has a PNTC URB slot.
+       */
+      key.point_sprite_coord_replace = rs_state->sprite_coord_enable & 0xff;
+      if (rs_state->sprite_coord_enable & (1 << 8))
+         key.do_point_coord = 1;
+      if (wm_prog_data && wm_prog_data->urb_setup[VARYING_SLOT_PNTC] != -1)
+         key.do_point_coord = 1;
+   }
+
+   key.sprite_origin_lower_left = rs_state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT;
+
+   /* The winding only enters the key when two-sided colour is in use. */
+   if (key.do_twoside_color) {
+      key.frontface_ccw = rs_state->front_ccw;
+   }
+   struct crocus_compiled_shader *shader =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_SF, sizeof(key), &key);
+
+   if (!shader)
+      shader = crocus_compile_sf(ice, &key);
+
+   if (old != shader) {
+      ice->state.dirty |= CROCUS_DIRTY_RASTER;
+      ice->shaders.sf_prog = shader;
+   }
+}
+
+/**
+ * Compile the fixed-function GS program described by key and upload it
+ * into the CROCUS_CACHE_FF_GS program cache.
+ *
+ * The program is built against ice->shaders.last_vue_map.  On gen6 the
+ * binding table handed to the upload reserves BRW_MAX_SOL_BINDINGS
+ * entries in the SOL surface group; on other generations it is empty.
+ *
+ * Returns the result of crocus_upload_shader(), or NULL when
+ * brw_compile_ff_gs_prog() fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_ff_gs(struct crocus_context *ice, struct brw_ff_gs_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx;
+   unsigned program_size;
+   mem_ctx = ralloc_context(NULL);
+
+   struct brw_ff_gs_prog_data *ff_gs_prog_data =
+      rzalloc(mem_ctx, struct brw_ff_gs_prog_data);
+
+   const unsigned *program = brw_compile_ff_gs_prog(compiler, mem_ctx, key, ff_gs_prog_data,
+                                                    ice->shaders.last_vue_map, &program_size);
+
+   if (program == NULL) {
+      dbg_printf("failed to compile ff gs shader\n");
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer, so report failure as NULL. */
+      return NULL;
+   }
+
+   struct crocus_binding_table bt;
+   memset(&bt, 0, sizeof(bt));
+
+   if (screen->devinfo.ver == 6) {
+      /* Mark every SOL binding as used; 4 bytes per binding table entry. */
+      bt.sizes[CROCUS_SURFACE_GROUP_SOL] = BRW_MAX_SOL_BINDINGS;
+      bt.used_mask[CROCUS_SURFACE_GROUP_SOL] = (uint64_t)-1;
+
+      bt.size_bytes = BRW_MAX_SOL_BINDINGS * 4;
+   }
+
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_FF_GS, sizeof(*key), key, program,
+                           program_size,
+                           (struct brw_stage_prog_data *)ff_gs_prog_data, sizeof(*ff_gs_prog_data),
+                           NULL, NULL, 0, 0, &bt);
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Select the fixed-function GS program matching the current state.
+ *
+ * Builds a brw_ff_gs_prog_key from the last VUE map, the translated
+ * primitive type, the rasterizer state and (on gen6) the vertex shader's
+ * stream output info.  A program is only looked up or compiled when the
+ * key says one is needed; otherwise the selected program is NULL.
+ *
+ * When the selection differs from ice->shaders.ff_gs_prog, the GS stage is
+ * flagged dirty, and the URB is flagged too if a program appeared or
+ * disappeared.
+ *
+ * Only valid on generations before gen7 (asserted).
+ */
+static void
+crocus_update_compiled_ff_gs(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct brw_ff_gs_prog_key key;
+   struct crocus_compiled_shader *old = ice->shaders.ff_gs_prog;
+   /* Zero the whole key: it is looked up by its raw bytes. */
+   memset(&key, 0, sizeof(key));
+
+   assert(devinfo->ver < 7);
+
+   key.attrs = ice->shaders.last_vue_map->slots_valid;
+
+   key.primitive = screen->vtbl.translate_prim_type(ice->state.prim_mode, 0);
+
+   struct pipe_rasterizer_state *rs_state = crocus_get_rast_state(ice);
+   key.pv_first = rs_state->flatshade_first;
+
+   if (key.primitive == _3DPRIM_QUADLIST && !rs_state->flatshade) {
+      /* Provide consistent primitive order with brw_set_prim's
+       * optimization of single quads to trifans.
+       */
+      key.pv_first = true;
+   }
+
+   if (devinfo->ver >= 6) {
+      /* gen6: the program is only needed while streamout is active. */
+      key.need_gs_prog = ice->state.streamout_active;
+      if (key.need_gs_prog) {
+         struct crocus_uncompiled_shader *vs =
+            ice->shaders.uncompiled[MESA_SHADER_VERTEX];
+         gfx6_ff_gs_xfb_setup(&vs->stream_output,
+                              &key);
+      }
+   } else {
+      /* gen4/5: the program is needed for quads, quad strips and line loops. */
+      key.need_gs_prog = (key.primitive == _3DPRIM_QUADLIST ||
+                          key.primitive == _3DPRIM_QUADSTRIP ||
+                          key.primitive == _3DPRIM_LINELOOP);
+   }
+
+   struct crocus_compiled_shader *shader = NULL;
+   if (key.need_gs_prog) {
+      shader = crocus_find_cached_shader(ice, CROCUS_CACHE_FF_GS,
+                                         sizeof(key), &key);
+      if (!shader)
+         shader = crocus_compile_ff_gs(ice, &key);
+   }
+   if (old != shader) {
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
+      /* A program appearing or disappearing also flags the URB. */
+      if (!!old != !!shader)
+         ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+      ice->shaders.ff_gs_prog = shader;
+      if (shader) {
+         const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
+         ice->state.last_xfb_verts_per_prim = gs_prog_data->svbi_postincrement_value;
+      }
+   }
+}
+
+// XXX: crocus_compiled_shaders are space-leaking :(
+// XXX: do remember to unbind them if deleting them.
+
+/**
+ * Update the current shader variants for the given state.
+ *
+ * This should be called on every draw call to ensure that the correct
+ * shaders are bound.  It will also flag any dirty state triggered by
+ * swapping out those shaders.
+ *
+ * Returns false when no compiled vertex shader is bound after the
+ * VS/GS/tessellation updates, true otherwise.
+ */
+bool
+crocus_update_compiled_shaders(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   /* Snapshot taken on entry; bits flagged further down are only visible
+    * through ice->state.stage_dirty.
+    */
+   const uint64_t stage_dirty = ice->state.stage_dirty;
+
+   /* Remember the VUE prog_data of VS..GS so that interface changes can be
+    * detected at the end.  Left unfilled when the URB is already dirty; the
+    * comparison below is skipped in that case as well.
+    */
+   struct brw_vue_prog_data *old_prog_datas[4];
+   if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++)
+         old_prog_datas[i] = get_vue_prog_data(ice, i);
+   }
+
+   if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_TCS |
+                      CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+      struct crocus_uncompiled_shader *tes =
+         ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL];
+      if (tes) {
+         crocus_update_compiled_tcs(ice);
+         crocus_update_compiled_tes(ice);
+      } else {
+         /* No TES bound: unbind both tessellation stages. */
+         ice->shaders.prog[CROCUS_CACHE_TCS] = NULL;
+         ice->shaders.prog[CROCUS_CACHE_TES] = NULL;
+         ice->state.stage_dirty |=
+            CROCUS_STAGE_DIRTY_TCS | CROCUS_STAGE_DIRTY_TES |
+            CROCUS_STAGE_DIRTY_BINDINGS_TCS | CROCUS_STAGE_DIRTY_BINDINGS_TES |
+            CROCUS_STAGE_DIRTY_CONSTANTS_TCS | CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+      }
+   }
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_VS)
+      crocus_update_compiled_vs(ice);
+   if (stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_GS)
+      crocus_update_compiled_gs(ice);
+
+   /* Track whether the last geometry-producing stage (GS first, else TES)
+    * outputs points or lines.
+    */
+   if (stage_dirty & (CROCUS_STAGE_DIRTY_UNCOMPILED_GS |
+                      CROCUS_STAGE_DIRTY_UNCOMPILED_TES)) {
+      const struct crocus_compiled_shader *gs =
+         ice->shaders.prog[MESA_SHADER_GEOMETRY];
+      const struct crocus_compiled_shader *tes =
+         ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+
+      bool points_or_lines = false;
+
+      if (gs) {
+         const struct brw_gs_prog_data *gs_prog_data = (void *) gs->prog_data;
+         points_or_lines =
+            gs_prog_data->output_topology == _3DPRIM_POINTLIST ||
+            gs_prog_data->output_topology == _3DPRIM_LINESTRIP;
+      } else if (tes) {
+         const struct brw_tes_prog_data *tes_data = (void *) tes->prog_data;
+         points_or_lines =
+            tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_LINE ||
+            tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+      }
+
+      if (ice->shaders.output_topology_is_points_or_lines != points_or_lines) {
+         /* Outbound to XY Clip enables */
+         ice->shaders.output_topology_is_points_or_lines = points_or_lines;
+         ice->state.dirty |= CROCUS_DIRTY_CLIP;
+      }
+   }
+
+   /* Nothing more can be done without a compiled vertex shader. */
+   if (!ice->shaders.prog[MESA_SHADER_VERTEX])
+      return false;
+
+   gl_shader_stage last_stage = last_vue_stage(ice);
+   struct crocus_compiled_shader *shader = ice->shaders.prog[last_stage];
+   struct crocus_uncompiled_shader *ish = ice->shaders.uncompiled[last_stage];
+   update_last_vue_map(ice, shader->prog_data);
+   if (ice->state.streamout != shader->streamout) {
+      ice->state.streamout = shader->streamout;
+      ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST | CROCUS_DIRTY_STREAMOUT;
+   }
+
+   if (ice->state.streamout_active) {
+      screen->vtbl.update_so_strides(ice, ish->stream_output.stride);
+   }
+
+   /* use ice->state version as last_vue_map can dirty this bit */
+   if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_FS)
+      crocus_update_compiled_fs(ice);
+
+   /* The fixed-function GS program is only considered when no geometry
+    * shader is bound.
+    */
+   if (screen->devinfo.ver <= 6) {
+      if (ice->state.dirty & CROCUS_DIRTY_GEN4_FF_GS_PROG &&
+          !ice->shaders.prog[MESA_SHADER_GEOMETRY])
+         crocus_update_compiled_ff_gs(ice);
+   }
+
+   /* gen4/5 use compiled clip and SF programs. */
+   if (screen->devinfo.ver < 6) {
+      if (ice->state.dirty & CROCUS_DIRTY_GEN4_CLIP_PROG)
+         crocus_update_compiled_clip(ice);
+      if (ice->state.dirty & CROCUS_DIRTY_GEN4_SF_PROG)
+         crocus_update_compiled_sf(ice);
+   }
+
+
+   /* Changing shader interfaces may require a URB reconfiguration. */
+   if (!(ice->state.dirty & CROCUS_DIRTY_GEN6_URB)) {
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+         struct brw_vue_prog_data *old = old_prog_datas[i];
+         struct brw_vue_prog_data *new = get_vue_prog_data(ice, i);
+         /* A stage appeared/disappeared, or its URB entry size changed. */
+         if (!!old != !!new ||
+             (new && new->urb_entry_size != old->urb_entry_size)) {
+            ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+            break;
+         }
+      }
+   }
+
+   /* Refresh pull-constant descriptors for each stage with dirty constants. */
+   if (ice->state.stage_dirty & CROCUS_RENDER_STAGE_DIRTY_CONSTANTS) {
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_FRAGMENT; i++) {
+         if (ice->state.stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << i))
+            crocus_update_pull_constant_descriptors(ice, i);
+      }
+   }
+   return true;
+}
+
+/**
+ * Compile a compute shader variant for the given key.
+ *
+ * Works on a clone of ish->nir: lowers CS intrinsics, sets up uniforms,
+ * texture swizzles and the binding table, then runs brw_compile_cs().  On
+ * success the program is uploaded into the CROCUS_CACHE_CS program cache
+ * and handed to crocus_disk_cache_store().
+ *
+ * Returns the result of crocus_upload_shader(), or NULL when the backend
+ * compile fails.
+ */
+static struct crocus_compiled_shader *
+crocus_compile_cs(struct crocus_context *ice,
+                  struct crocus_uncompiled_shader *ish,
+                  const struct brw_cs_prog_key *key)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct brw_compiler *compiler = screen->compiler;
+   void *mem_ctx = ralloc_context(NULL);
+   struct brw_cs_prog_data *cs_prog_data =
+      rzalloc(mem_ctx, struct brw_cs_prog_data);
+   struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+   enum brw_param_builtin *system_values;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   unsigned num_system_values;
+   unsigned num_cbufs;
+
+   /* Lower a private copy so the stored NIR stays untouched. */
+   nir_shader *nir = nir_shader_clone(mem_ctx, ish->nir);
+
+   NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
+
+   crocus_setup_uniforms(compiler, mem_ctx, nir, prog_data, &system_values,
+                         &num_system_values, &num_cbufs);
+   crocus_lower_swizzles(nir, &key->base.tex);
+   struct crocus_binding_table bt;
+   crocus_setup_binding_table(devinfo, nir, &bt, /* num_render_targets */ 0,
+                              num_system_values, num_cbufs, &key->base.tex);
+
+   struct brw_compile_cs_params params = {
+      .nir = nir,
+      .key = key,
+      .prog_data = cs_prog_data,
+      .log_data = &ice->dbg,
+   };
+
+   const unsigned *program =
+      brw_compile_cs(compiler, mem_ctx, &params);
+   if (program == NULL) {
+      dbg_printf("Failed to compile compute shader: %s\n", params.error_str);
+      ralloc_free(mem_ctx);
+      /* This function returns a pointer, so report failure as NULL. */
+      return NULL;
+   }
+
+   /* Any compile after the first one is reported as a recompile. */
+   if (ish->compiled_once) {
+      crocus_debug_recompile(ice, &nir->info, &key->base);
+   } else {
+      ish->compiled_once = true;
+   }
+
+   struct crocus_compiled_shader *shader =
+      crocus_upload_shader(ice, CROCUS_CACHE_CS, sizeof(*key), key, program,
+                           prog_data->program_size,
+                           prog_data, sizeof(*cs_prog_data), NULL,
+                           system_values, num_system_values,
+                           num_cbufs, &bt);
+
+   crocus_disk_cache_store(screen->disk_cache, ish, shader,
+                           ice->shaders.cache_bo_map,
+                           key, sizeof(*key));
+
+   ralloc_free(mem_ctx);
+   return shader;
+}
+
+/**
+ * Select the compute shader variant matching the current state.
+ *
+ * The key is looked up in the in-memory program cache first, then in the
+ * disk cache, and the shader is compiled only as a last resort.  When the
+ * selected variant differs from the bound one, it is bound and the CS
+ * program, bindings and constants are flagged dirty.
+ */
+static void
+crocus_update_compiled_cs(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+   struct brw_cs_prog_key key = { KEY_INIT() };
+
+   /* Sampler state only enters the key if the shader depends on it. */
+   if (ish->nos & (1ull << CROCUS_NOS_TEXTURES)) {
+      crocus_populate_sampler_prog_key_data(ice, devinfo, MESA_SHADER_COMPUTE, ish,
+                                            ish->nir->info.uses_texture_gather, &key.base.tex);
+   }
+   screen->vtbl.populate_cs_key(ice, &key);
+
+   struct crocus_compiled_shader *previous = ice->shaders.prog[CROCUS_CACHE_CS];
+
+   struct crocus_compiled_shader *variant =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_CS, sizeof(key), &key);
+   if (variant == NULL)
+      variant = crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key));
+   if (variant == NULL)
+      variant = crocus_compile_cs(ice, ish, &key);
+
+   if (variant == previous)
+      return;
+
+   ice->shaders.prog[CROCUS_CACHE_CS] = variant;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS |
+                             CROCUS_STAGE_DIRTY_BINDINGS_CS |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+   shs->sysvals_need_upload = true;
+}
+
+/**
+ * Bring the compute shader variant and its pull-constant descriptors up to
+ * date with the current dirty state.
+ */
+void
+crocus_update_compiled_compute_shader(struct crocus_context *ice)
+{
+   if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_UNCOMPILED_CS) {
+      crocus_update_compiled_cs(ice);
+   }
+
+   /* stage_dirty is read again here: switching variants above flags the
+    * CS constants dirty.
+    */
+   const bool constants_dirty =
+      (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) != 0;
+   if (constants_dirty) {
+      crocus_update_pull_constant_descriptors(ice, MESA_SHADER_COMPUTE);
+   }
+}
+
+/**
+ * Fill the compute push constant buffer with one thread index per thread.
+ *
+ * The asserts restrict this to programs with no cross-thread data and a
+ * single per-thread dword whose param is the subgroup ID builtin.  Thread
+ * t's value is written at dword offset 8 * t of dst.
+ */
+void
+crocus_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+                                 unsigned threads,
+                                 uint32_t *dst)
+{
+   assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0);
+   assert(cs_prog_data->push.cross_thread.size == 0);
+   assert(cs_prog_data->push.per_thread.dwords == 1);
+   assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
+
+   /* NOTE(review): the stride of 8 dwords presumably matches one register
+    * per thread -- confirm.
+    */
+   const unsigned stride_dwords = 8;
+   unsigned thread = 0;
+   while (thread < threads) {
+      dst[stride_dwords * thread] = thread;
+      thread++;
+   }
+}
+
+/**
+ * Allocate scratch BOs as needed for the given per-thread size and stage.
+ *
+ * One BO is kept per (encoded size, stage) pair in
+ * ice->shaders.scratch_bos and is allocated on first use, sized as
+ * per_thread_scratch times the maximum thread count of the stage.
+ *
+ * Returns the BO for that pair; this is whatever crocus_bo_alloc()
+ * returned when the BO had to be allocated by this call.
+ */
+struct crocus_bo *
+crocus_get_scratch_space(struct crocus_context *ice,
+                         unsigned per_thread_scratch,
+                         gl_shader_stage stage)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   struct crocus_bufmgr *bufmgr = screen->bufmgr;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   /* Index derived from the position of the lowest set bit of the size. */
+   unsigned encoded_size = ffs(per_thread_scratch) - 11;
+   assert(encoded_size < (1 << 16));
+
+   struct crocus_bo **bop = &ice->shaders.scratch_bos[encoded_size][stage];
+
+   if (!*bop) {
+      /* The compute thread count assumes 4 subslices per slice instead of
+       * using screen->subslice_total.
+       * NOTE(review): confirm this bound is intended for all supported parts.
+       */
+      const unsigned subslice_total = 4 * devinfo->num_slices;
+      unsigned scratch_ids_per_subslice = devinfo->max_cs_threads;
+
+      uint32_t max_threads[] = {
+         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
+         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
+         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
+         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
+         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
+         [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslice_total,
+      };
+
+      uint32_t size = per_thread_scratch * max_threads[stage];
+
+      *bop = crocus_bo_alloc(bufmgr, "scratch", size);
+   }
+
+   return *bop;
+}
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * The pipe->create_[stage]_state() driver hooks.
+ *
+ * Performs basic NIR preprocessing, records any state dependencies, and
+ * returns a crocus_uncompiled_shader as the Gallium CSO.
+ *
+ * Actual shader compilation to assembly happens later, at first use.
+ *
+ * so_info may be NULL; when given, it is copied into the new shader and
+ * adjusted by update_so_info().  Returns NULL if the allocation fails.
+ */
+static void *
+crocus_create_uncompiled_shader(struct pipe_context *ctx,
+                                nir_shader *nir,
+                                const struct pipe_stream_output_info *so_info)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_uncompiled_shader *ish =
+      calloc(1, sizeof(struct crocus_uncompiled_shader));
+   if (!ish)
+      return NULL;
+
+   /* The edge flag fixup pass only runs on gen6+. */
+   if (devinfo->ver >= 6)
+     NIR_PASS(ish->needs_edge_flag, nir, crocus_fix_edge_flags);
+   else
+     ish->needs_edge_flag = false;
+
+   brw_preprocess_nir(screen->compiler, nir, NULL);
+
+   NIR_PASS_V(nir, brw_nir_lower_image_load_store, devinfo, false);
+   NIR_PASS_V(nir, crocus_lower_storage_image_derefs);
+
+   nir_sweep(nir);
+
+   ish->program_id = get_new_program_id(screen);
+   ish->nir = nir;
+   if (so_info) {
+      memcpy(&ish->stream_output, so_info, sizeof(*so_info));
+      update_so_info(&ish->stream_output, nir->info.outputs_written);
+   }
+
+   /* Save this now before potentially dropping nir->info.name */
+   if (nir->info.name && strncmp(nir->info.name, "ARB", 3) == 0)
+      ish->use_alt_mode = true;
+
+   if (screen->disk_cache) {
+      /* Serialize the NIR to a binary blob that we can hash for the disk
+       * cache.  Drop unnecessary information (like variable names)
+       * so the serialized NIR is smaller, and also to let us detect more
+       * isomorphic shaders when hashing, increasing cache hits.
+       */
+      struct blob blob;
+      blob_init(&blob);
+      nir_serialize(&blob, nir, true);
+      _mesa_sha1_compute(blob.data, blob.size, ish->nir_sha1);
+      blob_finish(&blob);
+   }
+
+   return ish;
+}
+
+/**
+ * Create an uncompiled shader from a Gallium shader state.
+ *
+ * TGSI input is translated to NIR first; otherwise the NIR carried in the
+ * state is used directly.  The state's stream output info is passed on.
+ */
+static struct crocus_uncompiled_shader *
+crocus_create_shader_state(struct pipe_context *ctx,
+                           const struct pipe_shader_state *state)
+{
+   const bool is_tgsi = state->type == PIPE_SHADER_IR_TGSI;
+   struct nir_shader *nir = is_tgsi
+      ? tgsi_to_nir(state->tokens, ctx->screen, false)
+      : state->ir.nir;
+
+   return crocus_create_uncompiled_shader(ctx, nir, &state->stream_output);
+}
+
+/**
+ * The pipe->create_vs_state() driver hook.
+ *
+ * Creates the uncompiled shader, records which non-orthogonal state the
+ * vertex shader variants depend on, and precompiles a default variant when
+ * the screen asks for it.
+ *
+ * Returns the new crocus_uncompiled_shader, or NULL if it could not be
+ * created.
+ */
+static void *
+crocus_create_vs_state(struct pipe_context *ctx,
+                       const struct pipe_shader_state *state)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+   /* Shader creation reports allocation failure as NULL; pass it on
+    * instead of dereferencing it.
+    */
+   if (!ish)
+      return NULL;
+
+   ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+   /* User clip planes or gen5 sprite coord enable */
+   if (ish->nir->info.clip_distance_array_size == 0 ||
+       screen->devinfo.ver <= 5)
+      ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+   if (!screen->devinfo.is_haswell)
+      ish->nos |= (1ull << CROCUS_NOS_VERTEX_ELEMENTS);
+
+   if (screen->precompile) {
+      struct brw_vs_prog_key key = { KEY_INIT() };
+
+      /* Only compile when the disk cache has no matching variant. */
+      if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+         crocus_compile_vs(ice, ish, &key);
+   }
+
+   return ish;
+}
+
+/**
+ * The pipe->create_tcs_state() driver hook.
+ *
+ * Creates the uncompiled shader, records its dependency on texture state,
+ * and precompiles a default variant when the screen asks for it.
+ *
+ * Returns the new crocus_uncompiled_shader, or NULL if it could not be
+ * created.
+ */
+static void *
+crocus_create_tcs_state(struct pipe_context *ctx,
+                        const struct pipe_shader_state *state)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+   /* Shader creation reports allocation failure as NULL; pass it on
+    * instead of dereferencing it.
+    */
+   if (!ish)
+      return NULL;
+
+   struct shader_info *info = &ish->nir->info;
+
+   ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+   if (screen->precompile) {
+      /* GL_TRIANGLES enum value, used when the shader info carries no
+       * primitive mode.
+       */
+      const unsigned default_primitive_mode = 0x0004;
+      struct brw_tcs_prog_key key = {
+         KEY_INIT(),
+         // XXX: make sure the linker fills this out from the TES...
+         .tes_primitive_mode =
+            info->tess.primitive_mode ? info->tess.primitive_mode
+                                      : default_primitive_mode,
+         .outputs_written = info->outputs_written,
+         .patch_outputs_written = info->patch_outputs_written,
+      };
+
+      key.input_vertices = info->tess.tcs_vertices_out;
+
+      /* Only compile when the disk cache has no matching variant. */
+      if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+         crocus_compile_tcs(ice, ish, &key);
+   }
+
+   return ish;
+}
+
+/**
+ * The pipe->create_tes_state() driver hook.
+ *
+ * Creates the uncompiled shader, records which non-orthogonal state the
+ * TES variants depend on, and precompiles a default variant when the
+ * screen asks for it.
+ *
+ * Returns the new crocus_uncompiled_shader, or NULL if it could not be
+ * created.
+ */
+static void *
+crocus_create_tes_state(struct pipe_context *ctx,
+                        const struct pipe_shader_state *state)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+   /* Shader creation reports allocation failure as NULL; pass it on
+    * instead of dereferencing it.
+    */
+   if (!ish)
+      return NULL;
+
+   struct shader_info *info = &ish->nir->info;
+
+   ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+   /* User clip planes */
+   if (ish->nir->info.clip_distance_array_size == 0)
+      ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+   if (screen->precompile) {
+      struct brw_tes_prog_key key = {
+         KEY_INIT(),
+         // XXX: not ideal, need TCS output/TES input unification
+         .inputs_read = info->inputs_read,
+         .patch_inputs_read = info->patch_inputs_read,
+      };
+
+      /* Only compile when the disk cache has no matching variant. */
+      if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+         crocus_compile_tes(ice, ish, &key);
+   }
+
+   return ish;
+}
+
+/**
+ * The pipe->create_gs_state() driver hook.
+ *
+ * Creates the uncompiled shader, records which non-orthogonal state the
+ * GS variants depend on, and precompiles a default variant when the
+ * screen asks for it.
+ *
+ * Returns the new crocus_uncompiled_shader, or NULL if it could not be
+ * created.
+ */
+static void *
+crocus_create_gs_state(struct pipe_context *ctx,
+                       const struct pipe_shader_state *state)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+
+   /* Shader creation reports allocation failure as NULL; pass it on
+    * instead of dereferencing it.
+    */
+   if (!ish)
+      return NULL;
+
+   ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+   /* User clip planes */
+   if (ish->nir->info.clip_distance_array_size == 0)
+      ish->nos |= (1ull << CROCUS_NOS_RASTERIZER);
+
+   if (screen->precompile) {
+      struct brw_gs_prog_key key = { KEY_INIT() };
+
+      /* Only compile when the disk cache has no matching variant. */
+      if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+         crocus_compile_gs(ice, ish, &key);
+   }
+
+   return ish;
+}
+
+/**
+ * The pipe->create_fs_state() driver hook.
+ *
+ * Builds the uncompiled shader for a fragment shader, flags the NOS groups
+ * it is tracked against and, if precompiling is enabled, compiles a variant
+ * for a default key (unless the disk cache has one).
+ */
+static void *
+crocus_create_fs_state(struct pipe_context *ctx,
+                       const struct pipe_shader_state *state)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_uncompiled_shader *ish = crocus_create_shader_state(ctx, state);
+   struct shader_info *info = &ish->nir->info;
+   const unsigned num_varying_inputs =
+      util_bitcount64(info->inputs_read & BRW_FS_VARYING_INPUT_MASK);
+
+   ish->nos |= (1ull << CROCUS_NOS_FRAMEBUFFER) |
+               (1ull << CROCUS_NOS_DEPTH_STENCIL_ALPHA) |
+               (1ull << CROCUS_NOS_RASTERIZER) |
+               (1ull << CROCUS_NOS_TEXTURES) |
+               (1ull << CROCUS_NOS_BLEND);
+
+   /* The program key needs the VUE map if there are > 16 inputs or gen4/5 */
+   if (devinfo->ver < 6 || num_varying_inputs > 16)
+      ish->nos |= (1ull << CROCUS_NOS_LAST_VUE_MAP);
+
+   if (screen->precompile) {
+      /* Everything the shader writes except depth/stencil/sample mask. */
+      const uint64_t color_outputs = info->outputs_written &
+         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
+           BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
+           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
+
+      bool can_rearrange_varyings =
+         devinfo->ver > 6 && num_varying_inputs <= 16;
+
+      struct brw_wm_prog_key key = {
+         KEY_INIT(),
+         .nr_color_regions = util_bitcount(color_outputs),
+         .coherent_fb_fetch = false,
+         .input_slots_valid =
+            can_rearrange_varyings ? 0 : info->inputs_read | VARYING_BIT_POS,
+      };
+
+      /* Only computed for ver < 6.  Zero-initialise it so that the compile
+       * call below never receives indeterminate memory on newer hardware.
+       */
+      struct brw_vue_map vue_map = { 0 };
+      if (devinfo->ver < 6) {
+         brw_compute_vue_map(devinfo, &vue_map,
+                             info->inputs_read | VARYING_BIT_POS,
+                             false, /* pos slots */ 1);
+      }
+      if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+         crocus_compile_fs(ice, ish, &key, &vue_map);
+   }
+
+   return ish;
+}
+
+/**
+ * The pipe->create_compute_state() driver hook.
+ *
+ * Only NIR programs are accepted.  Builds the uncompiled shader and, if
+ * precompiling is enabled, compiles a variant for a default key (unless the
+ * disk cache has one).
+ */
+static void *
+crocus_create_compute_state(struct pipe_context *ctx,
+                            const struct pipe_compute_state *state)
+{
+   assert(state->ir_type == PIPE_SHADER_IR_NIR);
+
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_uncompiled_shader *ish =
+      crocus_create_uncompiled_shader(ctx, (void *) state->prog, NULL);
+
+   // XXX: disallow more than 64KB of shared variables
+   ish->nos |= (1ull << CROCUS_NOS_TEXTURES);
+
+   if (!screen->precompile)
+      return ish;
+
+   struct brw_cs_prog_key key = { KEY_INIT() };
+
+   if (!crocus_disk_cache_retrieve(ice, ish, &key, sizeof(key)))
+      crocus_compile_cs(ice, ish, &key);
+
+   return ish;
+}
+
+/**
+ * Shared body of the pipe->delete_[stage]_state() driver hooks.
+ *
+ * If the shader is the one currently bound for \p stage it is unbound and
+ * the stage is flagged dirty; then its constant-data references are dropped
+ * and the crocus_uncompiled_shader (with its NIR) is freed.
+ */
+static void
+crocus_delete_shader_state(struct pipe_context *ctx, void *state, gl_shader_stage stage)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_uncompiled_shader *doomed = state;
+   const bool currently_bound = ice->shaders.uncompiled[stage] == doomed;
+
+   if (currently_bound) {
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+      ice->shaders.uncompiled[stage] = NULL;
+   }
+
+   if (doomed->const_data != NULL) {
+      pipe_resource_reference(&doomed->const_data, NULL);
+      pipe_resource_reference(&doomed->const_data_state.res, NULL);
+   }
+
+   ralloc_free(doomed->nir);
+   free(doomed);
+}
+
+/** The pipe->delete_vs_state() driver hook; frees a vertex shader. */
+static void
+crocus_delete_vs_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_VERTEX);
+}
+
+/** The pipe->delete_tcs_state() driver hook; frees a tess control shader. */
+static void
+crocus_delete_tcs_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+/** The pipe->delete_tes_state() driver hook; frees a tess evaluation shader. */
+static void
+crocus_delete_tes_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_TESS_EVAL);
+}
+
+/** The pipe->delete_gs_state() driver hook; frees a geometry shader. */
+static void
+crocus_delete_gs_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_GEOMETRY);
+}
+
+/** The pipe->delete_fs_state() driver hook; frees a fragment shader. */
+static void
+crocus_delete_fs_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+/** The pipe->delete_compute_state() driver hook; frees a compute shader. */
+static void
+crocus_delete_cs_state(struct pipe_context *ctx, void *state)
+{
+   crocus_delete_shader_state(ctx, state, MESA_SHADER_COMPUTE);
+}
+
+/**
+ * Shared body of the pipe->bind_[stage]_state() driver hooks.
+ *
+ * Binds an uncompiled shader (or NULL, to unbind) as the current one for a
+ * particular stage and updates dirty tracking to account for the shader's
+ * NOS:
+ *  - sampler states are flagged dirty when the highest texture index used
+ *    differs between the old and new shader,
+ *  - the stage's "uncompiled" bit is flagged dirty,
+ *  - stage_dirty_for_nos[] is updated so that each NOS group the new shader
+ *    depends on re-flags this stage, and the others no longer do.
+ */
+static void
+bind_shader_state(struct crocus_context *ice,
+                  struct crocus_uncompiled_shader *ish,
+                  gl_shader_stage stage)
+{
+   uint64_t dirty_bit = CROCUS_STAGE_DIRTY_UNCOMPILED_VS << stage;
+   const uint64_t nos = ish ? ish->nos : 0;
+
+   const struct shader_info *old_info = crocus_get_shader_info(ice, stage);
+   const struct shader_info *new_info = ish ? &ish->nir->info : NULL;
+
+   if ((old_info ? BITSET_LAST_BIT(old_info->textures_used) : 0) !=
+       (new_info ? BITSET_LAST_BIT(new_info->textures_used) : 0)) {
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
+   }
+
+   ice->shaders.uncompiled[stage] = ish;
+   ice->state.stage_dirty |= dirty_bit;
+
+   /* Record that CSOs need to mark CROCUS_STAGE_DIRTY_UNCOMPILED_XS when
+    * they change (or that they no longer need to do so).
+    *
+    * The NOS mask is 64 bits wide and is built with 1ull shifts, so test it
+    * with a 64-bit shift as well rather than an int one.
+    */
+   for (int i = 0; i < CROCUS_NOS_COUNT; i++) {
+      if (nos & (1ull << i))
+         ice->state.stage_dirty_for_nos[i] |= dirty_bit;
+      else
+         ice->state.stage_dirty_for_nos[i] &= ~dirty_bit;
+   }
+}
+
+/**
+ * The pipe->bind_vs_state() driver hook.
+ *
+ * Besides binding the shader, this tracks whether the vertex shader outputs
+ * window-space positions (which affects clip, raster and CC viewport state)
+ * and, on ver 6, requests that the fixed-function GS program be re-checked.
+ */
+static void
+crocus_bind_vs_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_uncompiled_shader *new_ish = state;
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (new_ish &&
+       ice->state.window_space_position !=
+       new_ish->nir->info.vs.window_space_position) {
+      ice->state.window_space_position =
+         new_ish->nir->info.vs.window_space_position;
+
+      ice->state.dirty |= CROCUS_DIRTY_CLIP |
+                          CROCUS_DIRTY_RASTER |
+                          CROCUS_DIRTY_CC_VIEWPORT;
+   }
+
+   if (devinfo->ver == 6) {
+      /* CROCUS_DIRTY_* bits live in state.dirty; state.stage_dirty only
+       * holds CROCUS_STAGE_DIRTY_* bits.
+       * NOTE(review): confirm the consumer tests state.dirty for this bit.
+       */
+      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+   }
+
+   bind_shader_state((void *) ctx, state, MESA_SHADER_VERTEX);
+}
+
+/** The pipe->bind_tcs_state() driver hook; no stage-specific work needed. */
+static void
+crocus_bind_tcs_state(struct pipe_context *ctx, void *state)
+{
+   bind_shader_state((void *) ctx, state, MESA_SHADER_TESS_CTRL);
+}
+
+/**
+ * The pipe->bind_tes_state() driver hook.
+ *
+ * Flags the URB configuration dirty when the stage goes from bound to
+ * unbound or vice versa, then binds the shader.
+ */
+static void
+crocus_bind_tes_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   const bool had_tes = ice->shaders.uncompiled[MESA_SHADER_TESS_EVAL] != NULL;
+   const bool has_tes = state != NULL;
+
+   /* Enabling/disabling optional stages requires a URB reconfiguration. */
+   if (had_tes != has_tes)
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+   bind_shader_state(ice, state, MESA_SHADER_TESS_EVAL);
+}
+
+/**
+ * The pipe->bind_gs_state() driver hook.
+ *
+ * Flags the URB configuration dirty when the stage goes from bound to
+ * unbound or vice versa, then binds the shader.
+ */
+static void
+crocus_bind_gs_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   const bool had_gs = ice->shaders.uncompiled[MESA_SHADER_GEOMETRY] != NULL;
+   const bool has_gs = state != NULL;
+
+   /* Enabling/disabling optional stages requires a URB reconfiguration. */
+   if (had_gs != has_gs)
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_URB;
+
+   bind_shader_state(ice, state, MESA_SHADER_GEOMETRY);
+}
+
+/**
+ * The pipe->bind_fs_state() driver hook.
+ *
+ * Flags WM state dirty when a fragment shader is bound or unbound, or when
+ * the set of colour outputs written changes, then binds the shader.
+ */
+static void
+crocus_bind_fs_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_uncompiled_shader *old_ish =
+      ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+   struct crocus_uncompiled_shader *new_ish = state;
+
+   /* Built from BITFIELD64 helpers and ANDed with the 64-bit
+    * outputs_written mask, so keep it 64 bits wide instead of narrowing it
+    * to unsigned.
+    */
+   const uint64_t color_bits =
+      BITFIELD64_BIT(FRAG_RESULT_COLOR) |
+      BITFIELD64_RANGE(FRAG_RESULT_DATA0, BRW_MAX_DRAW_BUFFERS);
+
+   /* Fragment shader outputs influence HasWriteableRT */
+   if (!old_ish || !new_ish ||
+       (old_ish->nir->info.outputs_written & color_bits) !=
+       (new_ish->nir->info.outputs_written & color_bits))
+      ice->state.dirty |= CROCUS_DIRTY_WM;
+
+   bind_shader_state((void *) ctx, state, MESA_SHADER_FRAGMENT);
+}
+
+/** The pipe->bind_compute_state() driver hook; no stage-specific work needed. */
+static void
+crocus_bind_cs_state(struct pipe_context *ctx, void *state)
+{
+   bind_shader_state((void *) ctx, state, MESA_SHADER_COMPUTE);
+}
+
+/**
+ * Install the shader create/bind/delete hooks on a pipe_context.
+ */
+void
+crocus_init_program_functions(struct pipe_context *ctx)
+{
+   /* Vertex */
+   ctx->create_vs_state = crocus_create_vs_state;
+   ctx->bind_vs_state   = crocus_bind_vs_state;
+   ctx->delete_vs_state = crocus_delete_vs_state;
+
+   /* Tessellation control */
+   ctx->create_tcs_state = crocus_create_tcs_state;
+   ctx->bind_tcs_state   = crocus_bind_tcs_state;
+   ctx->delete_tcs_state = crocus_delete_tcs_state;
+
+   /* Tessellation evaluation */
+   ctx->create_tes_state = crocus_create_tes_state;
+   ctx->bind_tes_state   = crocus_bind_tes_state;
+   ctx->delete_tes_state = crocus_delete_tes_state;
+
+   /* Geometry */
+   ctx->create_gs_state = crocus_create_gs_state;
+   ctx->bind_gs_state   = crocus_bind_gs_state;
+   ctx->delete_gs_state = crocus_delete_gs_state;
+
+   /* Fragment */
+   ctx->create_fs_state = crocus_create_fs_state;
+   ctx->bind_fs_state   = crocus_bind_fs_state;
+   ctx->delete_fs_state = crocus_delete_fs_state;
+
+   /* Compute */
+   ctx->create_compute_state = crocus_create_compute_state;
+   ctx->bind_compute_state   = crocus_bind_cs_state;
+   ctx->delete_compute_state = crocus_delete_cs_state;
+}
diff --git a/src/gallium/drivers/crocus/crocus_program_cache.c b/src/gallium/drivers/crocus/crocus_program_cache.c
new file mode 100644
index 00000000000..d2d4b821754
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_program_cache.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_program_cache.c
+ *
+ * The in-memory program cache.  This is basically a hash table mapping
+ * API-specified shaders and a state key to a compiled variant.  It also
+ * takes care of uploading shader assembly into a BO for use on the GPU.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_atomic.h"
+#include "util/u_upload_mgr.h"
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/compiler/brw_eu.h"
+#include "intel/compiler/brw_nir.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+
+/**
+ * Hash table key: a program key tagged with the cache it belongs to.
+ *
+ * cache_id must be immediately followed by data, because keybox_hash()
+ * hashes both as a single contiguous range starting at cache_id.
+ */
+struct keybox {
+   /** Number of bytes stored in data. */
+   uint16_t size;
+   enum crocus_program_cache_id cache_id;
+   /** Copy of the program key (C99 flexible array member). */
+   uint8_t data[];
+};
+
+/**
+ * Allocate a keybox out of \p mem_ctx holding a copy of the \p key_size
+ * bytes at \p key, tagged with \p cache_id.
+ */
+static struct keybox *
+make_keybox(void *mem_ctx, enum crocus_program_cache_id cache_id,
+            const void *key, uint32_t key_size)
+{
+   const size_t total_size = sizeof(struct keybox) + key_size;
+   struct keybox *box = ralloc_size(mem_ctx, total_size);
+
+   box->size = key_size;
+   box->cache_id = cache_id;
+   memcpy(box->data, key, key_size);
+
+   return box;
+}
+
+/**
+ * Hash callback for the program cache: hashes the cache id together with
+ * the key bytes that follow it in struct keybox.
+ */
+static uint32_t
+keybox_hash(const void *void_key)
+{
+   const struct keybox *box = void_key;
+   const size_t hashed_bytes = box->size + sizeof(box->cache_id);
+
+   return _mesa_hash_data(&box->cache_id, hashed_bytes);
+}
+
+/**
+ * Equality callback for the program cache.
+ *
+ * Two keyboxes match only when they belong to the same cache and carry the
+ * same key bytes.  The cache id must be compared here too: keybox_hash()
+ * includes it, but on a hash collision identical key bytes from two
+ * different caches would otherwise be treated as the same entry.
+ */
+static bool
+keybox_equals(const void *void_a, const void *void_b)
+{
+   const struct keybox *a = void_a, *b = void_b;
+   if (a->size != b->size || a->cache_id != b->cache_id)
+      return false;
+
+   return memcmp(a->data, b->data, a->size) == 0;
+}
+
+/**
+ * Look up the compiled shader stored for (\p cache_id, \p key).
+ *
+ * Returns the cached crocus_compiled_shader, or NULL if there is none.
+ */
+struct crocus_compiled_shader *
+crocus_find_cached_shader(struct crocus_context *ice,
+                          enum crocus_program_cache_id cache_id,
+                          uint32_t key_size, const void *key)
+{
+   /* Temporary lookup key; only needed for the duration of the search. */
+   struct keybox *probe = make_keybox(NULL, cache_id, key, key_size);
+   struct hash_entry *hit =
+      _mesa_hash_table_search(ice->shaders.cache, probe);
+
+   ralloc_free(probe);
+
+   if (!hit)
+      return NULL;
+
+   return hit->data;
+}
+
+/**
+ * Find the key of any cached variant in \p cache_id that was compiled from
+ * the program identified by \p program_string_id.
+ *
+ * Returns a pointer to the stored key bytes, or NULL when no such variant
+ * is in the cache.
+ */
+const void *
+crocus_find_previous_compile(const struct crocus_context *ice,
+                             enum crocus_program_cache_id cache_id,
+                             unsigned program_string_id)
+{
+   hash_table_foreach(ice->shaders.cache, entry) {
+      const struct keybox *box = entry->key;
+      const struct brw_base_prog_key *base_key = (const void *)box->data;
+
+      if (box->cache_id != cache_id)
+         continue;
+
+      if (base_key->program_string_id == program_string_id)
+         return box->data;
+   }
+
+   return NULL;
+}
+
+/**
+ * Search the cache for an entry whose uploaded assembly is byte-for-byte
+ * identical to \p assembly.
+ *
+ * Programs that generate shaders at runtime can produce several shaders
+ * that are distinct at the API level yet compile to the same assembly in
+ * our backend; sharing the upload saves space in the program cache buffer.
+ *
+ * \p map is the CPU mapping of the program cache buffer.  Returns the
+ * matching entry, or NULL if there is none.
+ */
+static const struct crocus_compiled_shader *
+find_existing_assembly(struct hash_table *cache, void *map,
+                       const void *assembly, unsigned assembly_size)
+{
+   const uint8_t *base = map;
+
+   hash_table_foreach (cache, entry) {
+      const struct crocus_compiled_shader *candidate = entry->data;
+
+      if (candidate->map_size == assembly_size &&
+          memcmp(base + candidate->offset, assembly, assembly_size) == 0)
+         return candidate;
+   }
+   return NULL;
+}
+
+/**
+ * Replace the program cache buffer with a newly allocated one of
+ * \p new_size bytes.
+ *
+ * The contents uploaded so far (the first cache_next_offset bytes) are
+ * copied across, the old buffer is unmapped and unreferenced, and state
+ * that points at the old buffer is flagged for re-emission.
+ *
+ * NOTE(review): neither the allocation nor the mapping is checked for
+ * failure here.
+ */
+static void
+crocus_cache_new_bo(struct crocus_context *ice,
+                    uint32_t new_size)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   struct crocus_bo *new_bo;
+   new_bo = crocus_bo_alloc(screen->bufmgr, "program cache", new_size);
+
+   void *map = crocus_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE |
+                             MAP_ASYNC | MAP_PERSISTENT);
+
+   /* Carry over the programs already uploaded to the old buffer. */
+   if (ice->shaders.cache_next_offset != 0) {
+      memcpy(map, ice->shaders.cache_bo_map, ice->shaders.cache_next_offset);
+   }
+
+   /* The old buffer must stay mapped until the copy above is done. */
+   crocus_bo_unmap(ice->shaders.cache_bo);
+   crocus_bo_unreference(ice->shaders.cache_bo);
+   ice->shaders.cache_bo = new_bo;
+   ice->shaders.cache_bo_map = map;
+
+   if (screen->devinfo.ver == 4) {
+      /* reemit all shaders on GEN4 only. */
+      ice->state.dirty |= CROCUS_DIRTY_CLIP | CROCUS_DIRTY_RASTER |
+         CROCUS_DIRTY_WM;
+   }
+   /* Unset "state base address emitted" on both batches so that it gets
+    * emitted again.
+    */
+   ice->batches[CROCUS_BATCH_RENDER].state_base_address_emitted = false;
+   ice->batches[CROCUS_BATCH_COMPUTE].state_base_address_emitted = false;
+}
+
+/**
+ * Reserve \p size bytes in the program cache buffer and return the offset
+ * of the reservation.
+ *
+ * If the buffer is too small it is replaced by one whose size is the
+ * current size doubled until the request fits.
+ */
+static uint32_t
+crocus_alloc_item_data(struct crocus_context *ice, uint32_t size)
+{
+   if (ice->shaders.cache_next_offset + size > ice->shaders.cache_bo->size) {
+      uint32_t grown_size = ice->shaders.cache_bo->size;
+
+      do {
+         grown_size *= 2;
+      } while (ice->shaders.cache_next_offset + size > grown_size);
+
+      crocus_cache_new_bo(ice, grown_size);
+   }
+
+   const uint32_t item_offset = ice->shaders.cache_next_offset;
+
+   /* Programs are always 64-byte aligned, so set up the next one now */
+   ice->shaders.cache_next_offset = ALIGN(item_offset + size, 64);
+
+   return item_offset;
+}
+
+/**
+ * Add a compiled program to the in-memory cache.
+ *
+ * Creates a crocus_compiled_shader for (\p cache_id, \p key), uploads
+ * \p assembly into the program cache buffer (or reuses an identical upload
+ * that is already there), and inserts the result into the hash table.
+ *
+ * \p prog_data, \p streamout and \p system_values are ralloc-stolen by the
+ * new shader, so the caller must not free them afterwards.  \p bt is copied
+ * by value.
+ *
+ * Returns the new cache entry.
+ */
+struct crocus_compiled_shader *
+crocus_upload_shader(struct crocus_context *ice,
+                     enum crocus_program_cache_id cache_id, uint32_t key_size,
+                     const void *key, const void *assembly, uint32_t asm_size,
+                     struct brw_stage_prog_data *prog_data,
+                     uint32_t prog_data_size, uint32_t *streamout,
+                     enum brw_param_builtin *system_values,
+                     unsigned num_system_values, unsigned num_cbufs,
+                     const struct crocus_binding_table *bt)
+{
+   struct hash_table *cache = ice->shaders.cache;
+   struct crocus_compiled_shader *shader =
+      rzalloc_size(cache, sizeof(struct crocus_compiled_shader));
+   const struct crocus_compiled_shader *existing = find_existing_assembly(
+      cache, ice->shaders.cache_bo_map, assembly, asm_size);
+
+   /* If we can find a matching prog in the cache already, then reuse the
+    * existing stuff without creating new copy into the underlying buffer
+    * object.  This is notably useful for programs generating shaders at
+    * runtime, where multiple shaders may compile to the same thing in our
+    * backend.
+    */
+   if (existing) {
+      shader->offset = existing->offset;
+      shader->map_size = existing->map_size;
+   } else {
+      /* May replace the cache buffer, so use the current mapping below. */
+      shader->offset = crocus_alloc_item_data(ice, asm_size);
+      shader->map_size = asm_size;
+
+      memcpy(ice->shaders.cache_bo_map + shader->offset, assembly, asm_size);
+   }
+
+   shader->prog_data = prog_data;
+   shader->prog_data_size = prog_data_size;
+   shader->streamout = streamout;
+   shader->system_values = system_values;
+   shader->num_system_values = num_system_values;
+   shader->num_cbufs = num_cbufs;
+   shader->bt = *bt;
+
+   /* Tie the lifetime of the auxiliary data to the cache entry. */
+   ralloc_steal(shader, shader->prog_data);
+   /* NOTE(review): the 16-byte threshold presumably separates small prog
+    * data that has no param/pull_param arrays from a full
+    * brw_stage_prog_data -- confirm against the callers.
+    */
+   if (prog_data_size > 16) {
+      ralloc_steal(shader->prog_data, prog_data->param);
+      ralloc_steal(shader->prog_data, prog_data->pull_param);
+   }
+   ralloc_steal(shader, shader->streamout);
+   ralloc_steal(shader, shader->system_values);
+
+   /* The key is allocated out of the shader, so both are freed together. */
+   struct keybox *keybox = make_keybox(shader, cache_id, key, key_size);
+   _mesa_hash_table_insert(ice->shaders.cache, keybox, shader);
+
+   return shader;
+}
+
+/**
+ * BLORP callback: look up a previously uploaded BLORP program.
+ *
+ * On a hit, stores the program's offset in \p kernel_out and its prog_data
+ * pointer through \p prog_data_out and returns true.  On a miss, returns
+ * false and leaves both outputs untouched.
+ */
+bool
+crocus_blorp_lookup_shader(struct blorp_batch *blorp_batch, const void *key,
+                           uint32_t key_size, uint32_t *kernel_out,
+                           void *prog_data_out)
+{
+   struct crocus_context *ice = blorp_batch->blorp->driver_ctx;
+   struct crocus_compiled_shader *found =
+      crocus_find_cached_shader(ice, CROCUS_CACHE_BLORP, key_size, key);
+
+   if (found) {
+      *kernel_out = found->offset;
+      *((void **)prog_data_out) = found->prog_data;
+   }
+
+   return found != NULL;
+}
+
+/**
+ * BLORP callback: upload a freshly compiled BLORP program to the cache.
+ *
+ * A private copy of \p prog_data_templ is made and handed to the cache
+ * together with an all-zero binding table.  The program's offset is stored
+ * in \p kernel_out and the cached prog_data pointer through
+ * \p prog_data_out.  Always returns true; \p stage is not used.
+ */
+bool
+crocus_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage,
+                           const void *key, uint32_t key_size,
+                           const void *kernel, uint32_t kernel_size,
+                           const struct brw_stage_prog_data *prog_data_templ,
+                           uint32_t prog_data_size, uint32_t *kernel_out,
+                           void *prog_data_out)
+{
+   struct crocus_context *ice = blorp_batch->blorp->driver_ctx;
+
+   struct crocus_binding_table empty_bt;
+   memset(&empty_bt, 0, sizeof(empty_bt));
+
+   struct brw_stage_prog_data *prog_data_copy =
+      ralloc_size(NULL, prog_data_size);
+   memcpy(prog_data_copy, prog_data_templ, prog_data_size);
+
+   struct crocus_compiled_shader *uploaded =
+      crocus_upload_shader(ice, CROCUS_CACHE_BLORP, key_size, key,
+                           kernel, kernel_size,
+                           prog_data_copy, prog_data_size,
+                           NULL, NULL, 0, 0, &empty_bt);
+
+   *((void **)prog_data_out) = uploaded->prog_data;
+   *kernel_out = uploaded->offset;
+
+   return true;
+}
+
+/**
+ * Create the program cache: the hash table, plus an initial 16KB buffer
+ * that is kept mapped for uploading shader assembly.
+ */
+void
+crocus_init_program_cache(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   const unsigned map_flags =
+      MAP_READ | MAP_WRITE | MAP_ASYNC | MAP_PERSISTENT;
+
+   ice->shaders.cache =
+      _mesa_hash_table_create(ice, keybox_hash, keybox_equals);
+
+   ice->shaders.cache_bo =
+      crocus_bo_alloc(screen->bufmgr, "program_cache", 16384);
+   ice->shaders.cache_bo_map =
+      crocus_bo_map(NULL, ice->shaders.cache_bo, map_flags);
+}
+
+/**
+ * Tear down the program cache: forget the current per-stage programs,
+ * release the cache buffer and free the hash table.
+ */
+void
+crocus_destroy_program_cache(struct crocus_context *ice)
+{
+   struct crocus_bo *cache_bo = ice->shaders.cache_bo;
+
+   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++)
+      ice->shaders.prog[stage] = NULL;
+
+   if (cache_bo != NULL) {
+      crocus_bo_unmap(cache_bo);
+      crocus_bo_unreference(cache_bo);
+      ice->shaders.cache_bo = NULL;
+      ice->shaders.cache_bo_map = NULL;
+   }
+
+   ralloc_free(ice->shaders.cache);
+}
+
+/**
+ * Human-readable name of a program cache, for debug output.
+ *
+ * Ids other than the ones listed are passed to
+ * _mesa_shader_stage_to_string().
+ */
+static const char *
+cache_name(enum crocus_program_cache_id cache_id)
+{
+   switch (cache_id) {
+   case CROCUS_CACHE_BLORP:
+      return "BLORP";
+   case CROCUS_CACHE_SF:
+      return "SF";
+   case CROCUS_CACHE_CLIP:
+      return "CLIP";
+   case CROCUS_CACHE_FF_GS:
+      return "FF_GS";
+   default:
+      return _mesa_shader_stage_to_string(cache_id);
+   }
+}
+
+/**
+ * Debug helper: disassemble every program in the cache to stderr, each one
+ * preceded by the name of the cache it belongs to.
+ */
+void
+crocus_print_program_cache(struct crocus_context *ice)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+
+   hash_table_foreach(ice->shaders.cache, entry) {
+      const struct keybox *box = entry->key;
+      struct crocus_compiled_shader *prog = entry->data;
+
+      fprintf(stderr, "%s:\n", cache_name(box->cache_id));
+      brw_disassemble(&screen->devinfo,
+                      ice->shaders.cache_bo_map + prog->offset, 0,
+                      prog->prog_data->program_size, NULL, stderr);
+   }
+}
diff --git a/src/gallium/drivers/crocus/crocus_query.c b/src/gallium/drivers/crocus/crocus_query.c
new file mode 100644
index 00000000000..14ba9fbce59
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_query.c
@@ -0,0 +1,996 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_query.c
+ *
+ * ============================= GENXML CODE =============================
+ *              [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * Query object support.  This allows measuring various simple statistics
+ * via counters on the GPU.  We use GenX code for MI_MATH calculations.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "perf/intel_perf.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_fence.h"
+#include "crocus_monitor.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+
+#include "crocus_genx_macros.h"
+
+#if GFX_VER == 6
+// TODO: Add these to genxml?
+#define SO_PRIM_STORAGE_NEEDED(n) (0x2280)
+#define SO_NUM_PRIMS_WRITTEN(n)   (0x2288)
+
+// TODO: remove HS/DS/CS
+#define GFX6_IA_VERTICES_COUNT_num          0x2310
+#define GFX6_IA_PRIMITIVES_COUNT_num        0x2318
+#define GFX6_VS_INVOCATION_COUNT_num        0x2320
+#define GFX6_HS_INVOCATION_COUNT_num        0x2300
+#define GFX6_DS_INVOCATION_COUNT_num        0x2308
+#define GFX6_GS_INVOCATION_COUNT_num        0x2328
+#define GFX6_GS_PRIMITIVES_COUNT_num        0x2330
+#define GFX6_CL_INVOCATION_COUNT_num        0x2338
+#define GFX6_CL_PRIMITIVES_COUNT_num        0x2340
+#define GFX6_PS_INVOCATION_COUNT_num        0x2348
+#define GFX6_CS_INVOCATION_COUNT_num        0x2290
+#define GFX6_PS_DEPTH_COUNT_num             0x2350
+
+#elif GFX_VER == 7
+#define SO_PRIM_STORAGE_NEEDED(n) (GENX(SO_PRIM_STORAGE_NEEDED0_num) + (n) * 8)
+#define SO_NUM_PRIMS_WRITTEN(n)   (GENX(SO_NUM_PRIMS_WRITTEN0_num) + (n) * 8)
+#endif
+
+/**
+ * Driver-side state of one query object.
+ */
+struct crocus_query {
+   enum pipe_query_type type;
+   /* Selects the stream (SO / primitive queries) or the statistic
+    * (PIPE_QUERY_PIPELINE_STATISTICS_SINGLE) being measured.
+    */
+   int index;
+
+   /* NOTE(review): presumably set once `result` is valid -- confirm. */
+   bool ready;
+
+   /* Set when a CS stall was emitted for a non-pipelined snapshot write. */
+   bool stalled;
+
+   /* Final value, computed from the snapshots. */
+   uint64_t result;
+
+   /* GPU buffer (resource + offset) the snapshots are written to. */
+   struct crocus_state_ref query_state_ref;
+   /* CPU view of the snapshots; read when computing the result on the CPU. */
+   struct crocus_query_snapshots *map;
+   struct crocus_syncobj *syncobj;
+
+   /* Index into ice->batches of the batch this query is recorded in. */
+   int batch_idx;
+
+   struct crocus_monitor_object *monitor;
+
+   /* Fence for PIPE_QUERY_GPU_FINISHED. */
+   struct pipe_fence_handle *fence;
+};
+
+/**
+ * Layout of the query buffer for queries that take one starting and one
+ * ending counter snapshot.
+ */
+struct crocus_query_snapshots {
+   /** crocus_render_condition's saved MI_PREDICATE_RESULT value. */
+   uint64_t predicate_result;
+
+   /** Have the start/end snapshots landed? */
+   uint64_t snapshots_landed;
+
+   /** Starting and ending counter snapshots */
+   uint64_t start;
+   uint64_t end;
+};
+
+/**
+ * Layout of the query buffer for the stream-output overflow queries.
+ *
+ * The first two fields line up with those of crocus_query_snapshots.  Each
+ * of the four streams stores two snapshots of both counters: index 0 is the
+ * beginning snapshot and index 1 the ending one.
+ */
+struct crocus_query_so_overflow {
+   uint64_t predicate_result;
+   uint64_t snapshots_landed;
+
+   struct {
+      /* Snapshots of SO_PRIM_STORAGE_NEEDED for this stream. */
+      uint64_t prim_storage_needed[2];
+      /* Snapshots of SO_NUM_PRIMS_WRITTEN for this stream. */
+      uint64_t num_prims[2];
+   } stream[4];
+};
+
+#if GFX_VERx10 == 75
+/**
+ * Build an mi_value referring to the 64-bit location \p offset bytes into
+ * the query's buffer.
+ */
+static struct mi_value
+query_mem64(struct crocus_query *q, uint32_t offset)
+{
+   return mi_mem64(rw_bo(crocus_resource_bo(q->query_state_ref.res),
+                         q->query_state_ref.offset + offset));
+}
+#endif
+
+/**
+ * Is this type of query written by PIPE_CONTROL?
+ *
+ * True for the occlusion and timestamp/time-elapsed query types, false for
+ * everything else.
+ */
+static bool
+crocus_is_query_pipelined(struct crocus_query *q)
+{
+   const enum pipe_query_type type = q->type;
+
+   return type == PIPE_QUERY_OCCLUSION_COUNTER ||
+          type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+          type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE ||
+          type == PIPE_QUERY_TIMESTAMP ||
+          type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
+          type == PIPE_QUERY_TIME_ELAPSED;
+}
+
+/**
+ * Have the GPU write "true" to the query's snapshots_landed field.
+ *
+ * Only compiled in for GFX_VERx10 == 75; a no-op on other generations.
+ */
+static void
+mark_available(struct crocus_context *ice, struct crocus_query *q)
+{
+#if GFX_VERx10 == 75
+   struct crocus_batch *batch = &ice->batches[q->batch_idx];
+   struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+   const unsigned landed_offset =
+      q->query_state_ref.offset +
+      offsetof(struct crocus_query_snapshots, snapshots_landed);
+
+   if (crocus_is_query_pipelined(q)) {
+      /* Order available *after* the query results. */
+      crocus_emit_pipe_control_write(batch, "query: mark available",
+                                     PIPE_CONTROL_WRITE_IMMEDIATE |
+                                     PIPE_CONTROL_FLUSH_ENABLE,
+                                     bo, landed_offset, true);
+   } else {
+      batch->screen->vtbl.store_data_imm64(batch, bo, landed_offset, true);
+   }
+#endif
+}
+
+/**
+ * Emit a PIPE_CONTROL that writes a snapshot into the query's buffer.
+ *
+ * \p flags selects what is written (the callers pass the depth-count or
+ * timestamp write flags) and \p offset is the destination offset handed to
+ * the PIPE_CONTROL.
+ */
+static void
+crocus_pipelined_write(struct crocus_batch *batch,
+                       struct crocus_query *q,
+                       enum pipe_control_flags flags,
+                       unsigned offset)
+{
+   crocus_emit_pipe_control_write(batch, "query: pipelined snapshot write",
+                                  flags,
+                                  crocus_resource_bo(q->query_state_ref.res),
+                                  offset, 0ull);
+}
+
+/**
+ * Emit the commands that snapshot the counter behind query \p q into its
+ * buffer at \p offset.
+ *
+ * Pipelined query types are written with a PIPE_CONTROL.  All other types
+ * first emit a CS stall and then store the relevant statistics register to
+ * memory; those register stores are only compiled in for GFX_VER >= 6.
+ */
+static void
+write_value(struct crocus_context *ice, struct crocus_query *q, unsigned offset)
+{
+   struct crocus_batch *batch = &ice->batches[q->batch_idx];
+#if GFX_VER >= 6
+   struct crocus_screen *screen = batch->screen;
+   struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+#endif
+
+   if (!crocus_is_query_pipelined(q)) {
+      crocus_emit_pipe_control_flush(batch,
+                                     "query: non-pipelined snapshot write",
+                                     PIPE_CONTROL_CS_STALL |
+                                     PIPE_CONTROL_STALL_AT_SCOREBOARD);
+      q->stalled = true;
+   }
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      /* NOTE(review): always uses the render batch, not q->batch_idx. */
+      crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+                             PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                             PIPE_CONTROL_DEPTH_STALL,
+                             offset);
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      crocus_pipelined_write(&ice->batches[CROCUS_BATCH_RENDER], q,
+                             PIPE_CONTROL_WRITE_TIMESTAMP,
+                             offset);
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+#if GFX_VER >= 6
+      /* Stream 0 uses the clipper invocation count; other streams use the
+       * per-stream "storage needed" counter.
+       */
+      screen->vtbl.store_register_mem64(batch,
+                                        q->index == 0 ?
+                                        GENX(CL_INVOCATION_COUNT_num) :
+                                        SO_PRIM_STORAGE_NEEDED(q->index),
+                                        bo, offset, false);
+#endif
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+#if GFX_VER >= 6
+      screen->vtbl.store_register_mem64(batch,
+                                        SO_NUM_PRIMS_WRITTEN(q->index),
+                                        bo, offset, false);
+#endif
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS_SINGLE: {
+#if GFX_VER >= 6
+      /* Statistics register for each q->index value.
+       * NOTE(review): q->index is not range-checked against this table.
+       */
+      static const uint32_t index_to_reg[] = {
+         GENX(IA_VERTICES_COUNT_num),
+         GENX(IA_PRIMITIVES_COUNT_num),
+         GENX(VS_INVOCATION_COUNT_num),
+         GENX(GS_INVOCATION_COUNT_num),
+         GENX(GS_PRIMITIVES_COUNT_num),
+         GENX(CL_INVOCATION_COUNT_num),
+         GENX(CL_PRIMITIVES_COUNT_num),
+         GENX(PS_INVOCATION_COUNT_num),
+         GENX(HS_INVOCATION_COUNT_num),
+         GENX(DS_INVOCATION_COUNT_num),
+         GENX(CS_INVOCATION_COUNT_num),
+      };
+      uint32_t reg = index_to_reg[q->index];
+
+#if GFX_VER == 6
+      /* Gfx6 GS code counts full primitives, that is, it won't count individual
+       * triangles in a triangle strip. Use CL_INVOCATION_COUNT for that.
+       */
+      if (q->index == PIPE_STAT_QUERY_GS_PRIMITIVES)
+         reg = GENX(CL_INVOCATION_COUNT_num);
+#endif
+
+      screen->vtbl.store_register_mem64(batch, reg, bo, offset, false);
+#endif
+      break;
+   }
+   default:
+      assert(false);
+   }
+}
+
+#if GFX_VER >= 6
+/**
+ * Snapshot the stream-output counters used by the SO overflow queries.
+ *
+ * After a CS stall, stores SO_NUM_PRIMS_WRITTEN and SO_PRIM_STORAGE_NEEDED
+ * for each stream covered by the query (one stream for
+ * PIPE_QUERY_SO_OVERFLOW_PREDICATE, four otherwise) into the begin
+ * (\p end == false) or end (\p end == true) slot of the query buffer.
+ */
+static void
+write_overflow_values(struct crocus_context *ice, struct crocus_query *q, bool end)
+{
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_screen *screen = batch->screen;
+   struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+   const uint32_t base = q->query_state_ref.offset;
+   const uint32_t num_streams =
+      q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ? 1 : 4;
+
+   crocus_emit_pipe_control_flush(batch,
+                                  "query: write SO overflow snapshots",
+                                  PIPE_CONTROL_CS_STALL |
+                                  PIPE_CONTROL_STALL_AT_SCOREBOARD);
+
+   for (uint32_t i = 0; i < num_streams; i++) {
+      int s = q->index + i;
+      int written_offset =
+         base + offsetof(struct crocus_query_so_overflow,
+                         stream[s].num_prims[end]);
+      int needed_offset =
+         base + offsetof(struct crocus_query_so_overflow,
+                         stream[s].prim_storage_needed[end]);
+
+      screen->vtbl.store_register_mem64(batch, SO_NUM_PRIMS_WRITTEN(s),
+                                        bo, written_offset, false);
+      screen->vtbl.store_register_mem64(batch, SO_PRIM_STORAGE_NEEDED(s),
+                                        bo, needed_offset, false);
+   }
+}
+#endif
+/**
+ * Return the distance from \p time0 to \p time1 in raw timestamp ticks.
+ *
+ * When \p time0 is larger than \p time1 the counter is treated as having
+ * wrapped once at TIMESTAMP_BITS bits.
+ */
+static uint64_t
+crocus_raw_timestamp_delta(uint64_t time0, uint64_t time1)
+{
+   /* No wrap: plain difference. */
+   if (time0 <= time1)
+      return time1 - time0;
+
+   return (1ULL << TIMESTAMP_BITS) + time1 - time0;
+}
+
+/**
+ * Report whether stream \p s overflowed: true when the change in
+ * prim_storage_needed between the two snapshots differs from the change in
+ * num_prims.
+ */
+static bool
+stream_overflowed(struct crocus_query_so_overflow *so, int s)
+{
+   if ((so->stream[s].num_prims[1] - so->stream[s].num_prims[0]) ==
+       (so->stream[s].prim_storage_needed[1] -
+        so->stream[s].prim_storage_needed[0]))
+      return false;
+
+   return true;
+}
+
+/**
+ * Compute q->result on the CPU from the snapshots in q->map and mark the
+ * query ready.
+ *
+ * \param devinfo  device info, used for timebase scaling and the Haswell
+ *                 PS-invocation workaround
+ * \param q        query whose snapshots are read and whose result/ready
+ *                 fields are written
+ */
+static void
+calculate_result_on_cpu(const struct intel_device_info *devinfo,
+                        struct crocus_query *q)
+{
+   const uint64_t timestamp_mask = (1ull << TIMESTAMP_BITS) - 1;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_PREDICATE:
+   case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      q->result = q->map->end != q->map->start;
+      break;
+
+   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* Only the starting snapshot is used for a timestamp. */
+      q->result = intel_device_info_timebase_scale(devinfo, q->map->start);
+      q->result &= timestamp_mask;
+      break;
+
+   case PIPE_QUERY_TIME_ELAPSED:
+      q->result = crocus_raw_timestamp_delta(q->map->start, q->map->end);
+      q->result = intel_device_info_timebase_scale(devinfo, q->result);
+      q->result &= timestamp_mask;
+      break;
+
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      q->result = stream_overflowed((void *) q->map, q->index);
+      break;
+
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      q->result = false;
+      for (int s = 0; s < MAX_VERTEX_STREAMS; s++)
+         q->result |= stream_overflowed((void *) q->map, s);
+      break;
+
+   default:
+      /* Counter-style queries (occlusion counter, primitives generated or
+       * emitted, pipeline statistics, ...): end minus start.
+       */
+      q->result = q->map->end - q->map->start;
+
+      /* WaDividePSInvocationCountBy4:HSW,BDW */
+      if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+          GFX_VER == 7 && devinfo->is_haswell &&
+          q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+         q->result /= 4;
+      break;
+   }
+
+   q->ready = true;
+}
+
+#if GFX_VERx10 == 75
+/**
+ * Calculate the streamout overflow for stream \p idx:
+ *
+ * (num_prims[1] - num_prims[0]) - (storage_needed[1] - storage_needed[0])
+ *
+ * \param b    MI builder the subtractions are emitted through
+ * \param q    query whose crocus_query_so_overflow snapshots are read
+ * \param idx  index into the stream[] array of the overflow snapshots
+ * \return     the MI value holding the difference above
+ */
+static struct mi_value
+calc_overflow_for_stream(struct mi_builder *b,
+                         struct crocus_query *q,
+                         int idx)
+{
+/* C(counter, i): 64-bit query memory operand for stream[idx].counter[i]. */
+#define C(counter, i) query_mem64(q, \
+   offsetof(struct crocus_query_so_overflow, stream[idx].counter[i]))
+
+   return mi_isub(b, mi_isub(b, C(num_prims, 1), C(num_prims, 0)),
+                  mi_isub(b, C(prim_storage_needed, 1),
+                          C(prim_storage_needed, 0)));
+#undef C
+}
+
+/**
+ * Calculate whether any stream has overflowed.
+ *
+ * Computes the per-stream overflow value for every one of the
+ * MAX_VERTEX_STREAMS streams and ORs them together.
+ *
+ * \param b  MI builder the operations are emitted through
+ * \param q  query whose overflow snapshots are read
+ * \return   the MI value holding the OR of all per-stream overflow values
+ */
+static struct mi_value
+calc_overflow_any_stream(struct mi_builder *b, struct crocus_query *q)
+{
+   /* First compute every per-stream value... */
+   struct mi_value stream_result[MAX_VERTEX_STREAMS];
+   for (int i = 0; i < MAX_VERTEX_STREAMS; i++)
+      stream_result[i] = calc_overflow_for_stream(b, q, i);
+
+   /* ...then fold them together with a bitwise OR. */
+   struct mi_value result = stream_result[0];
+   for (int i = 1; i < MAX_VERTEX_STREAMS; i++)
+      result = mi_ior(b, result, stream_result[i]);
+
+   return result;
+}
+
+
+/**
+ * Return true for the query types whose result is a boolean (the occlusion
+ * and streamout-overflow predicates), false for every other type.
+ */
+static bool
+query_is_boolean(enum pipe_query_type type)
+{
+   return type == PIPE_QUERY_OCCLUSION_PREDICATE ||
+          type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE ||
+          type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+          type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE;
+}
+
+/**
+ * Calculate the result using MI_MATH.
+ *
+ * \param devinfo  device info (timestamp frequency, Haswell check)
+ * \param b        MI builder the arithmetic is emitted through
+ * \param q        query whose snapshots are read from memory
+ * \return         the MI value holding the final result; for boolean query
+ *                 types it is reduced to 0 or 1
+ */
+static struct mi_value
+calculate_result_on_gpu(const struct intel_device_info *devinfo,
+                        struct mi_builder *b,
+                        struct crocus_query *q)
+{
+   struct mi_value result;
+   /* Memory operands for the start/end snapshots of regular queries. */
+   struct mi_value start_val =
+      query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+   struct mi_value end_val =
+      query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+
+   switch (q->type) {
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      result = calc_overflow_for_stream(b, q, q->index);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      result = calc_overflow_any_stream(b, q);
+      break;
+   case PIPE_QUERY_TIMESTAMP: {
+      /* TODO: This discards any fractional bits of the timebase scale.
+       * We would need to do a bit of fixed point math on the CS ALU, or
+       * launch an actual shader to calculate this with full precision.
+       */
+      uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+      /* NOTE(review): the mask width is a literal 36 here, whereas the CPU
+       * path masks with TIMESTAMP_BITS -- confirm the two agree.
+       */
+      result = mi_iand(b, mi_imm((1ull << 36) - 1),
+                       mi_imul_imm(b, start_val, scale));
+      break;
+   }
+   case PIPE_QUERY_TIME_ELAPSED: {
+      /* TODO: This discards fractional bits (see above). */
+      uint32_t scale = 1000000000ull / devinfo->timestamp_frequency;
+      result = mi_imul_imm(b, mi_isub(b, end_val, start_val), scale);
+      break;
+   }
+   default:
+      /* Counter-style queries: end snapshot minus start snapshot. */
+      result = mi_isub(b, end_val, start_val);
+      break;
+   }
+   /* WaDividePSInvocationCountBy4:HSW,BDW */
+   if (GFX_VER == 7 && devinfo->is_haswell &&
+       q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+       q->index == PIPE_STAT_QUERY_PS_INVOCATIONS)
+      result = mi_ushr32_imm(b, result, 2);
+
+   /* Collapse predicate-style results to 0 or 1. */
+   if (query_is_boolean(q->type))
+      result = mi_iand(b, mi_nz(b, result), mi_imm(1));
+
+   return result;
+}
+#endif
+
+/**
+ * Allocate a query object of the given type.
+ *
+ * \param ctx         pipe context (unused here)
+ * \param query_type  PIPE_QUERY_* type stored in the query
+ * \param index       query index stored in the query (e.g. stream or
+ *                    pipeline statistic)
+ * \return the new query, or NULL if the allocation failed
+ *
+ * Compute-shader invocation statistics are assigned to the compute batch;
+ * every other query is assigned to the render batch.
+ */
+static struct pipe_query *
+crocus_create_query(struct pipe_context *ctx,
+                    unsigned query_type,
+                    unsigned index)
+{
+   struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+
+   /* Bail out on allocation failure instead of dereferencing NULL, as
+    * crocus_create_batch_query does.
+    */
+   if (unlikely(!q))
+      return NULL;
+
+   q->type = query_type;
+   q->index = index;
+   q->monitor = NULL;
+
+   if (q->type == PIPE_QUERY_PIPELINE_STATISTICS_SINGLE &&
+       q->index == PIPE_STAT_QUERY_CS_INVOCATIONS)
+      q->batch_idx = CROCUS_BATCH_COMPUTE;
+   else
+      q->batch_idx = CROCUS_BATCH_RENDER;
+   return (struct pipe_query *) q;
+}
+
+/**
+ * Allocate a driver-specific batch query backed by a monitor object.
+ *
+ * \param ctx          pipe context
+ * \param num_queries  number of entries in \p query_types
+ * \param query_types  query type list handed to the monitor object
+ * \return the new query, or NULL if either allocation failed
+ */
+static struct pipe_query *
+crocus_create_batch_query(struct pipe_context *ctx,
+                          unsigned num_queries,
+                          unsigned *query_types)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_query *q = calloc(1, sizeof(struct crocus_query));
+
+   if (unlikely(q == NULL))
+      return NULL;
+
+   q->type = PIPE_QUERY_DRIVER_SPECIFIC;
+   q->index = -1;
+   q->monitor = crocus_create_monitor_object(ice, num_queries, query_types);
+
+   /* Without a monitor the query is useless; release it again. */
+   if (unlikely(q->monitor == NULL)) {
+      free(q);
+      q = NULL;
+   }
+
+   return (struct pipe_query *) q;
+}
+
+/**
+ * Destroy a query: tear down its monitor object if it has one, otherwise
+ * drop its syncobj and fence references, then free the query itself.
+ */
+static void
+crocus_destroy_query(struct pipe_context *ctx, struct pipe_query *p_query)
+{
+   struct crocus_query *q = (struct crocus_query *) p_query;
+   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
+
+   if (!q->monitor) {
+      crocus_syncobj_reference(screen, &q->syncobj, NULL);
+      screen->base.fence_reference(ctx->screen, &q->fence, NULL);
+   } else {
+      crocus_destroy_monitor_object(ctx, q->monitor);
+      q->monitor = NULL;
+   }
+
+   free(q);
+}
+
+
+/**
+ * Begin a query.
+ *
+ * Monitor-backed queries are forwarded to crocus_begin_monitor().  For all
+ * others, snapshot storage is allocated from the query buffer uploader, the
+ * CPU-side state is reset, and the starting snapshot is written.
+ *
+ * \return false if the snapshot storage could not be obtained, true
+ *         otherwise (or the monitor's result for monitor queries)
+ */
+static bool
+crocus_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_query *q = (struct crocus_query *) query;
+
+   if (q->monitor)
+      return crocus_begin_monitor(ctx, q->monitor);
+
+   /* The SO overflow queries use a larger snapshot layout. */
+   const bool so_overflow = q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+                            q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE;
+   const uint32_t size = so_overflow ? sizeof(struct crocus_query_so_overflow)
+                                     : sizeof(struct crocus_query_snapshots);
+   void *cpu_ptr = NULL;
+
+   u_upload_alloc(ice->query_buffer_uploader, 0,
+                  size, size, &q->query_state_ref.offset,
+                  &q->query_state_ref.res, &cpu_ptr);
+
+   if (!crocus_resource_bo(q->query_state_ref.res))
+      return false;
+
+   q->map = cpu_ptr;
+   if (!q->map)
+      return false;
+
+   /* Reset the CPU-visible state before new snapshots get written. */
+   q->result = 0ull;
+   q->ready = false;
+   WRITE_ONCE(q->map->snapshots_landed, false);
+
+   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+      ice->state.prims_generated_query_active = true;
+      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+   }
+
+#if GFX_VER <= 5
+   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+       q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+      ice->state.stats_wm++;
+      ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+   }
+#endif
+
+#if GFX_VER >= 6
+   if (so_overflow) {
+      write_overflow_values(ice, q, false);
+      return true;
+   }
+#endif
+
+   write_value(ice, q,
+               q->query_state_ref.offset +
+               offsetof(struct crocus_query_snapshots, start));
+
+   return true;
+}
+
+/**
+ * End a query.
+ *
+ * - Monitor-backed queries are forwarded to crocus_end_monitor().
+ * - PIPE_QUERY_GPU_FINISHED performs a deferred flush that fills q->fence.
+ * - PIPE_QUERY_TIMESTAMP records its single snapshot through the begin
+ *   path and is then marked available.
+ * - Everything else writes the ending snapshot(s), takes a reference on the
+ *   batch's signalling syncobj and is marked available.
+ *
+ * \return false if a timestamp query could not obtain its snapshot storage,
+ *         true otherwise (or the monitor's result for monitor queries)
+ */
+static bool
+crocus_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_query *q = (void *) query;
+
+   if (q->monitor)
+      return crocus_end_monitor(ctx, q->monitor);
+
+   if (q->type == PIPE_QUERY_GPU_FINISHED) {
+      ctx->flush(ctx, &q->fence, PIPE_FLUSH_DEFERRED);
+      return true;
+   }
+
+   struct crocus_batch *batch = &ice->batches[q->batch_idx];
+
+   if (q->type == PIPE_QUERY_TIMESTAMP) {
+      /* The begin path allocates the snapshot storage and writes the
+       * timestamp.  If it fails, do not go on to signal and mark the query
+       * available.
+       */
+      if (!crocus_begin_query(ctx, query))
+         return false;
+      crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+      mark_available(ice, q);
+      return true;
+   }
+
+#if GFX_VER <= 5
+   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER ||
+       q->type == PIPE_QUERY_OCCLUSION_PREDICATE) {
+      ice->state.stats_wm--;
+      ice->state.dirty |= CROCUS_DIRTY_WM | CROCUS_DIRTY_COLOR_CALC_STATE;
+   }
+#endif
+   if (q->type == PIPE_QUERY_PRIMITIVES_GENERATED && q->index == 0) {
+      ice->state.prims_generated_query_active = false;
+      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+   }
+
+#if GFX_VER >= 6
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
+      write_overflow_values(ice, q, true);
+   else
+#endif
+      write_value(ice, q,
+                  q->query_state_ref.offset +
+                  offsetof(struct crocus_query_snapshots, end));
+
+   crocus_batch_reference_signal_syncobj(batch, &q->syncobj);
+   mark_available(ice, q);
+
+   return true;
+}
+
+/**
+ * Opportunistically finish a query: if it is not ready yet but its
+ * snapshots_landed flag is set, compute the result on the CPU (which also
+ * marks it ready).  Never flushes, unlike crocus_get_query_result.
+ */
+static void
+crocus_check_query_no_flush(struct crocus_context *ice, struct crocus_query *q)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) ice->ctx.screen;
+
+   if (q->ready)
+      return;
+
+   if (!READ_ONCE(q->map->snapshots_landed))
+      return;
+
+   calculate_result_on_cpu(&screen->devinfo, q);
+}
+
+/**
+ * Fetch the result of a query into \p result.
+ *
+ * Monitor-backed queries are forwarded to crocus_get_monitor_result().
+ * With no_hw set on the screen the result is always 0.  Otherwise, if the
+ * result is not ready yet, the owning batch is flushed when it still holds
+ * the query's signalling syncobj, and the result is computed on the CPU.
+ *
+ * \param wait  whether the caller allows blocking for the result
+ * \return true when \p result was filled in; on GFX_VERx10 == 75 builds,
+ *         false when \p wait is false and the snapshots have not landed
+ */
+static bool
+crocus_get_query_result(struct pipe_context *ctx,
+                        struct pipe_query *query,
+                        bool wait,
+                        union pipe_query_result *result)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_query *q = (void *) query;
+
+   if (q->monitor)
+      return crocus_get_monitor_result(ctx, q->monitor, wait, result->batch);
+
+   struct crocus_screen *screen = (void *) ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (unlikely(screen->no_hw)) {
+      result->u64 = 0;
+      return true;
+   }
+
+   if (!q->ready) {
+      struct crocus_batch *batch = &ice->batches[q->batch_idx];
+      /* The commands producing the result are still queued: submit them. */
+      if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+         crocus_batch_flush(batch);
+
+#if GFX_VERx10 == 75
+      /* Poll the landed flag, blocking on the syncobj only if allowed. */
+      while (!READ_ONCE(q->map->snapshots_landed)) {
+         if (wait)
+            crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+         else
+            return false;
+      }
+      assert(READ_ONCE(q->map->snapshots_landed));
+#else
+      /* NOTE(review): when wait is false this path neither waits nor
+       * checks whether the snapshots have landed before computing the
+       * result below, and the return value of crocus_wait_syncobj is
+       * ignored -- confirm this is intended.
+       */
+      if (wait)
+         crocus_wait_syncobj(ctx->screen, q->syncobj, INT64_MAX);
+#endif
+      calculate_result_on_cpu(devinfo, q);
+   }
+
+   assert(q->ready);
+
+   result->u64 = q->result;
+
+   return true;
+}
+
+#if GFX_VER == 7
+/**
+ * Write a query result (or its availability) into a buffer resource.
+ *
+ * \param wait         only consulted on GFX_VERx10 == 75 builds, where it
+ *                     decides whether the GPU-side store is predicated
+ * \param result_type  values up to PIPE_QUERY_TYPE_U32 produce a 32-bit
+ *                     write, larger ones a 64-bit write
+ * \param index        -1 requests the availability value instead of the
+ *                     result
+ * \param p_res        destination resource; gets PIPE_BIND_QUERY_BUFFER
+ *                     added to its bind history
+ * \param offset       byte offset of the write inside \p p_res
+ */
+static void
+crocus_get_query_result_resource(struct pipe_context *ctx,
+                                 struct pipe_query *query,
+                                 bool wait,
+                                 enum pipe_query_value_type result_type,
+                                 int index,
+                                 struct pipe_resource *p_res,
+                                 unsigned offset)
+{
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_query *q = (void *) query;
+   struct crocus_batch *batch = &ice->batches[q->batch_idx];
+   struct crocus_screen *screen = batch->screen;
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct crocus_resource *res = (void *) p_res;
+   struct crocus_bo *query_bo = crocus_resource_bo(q->query_state_ref.res);
+   struct crocus_bo *dst_bo = crocus_resource_bo(p_res);
+   /* NOTE(review): this offset does not include q->query_state_ref.offset,
+    * unlike the snapshot offsets used in begin/end query -- confirm that
+    * the availability reads below target the right location.
+    */
+   unsigned snapshots_landed_offset =
+      offsetof(struct crocus_query_snapshots, snapshots_landed);
+
+   res->bind_history |= PIPE_BIND_QUERY_BUFFER;
+
+   if (index == -1) {
+      /* They're asking for the availability of the result.  If we still
+       * have commands queued up which produce the result, submit them
+       * now so that progress happens.  Either way, copy the snapshots
+       * landed field to the destination resource.
+       */
+      if (q->syncobj == crocus_batch_get_signal_syncobj(batch))
+         crocus_batch_flush(batch);
+
+      screen->vtbl.copy_mem_mem(batch, dst_bo, offset,
+                                query_bo, snapshots_landed_offset,
+                                result_type <= PIPE_QUERY_TYPE_U32 ? 4 : 8);
+      return;
+   }
+
+   if (!q->ready && READ_ONCE(q->map->snapshots_landed)) {
+      /* The final snapshots happen to have landed, so let's just compute
+       * the result on the CPU now...
+       */
+      calculate_result_on_cpu(devinfo, q);
+   }
+
+   if (q->ready) {
+      /* We happen to have the result on the CPU, so just copy it. */
+      if (result_type <= PIPE_QUERY_TYPE_U32) {
+         screen->vtbl.store_data_imm32(batch, dst_bo, offset, q->result);
+      } else {
+         screen->vtbl.store_data_imm64(batch, dst_bo, offset, q->result);
+      }
+
+      /* Make sure the result lands before they bind the QBO elsewhere
+       * and use the result.
+       */
+      // XXX: Why?  i965 doesn't do this.
+      crocus_emit_pipe_control_flush(batch,
+                                     "query: unknown QBO flushing hack",
+                                     PIPE_CONTROL_CS_STALL);
+      return;
+   }
+
+   /* NOTE(review): on GFX_VER == 7 builds other than 7.5 nothing is written
+    * to the destination when the result is not yet available on the CPU.
+    */
+#if GFX_VERx10 == 75
+   /* Only predicate the store when the caller did not ask to wait and the
+    * query has not been marked stalled.
+    */
+   bool predicated = !wait && !q->stalled;
+
+   struct mi_builder b;
+   mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+   struct mi_value result = calculate_result_on_gpu(devinfo, &b, q);
+   struct mi_value dst =
+      result_type <= PIPE_QUERY_TYPE_U32 ? mi_mem32(rw_bo(dst_bo, offset))
+                                         : mi_mem64(rw_bo(dst_bo, offset));
+
+   if (predicated) {
+      /* Load the landed flag as the predicate, then store conditionally. */
+      mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
+                   mi_mem64(ro_bo(query_bo, snapshots_landed_offset)));
+      mi_store_if(&b, dst, result);
+   } else {
+      mi_store(&b, dst, result);
+   }
+#endif
+}
+#endif
+
+/**
+ * Enable or disable the statistics counters.  When the setting actually
+ * changes, the state that depends on it is flagged dirty.
+ */
+static void
+crocus_set_active_query_state(struct pipe_context *ctx, bool enable)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   if (ice->state.statistics_counters_enabled != enable) {
+      // XXX: most packets aren't paying attention to this yet, because it'd
+      // have to be done dynamically at draw time, which is a pain
+      ice->state.statistics_counters_enabled = enable;
+
+      ice->state.dirty |= CROCUS_DIRTY_WM |
+                          CROCUS_DIRTY_STREAMOUT |
+                          CROCUS_DIRTY_RASTER |
+                          CROCUS_DIRTY_CLIP;
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS |
+                                CROCUS_STAGE_DIRTY_TES |
+                                CROCUS_STAGE_DIRTY_TCS |
+                                CROCUS_STAGE_DIRTY_GS;
+   }
+}
+
+/**
+ * Set the software predicate state: render when \p value is true, skip
+ * rendering otherwise.
+ */
+static void
+set_predicate_enable(struct crocus_context *ice, bool value)
+{
+   ice->state.predicate = value ? CROCUS_PREDICATE_STATE_RENDER
+                                : CROCUS_PREDICATE_STATE_DONT_RENDER;
+}
+
+#if GFX_VER == 7
+/**
+ * Program hardware predication from a query whose result is not yet known
+ * on the CPU.
+ *
+ * \param ice       context whose predicate state is updated
+ * \param q         query supplying the snapshots
+ * \param inverted  selects the sense of the predicate; callers in this
+ *                  file pass the render condition's "condition" flag
+ *
+ * On builds other than GFX_VERx10 == 75, SO overflow queries instead switch
+ * to CROCUS_PREDICATE_STATE_STALL_FOR_QUERY and return early.
+ */
+static void
+set_predicate_for_result(struct crocus_context *ice,
+                         struct crocus_query *q,
+                         bool inverted)
+{
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct crocus_bo *bo = crocus_resource_bo(q->query_state_ref.res);
+
+#if GFX_VERx10 != 75
+   /* IVB doesn't have enough MI for this */
+   if (q->type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
+       q->type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
+      ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+      return;
+   }
+#endif
+
+   /* The CPU doesn't have the query result yet; use hardware predication */
+   ice->state.predicate = CROCUS_PREDICATE_STATE_USE_BIT;
+
+   /* Ensure the memory is coherent for MI_LOAD_REGISTER_* commands. */
+   crocus_emit_pipe_control_flush(batch,
+                                  "conditional rendering: set predicate",
+                                  PIPE_CONTROL_FLUSH_ENABLE);
+   q->stalled = true;
+
+#if GFX_VERx10 != 75
+   /* Load the start/end snapshots into the two predicate source registers
+    * and compare them for equality.
+    */
+   struct crocus_screen *screen = batch->screen;
+   screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC0, bo,
+                                    q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, start));
+   screen->vtbl.load_register_mem64(batch, MI_PREDICATE_SRC1, bo,
+                                    q->query_state_ref.offset + offsetof(struct crocus_query_snapshots, end));
+
+   uint32_t mi_predicate = MI_PREDICATE | MI_PREDICATE_COMBINEOP_SET |
+      MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+   if (inverted)
+      mi_predicate |= MI_PREDICATE_LOADOP_LOAD;
+   else
+      mi_predicate |= MI_PREDICATE_LOADOP_LOADINV;
+   crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+#else
+   struct mi_builder b;
+   mi_builder_init(&b, &batch->screen->devinfo, batch);
+
+   struct mi_value result;
+
+   switch (q->type) {
+   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+      result = calc_overflow_for_stream(&b, q, q->index);
+      break;
+   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
+      result = calc_overflow_any_stream(&b, q);
+      break;
+   default: {
+      /* PIPE_QUERY_OCCLUSION_* */
+      struct mi_value start =
+         query_mem64(q, offsetof(struct crocus_query_snapshots, start));
+      struct mi_value end =
+         query_mem64(q, offsetof(struct crocus_query_snapshots, end));
+      result = mi_isub(&b, end, start);
+      break;
+   }
+   }
+
+   /* Fold the requested sense into the value itself and reduce it to 0 or
+    * 1, so the MI_PREDICATE below can always use the same load operation.
+    */
+   result = inverted ? mi_z(&b, result) : mi_nz(&b, result);
+   result = mi_iand(&b, result, mi_imm(1));
+
+   /* We immediately set the predicate on the render batch, as all the
+    * counters come from 3D operations.  However, we may need to predicate
+    * a compute dispatch, which executes in a different GEM context and has
+    * a different MI_PREDICATE_RESULT register.  So, we save the result to
+    * memory and reload it in crocus_launch_grid.
+    */
+   mi_value_ref(&b, result);
+
+   mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), result);
+   mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+
+   unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+      MI_PREDICATE_COMBINEOP_SET |
+      MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+   crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+   mi_store(&b, query_mem64(q, offsetof(struct crocus_query_snapshots,
+                                        predicate_result)), result);
+#endif
+   /* Remember which BO backs the predicate for later compute dispatches.
+    * NOTE(review): only the BO is recorded here, not the query's offset
+    * within it -- confirm the consumer reads the intended location.
+    */
+   ice->state.compute_predicate = bo;
+}
+#endif
+
+/**
+ * Set (or clear, when \p query is NULL) the conditional rendering state.
+ *
+ * If the query has a nonzero result or is already ready, the predicate is
+ * resolved immediately on the CPU.  Otherwise GFX_VER == 7 builds program
+ * hardware predication and all other builds fall back to stalling for the
+ * query.
+ */
+static void
+crocus_render_condition(struct pipe_context *ctx,
+                        struct pipe_query *query,
+                        bool condition,
+                        enum pipe_render_cond_flag mode)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_query *q = (struct crocus_query *) query;
+
+   /* Record the new condition; a previously saved compute predicate is no
+    * longer relevant and gets re-established below if needed.
+    */
+   ice->condition.query = q;
+   ice->condition.condition = condition;
+   ice->condition.mode = mode;
+   ice->state.compute_predicate = NULL;
+
+   if (q == NULL) {
+      ice->state.predicate = CROCUS_PREDICATE_STATE_RENDER;
+      return;
+   }
+
+   crocus_check_query_no_flush(ice, q);
+
+   if (q->result || q->ready) {
+      set_predicate_enable(ice, (q->result != 0) ^ condition);
+      return;
+   }
+
+   if (mode == PIPE_RENDER_COND_NO_WAIT ||
+       mode == PIPE_RENDER_COND_BY_REGION_NO_WAIT) {
+      perf_debug(&ice->dbg, "Conditional rendering demoted from "
+                 "\"no wait\" to \"wait\".");
+   }
+
+#if GFX_VER == 7
+   set_predicate_for_result(ice, q, condition);
+#else
+   ice->state.predicate = CROCUS_PREDICATE_STATE_STALL_FOR_QUERY;
+#endif
+}
+
+/**
+ * If the context currently relies on the hardware predicate bit, wait for
+ * the condition query's result and switch to the equivalent CPU-resolved
+ * predicate state.  Does nothing in any other predicate state.
+ */
+static void
+crocus_resolve_conditional_render(struct crocus_context *ice)
+{
+   if (ice->state.predicate != CROCUS_PREDICATE_STATE_USE_BIT)
+      return;
+
+   struct crocus_query *q = ice->condition.query;
+   union pipe_query_result result;
+
+   assert(q);
+
+   /* Blocking fetch; it leaves the value in q->result as well. */
+   crocus_get_query_result((struct pipe_context *) ice,
+                           (struct pipe_query *) q, true, &result);
+   set_predicate_enable(ice, (q->result != 0) ^ ice->condition.condition);
+}
+
+#if GFX_VER >= 7
+/**
+ * Emit the commands that set the predicate on \p batch from the saved
+ * compute predicate BO: load its first dword into MI_PREDICATE_SRC0, zero
+ * MI_PREDICATE_SRC1, then emit an MI_PREDICATE comparing the two.
+ */
+static void
+crocus_emit_compute_predicate(struct crocus_batch *batch)
+{
+   struct crocus_screen *screen = batch->screen;
+   struct crocus_context *ice = batch->ice;
+   unsigned mi_predicate = MI_PREDICATE |
+      MI_PREDICATE_LOADOP_LOADINV |
+      MI_PREDICATE_COMBINEOP_SET |
+      MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+   screen->vtbl.load_register_mem32(batch, MI_PREDICATE_SRC0,
+                                    ice->state.compute_predicate, 0);
+   screen->vtbl.load_register_imm32(batch, MI_PREDICATE_SRC1, 0);
+
+   crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+}
+#endif
+
+/**
+ * Install the query-related entry points into the screen's vtable.
+ */
+void
+genX(init_screen_query)(struct crocus_screen *screen)
+{
+#if GFX_VER >= 7
+   /* Only built for GFX_VER >= 7. */
+   screen->vtbl.emit_compute_predicate = crocus_emit_compute_predicate;
+#endif
+   screen->vtbl.resolve_conditional_render = crocus_resolve_conditional_render;
+}
+
+/**
+ * Hook the query entry points into the context's pipe_context.
+ */
+void
+genX(init_query)(struct crocus_context *ice)
+{
+   /* Object lifetime */
+   ice->ctx.create_query = crocus_create_query;
+   ice->ctx.create_batch_query = crocus_create_batch_query;
+   ice->ctx.destroy_query = crocus_destroy_query;
+
+   /* Recording and readback */
+   ice->ctx.begin_query = crocus_begin_query;
+   ice->ctx.end_query = crocus_end_query;
+   ice->ctx.get_query_result = crocus_get_query_result;
+#if GFX_VER == 7
+   ice->ctx.get_query_result_resource = crocus_get_query_result_resource;
+#endif
+
+   /* Global query state and conditional rendering */
+   ice->ctx.set_active_query_state = crocus_set_active_query_state;
+   ice->ctx.render_condition = crocus_render_condition;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resolve.c b/src/gallium/drivers/crocus/crocus_resolve.c
new file mode 100644
index 00000000000..a38eb4a94a7
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resolve.c
@@ -0,0 +1,1061 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resolve.c
+ *
+ * This file handles resolve tracking for main and auxiliary surfaces.
+ *
+ * It also handles our cache tracking.  We have sets for the render cache,
+ * depth cache, and so on.  If a BO is in a cache's set, then it may have
+ * data in that cache.  The helpers take care of emitting flushes for
+ * render-to-texture, format reinterpretation issues, and other situations.
+ */
+
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "crocus_context.h"
+#include "compiler/nir/nir.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLORP
+
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+                             struct crocus_resource *res);
+/**
+ * Flag bound color buffers whose auxiliary buffer must be disabled because
+ * the same BO is also bound as a texture or shader image.  Such a
+ * self-dependency lets rendering and sampling access the CCS buffer
+ * concurrently, producing incorrect pixels.
+ *
+ * Only resources using ISL_AUX_USAGE_CCS_D are considered.  A color buffer
+ * matches when it shares \p tex_res's BO and its level lies within
+ * [min_level, min_level + num_levels).
+ *
+ * \param draw_aux_buffer_disabled  per-color-buffer flags; matching entries
+ *                                  are set to true
+ * \param usage                     text appended to the perf debug message
+ * \return true if at least one color buffer matched
+ */
+static bool
+disable_rb_aux_buffer(struct crocus_context *ice,
+                      bool *draw_aux_buffer_disabled,
+                      struct crocus_resource *tex_res,
+                      unsigned min_level, unsigned num_levels,
+                      const char *usage)
+{
+   /* We only need to worry about fast clears. */
+   if (tex_res->aux.usage != ISL_AUX_USAGE_CCS_D)
+      return false;
+
+   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   bool any_match = false;
+
+   for (unsigned rt = 0; rt < cso_fb->nr_cbufs; rt++) {
+      struct crocus_surface *surf = (void *) cso_fb->cbufs[rt];
+      if (!surf)
+         continue;
+
+      struct crocus_resource *rb_res = (void *) surf->base.texture;
+
+      if (rb_res->bo != tex_res->bo)
+         continue;
+
+      if (surf->base.u.tex.level < min_level ||
+          surf->base.u.tex.level >= min_level + num_levels)
+         continue;
+
+      draw_aux_buffer_disabled[rt] = true;
+      any_match = true;
+   }
+
+   if (any_match) {
+      perf_debug(&ice->dbg,
+                 "Disabling CCS because a renderbuffer is also bound %s.\n",
+                 usage);
+   }
+
+   return any_match;
+}
+
+/**
+ * Prepare every bound sampler view that the shader uses for texturing.
+ *
+ * Non-buffer views are prepared for texture access (after checking for a
+ * framebuffer self-dependency when \p consider_framebuffer is set), and each
+ * view's BO gets a cache flush for reading.  On ver >= 7, stencil-format
+ * views additionally refresh the stencil shadow and flush it for reading.
+ *
+ * \param info  shader info; when NULL no views are processed
+ */
+static void
+resolve_sampler_views(struct crocus_context *ice,
+                      struct crocus_batch *batch,
+                      struct crocus_shader_state *shs,
+                      const struct shader_info *info,
+                      bool *draw_aux_buffer_disabled,
+                      bool consider_framebuffer)
+{
+   uint32_t views = 0;
+
+   if (info)
+      views = shs->bound_sampler_views & info->textures_used[0];
+
+   while (views) {
+      const int slot = u_bit_scan(&views);
+      struct crocus_sampler_view *isv = shs->textures[slot];
+      const bool is_texture = isv->res->base.target != PIPE_BUFFER;
+
+      if (is_texture && consider_framebuffer) {
+         disable_rb_aux_buffer(ice, draw_aux_buffer_disabled, isv->res,
+                               isv->view.base_level, isv->view.levels,
+                               "for sampling");
+      }
+
+      if (is_texture) {
+         crocus_resource_prepare_texture(ice, isv->res, isv->view.format,
+                                         isv->view.base_level, isv->view.levels,
+                                         isv->view.base_array_layer,
+                                         isv->view.array_len);
+      }
+
+      crocus_cache_flush_for_read(batch, isv->res->bo);
+
+      /* The stencil shadow handling below applies to ver >= 7 only. */
+      if (batch->screen->devinfo.ver < 7)
+         continue;
+
+      if (isv->base.format != PIPE_FORMAT_X24S8_UINT &&
+          isv->base.format != PIPE_FORMAT_X32_S8X24_UINT &&
+          isv->base.format != PIPE_FORMAT_S8_UINT)
+         continue;
+
+      struct crocus_resource *zres, *sres;
+      crocus_get_depth_stencil_resources(&batch->screen->devinfo, isv->base.texture, &zres, &sres);
+      crocus_update_stencil_shadow(ice, sres);
+      crocus_cache_flush_for_read(batch, sres->shadow->bo);
+   }
+}
+
+/**
+ * Prepare every bound shader image view for access.
+ *
+ * Non-buffer images are prepared for access with ISL_AUX_USAGE_NONE (after
+ * checking for a framebuffer self-dependency when \p consider_framebuffer is
+ * set), and each image's BO gets a cache flush for reading.
+ */
+static void
+resolve_image_views(struct crocus_context *ice,
+                    struct crocus_batch *batch,
+                    struct crocus_shader_state *shs,
+                    bool *draw_aux_buffer_disabled,
+                    bool consider_framebuffer)
+{
+   /* TODO: Consider images used by program */
+   uint32_t remaining = shs->bound_image_views;
+
+   while (remaining) {
+      const int slot = u_bit_scan(&remaining);
+      struct pipe_image_view *pview = &shs->image[slot].base;
+      struct crocus_resource *res = (void *) pview->resource;
+      const bool is_texture = res->base.target != PIPE_BUFFER;
+
+      if (is_texture && consider_framebuffer) {
+         disable_rb_aux_buffer(ice, draw_aux_buffer_disabled,
+                               res, pview->u.tex.level, 1,
+                               "as a shader image");
+      }
+
+      if (is_texture) {
+         unsigned num_layers =
+            pview->u.tex.last_layer - pview->u.tex.first_layer + 1;
+
+         /* The data port doesn't understand any compression */
+         crocus_resource_prepare_access(ice, res,
+                                        pview->u.tex.level, 1,
+                                        pview->u.tex.first_layer, num_layers,
+                                        ISL_AUX_USAGE_NONE, false);
+      }
+
+      crocus_cache_flush_for_read(batch, res->bo);
+   }
+}
+
+/**
+ * Copy one miplevel slice between a surface's texture and its align_res
+ * resource using the BLT blit path.
+ *
+ * \param copy_to_wa  true: copy texture (surface level / first layer) into
+ *                    align_res (level 0, z 0); false: copy the other way
+ *
+ * The copied area is the full minified width and height of the surface's
+ * level.  A failing blit trips an assertion.
+ */
+static void
+crocus_update_align_res(struct crocus_batch *batch,
+                        struct crocus_surface *surf,
+                        bool copy_to_wa)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)batch->screen;
+   struct pipe_resource *tex = surf->base.texture;
+   const unsigned level = surf->base.u.tex.level;
+   const unsigned first_layer = surf->base.u.tex.first_layer;
+   struct pipe_blit_info info = { 0 };
+
+   if (copy_to_wa) {
+      info.src.resource = tex;
+      info.src.level = level;
+      info.dst.resource = surf->align_res;
+      info.dst.level = 0;
+   } else {
+      info.src.resource = surf->align_res;
+      info.src.level = 0;
+      info.dst.resource = tex;
+      info.dst.level = level;
+   }
+
+   u_box_2d_zslice(0, 0, copy_to_wa ? first_layer : 0,
+                   u_minify(tex->width0, level),
+                   u_minify(tex->height0, level), &info.src.box);
+
+   /* Same extent on both sides; only the z slice differs. */
+   info.dst.box = info.src.box;
+   info.dst.box.z = copy_to_wa ? 0 : first_layer;
+
+   info.src.format = tex->format;
+   info.dst.format = tex->format;
+   info.mask = util_format_is_depth_or_stencil(tex->format) ? PIPE_MASK_ZS : PIPE_MASK_RGBA;
+   info.filter = 0;
+
+   if (!screen->vtbl.blit_blt(batch, &info)) {
+      assert(0);
+   }
+}
+
+/**
+ * \brief Resolve a shader stage's inputs before drawing.
+ *
+ * When the stage's bindings are dirty (or, with \p consider_framebuffer set,
+ * the fragment stage's bindings are), resolve the stage's sampler views and
+ * shader image views.
+ *
+ * \param draw_aux_buffer_disabled  per-color-buffer flags updated by the
+ *                                  view resolves
+ * \param stage                     shader stage whose bindings are resolved
+ * \param consider_framebuffer      also check views against the bound
+ *                                  framebuffer for self-dependencies
+ */
+void
+crocus_predraw_resolve_inputs(struct crocus_context *ice,
+                              struct crocus_batch *batch,
+                              bool *draw_aux_buffer_disabled,
+                              gl_shader_stage stage,
+                              bool consider_framebuffer)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+   uint64_t relevant_bits = CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+   if (consider_framebuffer)
+      relevant_bits |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+   /* Nothing relevant changed since the last resolve. */
+   if (!(ice->state.stage_dirty & relevant_bits))
+      return;
+
+   resolve_sampler_views(ice, batch, shs, info, draw_aux_buffer_disabled,
+                         consider_framebuffer);
+   resolve_image_views(ice, batch, shs, draw_aux_buffer_disabled,
+                       consider_framebuffer);
+}
+
+/**
+ * Prepare the bound framebuffer attachments for a draw.
+ *
+ * - If the depth buffer state is dirty, the depth resource is prepared for
+ *   rendering with the current HiZ usage and the depth/stencil BOs are
+ *   checked against the render cache set.
+ * - If the fragment shader reads any of its outputs, every bound color
+ *   buffer is additionally prepared for texture access.
+ * - If the fragment stage bindings are dirty, the render aux usage of each
+ *   bound color buffer is recomputed, recorded in
+ *   ice->state.draw_aux_usage[], and the resource is prepared for rendering.
+ *
+ * \param draw_aux_buffer_disabled  per-color-buffer flags, indexed like
+ *                                  cso_fb->cbufs[], forwarded to
+ *                                  crocus_resource_render_aux_usage().
+ */
+void
+crocus_predraw_resolve_framebuffer(struct crocus_context *ice,
+                                   struct crocus_batch *batch,
+                                   bool *draw_aux_buffer_disabled)
+{
+   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+   /* NOTE(review): ish is dereferenced without a NULL check, so this
+    * presumably relies on a fragment shader always being bound here --
+    * confirm against callers.
+    */
+   struct crocus_uncompiled_shader *ish =
+      ice->shaders.uncompiled[MESA_SHADER_FRAGMENT];
+   const nir_shader *nir = ish->nir;
+
+   if (ice->state.dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
+      struct pipe_surface *zs_surf = cso_fb->zsbuf;
+
+      if (zs_surf) {
+         struct crocus_resource *z_res, *s_res;
+         crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+         unsigned num_layers =
+            zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+         if (z_res) {
+            crocus_resource_prepare_render(ice, z_res,
+                                           zs_surf->u.tex.level,
+                                           zs_surf->u.tex.first_layer,
+                                           num_layers, ice->state.hiz_usage);
+            crocus_cache_flush_for_depth(batch, z_res->bo);
+
+            /* Refresh the alignment workaround copy from the real texture. */
+            if (((struct crocus_surface *)zs_surf)->align_res) {
+               crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, true);
+            }
+         }
+
+         if (s_res) {
+            crocus_cache_flush_for_depth(batch, s_res->bo);
+         }
+      }
+   }
+
+   /* The fragment shader reads back its outputs: the color buffers are also
+    * accessed as textures, so prepare them for that access.
+    */
+   if (nir->info.outputs_read != 0) {
+      for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+         if (cso_fb->cbufs[i]) {
+            struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+            struct crocus_resource *res = (void *) cso_fb->cbufs[i]->texture;
+
+            crocus_resource_prepare_texture(ice, res, surf->view.format,
+                                            surf->view.base_level, 1,
+                                            surf->view.base_array_layer,
+                                            surf->view.array_len);
+         }
+      }
+   }
+
+   if (ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS) {
+      for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+         struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+         if (!surf)
+            continue;
+
+         struct crocus_resource *res = (void *) surf->base.texture;
+
+         /* Refresh the alignment workaround copy from the real texture. */
+         if (surf->align_res)
+            crocus_update_align_res(batch, surf, true);
+
+         enum isl_aux_usage aux_usage =
+            crocus_resource_render_aux_usage(ice, res, surf->view.format,
+                                             ice->state.blend_enables & (1u << i),
+                                             draw_aux_buffer_disabled[i]);
+
+         /* A changed aux usage invalidates already-emitted surface bindings. */
+         if (ice->state.draw_aux_usage[i] != aux_usage) {
+            ice->state.draw_aux_usage[i] = aux_usage;
+            /* XXX: Need to track which bindings to make dirty */
+            ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+         }
+
+         crocus_resource_prepare_render(ice, res, surf->view.base_level,
+                                        surf->view.base_array_layer,
+                                        surf->view.array_len,
+                                        aux_usage);
+
+         crocus_cache_flush_for_render(batch, res->bo, surf->view.format,
+                                       aux_usage);
+      }
+   }
+}
+
+/**
+ * \brief Call this after drawing to update aux-state and cache tracking.
+ *
+ * For the depth/stencil attachment: if depth (or stencil) writes are
+ * enabled, the resource's BO is added to the batch's depth cache set and,
+ * when the depth-related dirty bits are set, the aux state of the written
+ * layers is transitioned for the write.
+ *
+ * For every bound color buffer: the BO is added to the render cache set
+ * together with the format and aux usage used for the draw and, when the
+ * fragment stage bindings are dirty, the aux state of the written layers is
+ * transitioned for the write.
+ *
+ * Surfaces that have an align_res workaround copy get its contents copied
+ * back into the real texture.
+ */
+void
+crocus_postdraw_update_resolve_tracking(struct crocus_context *ice,
+                                        struct crocus_batch *batch)
+{
+   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+   // XXX: front buffer drawing?
+
+   bool may_have_resolved_depth =
+      ice->state.dirty & (CROCUS_DIRTY_DEPTH_BUFFER |
+                          CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL);
+
+   struct pipe_surface *zs_surf = cso_fb->zsbuf;
+   if (zs_surf) {
+      struct crocus_resource *z_res, *s_res;
+      crocus_get_depth_stencil_resources(devinfo, zs_surf->texture, &z_res, &s_res);
+      unsigned num_layers =
+         zs_surf->u.tex.last_layer - zs_surf->u.tex.first_layer + 1;
+
+      if (z_res) {
+         if (may_have_resolved_depth && ice->state.depth_writes_enabled) {
+            crocus_resource_finish_render(ice, z_res, zs_surf->u.tex.level,
+                                          zs_surf->u.tex.first_layer, num_layers,
+                                          ice->state.hiz_usage);
+         }
+
+         if (ice->state.depth_writes_enabled)
+            crocus_depth_cache_add_bo(batch, z_res->bo);
+
+         /* Copy the workaround surface back into the real texture. */
+         if (((struct crocus_surface *)zs_surf)->align_res) {
+            crocus_update_align_res(batch, (struct crocus_surface *)zs_surf, false);
+         }
+      }
+
+      if (s_res) {
+         if (may_have_resolved_depth && ice->state.stencil_writes_enabled) {
+            crocus_resource_finish_write(ice, s_res, zs_surf->u.tex.level,
+                                         zs_surf->u.tex.first_layer, num_layers,
+                                         s_res->aux.usage);
+         }
+
+         if (ice->state.stencil_writes_enabled)
+            crocus_depth_cache_add_bo(batch, s_res->bo);
+      }
+   }
+
+   bool may_have_resolved_color =
+      ice->state.stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+   for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+      struct crocus_surface *surf = (void *) cso_fb->cbufs[i];
+      if (!surf)
+         continue;
+
+      /* Copy the workaround surface back into the real texture. */
+      if (surf->align_res)
+         crocus_update_align_res(batch, surf, false);
+      struct crocus_resource *res = (void *) surf->base.texture;
+      enum isl_aux_usage aux_usage = ice->state.draw_aux_usage[i];
+
+      /* Remember the format/aux usage this BO was rendered with. */
+      crocus_render_cache_add_bo(batch, res->bo, surf->view.format,
+                                 aux_usage);
+
+      if (may_have_resolved_color) {
+         union pipe_surface_desc *desc = &surf->base.u;
+         unsigned num_layers =
+            desc->tex.last_layer - desc->tex.first_layer + 1;
+         crocus_resource_finish_render(ice, res, desc->tex.level,
+                                       desc->tex.first_layer, num_layers,
+                                       aux_usage);
+      }
+   }
+}
+
+/**
+ * Empty the batch's render and depth cache-tracking sets.
+ */
+void
+crocus_cache_sets_clear(struct crocus_batch *batch)
+{
+   /* Drop every BO recorded as rendered-to in this batch. */
+   hash_table_foreach(batch->cache.render, entry) {
+      _mesa_hash_table_remove(batch->cache.render, entry);
+   }
+
+   /* Drop every BO recorded as a depth/stencil write target. */
+   set_foreach(batch->cache.depth, member) {
+      _mesa_set_remove(batch->cache.depth, member);
+   }
+}
+
+/**
+ * Flush the depth and render caches and reset the cache-tracking sets.
+ *
+ * The GPU has separate, incoherent caches for the render cache and the
+ * sampler cache, along with other caches.  Usually data in the different
+ * caches don't interact, but for render-to-texture in FBOs it definitely
+ * does.  When a batchbuffer is flushed, the kernel will ensure that
+ * everything necessary is flushed before another use of that BO, but for
+ * reuse from different caches within a batchbuffer it is the driver's
+ * responsibility.
+ *
+ * On ver >= 6 this emits a write-cache flush with CS stall followed by a
+ * texture/constant cache invalidate; older hardware gets an MI_FLUSH.
+ */
+void
+crocus_flush_depth_and_render_caches(struct crocus_batch *batch)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   if (devinfo->ver < 6) {
+      crocus_emit_mi_flush(batch);
+   } else {
+      /* First push out everything that was written... */
+      crocus_emit_pipe_control_flush(batch,
+                                     "cache tracker: render-to-texture",
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+
+      /* ...then make sure readers don't see stale cached data. */
+      crocus_emit_pipe_control_flush(batch,
+                                     "cache tracker: render-to-texture",
+                                     PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                     PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+   }
+
+   crocus_cache_sets_clear(batch);
+}
+
+/**
+ * Flush the depth and render caches if \p bo has been recorded in either
+ * cache-tracking set of this batch, so that a following read sees the data.
+ */
+void
+crocus_cache_flush_for_read(struct crocus_batch *batch,
+                            struct crocus_bo *bo)
+{
+   const bool in_render_cache =
+      _mesa_hash_table_search_pre_hashed(batch->cache.render,
+                                         bo->hash, bo) != NULL;
+
+   if (in_render_cache ||
+       _mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo)) {
+      crocus_flush_depth_and_render_caches(batch);
+   }
+}
+
+/**
+ * Pack a format and an aux usage into a pointer-sized value so the pair can
+ * be stored as the data of a render cache hash table entry.  The format
+ * occupies the bits above the low byte, the aux usage the low byte.
+ */
+static void *
+format_aux_tuple(enum isl_format format, enum isl_aux_usage aux_usage)
+{
+   const uint32_t packed = ((uint32_t)format << 8) | aux_usage;
+
+   return (void *)(uintptr_t)packed;
+}
+
+/**
+ * Emit whatever cache flush is needed before rendering to \p bo with the
+ * given format and aux usage.
+ */
+void
+crocus_cache_flush_for_render(struct crocus_batch *batch,
+                              struct crocus_bo *bo,
+                              enum isl_format format,
+                              enum isl_aux_usage aux_usage)
+{
+   /* The BO was written through the depth cache earlier in this batch. */
+   if (_mesa_set_search_pre_hashed(batch->cache.depth, bo->hash, bo))
+      crocus_flush_depth_and_render_caches(batch);
+
+   /* Check to see if this bo has been used by a previous rendering operation
+    * but with a different format or aux usage.  If it has, flush the render
+    * cache so we ensure that it's only in there with one format or aux usage
+    * at a time.
+    *
+    * Even though it's not obvious, this can easily happen in practice.
+    * Suppose a client is blending on a surface with sRGB encode enabled on
+    * gen9.  This implies that you get AUX_USAGE_CCS_D at best.  If the client
+    * then disables sRGB decode and continues blending we will flip on
+    * AUX_USAGE_CCS_E without doing any sort of resolve in-between (this is
+    * perfectly valid since CCS_E is a subset of CCS_D).  However, this means
+    * that we have fragments in-flight which are rendering with UNORM+CCS_E
+    * and other fragments in-flight with SRGB+CCS_D on the same surface at the
+    * same time and the pixel scoreboard and color blender are trying to sort
+    * it all out.  This ends badly (i.e. GPU hangs).
+    *
+    * To date, we have never observed GPU hangs or even corruption to be
+    * associated with switching the format, only the aux usage.  However,
+    * there are comments in various docs which indicate that the render cache
+    * isn't 100% resilient to format changes.  We may as well be conservative
+    * and flush on format changes too.  We can always relax this later if we
+    * find it to be a performance problem.
+    */
+   const struct hash_entry *prev_use =
+      _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+   if (!prev_use)
+      return;
+
+   if (prev_use->data != format_aux_tuple(format, aux_usage))
+      crocus_flush_depth_and_render_caches(batch);
+}
+
+/**
+ * Record in the batch's render cache set that \p bo has been rendered to
+ * with the given format and aux usage.
+ */
+void
+crocus_render_cache_add_bo(struct crocus_batch *batch,
+                           struct crocus_bo *bo,
+                           enum isl_format format,
+                           enum isl_aux_usage aux_usage)
+{
+   void *tuple = format_aux_tuple(format, aux_usage);
+
+#ifndef NDEBUG
+   struct hash_entry *existing =
+      _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+   if (existing) {
+      /* An entry with a different tuple means someone skipped the
+       * flush_for_render call, and that would be very bad indeed.
+       */
+      assert(existing->data == tuple);
+   }
+#endif
+
+   _mesa_hash_table_insert_pre_hashed(batch->cache.render, bo->hash, bo,
+                                      tuple);
+}
+
+/**
+ * Flush the depth and render caches if \p bo has been recorded in the
+ * render cache set of this batch, before it is used as a depth/stencil
+ * buffer.
+ */
+void
+crocus_cache_flush_for_depth(struct crocus_batch *batch,
+                             struct crocus_bo *bo)
+{
+   const struct hash_entry *render_use =
+      _mesa_hash_table_search_pre_hashed(batch->cache.render, bo->hash, bo);
+
+   if (render_use != NULL)
+      crocus_flush_depth_and_render_caches(batch);
+}
+
+/**
+ * Record in the batch's depth cache set that \p bo has been written as a
+ * depth or stencil buffer.
+ */
+void
+crocus_depth_cache_add_bo(struct crocus_batch *batch, struct crocus_bo *bo)
+{
+   _mesa_set_add_pre_hashed(batch->cache.depth, bo->hash, bo);
+}
+
+/**
+ * Run the CCS operation \p resolve_op on a single level/layer of \p res
+ * through BLORP, bracketed by end-of-pipe synchronization.
+ *
+ * The resolve uses the linear (non-sRGB) variant of the resource's surface
+ * format.
+ */
+static void
+crocus_resolve_color(struct crocus_context *ice,
+                     struct crocus_batch *batch,
+                     struct crocus_resource *res,
+                     unsigned level, unsigned layer,
+                     enum isl_aux_op resolve_op)
+{
+   struct crocus_screen *screen = batch->screen;
+   DBG("%s to res %p level %u layer %u\n", __func__, res, level, layer);
+
+   struct blorp_surf surf;
+   crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+                                  &res->base, res->aux.usage, level, true);
+
+   crocus_batch_maybe_flush(batch, 1500);
+
+   /* Ivybridge PRM Vol 2, Part 1, "11.7 MCS Buffer for Render Target(s)":
+    *
+    *    "Any transition from any value in {Clear, Render, Resolve} to a
+    *     different value in {Clear, Render, Resolve} requires end of pipe
+    *     synchronization."
+    *
+    * In other words, fast clear ops are not properly synchronized with
+    * other drawing.  We need to use a PIPE_CONTROL to ensure that the
+    * contents of the previous draw hit the render target before we resolve
+    * and again afterwards to ensure that the resolve is complete before we
+    * do any more regular drawing.
+    */
+   crocus_emit_end_of_pipe_sync(batch, "color resolve: pre-flush",
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+   /* Resolve exactly one layer of the requested level. */
+   blorp_ccs_resolve(&blorp_batch, &surf, level, layer, 1,
+                     isl_format_srgb_to_linear(res->surf.format),
+                     resolve_op);
+   blorp_batch_finish(&blorp_batch);
+
+   /* See comment above */
+   crocus_emit_end_of_pipe_sync(batch, "color resolve: post-flush",
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
+}
+
+/**
+ * Run a BLORP MCS partial resolve on \p num_layers layers of \p res,
+ * starting at \p start_layer.  The resource must use an MCS aux surface.
+ */
+static void
+crocus_mcs_partial_resolve(struct crocus_context *ice,
+                           struct crocus_batch *batch,
+                           struct crocus_resource *res,
+                           uint32_t start_layer,
+                           uint32_t num_layers)
+{
+   struct crocus_screen *screen = batch->screen;
+
+   DBG("%s to res %p layers %u-%u\n", __func__, res,
+       start_layer, start_layer + num_layers - 1);
+
+   assert(isl_aux_usage_has_mcs(res->aux.usage));
+
+   /* The resolve operates on the linear variant of the surface format. */
+   const enum isl_format linear_format =
+      isl_format_srgb_to_linear(res->surf.format);
+
+   struct blorp_surf surf;
+   crocus_blorp_surf_for_resource(&screen->vtbl, &screen->isl_dev, &surf,
+                                  &res->base, res->aux.usage, 0, true);
+
+   struct blorp_batch blorp_batch;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, 0);
+   blorp_mcs_partial_resolve(&blorp_batch, &surf, linear_format,
+                             start_layer, num_layers);
+   blorp_batch_finish(&blorp_batch);
+}
+
+/**
+ * Perform a HiZ or depth resolve operation.
+ *
+ * For an overview of HiZ ops, see the following sections of the Sandy Bridge
+ * PRM, Volume 1, Part 2:
+ *   - 7.5.3.1 Depth Buffer Clear
+ *   - 7.5.3.2 Depth Buffer Resolve
+ *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
+ *
+ * \param res                 resource to operate on; \p level must have HiZ
+ * \param level               miplevel the operation applies to
+ * \param start_layer         first layer the operation applies to
+ * \param num_layers          number of layers the operation applies to
+ * \param op                  full resolve, ambiguate or fast clear; any
+ *                            other value is invalid
+ * \param update_clear_depth  when false, BLORP is told not to update the
+ *                            clear color (BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)
+ */
+void
+crocus_hiz_exec(struct crocus_context *ice,
+                struct crocus_batch *batch,
+                struct crocus_resource *res,
+                unsigned int level, unsigned int start_layer,
+                unsigned int num_layers, enum isl_aux_op op,
+                bool update_clear_depth)
+{
+   struct crocus_screen *screen = batch->screen;
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   assert(crocus_resource_level_has_hiz(res, level));
+   assert(op != ISL_AUX_OP_NONE);
+   /* Only used for the debug message below. */
+   UNUSED const char *name = NULL;
+
+   switch (op) {
+   case ISL_AUX_OP_FULL_RESOLVE:
+      name = "depth resolve";
+      break;
+   case ISL_AUX_OP_AMBIGUATE:
+      name = "hiz ambiguate";
+      break;
+   case ISL_AUX_OP_FAST_CLEAR:
+      name = "depth clear";
+      break;
+   case ISL_AUX_OP_PARTIAL_RESOLVE:
+   case ISL_AUX_OP_NONE:
+      unreachable("Invalid HiZ op");
+   }
+
+   DBG("%s %s to res %p level %d layers %d-%d\n",
+       __func__, name, res, level, start_layer, start_layer + num_layers - 1);
+
+   /* The following stalls and flushes are only documented to be required
+    * for HiZ clear operations.  However, they also seem to be required for
+    * resolve operations.
+    *
+    * From the Ivybridge PRM, volume 2, "Depth Buffer Clear":
+    *
+    *   "If other rendering operations have preceded this clear, a
+    *    PIPE_CONTROL with depth cache flush enabled, Depth Stall bit
+    *    enabled must be issued before the rectangle primitive used for
+    *    the depth buffer clear operation."
+    *
+    * Same applies for Gen8 and Gen9.
+    *
+    * In addition, from the Ivybridge PRM, volume 2, 1.10.4.1
+    * PIPE_CONTROL, Depth Cache Flush Enable:
+    *
+    *   "This bit must not be set when Depth Stall Enable bit is set in
+    *    this packet."
+    *
+    * This is confirmed to hold for real, Haswell gets immediate gpu hangs.
+    *
+    * Therefore issue two pipe control flushes, one for cache flush and
+    * another for depth stall.
+    */
+   if (devinfo->ver == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 313:
+       *
+       *   "If other rendering operations have preceded this clear, a
+       *   PIPE_CONTROL with write cache flush enabled and Z-inhibit
+       *   disabled must be issued before the rectangle primitive used for
+       *   the depth buffer clear operation."
+       */
+      crocus_emit_pipe_control_flush(batch,
+                                     "hiz op: pre-flushes (1)",
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+   } else if (devinfo->ver >= 7) {
+      crocus_emit_pipe_control_flush(batch,
+                                     "hiz op: pre-flushes (1/2)",
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+      crocus_emit_pipe_control_flush(batch, "hiz op: pre-flushes (2/2)",
+                                     PIPE_CONTROL_DEPTH_STALL);
+   }
+
+   assert(isl_aux_usage_has_hiz(res->aux.usage) && res->aux.bo);
+
+   crocus_batch_maybe_flush(batch, 1500);
+
+   struct blorp_surf surf;
+   crocus_blorp_surf_for_resource(&screen->vtbl, &batch->screen->isl_dev, &surf,
+                                  &res->base, res->aux.usage, level, true);
+
+   struct blorp_batch blorp_batch;
+   enum blorp_batch_flags flags = 0;
+   flags |= update_clear_depth ? 0 : BLORP_BATCH_NO_UPDATE_CLEAR_COLOR;
+   blorp_batch_init(&ice->blorp, &blorp_batch, batch, flags);
+   blorp_hiz_op(&blorp_batch, &surf, level, start_layer, num_layers, op);
+   blorp_batch_finish(&blorp_batch);
+
+   /* The following stalls and flushes are only documented to be required
+    * for HiZ clear operations.  However, they also seem to be required for
+    * resolve operations.
+    *
+    * From the Broadwell PRM, volume 7, "Depth Buffer Clear":
+    *
+    *    "Depth buffer clear pass using any of the methods (WM_STATE,
+    *     3DSTATE_WM or 3DSTATE_WM_HZ_OP) must be followed by a
+    *     PIPE_CONTROL command with DEPTH_STALL bit and Depth FLUSH bits
+    *     "set" before starting to render.  DepthStall and DepthFlush are
+    *     not needed between consecutive depth clear passes nor is it
+    *     required if the depth clear pass was done with
+    *     'full_surf_clear' bit set in the 3DSTATE_WM_HZ_OP."
+    *
+    * TODO: Such as the spec says, this could be conditional.
+    */
+   if (devinfo->ver == 6) {
+      /* From the Sandy Bridge PRM, volume 2 part 1, page 314:
+       *
+       *     "[DevSNB, DevSNB-B{W/A}]: Depth buffer clear pass must be
+       *     followed by a PIPE_CONTROL command with DEPTH_STALL bit set
+       *     and Then followed by Depth FLUSH"
+       */
+      crocus_emit_pipe_control_flush(batch,
+                                     "hiz op: post-flushes (1/2)",
+                                     PIPE_CONTROL_DEPTH_STALL);
+
+      crocus_emit_pipe_control_flush(batch,
+                                     "hiz op: post-flushes (2/2)",
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+   }
+}
+
+/**
+ * Report whether HiZ is enabled for miplevel \p level of the resource, as
+ * recorded in the per-level res->aux.has_hiz bitmask.
+ */
+bool
+crocus_resource_level_has_hiz(const struct crocus_resource *res, uint32_t level)
+{
+   crocus_resource_check_level_layer(res, level, 0);
+
+   return (res->aux.has_hiz & (1 << level)) != 0;
+}
+
+/**
+ * Report whether miplevel \p level of the resource has aux data: for HiZ
+ * resources this is tracked per level, otherwise the level merely has to
+ * exist in the aux surface.
+ */
+static bool
+crocus_resource_level_has_aux(const struct crocus_resource *res, uint32_t level)
+{
+   if (!isl_aux_usage_has_hiz(res->aux.usage))
+      return level < res->aux.surf.levels;
+
+   return crocus_resource_level_has_hiz(res, level);
+}
+
+/**
+ * \brief Assert that the level and layer are valid for the resource.
+ *
+ * \p level must be below res->surf.levels and \p layer below the number of
+ * layers util_num_layers() reports for that level.  The parameters are
+ * marked UNUSED because the body consists solely of assertions, which
+ * compile away when NDEBUG is defined.
+ */
+void
+crocus_resource_check_level_layer(UNUSED const struct crocus_resource *res,
+                                  UNUSED uint32_t level, UNUSED uint32_t layer)
+{
+   assert(level < res->surf.levels);
+   assert(layer < util_num_layers(&res->base, level));
+}
+
+/**
+ * Validate a miplevel range against the resource and return its length.
+ *
+ * \param res          resource whose surface bounds the range
+ * \param start_level  first level of the range; must be a valid level
+ * \param num_levels   number of levels, or INTEL_REMAINING_LAYERS (the
+ *                     sentinel this function checks for) to select every
+ *                     level from \p start_level up to the last one
+ *
+ * \return the number of levels in the range; \p num_levels itself unless
+ *         the sentinel was passed.
+ */
+static inline uint32_t
+miptree_level_range_length(const struct crocus_resource *res,
+                           uint32_t start_level, uint32_t num_levels)
+{
+   assert(start_level < res->surf.levels);
+
+   /* "Remaining" means the levels at and after start_level, not the total
+    * level count: using the total would run past the last level (and trip
+    * the range assertion below) whenever start_level is non-zero.
+    */
+   if (num_levels == INTEL_REMAINING_LAYERS)
+      num_levels = res->surf.levels - start_level;
+
+   /* Check for overflow */
+   assert(start_level + num_levels >= start_level);
+   assert(start_level + num_levels <= res->surf.levels);
+
+   return num_levels;
+}
+
+/**
+ * Validate a layer range within \p level of the resource and return its
+ * length.  Passing INTEL_REMAINING_LAYERS as \p num_layers selects every
+ * layer from \p start_layer to the end of the level.
+ */
+static inline uint32_t
+miptree_layer_range_length(const struct crocus_resource *res, uint32_t level,
+                           uint32_t start_layer, uint32_t num_layers)
+{
+   assert(level <= res->base.last_level);
+
+   const uint32_t layers_in_level = crocus_get_num_logical_layers(res, level);
+   assert(start_layer < layers_in_level);
+
+   const uint32_t length = num_layers == INTEL_REMAINING_LAYERS ?
+      layers_in_level - start_layer : num_layers;
+
+   /* Check for overflow */
+   assert(start_layer + length >= start_layer);
+   assert(start_layer + length <= layers_in_level);
+
+   return length;
+}
+
+/**
+ * Report whether any slice in the given level/layer range is in an aux
+ * state whose primary surface is not valid.  Resources without an aux BO
+ * and levels without aux data never count as invalid.
+ */
+bool
+crocus_has_invalid_primary(const struct crocus_resource *res,
+                           unsigned start_level, unsigned num_levels,
+                           unsigned start_layer, unsigned num_layers)
+{
+   if (!res->aux.bo)
+      return false;
+
+   /* Clamp the level range to fit the resource */
+   const uint32_t level_count =
+      miptree_level_range_length(res, start_level, num_levels);
+
+   for (uint32_t level = start_level, levels_left = level_count;
+        levels_left > 0; level++, levels_left--) {
+      if (crocus_resource_level_has_aux(res, level)) {
+         const uint32_t layer_count =
+            miptree_layer_range_length(res, level, start_layer, num_layers);
+
+         for (unsigned layer = start_layer, layers_left = layer_count;
+              layers_left > 0; layer++, layers_left--) {
+            const enum isl_aux_state state =
+               crocus_resource_get_aux_state(res, level, layer);
+
+            if (!isl_aux_state_has_valid_primary(state))
+               return true;
+         }
+      }
+   }
+
+   return false;
+}
+
+/**
+ * Bring the aux state of a range of levels/layers of \p res into a state
+ * compatible with an access using \p aux_usage.
+ *
+ * For each slice that has aux data, isl_aux_prepare_access() selects the
+ * aux operation to perform (possibly none).  The operation is executed on
+ * the render batch and the tracked aux state of the slice is updated to the
+ * state resulting from that operation.  Resources without an aux BO are
+ * left untouched.
+ *
+ * \param start_level, num_levels  level range, validated by
+ *                                 miptree_level_range_length()
+ * \param start_layer, num_layers  layer range, validated per level by
+ *                                 miptree_layer_range_length()
+ * \param aux_usage                aux usage of the upcoming access
+ * \param fast_clear_supported     forwarded to isl_aux_prepare_access()
+ */
+void
+crocus_resource_prepare_access(struct crocus_context *ice,
+                               struct crocus_resource *res,
+                               uint32_t start_level, uint32_t num_levels,
+                               uint32_t start_layer, uint32_t num_layers,
+                               enum isl_aux_usage aux_usage,
+                               bool fast_clear_supported)
+{
+   if (!res->aux.bo)
+      return;
+
+   /* We can't do resolves on the compute engine, so awkwardly, we have to
+    * do them on the render batch...
+    */
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+   const uint32_t clamped_levels =
+      miptree_level_range_length(res, start_level, num_levels);
+   for (uint32_t l = 0; l < clamped_levels; l++) {
+      const uint32_t level = start_level + l;
+      if (!crocus_resource_level_has_aux(res, level))
+         continue;
+
+      const uint32_t level_layers =
+         miptree_layer_range_length(res, level, start_layer, num_layers);
+      for (uint32_t a = 0; a < level_layers; a++) {
+         const uint32_t layer = start_layer + a;
+         const enum isl_aux_state aux_state =
+            crocus_resource_get_aux_state(res, level, layer);
+         const enum isl_aux_op aux_op =
+            isl_aux_prepare_access(aux_state, aux_usage, fast_clear_supported);
+
+         /* Prepare the aux buffer for a conditional or unconditional access.
+          * A conditional access is handled by assuming that the access will
+          * not evaluate to a no-op. If the access does in fact occur, the aux
+          * will be in the required state. If it does not, no data is lost
+          * because the aux_op performed is lossless.
+          */
+         if (aux_op == ISL_AUX_OP_NONE) {
+            /* Nothing to do here. */
+         } else if (isl_aux_usage_has_mcs(res->aux.usage)) {
+            assert(aux_op == ISL_AUX_OP_PARTIAL_RESOLVE);
+            crocus_mcs_partial_resolve(ice, batch, res, layer, 1);
+         } else if (isl_aux_usage_has_hiz(res->aux.usage)) {
+            crocus_hiz_exec(ice, batch, res, level, layer, 1, aux_op, false);
+         } else if (res->aux.usage == ISL_AUX_USAGE_STC_CCS) {
+            unreachable("crocus doesn't resolve STC_CCS resources");
+         } else {
+            assert(isl_aux_usage_has_ccs(res->aux.usage));
+            crocus_resolve_color(ice, batch, res, level, layer, aux_op);
+         }
+
+         /* Record the state the slice is in after the operation above. */
+         const enum isl_aux_state new_state =
+            isl_aux_state_transition_aux_op(aux_state, res->aux.usage, aux_op);
+         crocus_resource_set_aux_state(ice, res, level, layer, 1, new_state);
+      }
+   }
+}
+
+/**
+ * Update aux-state tracking after a write to a range of layers of one
+ * miplevel of \p res performed with \p aux_usage.
+ *
+ * S8_UINT resources are additionally flagged as needing a shadow update.
+ * Levels without aux data need no state tracking and are skipped.
+ */
+void
+crocus_resource_finish_write(struct crocus_context *ice,
+                             struct crocus_resource *res, uint32_t level,
+                             uint32_t start_layer, uint32_t num_layers,
+                             enum isl_aux_usage aux_usage)
+{
+   if (res->base.format == PIPE_FORMAT_S8_UINT)
+      res->shadow_needs_update = true;
+
+   if (!crocus_resource_level_has_aux(res, level))
+      return;
+
+   const uint32_t layer_count =
+      miptree_layer_range_length(res, level, start_layer, num_layers);
+
+   for (uint32_t layer = start_layer, layers_left = layer_count;
+        layers_left > 0; layer++, layers_left--) {
+      const enum isl_aux_state old_state =
+         crocus_resource_get_aux_state(res, level, layer);
+
+      /* Transition the aux state for a conditional or unconditional write. A
+       * conditional write is handled by assuming that the write applies to
+       * only part of the render target. This prevents the new state from
+       * losing the types of compression that might exist in the current state
+       * (e.g. CLEAR). If the write evaluates to a no-op, the state will still
+       * be able to communicate when resolves are necessary (but it may
+       * falsely communicate this as well).
+       */
+      const enum isl_aux_state written_state =
+         isl_aux_state_transition_write(old_state, aux_usage, false);
+
+      crocus_resource_set_aux_state(ice, res, level, layer, 1, written_state);
+   }
+}
+
+/**
+ * Return the tracked aux state of one level/layer of the resource.  The
+ * level must have aux data.
+ */
+enum isl_aux_state
+crocus_resource_get_aux_state(const struct crocus_resource *res,
+                              uint32_t level, uint32_t layer)
+{
+   crocus_resource_check_level_layer(res, level, layer);
+   assert(crocus_resource_level_has_aux(res, level));
+
+   const enum isl_aux_state *level_states = res->aux.state[level];
+
+   return level_states[layer];
+}
+
+/**
+ * Set the tracked aux state of a range of layers of one miplevel.
+ *
+ * Whenever a layer's state actually changes, the resolve/flush dirty bits
+ * and all stage binding dirty bits are raised on the context.
+ */
+void
+crocus_resource_set_aux_state(struct crocus_context *ice,
+                              struct crocus_resource *res, uint32_t level,
+                              uint32_t start_layer, uint32_t num_layers,
+                              enum isl_aux_state aux_state)
+{
+   assert(crocus_resource_level_has_aux(res, level));
+
+   const uint32_t layer_count =
+      miptree_layer_range_length(res, level, start_layer, num_layers);
+
+   for (unsigned a = 0; a < layer_count; a++) {
+      enum isl_aux_state *slot = &res->aux.state[level][start_layer + a];
+
+      if (*slot == aux_state)
+         continue;
+
+      *slot = aux_state;
+      ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES |
+                          CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES;
+      /* XXX: Need to track which bindings to make dirty */
+      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
+   }
+}
+
+/**
+ * Two formats are treated as fast-clear compatible when they are identical
+ * once any sRGB encoding is stripped.
+ */
+static bool
+isl_formats_are_fast_clear_compatible(enum isl_format a, enum isl_format b)
+{
+   /* On gen8 and earlier, the hardware was only capable of handling 0/1 clear
+    * values so sRGB curve application was a no-op for all fast-clearable
+    * formats.
+    *
+    * On gen9+, the hardware supports arbitrary clear values.  For sRGB clear
+    * values, the hardware interprets the floats, not as what would be
+    * returned from the sampler (or written by the shader), but as being
+    * between format conversion and sRGB curve application.  This means that
+    * we can switch between sRGB and UNORM without having to whack the clear
+    * color.
+    */
+   const enum isl_format linear_a = isl_format_srgb_to_linear(a);
+   const enum isl_format linear_b = isl_format_srgb_to_linear(b);
+
+   return linear_a == linear_b;
+}
+
+/**
+ * Prepare a range of levels/layers of \p res for being sampled through a
+ * view of format \p view_format.
+ */
+void
+crocus_resource_prepare_texture(struct crocus_context *ice,
+                                struct crocus_resource *res,
+                                enum isl_format view_format,
+                                uint32_t start_level, uint32_t num_levels,
+                                uint32_t start_layer, uint32_t num_layers)
+{
+   const enum isl_aux_usage aux_usage =
+      crocus_resource_texture_aux_usage(res);
+
+   /* Clear color is specified as ints or floats and the conversion is done by
+    * the sampler.  If we have a texture view, we would have to perform the
+    * clear color conversion manually.  Just disable clear color.
+    */
+   const bool formats_compatible =
+      isl_formats_are_fast_clear_compatible(res->surf.format, view_format);
+
+   const bool clear_supported =
+      aux_usage != ISL_AUX_USAGE_NONE && formats_compatible;
+
+   crocus_resource_prepare_access(ice, res, start_level, num_levels,
+                                  start_layer, num_layers,
+                                  aux_usage, clear_supported);
+}
+
+/**
+ * Pick the aux usage to render to \p res with.
+ *
+ * Returns ISL_AUX_USAGE_NONE when aux is disabled for this draw, MCS for
+ * MCS resources, CCS_D for CCS_D resources whose render format supports it,
+ * and ISL_AUX_USAGE_NONE in every other case.  \p blend_enabled does not
+ * influence the result.
+ */
+enum isl_aux_usage
+crocus_resource_render_aux_usage(struct crocus_context *ice,
+                                 struct crocus_resource *res,
+                                 enum isl_format render_format,
+                                 bool blend_enabled,
+                                 bool draw_aux_disabled)
+{
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+
+   if (draw_aux_disabled)
+      return ISL_AUX_USAGE_NONE;
+
+   if (res->aux.usage == ISL_AUX_USAGE_MCS)
+      return ISL_AUX_USAGE_MCS;
+
+   /* CCS_D is only usable if the render format supports it. */
+   if (res->aux.usage == ISL_AUX_USAGE_CCS_D &&
+       isl_format_supports_ccs_d(devinfo, render_format))
+      return ISL_AUX_USAGE_CCS_D;
+
+   return ISL_AUX_USAGE_NONE;
+}
+
+/**
+ * Prepare a range of layers of a single miplevel of \p res for rendering
+ * with \p aux_usage.  Fast clears are treated as supported whenever an aux
+ * usage is in effect.
+ */
+void
+crocus_resource_prepare_render(struct crocus_context *ice,
+                               struct crocus_resource *res, uint32_t level,
+                               uint32_t start_layer, uint32_t layer_count,
+                               enum isl_aux_usage aux_usage)
+{
+   const bool fast_clear_supported = aux_usage != ISL_AUX_USAGE_NONE;
+
+   crocus_resource_prepare_access(ice, res, level, 1,
+                                  start_layer, layer_count,
+                                  aux_usage, fast_clear_supported);
+}
+
+/**
+ * Update aux-state tracking after rendering to a range of layers of one
+ * miplevel of \p res.  This simply forwards to
+ * crocus_resource_finish_write().
+ */
+void
+crocus_resource_finish_render(struct crocus_context *ice,
+                              struct crocus_resource *res, uint32_t level,
+                              uint32_t start_layer, uint32_t layer_count,
+                              enum isl_aux_usage aux_usage)
+{
+   crocus_resource_finish_write(ice, res, level, start_layer, layer_count,
+                                aux_usage);
+}
+
+/**
+ * Refresh the shadow copy of a resource if it has been flagged as stale.
+ *
+ * Every layer (or depth slice, for 3D textures) of every miplevel is copied
+ * from the resource into res->shadow with resource_copy_region, after which
+ * the stale flag is cleared.  Only expected to be called on ver >= 7.
+ */
+static void
+crocus_update_stencil_shadow(struct crocus_context *ice,
+                             struct crocus_resource *res)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ice->ctx.screen;
+   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
+   assert(devinfo->ver >= 7);
+
+   if (!res->shadow_needs_update)
+      return;
+
+   const bool is_3d = res->base.target == PIPE_TEXTURE_3D;
+
+   for (unsigned level = 0; level <= res->base.last_level; level++) {
+      struct pipe_box box;
+
+      /* The box spans the whole miplevel; z is filled in per slice below. */
+      u_box_2d(0, 0,
+               u_minify(res->base.width0, level),
+               u_minify(res->base.height0, level), &box);
+
+      unsigned slice_count;
+      if (is_3d)
+         slice_count = u_minify(res->base.depth0, level);
+      else
+         slice_count = res->base.array_size;
+
+      for (unsigned slice = 0; slice < slice_count; slice++) {
+         box.z = slice;
+         ice->ctx.resource_copy_region(&ice->ctx,
+                                       &res->shadow->base, level, 0, 0, slice,
+                                       &res->base, level, &box);
+      }
+   }
+
+   res->shadow_needs_update = false;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resource.c b/src/gallium/drivers/crocus/crocus_resource.c
new file mode 100644
index 00000000000..b5bf5a42e1a
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resource.c
@@ -0,0 +1,1946 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_resource.c
+ *
+ * Resources are images, buffers, and other objects used by the GPU.
+ *
+ * XXX: explain resources
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/os_memory.h"
+#include "util/u_cpu_detect.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_threaded_context.h"
+#include "util/u_transfer.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "crocus_batch.h"
+#include "crocus_context.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/dev/intel_debug.h"
+#include "isl/isl.h"
+#include "drm-uapi/drm_fourcc.h"
+#include "drm-uapi/i915_drm.h"
+
+/*
+ * Ranking of the DRM format modifiers known to this file.  A numerically
+ * larger enumerator wins when select_best_modifier() compares candidates,
+ * and each value is used as an index into priority_to_modifier[].
+ */
+enum modifier_priority {
+   MODIFIER_PRIORITY_INVALID = 0,
+   MODIFIER_PRIORITY_LINEAR,
+   MODIFIER_PRIORITY_X,
+   MODIFIER_PRIORITY_Y,
+   MODIFIER_PRIORITY_Y_CCS,
+};
+
+/* Maps each enum modifier_priority value back to its DRM format modifier. */
+static const uint64_t priority_to_modifier[] = {
+   [MODIFIER_PRIORITY_INVALID] = DRM_FORMAT_MOD_INVALID,
+   [MODIFIER_PRIORITY_LINEAR] = DRM_FORMAT_MOD_LINEAR,
+   [MODIFIER_PRIORITY_X] = I915_FORMAT_MOD_X_TILED,
+   [MODIFIER_PRIORITY_Y] = I915_FORMAT_MOD_Y_TILED,
+   [MODIFIER_PRIORITY_Y_CCS] = I915_FORMAT_MOD_Y_TILED_CCS,
+};
+
+/**
+ * Report whether \p modifier can be used on this device.
+ *
+ * Linear and X-tiled are always accepted, Y-tiled requires ver >= 6, and
+ * everything else (including Y-tiled CCS and DRM_FORMAT_MOD_INVALID) is
+ * rejected.  \p pfmt is currently not consulted.
+ */
+static bool
+modifier_is_supported(const struct intel_device_info *devinfo,
+                      enum pipe_format pfmt, uint64_t modifier)
+{
+   /* XXX: do something real */
+   if (modifier == DRM_FORMAT_MOD_LINEAR ||
+       modifier == I915_FORMAT_MOD_X_TILED)
+      return true;
+
+   if (modifier == I915_FORMAT_MOD_Y_TILED)
+      return devinfo->ver >= 6;
+
+   return false;
+}
+
+/**
+ * Pick the highest-priority supported modifier out of \p modifiers.
+ *
+ * Unsupported or unknown entries are ignored.  Returns
+ * DRM_FORMAT_MOD_INVALID when no entry qualifies (including count == 0).
+ */
+static uint64_t
+select_best_modifier(struct intel_device_info *devinfo, enum pipe_format pfmt,
+                     const uint64_t *modifiers,
+                     int count)
+{
+   enum modifier_priority best = MODIFIER_PRIORITY_INVALID;
+
+   for (int idx = 0; idx < count; idx++) {
+      const uint64_t mod = modifiers[idx];
+      enum modifier_priority candidate = MODIFIER_PRIORITY_INVALID;
+
+      if (!modifier_is_supported(devinfo, pfmt, mod))
+         continue;
+
+      if (mod == I915_FORMAT_MOD_Y_TILED_CCS)
+         candidate = MODIFIER_PRIORITY_Y_CCS;
+      else if (mod == I915_FORMAT_MOD_Y_TILED)
+         candidate = MODIFIER_PRIORITY_Y;
+      else if (mod == I915_FORMAT_MOD_X_TILED)
+         candidate = MODIFIER_PRIORITY_X;
+      else if (mod == DRM_FORMAT_MOD_LINEAR)
+         candidate = MODIFIER_PRIORITY_LINEAR;
+
+      /* Keep the best ranking seen so far. */
+      if (candidate > best)
+         best = candidate;
+   }
+
+   return priority_to_modifier[best];
+}
+
+/**
+ * Translate a gallium texture target into the ISL surface dimensionality.
+ *
+ * Buffers and 1D targets map to 1D, 3D textures to 3D, and every remaining
+ * 2D, rectangle and cube flavour to 2D.
+ */
+static enum isl_surf_dim
+crocus_target_to_isl_surf_dim(enum pipe_texture_target target)
+{
+   switch (target) {
+   case PIPE_TEXTURE_3D:
+      return ISL_SURF_DIM_3D;
+
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return ISL_SURF_DIM_2D;
+
+   case PIPE_BUFFER:
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return ISL_SURF_DIM_1D;
+
+   case PIPE_MAX_TEXTURE_TYPES:
+      break;
+   }
+   unreachable("invalid texture type");
+}
+
+/**
+ * Enumerate the DRM format modifiers supported for \p pfmt.
+ *
+ * At most \p max entries are written to \p modifiers and \p external_only
+ * (either may be NULL), while \p count always receives the total number of
+ * supported modifiers, which may exceed \p max.
+ */
+static void
+crocus_query_dmabuf_modifiers(struct pipe_screen *pscreen,
+                              enum pipe_format pfmt,
+                              int max,
+                              uint64_t *modifiers,
+                              unsigned int *external_only,
+                              int *count)
+{
+   struct crocus_screen *screen = (void *) pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   uint64_t candidates[] = {
+      DRM_FORMAT_MOD_LINEAR,
+      I915_FORMAT_MOD_X_TILED,
+      I915_FORMAT_MOD_Y_TILED,
+      I915_FORMAT_MOD_Y_TILED_CCS,
+   };
+
+   int num_found = 0;
+
+   for (unsigned m = 0; m < ARRAY_SIZE(candidates); m++) {
+      const uint64_t mod = candidates[m];
+
+      if (modifier_is_supported(devinfo, pfmt, mod)) {
+         /* Only store while the caller's arrays still have room. */
+         const bool have_room = num_found < max;
+
+         if (have_room && modifiers)
+            modifiers[num_found] = mod;
+
+         if (have_room && external_only)
+            external_only[num_found] = util_format_is_yuv(pfmt);
+
+         num_found++;
+      }
+   }
+
+   *count = num_found;
+}
+
+/**
+ * Convert gallium PIPE_BIND_* flags into the matching ISL surface usage
+ * bits.  Bind flags without a counterpart here contribute nothing.
+ */
+static isl_surf_usage_flags_t
+pipe_bind_to_isl_usage(unsigned bindings)
+{
+   const unsigned storage_binds =
+      PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SHADER_BUFFER;
+   isl_surf_usage_flags_t flags = 0;
+
+   flags |= (bindings & PIPE_BIND_RENDER_TARGET) ?
+      ISL_SURF_USAGE_RENDER_TARGET_BIT : 0;
+   flags |= (bindings & PIPE_BIND_SAMPLER_VIEW) ?
+      ISL_SURF_USAGE_TEXTURE_BIT : 0;
+   flags |= (bindings & storage_binds) ?
+      ISL_SURF_USAGE_STORAGE_BIT : 0;
+   flags |= (bindings & PIPE_BIND_DISPLAY_TARGET) ?
+      ISL_SURF_USAGE_DISPLAY_BIT : 0;
+
+   return flags;
+}
+
+/**
+ * Return the separate stencil resource chained behind \p p_res, or NULL.
+ *
+ * For packed depth-stencil, depth is treated as the primary resource and S8
+ * is stored as the "second plane" in p_res->next.
+ */
+struct pipe_resource *
+crocus_resource_get_separate_stencil(struct pipe_resource *p_res)
+{
+   struct pipe_resource *second_plane = p_res->next;
+
+   if (second_plane == NULL)
+      return NULL;
+
+   return second_plane->format == PIPE_FORMAT_S8_UINT ? second_plane : NULL;
+}
+
+/**
+ * Chain \p stencil behind \p p_res by storing it in p_res->next via
+ * pipe_resource_reference().  \p p_res is asserted to have a depth
+ * component.
+ */
+static void
+crocus_resource_set_separate_stencil(struct pipe_resource *p_res,
+                                     struct pipe_resource *stencil)
+{
+   assert(util_format_has_depth(util_format_description(p_res->format)));
+   pipe_resource_reference(&p_res->next, stencil);
+}
+
+/**
+ * Split a depth/stencil pipe_resource into its depth and stencil parts.
+ *
+ *  - NULL \p res: both outputs are NULL.
+ *  - ver < 6 (gen4/5 only supports packed ds): both outputs alias \p res.
+ *  - S8_UINT resource: stencil only, no depth.
+ *  - anything else: \p res is depth, stencil is its separate stencil
+ *    resource (possibly NULL).
+ */
+void
+crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo,
+                                   struct pipe_resource *res,
+                                   struct crocus_resource **out_z,
+                                   struct crocus_resource **out_s)
+{
+   struct crocus_resource *z = NULL;
+   struct crocus_resource *s = NULL;
+
+   if (res) {
+      if (devinfo->ver < 6) {
+         /* gen4/5 only supports packed ds */
+         z = (void *)res;
+         s = (void *)res;
+      } else if (res->format == PIPE_FORMAT_S8_UINT) {
+         s = (void *) res;
+      } else {
+         z = (void *) res;
+         s = (void *) crocus_resource_get_separate_stencil(res);
+      }
+   }
+
+   *out_z = z;
+   *out_s = s;
+}
+
+/**
+ * Drop all auxiliary surface state from \p res.
+ *
+ * Releases the aux BO reference and the aux state map, then resets the aux
+ * bookkeeping so the resource reads as having no aux surface.
+ */
+void
+crocus_resource_disable_aux(struct crocus_resource *res)
+{
+   /* Release owned objects and clear the dangling pointers right away. */
+   crocus_bo_unreference(res->aux.bo);
+   res->aux.bo = NULL;
+
+   free(res->aux.state);
+   res->aux.state = NULL;
+
+   /* Reset the remaining description to "no aux". */
+   res->aux.usage = ISL_AUX_USAGE_NONE;
+   res->aux.has_hiz = 0;
+   res->aux.surf.levels = 0;
+   res->aux.surf.size_B = 0;
+   res->aux.extra_aux.surf.size_B = 0;
+}
+
+/**
+ * Destroy a crocus resource and release everything it owns.
+ *
+ * Tears down the valid-range tracker for buffers, drops the shadow resource
+ * reference, disables aux, unreferences the main BO and the screen reference
+ * stored in orig_screen, and finally frees the resource itself.
+ */
+static void
+crocus_resource_destroy(struct pipe_screen *screen,
+                        struct pipe_resource *resource)
+{
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+
+   /* valid_buffer_range is only initialised for PIPE_BUFFER resources. */
+   if (resource->target == PIPE_BUFFER)
+      util_range_destroy(&res->valid_buffer_range);
+
+   if (res->shadow)
+      pipe_resource_reference((struct pipe_resource **)&res->shadow, NULL);
+   crocus_resource_disable_aux(res);
+
+   crocus_bo_unreference(res->bo);
+   crocus_pscreen_unref(res->orig_screen);
+   free(res);
+}
+
+/**
+ * Allocate a zeroed crocus_resource initialised from \p templ.
+ *
+ * The new resource starts with a reference count of one, takes a screen
+ * reference (stored in orig_screen) and, for buffers, gets its valid range
+ * tracker initialised.  Returns NULL if the allocation fails.
+ */
+static struct crocus_resource *
+crocus_alloc_resource(struct pipe_screen *pscreen,
+                      const struct pipe_resource *templ)
+{
+   struct crocus_resource *res = calloc(1, sizeof(*res));
+
+   if (res == NULL)
+      return NULL;
+
+   res->base = *templ;
+   res->base.screen = pscreen;
+   res->orig_screen = crocus_pscreen_ref(pscreen);
+   pipe_reference_init(&res->base.reference, 1);
+
+   const bool is_buffer = templ->target == PIPE_BUFFER;
+   if (is_buffer)
+      util_range_init(&res->valid_buffer_range);
+
+   return res;
+}
+
+/**
+ * Number of logical layers of \p res at miplevel \p level: the minified
+ * depth for 3D surfaces, the array length for everything else.
+ */
+unsigned
+crocus_get_num_logical_layers(const struct crocus_resource *res, unsigned level)
+{
+   const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
+
+   return is_3d ? minify(res->surf.logical_level0_px.depth, level)
+                : res->surf.logical_level0_px.array_len;
+}
+
+/**
+ * Build the per-level / per-layer aux state map for \p res.
+ *
+ * The result is an array with one pointer per miplevel, each pointing at an
+ * array with one isl_aux_state per logical layer of that level, all set to
+ * \p initial.  Both parts live in one malloc() block, so the caller releases
+ * the whole map with a single free() of the returned pointer.
+ *
+ * \return the map, or NULL if the allocation fails.
+ */
+static enum isl_aux_state **
+create_aux_state_map(struct crocus_resource *res, enum isl_aux_state initial)
+{
+   assert(res->aux.state == NULL);
+
+   uint32_t total_slices = 0;
+   for (uint32_t level = 0; level < res->surf.levels; level++)
+      total_slices += crocus_get_num_logical_layers(res, level);
+
+   const size_t per_level_array_size =
+      res->surf.levels * sizeof(enum isl_aux_state *);
+
+   /* We're going to allocate a single chunk of data for both the per-level
+    * reference array and the arrays of aux_state.  This makes cleanup
+    * significantly easier.
+    */
+   const size_t total_size =
+      per_level_array_size + total_slices * sizeof(enum isl_aux_state);
+
+   void *data = malloc(total_size);
+   if (!data)
+      return NULL;
+
+   /* Byte-wise view of the block: arithmetic on void * is a GNU extension,
+    * so all offset computations go through a char pointer instead.
+    */
+   char *bytes = data;
+
+   enum isl_aux_state **per_level_arr = data;
+   enum isl_aux_state *s = (void *)(bytes + per_level_array_size);
+   for (uint32_t level = 0; level < res->surf.levels; level++) {
+      per_level_arr[level] = s;
+      const unsigned level_layers = crocus_get_num_logical_layers(res, level);
+      for (uint32_t a = 0; a < level_layers; a++)
+         *(s++) = initial;
+   }
+   assert((char *)s == bytes + total_size);
+
+   return per_level_arr;
+}
+
+/**
+ * Configure aux for the resource, but don't allocate it. For images which
+ * might be shared with modifiers, we must allocate the image and aux data in
+ * a single bo.
+ *
+ * On entry res->surf must already be initialised and res->aux.bo must be
+ * NULL (asserted).  The function fills in res->aux.surf, res->aux.usage,
+ * res->aux.state, res->aux.offset, res->aux.extra_aux and res->aux.has_hiz.
+ *
+ * \param imported     when true and CCS_D is selected, the initial aux state
+ *                     comes from the modifier's default aux state, which
+ *                     reads res->mod_info and so requires it to be non-NULL
+ * \param aux_size_B   receives the number of bytes needed for the aux data,
+ *                     rounded up to 4096, or 0 when no aux is used
+ * \param alloc_flags  receives BO allocation flags wanted for the aux data
+ *                     (BO_ALLOC_ZEROED for CCS_D, otherwise 0)
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_configure_aux(struct crocus_screen *screen,
+                              struct crocus_resource *res, bool imported,
+                              uint64_t *aux_size_B,
+                              uint32_t *alloc_flags)
+{
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   /* Try to create the auxiliary surfaces allowed by the modifier or by
+    * the user if no modifier is specified.
+    */
+   assert(!res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE);
+
+   const bool has_mcs = devinfo->ver >= 7 && !res->mod_info &&
+      isl_surf_get_mcs_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+   const bool has_hiz = devinfo->ver >= 6 && !res->mod_info &&
+      !(INTEL_DEBUG & DEBUG_NO_HIZ) &&
+      isl_surf_get_hiz_surf(&screen->isl_dev, &res->surf, &res->aux.surf);
+
+   const bool has_ccs =
+      ((devinfo->ver >= 7 && !res->mod_info && !(INTEL_DEBUG & DEBUG_NO_RBC)) ||
+       (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)) &&
+      isl_surf_get_ccs_surf(&screen->isl_dev, &res->surf, &res->aux.surf,
+                            &res->aux.extra_aux.surf, 0);
+
+   /* Having both HIZ and MCS is impossible. */
+   assert(!has_mcs || !has_hiz);
+
+   /* Ensure aux surface creation for MCS_CCS and HIZ_CCS is correct. */
+   if (has_ccs && (has_mcs || has_hiz)) {
+      assert(res->aux.extra_aux.surf.size_B > 0 &&
+             res->aux.extra_aux.surf.usage & ISL_SURF_USAGE_CCS_BIT);
+      assert(res->aux.surf.size_B > 0 &&
+             res->aux.surf.usage &
+             (ISL_SURF_USAGE_HIZ_BIT | ISL_SURF_USAGE_MCS_BIT));
+   }
+
+   /* Pick the aux usage.  Priority: modifier-mandated aux, then MCS, then
+    * HiZ, then CCS_D (only if the format supports it).  If none applies,
+    * res->aux.usage keeps whatever value it had on entry.
+    */
+   if (res->mod_info && has_ccs) {
+      res->aux.usage = res->mod_info->aux_usage;
+   } else if (has_mcs) {
+      res->aux.usage = ISL_AUX_USAGE_MCS;
+   } else if (has_hiz) {
+      res->aux.usage = ISL_AUX_USAGE_HIZ;
+   } else if (has_ccs) {
+      if (isl_format_supports_ccs_d(devinfo, res->surf.format))
+         res->aux.usage = ISL_AUX_USAGE_CCS_D;
+   }
+
+   enum isl_aux_state initial_state = ISL_AUX_STATE_AUX_INVALID;
+   *aux_size_B = 0;
+   *alloc_flags = 0;
+   assert(!res->aux.bo);
+
+   switch (res->aux.usage) {
+   case ISL_AUX_USAGE_NONE:
+      /* Having no aux buffer is only okay if there's no modifier with aux. */
+      res->aux.surf.levels = 0;
+      return !res->mod_info || res->mod_info->aux_usage == ISL_AUX_USAGE_NONE;
+   case ISL_AUX_USAGE_HIZ:
+      initial_state = ISL_AUX_STATE_AUX_INVALID;
+      break;
+   case ISL_AUX_USAGE_MCS:
+      /* The Ivybridge PRM, Vol 2 Part 1 p326 says:
+       *
+       *    "When MCS buffer is enabled and bound to MSRT, it is required
+       *     that it is cleared prior to any rendering."
+       *
+       * Since we only use the MCS buffer for rendering, we just clear it
+       * immediately on allocation.  The clear value for MCS buffers is all
+       * 1's, so we simply memset it to 0xff.  Only the initial state is
+       * recorded here; the memset itself is not done in this function.
+       */
+      initial_state = ISL_AUX_STATE_CLEAR;
+      break;
+   case ISL_AUX_USAGE_CCS_D:
+      /* The CCS must start off in a valid state.  The Sky Lake PRM, "MCS
+       * Buffer for Render Target(s)", says (about compressed CCS):
+       *
+       *    "If Software wants to enable Color Compression without Fast
+       *     clear, Software needs to initialize MCS with zeros."
+       *
+       * A CCS value of 0 indicates that the corresponding block is in the
+       * pass-through state which is what we want.
+       *
+       * The same is done here for CCS_D, which avoids having any undefined
+       * bits in the aux buffer: request a zeroed BO and start in
+       * pass-through, unless the surface was imported, in which case the
+       * modifier dictates the initial state.
+       */
+      if (imported)
+         initial_state =
+            isl_drm_modifier_get_default_aux_state(res->mod_info->modifier);
+      else
+         initial_state = ISL_AUX_STATE_PASS_THROUGH;
+      *alloc_flags |= BO_ALLOC_ZEROED;
+      break;
+   default:
+      unreachable("non-crocus aux");
+   }
+
+   /* Create the aux_state for the auxiliary buffer. */
+   res->aux.state = create_aux_state_map(res, initial_state);
+   if (!res->aux.state)
+      return false;
+
+   /* Increase the aux offset if the main and aux surfaces will share a BO. */
+   res->aux.offset =
+      !res->mod_info || res->mod_info->aux_usage == res->aux.usage ?
+      ALIGN(res->surf.size_B, res->aux.surf.alignment_B) : 0;
+   uint64_t size = res->aux.surf.size_B;
+
+   /* Allocate space in the buffer for storing the CCS. */
+   if (res->aux.extra_aux.surf.size_B > 0) {
+      const uint64_t padded_aux_size =
+         ALIGN(size, res->aux.extra_aux.surf.alignment_B);
+      res->aux.extra_aux.offset = res->aux.offset + padded_aux_size;
+      size = padded_aux_size + res->aux.extra_aux.surf.size_B;
+   }
+
+   /* Round the aux size up to a 4K multiple.
+    *
+    * NOTE(review): the earlier wording of this comment described reserving
+    * space for a fast clear color state buffer after the aux data and
+    * padding so that it starts 4K aligned ("256B might be enough, but due to
+    * lack of testing we will leave this as 4K for now").  No clear color
+    * storage is added to the size in this function; only the 4K round-up
+    * remains.  Confirm whether the padding is still required.
+    */
+   size = ALIGN(size, 4096);
+   *aux_size_B = size;
+
+   if (isl_aux_usage_has_hiz(res->aux.usage)) {
+      for (unsigned level = 0; level < res->surf.levels; ++level) {
+         uint32_t width = u_minify(res->surf.phys_level0_sa.width, level);
+         uint32_t height = u_minify(res->surf.phys_level0_sa.height, level);
+
+         /* Disable HiZ for LOD > 0 unless the width/height are 8x4 aligned.
+          * For LOD == 0, we can grow the dimensions to make it work.
+          * The restriction is only applied on Haswell; on other devices
+          * every level gets HiZ.
+          */
+         if (!devinfo->is_haswell ||
+             (level == 0 || ((width & 7) == 0 && (height & 3) == 0)))
+            res->aux.has_hiz |= 1 << level;
+      }
+   }
+
+   return true;
+}
+
+/**
+ * Initialize the aux buffer contents.
+ *
+ * Nothing is written when \p alloc_flags contains BO_ALLOC_ZEROED.
+ * Otherwise res->aux.bo is mapped and the aux surfaces are filled with 0xFF
+ * for MCS usages and 0 for everything else.
+ *
+ * Returns false on unexpected error (e.g. mapping a BO failed).
+ */
+static bool
+crocus_resource_init_aux_buf(struct crocus_resource *res, uint32_t alloc_flags)
+{
+   if (alloc_flags & BO_ALLOC_ZEROED)
+      return true;
+
+   void *map = crocus_bo_map(NULL, res->aux.bo, MAP_WRITE | MAP_RAW);
+   if (map == NULL)
+      return false;
+
+   char *bytes = (char*)map;
+   const uint8_t fill = isl_aux_usage_has_mcs(res->aux.usage) ? 0xFF : 0;
+
+   /* The primary aux surface is left untouched while it is still marked
+    * invalid.
+    */
+   if (crocus_resource_get_aux_state(res, 0, 0) != ISL_AUX_STATE_AUX_INVALID)
+      memset(bytes + res->aux.offset, fill, res->aux.surf.size_B);
+
+   /* Bspec section titled : MCS/CCS Buffers for Render Target(s) states:
+    *    - If Software wants to enable Color Compression without Fast clear,
+    *      Software needs to initialize MCS with zeros.
+    *    - Lossless compression and CCS initialized to all F (using HW Fast
+    *      Clear or SW direct Clear)
+    *
+    * We think the first bullet point above is referring to the CCS aux
+    * surface. Since we initialize the MCS in the clear state, we also
+    * initialize the CCS in the clear state (via SW direct clear) to keep
+    * the two in sync.
+    */
+   memset(bytes + res->aux.extra_aux.offset, fill,
+          res->aux.extra_aux.surf.size_B);
+
+   crocus_bo_unmap(res->aux.bo);
+
+   return true;
+}
+
+/**
+ * Allocate the initial aux surface for a resource based on aux.usage
+ *
+ * The aux data is placed in its own BO, stored in res->aux.bo.  When the
+ * configuration step reports an aux size of zero, nothing is allocated and
+ * the call succeeds.
+ *
+ * Returns false on unexpected error (e.g. allocation failed, or invalid
+ * configuration result).
+ */
+static bool
+crocus_resource_alloc_separate_aux(struct crocus_screen *screen,
+                                   struct crocus_resource *res)
+{
+   uint64_t aux_bytes;
+   uint32_t bo_flags;
+
+   if (!crocus_resource_configure_aux(screen, res, false, &aux_bytes,
+                                      &bo_flags))
+      return false;
+
+   if (aux_bytes == 0)
+      return true;
+
+   /* Allocate the auxiliary buffer.  ISL has a stricter set of alignment
+    * rules than the drm allocator.  Therefore, one can pass the ISL
+    * dimensions in terms of bytes instead of trying to recalculate based on
+    * different format block sizes.
+    */
+   const uint32_t i915_tiling =
+      isl_tiling_to_i915_tiling(res->aux.surf.tiling);
+
+   res->aux.bo = crocus_bo_alloc_tiled(screen->bufmgr, "aux buffer",
+                                       aux_bytes, 4096, i915_tiling,
+                                       res->aux.surf.row_pitch_B, bo_flags);
+   if (res->aux.bo == NULL)
+      return false;
+
+   return crocus_resource_init_aux_buf(res, bo_flags);
+}
+
+/**
+ * Merge the aux plane of an imported image back into the main resource.
+ *
+ * The aux plane is expected as a second resource in res->base.next, with
+ * its BO, offset and row pitch stored in that resource's aux fields.  Its
+ * BO (asserted to be the same as res->bo) and offset are copied into
+ * res->aux, then the temporary plane resource is destroyed and unlinked.
+ */
+void
+crocus_resource_finish_aux_import(struct pipe_screen *pscreen,
+                                  struct crocus_resource *res)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   assert(crocus_resource_unfinished_aux_import(res));
+   assert(!res->mod_info->supports_clear_color);
+
+   struct crocus_resource *aux_res = (void *) res->base.next;
+   assert(aux_res->aux.surf.row_pitch_B && aux_res->aux.offset &&
+          aux_res->aux.bo);
+
+   assert(res->bo == aux_res->aux.bo);
+   /* res takes its own reference on the aux BO; the reference held by
+    * aux_res is released when aux_res is destroyed below.
+    */
+   crocus_bo_reference(aux_res->aux.bo);
+   res->aux.bo = aux_res->aux.bo;
+
+   res->aux.offset = aux_res->aux.offset;
+
+   assert(res->bo->size >= (res->aux.offset + res->aux.surf.size_B));
+   assert(aux_res->aux.surf.row_pitch_B == res->aux.surf.row_pitch_B);
+
+   crocus_resource_destroy(&screen->base, res->base.next);
+   res->base.next = NULL;
+}
+
+/**
+ * Create a PIPE_BUFFER resource backed by a freshly allocated linear BO of
+ * templ->width0 bytes.
+ *
+ * \return the new resource, or NULL if allocating either the resource or
+ *         its BO fails.
+ */
+static struct pipe_resource *
+crocus_resource_create_for_buffer(struct pipe_screen *pscreen,
+                                  const struct pipe_resource *templ)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+
+   /* crocus_alloc_resource() returns NULL on allocation failure; bail out
+    * before touching res.
+    */
+   if (!res)
+      return NULL;
+
+   assert(templ->target == PIPE_BUFFER);
+   assert(templ->height0 <= 1);
+   assert(templ->depth0 <= 1);
+   assert(templ->format == PIPE_FORMAT_NONE ||
+          util_format_get_blocksize(templ->format) == 1);
+
+   res->internal_format = templ->format;
+   res->surf.tiling = ISL_TILING_LINEAR;
+
+   const char *name = templ->target == PIPE_BUFFER ? "buffer" : "miptree";
+
+   res->bo = crocus_bo_alloc(screen->bufmgr, name, templ->width0);
+   if (!res->bo) {
+      crocus_resource_destroy(pscreen, &res->base);
+      return NULL;
+   }
+
+   return &res->base;
+}
+
+/**
+ * Create a non-buffer resource, optionally constrained to a list of DRM
+ * format modifiers.
+ *
+ * The main surface and its aux data (if any) are placed in a single BO.
+ * For sampleable S8 resources on ver 7 an additional R8_UINT shadow
+ * resource is created.
+ *
+ * \param pscreen          screen the resource is created on
+ * \param templ            template describing the resource
+ * \param modifiers        candidate modifiers; not read when
+ *                         \p modifiers_count is 0
+ * \param modifiers_count  number of entries in \p modifiers
+ *
+ * \return the new resource, or NULL on failure.  Every failure path after
+ *         the initial allocation releases the partially built resource.
+ */
+static struct pipe_resource *
+crocus_resource_create_with_modifiers(struct pipe_screen *pscreen,
+                                      const struct pipe_resource *templ,
+                                      const uint64_t *modifiers,
+                                      int modifiers_count)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+
+   if (!res)
+      return NULL;
+
+   const struct util_format_description *format_desc =
+      util_format_description(templ->format);
+   const bool has_depth = util_format_has_depth(format_desc);
+   uint64_t modifier =
+      select_best_modifier(devinfo, templ->format, modifiers, modifiers_count);
+
+   isl_tiling_flags_t tiling_flags = ISL_TILING_ANY_MASK;
+
+   /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. */
+   if (devinfo->ver < 6 && !util_format_is_depth_or_stencil(templ->format))
+      tiling_flags &= ~ISL_TILING_Y0_BIT;
+
+   if (modifier != DRM_FORMAT_MOD_INVALID) {
+      res->mod_info = isl_drm_modifier_get_info(modifier);
+
+      tiling_flags = 1 << res->mod_info->tiling;
+   } else {
+      /* The caller asked for modifiers but none of them is usable. */
+      if (modifiers_count > 0) {
+         fprintf(stderr, "Unsupported modifier, resource creation failed.\n");
+         goto fail;
+      }
+
+      if (templ->bind & PIPE_BIND_RENDER_TARGET && devinfo->ver < 6) {
+         modifier = I915_FORMAT_MOD_X_TILED;
+         res->mod_info = isl_drm_modifier_get_info(modifier);
+         tiling_flags = 1 << res->mod_info->tiling;
+      }
+      /* Use linear for staging buffers */
+      if (templ->usage == PIPE_USAGE_STAGING ||
+          templ->bind & (PIPE_BIND_LINEAR | PIPE_BIND_CURSOR) )
+         tiling_flags = ISL_TILING_LINEAR_BIT;
+   }
+
+   isl_surf_usage_flags_t usage = pipe_bind_to_isl_usage(templ->bind);
+
+   if (templ->target == PIPE_TEXTURE_CUBE ||
+       templ->target == PIPE_TEXTURE_CUBE_ARRAY)
+      usage |= ISL_SURF_USAGE_CUBE_BIT;
+
+   if (templ->usage != PIPE_USAGE_STAGING) {
+      if (templ->format == PIPE_FORMAT_S8_UINT)
+         usage |= ISL_SURF_USAGE_STENCIL_BIT;
+      else if (has_depth) {
+         /* combined DS only on gen4/5 */
+         if (devinfo->ver < 6) {
+            if (templ->format == PIPE_FORMAT_Z24X8_UNORM ||
+                templ->format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+                templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
+               usage |= ISL_SURF_USAGE_STENCIL_BIT;
+         }
+         usage |= ISL_SURF_USAGE_DEPTH_BIT;
+      }
+
+      if (templ->format == PIPE_FORMAT_S8_UINT)
+         tiling_flags = ISL_TILING_W_BIT;
+   }
+
+   if (templ->usage == PIPE_USAGE_STAGING &&
+       templ->bind == PIPE_BIND_DEPTH_STENCIL &&
+       devinfo->ver < 6) {
+      /* Staging depth/stencil is rejected on gen4/5.  Release res (and the
+       * screen reference it holds) instead of leaking it.  This path stays
+       * silent, unlike the generic fail label below.
+       */
+      crocus_resource_destroy(pscreen, &res->base);
+      return NULL;
+   }
+
+   enum pipe_format pfmt = templ->format;
+   res->internal_format = pfmt;
+
+   /* Should be handled by u_transfer_helper */
+//   assert(!util_format_is_depth_and_stencil(pfmt));
+
+   struct crocus_format_info fmt = crocus_format_for_usage(devinfo, pfmt, usage);
+   assert(fmt.fmt != ISL_FORMAT_UNSUPPORTED);
+   enum isl_surf_dim dim = crocus_target_to_isl_surf_dim(templ->target);
+
+   UNUSED const bool isl_surf_created_successfully =
+      isl_surf_init(&screen->isl_dev, &res->surf,
+                    .dim = dim,
+                    .format = fmt.fmt,
+                    .width = templ->width0,
+                    .height = templ->height0,
+                    .depth = templ->depth0,
+                    .levels = templ->last_level + 1,
+                    .array_len = templ->array_size,
+                    .samples = MAX2(templ->nr_samples, 1),
+                    .min_alignment_B = 0,
+                    .row_pitch_B = 0,
+                    .usage = usage,
+                    .tiling_flags = tiling_flags);
+   assert(isl_surf_created_successfully);
+
+   const char *name = "miptree";
+
+   unsigned int flags = 0;
+   if (templ->usage == PIPE_USAGE_STAGING)
+      flags |= BO_ALLOC_COHERENT;
+
+   uint64_t aux_size = 0;
+   uint32_t aux_preferred_alloc_flags;
+
+   if (!crocus_resource_configure_aux(screen, res, false, &aux_size,
+                                      &aux_preferred_alloc_flags)) {
+      goto fail;
+   }
+
+   /* Modifiers require the aux data to be in the same buffer as the main
+    * surface, but we combine them even when a modifiers is not being used.
+    */
+   const uint64_t bo_size =
+      MAX2(res->surf.size_B, res->aux.offset + aux_size);
+   uint32_t alignment = MAX2(4096, res->surf.alignment_B);
+   res->bo = crocus_bo_alloc_tiled(screen->bufmgr, name, bo_size, alignment,
+                                   isl_tiling_to_i915_tiling(res->surf.tiling),
+                                   res->surf.row_pitch_B, flags);
+
+   if (!res->bo)
+      goto fail;
+
+   if (aux_size > 0) {
+      /* The aux data lives in the main BO; aux.bo holds its own reference. */
+      res->aux.bo = res->bo;
+      crocus_bo_reference(res->aux.bo);
+      if (!crocus_resource_init_aux_buf(res, flags))
+         goto fail;
+   }
+
+   /* Sampleable S8 on ver 7 gets an R8_UINT shadow resource with the same
+    * dimensions.
+    */
+   if (templ->format == PIPE_FORMAT_S8_UINT && !(templ->usage == PIPE_USAGE_STAGING) &&
+       devinfo->ver == 7 && (templ->bind & PIPE_BIND_SAMPLER_VIEW)) {
+      struct pipe_resource templ_shadow = (struct pipe_resource) {
+         .usage = 0,
+         .bind = PIPE_BIND_SAMPLER_VIEW,
+         .width0 = res->base.width0,
+         .height0 = res->base.height0,
+         .depth0 = res->base.depth0,
+         .last_level = res->base.last_level,
+         .nr_samples = res->base.nr_samples,
+         .nr_storage_samples = res->base.nr_storage_samples,
+         .array_size = res->base.array_size,
+         .format = PIPE_FORMAT_R8_UINT,
+         .target = res->base.target,
+      };
+      res->shadow = (struct crocus_resource *)screen->base.resource_create(&screen->base, &templ_shadow);
+      assert(res->shadow);
+   }
+
+   return &res->base;
+
+fail:
+   fprintf(stderr, "XXX: resource creation failed\n");
+   crocus_resource_destroy(pscreen, &res->base);
+   return NULL;
+
+}
+
+/**
+ * Resource creation entry point: buffers take the dedicated buffer path,
+ * every other target is created without any modifier constraint.
+ */
+static struct pipe_resource *
+crocus_resource_create(struct pipe_screen *pscreen,
+                       const struct pipe_resource *templ)
+{
+   if (templ->target != PIPE_BUFFER)
+      return crocus_resource_create_with_modifiers(pscreen, templ, NULL, 0);
+
+   return crocus_resource_create_for_buffer(pscreen, templ);
+}
+
+/**
+ * Map an I915_TILING_* mode to the corresponding DRM format modifier.
+ *
+ * Only NONE, X and Y are covered by the table; other values are caught by
+ * the assert, which is the only bounds check.
+ */
+static uint64_t
+tiling_to_modifier(uint32_t tiling)
+{
+   static const uint64_t map[] = {
+      [I915_TILING_NONE]   = DRM_FORMAT_MOD_LINEAR,
+      [I915_TILING_X]      = I915_FORMAT_MOD_X_TILED,
+      [I915_TILING_Y]      = I915_FORMAT_MOD_Y_TILED,
+   };
+
+   assert(tiling < ARRAY_SIZE(map));
+
+   return map[tiling];
+}
+
+/**
+ * Wrap caller-provided memory in a buffer resource using a userptr BO of
+ * templ->width0 bytes.  The whole range is marked valid.
+ *
+ * \return the new resource, or NULL if allocating the resource or creating
+ *         the userptr BO fails.
+ */
+static struct pipe_resource *
+crocus_resource_from_user_memory(struct pipe_screen *pscreen,
+                                 const struct pipe_resource *templ,
+                                 void *user_memory)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   struct crocus_bufmgr *bufmgr = screen->bufmgr;
+   struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+   if (!res)
+      return NULL;
+
+   assert(templ->target == PIPE_BUFFER);
+
+   res->internal_format = templ->format;
+   res->bo = crocus_bo_create_userptr(bufmgr, "user",
+                                      user_memory, templ->width0);
+   if (!res->bo) {
+      /* A plain free() would leak the screen reference taken by
+       * crocus_alloc_resource() and skip tearing down valid_buffer_range,
+       * so go through the regular destroy path.
+       */
+      crocus_resource_destroy(pscreen, &res->base);
+      return NULL;
+   }
+
+   util_range_add(&res->base, &res->valid_buffer_range, 0, templ->width0);
+
+   return &res->base;
+}
+
+/**
+ * Import a resource from a winsys handle (dma-buf fd or flink name).
+ *
+ * For a main plane the ISL surface is initialised from \p templ and the
+ * handle's stride.  For a plane index beyond the format's plane count, only
+ * the aux pitch/offset/BO are recorded so that
+ * crocus_resource_finish_aux_import() can merge them into the main resource
+ * later.
+ *
+ * \return the new resource, or NULL on failure.  Every failure path after
+ *         the initial allocation releases the partially built resource.
+ */
+static struct pipe_resource *
+crocus_resource_from_handle(struct pipe_screen *pscreen,
+                            const struct pipe_resource *templ,
+                            struct winsys_handle *whandle,
+                            unsigned usage)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_bufmgr *bufmgr = screen->bufmgr;
+   struct crocus_resource *res = crocus_alloc_resource(pscreen, templ);
+   const struct isl_drm_modifier_info *mod_inf =
+      isl_drm_modifier_get_info(whandle->modifier);
+   uint32_t tiling;
+
+   if (!res)
+      return NULL;
+
+   switch (whandle->type) {
+   case WINSYS_HANDLE_TYPE_FD:
+      /* Without modifier info, pass an out-of-range tiling value to the
+       * import.
+       */
+      if (mod_inf)
+         tiling = isl_tiling_to_i915_tiling(mod_inf->tiling);
+      else
+         tiling = I915_TILING_LAST + 1;
+      res->bo = crocus_bo_import_dmabuf(bufmgr, whandle->handle,
+                                        tiling, whandle->stride);
+      break;
+   case WINSYS_HANDLE_TYPE_SHARED:
+      res->bo = crocus_bo_gem_create_from_name(bufmgr, "winsys image",
+                                               whandle->handle);
+      break;
+   default:
+      unreachable("invalid winsys handle type");
+   }
+   if (!res->bo) {
+      /* Release res (and the screen reference it holds) instead of leaking
+       * it when the import fails.
+       */
+      crocus_resource_destroy(pscreen, &res->base);
+      return NULL;
+   }
+
+   res->offset = whandle->offset;
+
+   /* No modifier supplied: derive one from the BO's tiling mode. */
+   if (mod_inf == NULL) {
+      mod_inf =
+         isl_drm_modifier_get_info(tiling_to_modifier(res->bo->tiling_mode));
+   }
+   assert(mod_inf);
+
+   res->external_format = whandle->format;
+   res->mod_info = mod_inf;
+
+   isl_surf_usage_flags_t isl_usage = pipe_bind_to_isl_usage(templ->bind);
+
+   const struct crocus_format_info fmt =
+      crocus_format_for_usage(devinfo, templ->format, isl_usage);
+   res->internal_format = templ->format;
+
+   if (templ->target == PIPE_BUFFER) {
+      res->surf.tiling = ISL_TILING_LINEAR;
+   } else {
+      if (whandle->plane < util_format_get_num_planes(whandle->format)) {
+         UNUSED const bool isl_surf_created_successfully =
+            isl_surf_init(&screen->isl_dev, &res->surf,
+                          .dim = crocus_target_to_isl_surf_dim(templ->target),
+                          .format = fmt.fmt,
+                          .width = templ->width0,
+                          .height = templ->height0,
+                          .depth = templ->depth0,
+                          .levels = templ->last_level + 1,
+                          .array_len = templ->array_size,
+                          .samples = MAX2(templ->nr_samples, 1),
+                          .min_alignment_B = 0,
+                          .row_pitch_B = whandle->stride,
+                          .usage = isl_usage,
+                          .tiling_flags = 1 << res->mod_info->tiling);
+         assert(isl_surf_created_successfully);
+         assert(res->bo->tiling_mode ==
+                isl_tiling_to_i915_tiling(res->surf.tiling));
+
+         // XXX: create_ccs_buf_for_image?
+         if (whandle->modifier == DRM_FORMAT_MOD_INVALID) {
+            if (!crocus_resource_alloc_separate_aux(screen, res))
+               goto fail;
+         } else {
+            if (res->mod_info->aux_usage != ISL_AUX_USAGE_NONE) {
+               uint32_t alloc_flags;
+               uint64_t size;
+               UNUSED bool ok = crocus_resource_configure_aux(screen, res, true, &size,
+                                                       &alloc_flags);
+               assert(ok);
+               /* The gallium dri layer will create a separate plane resource
+                * for the aux image. crocus_resource_finish_aux_import will
+                * merge the separate aux parameters back into a single
+                * crocus_resource.
+                */
+            }
+         }
+      } else {
+         /* Save modifier import information to reconstruct later. After
+          * import, this will be available under a second image accessible
+          * from the main image with res->base.next. See
+          * crocus_resource_finish_aux_import.
+          */
+         res->aux.surf.row_pitch_B = whandle->stride;
+         res->aux.offset = whandle->offset;
+         res->aux.bo = res->bo;
+         res->bo = NULL;
+      }
+   }
+
+   return &res->base;
+
+fail:
+   crocus_resource_destroy(pscreen, &res->base);
+   return NULL;
+}
+
+/**
+ * pipe_context::flush_resource hook.
+ *
+ * Prepares every level and layer of the resource for access using the aux
+ * usage and clear-color support advertised by its DRM modifier.  A resource
+ * without modifier info is prepared for ISL_AUX_USAGE_NONE and no clear
+ * color.
+ */
+static void
+crocus_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
+{
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+   const struct isl_drm_modifier_info *mod_info = res->mod_info;
+   enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
+   bool clear_supported = false;
+
+   if (mod_info) {
+      aux_usage = mod_info->aux_usage;
+      clear_supported = mod_info->supports_clear_color;
+   }
+
+   crocus_resource_prepare_access((struct crocus_context *)ctx, res,
+                                  0, INTEL_REMAINING_LEVELS,
+                                  0, INTEL_REMAINING_LAYERS,
+                                  aux_usage, clear_supported);
+}
+
+/**
+ * Turn off aux usage the first time a resource is queried for export.
+ *
+ * Nothing happens when the resource carries a modifier with aux, when the
+ * caller passed PIPE_HANDLE_USAGE_EXPLICIT_FLUSH, when no aux usage is set,
+ * or when the reference count is anything other than 1.
+ */
+static void
+crocus_resource_disable_aux_on_first_query(struct pipe_resource *resource,
+                                           unsigned usage)
+{
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+
+   /* Resources created with an aux-carrying modifier keep their aux. */
+   if (res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE)
+      return;
+
+   /* Explicit flush requested, or there is no aux usage to disable. */
+   if ((usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) || res->aux.usage == 0)
+      return;
+
+   /* A reference count of 1 is taken to mean this is the first time we are
+    * dealing with this resource.
+    */
+   if (p_atomic_read(&resource->reference.count) != 1)
+      return;
+
+   crocus_resource_disable_aux(res);
+}
+
+/**
+ * pipe_screen::resource_get_param hook: report one property of a resource.
+ *
+ * For resources whose modifier has aux, a \p plane greater than 0 selects
+ * the aux BO, stride and offset; otherwise the main surface is described.
+ * A pending aux import is finished first, and aux may be disabled on the
+ * first query (see crocus_resource_disable_aux_on_first_query).
+ *
+ * \p context, \p layer and \p level are not used.
+ *
+ * \return true with the result stored in \p value, or false when the
+ *         parameter is not handled or exporting the BO failed.
+ */
+static bool
+crocus_resource_get_param(struct pipe_screen *pscreen,
+                          struct pipe_context *context,
+                          struct pipe_resource *resource,
+                          unsigned plane,
+                          unsigned layer,
+                          unsigned level,
+                          enum pipe_resource_param param,
+                          unsigned handle_usage,
+                          uint64_t *value)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+   bool mod_with_aux =
+      res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+   bool wants_aux = mod_with_aux && plane > 0;
+
+   if (crocus_resource_unfinished_aux_import(res))
+      crocus_resource_finish_aux_import(pscreen, res);
+
+   /* The BO is picked only after a pending aux import has been finished. */
+   struct crocus_bo *bo = wants_aux ? res->aux.bo : res->bo;
+
+   crocus_resource_disable_aux_on_first_query(resource, handle_usage);
+
+   switch (param) {
+   case PIPE_RESOURCE_PARAM_NPLANES:
+      if (mod_with_aux) {
+         *value = util_format_get_num_planes(res->external_format);
+      } else {
+         /* Count the resources chained through pipe_resource::next. */
+         unsigned count = 0;
+         for (struct pipe_resource *cur = resource; cur; cur = cur->next)
+            count++;
+         *value = count;
+      }
+      return true;
+   case PIPE_RESOURCE_PARAM_STRIDE:
+      *value = wants_aux ? res->aux.surf.row_pitch_B : res->surf.row_pitch_B;
+      return true;
+   case PIPE_RESOURCE_PARAM_OFFSET:
+      *value = wants_aux ? res->aux.offset : 0;
+      return true;
+   case PIPE_RESOURCE_PARAM_MODIFIER:
+      *value = res->mod_info ? res->mod_info->modifier :
+               tiling_to_modifier(res->bo->tiling_mode);
+      return true;
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_SHARED: {
+      unsigned flink_name;
+      if (crocus_bo_flink(bo, &flink_name) != 0)
+         return false;
+      *value = flink_name;
+      return true;
+   }
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_KMS: {
+      /* Because we share the same drm file across multiple crocus_screen, when
+       * we export a GEM handle we must make sure it is valid in the DRM file
+       * descriptor the caller is using (this is the FD given at screen
+       * creation).
+       */
+      uint32_t gem_handle;
+      if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &gem_handle))
+         return false;
+      *value = gem_handle;
+      return true;
+   }
+   case PIPE_RESOURCE_PARAM_HANDLE_TYPE_FD: {
+      /* Receive the descriptor in a real int instead of aliasing an
+       * unsigned through a casted pointer.
+       */
+      int fd;
+      if (crocus_bo_export_dmabuf(bo, &fd) != 0)
+         return false;
+      *value = (unsigned) fd;
+      return true;
+   }
+   default:
+      return false;
+   }
+}
+
+/**
+ * pipe_screen::resource_get_handle hook: export one plane of a resource.
+ *
+ * Fills in the stride, format and modifier of \p whandle (plus the offset
+ * when an aux plane is selected) and exports the chosen BO according to
+ * whandle->type.
+ *
+ * \return true on success; false if the export failed or whandle->type is
+ *         not one of the three types handled in the switch below.
+ */
+static bool
+crocus_resource_get_handle(struct pipe_screen *pscreen,
+                           struct pipe_context *ctx,
+                           struct pipe_resource *resource,
+                           struct winsys_handle *whandle,
+                           unsigned usage)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+   bool mod_with_aux =
+      res->mod_info && res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+
+   crocus_resource_disable_aux_on_first_query(resource, usage);
+
+   /* Plane > 0 of a resource whose modifier has aux selects the aux BO. */
+   struct crocus_bo *bo;
+   if (mod_with_aux && whandle->plane > 0) {
+      assert(res->aux.bo);
+      bo = res->aux.bo;
+      whandle->stride = res->aux.surf.row_pitch_B;
+      whandle->offset = res->aux.offset;
+   } else {
+      /* If this is a buffer, stride should be 0 - no need to special case */
+      whandle->stride = res->surf.row_pitch_B;
+      bo = res->bo;
+   }
+   whandle->format = res->external_format;
+   /* Without modifier info, derive the modifier from the BO's tiling mode. */
+   whandle->modifier =
+      res->mod_info ? res->mod_info->modifier
+                    : tiling_to_modifier(res->bo->tiling_mode);
+
+#ifndef NDEBUG
+   /* Debug-only sanity check: if the current aux usage differs from what the
+    * modifier allows, the aux state queried with (0, 0) must already be
+    * resolved or pass-through.
+    */
+   enum isl_aux_usage allowed_usage =
+      res->mod_info ? res->mod_info->aux_usage : ISL_AUX_USAGE_NONE;
+
+   if (res->aux.usage != allowed_usage) {
+      enum isl_aux_state aux_state = crocus_resource_get_aux_state(res, 0, 0);
+      assert(aux_state == ISL_AUX_STATE_RESOLVED ||
+             aux_state == ISL_AUX_STATE_PASS_THROUGH);
+   }
+#endif
+
+   switch (whandle->type) {
+   case WINSYS_HANDLE_TYPE_SHARED:
+      return crocus_bo_flink(bo, &whandle->handle) == 0;
+   case WINSYS_HANDLE_TYPE_KMS: {
+      /* Because we share the same drm file across multiple crocus_screen, when
+       * we export a GEM handle we must make sure it is valid in the DRM file
+       * descriptor the caller is using (this is the FD given at screen
+       * creation).
+       */
+      uint32_t handle;
+      if (crocus_bo_export_gem_handle_for_device(bo, screen->winsys_fd, &handle))
+         return false;
+      whandle->handle = handle;
+      return true;
+   }
+   case WINSYS_HANDLE_TYPE_FD:
+      return crocus_bo_export_dmabuf(bo, (int *) &whandle->handle) == 0;
+   }
+
+   /* Any other handle type is not supported. */
+   return false;
+}
+
+/**
+ * Report whether a resource's BO is still in use: either crocus_bo_busy()
+ * says so, or one of the context's batches references it.
+ */
+static bool
+resource_is_busy(struct crocus_context *ice,
+                 struct crocus_resource *res)
+{
+   bool in_use = crocus_bo_busy(res->bo);
+   int idx = 0;
+
+   /* Every batch is queried, even once the answer is already known. */
+   while (idx < ice->batch_count) {
+      if (crocus_batch_references(&ice->batches[idx], res->bo))
+         in_use = true;
+      idx++;
+   }
+
+   return in_use;
+}
+
+/**
+ * pipe_context::invalidate_resource hook: discard a buffer's contents.
+ *
+ * Only PIPE_BUFFER resources are handled.  An idle buffer just has its valid
+ * range emptied.  A busy buffer gets a freshly allocated BO swapped in and
+ * is rebound, unless it is backed by user memory, has ever been bound for
+ * stream output, or the allocation fails; in those cases it is left as is.
+ */
+static void
+crocus_invalidate_resource(struct pipe_context *ctx,
+                           struct pipe_resource *resource)
+{
+   struct crocus_screen *screen = (void *) ctx->screen;
+   struct crocus_context *ice = (void *) ctx;
+   struct crocus_resource *res = (void *) resource;
+
+   if (resource->target != PIPE_BUFFER)
+      return;
+
+   if (!resource_is_busy(ice, res)) {
+      /* The resource is idle, so just mark that it contains no data and
+       * keep using the same underlying buffer object.
+       */
+      util_range_set_empty(&res->valid_buffer_range);
+      return;
+   }
+
+   /* Otherwise, try and replace the backing storage with a new BO. */
+
+   /* We can't reallocate memory we didn't allocate in the first place. */
+   if (res->bo->userptr)
+      return;
+
+   // XXX: We should support this.
+   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT)
+      return;
+
+   /* The new BO reuses the old BO's name and the buffer's width0 as size. */
+   struct crocus_bo *old_bo = res->bo;
+   struct crocus_bo *new_bo =
+      crocus_bo_alloc(screen->bufmgr, res->bo->name, resource->width0);
+
+   /* Allocation failed: keep the old storage and its contents. */
+   if (!new_bo)
+      return;
+
+   /* Swap out the backing storage */
+   res->bo = new_bo;
+
+   /* Rebind the buffer, replacing any state referring to the old BO's
+    * address, and marking state dirty so it's reemitted.
+    */
+   screen->vtbl.rebind_buffer(ice, res);
+
+   util_range_set_empty(&res->valid_buffer_range);
+
+   /* Drop the reference to the old BO only after the rebind. */
+   crocus_bo_unreference(old_bo);
+}
+
+/**
+ * Copy a written sub-box of a staging mapping back into the real resource.
+ *
+ * \p flush_box is relative to the mapped box (xfer->box).  Mappings without
+ * PIPE_MAP_WRITE are ignored.
+ */
+static void
+crocus_flush_staging_region(struct pipe_transfer *xfer,
+                            const struct pipe_box *flush_box)
+{
+   struct crocus_transfer *map = (struct crocus_transfer *)xfer;
+
+   /* Read-only mappings have nothing to write back. */
+   if ((xfer->usage & PIPE_MAP_WRITE) == 0)
+      return;
+
+   /* Region to read from the staging resource.  Buffers carry extra
+    * alignment padding at the start of the staging buffer.
+    */
+   struct pipe_box staging_box = *flush_box;
+   if (xfer->resource->target == PIPE_BUFFER)
+      staging_box.x += xfer->box.x % CROCUS_MAP_BUFFER_ALIGNMENT;
+
+   /* Destination origin: the flushed box translated by the mapped box. */
+   struct pipe_box dst_box = *flush_box;
+   dst_box.x += xfer->box.x;
+   dst_box.y += xfer->box.y;
+   dst_box.z += xfer->box.z;
+
+   crocus_copy_region(map->blorp, map->batch, xfer->resource, xfer->level,
+                      dst_box.x, dst_box.y, dst_box.z, map->staging, 0,
+                      &staging_box);
+}
+
+/**
+ * Unmap callback for staging-copy mappings: destroys the staging resource
+ * created by crocus_map_copy_region and clears the mapping pointer.
+ */
+static void
+crocus_unmap_copy_region(struct crocus_transfer *map)
+{
+   crocus_resource_destroy(map->staging->screen, map->staging);
+
+   map->ptr = NULL;
+}
+
+/**
+ * Map a resource through a temporary staging resource filled by a GPU copy.
+ *
+ * Creates a PIPE_USAGE_STAGING resource sized to the mapped box, copies the
+ * current contents into it unless PIPE_MAP_DISCARD_RANGE is set, flushes the
+ * batch if it references the staging BO, and maps the staging BO.  Sets
+ * map->staging, map->ptr and map->unmap, and for non-buffers also
+ * xfer->stride and xfer->layer_stride.
+ *
+ * NOTE(review): a failed staging allocation is only caught by the assert.
+ */
+static void
+crocus_map_copy_region(struct crocus_transfer *map)
+{
+   struct pipe_screen *pscreen = &map->batch->screen->base;
+   struct pipe_transfer *xfer = &map->base;
+   struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (void *) xfer->resource;
+
+   /* For buffers, pad the staging buffer so that the mapped pointer keeps
+    * the same offset modulo CROCUS_MAP_BUFFER_ALIGNMENT as box->x.
+    */
+   unsigned extra = xfer->resource->target == PIPE_BUFFER ?
+                    box->x % CROCUS_MAP_BUFFER_ALIGNMENT : 0;
+
+   struct pipe_resource templ = (struct pipe_resource) {
+      .usage = PIPE_USAGE_STAGING,
+      .width0 = box->width + extra,
+      .height0 = box->height,
+      .depth0 = 1,
+      .nr_samples = xfer->resource->nr_samples,
+      .nr_storage_samples = xfer->resource->nr_storage_samples,
+      .array_size = box->depth,
+      .format = res->internal_format,
+   };
+
+   /* The box depth is expressed as array layers of a 2D (array) texture. */
+   if (xfer->resource->target == PIPE_BUFFER)
+      templ.target = PIPE_BUFFER;
+   else if (templ.array_size > 1)
+      templ.target = PIPE_TEXTURE_2D_ARRAY;
+   else
+      templ.target = PIPE_TEXTURE_2D;
+
+   map->staging = crocus_resource_create(pscreen, &templ);
+   assert(map->staging);
+
+   /* Strides reported to the caller are those of the staging surface. */
+   if (templ.target != PIPE_BUFFER) {
+      struct isl_surf *surf = &((struct crocus_resource *) map->staging)->surf;
+      xfer->stride = isl_surf_get_row_pitch_B(surf);
+      xfer->layer_stride = isl_surf_get_array_pitch(surf);
+   }
+
+   /* Unless the range is being discarded, fill the staging resource with
+    * the current contents of the mapped box.
+    */
+   if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+      crocus_copy_region(map->blorp, map->batch, map->staging, 0, extra, 0, 0,
+                         xfer->resource, xfer->level, box);
+      /* Ensure writes to the staging BO land before we map it below. */
+      crocus_emit_pipe_control_flush(map->batch,
+                                     "transfer read: flush before mapping",
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+   }
+
+   struct crocus_bo *staging_bo = crocus_resource_bo(map->staging);
+
+   /* Submit the copy if the batch still holds it. */
+   if (crocus_batch_references(map->batch, staging_bo))
+      crocus_batch_flush(map->batch);
+
+   map->ptr =
+      crocus_bo_map(map->dbg, staging_bo, xfer->usage & MAP_FLAGS) + extra;
+
+   map->unmap = crocus_unmap_copy_region;
+}
+
+/**
+ * Look up the x/y offset, in surface elements, of one image of a surface.
+ *
+ * \p z is handed to isl_surf_get_image_offset_el() in its fourth argument
+ * for ISL_SURF_DIM_3D surfaces and in its third argument for every other
+ * dimensionality; the unused one of the two is 0.  The z and array offsets
+ * reported by ISL are asserted to be 0.
+ */
+static void
+get_image_offset_el(const struct isl_surf *surf, unsigned level, unsigned z,
+                    unsigned *out_x0_el, unsigned *out_y0_el)
+{
+   const bool is_3d = surf->dim == ISL_SURF_DIM_3D;
+   const unsigned layer_arg = is_3d ? 0 : z;
+   const unsigned slice_arg = is_3d ? z : 0;
+   ASSERTED uint32_t z_off_el, array_off_el;
+
+   isl_surf_get_image_offset_el(surf, level, layer_arg, slice_arg,
+                                out_x0_el, out_y0_el,
+                                &z_off_el, &array_off_el);
+
+   assert(z_off_el == 0 && array_off_el == 0);
+}
+
+/**
+ * Public wrapper around get_image_offset_el() for a resource's main surface:
+ * stores the x/y offset of the image at (\p level, \p z) in \p x and \p y.
+ */
+void
+crocus_resource_get_image_offset(struct crocus_resource *res,
+                                 uint32_t level, uint32_t z,
+                                 uint32_t *x, uint32_t *y)
+{
+   get_image_offset_el(&res->surf, level, z, x, y);
+}
+
+/**
+ * Get pointer offset into stencil buffer.
+ *
+ * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, we
+ * must decode the tile's layout in software.
+ *
+ * See
+ *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major Tile
+ *     Format.
+ *   - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling Algorithm
+ *
+ * Even though the returned offset is always positive, the return type is
+ * signed due to
+ *    commit e8b1c6d6f55f5be3bef25084fdd8b6127517e137
+ *    mesa: Fix return type of  _mesa_get_format_bytes() (#37351)
+ */
+static intptr_t
+s8_offset(uint32_t stride, uint32_t x, uint32_t y, bool swizzled)
+{
+   /* A tile is 64 bytes wide, 64 rows tall and 4096 bytes in size. */
+   const uint32_t tile_dim = 64;
+   const uint32_t tile_bytes = 4096;
+   const uint32_t tile_row_bytes = 64 * stride / 2; /* Two rows are interleaved. */
+
+   /* The byte's address relative to the tile's base address. */
+   const uint32_t bx = x % tile_dim;
+   const uint32_t by = y % tile_dim;
+
+   /* Start of the containing tile, then the x and y bits of the in-tile
+    * position interleaved as laid out below.
+    */
+   const uint32_t linear = (y / tile_dim) * tile_row_bytes
+                         + (x / tile_dim) * tile_bytes
+                         + ((bx >> 3) << 9)
+                         + ((by >> 3) << 6)
+                         + (((by >> 2) & 1) << 5)
+                         + (((bx >> 2) & 1) << 4)
+                         + (((by >> 1) & 1) << 3)
+                         + (((bx >> 1) & 1) << 2)
+                         + ((by & 1) << 1)
+                         + (bx & 1);
+
+   uintptr_t u = linear;
+
+   /* adjust for bit6 swizzling */
+   if (swizzled && ((bx >> 3) & 1) == 1) {
+      if (((by >> 3) & 1) == 0)
+         u += 64;
+      else
+         u -= 64;
+   }
+
+   return u;
+}
+
+/**
+ * Unmap callback for W-tiled (stencil) mappings.
+ *
+ * If the mapping was writable, every byte of the linear shadow buffer is
+ * written back into the tiled BO at the offset computed by s8_offset().
+ * The shadow buffer allocated by crocus_map_s8 is then freed and the
+ * transfer's pointers are cleared so they do not dangle, matching what the
+ * other unmap callbacks do.
+ */
+static void
+crocus_unmap_s8(struct crocus_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   const struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+   struct isl_surf *surf = &res->surf;
+
+   if (xfer->usage & PIPE_MAP_WRITE) {
+      uint8_t *untiled_s8_map = map->ptr;
+      uint8_t *tiled_s8_map =
+         crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+      for (int s = 0; s < box->depth; s++) {
+         /* Origin of this slice's image within the surface. */
+         unsigned x0_el, y0_el;
+         get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el);
+
+         for (uint32_t y = 0; y < box->height; y++) {
+            for (uint32_t x = 0; x < box->width; x++) {
+               ptrdiff_t offset = s8_offset(surf->row_pitch_B,
+                                            x0_el + box->x + x,
+                                            y0_el + box->y + y,
+                                            map->has_swizzling);
+               tiled_s8_map[offset] =
+                  untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x];
+            }
+         }
+      }
+   }
+
+   free(map->buffer);
+   map->buffer = map->ptr = NULL;
+}
+
+/**
+ * Map a W-tiled (stencil) surface through a linear shadow buffer.
+ *
+ * Allocates a linear buffer covering the mapped box, detiles the current
+ * contents into it byte by byte unless PIPE_MAP_DISCARD_RANGE is set, and
+ * installs crocus_unmap_s8 as the unmap callback.  Sets xfer->stride,
+ * xfer->layer_stride, map->buffer and map->ptr.
+ */
+static void
+crocus_map_s8(struct crocus_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   const struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+   struct isl_surf *surf = &res->surf;
+
+   xfer->stride = surf->row_pitch_B;
+   xfer->layer_stride = xfer->stride * box->height;
+
+   /* A plain malloc() backs the linear copy: the byte-by-byte loops below
+    * and in crocus_unmap_s8 place no alignment requirement on it, so no
+    * over-allocation is done here.
+    * NOTE(review): allocation failure is only caught by the assert.
+    */
+   map->buffer = map->ptr = malloc(xfer->layer_stride * box->depth);
+   assert(map->buffer);
+
+   /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
+    * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
+    * invalidate is set, since we'll be writing the whole rectangle from our
+    * temporary buffer back out.
+    */
+   if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+      uint8_t *untiled_s8_map = map->ptr;
+      uint8_t *tiled_s8_map =
+         crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+      for (int s = 0; s < box->depth; s++) {
+         /* Origin of this slice's image within the surface. */
+         unsigned x0_el, y0_el;
+         get_image_offset_el(surf, xfer->level, box->z + s, &x0_el, &y0_el);
+
+         for (uint32_t y = 0; y < box->height; y++) {
+            for (uint32_t x = 0; x < box->width; x++) {
+               ptrdiff_t offset = s8_offset(surf->row_pitch_B,
+                                            x0_el + box->x + x,
+                                            y0_el + box->y + y,
+                                            map->has_swizzling);
+               untiled_s8_map[s * xfer->layer_stride + y * xfer->stride + x] =
+                  tiled_s8_map[offset];
+            }
+         }
+      }
+   }
+
+   map->unmap = crocus_unmap_s8;
+}
+
+/* Compute extent parameters for use with tiled_memcpy functions.
+ * xs are in units of bytes and ys are in units of strides.
+ */
+static inline void
+tile_extents(const struct isl_surf *surf,
+             const struct pipe_box *box,
+             unsigned level, int z,
+             unsigned *x1_B, unsigned *x2_B,
+             unsigned *y1_el, unsigned *y2_el)
+{
+   const struct isl_format_layout *layout = isl_format_get_layout(surf->format);
+   const unsigned bytes_per_block = layout->bpb / 8;
+   unsigned img_x_el, img_y_el;
+
+   /* The box origin must sit on a format block boundary. */
+   assert(box->x % layout->bw == 0);
+   assert(box->y % layout->bh == 0);
+
+   /* Origin of slice box->z + z of the requested level. */
+   get_image_offset_el(surf, level, box->z + z, &img_x_el, &img_y_el);
+
+   /* Box edges in blocks, with the far edges rounded up to whole blocks. */
+   const unsigned left_el = box->x / layout->bw + img_x_el;
+   const unsigned top_el = box->y / layout->bh + img_y_el;
+   const unsigned right_el =
+      DIV_ROUND_UP(box->x + box->width, layout->bw) + img_x_el;
+   const unsigned bottom_el =
+      DIV_ROUND_UP(box->y + box->height, layout->bh) + img_y_el;
+
+   /* Horizontal extents are reported in bytes, vertical ones in rows. */
+   *x1_B = left_el * bytes_per_block;
+   *y1_el = top_el;
+   *x2_B = right_el * bytes_per_block;
+   *y2_el = bottom_el;
+}
+
+/**
+ * Unmap callback for mappings created by crocus_map_tiled_memcpy.
+ *
+ * If the mapping was writable, each slice of the linear shadow buffer is
+ * copied back into the tiled BO with isl_memcpy_linear_to_tiled().  The
+ * shadow buffer is then freed and the transfer's pointers are cleared.
+ */
+static void
+crocus_unmap_tiled_memcpy(struct crocus_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   const struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+   struct isl_surf *surf = &res->surf;
+
+   if (xfer->usage & PIPE_MAP_WRITE) {
+      char *dst =
+         crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+      for (int s = 0; s < box->depth; s++) {
+         /* Extents of slice s of the box within the tiled surface. */
+         unsigned x1, x2, y1, y2;
+         tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2);
+
+         /* Start of slice s in the linear shadow buffer. */
+         void *ptr = map->ptr + s * xfer->layer_stride;
+
+         isl_memcpy_linear_to_tiled(x1, x2, y1, y2, dst, ptr,
+                                    surf->row_pitch_B, xfer->stride,
+                                    map->has_swizzling,
+                                    surf->tiling, ISL_MEMCPY);
+      }
+   }
+   os_free_aligned(map->buffer);
+   map->buffer = map->ptr = NULL;
+}
+
+/**
+ * Map a tiled surface through a linear shadow buffer filled on the CPU.
+ *
+ * Allocates a 16-byte aligned linear buffer, detiles the current contents
+ * of every slice of the box into it with isl_memcpy_tiled_to_linear()
+ * unless PIPE_MAP_DISCARD_RANGE is set, and installs
+ * crocus_unmap_tiled_memcpy as the unmap callback.  Sets xfer->stride,
+ * xfer->layer_stride, map->buffer and map->ptr.
+ *
+ * NOTE(review): allocation failure is only caught by the assert.
+ */
+static void
+crocus_map_tiled_memcpy(struct crocus_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   const struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+   struct isl_surf *surf = &res->surf;
+
+   xfer->stride = ALIGN(surf->row_pitch_B, 16);
+   xfer->layer_stride = xfer->stride * box->height;
+
+   /* Only x1 of this first computation is used, for the pointer offset
+    * below; the per-slice loop declares its own x1/x2/y1/y2, which shadow
+    * these.
+    */
+   unsigned x1, x2, y1, y2;
+   tile_extents(surf, box, xfer->level, 0, &x1, &x2, &y1, &y2);
+
+   /* The tiling and detiling functions require that the linear buffer has
+    * a 16-byte alignment (that is, its `x0` is 16-byte aligned).  Here we
+    * over-allocate the linear buffer to get the proper alignment.
+    */
+   map->buffer =
+      os_malloc_aligned(xfer->layer_stride * box->depth, 16);
+   assert(map->buffer);
+   /* Offset the returned pointer by the low four bits of x1. */
+   map->ptr = (char *)map->buffer + (x1 & 0xf);
+
+   if (!(xfer->usage & PIPE_MAP_DISCARD_RANGE)) {
+      char *src =
+         crocus_bo_map(map->dbg, res->bo, (xfer->usage | MAP_RAW) & MAP_FLAGS);
+
+      for (int s = 0; s < box->depth; s++) {
+         unsigned x1, x2, y1, y2;
+         tile_extents(surf, box, xfer->level, s, &x1, &x2, &y1, &y2);
+
+         /* Use 's' rather than 'box->z' to rebase the first slice to 0. */
+         void *ptr = map->ptr + s * xfer->layer_stride;
+
+         /* The copy routine is ISL_MEMCPY, or ISL_MEMCPY_STREAMING_LOAD when
+          * built with USE_SSE41 and the CPU reports SSE4.1.
+          */
+         isl_memcpy_tiled_to_linear(x1, x2, y1, y2, ptr, src, xfer->stride,
+                                    surf->row_pitch_B,
+                                    map->has_swizzling,
+                                    surf->tiling,
+#if defined(USE_SSE41)
+                                    util_get_cpu_caps()->has_sse4_1 ? ISL_MEMCPY_STREAMING_LOAD :
+#endif
+                                    ISL_MEMCPY);
+      }
+   }
+
+   map->unmap = crocus_unmap_tiled_memcpy;
+}
+
+/**
+ * Map a resource by handing out a pointer straight into its BO.
+ *
+ * For buffers the pointer is offset by box->x and both strides are 0.  For
+ * other resources the strides come from the ISL surface and the pointer is
+ * offset to the box origin within the image at (level, box->z).  No unmap
+ * callback is installed.
+ */
+static void
+crocus_map_direct(struct crocus_transfer *map)
+{
+   struct pipe_transfer *xfer = &map->base;
+   struct pipe_box *box = &xfer->box;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+
+   void *ptr = crocus_bo_map(map->dbg, res->bo, xfer->usage & MAP_FLAGS);
+
+   if (res->base.target == PIPE_BUFFER) {
+      xfer->stride = 0;
+      xfer->layer_stride = 0;
+
+      map->ptr = ptr + box->x;
+   } else {
+      struct isl_surf *surf = &res->surf;
+      const struct isl_format_layout *fmtl =
+         isl_format_get_layout(surf->format);
+      const unsigned cpp = fmtl->bpb / 8;
+      unsigned x0_el, y0_el;
+
+      get_image_offset_el(surf, xfer->level, box->z, &x0_el, &y0_el);
+
+      xfer->stride = isl_surf_get_row_pitch_B(surf);
+      xfer->layer_stride = isl_surf_get_array_pitch(surf);
+
+      /* Row offset in strides plus column offset in bytes (cpp per element). */
+      map->ptr = ptr + (y0_el + box->y) * xfer->stride + (x0_el + box->x) * cpp;
+   }
+}
+
+/**
+ * Decide whether a mapping may be silently treated as unsynchronized.
+ *
+ * That is the case for a buffer write that does not carry
+ * TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED and whose byte range does not
+ * intersect the buffer's valid range.
+ */
+static bool
+can_promote_to_async(const struct crocus_resource *res,
+                     const struct pipe_box *box,
+                     unsigned usage)
+{
+   if (res->base.target != PIPE_BUFFER)
+      return false;
+
+   if (!(usage & PIPE_MAP_WRITE))
+      return false;
+
+   if (usage & TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED)
+      return false;
+
+   /* If we're writing to a section of the buffer that hasn't even been
+    * initialized with useful data, the write can safely be unsynchronized.
+    * This helps the common pattern of appending data.
+    */
+   const bool touches_valid_data =
+      util_ranges_intersect(&res->valid_buffer_range, box->x,
+                            box->x + box->width);
+
+   return !touches_valid_data;
+}
+
+/**
+ * u_transfer_vtbl::transfer_map hook: map a box of a resource for CPU access.
+ *
+ * Chooses between a GPU staging copy (crocus_map_copy_region) and one of the
+ * CPU paths (crocus_map_s8, crocus_map_tiled_memcpy, crocus_map_direct)
+ * based on the usage flags, the surface tiling and whether mapping would
+ * stall.
+ *
+ * \param ctx        context performing the map
+ * \param resource   resource to map
+ * \param level      miplevel to map
+ * \param usage      PIPE_MAP_x (and TC_TRANSFER_MAP_x) flags
+ * \param box        region to map
+ * \param ptransfer  receives the new transfer object
+ *
+ * \return the mapped pointer, or NULL when a PIPE_MAP_DIRECTLY request
+ *         cannot be honoured or the transfer object cannot be allocated.
+ */
+static void *
+crocus_transfer_map(struct pipe_context *ctx,
+                    struct pipe_resource *resource,
+                    unsigned level,
+                    unsigned usage,
+                    const struct pipe_box *box,
+                    struct pipe_transfer **ptransfer)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_resource *res = (struct crocus_resource *)resource;
+   struct isl_surf *surf = &res->surf;
+
+   if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) {
+      /* Replace the backing storage with a fresh buffer for non-async maps */
+      if (!(usage & (PIPE_MAP_UNSYNCHRONIZED |
+                     TC_TRANSFER_MAP_NO_INVALIDATE)))
+         crocus_invalidate_resource(ctx, resource);
+
+      /* If we can discard the whole resource, we can discard the range. */
+      usage |= PIPE_MAP_DISCARD_RANGE;
+   }
+
+   if (!(usage & PIPE_MAP_UNSYNCHRONIZED) &&
+       can_promote_to_async(res, box, usage)) {
+      usage |= PIPE_MAP_UNSYNCHRONIZED;
+   }
+
+   bool map_would_stall = false;
+
+   if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
+      map_would_stall = resource_is_busy(ice, res) ||
+         crocus_has_invalid_primary(res, level, 1, box->z, box->depth);
+
+      /* A direct, non-blocking map of a resource that would stall fails. */
+      if (map_would_stall && (usage & PIPE_MAP_DONTBLOCK) &&
+                             (usage & PIPE_MAP_DIRECTLY))
+         return NULL;
+   }
+
+   /* Tiled surfaces can never be mapped directly. */
+   if (surf->tiling != ISL_TILING_LINEAR &&
+       (usage & PIPE_MAP_DIRECTLY))
+      return NULL;
+
+   struct crocus_transfer *map = slab_alloc(&ice->transfer_pool);
+
+   if (!map)
+      return NULL;
+
+   /* Only form a pointer into the transfer once the allocation is known to
+    * have succeeded.
+    */
+   struct pipe_transfer *xfer = &map->base;
+
+   memset(map, 0, sizeof(*map));
+   map->dbg = &ice->dbg;
+
+   map->has_swizzling = ((struct crocus_screen *)ctx->screen)->has_swizzling;
+   pipe_resource_reference(&xfer->resource, resource);
+   xfer->level = level;
+   xfer->usage = usage;
+   xfer->box = *box;
+   *ptransfer = xfer;
+
+   /* Remember whether the mapped range held valid data before this map. */
+   map->dest_had_defined_contents =
+      util_ranges_intersect(&res->valid_buffer_range, box->x,
+                            box->x + box->width);
+
+   if (usage & PIPE_MAP_WRITE)
+      util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+
+   /* Avoid using GPU copies for persistent/coherent buffers, as the idea
+    * there is to access them simultaneously on the CPU & GPU.  This also
+    * avoids trying to use GPU copies for our u_upload_mgr buffers which
+    * contain state we're constructing for a GPU draw call, which would
+    * kill us with infinite stack recursion.
+    */
+   bool no_gpu = usage & (PIPE_MAP_PERSISTENT |
+                          PIPE_MAP_COHERENT |
+                          PIPE_MAP_DIRECTLY);
+
+   /* GPU copies are not useful for buffer reads.  Instead of stalling to
+    * read from the original buffer, we'd simply copy it to a temporary...
+    * then stall (a bit longer) to read from that buffer.
+    *
+    * Images are less clear-cut.  Color resolves are destructive, removing
+    * the underlying compression, so we'd rather blit the data to a linear
+    * temporary and map that, to avoid the resolve.  (It might be better to
+    * blit to a tiled temporary and use the tiled_memcpy paths...)
+    */
+   if (!(usage & PIPE_MAP_DISCARD_RANGE) &&
+       !crocus_has_invalid_primary(res, level, 1, box->z, box->depth))
+      no_gpu = true;
+
+   /* ASTC-compressed formats are always mapped on the CPU. */
+   const struct isl_format_layout *fmtl = isl_format_get_layout(surf->format);
+   if (fmtl->txc == ISL_TXC_ASTC)
+      no_gpu = true;
+
+   if (map_would_stall && !no_gpu) {
+      /* If we need a synchronous mapping and the resource is busy, or needs
+       * resolving, we copy to/from a linear temporary buffer using the GPU.
+       */
+      map->batch = &ice->batches[CROCUS_BATCH_RENDER];
+      map->blorp = &ice->blorp;
+      crocus_map_copy_region(map);
+   } else {
+      /* Otherwise we're free to map on the CPU. */
+
+      if (resource->target != PIPE_BUFFER) {
+         crocus_resource_access_raw(ice, res,
+                                    level, box->z, box->depth,
+                                    usage & PIPE_MAP_WRITE);
+      }
+
+      /* Submit any batch that still references the BO before touching it. */
+      if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
+         for (int i = 0; i < ice->batch_count; i++) {
+            if (crocus_batch_references(&ice->batches[i], res->bo))
+               crocus_batch_flush(&ice->batches[i]);
+         }
+      }
+
+      if (surf->tiling == ISL_TILING_W) {
+         /* TODO: Teach crocus_map_tiled_memcpy about W-tiling... */
+         crocus_map_s8(map);
+      } else if (surf->tiling != ISL_TILING_LINEAR) {
+         crocus_map_tiled_memcpy(map);
+      } else {
+         crocus_map_direct(map);
+      }
+   }
+
+   return map->ptr;
+}
+
+/**
+ * u_transfer_vtbl::transfer_flush_region hook: make a written sub-box of a
+ * mapping visible to the GPU.
+ *
+ * Writes staging data back to the resource if the mapping used a staging
+ * copy, extends the valid range of buffers, emits cache flushes to batches
+ * that may hold stale data, and marks dependent state dirty.
+ *
+ * \p box is relative to the mapped box.
+ */
+static void
+crocus_transfer_flush_region(struct pipe_context *ctx,
+                             struct pipe_transfer *xfer,
+                             const struct pipe_box *box)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_resource *res = (struct crocus_resource *) xfer->resource;
+   struct crocus_transfer *map = (void *) xfer;
+
+   if (map->staging)
+      crocus_flush_staging_region(xfer, box);
+
+   uint32_t history_flush = 0;
+
+   if (res->base.target == PIPE_BUFFER) {
+      /* The staging write-back above is followed by a render target flush. */
+      if (map->staging)
+         history_flush |= PIPE_CONTROL_RENDER_TARGET_FLUSH;
+
+      /* Only ranges that held valid data before the map need the
+       * history-based cache flushes.
+       */
+      if (map->dest_had_defined_contents)
+         history_flush |= crocus_flush_bits_for_history(res);
+
+      util_range_add(&res->base, &res->valid_buffer_range, box->x, box->x + box->width);
+   }
+
+   /* Skip the PIPE_CONTROLs when nothing but a CS stall would be emitted. */
+   if (history_flush & ~PIPE_CONTROL_CS_STALL) {
+      for (int i = 0; i < ice->batch_count; i++) {
+         struct crocus_batch *batch = &ice->batches[i];
+
+         /* Batches without a command buffer are skipped. */
+         if (!batch->command.bo)
+            continue;
+         if (batch->contains_draw || batch->cache.render->entries) {
+            crocus_batch_maybe_flush(batch, 24);
+            crocus_emit_pipe_control_flush(batch,
+                                           "cache history: transfer flush",
+                                           history_flush);
+         }
+      }
+   }
+
+   /* Make sure we flag constants dirty even if there's no need to emit
+    * any PIPE_CONTROLs to a batch.
+    */
+   crocus_dirty_for_history(ice, res);
+}
+
+/**
+ * u_transfer_vtbl::transfer_unmap hook: tear down a mapping.
+ *
+ * Unless the mapping is PIPE_MAP_FLUSH_EXPLICIT or PIPE_MAP_COHERENT, the
+ * whole mapped box is flushed first.  Then the path-specific unmap callback
+ * runs (if any), the resource reference is dropped and the transfer object
+ * is returned to the context's slab pool.
+ */
+static void
+crocus_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer *xfer)
+{
+   struct crocus_context *ice = (struct crocus_context *)ctx;
+   struct crocus_transfer *map = (void *) xfer;
+
+   if (!(xfer->usage & (PIPE_MAP_FLUSH_EXPLICIT |
+                        PIPE_MAP_COHERENT))) {
+      /* The flush box is relative to the mapped box, hence the zero origin. */
+      struct pipe_box flush_box = {
+         .x = 0, .y = 0, .z = 0,
+         .width  = xfer->box.width,
+         .height = xfer->box.height,
+         .depth  = xfer->box.depth,
+      };
+      crocus_transfer_flush_region(ctx, xfer, &flush_box);
+   }
+
+   /* crocus_map_direct installs no callback, so this may be NULL. */
+   if (map->unmap)
+      map->unmap(map);
+
+   pipe_resource_reference(&xfer->resource, NULL);
+   slab_free(&ice->transfer_pool, map);
+}
+
+/**
+ * Mark state dirty that needs to be re-emitted when a resource is written.
+ */
+void
+crocus_dirty_for_history(struct crocus_context *ice,
+                         struct crocus_resource *res)
+{
+   /* Only resources that were ever bound as a constant buffer have state
+    * to re-emit here.
+    */
+   if (!(res->bind_history & PIPE_BIND_CONSTANT_BUFFER))
+      return;
+
+   /* Flag the constants of every stage recorded in bind_stages as dirty. */
+   ice->state.stage_dirty |=
+      ((uint64_t)res->bind_stages) << CROCUS_SHIFT_FOR_STAGE_DIRTY_CONSTANTS;
+}
+
+/**
+ * Produce a set of PIPE_CONTROL bits which ensure data written to a
+ * resource becomes visible, and any stale read cache data is invalidated.
+ */
+uint32_t
+crocus_flush_bits_for_history(struct crocus_resource *res)
+{
+   /* A CS stall is always part of the result. */
+   uint32_t bits = PIPE_CONTROL_CS_STALL;
+
+   if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER)
+      bits |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+
+   /* Constant buffers and sampler views both invalidate the texture cache. */
+   if (res->bind_history & (PIPE_BIND_CONSTANT_BUFFER |
+                            PIPE_BIND_SAMPLER_VIEW))
+      bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+
+   if (res->bind_history & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER))
+      bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+
+   if (res->bind_history & (PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE))
+      bits |= PIPE_CONTROL_DATA_CACHE_FLUSH;
+
+   return bits;
+}
+
+/**
+ * Emit the history-based cache flushes for a buffer into \p batch and mark
+ * dependent state dirty.
+ *
+ * Does nothing for non-buffer resources.  \p extra_flags are OR'd into the
+ * PIPE_CONTROL bits; \p reason is passed through to the flush.
+ */
+void
+crocus_flush_and_dirty_for_history(struct crocus_context *ice,
+                                   struct crocus_batch *batch,
+                                   struct crocus_resource *res,
+                                   uint32_t extra_flags,
+                                   const char *reason)
+{
+   if (res->base.target != PIPE_BUFFER)
+      return;
+
+   uint32_t flush = crocus_flush_bits_for_history(res) | extra_flags;
+
+   crocus_emit_pipe_control_flush(batch, reason, flush);
+
+   crocus_dirty_for_history(ice, res);
+}
+
+/**
+ * Record a new clear color on the resource.
+ *
+ * \return true if the stored color changed, false if \p color is bytewise
+ *         identical to the one already stored.  \p ice is not used.
+ */
+bool
+crocus_resource_set_clear_color(struct crocus_context *ice,
+                                struct crocus_resource *res,
+                                union isl_color_value color)
+{
+   const bool unchanged =
+      memcmp(&res->aux.clear_color, &color, sizeof(color)) == 0;
+
+   if (unchanged)
+      return false;
+
+   res->aux.clear_color = color;
+   return true;
+}
+
+/**
+ * Return the clear color stored on the resource.  The resource is asserted
+ * to have an aux BO.
+ */
+union isl_color_value
+crocus_resource_get_clear_color(const struct crocus_resource *res)
+{
+   assert(res->aux.bo);
+
+   return res->aux.clear_color;
+}
+
+/**
+ * u_transfer_vtbl::get_internal_format hook: return the resource's
+ * internal_format field.
+ */
+static enum pipe_format
+crocus_resource_get_internal_format(struct pipe_resource *p_res)
+{
+   struct crocus_resource *res = (void *) p_res;
+   return res->internal_format;
+}
+
+/* Driver callbacks handed to u_transfer_helper_create() in
+ * crocus_init_screen_resource_functions().
+ */
+static const struct u_transfer_vtbl transfer_vtbl = {
+   .resource_create       = crocus_resource_create,
+   .resource_destroy      = crocus_resource_destroy,
+   .transfer_map          = crocus_transfer_map,
+   .transfer_unmap        = crocus_transfer_unmap,
+   .transfer_flush_region = crocus_transfer_flush_region,
+   .get_internal_format   = crocus_resource_get_internal_format,
+   .set_stencil           = crocus_resource_set_separate_stencil,
+   .get_stencil           = crocus_resource_get_separate_stencil,
+};
+
+/**
+ * pipe_screen::is_dmabuf_modifier_supported hook.
+ *
+ * \return true if modifier_is_supported() accepts the format/modifier pair
+ *         on this device.  In that case \p external_only, when non-NULL, is
+ *         set to false; on failure it is left untouched.
+ */
+static bool
+crocus_is_dmabuf_modifier_supported(struct pipe_screen *pscreen,
+                                    uint64_t modifier, enum pipe_format pfmt,
+                                    bool *external_only)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+
+   if (!modifier_is_supported(&screen->devinfo, pfmt, modifier))
+      return false;
+
+   if (external_only != NULL)
+      *external_only = false;
+
+   return true;
+}
+
+/**
+ * pipe_screen::get_dmabuf_modifier_planes hook: the plane count depends only
+ * on the format; \p pscreen and \p modifier are not consulted.
+ */
+static unsigned int
+crocus_get_dmabuf_modifier_planes(struct pipe_screen *pscreen, uint64_t modifier,
+                                  enum pipe_format format)
+{
+   return util_format_get_num_planes(format);
+}
+
+/**
+ * Install the resource-related pipe_screen hooks and create the screen's
+ * u_transfer_helper.
+ *
+ * resource_create and resource_destroy go through u_transfer_helper, which
+ * calls back into the driver via transfer_vtbl.
+ */
+void
+crocus_init_screen_resource_functions(struct pipe_screen *pscreen)
+{
+   struct crocus_screen *screen = (void *) pscreen;
+   pscreen->query_dmabuf_modifiers = crocus_query_dmabuf_modifiers;
+   pscreen->is_dmabuf_modifier_supported = crocus_is_dmabuf_modifier_supported;
+   pscreen->get_dmabuf_modifier_planes = crocus_get_dmabuf_modifier_planes;
+   pscreen->resource_create_with_modifiers =
+      crocus_resource_create_with_modifiers;
+   pscreen->resource_create = u_transfer_helper_resource_create;
+   pscreen->resource_from_user_memory = crocus_resource_from_user_memory;
+   pscreen->resource_from_handle = crocus_resource_from_handle;
+   pscreen->resource_get_handle = crocus_resource_get_handle;
+   pscreen->resource_get_param = crocus_resource_get_param;
+   pscreen->resource_destroy = u_transfer_helper_resource_destroy;
+   /* NOTE(review): the meaning of the four boolean arguments is defined by
+    * u_transfer_helper_create()'s prototype, which is not visible here; the
+    * first two are enabled only on ver >= 6 hardware.
+    */
+   pscreen->transfer_helper =
+      u_transfer_helper_create(&transfer_vtbl, screen->devinfo.ver >= 6,
+                               screen->devinfo.ver >= 6, false, true);
+}
+
+/**
+ * Install the resource-related pipe_context hooks.
+ *
+ * Buffer and texture map/unmap and transfer_flush_region all go through
+ * u_transfer_helper; the subdata hooks use the u_default_* implementations.
+ */
+void
+crocus_init_resource_functions(struct pipe_context *ctx)
+{
+   ctx->flush_resource = crocus_flush_resource;
+   ctx->invalidate_resource = crocus_invalidate_resource;
+   ctx->buffer_map = u_transfer_helper_transfer_map;
+   ctx->texture_map = u_transfer_helper_transfer_map;
+   ctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;
+   ctx->buffer_unmap = u_transfer_helper_transfer_unmap;
+   ctx->texture_unmap = u_transfer_helper_transfer_unmap;
+   ctx->buffer_subdata = u_default_buffer_subdata;
+   ctx->texture_subdata = u_default_texture_subdata;
+}
diff --git a/src/gallium/drivers/crocus/crocus_resource.h b/src/gallium/drivers/crocus/crocus_resource.h
new file mode 100644
index 00000000000..8eb49118f54
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_resource.h
@@ -0,0 +1,501 @@
+/*
+ * Copyright 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_RESOURCE_H
+#define CROCUS_RESOURCE_H
+
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_range.h"
+#include "intel/isl/isl.h"
+
+#include "crocus_bufmgr.h"
+
+struct crocus_batch;
+struct crocus_context;
+
+#define CROCUS_MAX_MIPLEVELS 15
+
+struct crocus_format_info {
+   enum isl_format fmt;
+   enum pipe_swizzle swizzles[4];
+};
+
+static inline enum isl_channel_select
+pipe_to_isl_swizzle(const enum pipe_swizzle pswz, bool green_to_blue)
+{
+   unsigned swz = (pswz + 4) & 7;
+
+   return (green_to_blue && swz == ISL_CHANNEL_SELECT_GREEN) ? ISL_CHANNEL_SELECT_BLUE : swz;
+}
+
+static inline struct isl_swizzle
+pipe_to_isl_swizzles(const enum pipe_swizzle pswz[4])
+{
+   return (struct isl_swizzle) {
+      .r = pipe_to_isl_swizzle(pswz[0], false),
+      .g = pipe_to_isl_swizzle(pswz[1], false),
+      .b = pipe_to_isl_swizzle(pswz[2], false),
+      .a = pipe_to_isl_swizzle(pswz[3], false),
+   };
+}
+
+static inline void
+crocus_combine_swizzle(enum pipe_swizzle outswz[4],
+                       const enum pipe_swizzle fswz[4],
+                       const enum pipe_swizzle vswz[4])
+{
+   for (unsigned i = 0; i < 4; i++) {
+      switch (vswz[i]) {
+      case PIPE_SWIZZLE_X: outswz[i] = fswz[0]; break;
+      case PIPE_SWIZZLE_Y: outswz[i] = fswz[1]; break;
+      case PIPE_SWIZZLE_Z: outswz[i] = fswz[2]; break;
+      case PIPE_SWIZZLE_W: outswz[i] = fswz[3]; break;
+      case PIPE_SWIZZLE_1: outswz[i] = PIPE_SWIZZLE_1; break;
+      case PIPE_SWIZZLE_0: outswz[i] = PIPE_SWIZZLE_0; break;
+      default: unreachable("invalid swizzle");
+      }
+   }
+}
+
+/**
+ * Resources represent a GPU buffer object or image (mipmap tree).
+ *
+ * They contain the storage (BO) and layout information (ISL surface).
+ */
+struct crocus_resource {
+   struct pipe_resource base;
+   enum pipe_format internal_format;
+
+   /**
+    * The ISL surface layout information for this resource.
+    *
+    * This is not filled out for PIPE_BUFFER resources, but is guaranteed
+    * to be zeroed.  Note that this also guarantees that res->surf.tiling
+    * will be ISL_TILING_LINEAR, so it's safe to check that.
+    */
+   struct isl_surf surf;
+
+   /** Backing storage for the resource */
+   struct crocus_bo *bo;
+
+   /** offset at which data starts in the BO */
+   uint64_t offset;
+
+   /**
+    * A bitfield of PIPE_BIND_* indicating how this resource was bound
+    * in the past.  Only meaningful for PIPE_BUFFER; used for flushing.
+    */
+   unsigned bind_history;
+
+   /**
+    * A bitfield of MESA_SHADER_* stages indicating where this resource
+    * was bound.
+    */
+   unsigned bind_stages;
+
+   /**
+    * For PIPE_BUFFER resources, a range which may contain valid data.
+    *
+    * This is a conservative estimate of what part of the buffer contains
+    * valid data that we have to preserve.  The rest of the buffer is
+    * considered invalid, and we can promote writes to that region to
+    * be unsynchronized writes, avoiding blit copies.
+    */
+   struct util_range valid_buffer_range;
+
+   /**
+    * Auxiliary buffer information (CCS, MCS, or HiZ).
+    */
+   struct {
+      /** The surface layout for the auxiliary buffer. */
+      struct isl_surf surf;
+
+      /** The buffer object containing the auxiliary data. */
+      struct crocus_bo *bo;
+
+      /** Offset into 'bo' where the auxiliary surface starts. */
+      uint32_t offset;
+
+      struct {
+         struct isl_surf surf;
+
+         /** Offset into 'bo' where the auxiliary surface starts. */
+         uint32_t offset;
+      } extra_aux;
+
+      /**
+       * Fast clear color for this surface.  For depth surfaces, the clear
+       * value is stored as a float32 in the red component.
+       */
+      union isl_color_value clear_color;
+
+      /**
+       * \brief The type of auxiliary compression used by this resource.
+       *
+       * This describes the type of auxiliary compression that is intended to
+       * be used by this resource.  An aux usage of ISL_AUX_USAGE_NONE means
+       * that auxiliary compression is permanently disabled.  An aux usage
+       * other than ISL_AUX_USAGE_NONE does not imply that auxiliary
+       * compression will always be enabled for this surface.
+       */
+      enum isl_aux_usage usage;
+
+      /**
+       * \brief Maps miptree slices to their current aux state.
+       *
+       * This two-dimensional array is indexed as [level][layer] and stores an
+       * aux state for each slice.
+       */
+      enum isl_aux_state **state;
+
+      /**
+       * If (1 << level) is set, HiZ is enabled for that miplevel.
+       */
+      uint16_t has_hiz;
+   } aux;
+
+   /**
+    * \brief Shadow miptree for sampling when the main isn't supported by HW.
+    *
+    * To workaround various sampler bugs and limitations, we blit the main
+    * texture into a new texture that can be sampled.
+    *
+    * This miptree may be used for:
+    * - Stencil texturing (pre-BDW) as required by GL_ARB_stencil_texturing.
+    */
+   struct crocus_resource *shadow;
+   bool shadow_needs_update;
+
+   /**
+    * For external surfaces, this is the format that was used to create or
+    * import the surface.  For internal surfaces, this will always be
+    * PIPE_FORMAT_NONE.
+    */
+   enum pipe_format external_format;
+
+   /**
+    * For external surfaces, this is the DRM format modifier that was used
+    * to create or import the surface.  For internal surfaces, this will
+    * always be DRM_FORMAT_MOD_INVALID.
+    */
+   const struct isl_drm_modifier_info *mod_info;
+
+   /**
+    * The screen the resource was originally created with, stored for refcounting.
+    */
+   struct pipe_screen *orig_screen;
+};
+
+/**
+ * A simple <resource, offset> tuple for storing a reference to a
+ * piece of state stored in a GPU buffer object.
+ */
+struct crocus_state_ref {
+   struct pipe_resource *res;
+   uint32_t offset;
+};
+
+/**
+ * Gallium CSO for sampler views (texture views).
+ *
+ * In addition to the normal pipe_resource, this adds an ISL view
+ * which may reinterpret the format or restrict levels/layers.
+ *
+ * These can also be linear texture buffers.
+ */
+struct crocus_sampler_view {
+   struct pipe_sampler_view base;
+   struct isl_view view;
+   struct isl_view gather_view;
+
+   enum pipe_swizzle swizzle[4];
+   union isl_color_value clear_color;
+
+   /* A short-cut (not a reference) to the actual resource being viewed.
+    * Multi-planar (or depth+stencil) images may have multiple resources
+    * chained together; this skips having to traverse base->texture->*.
+    */
+   struct crocus_resource *res;
+};
+
+/**
+ * Image view representation.
+ */
+struct crocus_image_view {
+   struct pipe_image_view base;
+   struct isl_view view;
+};
+
+/**
+ * Gallium CSO for surfaces (framebuffer attachments).
+ *
+ * A view of a surface that can be bound to a color render target or
+ * depth/stencil attachment.
+ */
+struct crocus_surface {
+   struct pipe_surface base;
+   struct isl_view view;
+   struct isl_view read_view;
+   struct isl_surf surf;
+   union isl_color_value clear_color;
+
+   struct pipe_resource *align_res;
+};
+
+/**
+ * Transfer object - information about a buffer mapping.
+ */
+struct crocus_transfer {
+   struct pipe_transfer base;
+   struct pipe_debug_callback *dbg;
+   void *buffer;
+   void *ptr;
+
+   /** A linear staging resource for GPU-based copy_region transfers. */
+   struct pipe_resource *staging;
+   struct blorp_context *blorp;
+   struct crocus_batch *batch;
+
+   bool dest_had_defined_contents;
+   bool has_swizzling;
+
+   void (*unmap)(struct crocus_transfer *);
+};
+
+/**
+ * Unwrap a pipe_resource to get the underlying crocus_bo (for convenience).
+ */
+static inline struct crocus_bo *
+crocus_resource_bo(struct pipe_resource *p_res)
+{
+   struct crocus_resource *res = (void *) p_res;
+   return res->bo;
+}
+
+static inline uint32_t
+crocus_mocs(const struct crocus_bo *bo,
+            const struct isl_device *dev)
+{
+   return isl_mocs(dev, 0, bo && crocus_bo_is_external(bo));
+}
+
+struct crocus_format_info crocus_format_for_usage(const struct intel_device_info *,
+                                                  enum pipe_format pf,
+                                                  isl_surf_usage_flags_t usage);
+
+struct pipe_resource *crocus_resource_get_separate_stencil(struct pipe_resource *);
+
+void crocus_get_depth_stencil_resources(const struct intel_device_info *devinfo,
+                                        struct pipe_resource *res,
+                                        struct crocus_resource **out_z,
+                                        struct crocus_resource **out_s);
+bool crocus_resource_set_clear_color(struct crocus_context *ice,
+                                     struct crocus_resource *res,
+                                     union isl_color_value color);
+union isl_color_value
+crocus_resource_get_clear_color(const struct crocus_resource *res);
+
+void crocus_init_screen_resource_functions(struct pipe_screen *pscreen);
+
+void crocus_dirty_for_history(struct crocus_context *ice,
+                              struct crocus_resource *res);
+uint32_t crocus_flush_bits_for_history(struct crocus_resource *res);
+
+void crocus_flush_and_dirty_for_history(struct crocus_context *ice,
+                                        struct crocus_batch *batch,
+                                        struct crocus_resource *res,
+                                        uint32_t extra_flags,
+                                        const char *reason);
+
+unsigned crocus_get_num_logical_layers(const struct crocus_resource *res,
+                                       unsigned level);
+
+void crocus_resource_disable_aux(struct crocus_resource *res);
+
+#define INTEL_REMAINING_LAYERS UINT32_MAX
+#define INTEL_REMAINING_LEVELS UINT32_MAX
+
+void
+crocus_hiz_exec(struct crocus_context *ice,
+                struct crocus_batch *batch,
+                struct crocus_resource *res,
+                unsigned int level, unsigned int start_layer,
+                unsigned int num_layers, enum isl_aux_op op,
+                bool update_clear_depth);
+
+/**
+ * Prepare a miptree for access
+ *
+ * This function should be called prior to any access to miptree in order to
+ * perform any needed resolves.
+ *
+ * \param[in]  start_level    The first mip level to be accessed
+ *
+ * \param[in]  num_levels     The number of miplevels to be accessed or
+ *                            INTEL_REMAINING_LEVELS to indicate every level
+ *                            above start_level will be accessed
+ *
+ * \param[in]  start_layer    The first array slice or 3D layer to be accessed
+ *
+ * \param[in]  num_layers     The number of array slices or 3D layers to be
+ *                            accessed or INTEL_REMAINING_LAYERS to indicate
+ *                            every layer above start_layer will be accessed
+ *
+ * \param[in]  aux_usage      The auxiliary compression usage the access
+ *                            supports; ISL_AUX_USAGE_NONE for accesses that
+ *                            know nothing about compression
+ *
+ * \param[in]  fast_clear_supported Whether or not the access will support
+ *                                  fast clears in the miptree's auxiliary
+ *                                  compression format
+ */
+void
+crocus_resource_prepare_access(struct crocus_context *ice,
+                               struct crocus_resource *res,
+                               uint32_t start_level, uint32_t num_levels,
+                               uint32_t start_layer, uint32_t num_layers,
+                               enum isl_aux_usage aux_usage,
+                               bool fast_clear_supported);
+
+/**
+ * Complete a write operation
+ *
+ * This function should be called after any operation writes to a miptree.
+ * This will update the miptree's compression state so that future resolves
+ * happen correctly.  Technically, this function can be called before the
+ * write occurs but the caller must ensure that they don't interlace
+ * crocus_resource_prepare_access and crocus_resource_finish_write calls to
+ * overlapping layer/level ranges.
+ *
+ * \param[in]  level             The mip level that was written
+ *
+ * \param[in]  start_layer       The first array slice or 3D layer written
+ *
+ * \param[in]  num_layers        The number of array slices or 3D layers
+ *                               written or INTEL_REMAINING_LAYERS to indicate
+ *                               every layer above start_layer was written
+ *
+ * \param[in]  aux_usage         The auxiliary compression usage the write
+ *                               was done with (ISL_AUX_USAGE_NONE if none)
+ */
+void
+crocus_resource_finish_write(struct crocus_context *ice,
+                             struct crocus_resource *res, uint32_t level,
+                             uint32_t start_layer, uint32_t num_layers,
+                             enum isl_aux_usage aux_usage);
+
+/** Get the auxiliary compression state of a miptree slice */
+enum isl_aux_state
+crocus_resource_get_aux_state(const struct crocus_resource *res,
+                              uint32_t level, uint32_t layer);
+
+/**
+ * Set the auxiliary compression state of a miptree slice range
+ *
+ * This function directly sets the auxiliary compression state of a slice
+ * range of a miptree.  It only modifies data structures and does not do any
+ * resolves.  This should only be called by code which directly performs
+ * compression operations such as fast clears and resolves.  Most code should
+ * use crocus_resource_prepare_access or crocus_resource_finish_write.
+ */
+void
+crocus_resource_set_aux_state(struct crocus_context *ice,
+                              struct crocus_resource *res, uint32_t level,
+                              uint32_t start_layer, uint32_t num_layers,
+                              enum isl_aux_state aux_state);
+
+/**
+ * Prepare a miptree for raw access
+ *
+ * This helper prepares one mip level for access that knows nothing about
+ * compression (ISL_AUX_USAGE_NONE, no fast clears) and, when 'write' is
+ * set, also finishes the write.  Useful for mapping or blitting the surface.
+ */
+static inline void
+crocus_resource_access_raw(struct crocus_context *ice,
+                           struct crocus_resource *res,
+                           uint32_t level, uint32_t layer,
+                           uint32_t num_layers,
+                           bool write)
+{
+   crocus_resource_prepare_access(ice, res, level, 1, layer, num_layers,
+                                  ISL_AUX_USAGE_NONE, false);
+   if (write) {
+      crocus_resource_finish_write(ice, res, level, layer, num_layers,
+                                   ISL_AUX_USAGE_NONE);
+   }
+}
+
+void
+crocus_resource_get_image_offset(struct crocus_resource *res,
+                                 uint32_t level, uint32_t z,
+                                 uint32_t *x, uint32_t *y);
+static inline enum isl_aux_usage
+crocus_resource_texture_aux_usage(const struct crocus_resource *res)
+{
+   return res->aux.usage == ISL_AUX_USAGE_MCS ? ISL_AUX_USAGE_MCS : ISL_AUX_USAGE_NONE;
+}
+
+void crocus_resource_prepare_texture(struct crocus_context *ice,
+                                     struct crocus_resource *res,
+                                     enum isl_format view_format,
+                                     uint32_t start_level, uint32_t num_levels,
+                                     uint32_t start_layer, uint32_t num_layers);
+
+static inline bool
+crocus_resource_unfinished_aux_import(struct crocus_resource *res)
+{
+   return res->base.next != NULL && res->mod_info &&
+      res->mod_info->aux_usage != ISL_AUX_USAGE_NONE;
+}
+
+void crocus_resource_finish_aux_import(struct pipe_screen *pscreen,
+                                       struct crocus_resource *res);
+
+bool crocus_has_invalid_primary(const struct crocus_resource *res,
+                                unsigned start_level, unsigned num_levels,
+                                unsigned start_layer, unsigned num_layers);
+
+void crocus_resource_check_level_layer(const struct crocus_resource *res,
+                                       uint32_t level, uint32_t layer);
+
+bool crocus_resource_level_has_hiz(const struct crocus_resource *res,
+                                   uint32_t level);
+bool crocus_has_color_unresolved(const struct crocus_resource *res,
+                                 unsigned start_level, unsigned num_levels,
+                                 unsigned start_layer, unsigned num_layers);
+
+enum isl_aux_usage crocus_resource_render_aux_usage(struct crocus_context *ice,
+                                                    struct crocus_resource *res,
+                                                    enum isl_format render_fmt,
+                                                    bool blend_enabled,
+                                                    bool draw_aux_disabled);
+void crocus_resource_prepare_render(struct crocus_context *ice,
+                                    struct crocus_resource *res, uint32_t level,
+                                    uint32_t start_layer, uint32_t layer_count,
+                                    enum isl_aux_usage aux_usage);
+void crocus_resource_finish_render(struct crocus_context *ice,
+                                   struct crocus_resource *res, uint32_t level,
+                                   uint32_t start_layer, uint32_t layer_count,
+                                   enum isl_aux_usage aux_usage);
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c
new file mode 100644
index 00000000000..d5331d66730
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_screen.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_screen.c
+ *
+ * Screen related driver hooks and capability lists.
+ *
+ * A program may use multiple rendering contexts (crocus_context), but
+ * they all share a common screen (crocus_screen).  Global driver state
+ * can be stored in the screen; it may be accessed by multiple threads.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/debug.h"
+#include "util/u_inlines.h"
+#include "util/format/u_format.h"
+#include "util/u_transfer_helper.h"
+#include "util/u_upload_mgr.h"
+#include "util/ralloc.h"
+#include "util/xmlconfig.h"
+#include "drm-uapi/i915_drm.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_fence.h"
+#include "crocus_pipe.h"
+#include "crocus_resource.h"
+#include "crocus_screen.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/common/intel_gem.h"
+#include "intel/common/intel_l3_config.h"
+#include "crocus_monitor.h"
+
+#define genX_call(devinfo, func, ...)                   \
+   switch ((devinfo)->verx10) {                         \
+   case 75:                                             \
+      gfx75_##func(__VA_ARGS__);                        \
+      break;                                            \
+   case 70:                                             \
+      gfx7_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 60:                                             \
+      gfx6_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 50:                                             \
+      gfx5_##func(__VA_ARGS__);                         \
+      break;                                            \
+   case 45:                                             \
+      gfx45_##func(__VA_ARGS__);                        \
+      break;                                            \
+   case 40:                                             \
+      gfx4_##func(__VA_ARGS__);                         \
+      break;                                            \
+   default:                                             \
+      unreachable("Unknown hardware generation");       \
+   }
+
+static void
+crocus_flush_frontbuffer(struct pipe_screen *_screen,
+                         struct pipe_context *_pipe,
+                         struct pipe_resource *resource,
+                         unsigned level, unsigned layer,
+                         void *context_private, struct pipe_box *box)
+{
+}
+
+static const char *
+crocus_get_vendor(struct pipe_screen *pscreen)
+{
+   return "Intel";
+}
+
+static const char *
+crocus_get_device_vendor(struct pipe_screen *pscreen)
+{
+   return "Intel";
+}
+
+static const char *
+crocus_get_name(struct pipe_screen *pscreen)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   static char buf[128];
+
+   const char *name = intel_get_device_name(screen->pci_id);
+
+   if (!name)
+      name = "Intel Unknown";
+
+   snprintf(buf, sizeof(buf), "Mesa %s", name);
+   return buf;
+}
+
+static uint64_t
+get_aperture_size(int fd)
+{
+   struct drm_i915_gem_get_aperture aperture = {};
+   intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
+   return aperture.aper_size;
+}
+
+static int
+crocus_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   switch (param) {
+   case PIPE_CAP_NPOT_TEXTURES:
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+   case PIPE_CAP_POINT_SPRITE:
+   case PIPE_CAP_OCCLUSION_QUERY:
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+   case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD:
+   case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES:
+   case PIPE_CAP_VERTEX_SHADER_SATURATE:
+   case PIPE_CAP_PRIMITIVE_RESTART:
+   case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX:
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+   case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+   case PIPE_CAP_CONDITIONAL_RENDER:
+   case PIPE_CAP_TEXTURE_BARRIER:
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+   case PIPE_CAP_START_INSTANCE:
+   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+   case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
+   case PIPE_CAP_ACCELERATED:
+   case PIPE_CAP_UMA:
+   case PIPE_CAP_CLIP_HALFZ:
+   case PIPE_CAP_TGSI_TEXCOORD:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+   case PIPE_CAP_TGSI_TEX_TXF_LZ:
+   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_TGSI_VOTE:
+   case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+   case PIPE_CAP_TEXTURE_GATHER_SM5:
+   case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
+   case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS:
+   case PIPE_CAP_NIR_COMPACT_ARRAYS:
+   case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+   case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+   case PIPE_CAP_CS_DERIVED_SYSTEM_VALUES_SUPPORTED:
+   case PIPE_CAP_FENCE_SIGNAL:
+   case PIPE_CAP_DEMOTE_TO_HELPER_INVOCATION:
+      return true;
+   case PIPE_CAP_INT64:
+   case PIPE_CAP_INT64_DIVMOD:
+   case PIPE_CAP_TGSI_BALLOT:
+   case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_GL_CLAMP:
+      return false;
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+      return devinfo->ver <= 5;
+   case PIPE_CAP_TEXTURE_QUERY_LOD:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+      return devinfo->ver >= 5;
+   case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT:
+   case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+   case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
+   case PIPE_CAP_TGSI_CLOCK:
+   case PIPE_CAP_TGSI_TXQS:
+   case PIPE_CAP_COMPUTE:
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+   case PIPE_CAP_SHADER_SAMPLES_IDENTICAL:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+   case PIPE_CAP_GL_SPIRV:
+   case PIPE_CAP_GL_SPIRV_VARIABLE_POINTERS:
+   case PIPE_CAP_COMPUTE_SHADER_DERIVATIVES:
+   case PIPE_CAP_DOUBLES:
+      return devinfo->ver >= 7;
+   case PIPE_CAP_QUERY_BUFFER_OBJECT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+      return devinfo->is_haswell;
+   case PIPE_CAP_CULL_DISTANCE:
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE:
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+   case PIPE_CAP_SAMPLE_SHADING:
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+   case PIPE_CAP_QUERY_SO_OVERFLOW:
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+   case PIPE_CAP_QUERY_TIMESTAMP:
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+   case PIPE_CAP_TEXTURE_SHADOW_LOD:
+   case PIPE_CAP_LOAD_CONSTBUF:
+   case PIPE_CAP_DRAW_PARAMETERS:
+   case PIPE_CAP_CLEAR_SCISSORED:
+      return devinfo->ver >= 6;
+   case PIPE_CAP_FBFETCH:
+      return devinfo->verx10 >= 45 ? BRW_MAX_DRAW_BUFFERS : 0;
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+      return devinfo->ver >= 6 ? 1 : 0;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return BRW_MAX_DRAW_BUFFERS;
+   case PIPE_CAP_MAX_TEXTURE_2D_SIZE:
+      if (devinfo->ver >= 7)
+         return 16384;
+      else
+         return 8192;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      if (devinfo->ver >= 7)
+         return CROCUS_MAX_MIPLEVELS; /* 16384x16384 */
+      else
+         return CROCUS_MAX_MIPLEVELS - 1; /* 8192x8192 */
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return 12; /* 2048x2048 */
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+      return (devinfo->ver >= 6) ? 4 : 0;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return devinfo->ver >= 7 ? 2048 : 512;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+      return BRW_MAX_SOL_BINDINGS / CROCUS_MAX_SOL_BUFFERS;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+      return BRW_MAX_SOL_BINDINGS;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL: {
+      if (devinfo->is_haswell)
+         return 460;
+      else if (devinfo->ver >= 7)
+         return 420;
+      else if (devinfo->ver >= 6)
+         return 330;
+      return 120;
+   }
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return devinfo->ver < 6 ? 120 : 130;
+
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+      /* 3DSTATE_CONSTANT_XS requires the start of UBOs to be 32B aligned */
+      return 32;
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return CROCUS_MAP_BUFFER_ALIGNMENT;
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+      /* Choose a cacheline (64 bytes) so that we can safely have the CPU and
+       * GPU writing the same SSBO on non-coherent systems (Atom CPUs).  With
+       * UBOs, the GPU never writes, so there's no problem.  For an SSBO, the
+       * GPU and the CPU can be updating disjoint regions of the buffer
+       * simultaneously and that will break if the regions overlap the same
+       * cacheline.
+       */
+      return devinfo->ver >= 7 ? 64 : 0;
+   case PIPE_CAP_MAX_SHADER_BUFFER_SIZE:
+      return devinfo->ver >= 7 ? (1 << 27) : 0;
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+      return 16; // XXX: u_screen says 256 is the minimum value...
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return true;
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+      return CROCUS_MAX_TEXTURE_BUFFER_SIZE;
+   case PIPE_CAP_MAX_VIEWPORTS:
+      return devinfo->ver >= 6 ? 16 : 1;
+   case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+      return devinfo->ver >= 6 ? 256 : 0;
+   case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+      return devinfo->ver >= 6 ? 1024 : 0;
+   case PIPE_CAP_MAX_GS_INVOCATIONS:
+      return devinfo->ver >= 7 ? 32 : 1;
+   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+      if (devinfo->ver >= 7)
+         return 4;
+      else if (devinfo->ver == 6)
+         return 1;
+      else
+         return 0;
+   case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+      if (devinfo->ver >= 7)
+         return -32;
+      else if (devinfo->ver == 6)
+         return -8;
+      else
+         return 0;
+   case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+      if (devinfo->ver >= 7)
+         return 31;
+      else if (devinfo->ver == 6)
+         return 7;
+      else
+         return 0;
+   case PIPE_CAP_MAX_VERTEX_STREAMS:
+      return devinfo->ver >= 7 ? 4 : 1;
+   case PIPE_CAP_VENDOR_ID:
+      return 0x8086;
+   case PIPE_CAP_DEVICE_ID:
+      return screen->pci_id;
+   case PIPE_CAP_VIDEO_MEMORY: {
+      /* Once a batch uses more than 75% of the maximum mappable size, we
+       * assume that there's some fragmentation, and we start doing extra
+       * flushing, etc.  That's the big cliff apps will care about.
+       */
+      const unsigned gpu_mappable_megabytes =
+         (screen->aperture_bytes * 3 / 4) / (1024 * 1024);
+
+      const long system_memory_pages = sysconf(_SC_PHYS_PAGES);
+      const long system_page_size = sysconf(_SC_PAGE_SIZE);
+
+      if (system_memory_pages <= 0 || system_page_size <= 0)
+         return -1;
+
+      const uint64_t system_memory_bytes =
+         (uint64_t) system_memory_pages * (uint64_t) system_page_size;
+
+      const unsigned system_memory_megabytes =
+         (unsigned) (system_memory_bytes / (1024 * 1024));
+
+      return MIN2(system_memory_megabytes, gpu_mappable_megabytes);
+   }
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_MAX_VARYINGS:
+      return (screen->devinfo.ver >= 6) ? 32 : 16;
+   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+      /* AMD_pinned_memory assumes the flexibility of using client memory
+       * for any buffer (incl. vertex buffers) which rules out the prospect
+       * of using snooped buffers, as using snooped buffers without
+       * cognizance is likely to be detrimental to performance and require
+       * extensive checking in the driver for correctness, e.g. to prevent
+       * illegal snoop <-> snoop transfers.
+       */
+      return devinfo->has_llc;
+   case PIPE_CAP_THROTTLE:
+      return screen->driconf.disable_throttling ? 0 : 1;
+
+   case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+      return PIPE_CONTEXT_PRIORITY_LOW |
+             PIPE_CONTEXT_PRIORITY_MEDIUM |
+             PIPE_CONTEXT_PRIORITY_HIGH;
+
+   case PIPE_CAP_FRONTEND_NOOP:
+      return true;
+      // XXX: don't hardcode 00:00:02.0 PCI here
+   case PIPE_CAP_PCI_GROUP:
+      return 0;
+   case PIPE_CAP_PCI_BUS:
+      return 0;
+   case PIPE_CAP_PCI_DEVICE:
+      return 2;
+   case PIPE_CAP_PCI_FUNCTION:
+      return 0;
+
+   default:
+      return u_pipe_screen_get_param_defaults(pscreen, param);
+   }
+   return 0;
+}
+
+/**
+ * Report floating-point screen capabilities (installed as the screen's
+ * get_paramf hook).
+ *
+ * Only the maximum line width depends on the device generation; every other
+ * value is a fixed constant.  Unknown caps hit unreachable().
+ */
+static float
+crocus_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+{
+   const struct intel_device_info *info =
+      &((struct crocus_screen *)pscreen)->devinfo;
+
+   switch (param) {
+   /* Line widths: ver 6 and later report a slightly larger maximum. */
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      return info->ver >= 6 ? 7.375f : 7.0f;
+
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return 255.0f;
+
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return 16.0f;
+
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 15.0f;
+
+   /* All conservative rasterization caps report zero. */
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
+
+   default:
+      unreachable("unknown param");
+   }
+}
+
+/**
+ * Report per-stage shader capabilities (installed as the screen's
+ * get_shader_param hook).
+ *
+ * Stages that the device generation does not expose report 0 for every cap.
+ * For the remaining stages the answer depends on the cap, the stage and the
+ * device generation.  Unknown caps hit unreachable().
+ */
+static int
+crocus_get_shader_param(struct pipe_screen *pscreen,
+                        enum pipe_shader_type p_stage,
+                        enum pipe_shader_cap param)
+{
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   /* Before ver 6, only vertex and fragment shaders are exposed. */
+   if (devinfo->ver < 6 &&
+       p_stage != PIPE_SHADER_VERTEX &&
+       p_stage != PIPE_SHADER_FRAGMENT)
+      return 0;
+
+   /* On ver 6, geometry shaders are exposed as well, but nothing else. */
+   if (devinfo->ver == 6 &&
+       p_stage != PIPE_SHADER_VERTEX &&
+       p_stage != PIPE_SHADER_FRAGMENT &&
+       p_stage != PIPE_SHADER_GEOMETRY)
+      return 0;
+
+   /* this is probably not totally correct.. but it's a start: */
+   switch (param) {
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+      return stage == MESA_SHADER_FRAGMENT ? 1024 : 16384;
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+      return stage == MESA_SHADER_FRAGMENT ? 1024 : 0;
+
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      /* NOTE(review): the return type is int, so UINT_MAX is converted on
+       * return - confirm callers expect that.
+       */
+      return UINT_MAX;
+
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      if (stage == MESA_SHADER_VERTEX ||
+          stage == MESA_SHADER_GEOMETRY)
+         return 16; /* Gen7 vec4 geom backend */
+      return 32;
+   case PIPE_SHADER_CAP_MAX_OUTPUTS:
+      return 32;
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+      return 16 * 1024 * sizeof(float);
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      return devinfo->ver >= 6 ? 16 : 1;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return 256; /* GL_MAX_PROGRAM_TEMPORARIES_ARB */
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      return 0;
+   case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+   case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+      /* Lie about these to avoid st/mesa's GLSL IR lowering of indirects,
+       * which we don't want.  Our compiler backend will check brw_compiler's
+       * options and call nir_lower_indirect_derefs appropriately anyway.
+       */
+      return true;
+   case PIPE_SHADER_CAP_SUBROUTINES:
+      return 0;
+   case PIPE_SHADER_CAP_INTEGERS:
+      return 1;
+   case PIPE_SHADER_CAP_INT64_ATOMICS:
+   case PIPE_SHADER_CAP_FP16:
+      return 0;
+   case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+   case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
+      /* Only Haswell reports the full CROCUS_MAX_TEXTURE_SAMPLERS. */
+      return devinfo->is_haswell ? CROCUS_MAX_TEXTURE_SAMPLERS : 16;
+   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+      /* Images are only reported for fragment and compute on ver 7+. */
+      if (devinfo->ver >= 7 &&
+          (p_stage == PIPE_SHADER_FRAGMENT ||
+           p_stage == PIPE_SHADER_COMPUTE))
+         return CROCUS_MAX_TEXTURE_SAMPLERS;
+      return 0;
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+      return devinfo->ver >= 7 ? (CROCUS_MAX_ABOS + CROCUS_MAX_SSBOS) : 0;
+   case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+   case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+      return 0;
+   case PIPE_SHADER_CAP_PREFERRED_IR:
+      return PIPE_SHADER_IR_NIR;
+   case PIPE_SHADER_CAP_SUPPORTED_IRS:
+      /* Bitmask of supported IRs: NIR only. */
+      return 1 << PIPE_SHADER_IR_NIR;
+   case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
+      return 1;
+   case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+   case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+   case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+   case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
+   case PIPE_SHADER_CAP_FP16_DERIVATIVES:
+   case PIPE_SHADER_CAP_INT16:
+   case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS:
+   case PIPE_SHADER_CAP_FP16_CONST_BUFFERS:
+      return 0;
+   default:
+      unreachable("unknown shader param");
+   }
+}
+
+/**
+ * Report compute capabilities (installed as the screen's get_compute_param
+ * hook).
+ *
+ * For a reported cap, the value is copied into ret when ret is non-NULL and
+ * the size of that value in bytes is returned.  Returns 0 on devices older
+ * than ver 7 and for the caps that are not reported.  Unknown caps hit
+ * unreachable().
+ */
+static int
+crocus_get_compute_param(struct pipe_screen *pscreen,
+                         enum pipe_shader_ir ir_type,
+                         enum pipe_compute_cap param,
+                         void *ret)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)pscreen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   /* Thread count is capped at 64; each thread contributes 32 invocations. */
+   const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
+   const uint32_t max_invocations = 32 * max_threads;
+
+   if (devinfo->ver < 7)
+      return 0;
+
+/* Copy the array literal x into ret (if provided) and return its size.
+ * x must be an array so that sizeof(x) is the size of the whole value.
+ */
+#define RET(x) do {                  \
+   if (ret)                          \
+      memcpy(ret, x, sizeof(x));     \
+   return sizeof(x);                 \
+} while (0)
+
+   switch (param) {
+   case PIPE_COMPUTE_CAP_ADDRESS_BITS:
+      RET((uint32_t []){ 32 });
+
+   case PIPE_COMPUTE_CAP_IR_TARGET:
+      if (ret)
+         strcpy(ret, "gen");
+      /* strlen("gen") plus the terminating NUL. */
+      return 4;
+
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      RET((uint64_t []) { 3 });
+
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      RET(((uint64_t []) { 65535, 65535, 65535 }));
+
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      /* MaxComputeWorkGroupSize[0..2] */
+      RET(((uint64_t []) {max_invocations, max_invocations, max_invocations}));
+
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      /* MaxComputeWorkGroupInvocations */
+      RET((uint64_t []) { max_invocations });
+
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+      /* MaxComputeSharedMemorySize */
+      RET((uint64_t []) { 64 * 1024 });
+
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      RET((uint32_t []) { 1 });
+
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      RET((uint32_t []) { BRW_SUBGROUP_SIZE });
+
+   case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK:
+      RET((uint64_t []) { max_invocations });
+
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE:
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE:
+
+      // XXX: I think these are for Clover...
+      return 0;
+
+   default:
+      unreachable("unknown compute param");
+   }
+}
+
+/* RET is a helper private to crocus_get_compute_param(); keep it from
+ * leaking into the rest of the file.
+ */
+#undef RET
+
+/**
+ * Read the GPU timestamp register (installed as the screen's get_timestamp
+ * hook).
+ *
+ * The raw register value is scaled with intel_device_info_timebase_scale()
+ * and then truncated to TIMESTAMP_BITS bits.
+ */
+static uint64_t
+crocus_get_timestamp(struct pipe_screen *pscreen)
+{
+   struct crocus_screen *screen = (struct crocus_screen *) pscreen;
+   const unsigned TIMESTAMP = 0x2358;
+   /* Start from a defined value: the status of the register read is not
+    * checked here, so never scale an indeterminate value.
+    */
+   uint64_t result = 0;
+
+   crocus_reg_read(screen->bufmgr, TIMESTAMP | 1, &result);
+
+   result = intel_device_info_timebase_scale(&screen->devinfo, result);
+   result &= (1ull << TIMESTAMP_BITS) - 1;
+
+   return result;
+}
+
+/**
+ * Tear down a screen and release what it holds.
+ *
+ * Reached from crocus_pscreen_unref() once the reference count drops to
+ * zero.
+ */
+void
+crocus_screen_destroy(struct crocus_screen *screen)
+{
+   u_transfer_helper_destroy(screen->base.transfer_helper);
+   crocus_bufmgr_unref(screen->bufmgr);
+   disk_cache_destroy(screen->disk_cache);
+   /* winsys_fd is the fd that was handed to crocus_screen_create(). */
+   close(screen->winsys_fd);
+   /* The screen itself was allocated with rzalloc(). */
+   ralloc_free(screen);
+}
+
+/**
+ * Installed as the screen's destroy hook: drops one reference instead of
+ * destroying the screen unconditionally.
+ */
+static void
+crocus_screen_unref(struct pipe_screen *pscreen)
+{
+   crocus_pscreen_unref(pscreen);
+}
+
+/**
+ * Installed as the screen's query_memory_info hook.
+ *
+ * No memory statistics are gathered yet, so every field is reported as zero
+ * rather than leaving the caller's structure unwritten.
+ */
+static void
+crocus_query_memory_info(struct pipe_screen *pscreen,
+                         struct pipe_memory_info *info)
+{
+   memset(info, 0, sizeof(*info));
+}
+
+/**
+ * Return the NIR compiler options for one shader stage (installed as the
+ * screen's get_compiler_options hook).  Only NIR is accepted as the IR.
+ */
+static const void *
+crocus_get_compiler_options(struct pipe_screen *pscreen,
+                            enum pipe_shader_ir ir,
+                            enum pipe_shader_type pstage)
+{
+   const gl_shader_stage mesa_stage = stage_from_pipe(pstage);
+   const struct brw_compiler *compiler =
+      ((struct crocus_screen *) pscreen)->compiler;
+
+   assert(ir == PIPE_SHADER_IR_NIR);
+
+   return compiler->glsl_compiler_options[mesa_stage].NirOptions;
+}
+
+/**
+ * Hand out the screen's disk cache (installed as the screen's
+ * get_disk_shader_cache hook).
+ */
+static struct disk_cache *
+crocus_get_disk_shader_cache(struct pipe_screen *pscreen)
+{
+   return ((struct crocus_screen *) pscreen)->disk_cache;
+}
+
+/**
+ * Look up the default L3 configuration for a device.
+ *
+ * The data-cache partition is always requested; the SLM partition is
+ * requested only when the configuration is for compute.
+ */
+static const struct intel_l3_config *
+crocus_get_default_l3_config(const struct intel_device_info *devinfo,
+                             bool compute)
+{
+   const struct intel_l3_weights weights =
+      intel_get_default_l3_weights(devinfo,
+                                   true /* wants_dc_cache */,
+                                   compute /* has_slm */);
+
+   return intel_get_l3_config(devinfo, weights);
+}
+
+/**
+ * Compiler debug-log callback: forwards the printf-style message to the
+ * pipe_debug_callback passed as data, tagged as shader info.  Does nothing
+ * when no debug_message handler is installed.
+ */
+static void
+crocus_shader_debug_log(void *data, const char *fmt, ...)
+{
+   struct pipe_debug_callback *cb = data;
+
+   if (cb->debug_message) {
+      unsigned msg_id = 0;
+      va_list ap;
+
+      va_start(ap, fmt);
+      cb->debug_message(cb->data, &msg_id, PIPE_DEBUG_TYPE_SHADER_INFO,
+                        fmt, ap);
+      va_end(ap);
+   }
+}
+
+/**
+ * Compiler perf-log callback.
+ *
+ * The printf-style message goes to stderr when DEBUG_PERF is set in
+ * INTEL_DEBUG, and to the pipe_debug_callback passed as data (tagged as perf
+ * info) when it has a debug_message handler.  Both may happen for the same
+ * message.
+ */
+static void
+crocus_shader_perf_log(void *data, const char *fmt, ...)
+{
+   struct pipe_debug_callback *cb = data;
+   unsigned msg_id = 0;
+   va_list ap;
+
+   va_start(ap, fmt);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      /* Print from a copy so the original list stays usable below. */
+      va_list stderr_ap;
+
+      va_copy(stderr_ap, ap);
+      vfprintf(stderr, fmt, stderr_ap);
+      va_end(stderr_ap);
+   }
+
+   if (cb->debug_message)
+      cb->debug_message(cb->data, &msg_id, PIPE_DEBUG_TYPE_PERF_INFO, fmt, ap);
+
+   va_end(ap);
+}
+
+/**
+ * Probe whether bit-6 address swizzling is in effect.
+ *
+ * Allocates a small X-tiled buffer, asks for its tiling/swizzle mode and
+ * frees it again.  Returns false when the probe buffer cannot be allocated.
+ */
+static bool
+crocus_detect_swizzling(struct crocus_screen *screen)
+{
+   /* Broadwell PRM says:
+    *
+    *   "Before Gen8, there was a historical configuration control field to
+    *    swizzle address bit[6] for in X/Y tiling modes. This was set in three
+    *    different places: TILECTL[1:0], ARB_MODE[5:4], and
+    *    DISP_ARB_CTL[14:13].
+    *
+    *    For Gen8 and subsequent generations, the swizzle fields are all
+    *    reserved, and the CPU's memory controller performs all address
+    *    swizzling modifications."
+    */
+   uint32_t tiling_mode = I915_TILING_X;
+   uint32_t swizzle = 0;
+   struct crocus_bo *probe_bo;
+
+   probe_bo = crocus_bo_alloc_tiled(screen->bufmgr, "swizzle test", 32768,
+                                    0, tiling_mode, 512, 0);
+   if (!probe_bo)
+      return false;
+
+   crocus_bo_get_tiling(probe_bo, &tiling_mode, &swizzle);
+   crocus_bo_unreference(probe_bo);
+
+   return swizzle != I915_BIT_6_SWIZZLE_NONE;
+}
+
+/**
+ * Create a crocus screen for the DRM device behind fd.
+ *
+ * Returns NULL when the screen cannot be allocated, the device info cannot
+ * be queried, the device is ver 8 or newer, or no buffer manager can be
+ * obtained.  On those failure paths the partially initialised screen is
+ * freed and fd is left untouched.  On success fd is stored as the screen's
+ * winsys_fd, which crocus_screen_destroy() closes, and the returned screen
+ * holds one reference.
+ *
+ * config supplies the driconf options queried below (bo_reuse,
+ * dual_color_blend_by_location, disable_throttling, always_flush_cache).
+ */
+struct pipe_screen *
+crocus_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   struct crocus_screen *screen = rzalloc(NULL, struct crocus_screen);
+   if (!screen)
+      return NULL;
+
+   if (!intel_get_device_info_from_fd(fd, &screen->devinfo))
+      goto fail;
+   screen->pci_id = screen->devinfo.chipset_id;
+   screen->no_hw = screen->devinfo.no_hw;
+
+   /* Devices of ver 8 and newer are rejected by this driver. */
+   if (screen->devinfo.ver >= 8)
+      goto fail;
+
+   p_atomic_set(&screen->refcount, 1);
+
+   screen->aperture_bytes = get_aperture_size(fd);
+
+   if (getenv("INTEL_NO_HW") != NULL)
+      screen->no_hw = true;
+
+   bool bo_reuse = false;
+   int bo_reuse_mode = driQueryOptioni(config->options, "bo_reuse");
+   switch (bo_reuse_mode) {
+   case DRI_CONF_BO_REUSE_DISABLED:
+      break;
+   case DRI_CONF_BO_REUSE_ALL:
+      bo_reuse = true;
+      break;
+   }
+
+   screen->bufmgr = crocus_bufmgr_get_for_fd(&screen->devinfo, fd, bo_reuse);
+   if (!screen->bufmgr)
+      goto fail;
+   screen->fd = crocus_bufmgr_get_fd(screen->bufmgr);
+   screen->winsys_fd = fd;
+
+   screen->has_swizzling = crocus_detect_swizzling(screen);
+   brw_process_intel_debug_variable();
+
+   screen->driconf.dual_color_blend_by_location =
+      driQueryOptionb(config->options, "dual_color_blend_by_location");
+   screen->driconf.disable_throttling =
+      driQueryOptionb(config->options, "disable_throttling");
+   screen->driconf.always_flush_cache =
+      driQueryOptionb(config->options, "always_flush_cache");
+
+   screen->precompile = env_var_as_boolean("shader_precompile", true);
+
+   isl_device_init(&screen->isl_dev, &screen->devinfo,
+                   screen->has_swizzling);
+
+   screen->compiler = brw_compiler_create(screen, &screen->devinfo);
+   screen->compiler->shader_debug_log = crocus_shader_debug_log;
+   screen->compiler->shader_perf_log = crocus_shader_perf_log;
+   screen->compiler->supports_pull_constants = false;
+   screen->compiler->supports_shader_constants = false;
+   screen->compiler->compact_params = false;
+   screen->compiler->constant_buffer_0_is_relative = true;
+
+   /* Default L3 configurations are only looked up on ver 7. */
+   if (screen->devinfo.ver == 7) {
+      screen->l3_config_3d = crocus_get_default_l3_config(&screen->devinfo, false);
+      screen->l3_config_cs = crocus_get_default_l3_config(&screen->devinfo, true);
+   }
+
+   crocus_disk_cache_init(screen);
+
+   slab_create_parent(&screen->transfer_pool,
+                      sizeof(struct crocus_transfer), 64);
+
+   screen->subslice_total = intel_device_info_subslice_total(&screen->devinfo);
+   assert(screen->subslice_total >= 1);
+
+   struct pipe_screen *pscreen = &screen->base;
+
+   crocus_init_screen_fence_functions(pscreen);
+   crocus_init_screen_resource_functions(pscreen);
+
+   pscreen->destroy = crocus_screen_unref;
+   pscreen->get_name = crocus_get_name;
+   pscreen->get_vendor = crocus_get_vendor;
+   pscreen->get_device_vendor = crocus_get_device_vendor;
+   pscreen->get_param = crocus_get_param;
+   pscreen->get_shader_param = crocus_get_shader_param;
+   pscreen->get_compute_param = crocus_get_compute_param;
+   pscreen->get_paramf = crocus_get_paramf;
+   pscreen->get_compiler_options = crocus_get_compiler_options;
+   pscreen->get_disk_shader_cache = crocus_get_disk_shader_cache;
+   pscreen->is_format_supported = crocus_is_format_supported;
+   pscreen->context_create = crocus_create_context;
+   pscreen->flush_frontbuffer = crocus_flush_frontbuffer;
+   pscreen->get_timestamp = crocus_get_timestamp;
+   pscreen->query_memory_info = crocus_query_memory_info;
+   pscreen->get_driver_query_group_info = crocus_get_monitor_group_info;
+   pscreen->get_driver_query_info = crocus_get_monitor_info;
+
+   genX_call(&screen->devinfo, init_screen_state, screen);
+   genX_call(&screen->devinfo, init_screen_query, screen);
+   return pscreen;
+
+fail:
+   /* Nothing but the screen allocation exists on the early-failure paths;
+    * fd has not been stored in the screen yet, so it is not closed here.
+    */
+   ralloc_free(screen);
+   return NULL;
+}
diff --git a/src/gallium/drivers/crocus/crocus_screen.h b/src/gallium/drivers/crocus/crocus_screen.h
new file mode 100644
index 00000000000..4d942eb8415
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_screen.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef CROCUS_SCREEN_H
+#define CROCUS_SCREEN_H
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "frontend/drm_driver.h"
+#include "util/disk_cache.h"
+#include "util/slab.h"
+#include "util/u_screen.h"
+#include "intel/dev/intel_device_info.h"
+#include "intel/isl/isl.h"
+#include "crocus_bufmgr.h"
+#include "compiler/shader_enums.h"
+
+struct crocus_monitor_config;
+struct crocus_resource;
+struct crocus_context;
+struct crocus_sampler_state;
+struct brw_vue_map;
+struct brw_tcs_prog_key;
+struct brw_tes_prog_key;
+struct brw_cs_prog_key;
+struct brw_wm_prog_key;
+struct brw_vs_prog_key;
+struct brw_gs_prog_key;
+struct shader_info;
+
+/*
+ * Read or write x through a volatile-qualified lvalue of its own type.
+ * NOTE(review): WRITE_ONCE expands to an unparenthesized assignment, so it
+ * should only be used as a standalone statement.
+ */
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+#define WRITE_ONCE(x, v) *(volatile __typeof__(x) *)&(x) = (v)
+
+/* Driver-wide limits.
+ * NOTE(review): SOL presumably stands for stream output, and the map buffer
+ * alignment is presumably in bytes - confirm against the users.
+ */
+#define CROCUS_MAX_TEXTURE_SAMPLERS 32
+#define CROCUS_MAX_SOL_BUFFERS 4
+#define CROCUS_MAP_BUFFER_ALIGNMENT 64
+
+
+/**
+ * Virtual table for generation-specific (genxml) function calls.
+ *
+ * One instance is embedded in struct crocus_screen as its vtbl member.
+ */
+struct crocus_vtable {
+   /* Context/batch setup and teardown, and 3D state upload for a draw. */
+   void (*destroy_state)(struct crocus_context *ice);
+   void (*init_render_context)(struct crocus_batch *batch);
+   void (*init_compute_context)(struct crocus_batch *batch);
+   void (*upload_render_state)(struct crocus_context *ice,
+                               struct crocus_batch *batch,
+                               const struct pipe_draw_info *draw,
+                               unsigned drawid_offset,
+                               const struct pipe_draw_indirect_info *indirect,
+                               const struct pipe_draw_start_count_bias *sc);
+   void (*update_surface_base_address)(struct crocus_batch *batch);
+
+   /* Compute state upload for a grid launch. */
+   void (*upload_compute_state)(struct crocus_context *ice,
+                                struct crocus_batch *batch,
+                                const struct pipe_grid_info *grid);
+   void (*rebind_buffer)(struct crocus_context *ice,
+                         struct crocus_resource *res);
+   void (*resolve_conditional_render)(struct crocus_context *ice);
+   void (*emit_compute_predicate)(struct crocus_batch *batch);
+
+   /* Register load/store and memory write/copy helpers operating on a
+    * batch, in 32- and 64-bit variants.
+    */
+   void (*load_register_reg32)(struct crocus_batch *batch, uint32_t dst,
+                               uint32_t src);
+   void (*load_register_reg64)(struct crocus_batch *batch, uint32_t dst,
+                               uint32_t src);
+   void (*load_register_imm32)(struct crocus_batch *batch, uint32_t reg,
+                               uint32_t val);
+   void (*load_register_imm64)(struct crocus_batch *batch, uint32_t reg,
+                               uint64_t val);
+   void (*load_register_mem32)(struct crocus_batch *batch, uint32_t reg,
+                               struct crocus_bo *bo, uint32_t offset);
+   void (*load_register_mem64)(struct crocus_batch *batch, uint32_t reg,
+                               struct crocus_bo *bo, uint32_t offset);
+   void (*store_register_mem32)(struct crocus_batch *batch, uint32_t reg,
+                                struct crocus_bo *bo, uint32_t offset,
+                                bool predicated);
+   void (*store_register_mem64)(struct crocus_batch *batch, uint32_t reg,
+                                struct crocus_bo *bo, uint32_t offset,
+                                bool predicated);
+   void (*store_data_imm32)(struct crocus_batch *batch,
+                            struct crocus_bo *bo, uint32_t offset,
+                            uint32_t value);
+   void (*store_data_imm64)(struct crocus_batch *batch,
+                            struct crocus_bo *bo, uint32_t offset,
+                            uint64_t value);
+   void (*copy_mem_mem)(struct crocus_batch *batch,
+                        struct crocus_bo *dst_bo, uint32_t dst_offset,
+                        struct crocus_bo *src_bo, uint32_t src_offset,
+                        unsigned bytes);
+   void (*emit_raw_pipe_control)(struct crocus_batch *batch,
+                                 const char *reason, uint32_t flags,
+                                 struct crocus_bo *bo, uint32_t offset,
+                                 uint64_t imm);
+
+   void (*emit_mi_report_perf_count)(struct crocus_batch *batch,
+                                     struct crocus_bo *bo,
+                                     uint32_t offset_in_bytes,
+                                     uint32_t report_id);
+
+   uint32_t *(*create_so_decl_list)(const struct pipe_stream_output_info *sol,
+                                    const struct brw_vue_map *vue_map);
+
+   /* Per-stage hooks that receive the brw_*_prog_key to fill in. */
+   void (*populate_vs_key)(const struct crocus_context *ice,
+                           const struct shader_info *info,
+                           gl_shader_stage last_stage,
+                           struct brw_vs_prog_key *key);
+   void (*populate_tcs_key)(const struct crocus_context *ice,
+                            struct brw_tcs_prog_key *key);
+   void (*populate_tes_key)(const struct crocus_context *ice,
+                            const struct shader_info *info,
+                            gl_shader_stage last_stage,
+                            struct brw_tes_prog_key *key);
+   void (*populate_gs_key)(const struct crocus_context *ice,
+                           const struct shader_info *info,
+                           gl_shader_stage last_stage,
+                           struct brw_gs_prog_key *key);
+   void (*populate_fs_key)(const struct crocus_context *ice,
+                           const struct shader_info *info,
+                           struct brw_wm_prog_key *key);
+   void (*populate_cs_key)(const struct crocus_context *ice,
+                           struct brw_cs_prog_key *key);
+   void (*lost_genx_state)(struct crocus_context *ice, struct crocus_batch *batch);
+
+   void (*finish_batch)(struct crocus_batch *batch); /* haswell only */
+
+   void (*upload_urb_fence)(struct crocus_batch *batch); /* gen4/5 only */
+
+   /* Blit and copy hooks; the _blt suffix presumably refers to the blitter
+    * engine - TODO confirm.
+    */
+   bool (*blit_blt)(struct crocus_batch *batch,
+                    const struct pipe_blit_info *info);
+   bool (*copy_region_blt)(struct crocus_batch *batch,
+                           struct crocus_resource *dst,
+                           unsigned dst_level,
+                           unsigned dstx, unsigned dsty, unsigned dstz,
+                           struct crocus_resource *src,
+                           unsigned src_level,
+                           const struct pipe_box *src_box);
+   bool (*calculate_urb_fence)(struct crocus_batch *batch, unsigned csize,
+                               unsigned vsize, unsigned sfsize);
+   void (*batch_reset_dirty)(struct crocus_batch *batch);
+   unsigned (*translate_prim_type)(enum pipe_prim_type prim, uint8_t verts_per_patch);
+
+   void (*update_so_strides)(struct crocus_context *ice,
+                             uint16_t *strides);
+
+   uint32_t (*get_so_offset)(struct pipe_stream_output_target *tgt);
+};
+
+/**
+ * Driver-private screen object.  The pipe_screen base is the first member,
+ * so a struct pipe_screen pointer handed out by crocus_screen_create() can
+ * be cast back to struct crocus_screen.
+ */
+struct crocus_screen {
+   struct pipe_screen base;
+
+   /** Reference count; see crocus_pscreen_ref() / crocus_pscreen_unref(). */
+   uint32_t refcount;
+
+   /** Global slab allocator for crocus_transfer_map objects */
+   struct slab_parent_pool transfer_pool;
+
+   /** drm device file descriptor, shared with bufmgr, do not close. */
+   int fd;
+
+   /**
+    * drm device file descriptor used for window system integration, owned
+    * by crocus_screen, can be a different DRM instance than fd.
+    */
+   int winsys_fd;
+
+   /** PCI ID for our GPU device */
+   int pci_id;
+
+   /** Copied from devinfo.no_hw, or forced on when INTEL_NO_HW is set. */
+   bool no_hw;
+
+   /** Generation-specific function table. */
+   struct crocus_vtable vtbl;
+
+   /** Global program_string_id counter (see get_program_string_id()) */
+   unsigned program_id;
+
+   /** Precompile shaders at link time?  (Can be disabled for debugging.) */
+   bool precompile;
+
+   /** driconf options and application workarounds */
+   struct {
+      /** Dual color blend by location instead of index (for broken apps) */
+      bool dual_color_blend_by_location;
+      bool disable_throttling;
+      bool always_flush_cache;
+   } driconf;
+
+   /** Result of intel_device_info_subslice_total() for devinfo. */
+   unsigned subslice_total;
+
+   /** Value returned by get_aperture_size() at screen creation. */
+   uint64_t aperture_bytes;
+
+   struct intel_device_info devinfo;
+   struct isl_device isl_dev;
+   struct crocus_bufmgr *bufmgr;
+   struct brw_compiler *compiler;
+   struct crocus_monitor_config *monitor_cfg;
+   /** Result of the bit-6 swizzle probe done at screen creation. */
+   bool has_swizzling;
+
+   /** Default L3 configs for 3D and compute; only filled in on ver 7. */
+   const struct intel_l3_config *l3_config_3d;
+   const struct intel_l3_config *l3_config_cs;
+
+   struct disk_cache *disk_cache;
+};
+
+/**
+ * Create a screen for the DRM device behind fd; returns NULL on failure.
+ * On success fd is kept as the screen's winsys_fd.
+ */
+struct pipe_screen *
+crocus_screen_create(int fd, const struct pipe_screen_config *config);
+
+/**
+ * Free a screen and what it holds, closing its winsys_fd.  Normally reached
+ * through crocus_pscreen_unref() when the last reference goes away.
+ */
+void crocus_screen_destroy(struct crocus_screen *screen);
+
+/**
+ * Take one more reference on the screen behind pscreen and hand pscreen
+ * back to the caller.
+ */
+UNUSED static inline struct pipe_screen *
+crocus_pscreen_ref(struct pipe_screen *pscreen)
+{
+   p_atomic_inc(&((struct crocus_screen *) pscreen)->refcount);
+
+   return pscreen;
+}
+
+/**
+ * Drop one reference on the screen behind pscreen; the screen is destroyed
+ * when the count reaches zero.
+ */
+UNUSED static inline void
+crocus_pscreen_unref(struct pipe_screen *pscreen)
+{
+   struct crocus_screen *cscreen = (struct crocus_screen *) pscreen;
+   const bool last_ref = p_atomic_dec_zero(&cscreen->refcount);
+
+   if (last_ref)
+      crocus_screen_destroy(cscreen);
+}
+
+bool
+crocus_is_format_supported(struct pipe_screen *pscreen,
+                           enum pipe_format format,
+                           enum pipe_texture_target target,
+                           unsigned sample_count,
+                           unsigned storage_sample_count,
+                           unsigned usage);
+
+void crocus_disk_cache_init(struct crocus_screen *screen);
+
+#endif
diff --git a/src/gallium/drivers/crocus/crocus_state.c b/src/gallium/drivers/crocus/crocus_state.c
new file mode 100644
index 00000000000..7202140df02
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_state.c
@@ -0,0 +1,8382 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file crocus_state.c
+ *
+ * ============================= GENXML CODE =============================
+ *              [This file is compiled once per generation.]
+ * =======================================================================
+ *
+ * This is the main state upload code.
+ *
+ * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
+ * complex, or highly reusable state can be created once, and bound and
+ * rebound multiple times.  This is modeled with the pipe->create_*_state()
+ * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
+ * streamed out on the fly, via pipe->set_*_state() hooks.
+ *
+ * OpenGL involves frequently mutating context state, which is mirrored in
+ * core Mesa by highly mutable data structures.  However, most applications
+ * typically draw the same things over and over - from frame to frame, most
+ * of the same objects are still visible and need to be redrawn.  So, rather
+ * than inventing new state all the time, applications usually mutate to swap
+ * between known states that we've seen before.
+ *
+ * Gallium isolates us from this mutation by tracking API state, and
+ * distilling it into a set of Constant State Objects, or CSOs.  Large,
+ * complex, or typically reusable state can be created once, then reused
+ * multiple times.  Drivers can create and store their own associated data.
+ * This create/bind model corresponds to the pipe->create_*_state() and
+ * pipe->bind_*_state() driver hooks.
+ *
+ * Some state is cheap to create, or expected to be highly dynamic.  Rather
+ * than creating and caching piles of CSOs for these, Gallium simply streams
+ * them out, via the pipe->set_*_state() driver hooks.
+ *
+ * To reduce draw time overhead, we try to compute as much state at create
+ * time as possible.  Wherever possible, we translate the Gallium pipe state
+ * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
+ * we can simply memcpy them into a batch buffer.
+ *
+ * No hardware matches the abstraction perfectly, so some commands require
+ * information from multiple CSOs.  In this case, we can store two copies
+ * of the packet (one in each CSO), and simply | together their DWords at
+ * draw time.  Sometimes the second set is trivial (one or two fields), so
+ * we simply pack it at draw time.
+ *
+ * There are two main components in the file below.  First, the CSO hooks
+ * create/bind/track state.  The second are the draw-time upload functions,
+ * crocus_upload_render_state() and crocus_upload_compute_state(), which read
+ * the context state and emit the commands into the actual batch.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+
+#if HAVE_VALGRIND
+#include <memcheck.h>
+#include <valgrind.h>
+#define VG(x) x
+#ifdef DEBUG
+#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#endif
+#else
+#define VG(x)
+#endif
+
+#include "drm-uapi/i915_drm.h"
+#include "intel/common/intel_l3_config.h"
+#include "intel/common/intel_sample_positions.h"
+#include "intel/compiler/brw_compiler.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/format/u_format.h"
+#include "util/half_float.h"
+#include "util/u_dual_blend.h"
+#include "util/u_framebuffer.h"
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+#include "util/u_transfer.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_viewport.h"
+#include "crocus_batch.h"
+#include "crocus_context.h"
+#include "crocus_defines.h"
+#include "crocus_pipe.h"
+#include "crocus_resource.h"
+
+#include "crocus_genx_macros.h"
+#include "intel/common/intel_guardband.h"
+
+/**
+ * Statically assert that PIPE_* enums match the hardware packets.
+ * (As long as they match, we don't need to translate them.)
+ *
+ * The body consists solely of compile-time assertions; the function only
+ * exists to give them a home, hence the UNUSED annotation.
+ */
+UNUSED static void pipe_asserts(void)
+{
+#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
+
+   /* pipe_logicop happens to match the hardware. */
+   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
+   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
+   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
+   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
+   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
+   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
+   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
+   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
+   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
+   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
+   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
+   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
+   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
+   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
+   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
+   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
+
+   /* pipe_blendfactor happens to match the hardware. */
+   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
+   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
+
+   /* pipe_blend_func happens to match the hardware. */
+   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
+   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
+   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
+   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
+   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
+
+   /* pipe_stencil_op happens to match the hardware.  Note the naming skew:
+    * Gallium's INCR/DECR are the saturating hardware ops, while Gallium's
+    * *_WRAP variants are the hardware's plain INCR/DECR.
+    */
+   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
+   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
+   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
+   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
+   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
+   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
+   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
+   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
+
+#if GFX_VER >= 6
+   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
+   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
+   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
+#endif
+#undef PIPE_ASSERT
+}
+
+/**
+ * Convert a Gallium primitive type into the hardware _3DPRIM_* value.
+ *
+ * For PIPE_PRIM_PATCHES the table holds _3DPRIM_PATCHLIST_1 - 1, so that
+ * adding verts_per_patch on top produces the final value.  For every other
+ * primitive verts_per_patch is ignored.
+ */
+static unsigned
+translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
+{
+   static const unsigned hw_topology[] = {
+      [PIPE_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
+      [PIPE_PRIM_LINES]                    = _3DPRIM_LINELIST,
+      [PIPE_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
+      [PIPE_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
+      [PIPE_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
+      [PIPE_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
+      [PIPE_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
+      [PIPE_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
+      [PIPE_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
+#if GFX_VER >= 6
+      [PIPE_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
+      [PIPE_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
+      [PIPE_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
+      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+#endif
+#if GFX_VER >= 7
+      [PIPE_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
+#endif
+   };
+   unsigned topology = hw_topology[prim];
+
+   if (prim == PIPE_PRIM_PATCHES)
+      topology += verts_per_patch;
+
+   return topology;
+}
+
+/**
+ * Map a Gallium comparison function onto the matching hardware
+ * COMPAREFUNCTION_* value.
+ */
+static unsigned
+translate_compare_func(enum pipe_compare_func pipe_func)
+{
+   static const unsigned hw_compare[] = {
+      [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
+      [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
+      [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
+      [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
+      [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
+      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
+      [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
+      [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
+   };
+   const unsigned hw_func = hw_compare[pipe_func];
+
+   return hw_func;
+}
+
+/**
+ * Map a Gallium shadow-compare function onto the hardware PREFILTEROP_*
+ * value.
+ *
+ * The two sides disagree on both operand order and polarity.  Gallium
+ * defines the result as
+ *
+ *    1 if ref <op> texel,
+ *    0 otherwise.
+ *
+ * whereas the hardware produces
+ *
+ *    0 if texel <op> ref,
+ *    1 otherwise.
+ *
+ * Hence every entry below is the Gallium operator with its operands
+ * swapped and its result negated (e.g. LESS becomes LEQUAL).
+ */
+static unsigned
+translate_shadow_func(enum pipe_compare_func pipe_func)
+{
+   static const unsigned hw_prefilter[] = {
+      [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
+      [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
+      [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
+      [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
+      [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
+      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
+      [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
+      [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
+   };
+   const unsigned hw_op = hw_prefilter[pipe_func];
+
+   return hw_op;
+}
+
+/**
+ * Map a Gallium PIPE_FACE_* cull selection onto the hardware CULLMODE_*
+ * value.
+ */
+static unsigned
+translate_cull_mode(unsigned pipe_face)
+{
+   static const unsigned hw_cull[4] = {
+      [PIPE_FACE_NONE]           = CULLMODE_NONE,
+      [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
+      [PIPE_FACE_BACK]           = CULLMODE_BACK,
+      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
+   };
+   const unsigned hw_mode = hw_cull[pipe_face];
+
+   return hw_mode;
+}
+
+#if GFX_VER >= 6
+/**
+ * Map a Gallium PIPE_POLYGON_MODE_* value onto the hardware FILL_MODE_*
+ * value.  FILL_RECTANGLE is treated the same as FILL (solid).
+ */
+static unsigned
+translate_fill_mode(unsigned pipe_polymode)
+{
+   static const unsigned hw_fill[4] = {
+      [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
+      [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
+      [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
+      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
+   };
+   const unsigned hw_mode = hw_fill[pipe_polymode];
+
+   return hw_mode;
+}
+#endif
+
+/**
+ * Map a Gallium mipmap filter onto the hardware MIPFILTER_* value.
+ */
+static unsigned
+translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
+{
+   static const unsigned hw_mipfilter[] = {
+      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
+      [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
+      [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
+   };
+   const unsigned hw_filter = hw_mipfilter[pipe_mip];
+
+   return hw_filter;
+}
+
+/**
+ * Map a Gallium PIPE_TEX_WRAP_* mode onto the hardware TCM_* value.
+ *
+ * PIPE_TEX_WRAP_CLAMP normally becomes TCM_CLAMP_BORDER, but when
+ * either_nearest is set it becomes TCM_CLAMP instead.  The two mirror-clamp
+ * modes the table marks as unsupported yield -1 converted to unsigned.
+ *
+ * NOTE(review): either_nearest presumably means "min or mag filter is
+ * nearest" -- confirm with the caller.
+ */
+static uint32_t
+translate_wrap(unsigned pipe_wrap, bool either_nearest)
+{
+   static const unsigned hw_wrap[] = {
+      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
+      [PIPE_TEX_WRAP_CLAMP]                  = TCM_CLAMP_BORDER,
+      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
+      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
+      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
+      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
+
+      /* These are unsupported. */
+      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
+      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
+   };
+   const bool plain_clamp =
+      either_nearest && pipe_wrap == PIPE_TEX_WRAP_CLAMP;
+
+   if (!plain_clamp)
+      return hw_wrap[pipe_wrap];
+
+   return TCM_CLAMP;
+}
+
+/**
+ * Equivalent of brw_state_batch: carve 'size' bytes out of the batch's
+ * state buffer.
+ *
+ * The allocation starts at batch->state.used rounded up to 'alignment'.
+ * The chosen byte offset is stored in *out_offset and batch->state.used is
+ * advanced past the allocation.
+ *
+ * Returns a pointer into batch->state.map at that offset.
+ */
+static uint32_t *
+stream_state(struct crocus_batch *batch,
+             unsigned size,
+             unsigned alignment,
+             uint32_t *out_offset)
+{
+   uint32_t offset = ALIGN(batch->state.used, alignment);
+
+   if (offset + size >= STATE_SZ && !batch->no_wrap) {
+      /* Would run past STATE_SZ and wrapping is allowed: flush the batch
+       * and recompute the offset from the post-flush state.used.
+       */
+      crocus_batch_flush(batch);
+      offset = ALIGN(batch->state.used, alignment);
+   } else if (offset + size >= batch->state.bo->size) {
+      /* Otherwise, if the backing BO is too small, grow it to 1.5x its
+       * current size, capped at MAX_STATE_SIZE.
+       */
+      const unsigned new_size =
+         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
+              MAX_STATE_SIZE);
+      crocus_grow_buffer(batch, true, batch->state.used, new_size);
+      assert(offset + size < batch->state.bo->size);
+   }
+
+   crocus_record_state_size(batch->state_sizes, offset, size);
+
+   batch->state.used = offset + size;
+   *out_offset = offset;
+
+   /* state.map is indexed in DWords here, hence the byte offset / 4. */
+   return (uint32_t *)batch->state.map + (offset >> 2);
+}
+
+/**
+ * stream_state() + memcpy.
+ *
+ * Allocates 'size' bytes of state space with the requested alignment,
+ * copies 'data' into it and returns the byte offset of the allocation.
+ */
+static uint32_t
+emit_state(struct crocus_batch *batch, const void *data, unsigned size,
+           unsigned alignment)
+{
+   unsigned state_offset = 0;
+   uint32_t *dst = stream_state(batch, size, alignment, &state_offset);
+
+   /* Nothing to copy into if no mapping came back. */
+   if (!dst)
+      return state_offset;
+
+   memcpy(dst, data, size);
+   return state_offset;
+}
+
+#if GFX_VER <= 5
+/**
+ * Emit 3DSTATE_PIPELINED_POINTERS for the fixed-function unit state.
+ *
+ * Every *_offset is an offset into the batch's state buffer
+ * (batch->state.bo).  The GS pointer is only filled in when gs_active is
+ * set; clipping is always enabled.
+ */
+static void
+upload_pipelined_state_pointers(struct crocus_batch *batch,
+                                bool gs_active, uint32_t gs_offset,
+                                uint32_t vs_offset, uint32_t sf_offset,
+                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
+{
+#if GFX_VER == 5
+   /* Need to flush before changing clip max threads for errata. */
+   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
+#endif
+
+   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
+      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
+      pp.GSEnable = gs_active;
+      if (gs_active)
+         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
+      pp.ClipEnable = true;
+      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
+      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
+      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
+      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
+   }
+}
+
+#endif
+/**
+ * Did field 'x' change between 'old_cso' and 'new_cso'?
+ *
+ * (If so, we may want to set some dirty flags.)
+ *
+ * Both macros expect locals named old_cso and new_cso at the point of use
+ * and report a change when old_cso is NULL.  cso_changed() compares with
+ * '!=', cso_changed_memcmp() compares sizeof(old_cso->x) bytes with
+ * memcmp() and is meant for array fields.
+ */
+#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
+#define cso_changed_memcmp(x) \
+   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
+
+/**
+ * Emit the end-of-pipe sync that has to precede a STATE_BASE_ADDRESS change.
+ *
+ * Only does anything on GFX_VER >= 6; the body is compiled out otherwise.
+ */
+static void
+flush_before_state_base_change(struct crocus_batch *batch)
+{
+#if GFX_VER >= 6
+   /* Flush before emitting STATE_BASE_ADDRESS.
+    *
+    * This isn't documented anywhere in the PRM.  However, it seems to be
+    * necessary prior to changing the surface state base address.  We've
+    * seen issues in Vulkan where we get GPU hangs when using multi-level
+    * command buffers which clear depth, reset state base address, and then
+    * go render stuff.
+    *
+    * Normally, in GL, we would trust the kernel to do sufficient stalls
+    * and flushes prior to executing our batch.  However, it doesn't seem
+    * as if the kernel's flushing is always sufficient and we don't want to
+    * rely on it.
+    *
+    * We make this an end-of-pipe sync instead of a normal flush because we
+    * do not know the current status of the GPU.  On Haswell at least,
+    * having a fast-clear operation in flight at the same time as a normal
+    * rendering operation can cause hangs.  Since the kernel's flushing is
+    * insufficient, we need to ensure that any rendering operations from
+    * other processes are definitely complete before we try to do our own
+    * rendering.  It's a bit of a big hammer but it appears to work.
+    */
+   /* The data cache flush bit is only added on version 7 and later. */
+   const unsigned dc_flush =
+      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
+   crocus_emit_end_of_pipe_sync(batch,
+                                "change STATE_BASE_ADDRESS (flushes)",
+                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                dc_flush |
+                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
+#endif
+}
+
+/**
+ * Emit the cache invalidations that have to follow a STATE_BASE_ADDRESS
+ * change.
+ *
+ * Only does anything on GFX_VER >= 6; the body is compiled out otherwise.
+ */
+static void
+flush_after_state_base_change(struct crocus_batch *batch)
+{
+   /* After re-setting the surface state base address, we have to do some
+    * cache flushing so that the sampler engine will pick up the new
+    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
+    * Shared Function > 3D Sampler > State > State Caching (page 96):
+    *
+    *    Coherency with system memory in the state cache, like the texture
+    *    cache is handled partially by software. It is expected that the
+    *    command stream or shader will issue Cache Flush operation or
+    *    Cache_Flush sampler message to ensure that the L1 cache remains
+    *    coherent with system memory.
+    *
+    *    [...]
+    *
+    *    Whenever the value of the Dynamic_State_Base_Addr,
+    *    Surface_State_Base_Addr are altered, the L1 state cache must be
+    *    invalidated to ensure the new surface or sampler state is fetched
+    *    from system memory.
+    *
+    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
+    * which, according to the PIPE_CONTROL instruction documentation in the
+    * Broadwell PRM:
+    *
+    *    Setting this bit is independent of any other bit in this packet.
+    *    This bit controls the invalidation of the L1 and L2 state caches
+    *    at the top of the pipe i.e. at the parsing time.
+    *
+    * Unfortunately, experimentation seems to indicate that state cache
+    * invalidation through a PIPE_CONTROL does nothing whatsoever in
+    * regards to surface state and binding tables.  Instead, it seems that
+    * invalidating the texture cache is what is actually needed.
+    *
+    * XXX:  As far as we have been able to determine through
+    * experimentation, flushing the texture cache appears to be
+    * sufficient.  The theory here is that all of the sampling/rendering
+    * units cache the binding table in the texture cache.  However, we have
+    * yet to be able to actually confirm this.
+    */
+#if GFX_VER >= 6
+   crocus_emit_end_of_pipe_sync(batch,
+                                "change STATE_BASE_ADDRESS (invalidates)",
+                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+#endif
+}
+
+#if GFX_VER >= 6
+/**
+ * Emit MI_STORE_REGISTER_MEM to write the 32-bit MMIO register 'reg' to
+ * 'bo' at byte 'offset'.
+ *
+ * Predication is only available when GFX_VERx10 == 75; on every other
+ * generation passing predicated = true hits unreachable().
+ */
+static void
+crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
+                            struct crocus_bo *bo, uint32_t offset,
+                            bool predicated)
+{
+   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
+      srm.RegisterAddress = reg;
+      /* NOTE(review): ggtt_bo() presumably builds a global-GTT address for
+       * the destination -- confirm against its definition.
+       */
+      srm.MemoryAddress = ggtt_bo(bo, offset);
+#if GFX_VERx10 == 75
+      srm.PredicateEnable = predicated;
+#else
+      if (predicated)
+         unreachable("unsupported predication");
+#endif
+   }
+}
+
+/**
+ * Store a 64-bit MMIO register to a buffer as two 32-bit stores: the
+ * register at 'reg' goes to 'offset', the one at 'reg + 4' to 'offset + 4'.
+ */
+static void
+crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
+                            struct crocus_bo *bo, uint32_t offset,
+                            bool predicated)
+{
+   for (unsigned dw = 0; dw < 2; dw++) {
+      const uint32_t byte_off = 4 * dw;
+
+      crocus_store_register_mem32(batch, reg + byte_off,
+                                  bo, offset + byte_off, predicated);
+   }
+}
+#endif
+
+#if GFX_VER >= 7
+/**
+ * Emit MI_LOAD_REGISTER_IMM to load the immediate 'val' into the MMIO
+ * register at offset 'reg'.
+ */
+static void
+_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
+{
+   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+      lri.RegisterOffset = reg;
+      lri.DataDWord      = val;
+   }
+}
+/* Convenience wrapper taking a register name: 'r' is pasted with "_num"
+ * and passed through GENX() to obtain the register offset.
+ */
+#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
+
+#if GFX_VERx10 == 75
+/**
+ * Emit MI_LOAD_REGISTER_REG to copy the MMIO register 'src' into 'dst'.
+ * Note the argument order: destination first, then source.
+ */
+static void
+_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
+{
+   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
+      lrr.SourceRegisterAddress = src;
+      lrr.DestinationRegisterAddress = dst;
+   }
+}
+
+/**
+ * Copy one 32-bit MMIO register ('src') into another ('dst').
+ * Thin wrapper around _crocus_emit_lrr().
+ */
+static void
+crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
+                           uint32_t src)
+{
+   _crocus_emit_lrr(batch, dst, src);
+}
+
+/**
+ * Copy a 64-bit MMIO register pair as two 32-bit register-to-register
+ * loads: 'src' into 'dst', then 'src + 4' into 'dst + 4'.
+ */
+static void
+crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
+                           uint32_t src)
+{
+   for (unsigned dw = 0; dw < 2; dw++) {
+      const uint32_t byte_off = 4 * dw;
+
+      _crocus_emit_lrr(batch, dst + byte_off, src + byte_off);
+   }
+}
+#endif
+
+/**
+ * Load the 32-bit immediate 'val' into the MMIO register at offset 'reg'.
+ * Thin wrapper around _crocus_emit_lri() taking a raw register offset.
+ */
+static void
+crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
+                           uint32_t val)
+{
+   _crocus_emit_lri(batch, reg, val);
+}
+
+/**
+ * Load a 64-bit immediate into an MMIO register pair: the low 32 bits go
+ * to 'reg', the high 32 bits to 'reg + 4'.
+ */
+static void
+crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
+                           uint64_t val)
+{
+   const uint32_t lo = (uint32_t)(val & 0xffffffff);
+   const uint32_t hi = (uint32_t)(val >> 32);
+
+   _crocus_emit_lri(batch, reg + 0, lo);
+   _crocus_emit_lri(batch, reg + 4, hi);
+}
+
+/**
+ * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
+ *
+ * 'reg' is the register offset; the value is read from 'bo' at byte
+ * 'offset'.
+ */
+static void
+crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
+                           struct crocus_bo *bo, uint32_t offset)
+{
+   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+      lrm.RegisterAddress = reg;
+      lrm.MemoryAddress = ro_bo(bo, offset);
+   }
+}
+
+/**
+ * Load a 64-bit value from a buffer into an MMIO register pair using two
+ * MI_LOAD_REGISTER_MEM commands: 'offset' feeds 'reg', 'offset + 4' feeds
+ * 'reg + 4'.
+ */
+static void
+crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
+                           struct crocus_bo *bo, uint32_t offset)
+{
+   for (unsigned dw = 0; dw < 2; dw++) {
+      const uint32_t byte_off = 4 * dw;
+
+      crocus_load_register_mem32(batch, reg + byte_off,
+                                 bo, offset + byte_off);
+   }
+}
+
+#if GFX_VERx10 == 75
+/**
+ * Emit MI_STORE_DATA_IMM to write the 32-bit immediate 'imm' to 'bo' at
+ * byte 'offset'.
+ */
+static void
+crocus_store_data_imm32(struct crocus_batch *batch,
+                        struct crocus_bo *bo, uint32_t offset,
+                        uint32_t imm)
+{
+   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
+      sdi.Address = rw_bo(bo, offset);
+      /* This function is only built for GFX_VERx10 == 75, so the guard
+       * below is always true here.
+       */
+#if GFX_VER >= 6
+      sdi.ImmediateData = imm;
+#endif
+   }
+}
+
+/**
+ * Emit MI_STORE_DATA_IMM to write the 64-bit immediate 'imm' to 'bo' at
+ * byte 'offset'.
+ */
+static void
+crocus_store_data_imm64(struct crocus_batch *batch,
+                        struct crocus_bo *bo, uint32_t offset,
+                        uint64_t imm)
+{
+   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
+    * 2 in genxml but it's actually variable length and we need 5 DWords.
+    */
+   void *map = crocus_get_command_space(batch, 4 * 5);
+   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
+      /* Override the length for the 5 DWords reserved above. */
+      sdi.DWordLength = 5 - 2;
+      sdi.Address = rw_bo(bo, offset);
+#if GFX_VER >= 6
+      sdi.ImmediateData = imm;
+#endif
+   }
+}
+#endif
+
+/**
+ * Copy 'bytes' bytes from src_bo + src_offset to dst_bo + dst_offset on the
+ * GPU, one DWord at a time.
+ *
+ * Each DWord is loaded into a scratch MMIO register and stored back out
+ * unpredicated, so the previous contents of that register are overwritten.
+ * The byte count and both offsets must be multiples of 4.
+ */
+static void
+crocus_copy_mem_mem(struct crocus_batch *batch,
+                    struct crocus_bo *dst_bo, uint32_t dst_offset,
+                    struct crocus_bo *src_bo, uint32_t src_offset,
+                    unsigned bytes)
+{
+   assert(bytes % 4 == 0);
+   assert(dst_offset % 4 == 0);
+   assert(src_offset % 4 == 0);
+
+   /* Scratch register used as the bounce buffer.  The macro is not
+    * #undef'd, so it stays defined for the rest of the file.
+    */
+#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
+   for (unsigned i = 0; i < bytes; i += 4) {
+      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
+                                 src_bo, src_offset + i);
+      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
+                                  dst_bo, dst_offset + i, false);
+   }
+}
+#endif
+
+/**
+ * Gallium CSO for rasterizer state.
+ *
+ * Holds the Gallium pipe_rasterizer_state plus driver-side data stored
+ * alongside it.
+ */
+struct crocus_rasterizer_state {
+   /** The Gallium rasterizer state itself. */
+   struct pipe_rasterizer_state cso;
+#if GFX_VER >= 6
+   /* DWord storage sized for one 3DSTATE_SF and one 3DSTATE_CLIP packet.
+    * NOTE(review): presumably packed when the CSO is created -- confirm.
+    */
+   uint32_t sf[GENX(3DSTATE_SF_length)];
+   uint32_t clip[GENX(3DSTATE_CLIP_length)];
+#endif
+   /** DWord storage sized for one 3DSTATE_LINE_STIPPLE packet. */
+   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
+
+   /* NOTE(review): the meaning of these two fields is not visible here; the
+    * names suggest a clip-plane constant count and a "polygon mode is point
+    * or line" flag -- confirm against the code that fills them in.
+    */
+   uint8_t num_clip_plane_consts;
+   bool fill_mode_point_or_line;
+};
+
+#if GFX_VER <= 5
+/* Indices into limits[] below, one per fixed-function unit that owns a
+ * region of the URB.
+ */
+#define URB_VS 0
+#define URB_GS 1
+#define URB_CLP 2
+#define URB_SF 3
+#define URB_CS 4
+
+/**
+ * Per-unit URB sizing limits, indexed by the URB_* values above.
+ *
+ * Each row gives the minimum and preferred number of URB entries and the
+ * minimum and maximum size of one entry.
+ * NOTE(review): the unit of the entry sizes is not stated here -- confirm.
+ */
+static const struct {
+   uint32_t min_nr_entries;
+   uint32_t preferred_nr_entries;
+   uint32_t min_entry_size;
+   uint32_t  max_entry_size;
+} limits[URB_CS+1] = {
+   { 16, 32, 1, 5 },                        /* vs */
+   { 4, 8,  1, 5 },                        /* gs */
+   { 5, 10,  1, 5 },                        /* clp */
+   { 1, 8,  1, 12 },                        /* sf */
+   { 1, 4,  1, 32 }                        /* cs */
+};
+
+/**
+ * Lay the URB regions out back to back from the current entry counts and
+ * sizes in ice->urb, in the order VS, GS, CLIP, SF, CS.
+ *
+ * The VS, GS and CLIP regions are all sized with vsize per entry, SF with
+ * sfsize and CS with csize.  The *_start fields are always written, even
+ * when the layout does not fit.
+ *
+ * Returns true when the end of the CS region lies within ice->urb.size.
+ */
+static bool check_urb_layout(struct crocus_context *ice)
+{
+   ice->urb.vs_start = 0;
+   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
+   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
+   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
+   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
+
+   return ice->urb.cs_start + ice->urb.nr_cs_entries *
+      ice->urb.csize <= ice->urb.size;
+}
+
+
+/**
+ * Recompute the URB partitioning for the given entry sizes, if needed.
+ *
+ * csize, vsize and sfsize are first raised to the per-unit minimum entry
+ * size.  A new layout is only computed when one of them exceeds the size
+ * currently stored in ice->urb, or when the previous layout was
+ * "constrained" and one of the stored sizes is larger than now required.
+ *
+ * Entry counts are tried in this order: a larger generation-specific count
+ * (version 5 and G4X only), the preferred counts from limits[], and finally
+ * the minimum counts.  If even the minimum counts do not fit, the process
+ * exits.
+ *
+ * Returns true when ice->urb was recalculated and false when the existing
+ * layout was kept.
+ */
+static bool
+crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
+                           unsigned vsize, unsigned sfsize)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct crocus_context *ice = batch->ice;
+   if (csize < limits[URB_CS].min_entry_size)
+      csize = limits[URB_CS].min_entry_size;
+
+   if (vsize < limits[URB_VS].min_entry_size)
+      vsize = limits[URB_VS].min_entry_size;
+
+   if (sfsize < limits[URB_SF].min_entry_size)
+      sfsize = limits[URB_SF].min_entry_size;
+
+   /* Grow whenever an entry size increased; shrink only when the current
+    * layout is constrained.
+    */
+   if (ice->urb.vsize < vsize ||
+       ice->urb.sfsize < sfsize ||
+       ice->urb.csize < csize ||
+       (ice->urb.constrained && (ice->urb.vsize > vsize ||
+                                 ice->urb.sfsize > sfsize ||
+                                 ice->urb.csize > csize))) {
+
+
+      ice->urb.csize = csize;
+      ice->urb.sfsize = sfsize;
+      ice->urb.vsize = vsize;
+
+      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
+      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
+      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
+      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;
+
+      ice->urb.constrained = 0;
+
+      /* First try a larger, generation-specific number of entries; fall
+       * back to the preferred counts (and mark the layout constrained) if
+       * that does not fit.
+       */
+      if (devinfo->ver == 5) {
+         ice->urb.nr_vs_entries = 128;
+         ice->urb.nr_sf_entries = 48;
+         if (check_urb_layout(ice)) {
+            goto done;
+         } else {
+            ice->urb.constrained = 1;
+            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
+         }
+      } else if (devinfo->is_g4x) {
+         ice->urb.nr_vs_entries = 64;
+         if (check_urb_layout(ice)) {
+            goto done;
+         } else {
+            ice->urb.constrained = 1;
+            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
+         }
+      }
+
+      /* Preferred counts do not fit either: drop to the minimum counts. */
+      if (!check_urb_layout(ice)) {
+         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
+         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
+         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
+         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
+         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;
+
+         /* Mark us as operating with constrained nr_entries, so that next
+          * time we recalculate we'll resize the fences in the hope of
+          * escaping constrained mode and getting back to normal performance.
+          */
+         ice->urb.constrained = 1;
+
+         if (!check_urb_layout(ice)) {
+            /* This is impossible, given the maximal sizes of urb
+             * entries and the values for minimum nr of entries
+             * provided above.
+             */
+            fprintf(stderr, "couldn't calculate URB layout!\n");
+            exit(1);
+         }
+
+         if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF)))
+            fprintf(stderr, "URB CONSTRAINED\n");
+      }
+
+done:
+      if (unlikely(INTEL_DEBUG & DEBUG_URB))
+         fprintf(stderr,
+                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+                 ice->urb.vs_start,
+                 ice->urb.gs_start,
+                 ice->urb.clip_start,
+                 ice->urb.sf_start,
+                 ice->urb.cs_start,
+                 ice->urb.size);
+      return true;
+   }
+   return false;
+}
+
+/**
+ * Emit the URB_FENCE command describing the current URB partitioning.
+ *
+ * Each unit's fence is the start of the next unit's region as stored in
+ * ice->urb.*_start; the CS fence is ice->urb.size.  Reallocation is
+ * requested for every unit.
+ */
+static void
+crocus_upload_urb_fence(struct crocus_batch *batch)
+{
+   uint32_t urb_fence[3];
+   _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
+      urb.VSUnitURBReallocationRequest = 1;
+      urb.GSUnitURBReallocationRequest = 1;
+      urb.CLIPUnitURBReallocationRequest = 1;
+      urb.SFUnitURBReallocationRequest = 1;
+      urb.VFEUnitURBReallocationRequest = 1;
+      urb.CSUnitURBReallocationRequest = 1;
+
+      urb.VSFence = batch->ice->urb.gs_start;
+      urb.GSFence = batch->ice->urb.clip_start;
+      urb.CLIPFence = batch->ice->urb.sf_start;
+      urb.SFFence = batch->ice->urb.cs_start;
+      urb.CSFence = batch->ice->urb.size;
+   }
+
+   /* erratum: URB_FENCE must not cross a 64byte cacheline
+    *
+    * A cacheline holds 16 DWords and the packet is 3 DWords long, so the
+    * check has to be made on the batch position in DWords.  The position
+    * is reported in bytes; masking that byte count with 15 can only yield
+    * 0, 4, 8 or 12 for a DWord-aligned batch, which would never exceed 12
+    * and so would never pad.  Convert to DWords first, then pad with zero
+    * DWords up to the next cacheline when the packet would start in the
+    * last three DWords of a line.
+    *
+    * NOTE(review): the padding is written straight at map_next without
+    * reserving command space first (as before) -- confirm the batch always
+    * keeps at least 3 DWords of slack here.
+    */
+   const unsigned used_dw = crocus_batch_bytes_used(batch) / sizeof(uint32_t);
+   if ((used_dw & 15) > 12) {
+      unsigned pad = 16 - (used_dw & 15);
+      do {
+         *(uint32_t *)batch->command.map_next = 0;
+         batch->command.map_next += sizeof(uint32_t);
+      } while (--pad);
+   }
+
+   crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
+}
+
+/**
+ * Work out how the CURBE (push constant) space is split between the
+ * fragment shader, the clipper and the vertex shader.
+ *
+ * The required register counts are derived from the first four UBO ranges
+ * of the bound fragment and vertex shaders and from the enabled clip
+ * planes.  The layout stored in ice->curbe is only rewritten when a region
+ * has to grow, when the clip size changes, or when the total need dropped
+ * below a quarter of a layout larger than 16 registers.
+ *
+ * Returns true when ice->curbe was updated, false when the existing layout
+ * was kept.
+ */
+static bool
+calculate_curbe_offsets(struct crocus_batch *batch)
+{
+   struct crocus_context *ice = batch->ice;
+
+   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
+   unsigned total_regs;
+
+   nr_fp_regs = 0;
+   for (int i = 0; i < 4; i++) {
+      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
+      if (range->length == 0)
+         continue;
+
+      /* ubo range tracks at 256-bit, we need 512-bit */
+      nr_fp_regs += (range->length + 1) / 2;
+   }
+
+   if (ice->state.cso_rast->cso.clip_plane_enable) {
+      /* Six planes are always counted on top of the enabled user planes;
+       * four values per plane, rounded up to whole 16-value registers.
+       */
+      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
+      nr_clip_regs = (nr_planes * 4 + 15) / 16;
+   }
+
+   nr_vp_regs = 0;
+   for (int i = 0; i < 4; i++) {
+      const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
+      if (range->length == 0)
+         continue;
+
+      /* ubo range tracks at 256-bit, we need 512-bit */
+      nr_vp_regs += (range->length + 1) / 2;
+   }
+   if (nr_vp_regs == 0) {
+      /* The pre-gen6 VS requires that some push constants get loaded no
+       * matter what, or the GPU would hang.
+       */
+      nr_vp_regs = 1;
+   }
+   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
+
+   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
+    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
+    * (volume 1, part 1) PRMs.
+    *
+    * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
+    * values as push constants before spilling to pull constants, and in
+    * brw_vec4.cpp we're loading up to 32 registers of push constants.  An EU
+    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
+    * regs for clip.
+    */
+   assert(total_regs <= 32);
+
+   /* Lazy resize:
+    */
+   if (nr_fp_regs > ice->curbe.wm_size ||
+       nr_vp_regs > ice->curbe.vs_size ||
+       nr_clip_regs != ice->curbe.clip_size ||
+       (total_regs < ice->curbe.total_size / 4 &&
+        ice->curbe.total_size > 16)) {
+
+      /* Running register index; regions are packed WM, CLIP, VS. */
+      unsigned reg = 0;
+
+      /* Calculate a new layout:
+       */
+      ice->curbe.wm_start = reg;
+      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
+      ice->curbe.clip_start = reg;
+      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
+      ice->curbe.vs_start = reg;
+      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
+      ice->curbe.total_size = reg;
+
+      if (0)
+         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
+                 ice->curbe.wm_start,
+                 ice->curbe.wm_size,
+                 ice->curbe.clip_start,
+                 ice->curbe.clip_size,
+                 ice->curbe.vs_start,
+                 ice->curbe.vs_size );
+      return true;
+   }
+   return false;
+}
+
+/**
+ * Copy the push UBO ranges of the shader bound to \p stage into the CURBE
+ * upload buffer.
+ *
+ * \param ice    context whose bound shader and constant buffers are read
+ * \param stage  shader stage to upload constants for
+ * \param map    CPU mapping of the whole CURBE buffer, addressed in dwords
+ * \param start  first CURBE unit of this stage's section; each unit is
+ *               16 dwords
+ *
+ * Ranges are packed back to back, 8 dwords per unit of range->length.  A
+ * range whose constant buffer cannot be mapped is skipped, but its space is
+ * still reserved so later ranges keep their offsets.
+ */
+static void
+upload_shader_consts(struct crocus_context *ice,
+                     gl_shader_stage stage,
+                     uint32_t *map,
+                     unsigned start)
+{
+   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+   uint32_t *cmap;
+   bool found = false;
+   unsigned offset = start * 16;
+   int total = 0;
+   for (int i = 0; i < 4; i++) {
+      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
+
+      if (range->length == 0)
+         continue;
+
+      unsigned block_index = crocus_bti_to_group_index(
+         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
+      unsigned len = range->length * 8 * sizeof(float);
+      /* Named range_start so it no longer shadows the 'start' parameter. */
+      unsigned range_start = range->start * 8 * sizeof(float);
+      struct pipe_transfer *transfer = NULL;
+
+      cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
+                                   ice->state.shaders[stage].constbufs[block_index].buffer_offset + range_start, len,
+                                   PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
+      /* Only unmap what was actually mapped: on failure 'transfer' does not
+       * refer to a live transfer and must not be handed to the driver.
+       */
+      if (cmap) {
+         memcpy(&map[offset + (total * 8)], cmap, len);
+         pipe_buffer_unmap(&ice->ctx, transfer);
+      }
+      total += range->length;
+      found = true;
+   }
+
+   if (stage == MESA_SHADER_VERTEX && !found) {
+      /* The pre-gen6 VS requires that some push constants get loaded no
+       * matter what, or the GPU would hang.
+       */
+      /* NOTE(review): this clears 16 bytes, not a full 16-dword unit --
+       * confirm that is the intended amount.
+       */
+      unsigned len = 16;
+      memset(&map[offset], 0, len);
+   }
+}
+
+/* Six constant plane equations, four floats each.  gen4_upload_curbe()
+ * writes them at the start of the clipper's CURBE section, ahead of any
+ * enabled user clip planes.
+ */
+static const float fixed_plane[6][4] = {
+   { 0,    0,   -1, 1 },
+   { 0,    0,    1, 1 },
+   { 0,   -1,    0, 1 },
+   { 0,    1,    0, 1 },
+   {-1,    0,    0, 1 },
+   { 1,    0,    0, 1 }
+};
+
+/**
+ * Build and upload the CURBE (push constant) buffer described by
+ * ice->curbe, then emit CONSTANT_BUFFER pointing at it.
+ *
+ * The buffer holds, at the offsets recorded in ice->curbe, the fragment
+ * shader constants, the clip planes and the vertex shader constants.  When
+ * the layout is empty (total_size == 0) nothing is uploaded and only the
+ * CONSTANT_BUFFER packet is emitted.
+ */
+static void
+gen4_upload_curbe(struct crocus_batch *batch)
+{
+   struct crocus_context *ice = batch->ice;
+   const unsigned sz = ice->curbe.total_size;
+   /* total_size counts units of 16 floats. */
+   const unsigned buf_sz = sz * 16 * sizeof(float);
+
+   if (sz == 0)
+      goto emit;
+
+   /* NOTE(review): 'map' is used below without a NULL check -- confirm how
+    * u_upload_alloc() reports allocation failure.
+    */
+   uint32_t *map;
+   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
+                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);
+
+   /* fragment shader constants */
+   if (ice->curbe.wm_size) {
+      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
+   }
+
+   /* clipper constants */
+   if (ice->curbe.clip_size) {
+      unsigned offset = ice->curbe.clip_start * 16;
+      float *fmap = (float *)map;
+      unsigned i;
+      /* If any planes are going this way, send them all this way:
+       */
+      for (i = 0; i < 6; i++) {
+         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
+         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
+         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
+         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
+      }
+
+      /* 'i' carries on from 6, so the enabled user planes are packed
+       * directly after the six fixed planes, in ascending bit order.
+       */
+      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
+      struct pipe_clip_state *cp = &ice->state.clip_planes;
+      while (mask) {
+         const int j = u_bit_scan(&mask);
+         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
+         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
+         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
+         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
+         i++;
+      }
+   }
+
+   /* vertex shader constants */
+   if (ice->curbe.vs_size) {
+      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
+   }
+   /* Disabled debug dump of the whole buffer, four floats per line. */
+   if (0) {
+      for (int i = 0; i < sz*16; i+=4) {
+         float *f = (float *)map;
+         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+                 f[i+0], f[i+1], f[i+2], f[i+3]);
+      }
+   }
+
+emit:
+   /* Without an uploaded resource the packet is emitted with its default
+    * fields, i.e. Valid is left unset.
+    */
+   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
+      if (ice->curbe.curbe_res) {
+         cb.BufferLength = ice->curbe.total_size - 1;
+         cb.Valid = 1;
+         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
+      }
+   }
+
+#if GFX_VER == 4 && GFX_VERx10 != 45
+   /* Work around a Broadwater/Crestline depth interpolator bug.  The
+    * following sequence will cause GPU hangs:
+    *
+    * 1. Change state so that all depth related fields in CC_STATE are
+    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
+    * 2. Emit a CONSTANT_BUFFER packet.
+    * 3. Draw via 3DPRIMITIVE.
+    *
+    * The recommended workaround is to emit a non-pipelined state change after
+    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
+    *
+    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP (as it's small),
+    * and always emit it when "PS Use Source Depth" is set.  We could be more
+    * precise, but the additional complexity is probably not worth it.
+    *
+    * NOTE(review): the check below tests whether the fragment shader reads
+    * SYSTEM_VALUE_FRAG_COORD, presumably as the condition under which
+    * "PS Use Source Depth" gets set -- confirm against the WM_STATE setup.
+    */
+   const struct shader_info *fs_info =
+      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
+      /* The packet is emitted with default fields, so record a clamp of 0
+       * in the context state as well.
+       */
+      ice->state.global_depth_offset_clamp = 0;
+      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
+   }
+#endif
+}
+#endif
+
+#if GFX_VER == 7
+
+/* Raw L3SQCREG1 values, one per platform prefix (IVB/VLV/HSW).
+ * NOTE(review): setup_l3_config() below packs L3SQCREG1 from the
+ * SQGPCI_DEFAULT / BYT_SQGPCI_DEFAULT / SQHPCI_DEFAULT field values and does
+ * not reference these macros -- confirm whether anything else still does.
+ */
+#define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
+#define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
+#define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
+
+/**
+ * Program the gen7 L3 cache partitioning described by \p cfg.
+ *
+ * Emits the flush / invalidate / flush PIPE_CONTROL sequence needed before
+ * the partitioning may change, then loads L3SQCREG1, L3CNTLREG2 and
+ * L3CNTLREG3 through MI_LOAD_REGISTER_IMM.  On Haswell (GFX_VERx10 == 75)
+ * it also programs the L3 atomic disable bits in SCRATCH1 and CHICKEN3.
+ *
+ * \param batch  batch the commands are emitted into
+ * \param cfg    number of ways per L3 partition; the ALL partition must be
+ *               empty
+ */
+static void
+setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
+   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
+                       cfg->n[INTEL_L3P_ALL];
+   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
+                      cfg->n[INTEL_L3P_ALL];
+   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
+                      cfg->n[INTEL_L3P_ALL];
+   const bool has_slm = cfg->n[INTEL_L3P_SLM];
+
+   /* According to the hardware docs, the L3 partitioning can only be changed
+    * while the pipeline is completely drained and the caches are flushed,
+    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
+    */
+   crocus_emit_pipe_control_flush(batch, "l3_config",
+                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+
+   /* ...followed by a second pipelined PIPE_CONTROL that initiates
+    * invalidation of the relevant caches.  Note that because RO invalidation
+    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
+    * command is processed by the CS) we cannot combine it with the previous
+    * stalling flush as the hardware documentation suggests, because that
+    * would cause the CS to stall on previous rendering *after* RO
+    * invalidation and wouldn't prevent the RO caches from being polluted by
+    * concurrent rendering before the stall completes.  This intentionally
+    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
+    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
+    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
+    * already guarantee that there is no concurrent GPGPU kernel execution
+    * (see SKL HSD 2132585).
+    */
+   crocus_emit_pipe_control_flush(batch, "l3 config",
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+
+   /* Now send a third stalling flush to make sure that invalidation is
+    * complete when the L3 configuration registers are modified.
+    */
+   crocus_emit_pipe_control_flush(batch, "l3 config",
+                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+
+
+   assert(!cfg->n[INTEL_L3P_ALL]);
+
+   /* When enabled SLM only uses a portion of the L3 on half of the banks,
+    * the matching space on the remaining banks has to be allocated to a
+    * client (URB for all validated configurations) set to the
+    * lower-bandwidth 2-bank address hashing mode.
+    */
+   const bool urb_low_bw = has_slm && !devinfo->is_baytrail;
+   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
+
+   /* Minimum number of ways that can be allocated to the URB. */
+   const unsigned n0_urb = (devinfo->is_baytrail ? 32 : 0);
+   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
+
+   uint32_t l3sqcr1, l3cr2, l3cr3;
+
+   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
+      reg.ConvertDC_UC = !has_dc;
+      reg.ConvertIS_UC = !has_is;
+      reg.ConvertC_UC = !has_c;
+      reg.ConvertT_UC = !has_t;
+#if GFX_VERx10 == 75
+      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
+#else
+      reg.L3SQGeneralPriorityCreditInitialization =
+         devinfo->is_baytrail ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
+#endif
+      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
+   };
+
+   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
+      reg.SLMEnable = has_slm;
+      reg.URBLowBandwidth = urb_low_bw;
+      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
+#if !(GFX_VERx10 == 75)
+      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
+#endif
+      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
+      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
+   };
+
+   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
+      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
+      reg.ISLowBandwidth = 0;
+      reg.CAllocation = cfg->n[INTEL_L3P_C];
+      reg.CLowBandwidth = 0;
+      reg.TAllocation = cfg->n[INTEL_L3P_T];
+      reg.TLowBandwidth = 0;
+   };
+
+   /* Set up the L3 partitioning. */
+   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
+   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
+   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
+
+   /* The guard must be spelled GFX_VERx10 like every other check in this
+    * file: an undefined macro name evaluates to 0 in #if, which would
+    * silently compile this Haswell-only block out.
+    */
+#if GFX_VERx10 == 75
+   /* TODO: Fail screen creation if command parser version < 4 */
+   uint32_t scratch1, chicken3;
+   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
+      reg.L3AtomicDisable = !has_dc;
+   }
+   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
+      reg.L3AtomicDisableMask = true;
+      reg.L3AtomicDisable = !has_dc;
+   }
+   crocus_emit_lri(batch, SCRATCH1, scratch1);
+   crocus_emit_lri(batch, CHICKEN3, chicken3);
+#endif
+}
+
+/**
+ * Program the screen's L3 configuration for either the compute or the 3D
+ * pipeline, and dump it to stderr when the L3 debug flag is set.
+ */
+static void
+emit_l3_state(struct crocus_batch *batch, bool compute)
+{
+   const struct intel_l3_config *cfg;
+
+   if (compute)
+      cfg = batch->screen->l3_config_cs;
+   else
+      cfg = batch->screen->l3_config_3d;
+
+   setup_l3_config(batch, cfg);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_L3))
+      intel_dump_l3_config(cfg, stderr);
+}
+
+/**
+ * Emit a gen7 PIPE_CONTROL with the CS Stall bit set.
+ *
+ * The stall is paired with an immediate write of 0 to the context's
+ * workaround buffer.
+ */
+static void
+gen7_emit_cs_stall_flush(struct crocus_batch *batch)
+{
+   struct crocus_context *ice = batch->ice;
+
+   crocus_emit_pipe_control_write(batch, "workaround",
+                                  PIPE_CONTROL_WRITE_IMMEDIATE |
+                                  PIPE_CONTROL_CS_STALL,
+                                  ice->workaround_bo,
+                                  ice->workaround_offset, 0);
+}
+#endif
+
+/**
+ * Switch the hardware to \p pipeline by emitting PIPELINE_SELECT, preceded
+ * by the flushes the selected generation requires.
+ *
+ * On gen6+ this is a stalling write-cache flush followed by a read-only
+ * cache invalidate; on older generations it is a single MI_FLUSH.  On gen7
+ * other than Haswell, selecting the 3D pipeline is additionally followed by
+ * a CS stall and a 3DPRIMITIVE that sets only its topology.
+ */
+static void
+emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
+{
+#if GFX_VER >= 6
+   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+    * PIPELINE_SELECT [DevBWR+]":
+    *
+    *    "Project: DEVSNB+
+    *
+    *     Software must ensure all the write caches are flushed through a
+    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
+    *     command to invalidate read only caches prior to programming
+    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
+    */
+   /* The data cache flush bit is only added on gen7 and later. */
+   const unsigned dc_flush =
+      batch->screen->devinfo.ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
+   crocus_emit_pipe_control_flush(batch,
+                                  "workaround: PIPELINE_SELECT flushes (1/2)",
+                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  dc_flush |
+                                  PIPE_CONTROL_CS_STALL);
+
+   crocus_emit_pipe_control_flush(batch,
+                                  "workaround: PIPELINE_SELECT flushes (2/2)",
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
+#else
+   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
+    * PIPELINE_SELECT [DevBWR+]":
+    *
+    *   Project: PRE-DEVSNB
+    *
+    *   Software must ensure the current pipeline is flushed via an
+    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
+    */
+   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
+#endif
+
+   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
+      sel.PipelineSelection = pipeline;
+   }
+
+#if GFX_VER == 7 && !(GFX_VERx10 == 75)
+   /* NOTE(review): presumably the Ivybridge requirement of a CS-stalling
+    * PIPE_CONTROL plus a dummy draw after enabling 3D mode -- confirm against
+    * the PIPELINE_SELECT documentation.
+    */
+   if (pipeline == _3D) {
+      gen7_emit_cs_stall_flush(batch);
+
+      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
+      };
+   }
+#endif
+}
+
+/**
+ * The following diagram shows how we partition the URB:
+ *
+ *        16kB or 32kB               Rest of the URB space
+ *   __________-__________   _________________-_________________
+ *  /                     \ /                                   \
+ * +-------------------------------------------------------------+
+ * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
+ * |       Constants       |               Entries               |
+ * +-------------------------------------------------------------+
+ *
+ * Notably, push constants must be stored at the beginning of the URB
+ * space, while entries can be stored anywhere.  Ivybridge and Haswell
+ * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
+ * doubles this (32kB).
+ *
+ * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
+ * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
+ * sized in increments of 2kB.
+ *
+ * Currently we statically split the constant buffer space among all five
+ * stages (VS/HS/DS/GS get equal shares, FS gets the remainder), whether or
+ * not they are active.  This is probably not ideal, but simple.
+ *
+ * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
+ * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
+ * Haswell GT3 has 512kB of URB space.
+ *
+ * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
+ * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
+ */
+#if GFX_VER == 7
+/**
+ * Emit one 3DSTATE_PUSH_CONSTANT_ALLOC_xS packet per stage, carving the
+ * push constant space into fixed slices, followed by a CS stall where the
+ * hardware requires one.
+ */
+static void
+crocus_alloc_push_constants(struct crocus_batch *batch)
+{
+#if GFX_VERx10 == 75
+   const unsigned push_constant_kb = batch->screen->devinfo.gt == 3 ? 32 : 16;
+#else
+   const unsigned push_constant_kb = 16;
+#endif
+   /* Four stages get an equal slice; the fragment shader takes what's left. */
+   const unsigned size_per_stage = push_constant_kb / 5;
+   const unsigned fs_size = push_constant_kb - 4 * size_per_stage;
+
+   /* For now, we set a static partitioning of the push constant area,
+    * assuming that all stages could be in use.
+    *
+    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
+    *       see if that improves performance by offering more space to
+    *       the VS/FS when those aren't in use.  Also, try dynamically
+    *       enabling/disabling it like i965 does.  This would be more
+    *       stalls and may not actually help; we don't know yet.
+    */
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      const unsigned stage_size =
+         (stage == MESA_SHADER_FRAGMENT) ? fs_size : size_per_stage;
+
+      /* The VS packet layout is reused for every stage; only the
+       * sub-opcode differs.
+       */
+      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
+         alloc._3DCommandSubOpcode = 18 + stage;
+         alloc.ConstantBufferOffset = size_per_stage * stage;
+         alloc.ConstantBufferSize = stage_size;
+      }
+   }
+
+   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
+    *
+    *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
+    *     in the ring after this instruction.
+    *
+    * No such restriction exists for Haswell or Baytrail.
+    */
+   if (GFX_VERx10 != 75 && !batch->screen->devinfo.is_baytrail)
+      gen7_emit_cs_stall_flush(batch);
+}
+#endif
+
+/**
+ * Upload the initial GPU state for a render context.
+ *
+ * This sets some invariant state that needs to be programmed a particular
+ * way, but we never actually change.
+ *
+ * In order: select the 3D pipeline, emit a default STATE_SIP, program the
+ * L3 configuration (gen7), set the INSTPM constant-buffer offset disable
+ * bit (gen7 except Haswell), emit default AA line parameters (g4x and
+ * gen5+), emit a default polygon stipple offset, and partition the push
+ * constant space (gen7).
+ */
+static void
+crocus_init_render_context(struct crocus_batch *batch)
+{
+   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   emit_pipeline_select(batch, _3D);
+
+   /* Emitted with all fields at their defaults. */
+   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);
+
+#if GFX_VER == 7
+   emit_l3_state(batch, false);
+#endif
+#if GFX_VER == 7 && GFX_VERx10 != 75
+   /* The mask bit is set alongside the value bit it guards. */
+   crocus_emit_reg(batch, GENX(INSTPM), reg) {
+      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
+      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
+   }
+#endif
+#if GFX_VER >= 5 || GFX_VERx10 == 45
+   /* Use the legacy AA line coverage computation. */
+   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
+#endif
+
+   /* No polygon stippling offsets are necessary. */
+   /* TODO: may need to set an offset for origin-UL framebuffers */
+   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
+
+#if GFX_VER == 7
+   crocus_alloc_push_constants(batch);
+#endif
+}
+
+#if GFX_VER == 7
+/**
+ * Upload the initial GPU state for a compute context: select the GPGPU
+ * pipeline and program the compute L3 configuration.
+ */
+static void
+crocus_init_compute_context(struct crocus_batch *batch)
+{
+   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+
+   emit_pipeline_select(batch, GPGPU);
+
+   /* Already inside a GFX_VER == 7 guard, so no further check is needed. */
+   emit_l3_state(batch, true);
+}
+#endif
+
+/**
+ * Generation-specific context state (ice->state.genx->...).
+ *
+ * Most state can go in crocus_context directly, but these encode hardware
+ * packets which vary by generation.
+ */
+struct crocus_genx_state {
+   /* One entry per shader stage. */
+   struct {
+#if GFX_VER == 7
+      /* Per-image parameters, one slot per bindable shader image; only
+       * present on gen7 builds, so the per-stage struct is empty otherwise.
+       */
+      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
+#endif
+   } shaders[MESA_SHADER_STAGES];
+};
+
+/**
+ * The pipe->set_blend_color() driver hook.
+ *
+ * Stores the new blend constant color in the context and flags the state
+ * that consumes it: the gen4 constant color on gen4-5, COLOR_CALC_STATE on
+ * gen6+.
+ */
+static void
+crocus_set_blend_color(struct pipe_context *ctx,
+                       const struct pipe_blend_color *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
+   memcpy(&ice->state.blend_color, state, sizeof(*state));
+
+   ice->state.dirty |=
+#if GFX_VER <= 5
+      CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
+#else
+      CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+}
+
+/**
+ * Gallium CSO for blend state (see pipe_blend_state).
+ *
+ * Filled in by crocus_create_blend_state(), which copies the Gallium state
+ * and derives the summary fields below from it.
+ */
+struct crocus_blend_state {
+   /** Copy of the pipe_blend_state this CSO was created from */
+   struct pipe_blend_state cso;
+
+   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
+   uint8_t blend_enables;
+
+   /** Bitfield of whether color writes are enabled for RT[i] */
+   uint8_t color_write_enables;
+
+   /** Does RT[0] use dual color blending? */
+   bool dual_color_blending;
+};
+
+#if GFX_VER >= 6
+/**
+ * Adjust a blend factor for alpha-to-one.
+ *
+ * When \p alpha_to_one is set, SRC1_ALPHA becomes ONE and INV_SRC1_ALPHA
+ * becomes ZERO; every other factor, and every factor when the flag is
+ * clear, is returned unchanged.
+ */
+static enum pipe_blendfactor
+fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
+{
+   if (!alpha_to_one)
+      return f;
+
+   switch (f) {
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      return PIPE_BLENDFACTOR_ONE;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      return PIPE_BLENDFACTOR_ZERO;
+   default:
+      return f;
+   }
+}
+#endif
+
+/**
+ * The pipe->create_blend_state() driver hook.
+ *
+ * Translates a pipe_blend_state into crocus_blend_state: the Gallium state
+ * is copied, and per-render-target bitmasks of enabled blending and enabled
+ * color writes are derived from it.
+ *
+ * \return the new CSO, owned by the caller, or NULL if allocation fails.
+ */
+static void *
+crocus_create_blend_state(struct pipe_context *ctx,
+                          const struct pipe_blend_state *state)
+{
+   struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
+
+   /* Report allocation failure instead of dereferencing a NULL pointer. */
+   if (!cso)
+      return NULL;
+
+   cso->blend_enables = 0;
+   cso->color_write_enables = 0;
+   /* Both masks are uint8_t, one bit per render target. */
+   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
+
+   cso->cso = *state;
+   cso->dual_color_blending = util_blend_state_is_dual(state, 0);
+   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
+      /* Without independent blending, rt[0] applies to every target. */
+      const struct pipe_rt_blend_state *rt =
+         &state->rt[state->independent_blend_enable ? i : 0];
+      if (rt->blend_enable)
+         cso->blend_enables |= 1u << i;
+      if (rt->colormask)
+         cso->color_write_enables |= 1u << i;
+   }
+   return cso;
+}
+
+/**
+ * The pipe->bind_blend_state() driver hook.
+ *
+ * Makes \p state the current blend CSO (it may be NULL), caches its blend
+ * enable mask in the context and flags all state derived from blending.
+ */
+static void
+crocus_bind_blend_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_blend_state *cso = state;
+
+   ice->state.cso_blend = cso;
+   if (cso)
+      ice->state.blend_enables = cso->blend_enables;
+   else
+      ice->state.blend_enables = 0;
+
+   /* Draw-level state that depends on the blend CSO. */
+   ice->state.dirty |= CROCUS_DIRTY_WM |
+                       CROCUS_DIRTY_COLOR_CALC_STATE |
+                       CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+#if GFX_VER >= 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+#endif
+
+   /* Per-stage state that depends on the blend CSO. */
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
+#if GFX_VER >= 7
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
+#endif
+   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
+}
+
+/**
+ * Return true if the FS writes to any color outputs which are not disabled
+ * via color masking.
+ *
+ * A NULL \p fs_info counts as writing nothing.  A shader writing
+ * FRAG_RESULT_COLOR is treated as writing every render target.
+ */
+static bool
+has_writeable_rt(const struct crocus_blend_state *cso_blend,
+                 const struct shader_info *fs_info)
+{
+   unsigned rt_outputs;
+
+   if (fs_info == NULL)
+      return false;
+
+   if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
+      rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
+   else
+      rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
+
+   return (cso_blend->color_write_enables & rt_outputs) != 0;
+}
+
+/**
+ * Gallium CSO for depth, stencil, and alpha testing state.
+ *
+ * Filled in by crocus_create_zsa_state().
+ */
+struct crocus_depth_stencil_alpha_state {
+   /* Copy of the Gallium state this CSO was created from. */
+   struct pipe_depth_stencil_alpha_state cso;
+
+   /* Mirrors the Gallium depth_writemask. */
+   bool depth_writes_enabled;
+   /* True when the front stencil writemask is non-zero, or when two-sided
+    * stencil is enabled and the back writemask is non-zero.
+    */
+   bool stencil_writes_enabled;
+};
+
+/**
+ * The pipe->create_depth_stencil_alpha_state() driver hook.
+ *
+ * Saves a copy of the Gallium state and records whether depth and stencil
+ * writes are enabled, since those are needed in a variety of places.
+ *
+ * \return the new CSO, owned by the caller, or NULL if allocation fails.
+ */
+static void *
+crocus_create_zsa_state(struct pipe_context *ctx,
+                        const struct pipe_depth_stencil_alpha_state *state)
+{
+   struct crocus_depth_stencil_alpha_state *cso =
+      malloc(sizeof(struct crocus_depth_stencil_alpha_state));
+
+   /* Report allocation failure instead of dereferencing a NULL pointer. */
+   if (!cso)
+      return NULL;
+
+   bool two_sided_stencil = state->stencil[1].enabled;
+   cso->cso = *state;
+
+   cso->depth_writes_enabled = state->depth_writemask;
+   /* The back-face writemask only matters with two-sided stencil. */
+   cso->stencil_writes_enabled =
+      state->stencil[0].writemask != 0 ||
+      (two_sided_stencil && state->stencil[1].writemask != 0);
+
+   /* The state tracker needs to optimize away EQUAL writes for us. */
+   assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
+
+   return cso;
+}
+
+/**
+ * The pipe->bind_depth_stencil_alpha_state() driver hook.
+ *
+ * Makes \p state the current depth/stencil/alpha CSO (it may be NULL),
+ * caches its write-enable flags in the context and flags the state that
+ * depends on the fields which changed.
+ */
+static void
+crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   /* cso_changed() compares these two, so the names must stay as they are. */
+   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
+   struct crocus_depth_stencil_alpha_state *new_cso = state;
+
+   if (new_cso != NULL) {
+      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
+      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
+
+      if (cso_changed(depth_writes_enabled))
+         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+      if (cso_changed(cso.alpha_ref_value))
+         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+
+#if GFX_VER >= 6
+      /* On gen6+ the alpha test enable and function flag BLEND_STATE. */
+      if (cso_changed(cso.alpha_enabled) || cso_changed(cso.alpha_func))
+         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+#endif
+
+#if GFX_VER <= 5
+      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+   }
+
+   ice->state.cso_zsa = new_cso;
+
+   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+#if GFX_VER >= 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
+#endif
+   ice->state.stage_dirty |=
+      ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
+}
+
+/**
+ * Compute the line width to program for a rasterizer state.
+ *
+ * Multisampled lines use the requested width as is, aliased lines use it
+ * rounded to the nearest integer, and smooth lines narrower than 1.5 fall
+ * back to width 0.
+ */
+static float
+get_line_width(const struct pipe_rasterizer_state *state)
+{
+   const float requested = state->line_width;
+
+   if (state->multisample)
+      return requested;
+
+   /* From the OpenGL 4.4 spec:
+    *
+    * "The actual width of non-antialiased lines is determined by rounding
+    *  the supplied width to the nearest integer, then clamping it to the
+    *  implementation-dependent maximum non-antialiased line width."
+    */
+   if (!state->line_smooth)
+      return roundf(requested);
+
+   /* For 1 pixel line thickness or less, the general anti-aliasing
+    * algorithm gives up, and a garbage line is generated.  Setting a
+    * Line Width of 0.0 specifies the rasterization of the "thinnest"
+    * (one-pixel-wide), non-antialiased lines.
+    *
+    * Lines rendered with zero Line Width are rasterized using the
+    * "Grid Intersection Quantization" rules as specified by the
+    * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
+    */
+   if (requested < 1.5f)
+      return 0.0f;
+
+   return requested;
+}
+
+/**
+ * The pipe->create_rasterizer_state() driver hook.
+ *
+ * Copies the Gallium rasterizer state into a new CSO, derives a few
+ * summary fields from it and pre-packs the packets that depend only on this
+ * state: 3DSTATE_SF and 3DSTATE_CLIP on gen6+, and 3DSTATE_LINE_STIPPLE on
+ * every generation.
+ *
+ * \return the new CSO, owned by the caller, or NULL if allocation fails.
+ */
+static void *
+crocus_create_rasterizer_state(struct pipe_context *ctx,
+                               const struct pipe_rasterizer_state *state)
+{
+   struct crocus_rasterizer_state *cso =
+      malloc(sizeof(struct crocus_rasterizer_state));
+
+   /* Report allocation failure instead of dereferencing a NULL pointer. */
+   if (!cso)
+      return NULL;
+
+   cso->fill_mode_point_or_line =
+      state->fill_front == PIPE_POLYGON_MODE_LINE ||
+      state->fill_front == PIPE_POLYGON_MODE_POINT ||
+      state->fill_back == PIPE_POLYGON_MODE_LINE ||
+      state->fill_back == PIPE_POLYGON_MODE_POINT;
+
+   /* Index of the highest enabled user clip plane, plus one. */
+   if (state->clip_plane_enable != 0)
+      cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
+   else
+      cso->num_clip_plane_consts = 0;
+
+   cso->cso = *state;
+
+#if GFX_VER >= 6
+   float line_width = get_line_width(state);
+
+   crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
+      sf.StatisticsEnable = true;
+      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+      sf.LineEndCapAntialiasingRegionWidth =
+         state->line_smooth ? _10pixels : _05pixels;
+      sf.LastPixelEnable = state->line_last_pixel;
+      sf.LineWidth = line_width;
+      sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
+      sf.PointWidth = state->point_size;
+
+      if (state->flatshade_first) {
+         sf.TriangleFanProvokingVertexSelect = 1;
+      } else {
+         sf.TriangleStripListProvokingVertexSelect = 2;
+         sf.TriangleFanProvokingVertexSelect = 2;
+         sf.LineStripListProvokingVertexSelect = 1;
+      }
+
+      sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
+      sf.CullMode = translate_cull_mode(state->cull_face);
+
+      sf.ScissorRectangleEnable = true;
+
+#if GFX_VER == 6
+      sf.AttributeSwizzleEnable = true;
+      if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
+      else
+         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
+#endif
+
+      /* Already inside a GFX_VER >= 6 guard, so these need no extra check. */
+      sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
+      sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
+      sf.GlobalDepthOffsetEnablePoint = state->offset_point;
+      sf.GlobalDepthOffsetConstant = state->offset_units * 2;
+      sf.GlobalDepthOffsetScale = state->offset_scale;
+      sf.GlobalDepthOffsetClamp = state->offset_clamp;
+
+      sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
+      sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
+
+#if GFX_VERx10 == 75
+      sf.LineStippleEnable = state->line_stipple_enable;
+#endif
+   }
+#endif
+
+#if GFX_VER >= 6
+   crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
+      /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
+       * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
+       */
+#if GFX_VER >= 7
+      cl.EarlyCullEnable = true;
+#endif
+
+#if GFX_VER == 7
+      cl.FrontWinding = state->front_ccw ? 1 : 0;
+      cl.CullMode = translate_cull_mode(state->cull_face);
+#endif
+      cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
+      cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
+      cl.GuardbandClipTestEnable = true;
+      cl.ClipEnable = true;
+      cl.MinimumPointWidth = 0.125;
+      cl.MaximumPointWidth = 255.875;
+      cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
+
+      if (state->flatshade_first) {
+         cl.TriangleFanProvokingVertexSelect = 1;
+      } else {
+         cl.TriangleStripListProvokingVertexSelect = 2;
+         cl.TriangleFanProvokingVertexSelect = 2;
+         cl.LineStripListProvokingVertexSelect = 1;
+      }
+   }
+#endif
+
+   /* Remap from 0..255 back to 1..256 */
+   const unsigned line_stipple_factor = state->line_stipple_factor + 1;
+
+   /* With stippling disabled the packet keeps its default fields. */
+   crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
+      if (state->line_stipple_enable) {
+         line.LineStipplePattern = state->line_stipple_pattern;
+         line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
+         line.LineStippleRepeatCount = line_stipple_factor;
+      }
+   }
+
+   return cso;
+}
+
+/**
+ * The pipe->bind_rasterizer_state() driver hook.
+ *
+ * Makes \p state the current rasterizer CSO (it may be NULL) and flags the
+ * state that depends on it: selectively for fields that changed relative to
+ * the previously bound CSO, unconditionally for the rest.
+ */
+static void
+crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   /* cso_changed*() compare these two, so the names must stay as they are. */
+   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
+   struct crocus_rasterizer_state *new_cso = state;
+
+   if (new_cso != NULL) {
+      /* Checks shared by every generation. */
+
+      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
+      if (cso_changed_memcmp(line_stipple))
+         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
+
+      if (cso_changed(cso.line_stipple_enable) ||
+          cso_changed(cso.poly_stipple_enable))
+         ice->state.dirty |= CROCUS_DIRTY_WM;
+
+      if (cso_changed(cso.depth_clip_near) ||
+          cso_changed(cso.depth_clip_far) ||
+          cso_changed(cso.clip_halfz))
+         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+
+#if GFX_VER >= 6
+      if (cso_changed(cso.half_pixel_center))
+         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
+
+      if (cso_changed(cso.scissor))
+         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+
+      if (cso_changed(cso.rasterizer_discard))
+         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
+
+      if (cso_changed(cso.flatshade_first))
+         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
+#else
+      /* Before gen6 a scissor toggle flags the SF/CLIP viewport instead. */
+      if (cso_changed(cso.scissor))
+         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+#endif
+
+#if GFX_VER >= 7
+      if (cso_changed(cso.sprite_coord_enable) ||
+          cso_changed(cso.sprite_coord_mode) ||
+          cso_changed(cso.light_twoside))
+         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
+#endif
+
+#if GFX_VER <= 5
+      /* Enabled user clip planes are part of the CURBE contents. */
+      if (cso_changed(cso.clip_plane_enable))
+         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+   }
+
+   ice->state.cso_rast = new_cso;
+
+   /* Flagged on every bind, whatever changed. */
+   ice->state.dirty |= CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
+#if GFX_VER <= 5
+   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG |
+                       CROCUS_DIRTY_GEN4_SF_PROG |
+                       CROCUS_DIRTY_WM;
+#endif
+#if GFX_VER <= 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+#endif
+   ice->state.stage_dirty |=
+      ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
+}
+
+/**
+ * Tell whether a hardware texture coordinate mode samples the border color.
+ *
+ * Only TCM_CLAMP_BORDER does; for any other mode the caller may skip
+ * uploading a border color for the sampler.
+ */
+static bool
+wrap_mode_needs_border_color(unsigned wrap_mode)
+{
+   if (wrap_mode != TCM_CLAMP_BORDER)
+      return false;
+
+   return true;
+}
+
+/**
+ * Gallium CSO for sampler state.
+ *
+ * Filled in by crocus_create_sampler_state() and consumed when the hardware
+ * SAMPLER_STATE is packed at upload time.
+ */
+struct crocus_sampler_state {
+   /* Verbatim copy of the Gallium template the CSO was created from. */
+   struct pipe_sampler_state pstate;
+   /* Copy of the template's border color. */
+   union pipe_color_union border_color;
+   /* True when any of wrap_s/wrap_t/wrap_r is TCM_CLAMP_BORDER. */
+   bool needs_border_color;
+   /* Wrap modes after translate_wrap(), one per texture coordinate. */
+   unsigned wrap_s;
+   unsigned wrap_t;
+   unsigned wrap_r;
+   /* Magnification filter to program; replaced by the min filter when the
+    * template has no mip filter and a positive min_lod.
+    */
+   unsigned mag_img_filter;
+   /* Minimum LOD to program; forced to 0 in that same case. */
+   float min_lod;
+};
+
+/**
+ * The pipe->create_sampler_state() driver hook.
+ *
+ * Builds a CPU-side CSO only: the Gallium template and its border color are
+ * copied, the wrap modes are translated to hardware values, and the min LOD
+ * and mag filter that will later be programmed are precomputed.  Nothing is
+ * uploaded to a GPU buffer object yet, because
+ * 3DSTATE_SAMPLER_STATE_POINTERS requires all bound sampler states to be in
+ * contiguous memory.
+ *
+ * Returns the new CSO, or NULL if the allocation fails.
+ */
+static void *
+crocus_create_sampler_state(struct pipe_context *ctx,
+                            const struct pipe_sampler_state *state)
+{
+   struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
+
+   if (cso == NULL)
+      return NULL;
+
+   /* The Gallium filter enums are programmed into the hardware as-is. */
+   STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
+   STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
+
+   cso->pstate = *state;
+   memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
+
+   const bool uses_nearest =
+      state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
+
+   cso->wrap_s = translate_wrap(state->wrap_s, uses_nearest);
+   cso->wrap_t = translate_wrap(state->wrap_t, uses_nearest);
+   cso->wrap_r = translate_wrap(state->wrap_r, uses_nearest);
+
+   cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
+                             wrap_mode_needs_border_color(cso->wrap_t) ||
+                             wrap_mode_needs_border_color(cso->wrap_r);
+
+   // XXX: explain this code ported from ilo...I don't get it at all...
+   const bool no_mips_positive_min_lod =
+      state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
+      state->min_lod > 0.0f;
+
+   if (no_mips_positive_min_lod) {
+      /* Without a mip filter and with a positive min LOD, program a zero
+       * min LOD and use the minification filter for magnification too.
+       */
+      cso->min_lod = 0.0f;
+      cso->mag_img_filter = state->min_img_filter;
+   } else {
+      cso->min_lod = state->min_lod;
+      cso->mag_img_filter = state->mag_img_filter;
+   }
+
+   return cso;
+}
+
+/**
+ * The pipe->bind_sampler_states() driver hook.
+ *
+ * Stores the given sampler CSOs into slots [start, start + count) of the
+ * shader stage and, if any slot actually changed, flags the sampler-state
+ * and texture-dependent dirty bits for that stage.
+ */
+static void
+crocus_bind_sampler_states(struct pipe_context *ctx,
+                           enum pipe_shader_type p_stage,
+                           unsigned start, unsigned count,
+                           void **states)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   bool changed = false;
+
+   assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
+
+   for (unsigned slot = 0; slot < count; slot++) {
+      if (shs->samplers[start + slot] == states[slot])
+         continue;
+
+      shs->samplers[start + slot] = states[slot];
+      changed = true;
+   }
+
+   /* Rebinding identical samplers requires no re-upload. */
+   if (!changed)
+      return;
+
+#if GFX_VER <= 5
+   if (p_stage == PIPE_SHADER_FRAGMENT)
+      ice->state.dirty |= CROCUS_DIRTY_WM;
+   else if (p_stage == PIPE_SHADER_VERTEX)
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
+#endif
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
+   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
+}
+
+/**
+ * Wrap-mode overrides applied when a sampler state is packed for a
+ * particular texture target (see crocus_upload_sampler_state()).
+ */
+enum samp_workaround {
+   /* Use the CSO's wrap modes unchanged. */
+   SAMP_NORMAL,
+   /* Force all three coordinates to TCM_CLAMP. */
+   SAMP_CUBE_CLAMP,
+   /* Force all three coordinates to TCM_CUBE. */
+   SAMP_CUBE_CUBE,
+   /* Force only the T coordinate to TCM_WRAP. */
+   SAMP_T_WRAP,
+};
+
+/**
+ * Pack one hardware SAMPLER_STATE into map.
+ *
+ * The wrap modes come from the CSO unless samp_workaround overrides them;
+ * filters, anisotropy, LOD range/bias and the shadow function come from the
+ * CSO and its Gallium template.  border_color_offset is written into the
+ * border color pointer (as a relocation against the batch state buffer
+ * before gfx6).  first_level is only consumed on gfx6, where it is
+ * programmed as the base mip level.
+ */
+static void
+crocus_upload_sampler_state(struct crocus_batch *batch,
+                            struct crocus_sampler_state *cso,
+                            uint32_t border_color_offset,
+                            enum samp_workaround samp_workaround,
+                            uint32_t first_level,
+                            void *map)
+{
+   struct pipe_sampler_state *state = &cso->pstate;
+   uint32_t tc_x = cso->wrap_s;
+   uint32_t tc_y = cso->wrap_t;
+   uint32_t tc_z = cso->wrap_r;
+
+   if (samp_workaround == SAMP_CUBE_CLAMP) {
+      tc_x = tc_y = tc_z = TCM_CLAMP;
+   } else if (samp_workaround == SAMP_CUBE_CUBE) {
+      tc_x = tc_y = tc_z = TCM_CUBE;
+   } else if (samp_workaround == SAMP_T_WRAP) {
+      tc_y = TCM_WRAP;
+   }
+
+   const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
+   const bool aniso = state->max_anisotropy >= 2;
+   const bool aniso_min = aniso &&
+      state->min_img_filter == PIPE_TEX_FILTER_LINEAR;
+   const bool aniso_mag = aniso &&
+      state->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
+
+   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
+      samp.TCXAddressControlMode = tc_x;
+      samp.TCYAddressControlMode = tc_y;
+      samp.TCZAddressControlMode = tc_z;
+
+#if GFX_VER >= 6
+      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
+#endif
+      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
+
+      /* Linear filters are upgraded to anisotropic when anisotropy is on. */
+      if (aniso_min) {
+         samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
+#if GFX_VER >= 7
+         samp.AnisotropicAlgorithm = EWAApproximation;
+#endif
+      } else {
+         samp.MinModeFilter = state->min_img_filter;
+      }
+
+      if (aniso_mag)
+         samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
+      else
+         samp.MagModeFilter = cso->mag_img_filter;
+
+      if (aniso) {
+         samp.MaximumAnisotropy =
+            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
+      } else {
+         samp.MaximumAnisotropy = RATIO21;
+      }
+
+      /* Address rounding is only enabled for non-nearest filtering. */
+      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
+         samp.UAddressMinFilterRoundingEnable = true;
+         samp.VAddressMinFilterRoundingEnable = true;
+         samp.RAddressMinFilterRoundingEnable = true;
+      }
+
+      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
+         samp.UAddressMagFilterRoundingEnable = true;
+         samp.VAddressMagFilterRoundingEnable = true;
+         samp.RAddressMagFilterRoundingEnable = true;
+      }
+
+      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+         samp.ShadowFunction = translate_shadow_func(state->compare_func);
+
+      samp.LODPreClampEnable = true;
+      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
+      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
+      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
+
+#if GFX_VER == 6
+      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
+      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
+#endif
+
+#if GFX_VER < 6
+      samp.BorderColorPointer =
+         ro_bo(batch->state.bo, border_color_offset);
+#else
+      samp.BorderColorPointer = border_color_offset;
+#endif
+   }
+}
+
+/**
+ * Pack a SAMPLER_BORDER_COLOR_STATE for a sampler and report its offset.
+ *
+ * The border color is taken from the CSO, optionally re-swizzled for
+ * alpha and luminance-alpha textures, and then written in the layout the
+ * hardware generation expects (typed integer channels on Haswell, the
+ * multi-format table on gfx5/6, plain floats otherwise).
+ *
+ * batch      batch whose state stream receives the packed structure
+ * cso        sampler CSO providing the border color
+ * tex        sampler view the sampler is paired with; may be NULL, in which
+ *            case no swizzling is applied and the format is treated as
+ *            PIPE_FORMAT_NONE
+ * bc_offset  handed to stream_state(), which presumably stores the offset
+ *            of the allocation there
+ */
+static void
+crocus_upload_border_color(struct crocus_batch *batch,
+                           struct crocus_sampler_state *cso,
+                           struct crocus_sampler_view *tex,
+                           uint32_t *bc_offset)
+{
+   /* We may need to swizzle the border color for format faking.
+    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
+    * This means we need to move the border color's A channel into
+    * the R or G channels so that those read swizzles will move it
+    * back into A.
+    *
+    * NOTE(review): the swizzle tables below are 000W and XXXW, which look
+    * like they keep alpha in the W slot rather than moving it into R/G as
+    * the paragraph above describes -- confirm which one is intended.
+    */
+   enum pipe_format internal_format = PIPE_FORMAT_NONE;
+   union pipe_color_union *color = &cso->border_color;
+   union pipe_color_union tmp;
+   if (tex) {
+      internal_format = tex->res->internal_format;
+
+      if (util_format_is_alpha(internal_format)) {
+         unsigned char swz[4] = {
+            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
+            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
+         };
+         util_format_apply_color_swizzle(&tmp, color, swz, true);
+         color = &tmp;
+      } else if (util_format_is_luminance_alpha(internal_format) &&
+                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
+         unsigned char swz[4] = {
+            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
+            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
+         };
+         util_format_apply_color_swizzle(&tmp, color, swz, true);
+         color = &tmp;
+      }
+   }
+   bool is_integer_format = util_format_is_pure_integer(internal_format);
+   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
+   /* Haswell integer border colors use a much larger alignment. */
+   const int sbc_align = (GFX_VERx10 == 75 && is_integer_format) ? 512 : 32;
+   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);
+
+   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
+
+   /* Function-local helper macros; all of them are #undef'd again at the
+    * end of the function so they do not leak into the rest of the file.
+    */
+#define ASSIGN(dst, src)                        \
+   do {                                         \
+      dst = src;                                \
+   } while (0)
+
+#define ASSIGNu16(dst, src)                     \
+   do {                                         \
+      dst = (uint16_t)src;                      \
+   } while (0)
+
+#define ASSIGNu8(dst, src)                      \
+   do {                                         \
+      dst = (uint8_t)src;                       \
+   } while (0)
+
+   /* Apply "macro" to the Red/Green/Blue/Alpha fields of one color type. */
+#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
+   macro(state.BorderColor ## _color_type ## Red, src[0]);      \
+   macro(state.BorderColor ## _color_type ## Green, src[1]);    \
+   macro(state.BorderColor ## _color_type ## Blue, src[2]);     \
+   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
+
+#if GFX_VERx10 == 75
+   if (is_integer_format) {
+      const struct util_format_description *format_desc =
+         util_format_description(internal_format);
+
+      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
+       * "If any color channel is missing from the surface format,
+       *  corresponding border color should be programmed as zero and if
+       *  alpha channel is missing, corresponding Alpha border color should
+       *  be programmed as 1."
+       */
+      unsigned c[4] = { 0, 0, 0, 1 };
+      for (int i = 0; i < 4; i++) {
+         if (format_desc->channel[i].size)
+            c[i] = color->ui[i];
+      }
+
+      switch (format_desc->channel[0].size) {
+      case 8:
+         /* Copy RGBA in order. */
+         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
+         break;
+      case 10:
+         /* R10G10B10A2_UINT is treated like a 16-bit format. */
+      case 16:
+         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
+         break;
+      case 32:
+         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
+            /* Careful inspection of the tables reveals that for RG32 formats,
+             * the green channel needs to go where blue normally belongs.
+             */
+            state.BorderColor32bitRed = c[0];
+            state.BorderColor32bitBlue = c[1];
+            state.BorderColor32bitAlpha = 1;
+         } else {
+            /* Copy RGBA in order. */
+            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
+         }
+         break;
+      default:
+         assert(!"Invalid number of bits per channel in integer format.");
+         break;
+      }
+   } else {
+      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+   }
+#elif GFX_VER == 5 || GFX_VER == 6
+   /* gfx5/6 carry the color in several encodings at once; fill each one. */
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
+   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);
+
+#define MESA_FLOAT_TO_HALF(dst, src)            \
+   dst = _mesa_float_to_half(src);
+
+   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);
+
+#undef MESA_FLOAT_TO_HALF
+
+   /* The 8-bit SNORM values are derived from the 16-bit ones. */
+   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
+   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
+   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
+   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
+
+   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+
+#elif GFX_VER == 4
+   BORDER_COLOR_ATTR(ASSIGN, , color->f);
+#else
+   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
+#endif
+
+#undef ASSIGN
+#undef ASSIGNu16
+#undef ASSIGNu8
+#undef BORDER_COLOR_ATTR
+
+   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
+}
+
+/**
+ * Upload the sampler states into a contiguous area of GPU memory, for
+ * 3DSTATE_SAMPLER_STATE_POINTERS_*.
+ *
+ * One SAMPLER_STATE is emitted per texture slot the bound shader uses;
+ * slots lacking either a sampler or a texture are zero-filled.  Border
+ * colors are uploaded on demand and their offsets are packed into the
+ * corresponding sampler state.
+ */
+static void
+crocus_upload_sampler_states(struct crocus_context *ice,
+                             struct crocus_batch *batch, gl_shader_stage stage)
+{
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+   /* We assume the state tracker will call pipe->bind_sampler_states()
+    * if the program's number of textures changes.
+    */
+   unsigned count = 0;
+   if (info)
+      count = BITSET_LAST_BIT(info->textures_used);
+
+   if (count == 0)
+      return;
+
+   /* All SAMPLER_STATEs go into one contiguous table so that a single
+    * 3DSTATE_SAMPLER_STATE_POINTERS_* command can reference them.
+    */
+   const unsigned entry_bytes = 4 * GENX(SAMPLER_STATE_length);
+   uint32_t *map =
+      stream_state(batch, count * entry_bytes, 32, &shs->sampler_offset);
+
+   if (unlikely(!map))
+      return;
+
+   for (unsigned i = 0; i < count; i++, map += GENX(SAMPLER_STATE_length)) {
+      struct crocus_sampler_state *state = shs->samplers[i];
+      struct crocus_sampler_view *tex = shs->textures[i];
+
+      if (!state || !tex) {
+         memset(map, 0, entry_bytes);
+         continue;
+      }
+
+      unsigned border_color_offset = 0;
+      if (state->needs_border_color)
+         crocus_upload_border_color(batch, state, tex, &border_color_offset);
+
+      enum samp_workaround wa = SAMP_NORMAL;
+      switch (tex->base.target) {
+      case PIPE_TEXTURE_1D:
+         /* There's a bug in 1D texture sampling - it actually pays
+          * attention to the wrap_t value, though it should not.
+          * Override the wrap_t value here to GL_REPEAT to keep
+          * any nonexistent border pixels from floating in.
+          */
+         wa = SAMP_T_WRAP;
+         break;
+      case PIPE_TEXTURE_CUBE:
+      case PIPE_TEXTURE_CUBE_ARRAY: {
+         /* Cube maps must use the same wrap mode for all three coordinate
+          * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
+          *
+          * Ivybridge and Baytrail seem to have problems with CUBE mode and
+          * integer formats.  Fall back to CLAMP for now.
+          */
+         const bool use_cube_mode =
+            state->pstate.seamless_cube_map &&
+            !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format));
+         wa = use_cube_mode ? SAMP_CUBE_CUBE : SAMP_CUBE_CLAMP;
+         break;
+      }
+      default:
+         break;
+      }
+
+      /* Buffer views have no mip levels to take a first level from. */
+      const uint32_t first_level =
+         tex->base.target != PIPE_BUFFER ? tex->base.u.tex.first_level : 0;
+
+      crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
+   }
+}
+
+/**
+ * The pipe->create_sampler_view() driver hook.
+ *
+ * Allocates a crocus_sampler_view for tex based on the template tmpl:
+ * takes a reference on the texture, selects the underlying resource for
+ * depth/stencil formats, combines the format's swizzle with the view's
+ * swizzle, and fills in the ISL view (plus, on gfx6+, a second view used
+ * for texture gather).
+ *
+ * Returns the new view's base object, or NULL if the allocation fails.
+ */
+static struct pipe_sampler_view *
+crocus_create_sampler_view(struct pipe_context *ctx,
+                           struct pipe_resource *tex,
+                           const struct pipe_sampler_view *tmpl)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
+
+   if (!isv)
+      return NULL;
+
+   /* initialize base object */
+   isv->base = *tmpl;
+   isv->base.context = ctx;
+   /* Clear the pointer copied from the template before taking our own
+    * reference on tex.
+    */
+   isv->base.texture = NULL;
+   pipe_reference_init(&isv->base.reference, 1);
+   pipe_resource_reference(&isv->base.texture, tex);
+
+   if (util_format_is_depth_or_stencil(tmpl->format)) {
+      struct crocus_resource *zres, *sres;
+      const struct util_format_description *desc =
+         util_format_description(tmpl->format);
+
+      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
+
+      /* Sample from the depth resource when the view format has depth,
+       * otherwise from the stencil resource.
+       */
+      tex = util_format_has_depth(desc) ? &zres->base : &sres->base;
+
+      /* On gfx7, stencil is sampled through the shadow copy when one exists. */
+      if (tex->format == PIPE_FORMAT_S8_UINT)
+         if (devinfo->ver == 7 && sres->shadow)
+            tex = &sres->shadow->base;
+   }
+
+   isv->res = (struct crocus_resource *) tex;
+
+   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
+
+   if (isv->base.target == PIPE_TEXTURE_CUBE ||
+       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
+      usage |= ISL_SURF_USAGE_CUBE_BIT;
+
+   const struct crocus_format_info fmt =
+      crocus_format_for_usage(devinfo, tmpl->format, usage);
+
+   /* Combine the format's swizzle with the swizzle requested by the view. */
+   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
+   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
+
+   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
+   if (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
+       tmpl->format == PIPE_FORMAT_X24S8_UINT) {
+      isv->swizzle[0] = tmpl->swizzle_g;
+      isv->swizzle[1] = tmpl->swizzle_g;
+      isv->swizzle[2] = tmpl->swizzle_g;
+      isv->swizzle[3] = tmpl->swizzle_g;
+   }
+
+   isv->clear_color = isv->res->aux.clear_color;
+
+   isv->view = (struct isl_view) {
+      .format = fmt.fmt,
+#if GFX_VERx10 >= 75
+      .swizzle = (struct isl_swizzle) {
+         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
+         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
+         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
+         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
+      },
+#else
+      /* swizzling handled in shader code */
+      .swizzle = ISL_SWIZZLE_IDENTITY,
+#endif
+      .usage = usage,
+   };
+
+   /* Texture views additionally select a mip level and array layer range. */
+   if (tmpl->target != PIPE_BUFFER) {
+      isv->view.base_level = tmpl->u.tex.first_level;
+      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
+      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
+      isv->view.base_array_layer = tmpl->u.tex.first_layer;
+      isv->view.array_len =
+         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
+   }
+#if GFX_VER >= 6
+   /* just create a second view struct for texture gather just in case */
+   isv->gather_view = isv->view;
+
+#if GFX_VER >= 7
+   /* Two-channel 32-bit formats use the dedicated _LD format for gather. */
+   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
+       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
+       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
+      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
+#if GFX_VERx10 >= 75
+      isv->gather_view.swizzle = (struct isl_swizzle) {
+         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
+         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
+         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
+         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
+      };
+#endif
+   }
+#endif
+#if GFX_VER == 6
+   /* Sandybridge's gather4 message is broken for integer formats.
+    * To work around this, we pretend the surface is UNORM for
+    * 8 or 16-bit formats, and emit shader instructions to recover
+    * the real INT/UINT value.  For 32-bit formats, we pretend
+    * the surface is FLOAT, and simply reinterpret the resulting
+    * bits.
+    */
+   switch (fmt.fmt) {
+   case ISL_FORMAT_R8_SINT:
+   case ISL_FORMAT_R8_UINT:
+      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
+      break;
+
+   case ISL_FORMAT_R16_SINT:
+   case ISL_FORMAT_R16_UINT:
+      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
+      break;
+
+   case ISL_FORMAT_R32_SINT:
+   case ISL_FORMAT_R32_UINT:
+      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
+      break;
+
+   default:
+      break;
+   }
+#endif
+#endif
+   /* For texture views, finish any aux import that is still pending on the
+    * underlying resource.  No SURFACE_STATE is packed in this function.
+    */
+   if (tmpl->target != PIPE_BUFFER) {
+      if (crocus_resource_unfinished_aux_import(isv->res))
+         crocus_resource_finish_aux_import(&screen->base, isv->res);
+
+   }
+
+   return &isv->base;
+}
+
+/**
+ * Destroy a sampler view: drop the reference it holds on its texture and
+ * release the view's memory.
+ */
+static void
+crocus_sampler_view_destroy(struct pipe_context *ctx,
+                            struct pipe_sampler_view *state)
+{
+   struct crocus_sampler_view *view = (struct crocus_sampler_view *) state;
+
+   pipe_resource_reference(&state->texture, NULL);
+   free(view);
+}
+
+/**
+ * The pipe->create_surface() driver hook.
+ *
+ * In Gallium nomenclature, "surfaces" are a view of a resource that
+ * can be bound as a render target or depth/stencil buffer.
+ *
+ * ctx   context the surface belongs to
+ * tex   resource to create a view of; a reference is taken on it
+ * tmpl  template providing the format, level and layer range
+ *
+ * Returns the new surface, or NULL when the format cannot be rendered to,
+ * an allocation fails, or the resource is compressed (not supported yet).
+ * Every failure path releases the partially constructed surface and the
+ * texture reference it took.
+ */
+static struct pipe_surface *
+crocus_create_surface(struct pipe_context *ctx,
+                      struct pipe_resource *tex,
+                      const struct pipe_surface *tmpl)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+
+   isl_surf_usage_flags_t usage = 0;
+   if (tmpl->writable)
+      usage = ISL_SURF_USAGE_STORAGE_BIT;
+   else if (util_format_is_depth_or_stencil(tmpl->format))
+      usage = ISL_SURF_USAGE_DEPTH_BIT;
+   else
+      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
+
+   const struct crocus_format_info fmt =
+      crocus_format_for_usage(devinfo, tmpl->format, usage);
+
+   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
+       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
+      /* Framebuffer validation will reject this invalid case, but it
+       * hasn't had the opportunity yet.  In the meantime, we need to
+       * avoid hitting ISL asserts about unsupported formats below.
+       */
+      return NULL;
+   }
+
+   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
+   if (!surf)
+      return NULL;
+
+   /* Only derive pointers from surf once it is known to be valid. */
+   struct pipe_surface *psurf = &surf->base;
+   struct crocus_resource *res = (struct crocus_resource *) tex;
+
+   pipe_reference_init(&psurf->reference, 1);
+   /* Sets psurf->texture and takes a reference on tex. */
+   pipe_resource_reference(&psurf->texture, tex);
+   psurf->context = ctx;
+   psurf->format = tmpl->format;
+   psurf->width = tex->width0;
+   psurf->height = tex->height0;
+   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
+   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
+   psurf->u.tex.level = tmpl->u.tex.level;
+
+   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
+
+   struct isl_view *view = &surf->view;
+   *view = (struct isl_view) {
+      .format = fmt.fmt,
+      .base_level = tmpl->u.tex.level,
+      .levels = 1,
+      .base_array_layer = tmpl->u.tex.first_layer,
+      .array_len = array_len,
+      .swizzle = ISL_SWIZZLE_IDENTITY,
+      .usage = usage,
+   };
+
+#if GFX_VER >= 6
+   /* Same view, but with texture usage, for reading the surface back. */
+   struct isl_view *read_view = &surf->read_view;
+   *read_view = (struct isl_view) {
+      .format = fmt.fmt,
+      .base_level = tmpl->u.tex.level,
+      .levels = 1,
+      .base_array_layer = tmpl->u.tex.first_layer,
+      .array_len = array_len,
+      .swizzle = ISL_SWIZZLE_IDENTITY,
+      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
+   };
+#endif
+
+   surf->clear_color = res->aux.clear_color;
+
+   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
+   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
+                          ISL_SURF_USAGE_STENCIL_BIT))
+      return psurf;
+
+   if (!isl_format_is_compressed(res->surf.format)) {
+      if (crocus_resource_unfinished_aux_import(res))
+         crocus_resource_finish_aux_import(&screen->base, res);
+
+      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
+      uint32_t temp_offset, temp_x, temp_y;
+
+      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
+                                          res->base.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
+                                          res->base.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
+                                          &temp_offset, &temp_x, &temp_y);
+      if (!devinfo->has_surface_tile_offset &&
+          (temp_x || temp_y)) {
+         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
+          * destination.  Render to a separate single-level, single-layer
+          * 2D resource instead and point the view at it.
+          */
+         struct pipe_resource wa_templ = (struct pipe_resource) {
+            .width0 = u_minify(res->base.width0, tmpl->u.tex.level),
+            .height0 = u_minify(res->base.height0, tmpl->u.tex.level),
+            .depth0 = 1,
+            .array_size = 1,
+            .format = res->base.format,
+            .target = PIPE_TEXTURE_2D,
+            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
+         };
+         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
+         /* Without the temporary resource the surface is unusable. */
+         if (!surf->align_res)
+            goto fail;
+
+         view->base_level = 0;
+         view->base_array_layer = 0;
+         view->array_len = 1;
+         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
+         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
+      }
+      return psurf;
+   }
+
+   /* The resource has a compressed format, which is not renderable, but we
+    * have a renderable view format.  We must be attempting to upload blocks
+    * of compressed data via an uncompressed view.
+    *
+    * In this case, we can assume there are no auxiliary buffers, a single
+    * miplevel, and that the resource is single-sampled.  Gallium may try
+    * and create an uncompressed view with multiple layers, however.
+    */
+   assert(!isl_format_is_compressed(fmt.fmt));
+   assert(res->surf.samples == 1);
+   assert(view->levels == 1);
+
+   /* TODO: compressed pbo uploads aren't working here.  Fail the creation,
+    * releasing the surface instead of leaking it.  Everything between this
+    * jump and the final "return psurf" is currently unreachable and only
+    * kept for when the path gets enabled.
+    */
+   goto fail;
+
+   uint32_t offset_B = 0, tile_x_sa = 0, tile_y_sa = 0;
+
+   if (view->base_level > 0) {
+      /* We can't rely on the hardware's miplevel selection with such
+       * a substantial lie about the format, so we select a single image
+       * using the Tile X/Y Offset fields.  In this case, we can't handle
+       * multiple array slices.
+       *
+       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
+       * hard-coded to align to exactly the block size of the compressed
+       * texture.  This means that, when reinterpreted as a non-compressed
+       * texture, the tile offsets may be anything and we can't rely on
+       * X/Y Offset.
+       *
+       * Fail to force the state tracker to take fallback paths.
+       */
+      // TODO: check if the gen7 check is right, originally gen8
+      if (view->array_len > 1 || GFX_VER == 7)
+         goto fail;
+
+      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
+      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
+                              view->base_level,
+                              is_3d ? 0 : view->base_array_layer,
+                              is_3d ? view->base_array_layer : 0,
+                              &surf->surf,
+                              &offset_B, &tile_x_sa, &tile_y_sa);
+
+      /* We use address and tile offsets to access a single level/layer
+       * as a subimage, so reset level/layer so it doesn't offset again.
+       */
+      view->base_array_layer = 0;
+      view->base_level = 0;
+   } else {
+      /* Level 0 doesn't require tile offsets, and the hardware can find
+       * array slices using QPitch even with the format override, so we
+       * can allow layers in this case.  Copy the original ISL surface.
+       */
+      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
+   }
+
+   /* Scale down the image dimensions by the block size. */
+   const struct isl_format_layout *fmtl =
+      isl_format_get_layout(res->surf.format);
+   surf->surf.format = fmt.fmt;
+   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
+   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
+   tile_x_sa /= fmtl->bw;
+   tile_y_sa /= fmtl->bh;
+
+   psurf->width = surf->surf.logical_level0_px.width;
+   psurf->height = surf->surf.logical_level0_px.height;
+
+   return psurf;
+
+fail:
+   /* Undo everything done since the allocation: drop the resource
+    * references held by the surface, then free it.
+    */
+   pipe_resource_reference(&surf->align_res, NULL);
+   pipe_resource_reference(&psurf->texture, NULL);
+   free(surf);
+   return NULL;
+}
+
+#if GFX_VER == 7
+/**
+ * Reset an image param block to its defaults: everything zeroed, with the
+ * two swizzling shifts set to all-ones.
+ */
+static void
+fill_default_image_param(struct brw_image_param *param)
+{
+   memset(param, 0, sizeof(*param));
+
+   /* All-ones swizzling shifts effectively disable swizzling -- see
+    * emit_address_calculation() in brw_fs_surface_builder.cpp for a more
+    * detailed explanation of these parameters.
+    */
+   param->swizzling[0] = param->swizzling[1] = 0xff;
+}
+
+/**
+ * Fill an image param block for a buffer image: defaults plus the element
+ * count (size in bytes divided by the format's block size) and the
+ * per-element stride (the block size).
+ */
+static void
+fill_buffer_image_param(struct brw_image_param *param,
+                        enum pipe_format pfmt,
+                        unsigned size)
+{
+   fill_default_image_param(param);
+
+   const unsigned bytes_per_element = util_format_get_blocksize(pfmt);
+
+   param->stride[0] = bytes_per_element;
+   param->size[0] = size / bytes_per_element;
+}
+
+#endif
+
+/**
+ * The pipe->set_shader_images() driver hook.
+ *
+ * Binds \p count image views to consecutive slots of \p p_stage starting at
+ * \p start_slot, then unbinds the \p unbind_num_trailing_slots slots that
+ * follow them.  A NULL \p p_images, or an entry without a resource, unbinds
+ * the corresponding slot.  Each touched slot's brw_image_param is refreshed
+ * and the stage's system values are flagged for re-upload.
+ *
+ * The whole body is compiled only for GFX_VER == 7; on other generations
+ * this hook does nothing.
+ */
+static void
+crocus_set_shader_images(struct pipe_context *ctx,
+                         enum pipe_shader_type p_stage,
+                         unsigned start_slot, unsigned count,
+                         unsigned unbind_num_trailing_slots,
+                         const struct pipe_image_view *p_images)
+{
+#if GFX_VER == 7
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   struct crocus_genx_state *genx = ice->state.genx;
+   struct brw_image_param *image_params = genx->shaders[stage].image_param;
+
+   /* Every slot we touch (including the trailing ones) starts out unbound;
+    * the slots that receive a resource are re-marked below.
+    */
+   shs->bound_image_views &=
+      ~u_bit_consecutive(start_slot, count + unbind_num_trailing_slots);
+
+   for (unsigned i = 0; i < count; i++) {
+      struct crocus_image_view *iv = &shs->image[start_slot + i];
+
+      if (p_images && p_images[i].resource) {
+         const struct pipe_image_view *img = &p_images[i];
+         struct crocus_resource *res = (void *) img->resource;
+
+         util_copy_image_view(&iv->base, img);
+
+         shs->bound_image_views |= 1u << (start_slot + i);
+
+         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
+         res->bind_stages |= 1 << stage;
+
+         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
+         struct crocus_format_info fmt =
+            crocus_format_for_usage(devinfo, img->format, usage);
+
+         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
+         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
+            /* Try to use typed surface reads (which support a limited
+             * number of formats), and if not possible, fall back to
+             * untyped (RAW) reads.
+             */
+            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
+               fmt.fmt = ISL_FORMAT_RAW;
+            else
+               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
+         }
+
+         if (res->base.target != PIPE_BUFFER) {
+            struct isl_view view = {
+               .format = fmt.fmt,
+               .base_level = img->u.tex.level,
+               .levels = 1,
+               .base_array_layer = img->u.tex.first_layer,
+               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
+               .swizzle = swiz,
+               .usage = usage,
+            };
+
+            iv->view = view;
+
+            isl_surf_fill_image_param(&screen->isl_dev,
+                                      &image_params[start_slot + i],
+                                      &res->surf, &view);
+         } else {
+            struct isl_view view = {
+               .format = fmt.fmt,
+               .swizzle = swiz,
+               .usage = usage,
+            };
+            iv->view = view;
+
+            util_range_add(&res->base, &res->valid_buffer_range, img->u.buf.offset,
+                           img->u.buf.offset + img->u.buf.size);
+            fill_buffer_image_param(&image_params[start_slot + i],
+                                    img->format, img->u.buf.size);
+         }
+      } else {
+         pipe_resource_reference(&iv->base.resource, NULL);
+         fill_default_image_param(&image_params[start_slot + i]);
+      }
+   }
+
+   /* Release the slots the caller asked us to unbind after the new range. */
+   for (unsigned i = count; i < count + unbind_num_trailing_slots; i++) {
+      struct crocus_image_view *iv = &shs->image[start_slot + i];
+
+      pipe_resource_reference(&iv->base.resource, NULL);
+      fill_default_image_param(&image_params[start_slot + i]);
+   }
+
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+   ice->state.dirty |=
+      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
+                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+   /* The brw_image_params are delivered as system values, so those need to
+    * be re-uploaded as well.
+    */
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
+   shs->sysvals_need_upload = true;
+#endif
+}
+
+
+/**
+ * The pipe->set_sampler_views() driver hook.
+ *
+ * Binds \p count sampler views to consecutive slots of \p p_stage starting
+ * at \p start, then unbinds the \p unbind_num_trailing_slots slots that
+ * follow them.  A NULL \p views array, or a NULL entry, unbinds the
+ * corresponding slot.
+ */
+static void
+crocus_set_sampler_views(struct pipe_context *ctx,
+                         enum pipe_shader_type p_stage,
+                         unsigned start, unsigned count,
+                         unsigned unbind_num_trailing_slots,
+                         struct pipe_sampler_view **views)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+   /* Every slot we touch (including the trailing ones) starts out unbound;
+    * the slots that receive a view are re-marked below.
+    */
+   shs->bound_sampler_views &=
+      ~u_bit_consecutive(start, count + unbind_num_trailing_slots);
+
+   for (unsigned i = 0; i < count; i++) {
+      struct pipe_sampler_view *pview = views ? views[i] : NULL;
+      pipe_sampler_view_reference((struct pipe_sampler_view **)
+                                  &shs->textures[start + i], pview);
+      struct crocus_sampler_view *view = (void *) pview;
+      if (view) {
+         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
+         view->res->bind_stages |= 1 << stage;
+
+         shs->bound_sampler_views |= 1u << (start + i);
+      }
+   }
+
+   /* Drop the references held by the slots following the new range. */
+   for (unsigned i = count; i < count + unbind_num_trailing_slots; i++) {
+      pipe_sampler_view_reference((struct pipe_sampler_view **)
+                                  &shs->textures[start + i], NULL);
+   }
+
+   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
+   ice->state.dirty |=
+      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
+                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
+}
+
+/**
+ * The pipe->set_tess_state() driver hook.
+ *
+ * Records the default outer (4 floats) and inner (2 floats) tessellation
+ * levels and flags the TCS constants and system values for re-upload.
+ */
+static void
+crocus_set_tess_state(struct pipe_context *ctx,
+                      const float default_outer_level[4],
+                      const float default_inner_level[2])
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_shader_state *tcs_state =
+      &ice->state.shaders[MESA_SHADER_TESS_CTRL];
+
+   memcpy(ice->state.default_outer_level, default_outer_level,
+          sizeof(float) * 4);
+   memcpy(ice->state.default_inner_level, default_inner_level,
+          sizeof(float) * 2);
+
+   tcs_state->sysvals_need_upload = true;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+}
+
+/**
+ * Destroy a surface: drop its references on the texture and on align_res,
+ * then free the surface structure itself.
+ */
+static void
+crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
+{
+   struct crocus_surface *csurf = (struct crocus_surface *) p_surf;
+
+   pipe_resource_reference(&p_surf->texture, NULL);
+   pipe_resource_reference(&csurf->align_res, NULL);
+
+   free(csurf);
+}
+
+/**
+ * The pipe->set_clip_state() driver hook.
+ *
+ * Stores the clip state and flags the VS, GS and TES constants and system
+ * values for re-upload (plus the CURBE on GFX_VER <= 5).
+ */
+static void
+crocus_set_clip_state(struct pipe_context *ctx,
+                      const struct pipe_clip_state *state)
+{
+   static const gl_shader_stage clip_stages[] = {
+      MESA_SHADER_VERTEX,
+      MESA_SHADER_GEOMETRY,
+      MESA_SHADER_TESS_EVAL,
+   };
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   memcpy(&ice->state.clip_planes, state, sizeof(*state));
+
+#if GFX_VER <= 5
+   ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+
+   for (unsigned s = 0; s < sizeof(clip_stages) / sizeof(clip_stages[0]); s++)
+      ice->state.shaders[clip_stages[s]].sysvals_need_upload = true;
+}
+
+/**
+ * The pipe->set_polygon_stipple() driver hook.
+ *
+ * Copies the stipple pattern into the context and marks it dirty.
+ */
+static void
+crocus_set_polygon_stipple(struct pipe_context *ctx,
+                           const struct pipe_poly_stipple *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
+   memcpy(&ice->state.poly_stipple, state, sizeof(*state));
+}
+
+/**
+ * The pipe->set_sample_mask() driver hook.
+ *
+ * Stores the low 8 bits of \p sample_mask and marks the sample mask dirty.
+ */
+static void
+crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   /* Only 8 bits of sample mask are kept (presumably because these
+    * generations top out at 8x MSAA -- TODO confirm).  st/mesa may pass
+    * us 0xffffffff though, meaning "enable all samples".
+    */
+   ice->state.sample_mask = sample_mask & 0xff;
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
+}
+
+/**
+ * Compute the effective scissor rectangle for viewport/scissor slot \p idx.
+ *
+ * The rectangle starts as the viewport extents clamped to the framebuffer
+ * (with inclusive maximums), and is further intersected with the stored
+ * scissor for the slot when the bound rasterizer state enables scissoring.
+ * The result is written to \p ss.
+ */
+static void
+crocus_fill_scissor_rect(struct crocus_context *ice,
+                         int idx,
+                         struct pipe_scissor_state *ss)
+{
+   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
+
+   /* Viewport bounds, clamped to [0, framebuffer size). */
+   struct pipe_scissor_state rect = (struct pipe_scissor_state) {
+      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
+      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
+      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
+      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
+   };
+
+   if (!cso_state->scissor) {
+      *ss = rect;
+      return;
+   }
+
+   const struct pipe_scissor_state *user = &ice->state.scissors[idx];
+
+   rect.minx = MAX2(rect.minx, user->minx);
+   rect.miny = MAX2(rect.miny, user->miny);
+   rect.maxx = MIN2(rect.maxx, user->maxx);
+   rect.maxy = MIN2(rect.maxy, user->maxy);
+
+   *ss = rect;
+}
+
+/**
+ * The pipe->set_scissor_states() driver hook.
+ *
+ * Stores \p num_scissors rectangles starting at \p start_slot.  The stored
+ * form uses inclusive maximums (incoming max - 1); zero-area rectangles are
+ * stored as a min > max rectangle instead.
+ */
+static void
+crocus_set_scissor_states(struct pipe_context *ctx,
+                          unsigned start_slot,
+                          unsigned num_scissors,
+                          const struct pipe_scissor_state *rects)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   for (unsigned i = 0; i < num_scissors; i++) {
+      const struct pipe_scissor_state *src = &rects[i];
+      struct pipe_scissor_state *dst = &ice->state.scissors[start_slot + i];
+
+      if (src->minx != src->maxx && src->miny != src->maxy) {
+         *dst = (struct pipe_scissor_state) {
+            .minx = src->minx,     .miny = src->miny,
+            .maxx = src->maxx - 1, .maxy = src->maxy - 1,
+         };
+         continue;
+      }
+
+      /* A scissor that was out of bounds and got clamped to zero
+       * width/height would turn negative once 1 is subtracted from its
+       * maximums, and then clip nothing.  Store a min > max rectangle
+       * inside the bounds instead, which yields the expected "no
+       * rendering" result.
+       */
+      *dst = (struct pipe_scissor_state) {
+         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
+      };
+   }
+
+#if GFX_VER >= 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#else
+   ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
+#endif
+   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+}
+
+/**
+ * The pipe->set_stencil_ref() driver hook.
+ *
+ * Stores the stencil reference values in the context and flags the
+ * color-calc state as dirty so they get re-emitted.
+ */
+static void
+crocus_set_stencil_ref(struct pipe_context *ctx,
+                       const struct pipe_stencil_ref ref)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+   ice->state.stencil_ref = ref;
+}
+
+/**
+ * The pipe->set_viewport_states() driver hook.
+ *
+ * Copies \p count viewports into the context starting at \p start_slot and
+ * flags the state derived from them.  The CC viewport is additionally
+ * flagged when a rasterizer state is bound that has near or far depth
+ * clipping disabled.
+ */
+static void
+crocus_set_viewport_states(struct pipe_context *ctx,
+                           unsigned start_slot,
+                           unsigned count,
+                           const struct pipe_viewport_state *states)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   memcpy(&ice->state.viewports[start_slot], states, count * sizeof(*states));
+
+   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT | CROCUS_DIRTY_RASTER;
+#if GFX_VER >= 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#endif
+
+   if (!ice->state.cso_rast)
+      return;
+
+   if (!(ice->state.cso_rast->cso.depth_clip_near &&
+         ice->state.cso_rast->cso.depth_clip_far))
+      ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
+}
+
+/**
+ * The pipe->set_framebuffer_state() driver hook.
+ *
+ * Sets the current draw FBO, including color render targets, depth,
+ * and stencil buffers.
+ *
+ * The incoming \p state is first compared against the currently stored
+ * framebuffer to decide which pieces of derived state must be flagged
+ * dirty; only then is it copied into the context.  The comparisons must
+ * therefore stay ahead of util_copy_framebuffer_state().
+ */
+static void
+crocus_set_framebuffer_state(struct pipe_context *ctx,
+                             const struct pipe_framebuffer_state *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   /* cso is the framebuffer state currently stored in the context. */
+   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+#if 0
+   struct isl_device *isl_dev = &screen->isl_dev;
+   struct crocus_resource *zres;
+   struct crocus_resource *stencil_res;
+#endif
+
+   unsigned samples = util_framebuffer_get_num_samples(state);
+   unsigned layers = util_framebuffer_get_num_layers(state);
+
+#if GFX_VER >= 6
+   /* A change in sample count affects multisample, sample mask and raster
+    * state (and the fragment shader stage when GFX_VERx10 == 75).
+    */
+   if (cso->samples != samples) {
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
+      ice->state.dirty |= CROCUS_DIRTY_RASTER;
+#if GFX_VERx10 == 75
+      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
+#endif
+   }
+#endif
+
+#if GFX_VER >= 6
+   /* Blend state is flagged when the number of color buffers changes. */
+   if (cso->nr_cbufs != state->nr_cbufs) {
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
+   }
+#endif
+
+   /* Clip state only cares whether the framebuffer is layered at all. */
+   if ((cso->layers == 0) != (layers == 0)) {
+      ice->state.dirty |= CROCUS_DIRTY_CLIP;
+   }
+
+   /* A size change invalidates everything derived from the dimensions. */
+   if (cso->width != state->width || cso->height != state->height) {
+      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
+      ice->state.dirty |= CROCUS_DIRTY_RASTER;
+      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
+#if GFX_VER >= 6
+      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
+#endif
+   }
+
+   /* Flag the depth buffer whenever either the old or the new state has one. */
+   if (cso->zsbuf || state->zsbuf) {
+      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
+
+      /* update SF's depth buffer format */
+      if (GFX_VER == 7 && cso->zsbuf)
+         ice->state.dirty |= CROCUS_DIRTY_RASTER;
+   }
+
+   /* wm thread dispatch enable */
+   ice->state.dirty |= CROCUS_DIRTY_WM;
+   /* From here on cso holds the new state; samples/layers are stored as
+    * computed above.
+    */
+   util_copy_framebuffer_state(cso, state);
+   cso->samples = samples;
+   cso->layers = layers;
+
+   /* Record the HiZ aux usage for the bound depth level, or NONE when the
+    * level has no HiZ.  NOTE(review): hiz_usage is left untouched when there
+    * is no zsbuf -- confirm that consumers never read it in that case.
+    */
+   if (cso->zsbuf) {
+      struct crocus_resource *zres;
+      struct crocus_resource *stencil_res;
+      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
+      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
+                                         &stencil_res);
+      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
+         aux_usage = zres->aux.usage;
+      }
+      ice->state.hiz_usage = aux_usage;
+   }
+
+   /* Render target change */
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
+
+   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
+
+   /* Also flag the stage bits registered for the framebuffer NOS entry. */
+   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
+}
+
+/**
+ * The pipe->set_constant_buffer() driver hook.
+ *
+ * This uploads any constant data in user buffers, and references
+ * any UBO resources containing constant data.
+ *
+ * A NULL \p input, a zero buffer_size, or an input with neither a resource
+ * nor a user pointer unbinds slot \p index.  If uploading a user buffer
+ * fails, the slot is unbound as well.
+ */
+static void
+crocus_set_constant_buffer(struct pipe_context *ctx,
+                           enum pipe_shader_type p_stage, unsigned index,
+                           bool take_ownership,
+                           const struct pipe_constant_buffer *input)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
+
+   /* Copy the binding into the slot first; the user-buffer path below then
+    * replaces the slot's buffer with an uploaded copy.
+    */
+   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
+
+   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
+      shs->bound_cbufs |= 1u << index;
+
+      if (input->user_buffer) {
+         void *map = NULL;
+         /* Drop whatever the slot references before allocating upload space. */
+         pipe_resource_reference(&cbuf->buffer, NULL);
+         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
+                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
+
+         if (!cbuf->buffer) {
+            /* Allocation was unsuccessful - just unbind */
+            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
+            return;
+         }
+
+         assert(map);
+         memcpy(map, input->user_buffer, input->buffer_size);
+      }
+      /* Clamp the bound size to what remains of the BO past the offset. */
+      cbuf->buffer_size =
+         MIN2(input->buffer_size,
+              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
+
+      struct crocus_resource *res = (void *) cbuf->buffer;
+      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
+      res->bind_stages |= 1 << stage;
+   } else {
+      shs->bound_cbufs &= ~(1u << index);
+   }
+
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
+}
+
+/**
+ * Upload the system values of the shader bound to \p stage.
+ *
+ * The values are written into freshly allocated upload space that becomes
+ * the shader's last constant buffer slot (num_cbufs - 1).  Nothing is done
+ * when no shader is bound or it uses no system values.  If the upload
+ * allocation fails, the slot is left empty and sysvals_need_upload stays
+ * set so the upload can be retried.
+ */
+static void
+upload_sysvals(struct crocus_context *ice,
+               gl_shader_stage stage)
+{
+   UNUSED struct crocus_genx_state *genx = ice->state.genx;
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+   if (!shader || shader->num_system_values == 0)
+      return;
+
+   assert(shader->num_cbufs > 0);
+
+   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
+   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
+   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
+   uint32_t *map = NULL;
+
+   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
+   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
+                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
+   if (!map) {
+      /* Allocation failed: there is nowhere to write the values.  Leave
+       * the slot empty rather than dereferencing a NULL mapping.
+       */
+      cbuf->buffer_size = 0;
+      return;
+   }
+
+   for (unsigned i = 0; i < shader->num_system_values; i++) {
+      uint32_t sysval = shader->system_values[i];
+      uint32_t value = 0;
+
+      if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
+#if GFX_VER == 7
+         unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
+         unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
+         struct brw_image_param *param =
+            &genx->shaders[stage].image_param[img];
+
+         /* offset indexes dwords within the param, not bytes. */
+         assert(offset < sizeof(struct brw_image_param) / sizeof(uint32_t));
+         value = ((uint32_t *) param)[offset];
+#endif
+      } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
+         value = 0;
+      } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
+         int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
+         int comp  = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
+         value = fui(ice->state.clip_planes.ucp[plane][comp]);
+      } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
+         if (stage == MESA_SHADER_TESS_CTRL) {
+            value = ice->state.vertices_per_patch;
+         } else {
+            /* For TES, prefer the bound TCS's output vertex count and fall
+             * back to the draw's vertices_per_patch when no TCS is bound.
+             */
+            assert(stage == MESA_SHADER_TESS_EVAL);
+            const struct shader_info *tcs_info =
+               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
+            if (tcs_info)
+               value = tcs_info->tess.tcs_vertices_out;
+            else
+               value = ice->state.vertices_per_patch;
+         }
+      } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
+                 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
+         unsigned level = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
+         value = fui(ice->state.default_outer_level[level]);
+      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
+         value = fui(ice->state.default_inner_level[0]);
+      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
+         value = fui(ice->state.default_inner_level[1]);
+      } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
+                 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
+         unsigned dim = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
+         value = ice->state.last_block[dim];
+      } else {
+         assert(!"unhandled system value");
+      }
+
+      *map++ = value;
+   }
+
+   cbuf->buffer_size = upload_size;
+   shs->sysvals_need_upload = false;
+}
+
+/**
+ * The pipe->set_shader_buffers() driver hook.
+ *
+ * Binds \p count shader buffers to consecutive slots of \p p_stage starting
+ * at \p start_slot.  A NULL \p buffers array, or an entry without a buffer,
+ * unbinds the slot.  \p writable_bitmask is relative to \p start_slot.
+ */
+static void
+crocus_set_shader_buffers(struct pipe_context *ctx,
+                          enum pipe_shader_type p_stage,
+                          unsigned start_slot, unsigned count,
+                          const struct pipe_shader_buffer *buffers,
+                          unsigned writable_bitmask)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   gl_shader_stage stage = stage_from_pipe(p_stage);
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+   const unsigned touched = u_bit_consecutive(start_slot, count);
+
+   shs->bound_ssbos &= ~touched;
+   shs->writable_ssbos &= ~touched;
+   shs->writable_ssbos |= writable_bitmask << start_slot;
+
+   for (unsigned i = 0; i < count; i++) {
+      const unsigned slot = start_slot + i;
+      struct pipe_shader_buffer *ssbo = &shs->ssbo[slot];
+
+      if (!buffers || !buffers[i].buffer) {
+         /* Nothing to bind here: just release the slot's reference. */
+         pipe_resource_reference(&ssbo->buffer, NULL);
+         continue;
+      }
+
+      struct crocus_resource *res = (void *) buffers[i].buffer;
+
+      pipe_resource_reference(&ssbo->buffer, &res->base);
+      ssbo->buffer_offset = buffers[i].buffer_offset;
+      /* Clamp the bound size to what remains of the BO past the offset. */
+      ssbo->buffer_size =
+         MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
+
+      shs->bound_ssbos |= 1 << slot;
+
+      res->bind_history |= PIPE_BIND_SHADER_BUFFER;
+      res->bind_stages |= 1 << stage;
+
+      util_range_add(&res->base, &res->valid_buffer_range, ssbo->buffer_offset,
+                     ssbo->buffer_offset + ssbo->buffer_size);
+   }
+
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
+}
+
+/**
+ * Generic CSO destructor: releases a state object with free().
+ */
+static void
+crocus_delete_state(struct pipe_context *ctx, void *state)
+{
+   void *cso = state;
+
+   free(cso);
+}
+
+/**
+ * The pipe->set_vertex_buffers() driver hook.
+ *
+ * Stores the vertex buffer bindings in the context and records, for each of
+ * the \p count updated slots, the end of the bound resource (width0 plus a
+ * 2-byte padding, which is dropped when GFX_VERx10 == 75 or on Baytrail),
+ * or 0 when the slot has no resource.
+ */
+static void
+crocus_set_vertex_buffers(struct pipe_context *ctx,
+                          unsigned start_slot, unsigned count,
+                          unsigned unbind_num_trailing_slots,
+                          bool take_ownership,
+                          const struct pipe_vertex_buffer *buffers)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
+   const unsigned padding =
+      (GFX_VERx10 != 75 && !screen->devinfo.is_baytrail) ? 2 : 0;
+
+   ice->state.bound_vertex_buffers &=
+      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
+
+   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
+                                buffers, start_slot, count, unbind_num_trailing_slots,
+                                take_ownership);
+
+   for (unsigned i = 0; i < count; i++) {
+      const unsigned slot = start_slot + i;
+      struct pipe_vertex_buffer *vb = &ice->state.vertex_buffers[slot];
+
+      if (!vb->is_user_buffer && vb->buffer.resource) {
+         struct crocus_resource *res = (void *)vb->buffer.resource;
+         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
+      }
+
+      uint32_t end = 0;
+      if (vb->buffer.resource)
+         end = vb->buffer.resource->width0 + padding;
+
+      ice->state.vb_end[slot] = end;
+   }
+
+   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+}
+
+#if !(GFX_VERx10 == 75)
+/**
+ * Map a 10:10:10:2 vertex format to the BRW_ATTRIB_WA_* workaround flags it
+ * needs.  Formats not listed here get no flags (0).
+ */
+static uint8_t get_wa_flags(enum isl_format format)
+{
+   switch (format) {
+   case ISL_FORMAT_R10G10B10A2_USCALED:
+      return BRW_ATTRIB_WA_SCALE;
+   case ISL_FORMAT_R10G10B10A2_SSCALED:
+      return BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
+   case ISL_FORMAT_R10G10B10A2_UNORM:
+      return BRW_ATTRIB_WA_NORMALIZE;
+   case ISL_FORMAT_R10G10B10A2_SNORM:
+      return BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
+   case ISL_FORMAT_R10G10B10A2_SINT:
+      return BRW_ATTRIB_WA_SIGN;
+   case ISL_FORMAT_B10G10R10A2_USCALED:
+      return BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+   case ISL_FORMAT_B10G10R10A2_SSCALED:
+      return BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
+   case ISL_FORMAT_B10G10R10A2_UNORM:
+      return BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+   case ISL_FORMAT_B10G10R10A2_SNORM:
+      return BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
+   case ISL_FORMAT_B10G10R10A2_SINT:
+      return BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
+   case ISL_FORMAT_B10G10R10A2_UINT:
+      return BRW_ATTRIB_WA_BGRA;
+   default:
+      return 0;
+   }
+}
+#endif
+
+/**
+ * Gallium CSO for vertex elements.
+ *
+ * Built once by crocus_create_vertex_elements().
+ */
+struct crocus_vertex_element_state {
+   /** Packed 3DSTATE_VERTEX_ELEMENTS: one header dword followed by room
+    *  for 33 packed VERTEX_ELEMENT_STATE entries.
+    */
+   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
+   /** Alternative packing of the last element, with EdgeFlagEnable set on
+    *  GFX_VER >= 6; only written when count is non-zero.
+    */
+   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
+   /** Instance divisor, indexed by vertex buffer index. */
+   uint32_t step_rate[16];
+   /** Per-element BRW_ATTRIB_WA_* flags; only written when GFX_VERx10 != 75. */
+   uint8_t wa_flags[33];
+   /** Number of vertex elements the CSO was created with. */
+   unsigned count;
+};
+
+/**
+ * The pipe->create_vertex_elements() driver hook.
+ *
+ * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
+ * command.  The packed vertex_elements array is ready to be emitted at
+ * draw time if no EdgeFlag is needed.  Per-buffer instance divisors are
+ * recorded in step_rate, and (except when GFX_VERx10 == 75) per-element
+ * format workaround flags in wa_flags.
+ *
+ * edgeflag_ve holds an alternative packing of the last
+ * VERTEX_ELEMENT_STATE that can be used at draw time if we detect that
+ * EdgeFlag is needed by the Vertex Shader.
+ *
+ * Returns NULL if the CSO cannot be allocated.
+ */
+static void *
+crocus_create_vertex_elements(struct pipe_context *ctx,
+                              unsigned count,
+                              const struct pipe_vertex_element *state)
+{
+   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   /* Zero-initialise so that entries we never write (unused step_rate
+    * slots, edgeflag_ve when count == 0, wa_flags) have a defined value.
+    */
+   struct crocus_vertex_element_state *cso = calloc(1, sizeof(*cso));
+   if (!cso)
+      return NULL;
+
+   cso->count = count;
+
+   crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
+      ve.DWordLength =
+         1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
+   }
+
+   uint32_t *ve_pack_dest = &cso->vertex_elements[1];
+
+   /* With no elements, emit a single one that stores (0, 0, 0, 1). */
+   if (count == 0) {
+      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+         ve.Valid = true;
+         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
+         ve.Component0Control = VFCOMP_STORE_0;
+         ve.Component1Control = VFCOMP_STORE_0;
+         ve.Component2Control = VFCOMP_STORE_0;
+         ve.Component3Control = VFCOMP_STORE_1_FP;
+      }
+   }
+
+   for (unsigned i = 0; i < count; i++) {
+      const struct crocus_format_info fmt =
+         crocus_format_for_usage(devinfo, state[i].src_format, 0);
+      unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
+                           VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
+      enum isl_format actual_fmt = fmt.fmt;
+
+#if !(GFX_VERx10 == 75)
+      cso->wa_flags[i] = get_wa_flags(fmt.fmt);
+
+      /* These formats are fetched through a substitute format; wa_flags
+       * records what the 10:10:10:2 ones still need afterwards.
+       */
+      if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
+          fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
+          fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
+          fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
+          fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
+          fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
+         actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
+      if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
+         actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
+      if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
+         actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
+      if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
+         actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
+      if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
+         actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
+#endif
+
+      cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
+
+      /* Components the source format lacks are filled with 0, except the
+       * fourth, which becomes an integer or floating-point 1.
+       */
+      switch (isl_format_get_num_channels(fmt.fmt)) {
+      case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
+      case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
+      case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
+      case 3:
+         comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
+            : VFCOMP_STORE_1_FP;
+         break;
+      }
+      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+#if GFX_VER >= 6
+         ve.EdgeFlagEnable = false;
+#endif
+         ve.VertexBufferIndex = state[i].vertex_buffer_index;
+         ve.Valid = true;
+         ve.SourceElementOffset = state[i].src_offset;
+         ve.SourceElementFormat = actual_fmt;
+         ve.Component0Control = comp[0];
+         ve.Component1Control = comp[1];
+         ve.Component2Control = comp[2];
+         ve.Component3Control = comp[3];
+#if GFX_VER < 5
+         ve.DestinationElementOffset = i * 4;
+#endif
+      }
+
+      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+   }
+
+   /* An alternative version of the last VE is stored so it can be used
+    * at draw time in case Vertex Shader uses EdgeFlag
+    */
+   if (count) {
+      const unsigned edgeflag_index = count - 1;
+      const struct crocus_format_info fmt =
+         crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
+      crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
+#if GFX_VER >= 6
+         ve.EdgeFlagEnable = true;
+#endif
+         ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
+         ve.Valid = true;
+         ve.SourceElementOffset = state[edgeflag_index].src_offset;
+         ve.SourceElementFormat = fmt.fmt;
+         ve.Component0Control = VFCOMP_STORE_SRC;
+         ve.Component1Control = VFCOMP_STORE_0;
+         ve.Component2Control = VFCOMP_STORE_0;
+         ve.Component3Control = VFCOMP_STORE_0;
+      }
+   }
+
+   return cso;
+}
+
+/**
+ * The pipe->bind_vertex_elements_state() driver hook.
+ *
+ * Makes \p state the current vertex element CSO and flags the vertex
+ * element/buffer state, plus any stages registered for the vertex-elements
+ * NOS entry, as dirty.
+ */
+static void
+crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+
+   ice->state.cso_vertex_elements = state;
+
+   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS;
+   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+   ice->state.stage_dirty |=
+      ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
+}
+
+#if GFX_VER >= 6
+/**
+ * A range of stream-output primitive counter snapshots plus a running total.
+ */
+struct crocus_streamout_counter {
+   /* Byte offsets delimiting the snapshots in the target's prim_map; the
+    * aggregation code divides them by sizeof(uint64_t) to index entries.
+    */
+   uint32_t offset_start;
+   uint32_t offset_end;
+
+   /* NOTE(review): presumably the count accumulated from snapshots already
+    * consumed -- confirm against aggregate_stream_counter().
+    */
+   uint64_t accum;
+};
+
+/**
+ * Gallium CSO for stream output (transform feedback) targets.
+ */
+struct crocus_stream_output_target {
+   /** Base gallium object; must stay first, as the driver casts between
+    *  the two pointer types.
+    */
+   struct pipe_stream_output_target base;
+
+   /** Stride (bytes-per-vertex) during this transform feedback operation */
+   uint16_t stride;
+
+   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
+   bool zeroed;
+
+   /** Resource and byte offset of a 32-bit slot allocated from the stream
+    *  uploader at creation time on GFX_VER >= 7; crocus_get_so_offset()
+    *  reads the value back from it.
+    */
+   struct crocus_resource *offset_res;
+   uint32_t offset_offset;
+
+#if GFX_VER == 6
+   /** Mapping viewed as an array of uint64_t primitive counts. */
+   void *prim_map;
+   /** Counter ranges; prev_count is the one used to compute the number of
+    *  vertices written so far in crocus_get_so_offset().
+    */
+   struct crocus_streamout_counter prev_count;
+   struct crocus_streamout_counter count;
+#endif
+};
+
+#if GFX_VER >= 7
+/**
+ * Read back the 32-bit value stored in the target's offset slot and return
+ * it divided by the target's stride.
+ *
+ * Returns 0 when the target has no offset resource, its stride is zero, or
+ * the slot cannot be mapped.
+ */
+static uint32_t
+crocus_get_so_offset(struct pipe_stream_output_target *so)
+{
+   struct crocus_stream_output_target *tgt = (void *)so;
+   struct pipe_transfer *transfer = NULL;
+   struct pipe_box box;
+
+   /* Nothing sensible to read, or nothing to divide by. */
+   if (!tgt->offset_res || tgt->stride == 0)
+      return 0;
+
+   u_box_1d(tgt->offset_offset, 4, &box);
+   /* NOTE(review): the map is requested with PIPE_MAP_DIRECTLY only, even
+    * though the value is read -- confirm whether PIPE_MAP_READ is wanted.
+    */
+   void *val = so->context->buffer_map(so->context, &tgt->offset_res->base,
+                                       0, PIPE_MAP_DIRECTLY,
+                                       &box, &transfer);
+   assert(val);
+   if (!val)
+      return 0;
+
+   const uint32_t result = *(uint32_t *)val;
+   so->context->buffer_unmap(so->context, transfer);
+
+   return result / tgt->stride;
+}
+#endif
+
+#if GFX_VER == 6
+/* Defined later in the file; declared here so crocus_get_so_offset() can
+ * call it.
+ */
+static void
+compute_vertices_written_so_far(struct crocus_context *ice,
+                                struct crocus_stream_output_target *tgt,
+                                struct crocus_streamout_counter *count,
+                                uint64_t *svbi);
+
+/**
+ * GFX_VER == 6 variant: return the value that
+ * compute_vertices_written_so_far() produces for the target's prev_count.
+ *
+ * The 64-bit result is narrowed to uint32_t on return.
+ */
+static uint32_t
+crocus_get_so_offset(struct pipe_stream_output_target *so)
+{
+   struct crocus_stream_output_target *tgt = (void *)so;
+   struct crocus_context *ice = (void *)so->context;
+
+   /* NOTE(review): assumes the helper always writes its output parameter;
+    * vert_written is not initialised here -- confirm.
+    */
+   uint64_t vert_written;
+   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
+   return vert_written;
+}
+#endif
+
+/**
+ * The pipe->create_stream_output_target() driver hook.
+ *
+ * "Target" here refers to a destination buffer range.  This allocates the
+ * target object, takes a reference on \p p_res, records the offset and
+ * size, and marks the range as valid buffer contents.  On GFX_VER >= 7 it
+ * also allocates a 32-bit slot from the stream uploader for the target's
+ * offset_res/offset_offset.
+ *
+ * Returns NULL if the target or (on GFX_VER >= 7) its offset slot cannot
+ * be allocated.
+ */
+static struct pipe_stream_output_target *
+crocus_create_stream_output_target(struct pipe_context *ctx,
+                                   struct pipe_resource *p_res,
+                                   unsigned buffer_offset,
+                                   unsigned buffer_size)
+{
+   struct crocus_resource *res = (void *) p_res;
+   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
+   if (!cso)
+      return NULL;
+
+   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
+
+   pipe_reference_init(&cso->base.reference, 1);
+   pipe_resource_reference(&cso->base.buffer, p_res);
+   cso->base.buffer_offset = buffer_offset;
+   cso->base.buffer_size = buffer_size;
+   cso->base.context = ctx;
+
+   util_range_add(&res->base, &res->valid_buffer_range, buffer_offset,
+                  buffer_offset + buffer_size);
+#if GFX_VER >= 7
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   void *temp = NULL;
+   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
+                  &cso->offset_offset,
+                  (struct pipe_resource **)&cso->offset_res,
+                  &temp);
+   if (!cso->offset_res) {
+      /* Without the offset slot the target is unusable; undo and fail
+       * instead of handing back an object with a NULL offset_res.
+       */
+      pipe_resource_reference(&cso->base.buffer, NULL);
+      free(cso);
+      return NULL;
+   }
+#endif
+
+   return &cso->base;
+}
+
+/**
+ * Destroy a stream output target: drop the references it holds on its
+ * offset resource and on the destination buffer, then free the object.
+ */
+static void
+crocus_stream_output_target_destroy(struct pipe_context *ctx,
+                                    struct pipe_stream_output_target *state)
+{
+   struct crocus_stream_output_target *tgt = (void *) state;
+   struct pipe_resource **offset_res =
+      (struct pipe_resource **)&tgt->offset_res;
+
+   pipe_resource_reference(offset_res, NULL);
+   pipe_resource_reference(&tgt->base.buffer, NULL);
+
+   free(tgt);
+}
+
+/* Register addresses used by the streamout code below: the gfx6 register
+ * that is snapshotted by crocus_stream_store_prims_written(), and the
+ * per-buffer gfx7 write-offset registers (one dword apart, indexed by n).
+ */
+#define GEN6_SO_NUM_PRIMS_WRITTEN       0x2288
+#define GEN7_SO_WRITE_OFFSET(n)         (0x5280 + (n) * 4)
+
+#if GFX_VER == 6
+/**
+ * Fold the register snapshots recorded for a target into counter->accum.
+ *
+ * The snapshots live in tgt->prim_map as 64-bit values and are consumed in
+ * pairs: for each pair in the byte range [offset_start, offset_end) of
+ * \p counter, (second - first) is added to counter->accum.
+ *
+ * \param batch    batch that may still reference the snapshot buffer
+ * \param tgt      target owning the snapshot buffer (offset_res/prim_map)
+ * \param counter  counter whose range is read and whose accum is updated
+ */
+static void
+aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
+                         struct crocus_streamout_counter *counter)
+{
+   uint64_t *prim_counts = tgt->prim_map;
+
+   /* If the batch still references the snapshot buffer, flush it and wait
+    * on the resulting fence before reading the values on the CPU.
+    */
+   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
+      struct pipe_fence_handle *out_fence = NULL;
+      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
+      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
+      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
+   }
+
+   /* NOTE(review): reads prim_counts[i + 1], so this assumes the range
+    * always holds an even number of snapshots -- confirm with callers.
+    */
+   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
+      counter->accum += prim_counts[i + 1] - prim_counts[i];
+   }
+   /* This always rewinds tgt->count, even when \p counter is a different
+    * counter (e.g. tgt->prev_count).
+    * NOTE(review): confirm that is intended.
+    */
+   tgt->count.offset_start = tgt->count.offset_end = 0;
+}
+
+/**
+ * Emit commands that snapshot GEN6_SO_NUM_PRIMS_WRITTEN into the target's
+ * snapshot buffer and advance the target's current counter range by one
+ * 64-bit slot.
+ *
+ * The 4096-byte snapshot buffer is allocated from the stream uploader on
+ * first use.  When it is about to fill up, the pending snapshots are
+ * folded into the accumulators, which also rewinds the write position.
+ *
+ * \param batch  batch to emit the flush and register store into
+ * \param tgt    target whose snapshot buffer receives the value
+ */
+static void
+crocus_stream_store_prims_written(struct crocus_batch *batch,
+                                  struct crocus_stream_output_target *tgt)
+{
+   if (!tgt->offset_res) {
+      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
+                     &tgt->offset_offset,
+                     (struct pipe_resource **)&tgt->offset_res,
+                     &tgt->prim_map);
+      /* Allocation failed: there is nowhere to store the snapshot, and
+       * carrying on would dereference a NULL offset_res below.
+       */
+      if (!tgt->offset_res)
+         return;
+      tgt->count.offset_start = tgt->count.offset_end = 0;
+   }
+
+   /* Not enough room left for another pair of snapshots: drain both
+    * counters so the buffer can be reused from the start.
+    */
+   if (tgt->count.offset_end + 16 >= 4096) {
+      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
+      aggregate_stream_counter(batch, tgt, &tgt->count);
+   }
+
+   crocus_emit_mi_flush(batch);
+   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
+                               tgt->offset_res->bo,
+                               tgt->count.offset_end + tgt->offset_offset, false);
+   tgt->count.offset_end += 8;
+}
+
+/**
+ * Aggregate the pending snapshots of \p counter and store the resulting
+ * vertex count in \p svbi.
+ *
+ * The count is the accumulated value of the counter multiplied by the
+ * context's last_xfb_verts_per_prim.
+ */
+static void
+compute_vertices_written_so_far(struct crocus_context *ice,
+                                struct crocus_stream_output_target *tgt,
+                                struct crocus_streamout_counter *counter,
+                                uint64_t *svbi)
+{
+   struct crocus_batch *batch = &ice->batches[0];
+
+   //TODO vertices per prim
+   aggregate_stream_counter(batch, tgt, counter);
+
+   const uint64_t verts = counter->accum * ice->state.last_xfb_verts_per_prim;
+   *svbi = verts;
+}
+#endif
+/**
+ * The pipe->set_stream_output_targets() driver hook.
+ *
+ * At this point, we know which targets are bound to a particular index,
+ * and also whether we want to append or start over.
+ *
+ * The function flags the relevant dirty bits when streamout is switched
+ * on or off, flushes when it is switched off, rebinds the targets, and
+ * then performs per-generation bookkeeping:
+ *  - gfx6: snapshot the primitives-written register and maintain the
+ *    per-target counters and ice->state.svbi;
+ *  - gfx7+: load the SO write-offset registers for the new targets, or
+ *    save them for the targets being unbound.
+ *
+ * \param ctx          the context
+ * \param num_targets  number of entries in \p targets; 0 disables streamout
+ * \param targets      targets to bind
+ * \param offsets      per-target offsets; 0 means start over rather than
+ *                     append
+ */
+static void
+crocus_set_stream_output_targets(struct pipe_context *ctx,
+                                 unsigned num_targets,
+                                 struct pipe_stream_output_target **targets,
+                                 const unsigned *offsets)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
+   const bool active = num_targets > 0;
+   if (ice->state.streamout_active != active) {
+      ice->state.streamout_active = active;
+#if GFX_VER >= 7
+      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
+#else
+      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
+#endif
+
+      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
+       * it's a non-pipelined command.  If we're switching streamout on, we
+       * may have missed emitting it earlier, so do so now.  (We're already
+       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
+       */
+      if (active) {
+#if GFX_VER >= 7
+         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
+#endif
+      } else {
+         /* Streamout is being switched off: flush so the data written to
+          * the currently bound buffers becomes visible to later users.
+          */
+         uint32_t flush = 0;
+         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+            struct crocus_stream_output_target *tgt =
+               (void *) ice->state.so_target[i];
+            if (tgt) {
+               struct crocus_resource *res = (void *) tgt->base.buffer;
+
+               flush |= crocus_flush_bits_for_history(res);
+               crocus_dirty_for_history(ice, res);
+            }
+         }
+         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
+                                        "make streamout results visible", flush);
+      }
+   }
+
+   /* Rebind.  A reference to each previous target is kept in old_tgt[] so
+    * the code below can still use it; every old_tgt[] entry must be
+    * released again before returning.
+    */
+   ice->state.so_targets = num_targets;
+   for (int i = 0; i < 4; i++) {
+      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
+      pipe_so_target_reference(&ice->state.so_target[i],
+                               i < num_targets ? targets[i] : NULL);
+   }
+
+#if GFX_VER == 6
+   bool stored_num_prims = false;
+   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      if (num_targets) {
+         struct crocus_stream_output_target *tgt =
+            (void *) ice->state.so_target[i];
+
+         if (!tgt) {
+            /* Nothing is bound at this index any more, but a target may
+             * have been bound before: drop the temporary reference so it
+             * is not leaked.
+             */
+            pipe_so_target_reference(&old_tgt[i], NULL);
+            continue;
+         }
+         if (offsets[i] == 0) {
+            // This means that we're supposed to ignore anything written to
+            // the buffer before. We can do this by just clearing out the
+            // count of writes to the prim count buffer.
+            tgt->count.offset_start = tgt->count.offset_end;
+            tgt->count.accum = 0;
+            ice->state.svbi = 0;
+         } else {
+            if (tgt->offset_res) {
+               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
+               tgt->count.offset_start = tgt->count.offset_end;
+            }
+         }
+
+         /* Only one snapshot of the register is taken per call. */
+         if (!stored_num_prims) {
+            crocus_stream_store_prims_written(batch, tgt);
+            stored_num_prims = true;
+         }
+      } else {
+         struct crocus_stream_output_target *tgt =
+            (void *) old_tgt[i];
+         if (tgt) {
+            if (!stored_num_prims) {
+               crocus_stream_store_prims_written(batch, tgt);
+               stored_num_prims = true;
+            }
+
+            if (tgt->offset_res) {
+               tgt->prev_count = tgt->count;
+            }
+         }
+      }
+      pipe_so_target_reference(&old_tgt[i], NULL);
+   }
+
+#else
+   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      if (num_targets) {
+         struct crocus_stream_output_target *tgt =
+            (void *) ice->state.so_target[i];
+
+         /* NOTE(review): offsets[i] is read for every index, including
+          * i >= num_targets -- confirm callers always pass an array of
+          * PIPE_MAX_SO_BUFFERS entries.
+          */
+         if (offsets[i] == 0)
+            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
+         else if (tgt)
+            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
+                                       tgt->offset_res->bo,
+                                       tgt->offset_offset);
+      } else {
+         /* Streamout disabled: save the write offset of each target that
+          * was bound so it can be read back or resumed later.
+          */
+         struct crocus_stream_output_target *tgt =
+            (void *) old_tgt[i];
+         if (tgt)
+            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
+                                        tgt->offset_res->bo,
+                                        tgt->offset_offset, false);
+      }
+      pipe_so_target_reference(&old_tgt[i], NULL);
+   }
+#endif
+   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
+   if (!active)
+      return;
+#if GFX_VER >= 7
+   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
+#elif GFX_VER == 6
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
+#endif
+}
+
+#endif
+
+#if GFX_VER >= 7
+/**
+ * A crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
+ * 3DSTATE_STREAMOUT packets.
+ *
+ * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
+ * hardware to record.  We can create it entirely based on the shader, with
+ * no dynamic state dependencies.
+ *
+ * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
+ * state-based settings.  We capture the shader-related ones here, and merge
+ * the rest in at draw time.
+ *
+ * \param info     stream output layout of the shader
+ * \param vue_map  VUE map used to translate varyings to register indices
+ * \return a ralloc'ed dword array (allocated with a NULL ralloc context)
+ *         holding the packed 3DSTATE_STREAMOUT immediately followed by
+ *         the packed 3DSTATE_SO_DECL_LIST
+ */
+static uint32_t *
+crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
+                           const struct brw_vue_map *vue_map)
+{
+   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
+   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   /* NOTE(review): next_offset is indexed by output buffer below although
+    * it is sized by MAX_VERTEX_STREAMS -- presumably both limits are 4.
+    */
+   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
+   int max_decls = 0;
+   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
+
+   memset(so_decl, 0, sizeof(so_decl));
+
+   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
+    * command feels strange -- each dword pair contains a SO_DECL per stream.
+    */
+   for (unsigned i = 0; i < info->num_outputs; i++) {
+      const struct pipe_stream_output *output = &info->output[i];
+      const int buffer = output->output_buffer;
+      const int varying = output->register_index;
+      const unsigned stream_id = output->stream;
+      assert(stream_id < MAX_VERTEX_STREAMS);
+
+      buffer_mask[stream_id] |= 1 << buffer;
+
+      assert(vue_map->varying_to_slot[varying] >= 0);
+
+      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
+       * array.  Instead, it simply increments DstOffset for the following
+       * input by the number of components that should be skipped.
+       *
+       * Our hardware is unusual in that it requires us to program SO_DECLs
+       * for fake "hole" components, rather than simply taking the offset
+       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
+       * program as many size = 4 holes as we can, then a final hole to
+       * accommodate the final 1, 2, or 3 remaining.
+       */
+      int skip_components = output->dst_offset - next_offset[buffer];
+
+      while (skip_components > 0) {
+         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+            .HoleFlag = 1,
+            .OutputBufferSlot = output->output_buffer,
+            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
+         };
+         skip_components -= 4;
+      }
+
+      next_offset[buffer] = output->dst_offset + output->num_components;
+
+      /* The real declaration for this output. */
+      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
+         .OutputBufferSlot = output->output_buffer,
+         .RegisterIndex = vue_map->varying_to_slot[varying],
+         .ComponentMask =
+            ((1 << output->num_components) - 1) << output->start_component,
+      };
+
+      if (decls[stream_id] > max_decls)
+         max_decls = decls[stream_id];
+   }
+
+   /* Layout of the result: 3DSTATE_STREAMOUT, then 3DSTATE_SO_DECL_LIST,
+    * which is 3 header dwords plus 2 dwords per entry.  The allocation
+    * result is not checked.
+    */
+   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
+   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
+   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
+
+   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
+      int urb_entry_read_offset = 0;
+      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
+         urb_entry_read_offset;
+
+      /* We always read the whole vertex.  This could be reduced at some
+       * point by reading less and offsetting the register index in the
+       * SO_DECLs.
+       */
+      sol.Stream0VertexReadOffset = urb_entry_read_offset;
+      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
+      sol.Stream1VertexReadOffset = urb_entry_read_offset;
+      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
+      sol.Stream2VertexReadOffset = urb_entry_read_offset;
+      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
+      sol.Stream3VertexReadOffset = urb_entry_read_offset;
+      sol.Stream3VertexReadLength = urb_entry_read_length - 1;
+
+      // TODO: Double-check that stride == 0 means no buffer. Probably this
+      // needs to go elsewhere, where the buffer enable stuff is actually
+      // known.
+      sol.SOBufferEnable0 = !!info->stride[0];
+      sol.SOBufferEnable1 = !!info->stride[1];
+      sol.SOBufferEnable2 = !!info->stride[2];
+      sol.SOBufferEnable3 = !!info->stride[3];
+   }
+
+   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
+      list.DWordLength = 3 + 2 * max_decls - 2;
+      list.StreamtoBufferSelects0 = buffer_mask[0];
+      list.StreamtoBufferSelects1 = buffer_mask[1];
+      list.StreamtoBufferSelects2 = buffer_mask[2];
+      list.StreamtoBufferSelects3 = buffer_mask[3];
+      list.NumEntries0 = decls[0];
+      list.NumEntries1 = decls[1];
+      list.NumEntries2 = decls[2];
+      list.NumEntries3 = decls[3];
+   }
+
+   /* Each entry carries the i-th declaration of all four streams; streams
+    * with fewer declarations contribute the zeroed entries from the memset
+    * above.
+    */
+   for (int i = 0; i < max_decls; i++) {
+      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
+         entry.Stream0Decl = so_decl[0][i];
+         entry.Stream1Decl = so_decl[1][i];
+         entry.Stream2Decl = so_decl[2][i];
+         entry.Stream3Decl = so_decl[3][i];
+      }
+   }
+
+   return map;
+}
+#endif
+
+#if GFX_VER == 6
+/**
+ * Emit the four 3DSTATE_GS_SVB_INDEX packets into the render batch.
+ *
+ * Index 0 is programmed from ice->state.svbi, with its maximum set to the
+ * smallest vertex capacity (buffer_size / stride) among the bound targets.
+ * Indices 1..3 are reset to zero with the maximum left wide open.
+ */
+static void
+crocus_emit_so_svbi(struct crocus_context *ice)
+{
+   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
+
+   /* Stays at 0xffffffff when no target is bound. */
+   unsigned max_vertex = 0xffffffff;
+   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+      struct crocus_stream_output_target *tgt =
+         (void *) ice->state.so_target[i];
+      /* NOTE(review): divides by tgt->stride; assumes the stride has been
+       * set to a non-zero value by the time this runs -- confirm.
+       */
+      if (tgt)
+         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
+   }
+
+   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
+      svbi.IndexNumber = 0;
+      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
+      svbi.MaximumIndex = max_vertex;
+   }
+
+   /* initialize the rest of the SVBI's to reasonable values so that we don't
+    * run out of room writing the regular data.
+    */
+   for (int i = 1; i < 4; i++) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
+         svbi.IndexNumber = i;
+         svbi.StreamedVertexBufferIndex = 0;
+         svbi.MaximumIndex = 0xffffffff;
+      }
+   }
+}
+
+#endif
+
+
+#if GFX_VER >= 6
+/**
+ * Report whether the current state draws points.
+ *
+ * Either polygon fill mode being POINT counts.  Otherwise the answer comes
+ * from the output topology of the geometry shader if one is bound, else
+ * from the tessellation evaluation shader if one is bound, else from the
+ * primitive mode of the draw.
+ */
+static bool
+crocus_is_drawing_points(const struct crocus_context *ice)
+{
+   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
+
+   if (rast->cso.fill_front == PIPE_POLYGON_MODE_POINT)
+      return true;
+   if (rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
+      return true;
+
+   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
+      const struct brw_gs_prog_data *gs =
+         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
+      return gs->output_topology == _3DPRIM_POINTLIST;
+   }
+
+   if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
+      const struct brw_tes_prog_data *tes =
+         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
+      return tes->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   }
+
+   return ice->state.prim_mode == PIPE_PRIM_POINTS;
+}
+#endif
+
+#if GFX_VER >= 6
+/**
+ * Fill in the SF attribute override for a single fragment shader input.
+ *
+ * \param attr                   override to fill in; the caller passes it
+ *                               zero-initialised
+ * \param vue_map                VUE map of the last pre-rasterization stage
+ * \param urb_entry_read_offset  read offset, counted in pairs of VUE slots
+ * \param fs_attr                varying slot read by the fragment shader
+ * \param two_side_color         whether two-sided colour is enabled
+ * \param max_source_attr        in/out: raised to the highest source
+ *                               attribute the SF unit will read
+ */
+static void
+get_attr_override(
+   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
+   const struct brw_vue_map *vue_map,
+   int urb_entry_read_offset, int fs_attr,
+   bool two_side_color, uint32_t *max_source_attr)
+{
+   /* Find the VUE slot for this attribute. */
+   int slot = vue_map->varying_to_slot[fs_attr];
+
+   /* Viewport and Layer are stored in the VUE header.  We need to override
+    * them to zero if earlier stages didn't write them, as GL requires that
+    * they read back as zero when not explicitly set.
+    */
+   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
+      attr->ComponentOverrideX = true;
+      attr->ComponentOverrideW = true;
+      attr->ConstantSource = CONST_0000;
+
+      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
+         attr->ComponentOverrideY = true;
+      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
+         attr->ComponentOverrideZ = true;
+
+      return;
+   }
+
+   /* If there was only a back color written but not front, use back
+    * as the color instead of undefined
+    */
+   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
+      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
+   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
+      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
+
+   if (slot == -1) {
+      /* This attribute does not exist in the VUE--that means that the vertex
+       * shader did not write to it.  This means that either:
+       *
+       * (a) This attribute is a texture coordinate, and it is going to be
+       * replaced with point coordinates (as a consequence of a call to
+       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
+       * hardware will ignore whatever attribute override we supply.
+       *
+       * (b) This attribute is read by the fragment shader but not written by
+       * the vertex shader, so its value is undefined.  Therefore the
+       * attribute override we supply doesn't matter.
+       *
+       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
+       * previous shader stage.
+       *
+       * Note that we don't have to worry about the cases where the attribute
+       * is gl_PointCoord or is undergoing point sprite coordinate
+       * replacement, because in those cases, this function isn't called.
+       *
+       * In case (c), we need to program the attribute overrides so that the
+       * primitive ID will be stored in this slot.  In every other case, the
+       * attribute override we supply doesn't matter.  So just go ahead and
+       * program primitive ID in every case.
+       */
+      attr->ComponentOverrideW = true;
+      attr->ComponentOverrideX = true;
+      attr->ComponentOverrideY = true;
+      attr->ComponentOverrideZ = true;
+      attr->ConstantSource = PRIM_ID;
+      return;
+   }
+
+   /* Compute the location of the attribute relative to urb_entry_read_offset.
+    * Each increment of urb_entry_read_offset represents a 256-bit value, so
+    * it counts for two 128-bit VUE slots.
+    */
+   int source_attr = slot - 2 * urb_entry_read_offset;
+   assert(source_attr >= 0 && source_attr < 32);
+
+   /* If we are doing two-sided color, and the VUE slot following this one
+    * represents a back-facing color, then we need to instruct the SF unit to
+    * do back-facing swizzling.
+    */
+   bool swizzling = two_side_color &&
+      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
+        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
+       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
+        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
+
+   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
+   if (*max_source_attr < source_attr + swizzling)
+      *max_source_attr = source_attr + swizzling;
+
+   attr->SourceAttribute = source_attr;
+   if (swizzling)
+      attr->SwizzleSelect = INPUTATTR_FACING;
+}
+
+/**
+ * Compute the SF/SBE attribute overrides for the bound fragment shader.
+ *
+ * \param ice                    context supplying shaders and rasterizer state
+ * \param attr_overrides         array receiving one override per FS input
+ *                               index below 16
+ * \param point_sprite_enables   out: bitmask of FS input indices replaced
+ *                               by point sprite coordinates
+ * \param urb_entry_read_length  out: read length needed to reach the highest
+ *                               source attribute
+ * \param urb_entry_read_offset  out: read offset, in pairs of VUE slots
+ */
+static void
+calculate_attr_overrides(
+   const struct crocus_context *ice,
+   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
+   uint32_t *point_sprite_enables,
+   uint32_t *urb_entry_read_length,
+   uint32_t *urb_entry_read_offset)
+{
+   const struct brw_wm_prog_data *wm_prog_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+   const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
+   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+   uint32_t max_source_attr = 0;
+   const struct shader_info *fs_info =
+      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+   int first_slot =
+      brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
+
+   /* Each URB offset packs two varying slots */
+   assert(first_slot % 2 == 0);
+   *urb_entry_read_offset = first_slot / 2;
+   *point_sprite_enables = 0;
+
+   /* Whether points are being drawn does not depend on the attribute, so
+    * work it out once instead of on every loop iteration.
+    */
+   const bool drawing_points = crocus_is_drawing_points(ice);
+
+   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
+      const int input_index = wm_prog_data->urb_setup[fs_attr];
+
+      /* Not read by the fragment shader. */
+      if (input_index < 0)
+         continue;
+
+      bool point_sprite = false;
+      if (drawing_points) {
+         if (fs_attr >= VARYING_SLOT_TEX0 &&
+             fs_attr <= VARYING_SLOT_TEX7 &&
+             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
+            point_sprite = true;
+
+         if (fs_attr == VARYING_SLOT_PNTC)
+            point_sprite = true;
+
+         if (point_sprite)
+            *point_sprite_enables |= 1U << input_index;
+      }
+
+      /* Point sprite inputs keep an all-zero override. */
+      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
+      if (!point_sprite) {
+         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
+                           cso_rast->cso.light_twoside, &max_source_attr);
+      }
+
+      /* The hardware can only do the overrides on 16 overrides at a
+       * time, and the other up to 16 have to be lined up so that the
+       * input index = the output index.  We'll need to do some
+       * tweaking to make sure that's the case.
+       */
+      if (input_index < 16)
+         attr_overrides[input_index] = attribute;
+      else
+         assert(attribute.SourceAttribute == input_index);
+   }
+
+   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
+    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
+    *
+    * "This field should be set to the minimum length required to read the
+    *  maximum source attribute.  The maximum source attribute is indicated
+    *  by the maximum value of the enabled Attribute # Source Attribute if
+    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
+    *  enable is not set.
+    *  read_length = ceiling((max_source_attr + 1) / 2)
+    *
+    *  [errata] Corruption/Hang possible if length programmed larger than
+    *  recommended"
+    *
+    * Similar text exists for Ivy Bridge.
+    */
+   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
+}
+#endif
+
+#if GFX_VER == 7
+/**
+ * Emit 3DSTATE_SBE for the bound fragment shader, with attribute swizzling
+ * enabled and the overrides computed by calculate_attr_overrides().
+ */
+static void
+crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
+{
+   const struct brw_wm_prog_data *fs_data = (void *)
+      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
+   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
+
+   uint32_t sprite_enables;
+   uint32_t read_length;
+   uint32_t read_offset;
+
+   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
+      /* Fills sbe.Attribute[] and the three URB/point-sprite values. */
+      calculate_attr_overrides(ice, sbe.Attribute, &sprite_enables,
+                               &read_length, &read_offset);
+
+      sbe.AttributeSwizzleEnable = true;
+      sbe.NumberofSFOutputAttributes = fs_data->num_varying_inputs;
+      sbe.ConstantInterpolationEnable = fs_data->flat_inputs;
+
+      sbe.VertexURBEntryReadOffset = read_offset;
+      sbe.VertexURBEntryReadLength = read_length;
+
+      sbe.PointSpriteTextureCoordinateOrigin = rast->cso.sprite_coord_mode;
+      sbe.PointSpriteTextureCoordinateEnable = sprite_enables;
+   }
+}
+#endif
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * Fill in the state-dependent fields of a vertex shader program key:
+ * user clip plane count (only when the VS is the last stage and writes a
+ * position without declaring clip distances), the gfx4-5 edge flag and
+ * point coordinate replacement bits, vertex colour clamping and, except on
+ * gfx7.5, the per-attribute workaround flags.
+ */
+static void
+crocus_populate_vs_key(const struct crocus_context *ice,
+                       const struct shader_info *info,
+                       gl_shader_stage last_stage,
+                       struct brw_vs_prog_key *key)
+{
+   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
+   const bool writes_position =
+      (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) != 0;
+
+   if (last_stage == MESA_SHADER_VERTEX &&
+       writes_position &&
+       info->clip_distance_array_size == 0)
+      key->nr_userclip_plane_consts = rast->num_clip_plane_consts;
+
+#if GFX_VER <= 5
+   key->copy_edgeflag = (rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
+                         rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
+   key->point_coord_replace = rast->cso.sprite_coord_enable & 0xff;
+#endif
+
+   key->clamp_vertex_color = rast->cso.clamp_vertex_color;
+
+#if !(GFX_VERx10 == 75)
+   /* The n-th set bit of inputs_read takes the flags of the n-th vertex
+    * element.
+    */
+   uint64_t remaining = info->inputs_read;
+   for (int ve_idx = 0; remaining != 0; ve_idx++) {
+      const int attrib = u_bit_scan64(&remaining);
+      key->gl_attrib_wa_flags[attrib] =
+         ice->state.cso_vertex_elements->wa_flags[ve_idx];
+   }
+#endif
+}
+
+/**
+ * Populate TCS program key fields based on the current state.
+ *
+ * Currently a no-op: no field of the TCS key is filled in from context
+ * state here.
+ */
+static void
+crocus_populate_tcs_key(const struct crocus_context *ice,
+                        struct brw_tcs_prog_key *key)
+{
+}
+
+/**
+ * Fill in the state-dependent fields of a TES program key.
+ *
+ * The user clip plane count is only set when the TES is the last stage,
+ * declares no clip distances and writes a position or clip vertex.
+ */
+static void
+crocus_populate_tes_key(const struct crocus_context *ice,
+                        const struct shader_info *info,
+                        gl_shader_stage last_stage,
+                        struct brw_tes_prog_key *key)
+{
+   if (last_stage != MESA_SHADER_TESS_EVAL)
+      return;
+   if (info->clip_distance_array_size != 0)
+      return;
+   if (!(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)))
+      return;
+
+   key->nr_userclip_plane_consts = ice->state.cso_rast->num_clip_plane_consts;
+}
+
+/**
+ * Fill in the state-dependent fields of a GS program key.
+ *
+ * The user clip plane count is only set when the GS is the last stage,
+ * declares no clip distances and writes a position or clip vertex.
+ */
+static void
+crocus_populate_gs_key(const struct crocus_context *ice,
+                       const struct shader_info *info,
+                       gl_shader_stage last_stage,
+                       struct brw_gs_prog_key *key)
+{
+   if (last_stage != MESA_SHADER_GEOMETRY)
+      return;
+   if (info->clip_distance_array_size != 0)
+      return;
+   if (!(info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)))
+      return;
+
+   key->nr_userclip_plane_consts = ice->state.cso_rast->num_clip_plane_consts;
+}
+
+/**
+ * Populate FS program key fields based on the current state.
+ *
+ * Reads the framebuffer, depth/stencil/alpha, rasterizer and blend state
+ * of the context together with the shader info.
+ *
+ * \param ice   context supplying the current state
+ * \param info  shader info of the fragment shader
+ * \param key   key to fill in
+ */
+static void
+crocus_populate_fs_key(const struct crocus_context *ice,
+                       const struct shader_info *info,
+                       struct brw_wm_prog_key *key)
+{
+   struct crocus_screen *screen = (void *) ice->ctx.screen;
+   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
+   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
+   const struct crocus_blend_state *blend = ice->state.cso_blend;
+
+#if GFX_VER < 6
+   /* Build the iz_lookup bitmask from kill/alpha-test, computed depth, and
+    * the depth and stencil test/write enables.
+    */
+   uint32_t lookup = 0;
+
+   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
+      lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
+
+   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+      lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
+
+   if (fb->zsbuf && zsa->cso.depth_enabled) {
+      lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
+
+      if (zsa->cso.depth_writemask)
+         lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
+
+   }
+   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
+      lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
+      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
+         lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
+   }
+   key->iz_lookup = lookup;
+   key->stats_wm = ice->state.stats_wm;
+#endif
+
+   /* Line antialiasing is NEVER unless line smoothing is on.  With it on,
+    * lines always get it; triangles get it ALWAYS or SOMETIMES depending on
+    * which faces are drawn in LINE polygon mode and which face is culled.
+    */
+   uint32_t line_aa = BRW_WM_AA_NEVER;
+   if (rast->cso.line_smooth) {
+      int reduced_prim = u_reduced_prim(ice->state.prim_mode);
+      if (reduced_prim == PIPE_PRIM_LINES)
+         line_aa = BRW_WM_AA_ALWAYS;
+      else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
+         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
+            line_aa = BRW_WM_AA_SOMETIMES;
+
+            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
+                rast->cso.cull_face == PIPE_FACE_BACK)
+               line_aa = BRW_WM_AA_ALWAYS;
+         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
+            line_aa = BRW_WM_AA_SOMETIMES;
+
+            if (rast->cso.cull_face == PIPE_FACE_FRONT)
+               line_aa = BRW_WM_AA_ALWAYS;
+         }
+      }
+   }
+   key->line_aa = line_aa;
+
+   key->nr_color_regions = fb->nr_cbufs;
+
+   key->clamp_fragment_color = rast->cso.clamp_fragment_color;
+
+   key->alpha_to_coverage = blend->cso.alpha_to_coverage;
+
+   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
+
+   /* Flat shading only matters if the shader reads a colour input. */
+   key->flat_shade = rast->cso.flatshade &&
+      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
+
+   key->persample_interp = rast->cso.force_persample_interp;
+   key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
+
+   key->ignore_sample_mask_out = !key->multisample_fbo;
+   key->coherent_fb_fetch = false; // TODO: needed?
+
+   key->force_dual_color_blend =
+      screen->driconf.dual_color_blend_by_location &&
+      (blend->blend_enables & 1) && blend->dual_color_blending;
+
+   /* TODO: Respect glHint for key->high_quality_derivatives */
+
+#if GFX_VER <= 5
+   /* Same condition as alpha_test_replicate_alpha above: with several
+    * colour buffers and alpha test on, the test function and reference
+    * value become part of the key.
+    */
+   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
+      key->alpha_test_func = zsa->cso.alpha_func;
+      key->alpha_test_ref = zsa->cso.alpha_ref_value;
+   }
+#endif
+}
+
+/**
+ * Populate CS program key fields based on the current state.
+ *
+ * Currently a no-op: no field of the CS key is filled in from context
+ * state here.
+ */
+static void
+crocus_populate_cs_key(const struct crocus_context *ice,
+                       struct brw_cs_prog_key *key)
+{
+}
+
+/* Kernel start pointer of a compiled shader.
+ *
+ * On gfx4 this is an ro_bo() reference to the shader cache BO at the
+ * shader's offset; on gfx5+ it is just the shader's offset.
+ *
+ * The gfx4 macro deliberately has no trailing semicolon so that it behaves
+ * like the gfx5+ function: it can be used inside larger expressions and in
+ * unbraced if/else bodies, and every use site supplies its own ';'.
+ */
+#if GFX_VER == 4
+#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset)
+#elif GFX_VER >= 5
+static uint64_t
+KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
+{
+   return shader->offset;
+}
+#endif
+
+/* Fill in the thread dispatch fields shared by the per-stage state packets:
+ * kernel start pointer, binding table entry count, floating point mode,
+ * URB read length/offset, statistics/enable bits and, when the shader
+ * needs scratch, the scratch space size and base pointer.
+ *
+ * \param pkt     packet structure being filled in
+ * \param prefix  token pasted in front of URBEntryReadLength/Offset to form
+ *                the stage-specific field names
+ * \param stage   shader stage passed to crocus_get_scratch_space()
+ *
+ * The expansion refers to `ice`, `shader`, `prog_data` and `vue_prog_data`,
+ * which must be in scope at the use site.  It expands to several statements
+ * and is not wrapped in do { } while (0), so it must only be used where a
+ * statement sequence is valid (not as the body of an unbraced if/else).
+ *
+ * PerThreadScratchSpace is encoded as ffs(total_scratch) - 11 (presumably
+ * total_scratch is a power of two, making this log2(size) - 10 -- confirm).
+ *
+ * NOTE(review): the following remark was inherited from the iris driver and
+ * concerns Gen11, which is outside the gfx 4-7 range this driver targets:
+ *
+ * Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
+ * prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
+ * this WA on C0 stepping.
+ *
+ * TODO: Fill out SamplerCount for prefetching?
+ */
+
+#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
+   pkt.KernelStartPointer = KSP(ice, shader);                           \
+   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
+   pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
+                                                                        \
+   pkt.DispatchGRFStartRegisterForURBData =                             \
+      prog_data->dispatch_grf_start_reg;                                \
+   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
+   pkt.prefix##URBEntryReadOffset = 0;                                  \
+                                                                        \
+   pkt.StatisticsEnable = true;                                         \
+   pkt.Enable           = true;                                         \
+                                                                        \
+   if (prog_data->total_scratch) {                                      \
+      struct crocus_bo *bo =                                            \
+         crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
+      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
+      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
+   }
+
+/* ------------------------------------------------------------------- */
+#if GFX_VER >= 6
+/* Per-stage sub-opcode, indexed by shader stage.  emit_push_constant_packets()
+ * writes the selected value into _3DCommandSubOpcode of a
+ * 3DSTATE_CONSTANT_VS packet so that the one packet layout can be reused
+ * for every stage.  The compute entry is 0.
+ */
+static const uint32_t push_constant_opcodes[] = {
+   [MESA_SHADER_VERTEX]    = 21,
+   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
+   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
+   [MESA_SHADER_GEOMETRY]  = 22,
+   [MESA_SHADER_FRAGMENT]  = 23,
+   [MESA_SHADER_COMPUTE]   = 0,
+};
+#endif
+
+/**
+ * Allocate a surface state from the batch's state stream and fill it as a
+ * null surface with the given extent, level count and minimum array
+ * element.  The offset of the new state is stored through \p out_offset.
+ */
+static void
+emit_sized_null_surface(struct crocus_batch *batch,
+                        unsigned width, unsigned height,
+                        unsigned layers, unsigned levels,
+                        unsigned minimum_array_element,
+                        uint32_t *out_offset)
+{
+   struct isl_device *dev = &batch->screen->isl_dev;
+   uint32_t *null_state =
+      stream_state(batch, dev->ss.size, dev->ss.align, out_offset);
+
+   //TODO gen 6 multisample crash
+   isl_null_fill_state(dev, null_state,
+                       .size = isl_extent3d(width, height, layers),
+                       .levels = levels,
+                       .minimum_array_element = minimum_array_element);
+}
+/**
+ * Emit a 1x1x1 null surface with zero levels and minimum array element 0,
+ * storing its state offset through \p out_offset.
+ */
+static void
+emit_null_surface(struct crocus_batch *batch,
+                  uint32_t *out_offset)
+{
+   const unsigned width = 1, height = 1, layers = 1;
+   const unsigned levels = 0, min_array_element = 0;
+
+   emit_sized_null_surface(batch, width, height, layers,
+                           levels, min_array_element, out_offset);
+}
+
+/**
+ * Emit a null surface sized after the currently bound framebuffer.
+ *
+ * With no color buffers but a depth/stencil buffer bound, the
+ * width/height/level/first layer of that buffer are used instead of the
+ * framebuffer dimensions.
+ */
+static void
+emit_null_fb_surface(struct crocus_batch *batch,
+                     struct crocus_context *ice,
+                     uint32_t *out_offset)
+{
+   struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+
+   /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
+   if (fb->width == 0 && fb->height == 0) {
+      emit_null_surface(batch, out_offset);
+      return;
+   }
+
+   uint32_t width = MAX2(fb->width, 1);
+   uint32_t height = MAX2(fb->height, 1);
+   uint32_t layers = fb->layers ? fb->layers : 1;
+   uint32_t level = 0;
+   uint32_t layer = 0;
+
+   /* Depth-only rendering: take the geometry from the zs surface. */
+   if (fb->nr_cbufs == 0 && fb->zsbuf) {
+      width = fb->zsbuf->width;
+      height = fb->zsbuf->height;
+      level = fb->zsbuf->u.tex.level;
+      layer = fb->zsbuf->u.tex.first_layer;
+   }
+
+   emit_sized_null_surface(batch, width, height,
+                           layers, level, layer,
+                           out_offset);
+}
+
+/**
+ * Fill the surface state at \p surf_state for \p res and emit the
+ * relocations it needs.
+ *
+ * in_surf       surface layout to describe (copied before any adjustment)
+ * adjust_surf   when true: a 3D texture viewed with array_len == 1, or a
+ *               cube map on ver 4, is rebased onto the single image selected
+ *               by the view (with intra-tile x/y offsets), and a 1D array is
+ *               described as a 2D surface
+ * view          view to program; it is never modified, the rebase above is
+ *               applied to a private copy
+ * writeable     adds RELOC_WRITE to the relocations
+ * aux_usage     when not ISL_AUX_USAGE_NONE, the resource's aux surface and
+ *               clear color are programmed and the aux address is relocated
+ * blend_enable, write_disables
+ *               only passed to ISL when GFX_VER <= 5
+ * addr_offset   offset of \p surf_state inside the state buffer; used to
+ *               place the relocations
+ */
+static void
+emit_surface_state(struct crocus_batch *batch,
+                   struct crocus_resource *res,
+                   const struct isl_surf *in_surf,
+                   bool adjust_surf,
+                   struct isl_view *view,
+                   bool writeable,
+                   enum isl_aux_usage aux_usage,
+                   bool blend_enable,
+                   uint32_t write_disables,
+                   uint32_t *surf_state,
+                   uint32_t addr_offset)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct isl_device *isl_dev = &batch->screen->isl_dev;
+   uint32_t reloc = RELOC_32BIT;
+   uint32_t offset = res->offset, tile_x_sa = 0, tile_y_sa = 0;
+
+   if (writeable)
+      reloc |= RELOC_WRITE;
+
+   /* Work on private copies.  Callers hand in views that live in long-lived
+    * objects (surfaces, sampler views); zeroing base_level/base_array_layer
+    * in place would make every later emit select level 0 / layer 0.
+    */
+   struct isl_surf surf = *in_surf;
+   struct isl_view local_view = *view;
+   if (adjust_surf) {
+      if (res->base.target == PIPE_TEXTURE_3D && local_view.array_len == 1) {
+         isl_surf_get_image_surf(isl_dev, in_surf,
+                                 local_view.base_level, 0,
+                                 local_view.base_array_layer,
+                                 &surf, &offset,
+                                 &tile_x_sa, &tile_y_sa);
+         local_view.base_array_layer = 0;
+         local_view.base_level = 0;
+      } else if (res->base.target == PIPE_TEXTURE_CUBE && devinfo->ver == 4) {
+         isl_surf_get_image_surf(isl_dev, in_surf,
+                                 local_view.base_level,
+                                 local_view.base_array_layer,
+                                 0,
+                                 &surf, &offset,
+                                 &tile_x_sa, &tile_y_sa);
+         local_view.base_array_layer = 0;
+         local_view.base_level = 0;
+      } else if (res->base.target == PIPE_TEXTURE_1D_ARRAY) {
+         surf.dim = ISL_SURF_DIM_2D;
+      }
+   }
+
+   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
+   struct crocus_bo *aux_bo = NULL;
+   uint32_t aux_offset = 0;
+   struct isl_surf *aux_surf = NULL;
+   if (aux_usage != ISL_AUX_USAGE_NONE) {
+      aux_surf = &res->aux.surf;
+      aux_offset = res->aux.offset;
+      aux_bo = res->aux.bo;
+
+      clear_color = crocus_resource_get_clear_color(res);
+   }
+
+   isl_surf_fill_state(isl_dev, surf_state,
+                       .surf = &surf,
+                       .view = &local_view,
+                       .address = crocus_state_reloc(batch,
+                                                     addr_offset + isl_dev->ss.addr_offset,
+                                                     res->bo, offset, reloc),
+                       .aux_surf = aux_surf,
+                       .aux_usage = aux_usage,
+                       .aux_address = aux_offset,
+                       .mocs = crocus_mocs(res->bo, isl_dev),
+                       .clear_color = clear_color,
+                       .use_clear_address = false,
+                       .clear_address = 0,
+                       .x_offset_sa = tile_x_sa,
+                       .y_offset_sa = tile_y_sa,
+#if GFX_VER <= 5
+                       .blend_enable = blend_enable,
+                       .write_disables = write_disables,
+#endif
+      );
+
+   if (aux_surf) {
+      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
+       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
+       * contain other control information.  Since buffer addresses are always
+       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
+       * an ordinary reloc to do the necessary address translation.
+       *
+       * FIXME: move to the point of assignment.
+       */
+      uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
+      *aux_addr = crocus_state_reloc(batch,
+                                     addr_offset + isl_dev->ss.aux_addr_offset,
+                                     aux_bo, *aux_addr,
+                                     reloc);
+   }
+
+}
+
+/**
+ * Emit a writeable surface state for a framebuffer surface and return its
+ * offset in the state buffer.
+ *
+ * Surface adjustment is requested only for cube maps on ver 4; that
+ * decision uses the target of the surface's own texture, taken before
+ * surf->align_res (when set) replaces the resource that is actually
+ * described.
+ */
+static uint32_t
+emit_surface(struct crocus_batch *batch,
+             struct crocus_surface *surf,
+             enum isl_aux_usage aux_usage,
+             bool blend_enable,
+             uint32_t write_disables)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct isl_device *isl_dev = &batch->screen->isl_dev;
+   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
+
+   const bool adjust_surf =
+      devinfo->ver == 4 && res->base.target == PIPE_TEXTURE_CUBE;
+
+   if (surf->align_res)
+      res = (struct crocus_resource *)surf->align_res;
+
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, isl_dev->ss.size,
+                                  isl_dev->ss.align, &state_offset);
+
+   emit_surface_state(batch, res, &surf->surf, adjust_surf, &surf->view,
+                      true, aux_usage, blend_enable, write_disables,
+                      state, state_offset);
+   return state_offset;
+}
+
+/**
+ * Emit a read-only surface state for a framebuffer surface, using its
+ * read_view, and return its offset in the state buffer.
+ *
+ * Surface adjustment is always requested; blending is disabled and no
+ * channel write-disables are set.
+ */
+static uint32_t
+emit_rt_surface(struct crocus_batch *batch,
+                struct crocus_surface *surf,
+                enum isl_aux_usage aux_usage)
+{
+   struct isl_device *isl_dev = &batch->screen->isl_dev;
+   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
+   struct isl_view *view = &surf->read_view;
+   uint32_t offset = 0;
+   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
+
+   /* blend_enable is a bool and write_disables a bitmask: spell the
+    * literals with their matching types.
+    */
+   emit_surface_state(batch, res, &surf->surf, true, view, false,
+                      aux_usage, false, 0,
+                      surf_state, offset);
+   return offset;
+}
+
+/**
+ * Emit a 12-byte RAW buffer surface over the grid-size state reference
+ * and return its offset in the state buffer.
+ */
+static uint32_t
+emit_grid(struct crocus_context *ice,
+          struct crocus_batch *batch)
+{
+   struct isl_device *dev = &batch->screen->isl_dev;
+   struct crocus_state_ref *grid = &ice->state.grid_size;
+   struct crocus_bo *grid_bo = crocus_resource_bo(grid->res);
+
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, dev->ss.size,
+                                  dev->ss.align, &state_offset);
+
+   isl_buffer_fill_state(dev, state,
+                         .address = crocus_state_reloc(batch,
+                                                       state_offset + dev->ss.addr_offset,
+                                                       grid_bo,
+                                                       grid->offset,
+                                                       RELOC_32BIT),
+                         .size_B = 12,
+                         .format = ISL_FORMAT_RAW,
+                         .stride_B = 1,
+                         .mocs = crocus_mocs(grid_bo, dev));
+   return state_offset;
+}
+
+/**
+ * Emit a buffer surface state for a constant buffer binding and return its
+ * offset in the state buffer.
+ *
+ * The surface covers buffer_size bytes starting at buffer_offset, with a
+ * stride of 1 byte and format value 0.
+ */
+static uint32_t
+emit_ubo_buffer(struct crocus_context *ice,
+                struct crocus_batch *batch,
+                struct pipe_constant_buffer *buffer)
+{
+   struct isl_device *dev = &batch->screen->isl_dev;
+   struct crocus_bo *ubo_bo = crocus_resource_bo(buffer->buffer);
+
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, dev->ss.size,
+                                  dev->ss.align, &state_offset);
+
+   /* NOTE(review): format 0 presumably is ISL_FORMAT_R32G32B32A32_FLOAT -
+    * confirm against the isl_format enum.
+    */
+   isl_buffer_fill_state(dev, state,
+                         .address = crocus_state_reloc(batch,
+                                                       state_offset + dev->ss.addr_offset,
+                                                       ubo_bo,
+                                                       buffer->buffer_offset,
+                                                       RELOC_32BIT),
+                         .size_B = buffer->buffer_size,
+                         .format = 0,
+                         .swizzle = ISL_SWIZZLE_IDENTITY,
+                         .stride_B = 1,
+                         .mocs = crocus_mocs(ubo_bo, dev));
+
+   return state_offset;
+}
+
+/**
+ * Emit a RAW buffer surface state for a shader buffer binding and return
+ * its offset in the state buffer.  \p writeable adds RELOC_WRITE to the
+ * relocation.
+ */
+static uint32_t
+emit_ssbo_buffer(struct crocus_context *ice,
+                 struct crocus_batch *batch,
+                 struct pipe_shader_buffer *buffer, bool writeable)
+{
+   struct isl_device *dev = &batch->screen->isl_dev;
+   struct crocus_bo *ssbo_bo = crocus_resource_bo(buffer->buffer);
+   const uint32_t reloc_flags =
+      writeable ? (RELOC_32BIT | RELOC_WRITE) : RELOC_32BIT;
+
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, dev->ss.size,
+                                  dev->ss.align, &state_offset);
+
+   isl_buffer_fill_state(dev, state,
+                         .address = crocus_state_reloc(batch,
+                                                       state_offset + dev->ss.addr_offset,
+                                                       ssbo_bo,
+                                                       buffer->buffer_offset,
+                                                       reloc_flags),
+                         .size_B = buffer->buffer_size,
+                         .format = ISL_FORMAT_RAW,
+                         .swizzle = ISL_SWIZZLE_IDENTITY,
+                         .stride_B = 1,
+                         .mocs = crocus_mocs(ssbo_bo, dev));
+
+   return state_offset;
+}
+
+/**
+ * Emit a surface state for a sampler view and return its offset in the
+ * state buffer.
+ *
+ * Buffer views become a buffer surface whose size is clamped to the view
+ * size, to the bytes left in the BO past the view's start, and to
+ * CROCUS_MAX_TEXTURE_BUFFER_SIZE elements.  Texture views go through
+ * emit_surface_state(), using the gather view when \p for_gather is set.
+ */
+static uint32_t
+emit_sampler_view(struct crocus_context *ice,
+                  struct crocus_batch *batch,
+                  bool for_gather,
+                  struct crocus_sampler_view *isv)
+{
+   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
+   uint32_t offset = 0;
+
+   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
+                                       isl_dev->ss.align, &offset);
+
+   if (isv->base.target == PIPE_BUFFER) {
+      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
+      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
+      /* The surface starts at res->offset + u.buf.offset, so that whole
+       * amount has to come off the BO size, as emit_image_view() does.
+       */
+      unsigned final_size =
+         MIN3(isv->base.u.buf.size,
+              isv->res->bo->size - isv->res->offset - isv->base.u.buf.offset,
+              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
+      isl_buffer_fill_state(isl_dev, surf_state,
+                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
+                                                          isv->res->bo,
+                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
+                            .size_B = final_size,
+                            .format = isv->view.format,
+                            .swizzle = isv->view.swizzle,
+                            .stride_B = cpp,
+                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
+         );
+   } else {
+      enum isl_aux_usage aux_usage =
+         crocus_resource_texture_aux_usage(isv->res);
+
+      emit_surface_state(batch, isv->res, &isv->res->surf, false,
+                         for_gather ? &isv->gather_view : &isv->view,
+                         false, aux_usage, false,
+                         0, surf_state, offset);
+   }
+   return offset;
+}
+
+/**
+ * Emit a surface state for a shader image binding and return its offset in
+ * the state buffer.
+ *
+ * Three cases are handled: buffer resources (typed buffer surface with a
+ * clamped size), non-buffer resources viewed as ISL_FORMAT_RAW (byte
+ * buffer over the rest of the BO), and everything else (regular surface
+ * state with aux usage 0).  RELOC_WRITE is requested when the view has
+ * PIPE_IMAGE_ACCESS_WRITE.
+ */
+static uint32_t
+emit_image_view(struct crocus_context *ice,
+                struct crocus_batch *batch,
+                struct crocus_image_view *iv)
+{
+   struct isl_device *dev = &batch->screen->isl_dev;
+   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
+
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, dev->ss.size,
+                                  dev->ss.align, &state_offset);
+
+   const bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
+   const uint32_t reloc_flags = RELOC_32BIT | (write ? RELOC_WRITE : 0);
+
+   if (res->base.target == PIPE_BUFFER) {
+      const struct isl_format_layout *layout = isl_format_get_layout(iv->view.format);
+      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : layout->bpb / 8;
+      unsigned clamped_size =
+         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
+              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
+
+      isl_buffer_fill_state(dev, state,
+                            .address = crocus_state_reloc(batch, state_offset + dev->ss.addr_offset,
+                                                          res->bo,
+                                                          res->offset + iv->base.u.buf.offset, reloc_flags),
+                            .size_B = clamped_size,
+                            .format = iv->view.format,
+                            .swizzle = iv->view.swizzle,
+                            .stride_B = cpp,
+                            .mocs = crocus_mocs(res->bo, dev)
+         );
+   } else if (iv->view.format == ISL_FORMAT_RAW) {
+      isl_buffer_fill_state(dev, state,
+                            .address = crocus_state_reloc(batch, state_offset + dev->ss.addr_offset,
+                                                          res->bo,
+                                                          res->offset, reloc_flags),
+                            .size_B = res->bo->size - res->offset,
+                            .format = iv->view.format,
+                            .swizzle = iv->view.swizzle,
+                            .stride_B = 1,
+                            .mocs = crocus_mocs(res->bo, dev),
+         );
+   } else {
+      emit_surface_state(batch, res,
+                         &res->surf, false, &iv->view,
+                         write, 0, false,
+                         0, state, state_offset);
+   }
+
+   return state_offset;
+}
+
+#if GFX_VER == 6
+/**
+ * Emit the buffer surface used for stream output \p idx and return its
+ * offset in the state buffer.
+ *
+ * Returns 0 without emitting anything when stream output is inactive or
+ * \p idx is not a valid output of \p so_info.
+ */
+static uint32_t
+emit_sol_surface(struct crocus_batch *batch,
+                 struct pipe_stream_output_info *so_info,
+                 uint32_t idx)
+{
+   struct crocus_context *ice = batch->ice;
+
+   if (!ice->state.streamout_active || idx >= so_info->num_outputs)
+      return 0;
+
+   const struct pipe_stream_output *output = &so_info->output[idx];
+   const int buffer = output->output_buffer;
+   assert(output->stream == 0);
+
+   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
+   unsigned stride_dwords = so_info->stride[buffer];
+   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;
+   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
+   unsigned num_vector_components = output->num_components;
+
+   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
+    * too big to map using a single binding table entry?
+    */
+   //   assert((size_dwords - offset_dwords) / stride_dwords
+   //          <= BRW_MAX_NUM_BUFFER_ENTRIES);
+
+   /* Start from a single output.  If there isn't even room for that in the
+    * buffer we can't configure the binding table entry to prevent output
+    * entirely; we'll have to rely on the geometry shader to detect
+    * overflow.  But to minimize the damage in case of a bug, the binding
+    * table entry then just allows that single output.
+    */
+   unsigned num_elements = stride_dwords;
+   if (size_dwords > offset_dwords + num_vector_components) {
+      /* There is room for at least 1 transform feedback output in the
+       * buffer.  Add the number of additional transform feedback outputs
+       * the buffer has room for.
+       */
+      num_elements += size_dwords - offset_dwords - num_vector_components;
+   }
+
+   uint32_t surface_format;
+   switch (num_vector_components) {
+   case 1:
+      surface_format = ISL_FORMAT_R32_FLOAT;
+      break;
+   case 2:
+      surface_format = ISL_FORMAT_R32G32_FLOAT;
+      break;
+   case 3:
+      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
+      break;
+   case 4:
+      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
+      break;
+   default:
+      unreachable("Invalid vector size for transform feedback output");
+   }
+
+   struct isl_device *dev = &batch->screen->isl_dev;
+   uint32_t state_offset = 0;
+   uint32_t *state = stream_state(batch, dev->ss.size,
+                                  dev->ss.align, &state_offset);
+
+   isl_buffer_fill_state(dev, state,
+                         .address = crocus_state_reloc(batch, state_offset + dev->ss.addr_offset,
+                                                       crocus_resource_bo(&buf->base),
+                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
+                         .size_B = num_elements * 4,
+                         .stride_B = stride_dwords * 4,
+                         .swizzle = ISL_SWIZZLE_IDENTITY,
+                         .format = surface_format);
+   return state_offset;
+}
+#endif
+
+/* Iterate `index` over every entry of surface group `group` and run the
+ * following statement only for entries that crocus_group_index_to_bti()
+ * does not report as CROCUS_SURFACE_NOT_USED.
+ *
+ * Requires a binding table pointer named `bt` in scope.  The expansion
+ * ends in an `if`, so an `else` written directly after the loop body would
+ * bind to that hidden `if`.
+ */
+#define foreach_surface_used(index, group)                      \
+   for (int index = 0; index < bt->sizes[group]; index++)       \
+      if (crocus_group_index_to_bti(bt, group, index) !=        \
+          CROCUS_SURFACE_NOT_USED)
+
+/**
+ * Emit a surface state for every binding table entry of one shader stage
+ * and record the resulting state offsets in shader->surf_offset.
+ *
+ * Entries are written in a fixed group order: render targets and render
+ * target reads (fragment only), the compute grid surface (compute only),
+ * stream output surfaces (GFX_VER == 6 geometry only), textures, gather
+ * textures, images, UBOs and SSBOs.  Unbound slots get a null surface.
+ *
+ * With \p ff_gs set, ice->shaders.ff_gs_prog is used and there is no
+ * per-stage binding state, so only the groups that do not depend on that
+ * state are emitted.
+ */
+static void
+crocus_populate_binding_table(struct crocus_context *ice,
+                              struct crocus_batch *batch,
+                              gl_shader_stage stage, bool ff_gs)
+{
+   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
+   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
+   if (!shader)
+      return;
+
+   struct crocus_binding_table *bt = &shader->bt;
+   int s = 0;
+   uint32_t *surf_offsets = shader->surf_offset;
+
+   const struct shader_info *info = crocus_get_shader_info(ice, stage);
+
+   if (stage == MESA_SHADER_FRAGMENT) {
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
+      if (cso_fb->nr_cbufs) {
+         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
+            uint32_t write_disables = 0;
+            bool blend_enable = false;
+#if GFX_VER <= 5
+            /* On these generations the per-channel write disables and the
+             * blend enable are passed down into the surface state.
+             */
+            const struct pipe_rt_blend_state *rt =
+               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
+            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
+            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
+            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
+            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
+            blend_enable = rt->blend_enable;
+#endif
+            if (cso_fb->cbufs[i]) {
+               surf_offsets[s] = emit_surface(batch,
+                                              (struct crocus_surface *)cso_fb->cbufs[i],
+                                              ice->state.draw_aux_usage[i],
+                                              blend_enable,
+                                              write_disables);
+            } else {
+               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
+            }
+            s++;
+         }
+      } else {
+         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
+         s++;
+      }
+
+      /* cso_fb from the enclosing scope is reused here rather than being
+       * redeclared and shadowed.
+       */
+      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
+         if (cso_fb->cbufs[i]) {
+            surf_offsets[s++] = emit_rt_surface(batch,
+                                                (struct crocus_surface *)cso_fb->cbufs[i],
+                                                ice->state.draw_aux_usage[i]);
+         }
+      }
+   }
+
+   if (stage == MESA_SHADER_COMPUTE) {
+      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
+         surf_offsets[s] = emit_grid(ice, batch);
+         s++;
+      }
+   }
+
+#if GFX_VER == 6
+   if (stage == MESA_SHADER_GEOMETRY) {
+      /* Stream output info comes from the geometry shader when one is
+       * bound, otherwise from the vertex shader.
+       */
+      struct pipe_stream_output_info *so_info;
+      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
+         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
+      else
+         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;
+
+      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
+         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
+         s++;
+      }
+   }
+#endif
+
+   /* Every remaining group reads per-stage binding state, which does not
+    * exist for the fixed-function GS program.  Stop here instead of
+    * dereferencing a NULL shs.
+    */
+   if (!shs)
+      return;
+
+   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
+      struct crocus_sampler_view *view = shs->textures[i];
+      if (view)
+         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
+      else
+         emit_null_surface(batch, &surf_offsets[s]);
+      s++;
+   }
+
+   if (info && info->uses_texture_gather) {
+      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
+         struct crocus_sampler_view *view = shs->textures[i];
+         if (view)
+            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
+         else
+            emit_null_surface(batch, &surf_offsets[s]);
+         s++;
+      }
+   }
+
+   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
+      struct crocus_image_view *view = &shs->image[i];
+      if (view->base.resource)
+         surf_offsets[s] = emit_image_view(ice, batch, view);
+      else
+         emit_null_surface(batch, &surf_offsets[s]);
+      s++;
+   }
+   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
+      if (shs->constbufs[i].buffer)
+         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
+      else
+         emit_null_surface(batch, &surf_offsets[s]);
+      s++;
+   }
+   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
+      if (shs->ssbo[i].buffer)
+         /* Unsigned mask: shifting a signed 1 into the sign bit is
+          * undefined behavior.
+          */
+         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
+                                            !!(shs->writable_ssbos & (1u << i)));
+      else
+         emit_null_surface(batch, &surf_offsets[s]);
+      s++;
+   }
+
+}
+/* ------------------------------------------------------------------- */
+/**
+ * Copy a binding table of \p size bytes into the batch's state with
+ * 32-byte alignment and return what emit_state() returns.  An empty table
+ * is not uploaded and yields 0.
+ */
+static uint32_t
+crocus_upload_binding_table(struct crocus_context *ice,
+                            struct crocus_batch *batch,
+                            uint32_t *table,
+                            uint32_t size)
+
+{
+   return size != 0 ? emit_state(batch, table, size, 32) : 0;
+}
+
+/**
+ * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
+ *
+ * Nothing is emitted when the batch has already emitted it
+ * (batch->state_base_address_emitted).  Otherwise the packet is bracketed
+ * by flush_before_state_base_change() / flush_after_state_base_change(),
+ * the dependent pointer state is marked dirty (GFX_VER <= 6 only) and the
+ * flag is set.
+ */
+
+static void
+crocus_update_surface_base_address(struct crocus_batch *batch)
+{
+   if (batch->state_base_address_emitted)
+      return;
+#if GFX_VER >= 6
+   uint32_t mocs = batch->screen->isl_dev.mocs.internal;
+#endif
+   flush_before_state_base_change(batch);
+
+   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
+
+      /* Surface states live in the batch's state BO. */
+      sba.SurfaceStateBaseAddressModifyEnable = true;
+      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
+
+#if GFX_VER >= 5
+      /* Instructions are fetched relative to the shader cache BO. */
+      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
+#endif
+
+      sba.GeneralStateBaseAddressModifyEnable   = true;
+      sba.IndirectObjectBaseAddressModifyEnable = true;
+#if GFX_VER >= 5
+      sba.InstructionBaseAddressModifyEnable    = true;
+#endif
+
+      sba.GeneralStateAccessUpperBoundModifyEnable = true;
+#if GFX_VER >= 5
+      sba.IndirectObjectAccessUpperBoundModifyEnable = true;
+      sba.InstructionAccessUpperBoundModifyEnable = true;
+#endif
+#if GFX_VER <= 5
+      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
+#endif
+#if GFX_VER >= 6
+      /* The hardware appears to pay attention to the MOCS fields even
+       * if you don't set the "Address Modify Enable" bit for the base.
+       */
+      sba.GeneralStateMOCS            = mocs;
+      sba.StatelessDataPortAccessMOCS = mocs;
+
+      sba.DynamicStateBaseAddressModifyEnable   = true;
+
+      /* Dynamic state shares the batch's state BO with surface state. */
+      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
+
+      /* Dynamic state upper bound.  Although the documentation says that
+       * programming it to zero will cause it to be ignored, that is a lie.
+       * If this isn't programmed to a real bound, the sampler border color
+       * pointer is rejected, causing border color to mysteriously fail.
+       */
+      sba.DynamicStateAccessUpperBoundModifyEnable = true;
+      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
+#endif
+   }
+
+   flush_after_state_base_change(batch);
+
+   /* According to section 3.6.1 of VOL1 of the 965 PRM,
+    * STATE_BASE_ADDRESS updates require a reissue of:
+    *
+    * 3DSTATE_PIPELINE_POINTERS
+    * 3DSTATE_BINDING_TABLE_POINTERS
+    * MEDIA_STATE_POINTERS
+    *
+    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
+    * 1 part 1 says that the following packets must be reissued:
+    *
+    * 3DSTATE_CC_POINTERS
+    * 3DSTATE_BINDING_TABLE_POINTERS
+    * 3DSTATE_SAMPLER_STATE_POINTERS
+    * 3DSTATE_VIEWPORT_STATE_POINTERS
+    * MEDIA_STATE_POINTERS
+    *
+    * Those are always reissued following SBA updates anyway (new
+    * batch time), except in the case of the program cache BO
+    * changing.  Having a separate state flag makes the sequence more
+    * obvious.
+    */
+#if GFX_VER <= 5
+   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+#elif GFX_VER == 6
+   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
+#endif
+   batch->state_base_address_emitted = true;
+}
+
+/**
+ * Compute the viewport depth range.
+ *
+ * With \p window_space_position set the range is fixed at [0, 1];
+ * otherwise it is whatever util_viewport_zmin_zmax() derives from \p vp
+ * and \p halfz.
+ */
+static inline void
+crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
+                          bool window_space_position, float *zmin, float *zmax)
+{
+   if (!window_space_position) {
+      util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
+      return;
+   }
+
+   *zmin = 0.f;
+   *zmax = 1.f;
+}
+
+/* Push-constant buffer ranges gathered for one shader stage by
+ * setup_constant_buffers() and consumed by emit_push_constant_packets().
+ */
+struct push_bos {
+   struct {
+      /* Address the range is read from. */
+      struct crocus_address addr;
+      /* Copied from the UBO range's length (presumably in the same 32-byte
+       * units as the range start - confirm).
+       */
+      uint32_t length;
+   } buffers[4];
+   /* Number of valid entries at the front of buffers[]. */
+   int buffer_count;
+   /* Largest length seen among the gathered ranges; only ever raised by
+    * setup_constant_buffers(), so the caller must initialize it.
+    */
+   uint32_t max_length;
+};
+
+#if GFX_VER >= 6
+/**
+ * Gather the non-empty UBO push ranges of a stage into \p push_bos.
+ *
+ * Each used range is mapped back from its binding table index to the
+ * bound constant buffer.  Ranges whose buffer is not bound read from the
+ * context's workaround BO instead.  push_bos->max_length is raised to the
+ * largest range length and buffer_count is set to the number of entries.
+ */
+static void
+setup_constant_buffers(struct crocus_context *ice,
+                       struct crocus_batch *batch,
+                       int stage,
+                       struct push_bos *push_bos)
+{
+   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+   struct crocus_shader_state *shs = &ice->state.shaders[stage];
+   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
+
+   uint32_t total_read_length = 0;
+   int slot = 0;
+
+   for (int r = 0; r < 4; r++) {
+      const struct brw_ubo_range *range = &prog_data->ubo_ranges[r];
+
+      if (range->length == 0)
+         continue;
+
+      total_read_length += range->length;
+      push_bos->max_length = MAX2(push_bos->max_length, range->length);
+
+      /* Range block is a binding table index, map back to UBO index. */
+      unsigned ubo_index = crocus_bti_to_group_index(
+         &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
+      assert(ubo_index != CROCUS_SURFACE_NOT_USED);
+
+      struct pipe_constant_buffer *cbuf = &shs->constbufs[ubo_index];
+      struct crocus_resource *res = (void *) cbuf->buffer;
+
+      assert(cbuf->buffer_offset % 32 == 0);
+
+      push_bos->buffers[slot].length = range->length;
+      if (res) {
+         push_bos->buffers[slot].addr =
+            ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset);
+      } else {
+         push_bos->buffers[slot].addr =
+            ro_bo(batch->ice->workaround_bo,
+                  batch->ice->workaround_offset);
+      }
+      slot++;
+   }
+
+   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
+    *
+    *    "The sum of all four read length fields must be less than or
+    *    equal to the size of 64."
+    */
+   assert(total_read_length <= 64);
+
+   push_bos->buffer_count = slot;
+}
+
+#if GFX_VER == 7
+/**
+ * Emit the "vs workaround" PIPE_CONTROL: a depth stall combined with an
+ * immediate write of 0 to the context's workaround BO.
+ */
+static void
+gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
+{
+   ASSERTED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct crocus_context *ice = batch->ice;
+
+   assert(devinfo->ver == 7);
+
+   crocus_emit_pipe_control_write(batch, "vs workaround",
+                                  PIPE_CONTROL_WRITE_IMMEDIATE |
+                                  PIPE_CONTROL_DEPTH_STALL,
+                                  ice->workaround_bo,
+                                  ice->workaround_offset, 0);
+}
+#endif
+
+/**
+ * Emit the push-constant packet for one shader stage.
+ *
+ * A single 3DSTATE_CONSTANT_VS layout is used for every stage; the
+ * sub-opcode is patched from push_constant_opcodes[] to select the stage.
+ * When no shader is bound for the stage the packet is emitted with no
+ * buffers programmed.
+ */
+static void
+emit_push_constant_packets(struct crocus_context *ice,
+                           struct crocus_batch *batch,
+                           int stage,
+                           const struct push_bos *push_bos)
+{
+   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+   struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
+
+#if GFX_VER == 7
+   /* The VS workaround flush is skipped on GFX_VERx10 == 75 and on
+    * Baytrail.
+    */
+   if (stage == MESA_SHADER_VERTEX) {
+      if (!(GFX_VERx10 == 75) && !batch->screen->devinfo.is_baytrail)
+         gen7_emit_vs_workaround_flush(batch);
+   }
+#endif
+   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
+      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
+#if GFX_VER == 7
+      if (prog_data) {
+         /* The Skylake PRM contains the following restriction:
+          *
+          *    "The driver must ensure The following case does not occur
+          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
+          *     buffer 3 read length equal to zero committed followed by a
+          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
+          *     zero committed."
+          *
+          * To avoid this, we program the buffers in the highest slots.
+          * This way, slot 0 is only used if slot 3 is also used.
+          */
+         int n = push_bos->buffer_count;
+         assert(n <= 4);
+         /* Only GFX_VERx10 >= 75 actually shifts the buffers up; below
+          * that the buffers start at slot 0.
+          */
+#if GFX_VERx10 >= 75
+         const unsigned shift = 4 - n;
+#else
+         const unsigned shift = 0;
+#endif
+         for (int i = 0; i < n; i++) {
+            pkt.ConstantBody.ReadLength[i + shift] =
+               push_bos->buffers[i].length;
+            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
+         }
+      }
+#else
+      if (prog_data) {
+         /* This layout takes at most one buffer.  Only the offset part of
+          * the address is programmed, and the read length field is written
+          * as length - 1.
+          */
+         int n = push_bos->buffer_count;
+         assert (n <= 1);
+         if (n == 1) {
+            pkt.Buffer0Valid = true;
+            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
+            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
+         }
+      }
+#endif
+   }
+}
+
+#endif
+
+/* Generation-specific structure that set_depth_stencil_bits() fills in:
+ * COLOR_CALC_STATE before Gen6, DEPTH_STENCIL_STATE from Gen6 onwards.
+ */
+#if GFX_VER < 6
+typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
+#else
+typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
+#endif
+
+/**
+ * Copy the bound depth/stencil/alpha CSO into the depth and stencil fields
+ * of \p ds.
+ *
+ * Compare functions go through translate_compare_func(); stencil ops and
+ * masks are stored as they appear in the CSO.
+ */
+static inline void
+set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
+{
+   const struct pipe_depth_stencil_alpha_state *zsa = &ice->state.cso_zsa->cso;
+   const struct pipe_stencil_state *front = &zsa->stencil[0];
+   const struct pipe_stencil_state *back = &zsa->stencil[1];
+
+   /* Depth test. */
+   ds->DepthTestEnable = zsa->depth_enabled;
+   ds->DepthBufferWriteEnable = zsa->depth_writemask;
+   ds->DepthTestFunction = translate_compare_func(zsa->depth_func);
+
+   /* Front-face stencil. */
+   ds->StencilTestEnable = front->enabled;
+   ds->StencilTestFunction = translate_compare_func(front->func);
+   ds->StencilFailOp = front->fail_op;
+   ds->StencilPassDepthFailOp = front->zfail_op;
+   ds->StencilPassDepthPassOp = front->zpass_op;
+   ds->StencilTestMask = front->valuemask;
+   ds->StencilWriteMask = front->writemask;
+
+   /* Back-face stencil; two-sided mode follows the back face's enable. */
+   ds->DoubleSidedStencilEnable = back->enabled;
+   ds->BackfaceStencilTestFunction = translate_compare_func(back->func);
+   ds->BackfaceStencilFailOp = back->fail_op;
+   ds->BackfaceStencilPassDepthFailOp = back->zfail_op;
+   ds->BackfaceStencilPassDepthPassOp = back->zpass_op;
+   ds->BackfaceStencilTestMask = back->valuemask;
+   ds->BackfaceStencilWriteMask = back->writemask;
+
+   /* Stencil writes are needed if the front write mask is non-zero, or if
+    * the back face is enabled and has a non-zero write mask.
+    */
+   const bool back_writes = back->enabled && back->writemask != 0;
+   ds->StencilBufferWriteEnable = front->writemask != 0 || back_writes;
+}
+
+/**
+ * Pack one VERTEX_BUFFER_STATE entry at *map and advance *map past it.
+ *
+ * \param buffer_id     value for VertexBufferIndex
+ * \param bo            buffer object holding the vertex data
+ * \param start_offset  offset in \p bo used for the starting address
+ * \param end_offset    offset one past the last byte; the end address is
+ *                      programmed as end_offset - 1 (GFX_VER >= 5 only)
+ * \param stride        value for BufferPitch
+ * \param step_rate     instance step rate; non-zero selects instance data,
+ *                      zero selects per-vertex data
+ * \param map           in/out cursor into the packed state, in dwords
+ */
+static void
+emit_vertex_buffer_state(struct crocus_batch *batch,
+                         unsigned buffer_id,
+                         struct crocus_bo *bo,
+                         unsigned start_offset,
+                         unsigned end_offset,
+                         unsigned stride,
+                         unsigned step_rate,
+                         uint32_t **map)
+{
+   uint32_t *vb_out = *map;
+
+   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), vb_out, vb) {
+      vb.VertexBufferIndex = buffer_id;
+      vb.BufferPitch = stride;
+
+      vb.BufferStartingAddress = ro_bo(bo, start_offset);
+#if GFX_VER >= 5
+      vb.EndAddress = ro_bo(bo, end_offset - 1);
+#endif
+
+      if (step_rate)
+         vb.BufferAccessType = INSTANCEDATA;
+      else
+         vb.BufferAccessType = VERTEXDATA;
+      vb.InstanceDataStepRate = step_rate;
+
+#if GFX_VER >= 6
+      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
+#endif
+#if GFX_VER == 7
+      vb.AddressModifyEnable = true;
+#endif
+   }
+
+   *map = vb_out + GENX(VERTEX_BUFFER_STATE_length);
+}
+
+/**
+ * Report whether logic ops may be programmed for the bound framebuffer.
+ *
+ * Only the first non-NULL colour buffer is inspected.  Returns true when
+ * there is no such buffer, when its format is PIPE_FORMAT_NONE, or when
+ * its format is unorm.
+ */
+static bool
+can_emit_logic_op(struct crocus_context *ice)
+{
+   /* all pre gen8 have logicop restricted to unorm */
+   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+
+   for (unsigned rt = 0; rt < fb->nr_cbufs; rt++) {
+      if (!fb->cbufs[rt])
+         continue;
+
+      /* The first bound colour buffer decides the answer. */
+      const enum pipe_format fmt = fb->cbufs[rt]->format;
+      return fmt == PIPE_FORMAT_NONE || util_format_is_unorm(fmt);
+   }
+
+   /* Nothing bound: no format restriction applies. */
+   return true;
+}
+
+#if GFX_VER >= 6
+/**
+ * Compute the sample mask to program for the bound framebuffer.
+ *
+ * With at most one sample the mask is 1.  Otherwise the context's sample
+ * mask is limited to the low framebuffer.samples bits.
+ */
+static uint32_t
+determine_sample_mask(struct crocus_context *ice)
+{
+   const uint32_t fb_samples = ice->state.framebuffer.samples;
+   uint32_t mask = 1;
+
+   if (fb_samples > 1) {
+      /* One bit per framebuffer sample. */
+      const uint32_t coverage = (1 << fb_samples) - 1;
+      mask = ice->state.sample_mask & coverage;
+   }
+
+   return mask;
+}
+#endif
+
+static void
+crocus_upload_dirty_render_state(struct crocus_context *ice,
+                               struct crocus_batch *batch,
+                               const struct pipe_draw_info *draw)
+{
+   uint64_t dirty = ice->state.dirty;
+   uint64_t stage_dirty = ice->state.stage_dirty;
+
+   if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
+       !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
+      return;
+
+   if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
+         vf.StatisticsEnable = true;
+      }
+   }
+
+#if GFX_VER <= 5
+   if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
+                      CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
+      bool ret = calculate_curbe_offsets(batch);
+      if (ret) {
+         dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
+         stage_dirty |= CROCUS_STAGE_DIRTY_VS;
+      }
+   }
+
+   if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
+       stage_dirty & CROCUS_STAGE_DIRTY_VS) {
+     bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
+                                           brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
+                                           ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
+     if (ret)
+        dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+   }
+#endif
+   if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
+      const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+      uint32_t cc_vp_address;
+
+      /* XXX: could avoid streaming for depth_clip [0,1] case. */
+      uint32_t *cc_vp_map =
+         stream_state(batch,
+                      4 * ice->state.num_viewports *
+                      GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
+      for (int i = 0; i < ice->state.num_viewports; i++) {
+         float zmin, zmax;
+         crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
+                                 ice->state.window_space_position,
+                                 &zmin, &zmax);
+         if (cso_rast->cso.depth_clip_near)
+            zmin = 0.0;
+         if (cso_rast->cso.depth_clip_far)
+            zmax = 1.0;
+
+         crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
+            ccv.MinimumDepth = zmin;
+            ccv.MaximumDepth = zmax;
+         }
+
+         cc_vp_map += GENX(CC_VIEWPORT_length);
+      }
+
+#if GFX_VER >= 7
+      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
+         ptr.CCViewportPointer = cc_vp_address;
+      }
+#elif GFX_VER == 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+         vp.CCViewportStateChange = 1;
+         vp.PointertoCC_VIEWPORT = cc_vp_address;
+      }
+#else
+      ice->state.cc_vp_address = cc_vp_address;
+      dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
+#endif
+   }
+
+   if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+#if GFX_VER == 7
+      uint32_t sf_cl_vp_address;
+      uint32_t *vp_map =
+         stream_state(batch,
+                      4 * ice->state.num_viewports *
+                      GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
+#else
+      uint32_t *vp_map =
+         stream_state(batch,
+                      4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
+                      32, &ice->state.sf_vp_address);
+      uint32_t *clip_map =
+         stream_state(batch,
+                      4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
+                      32, &ice->state.clip_vp_address);
+#endif
+
+      for (unsigned i = 0; i < ice->state.num_viewports; i++) {
+         const struct pipe_viewport_state *state = &ice->state.viewports[i];
+         float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
+
+         intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
+                                        state->scale[0], state->scale[1],
+                                        state->translate[0], state->translate[1],
+                                        &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
+#if GFX_VER == 7
+         crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
+#else
+         crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
+#endif
+         {
+            vp.ViewportMatrixElementm00 = state->scale[0];
+            vp.ViewportMatrixElementm11 = state->scale[1];
+            vp.ViewportMatrixElementm22 = state->scale[2];
+            vp.ViewportMatrixElementm30 = state->translate[0];
+            vp.ViewportMatrixElementm31 = state->translate[1];
+            vp.ViewportMatrixElementm32 = state->translate[2];
+#if GFX_VER < 6
+            struct pipe_scissor_state scissor;
+            crocus_fill_scissor_rect(ice, 0, &scissor);
+            vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
+            vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
+            vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
+            vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
+#endif
+
+#if GFX_VER == 7
+            vp.XMinClipGuardband = gb_xmin;
+            vp.XMaxClipGuardband = gb_xmax;
+            vp.YMinClipGuardband = gb_ymin;
+            vp.YMaxClipGuardband = gb_ymax;
+#endif
+         }
+#if GFX_VER < 7
+         crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
+            clip.XMinClipGuardband = gb_xmin;
+            clip.XMaxClipGuardband = gb_xmax;
+            clip.YMinClipGuardband = gb_ymin;
+            clip.YMaxClipGuardband = gb_ymax;
+         }
+#endif
+#if GFX_VER == 7
+         vp_map += GENX(SF_CLIP_VIEWPORT_length);
+#else
+         vp_map += GENX(SF_VIEWPORT_length);
+         clip_map += GENX(CLIP_VIEWPORT_length);
+#endif
+      }
+#if GFX_VER == 7
+      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
+         ptr.SFClipViewportPointer = sf_cl_vp_address;
+      }
+#elif GFX_VER == 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
+         vp.SFViewportStateChange = 1;
+         vp.CLIPViewportStateChange = 1;
+         vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
+         vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
+      }
+#endif
+   }
+
+#if GFX_VER >= 6
+   if (dirty & CROCUS_DIRTY_GEN6_URB) {
+#if GFX_VER == 6
+      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
+         || ice->shaders.ff_gs_prog;
+
+      struct brw_vue_prog_data *vue_prog_data =
+         (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
+      const unsigned vs_size = vue_prog_data->urb_entry_size;
+      unsigned gs_size = vs_size;
+      if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
+         struct brw_vue_prog_data *gs_vue_prog_data =
+            (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
+         gs_size = gs_vue_prog_data->urb_entry_size;
+      }
+
+      genX(upload_urb)(batch, vs_size, gs_present, gs_size);
+#endif
+#if GFX_VER == 7
+      const struct intel_device_info *devinfo = &batch->screen->devinfo;
+      bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
+      bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
+      unsigned entry_size[4];
+
+      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+         if (!ice->shaders.prog[i]) {
+            entry_size[i] = 1;
+         } else {
+            struct brw_vue_prog_data *vue_prog_data =
+               (void *) ice->shaders.prog[i]->prog_data;
+            entry_size[i] = vue_prog_data->urb_entry_size;
+         }
+         assert(entry_size[i] != 0);
+      }
+
+      /* If we're just switching between programs with the same URB requirements,
+       * skip the rest of the logic.
+       */
+      bool no_change = false;
+      if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
+          ice->urb.gs_present == gs_present &&
+          ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
+          ice->urb.tess_present == tess_present &&
+          ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
+          ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
+         no_change = true;
+      }
+
+      if (!no_change) {
+         ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
+         ice->urb.gs_present = gs_present;
+         ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
+         ice->urb.tess_present = tess_present;
+         ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
+         ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
+
+         unsigned entries[4];
+         unsigned start[4];
+         bool constrained;
+         intel_get_urb_config(devinfo,
+                              batch->screen->l3_config_3d,
+                              tess_present,
+                              gs_present,
+                              entry_size,
+                              entries, start, NULL, &constrained);
+
+         if (!(GFX_VERx10 == 75) && !devinfo->is_baytrail)
+            gen7_emit_vs_workaround_flush(batch);
+         for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
+            crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
+               urb._3DCommandSubOpcode += i;
+               urb.VSURBStartingAddress     = start[i];
+               urb.VSURBEntryAllocationSize = entry_size[i] - 1;
+               urb.VSNumberofURBEntries     = entries[i];
+            }
+         }
+      }
+#endif
+   }
+
+   if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
+      struct crocus_blend_state *cso_blend = ice->state.cso_blend;
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
+
+      STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
+
+      const int rt_dwords =
+         MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
+
+      uint32_t blend_offset;
+      uint32_t *blend_map =
+         stream_state(batch,
+                      4 * rt_dwords, 64, &blend_offset);
+
+      bool indep_alpha_blend = false;
+      for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
+         const struct pipe_rt_blend_state *rt =
+            &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
+
+         enum pipe_blendfactor src_rgb =
+            fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
+         enum pipe_blendfactor src_alpha =
+            fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
+         enum pipe_blendfactor dst_rgb =
+            fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
+         enum pipe_blendfactor dst_alpha =
+            fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);
+
+         if (rt->rgb_func != rt->alpha_func ||
+             src_rgb != src_alpha || dst_rgb != dst_alpha)
+            indep_alpha_blend = true;
+
+         crocus_pack_state(GENX(BLEND_STATE_ENTRY), blend_map, be) {
+            if (can_emit_logic_op(ice)) {
+               be.LogicOpEnable = cso_blend->cso.logicop_enable;
+               be.LogicOpFunction = cso_blend->cso.logicop_func;
+            }
+
+            be.ColorClampRange = COLORCLAMP_RTFORMAT;
+            be.PreBlendColorClampEnable = true;
+            be.PostBlendColorClampEnable = true;
+
+            if (i == 0) {
+               struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+               struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+               be.ColorBufferBlendEnable = rt->blend_enable &&
+                  (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
+            } else
+               be.ColorBufferBlendEnable = rt->blend_enable;
+
+            be.ColorBlendFunction          = rt->rgb_func;
+            be.AlphaBlendFunction          = rt->alpha_func;
+            be.SourceBlendFactor           = (int) src_rgb;
+            be.SourceAlphaBlendFactor      = (int) src_alpha;
+            be.DestinationBlendFactor      = (int) dst_rgb;
+            be.DestinationAlphaBlendFactor = (int) dst_alpha;
+
+            be.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
+            be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
+            be.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
+            be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
+
+            be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
+            be.IndependentAlphaBlendEnable = indep_alpha_blend;
+            be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
+            be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage;
+            be.ColorDitherEnable = cso_blend->cso.dither;
+
+            /* bl.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
+            // Except they're not... fix that. Can't be done here since it needs
+            // to be conditional on non-integer RT's
+            be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
+            be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
+         }
+         blend_map += GENX(BLEND_STATE_ENTRY_length);
+      }
+
+#if GFX_VER < 7
+      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+         ptr.PointertoBLEND_STATE = blend_offset;
+         ptr.BLEND_STATEChange = true;
+      }
+#else
+      crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
+         ptr.BlendStatePointer = blend_offset;
+      }
+#endif
+   }
+#endif
+
+   if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
+      struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
+      UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
+      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
+      uint32_t cc_offset;
+      void *cc_map =
+         stream_state(batch,
+                      sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
+                      64, &cc_offset);
+#if GFX_VER <= 5
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+      int blend_idx = 0;
+
+      if (cso_blend->cso.independent_blend_enable) {
+         for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+            if (cso_blend->cso.rt[i].blend_enable) {
+               blend_idx = i;
+               break;
+            }
+         }
+      }
+      const struct pipe_rt_blend_state *rt = &cso_blend->cso.rt[blend_idx];
+#endif
+      _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
+         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
+
+#if GFX_VER <= 5
+
+         set_depth_stencil_bits(ice, &cc);
+
+         cc.ColorBufferBlendEnable = rt->blend_enable;
+
+         if (cso_blend->cso.logicop_enable) {
+            if (can_emit_logic_op(ice)) {
+               cc.LogicOpEnable = cso_blend->cso.logicop_enable;
+               cc.LogicOpFunction = cso_blend->cso.logicop_func;
+            }
+         }
+         cc.ColorDitherEnable = cso_blend->cso.dither;
+         cc.ColorBlendFunction = rt->rgb_func;
+         cc.AlphaBlendFunction = rt->alpha_func;
+         cc.SourceBlendFactor = rt->rgb_src_factor;
+         cc.SourceAlphaBlendFactor = rt->alpha_src_factor;
+         cc.DestinationBlendFactor = rt->rgb_dst_factor;
+         cc.DestinationAlphaBlendFactor = rt->alpha_dst_factor;
+
+         if (rt->rgb_func != rt->alpha_func ||
+             rt->rgb_src_factor != rt->alpha_src_factor ||
+             rt->rgb_dst_factor != rt->alpha_dst_factor)
+            cc.IndependentAlphaBlendEnable = true;
+
+         if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
+            cc.AlphaTestEnable = cso->cso.alpha_enabled;
+            cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
+         }
+         cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
+         cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
+#else
+         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
+         cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
+
+         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
+         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
+         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+#endif
+         cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
+         cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
+      }
+      ice->shaders.cc_offset = cc_offset;
+#if GFX_VER >= 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+         ptr.ColorCalcStatePointer = cc_offset;
+#if GFX_VER != 7
+         ptr.ColorCalcStatePointerValid = true;
+#endif
+      }
+#endif
+   }
+#if GFX_VER <= 5
+   if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
+         blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
+         blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
+         blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
+         blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
+      }
+   }
+#endif
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
+         continue;
+
+      struct crocus_shader_state *shs = &ice->state.shaders[stage];
+      struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
+
+      if (!shader)
+         continue;
+
+      if (shs->sysvals_need_upload)
+         upload_sysvals(ice, stage);
+
+#if GFX_VER <= 5
+      dirty |= CROCUS_DIRTY_GEN4_CURBE;
+#endif
+#if GFX_VER >= 7
+      struct push_bos push_bos = {};
+      setup_constant_buffers(ice, batch, stage, &push_bos);
+
+      emit_push_constant_packets(ice, batch, stage, &push_bos);
+#endif
+   }
+
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
+         if (ice->shaders.prog[stage]) {
+#if GFX_VER <= 6
+            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+#endif
+            crocus_populate_binding_table(ice, batch, stage, false);
+            ice->shaders.prog[stage]->bind_bo_offset =
+               crocus_upload_binding_table(ice, batch,
+                                           ice->shaders.prog[stage]->surf_offset,
+                                           ice->shaders.prog[stage]->bt.size_bytes);
+
+#if GFX_VER == 7
+            crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
+               ptr._3DCommandSubOpcode = 38 + stage;
+               ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
+            }
+#endif
+#if GFX_VER == 6
+         } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
+            dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
+            crocus_populate_binding_table(ice, batch, stage, true);
+            ice->shaders.ff_gs_prog->bind_bo_offset =
+               crocus_upload_binding_table(ice, batch,
+                                           ice->shaders.ff_gs_prog->surf_offset,
+                                           ice->shaders.ff_gs_prog->bt.size_bytes);
+#endif
+         }
+      }
+   }
+#if GFX_VER <= 6
+   if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
+      struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
+      if (gs == NULL)
+         gs = ice->shaders.ff_gs_prog;
+      crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
+         ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
+         ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
+#if GFX_VER == 6
+         ptr.VSBindingTableChange = true;
+         ptr.PSBindingTableChange = true;
+         ptr.GSBindingTableChange = gs ? true : false;
+         ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
+#endif
+      }
+   }
+#endif
+
+   bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
+   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
+      if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
+          !ice->shaders.prog[stage])
+         continue;
+
+      crocus_upload_sampler_states(ice, batch, stage);
+
+      sampler_updates = true;
+
+#if GFX_VER >= 7
+      struct crocus_shader_state *shs = &ice->state.shaders[stage];
+
+      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
+         ptr._3DCommandSubOpcode = 43 + stage;
+         ptr.PointertoVSSamplerState = shs->sampler_offset;
+      }
+#endif
+   }
+
+   if (sampler_updates) {
+#if GFX_VER == 6
+      struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
+      struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
+      struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
+      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
+         if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
+             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
+            ptr.VSSamplerStateChange = true;
+            ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
+         }
+         if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
+             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
+            ptr.GSSamplerStateChange = true;
+            ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
+         }
+         if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
+             (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
+              stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
+            ptr.PSSamplerStateChange = true;
+            ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
+         }
+      }
+#endif
+   }
+
+#if GFX_VER >= 6
+   if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
+         ms.PixelLocation =
+            ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
+         if (ice->state.framebuffer.samples > 0)
+            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
+#if GFX_VER == 6
+         INTEL_SAMPLE_POS_4X(ms.Sample);
+#elif GFX_VER == 7
+         switch (ice->state.framebuffer.samples) {
+         case 1:
+            INTEL_SAMPLE_POS_1X(ms.Sample);
+            break;
+         case 2:
+            INTEL_SAMPLE_POS_2X(ms.Sample);
+            break;
+         case 4:
+            INTEL_SAMPLE_POS_4X(ms.Sample);
+            break;
+         case 8:
+            INTEL_SAMPLE_POS_8X(ms.Sample);
+            break;
+         default:
+            break;
+         }
+#endif
+      }
+   }
+
+   if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
+         ms.SampleMask = determine_sample_mask(ice);
+      }
+   }
+#endif
+
+#if GFX_VER >= 7
+   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
+   if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
+      struct brw_stage_prog_data *prog_data = shader->prog_data;
+      struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
+
+      crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
+         ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+         ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = KSP(ice, shader) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
+         ps.KernelStartPointer1 = KSP(ice, shader) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
+         ps.KernelStartPointer2 = KSP(ice, shader) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
+
+#if GFX_VERx10 == 75
+         ps.SampleMask = determine_sample_mask(ice);
+#endif
+         // XXX: WABTPPrefetchDisable, see above, drop at C0
+         ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
+         ps.FloatingPointMode = prog_data->use_alt_mode;
+         ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
+
+         ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
+
+         ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+         ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
+         ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
+         /* From the documentation for this packet:
+          * "If the PS kernel does not need the Position XY Offsets to
+          *  compute a Position Value, then this field should be programmed
+          *  to POSOFFSET_NONE."
+          *
+          * "SW Recommendation: If the PS kernel needs the Position Offsets
+          *  to compute a Position XY value, this field should match Position
+          *  ZW Interpolation Mode to ensure a consistent position.xyzw
+          *  computation."
+          *
+          * We only require XY sample offsets. So, this recommendation doesn't
+          * look useful at the moment.  We might need this in future.
+          */
+         ps.PositionXYOffsetSelect =
+            wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
+
+         if (wm_prog_data->base.total_scratch) {
+            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
+            ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
+            ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
+         }
+      }
+   }
+#endif
+
+#if GFX_VER >= 7
+   if (ice->state.streamout_active) {
+      if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
+         for (int i = 0; i < 4; i++) {
+            struct crocus_stream_output_target *tgt =
+               (void *) ice->state.so_target[i];
+
+            if (!tgt) {
+               crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
+                  sob.SOBufferIndex = i;
+               }
+               continue;
+            }
+            struct crocus_resource *res = (void *) tgt->base.buffer;
+            uint32_t start = tgt->base.buffer_offset;
+            uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
+            crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
+               sob.SOBufferIndex = i;
+
+               sob.SurfaceBaseAddress = rw_bo(res->bo, start);
+               sob.SurfacePitch = tgt->stride;
+               sob.SurfaceEndAddress = rw_bo(res->bo, end);
+            }
+         }
+      }
+
+      if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
+         uint32_t *decl_list =
+            ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
+         crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
+      }
+
+      if (dirty & CROCUS_DIRTY_STREAMOUT) {
+         const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+
+         uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
+         crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
+            sol.SOFunctionEnable = true;
+            sol.SOStatisticsEnable = true;
+
+            sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
+                                   !ice->state.prims_generated_query_active;
+            sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
+         }
+
+         assert(ice->state.streamout);
+
+         crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
+                         GENX(3DSTATE_STREAMOUT_length));
+      }
+   } else {
+      if (dirty & CROCUS_DIRTY_STREAMOUT) {
+         crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
+      }
+   }
+#endif
+#if GFX_VER == 6
+   if (ice->state.streamout_active) {
+      if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
+         crocus_emit_so_svbi(ice);
+      }
+   }
+#endif
+
+   if (dirty & CROCUS_DIRTY_CLIP) {
+#if GFX_VER < 6
+      const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
+      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+
+      uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+      _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
+         clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
+         clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+         clip.SingleProgramFlow = true;
+         clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
+
+         clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
+         clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
+
+         clip.DispatchGRFStartRegisterForURBData = 1;
+         clip.VertexURBEntryReadOffset = 0;
+         clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
+
+         clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
+         clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+
+         if (batch->ice->urb.nr_clip_entries >= 10) {
+            /* Half of the URB entries go to each thread, and it has to be an
+             * even number.
+             */
+            assert(batch->ice->urb.nr_clip_entries % 2 == 0);
+
+            /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
+             * only 2 threads can output VUEs at a time.
+             */
+            clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
+         } else {
+            assert(batch->ice->urb.nr_clip_entries >= 5);
+            clip.MaximumNumberofThreads = 1 - 1;
+         }
+         clip.VertexPositionSpace = VPOS_NDCSPACE;
+         clip.UserClipFlagsMustClipEnable = true;
+         clip.GuardbandClipTestEnable = true;
+
+         clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
+         clip.ScreenSpaceViewportXMin = -1.0;
+         clip.ScreenSpaceViewportXMax = 1.0;
+         clip.ScreenSpaceViewportYMin = -1.0;
+         clip.ScreenSpaceViewportYMax = 1.0;
+         clip.ViewportXYClipTestEnable = true;
+         clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
+
+#if GFX_VER == 5 || GFX_VERx10 == 45
+         clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
+#else
+         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
+          * workaround.
+          */
+         clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
+#endif
+
+         clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
+         clip.GuardbandClipTestEnable = true;
+
+         clip.ClipMode = clip_prog_data->clip_mode;
+#if GFX_VERx10 == 45
+         clip.NegativeWClipTestEnable = true;
+#endif
+      }
+
+#else //if GFX_VER >= 6
+      struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
+      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
+      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
+      bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
+                       ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+      bool points_or_lines = cso_rast->fill_mode_point_or_line ||
+         (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
+                    : ice->state.prim_is_points_or_lines);
+      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
+      crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
+         cl.StatisticsEnable = ice->state.statistics_counters_enabled;
+         if (cso_rast->cso.rasterizer_discard)
+            cl.ClipMode = CLIPMODE_REJECT_ALL;
+         else if (ice->state.window_space_position)
+            cl.ClipMode = CLIPMODE_ACCEPT_ALL;
+         else
+            cl.ClipMode = CLIPMODE_NORMAL;
+
+         cl.PerspectiveDivideDisable = ice->state.window_space_position;
+         cl.ViewportXYClipTestEnable = !points_or_lines;
+
+         cl.UserClipDistanceCullTestEnableBitmask =
+            brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
+
+         if (wm_prog_data->barycentric_interp_modes &
+             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
+            cl.NonPerspectiveBarycentricEnable = true;
+
+         cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
+         cl.MaximumVPIndex = ice->state.num_viewports - 1;
+      }
+      crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
+                      ARRAY_SIZE(cso_rast->clip));
+#endif
+   }
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
+      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
+      const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+      const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
+#if GFX_VER == 7
+      if (batch->screen->devinfo.is_ivybridge)
+         gen7_emit_vs_workaround_flush(batch);
+#endif
+
+
+#if GFX_VER == 6
+      struct push_bos push_bos = {};
+      setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
+
+      emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
+#endif
+#if GFX_VER >= 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
+#else
+      uint32_t *vs_ptr = stream_state(batch,
+                                      GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+      _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
+#endif
+      {
+         INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
+
+         vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
+
+#if GFX_VER < 6
+         vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
+         vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
+         vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
+
+         vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
+         vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+
+         vs.MaximumNumberofThreads =
+            CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
+         vs.StatisticsEnable = false;
+         vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
+#endif
+#if GFX_VER == 5
+         /* Force single program flow on Ironlake.  We cannot reliably get
+          * all applications working without it.  See:
+          * https://bugs.freedesktop.org/show_bug.cgi?id=29172
+          *
+          * The most notable and reliably failing application is the Humus
+          * demo "CelShading"
+          */
+         vs.SingleProgramFlow = true;
+         vs.SamplerCount = 0; /* hardware requirement */
+
+#endif
+      }
+
+#if GFX_VER == 6
+      crocus_emit_pipe_control_flush(batch,
+                                     "post VS const",
+                                     PIPE_CONTROL_DEPTH_STALL |
+                                     PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+                                     PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+#endif
+   }
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
+      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
+      bool active = GFX_VER >= 6 && shader;
+#if GFX_VER == 6
+      struct push_bos push_bos = {};
+      if (shader)
+         setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
+
+      emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
+#endif
+#if GFX_VER >= 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
+#else
+      uint32_t *gs_ptr = stream_state(batch,
+                                      GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+      _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
+#endif
+     {
+#if GFX_VER >= 6
+         if (active) {
+            const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
+            const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+            const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
+
+            INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
+#if GFX_VER >= 7
+            gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
+            gs.OutputTopology = gs_prog_data->output_topology;
+            gs.ControlDataHeaderSize =
+               gs_prog_data->control_data_header_size_hwords;
+
+            gs.InstanceControl = gs_prog_data->invocations - 1;
+            gs.DispatchMode = vue_prog_data->dispatch_mode;
+
+            gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
+
+            gs.ControlDataFormat = gs_prog_data->control_data_format;
+#endif
+
+            /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
+             * Ivy Bridge and Haswell.
+             *
+             * On Ivy Bridge, setting this bit causes the vertices of a triangle
+             * strip to be delivered to the geometry shader in an order that does
+             * not strictly follow the OpenGL spec, but preserves triangle
+             * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
+             * the geometry shader sees triangles:
+             *
+             * (1, 2, 3), (2, 4, 3), (3, 4, 5)
+             *
+             * (Clearing the bit is even worse, because it fails to preserve
+             * orientation).
+             *
+             * Triangle strips with adjacency are always ordered in a way that
+             * preserves triangle orientation but does not strictly follow the
+             * OpenGL spec, regardless of the setting of this bit.
+             *
+             * On Haswell, both triangle strips and triangle strips with adjacency
+             * are always ordered in a way that preserves triangle orientation.
+             * Setting this bit causes the ordering to strictly follow the OpenGL
+             * spec.
+             *
+             * So in either case we want to set the bit.  Unfortunately on Ivy
+             * Bridge this will get the order close to correct but not perfect.
+             */
+            gs.ReorderMode = TRAILING;
+            gs.MaximumNumberofThreads = (batch->screen->devinfo.max_gs_threads - 1);
+
+#if GFX_VER < 7
+            gs.SOStatisticsEnable = true;
+            if (gs_prog_data->num_transform_feedback_bindings)
+               gs.SVBIPayloadEnable = ice->state.streamout_active;
+
+            /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
+             * was previously done for gen6.
+             *
+             * TODO: test with both disabled to see if the HW is behaving
+             * as expected, like in gen7.
+             */
+            gs.SingleProgramFlow = true;
+            gs.VectorMaskEnable = true;
+#endif
+         }
+#endif
+#if GFX_VER <= 6
+         if (!active && ice->shaders.ff_gs_prog) {
+            const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
+            /* In gen6, transform feedback for the VS stage is done with an
+             * ad-hoc GS program. This function provides the needed 3DSTATE_GS
+             * for this.
+             */
+            gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
+            gs.SingleProgramFlow = true;
+            gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
+            gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
+
+#if GFX_VER <= 5
+            gs.GRFRegisterCount =
+               DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
+            /* BRW_NEW_URB_FENCE */
+            gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
+            gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+            gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
+            gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+#else
+            gs.Enable = true;
+            gs.VectorMaskEnable = true;
+            gs.SVBIPayloadEnable = true;
+            gs.SVBIPostIncrementEnable = true;
+            gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
+            gs.SOStatisticsEnable = true;
+            gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
+#endif
+         }
+#endif
+         if (!active && !ice->shaders.ff_gs_prog) {
+            gs.DispatchGRFStartRegisterForURBData = 1;
+#if GFX_VER >= 7
+            gs.IncludeVertexHandles = true;
+#endif
+         }
+#if GFX_VER >= 6
+         gs.StatisticsEnable = true;
+#endif
+#if GFX_VER == 5 || GFX_VER == 6
+         gs.RenderingEnabled = true;
+#endif
+#if GFX_VER <= 5
+         gs.MaximumVPIndex = ice->state.num_viewports - 1;
+#endif
+      }
+   }
+
+#if GFX_VER >= 7
+   if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
+      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
+
+      if (shader) {
+         const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
+         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+         const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
+
+         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
+            INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
+            hs.InstanceCount = tcs_prog_data->instances - 1;
+            hs.IncludeVertexHandles = true;
+            hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
+         }
+      } else {
+         crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
+      }
+
+   }
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
+      struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
+      if (shader) {
+         const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
+         const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
+         const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
+
+         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
+            te.Partitioning = tes_prog_data->partitioning;
+            te.OutputTopology = tes_prog_data->output_topology;
+            te.TEDomain = tes_prog_data->domain;
+            te.TEEnable = true;
+            te.MaximumTessellationFactorOdd = 63.0;
+            te.MaximumTessellationFactorNotOdd = 64.0;
+         };
+         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
+            INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
+
+            ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
+            ds.ComputeWCoordinateEnable =
+               tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
+         };
+      } else {
+         crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
+         crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
+      }
+   }
+#endif
+   if (dirty & CROCUS_DIRTY_RASTER) {
+
+#if GFX_VER < 6
+      const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
+      struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
+      uint32_t *sf_ptr = stream_state(batch,
+                                      GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+      _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
+         sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
+         sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+         sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
+         sf.DispatchGRFStartRegisterForURBData = 3;
+         sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+         sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
+         sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
+         sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
+         sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+
+         sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
+
+         sf.MaximumNumberofThreads =
+            MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
+
+         sf.SpritePointEnable = cso_state->point_quad_rasterization;
+         sf.DestinationOriginHorizontalBias = 0.5;
+         sf.DestinationOriginVerticalBias = 0.5;
+
+         sf.LastPixelEnable = cso_state->line_last_pixel;
+         sf.LineWidth = get_line_width(cso_state);
+         sf.PointWidth = cso_state->point_size;
+         sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
+#if GFX_VERx10 == 45 || GFX_VER >= 5
+         sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
+#endif
+         sf.ViewportTransformEnable = true;
+         sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
+         sf.ScissorRectangleEnable = true;
+         sf.CullMode = translate_cull_mode(cso_state->cull_face);
+
+         if (cso_state->flatshade_first) {
+            sf.TriangleFanProvokingVertexSelect = 1;
+         } else {
+            sf.TriangleStripListProvokingVertexSelect = 2;
+            sf.TriangleFanProvokingVertexSelect = 2;
+            sf.LineStripListProvokingVertexSelect = 1;
+         }
+      }
+#else
+      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+      uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
+      crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
+         sf.ViewportTransformEnable = !ice->state.window_space_position;
+
+#if GFX_VER == 6
+         const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+         uint32_t urb_entry_read_length;
+         uint32_t urb_entry_read_offset;
+         uint32_t point_sprite_enables;
+         calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
+                                  &urb_entry_read_length,
+                                  &urb_entry_read_offset);
+         sf.VertexURBEntryReadLength = urb_entry_read_length;
+         sf.VertexURBEntryReadOffset = urb_entry_read_offset;
+         sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
+         sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
+         sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+#endif
+
+#if GFX_VER >= 6
+         if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
+            sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+#endif
+#if GFX_VER == 7
+         if (ice->state.framebuffer.zsbuf) {
+            struct crocus_resource *zres, *sres;
+               crocus_get_depth_stencil_resources(&batch->screen->devinfo,
+                                                  ice->state.framebuffer.zsbuf->texture,
+                                                  &zres, &sres);
+            /* ANV thinks that the stencil-ness doesn't matter, this is just
+             * about handling polygon offset scaling.
+             */
+            sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
+         }
+#endif
+      }
+      crocus_emit_merge(batch, cso->sf, dynamic_sf,
+                      ARRAY_SIZE(dynamic_sf));
+#endif
+   }
+
+   if (dirty & CROCUS_DIRTY_WM) {
+      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+      const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
+      UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
+      UNUSED const struct shader_info *fs_info =
+         crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
+
+#if GFX_VER == 6
+      struct push_bos push_bos = {};
+      setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
+
+      emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
+#endif
+#if GFX_VER >= 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
+#else
+      uint32_t *wm_ptr = stream_state(batch,
+                                      GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
+
+      dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
+
+      _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
+#endif
+     {
+#if GFX_VER <= 6
+         wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+#if GFX_VER == 4
+      /* On gen4, we only have one shader kernel */
+         if (brw_wm_state_has_ksp(wm, 0)) {
+            wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
+            wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+            wm.DispatchGRFStartRegisterForConstantSetupData0 =
+               wm_prog_data->base.dispatch_grf_start_reg;
+         }
+#elif GFX_VER == 5
+         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+         wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+         wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
+
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            wm_prog_data->base.dispatch_grf_start_reg;
+#elif GFX_VER == 6
+         wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+         wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+         wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
+            brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+         wm.DispatchGRFStartRegisterForConstantSetupData2 =
+           brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
+#endif
+#if GFX_VER <= 5
+         wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
+         wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
+         wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+         wm.SetupURBEntryReadOffset = 0;
+         wm.EarlyDepthTestEnable = true;
+         wm.LineAntialiasingRegionWidth = _05pixels;
+         wm.LineEndCapAntialiasingRegionWidth = _10pixels;
+         wm.DepthCoefficientURBReadOffset = 1;
+
+         if (cso->cso.offset_tri) {
+            wm.GlobalDepthOffsetEnable = true;
+
+         /* Something weird going on with legacy_global_depth_bias,
+          * offset_constant, scaling and MRD.  This value passes glean
+          * but gives some odd results elsewhere (e.g. the
+          * quad-offset-units test).
+          */
+            wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
+            wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
+         }
+         wm.SamplerStatePointer = ro_bo(batch->state.bo,
+                                        ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
+#endif
+
+         wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
+            ice->state.statistics_counters_enabled : 0;
+
+#if GFX_VER >= 6
+         wm.LineAntialiasingRegionWidth = _10pixels;
+         wm.LineEndCapAntialiasingRegionWidth = _05pixels;
+
+         wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
+         wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
+#endif
+#if GFX_VER == 6
+      wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
+         ice->state.cso_blend->dual_color_blending;
+      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
+      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
+
+      /* From the SNB PRM, volume 2 part 1, page 281:
+       * "If the PS kernel does not need the Position XY Offsets
+       * to compute a Position XY value, then this field should be
+       * programmed to POSOFFSET_NONE."
+       *
+       * "SW Recommendation: If the PS kernel needs the Position Offsets
+       * to compute a Position XY value, this field should match Position
+       * ZW Interpolation Mode to ensure a consistent position.xyzw
+       * computation."
+       * We only require XY sample offsets. So, this recommendation doesn't
+       * look useful at the moment. We might need this in future.
+       */
+      if (wm_prog_data->uses_pos_offset)
+         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
+      else
+         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
+#endif
+         wm.LineStippleEnable = cso->cso.line_stipple_enable;
+         wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
+
+#if GFX_VER < 7
+         if (wm_prog_data->base.use_alt_mode)
+            wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
+         wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
+         wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
+#endif
+
+#if GFX_VER >= 6
+         wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
+
+         struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+         if (fb->samples > 1) {
+            if (cso->cso.multisample)
+               wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
+            else
+               wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+
+            if (wm_prog_data->persample_dispatch)
+               wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+            else
+               wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
+         } else {
+            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
+            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
+         }
+#endif
+
+         wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
+
+         if (wm_prog_data->uses_kill ||
+             ice->state.cso_zsa->cso.alpha_enabled ||
+             ice->state.cso_blend->cso.alpha_to_coverage ||
+             (GFX_VER >= 6 && wm_prog_data->uses_omask))
+            wm.PixelShaderKillsPixel = true;
+
+         if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
+             writes_depth || wm.PixelShaderKillsPixel ||
+             (GFX_VER >= 6 && wm_prog_data->has_side_effects))
+            wm.ThreadDispatchEnable = true;
+
+#if GFX_VER >= 7
+         wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
+         wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
+#else
+         if (wm_prog_data->base.total_scratch) {
+            struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
+                                                            MESA_SHADER_FRAGMENT);
+            wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
+            wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
+         }
+
+         wm.PixelShaderComputedDepth = writes_depth;
+
+#endif
+         /* The "UAV access enable" bits are unnecessary on HSW because they only
+          * seem to have an effect on the HW-assisted coherency mechanism which we
+          * don't need, and the rasterization-related UAV_ONLY flag and the
+          * DISPATCH_ENABLE bit can be set independently from it.
+          * C.f. gen8_upload_ps_extra().
+          *
+          * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
+          * _NEW_COLOR
+          */
+#if GFX_VERx10 == 75
+         if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
+             wm_prog_data->has_side_effects)
+            wm.PSUAVonly = ON;
+#endif
+
+#if GFX_VER >= 7
+      /* BRW_NEW_FS_PROG_DATA */
+         if (wm_prog_data->early_fragment_tests)
+           wm.EarlyDepthStencilControl = EDSC_PREPS;
+         else if (wm_prog_data->has_side_effects)
+           wm.EarlyDepthStencilControl = EDSC_PSEXEC;
+#endif
+      };
+
+#if GFX_VER <= 5
+      if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
+         crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
+            clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
+         }
+         ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
+      }
+#endif
+   }
+
+#if GFX_VER >= 7
+   if (dirty & CROCUS_DIRTY_GEN7_SBE) {
+      crocus_emit_sbe(batch, ice);
+   }
+#endif
+
+#if GFX_VER >= 6
+   if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
+      uint32_t ds_offset;
+      void *ds_map = stream_state(batch,
+                                  sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
+                                  64, &ds_offset);
+      _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
+         set_depth_stencil_bits(ice, &ds);
+      }
+
+#if GFX_VER == 6
+      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
+         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+         ptr.DEPTH_STENCIL_STATEChange = true;
+      }
+#else
+      crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
+         ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
+      }
+#endif
+   }
+
+   if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
+      /* Align to 64-byte boundary as per anv. */
+      uint32_t scissor_offset;
+      struct pipe_scissor_state *scissor_map = (void *)
+         stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
+                      64, &scissor_offset);
+      for (int i = 0; i < ice->state.num_viewports; i++) {
+         struct pipe_scissor_state scissor;
+         crocus_fill_scissor_rect(ice, i, &scissor);
+         scissor_map[i] = scissor;
+      }
+
+      crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
+         ptr.ScissorRectPointer = scissor_offset;
+      }
+   }
+#endif
+
+   if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
+      struct isl_device *isl_dev = &batch->screen->isl_dev;
+#if GFX_VER >= 6
+      crocus_emit_depth_stall_flushes(batch);
+#endif
+      void *batch_ptr;
+      struct crocus_resource *zres, *sres;
+      struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
+      batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
+
+      struct isl_view view = {
+                              .base_level = 0,
+                              .levels = 1,
+                              .base_array_layer = 0,
+                              .array_len = 1,
+                              .swizzle = ISL_SWIZZLE_IDENTITY,
+      };
+      struct isl_depth_stencil_hiz_emit_info info = { .view = &view };
+
+      if (cso->zsbuf) {
+         crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
+         struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
+         if (zsbuf->align_res) {
+            zres = (struct crocus_resource *)zsbuf->align_res;
+         }
+         view.base_level = cso->zsbuf->u.tex.level;
+         view.base_array_layer = cso->zsbuf->u.tex.first_layer;
+         view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
+
+         if (zres) {
+            view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
+
+            info.depth_surf = &zres->surf;
+            info.depth_address = crocus_command_reloc(batch,
+                                                      (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
+                                                      zres->bo, 0, RELOC_32BIT);
+
+            info.mocs = crocus_mocs(zres->bo, isl_dev);
+            view.format = zres->surf.format;
+
+            if (crocus_resource_level_has_hiz(zres, view.base_level)) {
+               info.hiz_usage = zres->aux.usage;
+               info.hiz_surf = &zres->aux.surf;
+               uint32_t hiz_offset = 0;
+
+#if GFX_VER == 6
+               /* HiZ surfaces on Sandy Bridge technically don't support
+                * mip-mapping.  However, we can fake it by offsetting to the
+                * first slice of LOD0 in the HiZ surface.
+                */
+               isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
+                                                   view.base_level, 0, 0,
+                                                   &hiz_offset, NULL, NULL);
+#endif
+               info.hiz_address = crocus_command_reloc(batch,
+                                                       (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
+                                                       zres->aux.bo, zres->aux.offset + hiz_offset,
+                                                       RELOC_32BIT);
+               info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
+            }
+         }
+
+#if GFX_VER >= 6
+         if (sres) {
+            view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
+            info.stencil_aux_usage = sres->aux.usage;
+            info.stencil_surf = &sres->surf;
+
+            uint32_t stencil_offset = 0;
+#if GFX_VER == 6
+            /* Stencil surfaces on Sandy Bridge technically don't support
+             * mip-mapping.  However, we can fake it by offsetting to the
+             * first slice of LOD0 in the stencil surface.
+             */
+            isl_surf_get_image_offset_B_tile_sa(&sres->surf,
+                                                view.base_level, 0, 0,
+                                                &stencil_offset, NULL, NULL);
+#endif
+
+            info.stencil_address = crocus_command_reloc(batch,
+                                                        (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
+                                                        sres->bo, stencil_offset, RELOC_32BIT);
+            if (!zres) {
+               view.format = sres->surf.format;
+               info.mocs = crocus_mocs(sres->bo, isl_dev);
+            }
+         }
+#endif
+      }
+      isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
+   }
+
+   /* TODO: Disable emitting this until something uses a stipple. */
+   if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
+         for (int i = 0; i < 32; i++) {
+            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
+         }
+      }
+   }
+
+   if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
+      struct crocus_rasterizer_state *cso = ice->state.cso_rast;
+      crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
+   }
+
+#if GFX_VER <= 5
+   if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
+      upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
+                                      ice->shaders.vs_offset, ice->shaders.sf_offset,
+                                      ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
+      crocus_upload_urb_fence(batch);
+
+      crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
+        cs.NumberofURBEntries = ice->urb.nr_cs_entries;
+        cs.URBEntryAllocationSize = ice->urb.csize - 1;
+      }
+      dirty |= CROCUS_DIRTY_GEN4_CURBE;
+   }
+#endif
+   if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
+      struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
+      if (fb->width && fb->height) {
+         crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
+            rect.ClippedDrawingRectangleXMax = fb->width - 1;
+            rect.ClippedDrawingRectangleYMax = fb->height - 1;
+         }
+      }
+   }
+
+   if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
+      const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
+      const uint32_t count = user_count +
+         ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
+      uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
+
+      if (count) {
+         const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
+
+         uint32_t *map =
+            crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
+         _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
+            vb.DWordLength = (vb_dwords * count + 1) - 2;
+         }
+         map += 1;
+
+         uint32_t bound = dynamic_bound;
+         int i;
+         while (bound) {
+            i = u_bit_scan(&bound);
+            struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
+            struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
+            uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
+
+            emit_vertex_buffer_state(batch, i, bo,
+                                     buf->buffer_offset,
+                                     ice->state.vb_end[i],
+                                     buf->stride,
+                                     step_rate,
+                                     &map);
+         }
+         i = user_count;
+         if (ice->state.vs_uses_draw_params) {
+            struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
+            emit_vertex_buffer_state(batch, i++,
+                                     res->bo,
+                                     ice->draw.draw_params.offset,
+                                     ice->draw.draw_params.res->width0,
+                                     0, 0, &map);
+         }
+         if (ice->state.vs_uses_derived_draw_params) {
+            struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
+            emit_vertex_buffer_state(batch, i++,
+                                     res->bo,
+                                     ice->draw.derived_draw_params.offset,
+                                     ice->draw.derived_draw_params.res->width0,
+                                     0, 0, &map);
+         }
+      }
+   }
+
+   if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
+      struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
+      const unsigned entries = MAX2(cso->count, 1);
+      if (!(ice->state.vs_needs_sgvs_element ||
+            ice->state.vs_uses_derived_draw_params ||
+            ice->state.vs_needs_edge_flag)) {
+         crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
+                         (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
+      } else {
+         uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
+         const unsigned dyn_count = cso->count +
+            ice->state.vs_needs_sgvs_element +
+            ice->state.vs_uses_derived_draw_params;
+
+         crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
+                           &dynamic_ves, ve) {
+            ve.DWordLength =
+               1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
+         }
+         memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
+                (cso->count - ice->state.vs_needs_edge_flag) *
+                GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
+         uint32_t *ve_pack_dest =
+            &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
+                         GENX(VERTEX_ELEMENT_STATE_length)];
+
+         if (ice->state.vs_needs_sgvs_element) {
+            uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
+                                 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+            crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+               ve.Valid = true;
+               ve.VertexBufferIndex =
+                  util_bitcount64(ice->state.bound_vertex_buffers);
+               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+               ve.Component0Control = base_ctrl;
+               ve.Component1Control = base_ctrl;
+               ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
+               ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
+#if GFX_VER < 5
+               ve.DestinationElementOffset = cso->count * 4;
+#endif
+            }
+            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+         }
+         if (ice->state.vs_uses_derived_draw_params) {
+            crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
+               ve.Valid = true;
+               ve.VertexBufferIndex =
+                  util_bitcount64(ice->state.bound_vertex_buffers) +
+                  ice->state.vs_uses_draw_params;
+               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
+               ve.Component0Control = VFCOMP_STORE_SRC;
+               ve.Component1Control = VFCOMP_STORE_SRC;
+               ve.Component2Control = VFCOMP_STORE_0;
+               ve.Component3Control = VFCOMP_STORE_0;
+#if GFX_VER < 5
+               ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
+#endif
+            }
+            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
+         }
+         if (ice->state.vs_needs_edge_flag) {
+            for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
+               ve_pack_dest[i] = cso->edgeflag_ve[i];
+         }
+
+         crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
+                         (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
+      }
+   }
+
+#if GFX_VERx10 == 75
+   if (dirty & CROCUS_DIRTY_GEN75_VF) {
+      crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
+         if (draw->primitive_restart) {
+            vf.IndexedDrawCutIndexEnable = true;
+            vf.CutIndex = draw->restart_index;
+         }
+      }
+   }
+#endif
+
+#if GFX_VER <= 5
+   if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
+      gen4_upload_curbe(batch);
+   }
+#endif
+}
+/**
+ * Emit one draw call into the batch.
+ *
+ * Uploads the dirty render state, programs 3DSTATE_INDEX_BUFFER for indexed
+ * draws when the bound index data or its parameters changed, sets up the
+ * indirect draw registers and predicate (GFX_VER == 7 only; every other
+ * version asserts that indirect is NULL), and finally emits 3DPRIMITIVE.
+ */
+static void
+crocus_upload_render_state(struct crocus_context *ice,
+                           struct crocus_batch *batch,
+                           const struct pipe_draw_info *draw,
+                           unsigned drawid_offset,
+                           const struct pipe_draw_indirect_info *indirect,
+                           const struct pipe_draw_start_count_bias *sc)
+{
+#if GFX_VER == 7
+   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
+#endif
+   bool emit_index = false;
+   batch->no_wrap = true;
+   /* First draw recorded in this batch: force the index buffer packet. */
+   if (!batch->contains_draw) {
+      emit_index = true;
+      batch->contains_draw = true;
+   }
+   crocus_update_surface_base_address(batch);
+
+   crocus_upload_dirty_render_state(ice, batch, draw);
+
+   batch->no_wrap = false;
+   if (draw->index_size > 0) {
+      unsigned offset;
+      unsigned size;
+      /* User-pointer indices: upload only the sc->count indices starting at
+       * sc->start, then move offset back by start_offset and grow size by
+       * the same amount, so the programmed range begins where index 0 would
+       * be.  NOTE(review): offset is unsigned, so the subtraction wraps if
+       * the upload landed at an offset smaller than start_offset.
+       */
+      if (draw->has_user_indices) {
+         unsigned start_offset = draw->index_size * sc->start;
+         u_upload_data(ice->ctx.stream_uploader, 0,
+                       sc->count * draw->index_size, 4,
+                       (char *)draw->index.user + start_offset,
+                       &offset, &ice->state.index_buffer.res);
+         offset -= start_offset;
+         size = start_offset + sc->count * draw->index_size;
+         emit_index = true;
+      } else {
+         struct crocus_resource *res = (void *) draw->index.resource;
+         res->bind_history |= PIPE_BIND_INDEX_BUFFER;
+
+         if (ice->state.index_buffer.res != draw->index.resource) {
+            pipe_resource_reference(&ice->state.index_buffer.res,
+                                    draw->index.resource);
+            emit_index = true;
+         }
+         offset = 0;
+         size = draw->index.resource->width0;
+      }
+      /* Same buffer as last time: still re-emit if any parameter changed. */
+      if (!emit_index &&
+          (ice->state.index_buffer.size != size ||
+           ice->state.index_buffer.index_size != draw->index_size ||
+           ice->state.index_buffer.prim_restart != draw->primitive_restart))
+         emit_index = true;
+
+      if (emit_index) {
+         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);
+         /* For GFX_VERx10 == 75 the cut index is enabled via 3DSTATE_VF. */
+         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
+#if !(GFX_VERx10 == 75)
+            ib.CutIndexEnable = draw->primitive_restart;
+#endif
+            ib.IndexFormat = draw->index_size >> 1; /* 1, 2, 4 bytes -> 0, 1, 2 */
+            ib.BufferStartingAddress = ro_bo(bo, offset);
+            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
+         }
+         ice->state.index_buffer.size = size;
+         ice->state.index_buffer.offset = offset;
+         ice->state.index_buffer.index_size = draw->index_size;
+         ice->state.index_buffer.prim_restart = draw->primitive_restart;
+      }
+   }
+   /* Register offsets used below to pass 3DPRIMITIVE parameters indirectly. */
+#define _3DPRIM_END_OFFSET          0x2420
+#define _3DPRIM_START_VERTEX        0x2430
+#define _3DPRIM_VERTEX_COUNT        0x2434
+#define _3DPRIM_INSTANCE_COUNT      0x2438
+#define _3DPRIM_START_INSTANCE      0x243C
+#define _3DPRIM_BASE_VERTEX         0x2440
+
+#if GFX_VER == 7
+   if (indirect && !indirect->count_from_stream_output) {
+      if (indirect->indirect_draw_count) {
+         use_predicate = true;
+
+         struct crocus_bo *draw_count_bo =
+            crocus_resource_bo(indirect->indirect_draw_count);
+         unsigned draw_count_offset =
+            indirect->indirect_draw_count_offset;
+
+         crocus_emit_pipe_control_flush(batch,
+                                        "ensure indirect draw buffer is flushed",
+                                        PIPE_CONTROL_FLUSH_ENABLE);
+         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
+#if GFX_VERx10 == 75
+            struct mi_builder b;
+            mi_builder_init(&b, &batch->screen->devinfo, batch);
+            /* Only GFX_VERx10 == 75 builds emit anything in this branch. */
+            /* comparison = draw id < draw count */
+            struct mi_value comparison =
+               mi_ult(&b, mi_imm(drawid_offset),
+                      mi_mem32(ro_bo(draw_count_bo,
+                                     draw_count_offset)));
+
+            /* predicate = comparison & conditional rendering predicate */
+            struct mi_value pred = mi_iand(&b, comparison,
+                                           mi_reg32(CS_GPR(15)));
+
+            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
+            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
+            /* SRC1 is 0, so this is presumably predicate = (pred != 0),
+             * assuming LOADINV inverts the SRCS_EQUAL result — confirm.
+             */
+            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+               MI_PREDICATE_COMBINEOP_SET |
+               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+
+            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+#endif
+         } else {
+            uint32_t mi_predicate;
+
+            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
+            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
+            /* Upload the current draw count from the draw parameters buffer
+             * to MI_PREDICATE_SRC0.
+             */
+            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
+                                       draw_count_bo, draw_count_offset);
+            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
+            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
+
+            if (drawid_offset == 0) {
+               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
+                  MI_PREDICATE_COMBINEOP_SET |
+                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+            } else {
+               /* While draw_index < draw_count the predicate's result will be
+                *  (draw_index == draw_count) ^ TRUE = TRUE
+                * When draw_index == draw_count the result is
+                *  (TRUE) ^ TRUE = FALSE
+                * After this all results will be:
+                *  (FALSE) ^ FALSE = FALSE
+                */
+               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
+                  MI_PREDICATE_COMBINEOP_XOR |
+                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
+            }
+            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
+         }
+      }
+      /* Load the draw parameters from the indirect buffer.  Dword layout
+       * read below: vertex count, instance count, start vertex, then either
+       * base vertex + start instance (indexed) or start instance only.
+       */
+#if GFX_VER >= 7 /* always true: nested inside #if GFX_VER == 7 */
+      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
+      assert(bo);
+
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
+         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
+      }
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
+         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
+      }
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
+         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
+      }
+      if (draw->index_size) {
+         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
+            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
+         }
+         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
+            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
+         }
+      } else {
+         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
+            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
+         }
+         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
+            lri.DataDWord = 0;
+         }
+      }
+#endif
+   } else if (indirect && indirect->count_from_stream_output) {
+#if GFX_VERx10 == 75 /* other GFX_VER 7 builds emit no register setup here */
+      struct crocus_stream_output_target *so =
+         (void *) indirect->count_from_stream_output;
+
+      /* XXX: Replace with actual cache tracking */
+      crocus_emit_pipe_control_flush(batch,
+                                     "draw count from stream output stall",
+                                     PIPE_CONTROL_CS_STALL);
+
+      struct mi_builder b;
+      mi_builder_init(&b, &batch->screen->devinfo, batch);
+      /* Vertex count = (dword stored at offset_res:offset_offset minus the
+       * target's buffer_offset) / stride.
+       */
+      struct crocus_address addr =
+         ro_bo(crocus_resource_bo(&so->offset_res->base), so->offset_offset);
+      struct mi_value offset =
+         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
+
+      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
+               mi_udiv32_imm(&b, offset, so->stride));
+
+      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
+      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
+      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
+      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
+#endif
+   }
+#else
+   assert(!indirect);
+#endif
+
+   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
+      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
+#if GFX_VER == 7
+      prim.PredicateEnable = use_predicate;
+#endif
+
+      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, draw->vertices_per_patch);
+      if (indirect) {
+         // XXX Probably have to do something for gen6 here?
+#if GFX_VER == 7
+         prim.IndirectParameterEnable = true;
+#endif
+      } else {
+#if GFX_VER >= 5
+         prim.StartInstanceLocation = draw->start_instance;
+#endif
+         prim.InstanceCount = draw->instance_count;
+         prim.VertexCountPerInstance = sc->count;
+
+         prim.StartVertexLocation = sc->start;
+
+         if (draw->index_size) {
+            prim.BaseVertexLocation += sc->index_bias;
+         }
+      }
+   }
+}
+
+#if GFX_VER == 7
+/**
+ * Emit the state and commands for one compute dispatch.
+ *
+ * Depending on the stage dirty bits this uploads system values, the binding
+ * table and sampler states, then emits MEDIA_VFE_STATE, the CURBE push
+ * constants and the interface descriptor.  For indirect dispatches it loads
+ * the grid size registers and builds a predicate.  It always ends with
+ * GPGPU_WALKER followed by MEDIA_STATE_FLUSH.
+ *
+ * Only built when GFX_VER == 7.
+ */
+static void
+crocus_upload_compute_state(struct crocus_context *ice,
+                            struct crocus_batch *batch,
+                            const struct pipe_grid_info *grid)
+{
+   const uint64_t stage_dirty = ice->state.stage_dirty;
+   struct crocus_screen *screen = batch->screen;
+   const struct intel_device_info *devinfo = &screen->devinfo;
+   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
+   struct crocus_compiled_shader *shader =
+      ice->shaders.prog[MESA_SHADER_COMPUTE];
+   struct brw_stage_prog_data *prog_data = shader->prog_data;
+   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
+   const struct brw_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
+
+   crocus_update_surface_base_address(batch);
+   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
+      upload_sysvals(ice, MESA_SHADER_COMPUTE);
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
+      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
+      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
+         crocus_upload_binding_table(ice, batch,
+                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
+                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
+   }
+
+   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
+      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
+
+   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
+       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
+      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
+       *
+       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+       *    the only bits that are changed are scoreboard related: Scoreboard
+       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
+       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
+       *    sufficient."
+       */
+      crocus_emit_pipe_control_flush(batch,
+                                     "workaround: stall before MEDIA_VFE_STATE",
+                                     PIPE_CONTROL_CS_STALL);
+
+      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
+         if (prog_data->total_scratch) {
+            struct crocus_bo *bo =
+               crocus_get_scratch_space(ice, prog_data->total_scratch,
+                                        MESA_SHADER_COMPUTE);
+#if GFX_VERx10 == 75
+            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
+             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
+             */
+            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
+#else
+            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
+             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
+             */
+            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
+#endif
+            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
+         }
+
+         vfe.MaximumNumberofThreads =
+            devinfo->max_cs_threads * screen->subslice_total - 1;
+         vfe.ResetGatewayTimer =
+            Resettingrelativetimerandlatchingtheglobaltimestamp;
+         vfe.BypassGatewayControl = true;
+         vfe.GPGPUMode = 1;
+         vfe.NumberofURBEntries = 0;
+         vfe.URBEntryAllocationSize = 0;
+
+         vfe.CURBEAllocationSize =
+            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
+                  cs_prog_data->push.cross_thread.regs, 2);
+      }
+   }
+
+   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms.
+    * Until then the assert below requires the subgroup id to be the only
+    * pushed value (one per-thread dword, no cross-thread data).
+    */
+   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
+       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
+      uint32_t curbe_data_offset = 0;
+      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
+             cs_prog_data->push.per_thread.dwords == 1 &&
+             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
+      const unsigned push_const_size =
+         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
+      uint32_t *curbe_data_map =
+         stream_state(batch,
+                      ALIGN(push_const_size, 64), 64,
+                      &curbe_data_offset);
+      assert(curbe_data_map);
+      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64)); /* pre-fill; presumably a poison value — confirm */
+      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
+                                       curbe_data_map);
+
+      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
+         curbe.CURBEDataStartAddress = curbe_data_offset;
+      }
+   }
+
+   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
+                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
+                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
+                      CROCUS_STAGE_DIRTY_CS)) {
+      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
+      const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
+      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
+         idd.KernelStartPointer = ksp;
+         idd.SamplerStatePointer = shs->sampler_offset;
+         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
+         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31); /* 4 bytes per entry, clamped to 31 */
+         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
+         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
+         idd.BarrierEnable = cs_prog_data->uses_barrier;
+         idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
+                                                     prog_data->total_shared);
+#if GFX_VERx10 >= 75
+         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
+#endif
+      }
+
+      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
+         load.InterfaceDescriptorTotalLength =
+            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
+         load.InterfaceDescriptorDataStartAddress =
+            emit_state(batch, desc, sizeof(desc), 64);
+      }
+   }
+   /* Registers that receive the indirect dispatch dimensions below. */
+#define GPGPU_DISPATCHDIMX 0x2500
+#define GPGPU_DISPATCHDIMY 0x2504
+#define GPGPU_DISPATCHDIMZ 0x2508
+   /* Indirect dispatch: load the grid size from memory and build the
+    * predicate !(x == 0 || y == 0 || z == 0), which the walker below is
+    * predicated on (presumably so an empty grid dispatches nothing).
+    */
+   if (grid->indirect) {
+      struct crocus_state_ref *grid_size = &ice->state.grid_size;
+      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
+         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
+      }
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
+         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
+      }
+      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
+         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
+         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
+      }
+
+      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
+      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
+      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
+
+      /* Load compute_dispatch_indirect_x_size into SRC0 */
+      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
+
+      /* predicate = (compute_dispatch_indirect_x_size == 0); */
+      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOAD;
+         mip.CombineOperation = COMBINE_SET;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      };
+
+      /* Load compute_dispatch_indirect_y_size into SRC0 */
+      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
+
+      /* predicate |= (compute_dispatch_indirect_y_size == 0); */
+      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOAD;
+         mip.CombineOperation = COMBINE_OR;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      };
+
+      /* Load compute_dispatch_indirect_z_size into SRC0 */
+      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
+
+      /* predicate |= (compute_dispatch_indirect_z_size == 0); */
+      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOAD;
+         mip.CombineOperation = COMBINE_OR;
+         mip.CompareOperation = COMPARE_SRCS_EQUAL;
+      };
+
+      /* predicate = !predicate; */
+#define COMPARE_FALSE                           1
+      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
+         mip.LoadOperation    = LOAD_LOADINV;
+         mip.CombineOperation = COMBINE_OR;
+         mip.CompareOperation = COMPARE_FALSE;
+      }
+
+   }
+
+   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.IndirectParameterEnable    = grid->indirect != NULL;
+      ggw.PredicateEnable            = grid->indirect != NULL;
+      ggw.SIMDSize                   = dispatch.simd_size / 16; /* 8, 16, 32 -> 0, 1, 2 */
+      ggw.ThreadDepthCounterMaximum  = 0;
+      ggw.ThreadHeightCounterMaximum = 0;
+      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
+      ggw.ThreadGroupIDXDimension    = grid->grid[0];
+      ggw.ThreadGroupIDYDimension    = grid->grid[1];
+      ggw.ThreadGroupIDZDimension    = grid->grid[2];
+      ggw.RightExecutionMask         = dispatch.right_mask;
+      ggw.BottomExecutionMask        = 0xffffffff;
+   }
+
+   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
+
+   batch->contains_draw = true;
+}
+
+#endif /* GFX_VER == 7 */
+
+/**
+ * State module teardown.
+ *
+ * Resets every resource pointer tracked here to NULL through the matching
+ * pipe_*_reference() helper: the draw parameter buffers, stream-output
+ * targets, framebuffer colour and depth/stencil surfaces, each shader
+ * stage's constant buffers, images, SSBOs and sampler views, the indirect
+ * grid-size buffer and the index buffer.
+ */
+static void
+crocus_destroy_state(struct crocus_context *ice)
+{
+
+   pipe_resource_reference(&ice->draw.draw_params.res, NULL);
+   pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
+
+   for (int i = 0; i < 4; i++) { /* NOTE(review): 4 is hard-coded; presumably the so_target[] size — confirm */
+      pipe_so_target_reference(&ice->state.so_target[i], NULL);
+   }
+
+   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
+      pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
+
+   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      struct crocus_shader_state *shs = &ice->state.shaders[stage];
+      for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+         pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
+      }
+      for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
+         pipe_resource_reference(&shs->image[i].base.resource, NULL);
+      }
+      for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
+         pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
+      }
+      for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
+         pipe_sampler_view_reference((struct pipe_sampler_view **)
+                                     &shs->textures[i], NULL);
+      }
+   }
+
+   pipe_resource_reference(&ice->state.grid_size.res, NULL);
+
+   pipe_resource_reference(&ice->state.index_buffer.res, NULL);
+}
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * Refresh context state that refers to the buffer \p res.
+ *
+ * Each binding point recorded in res->bind_history is compared against what
+ * is currently bound.  Matches are flagged dirty (vertex buffers, constant
+ * buffers, sampler views, shader images), re-bound through
+ * crocus_set_shader_buffers() (SSBOs), or dropped (index buffer).
+ *
+ * NOTE(review): presumably called after res->bo has been replaced --
+ * confirm against the callers of vtbl.rebind_buffer.
+ */
+static void
+crocus_rebind_buffer(struct crocus_context *ice,
+                     struct crocus_resource *res)
+{
+   struct pipe_context *ctx = &ice->ctx;
+
+   assert(res->base.target == PIPE_BUFFER);
+
+   /* Buffers can't be framebuffer attachments, nor display related,
+    * and we don't have upstream Clover support.
+    */
+   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
+                                 PIPE_BIND_RENDER_TARGET |
+                                 PIPE_BIND_BLENDABLE |
+                                 PIPE_BIND_DISPLAY_TARGET |
+                                 PIPE_BIND_CURSOR |
+                                 PIPE_BIND_COMPUTE_RESOURCE |
+                                 PIPE_BIND_GLOBAL)));
+
+   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
+      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
+      while (bound_vbs) {
+         const int i = u_bit_scan64(&bound_vbs);
+         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];
+
+         if (!buffer->is_user_buffer && &res->base == buffer->buffer.resource)
+            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
+      }
+   }
+
+   if (res->bind_history & PIPE_BIND_INDEX_BUFFER) {
+      /* This function itself resets index_buffer.res to NULL below, so a
+       * later call can find the slot empty; don't hand a NULL resource to
+       * crocus_resource_bo().
+       */
+      struct pipe_resource *ib = ice->state.index_buffer.res;
+
+      if (ib && res->bo == crocus_resource_bo(ib))
+         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
+   }
+   /* There is no need to handle these:
+    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
+    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
+    */
+
+   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
+      /* XXX: be careful about resetting vs appending... */
+      assert(false);
+   }
+
+   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
+      struct crocus_shader_state *shs = &ice->state.shaders[s];
+      enum pipe_shader_type p_stage = stage_to_pipe(s);
+
+      /* Skip stages this buffer was never bound to. */
+      if (!(res->bind_stages & (1 << s)))
+         continue;
+
+      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
+         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
+         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
+         while (bound_cbufs) {
+            const int i = u_bit_scan(&bound_cbufs);
+            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];
+
+            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
+               /* The per-stage dirty bit is derived by shifting the VS bit. */
+               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
+            }
+         }
+      }
+
+      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
+         uint32_t bound_ssbos = shs->bound_ssbos;
+         while (bound_ssbos) {
+            const int i = u_bit_scan(&bound_ssbos);
+            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
+
+            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
+               /* Re-bind the same range, keeping the slot's writable bit. */
+               struct pipe_shader_buffer buf = {
+                  .buffer = &res->base,
+                  .buffer_offset = ssbo->buffer_offset,
+                  .buffer_size = ssbo->buffer_size,
+               };
+               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
+                                         (shs->writable_ssbos >> i) & 1);
+            }
+         }
+      }
+
+      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
+         uint32_t bound_sampler_views = shs->bound_sampler_views;
+         while (bound_sampler_views) {
+            const int i = u_bit_scan(&bound_sampler_views);
+            struct crocus_sampler_view *isv = shs->textures[i];
+            struct crocus_bo *bo = isv->res->bo;
+
+            if (res->bo == bo) {
+               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
+            }
+         }
+      }
+
+      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
+         uint32_t bound_image_views = shs->bound_image_views;
+         while (bound_image_views) {
+            const int i = u_bit_scan(&bound_image_views);
+            struct crocus_image_view *iv = &shs->image[i];
+            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);
+
+            if (res->bo == bo)
+               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
+         }
+      }
+   }
+}
+
+/* ------------------------------------------------------------------- */
+
+/**
+ * Translate PIPE_CONTROL_WRITE_* flag bits into the value programmed into
+ * the packet's PostSyncOperation field.
+ *
+ * If several write bits are set, immediate data wins over depth count,
+ * which wins over timestamp.  Returns 0 when none of them is set.
+ */
+static unsigned
+flags_to_post_sync_op(uint32_t flags)
+{
+   unsigned post_sync_op = 0;
+
+   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE) {
+      post_sync_op = WriteImmediateData;
+   } else if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) {
+      post_sync_op = WritePSDepthCount;
+   } else if (flags & PIPE_CONTROL_WRITE_TIMESTAMP) {
+      post_sync_op = WriteTimestamp;
+   }
+
+   return post_sync_op;
+}
+
+/*
+ * Extract the Post Sync / LRI Post Sync operation bits from the given flags.
+ *
+ * The result is non-zero exactly when the flags request such an operation.
+ */
+static enum pipe_control_flags
+get_post_sync_flags(enum pipe_control_flags flags)
+{
+   const enum pipe_control_flags post_sync_ops =
+      flags & (PIPE_CONTROL_WRITE_IMMEDIATE |
+               PIPE_CONTROL_WRITE_DEPTH_COUNT |
+               PIPE_CONTROL_WRITE_TIMESTAMP |
+               PIPE_CONTROL_LRI_POST_SYNC_OP);
+
+   /* A PIPE_CONTROL carries at most one "Post Sync Op", and that is mutually
+    * exclusive with "LRI Post Sync Operation", so two or more of these bits
+    * at once would be illegal.
+    */
+   assert(util_bitcount(post_sync_ops) <= 1);
+
+   return post_sync_ops;
+}
+
+/* True when `batch` is the compute batch.  The argument is parenthesized so
+ * that any pointer expression (e.g. "&ice->batches[i]") expands correctly.
+ */
+#define IS_COMPUTE_PIPELINE(batch) ((batch)->name == CROCUS_BATCH_COMPUTE)
+
+/**
+ * Emit a series of PIPE_CONTROL commands, taking into account any
+ * workarounds necessary to actually accomplish the caller's request.
+ *
+ * Unless otherwise noted, spec quotations in this function come from:
+ *
+ * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
+ * Restrictions for PIPE_CONTROL.
+ *
+ * You should not use this function directly.  Use the helpers in
+ * crocus_pipe_control.c instead, which may split the pipe control further.
+ *
+ * Parameters:
+ *  - batch:  batch the command is emitted into; on Gen7 (non-Haswell) its
+ *            pipe_controls_since_last_cs_stall counter is also updated.
+ *  - reason: free-form string, only printed in the DEBUG_PIPE_CONTROL trace.
+ *  - flags:  PIPE_CONTROL_* bits requested by the caller; the workarounds
+ *            below may OR in additional bits before the packet is emitted.
+ *  - bo, offset: passed to ggtt_bo() to build the packet's Address field.
+ *  - imm:    value programmed into the packet's ImmediateData field.
+ *
+ * Most restrictions are only checked with assert(); a few are satisfied by
+ * adding PIPE_CONTROL_CS_STALL or PIPE_CONTROL_STALL_AT_SCOREBOARD.
+ */
+static void
+crocus_emit_raw_pipe_control(struct crocus_batch *batch,
+                             const char *reason,
+                             uint32_t flags,
+                             struct crocus_bo *bo,
+                             uint32_t offset,
+                             uint64_t imm)
+{
+   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   /* Captured from the caller's flags, before any workaround bits are added. */
+   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
+   UNUSED enum pipe_control_flags non_lri_post_sync_flags =
+      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+   /* Recursive PIPE_CONTROL workarounds --------------------------------
+    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
+    *
+    * We do these first because we want to look at the original operation,
+    * rather than any workarounds we set.
+    */
+
+   /* "Flush Types" workarounds ---------------------------------------------
+    * We do these now because they may add post-sync operations or CS stalls.
+    */
+
+   if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+      /* Hardware workaround: SNB B-Spec says:
+       *
+       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
+       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
+       *     required."
+       */
+      crocus_emit_post_sync_nonzero_flush(batch);
+   }
+
+   if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+      /* Project: PRE-HSW / Argument: Depth Stall
+       *
+       * "The following bits must be clear:
+       *  - Render Target Cache Flush Enable ([12] of DW1)
+       *  - Depth Cache Flush Enable ([0] of DW1)"
+       */
+      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
+   }
+
+   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
+       *
+       *    "This bit must be DISABLED for operations other than writing
+       *     PS_DEPTH_COUNT."
+       *
+       * This seems like nonsense.  An Ivybridge workaround requires us to
+       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
+       * operation.  Gen8+ requires us to emit depth stalls and depth cache
+       * flushes together.  So, it's hard to imagine this means anything other
+       * than "we originally intended this to be used for PS_DEPTH_COUNT".
+       *
+       * We ignore the supposed restriction and do nothing.
+       */
+   }
+
+   if (!(GFX_VERx10 == 75) && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
+      /* Project: PRE-HSW / Argument: Depth Cache Flush
+       *
+       * "Depth Stall must be clear ([13] of DW1)."
+       */
+      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
+   }
+
+   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
+      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
+       *
+       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
+       *     PS_DEPTH_COUNT or TIMESTAMP queries."
+       *
+       * TODO: Implement end-of-pipe checking.
+       */
+      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
+   }
+
+   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
+      /* From the PIPE_CONTROL instruction table, bit 1:
+       *
+       *    "This bit is ignored if Depth Stall Enable is set.
+       *     Further, the render cache is not flushed even if Write Cache
+       *     Flush Enable bit is set."
+       *
+       * We assert that the caller doesn't do this combination, to try and
+       * prevent mistakes.  It shouldn't hurt the GPU, though.
+       *
+       * The check below is unconditional; the Gen11+ exception (where the
+       * "Stall at Pixel Scoreboard" and "Render Target Flush" combo is
+       * required for BTI update workarounds) is not implemented here.
+       */
+      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
+                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
+   }
+
+   /* PIPE_CONTROL page workarounds ------------------------------------- */
+
+   if (GFX_VER == 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
+      /* From the PIPE_CONTROL page itself:
+       *
+       *    "IVB, HSW, BDW
+       *     Restriction: Pipe_control with CS-stall bit set must be issued
+       *     before a pipe-control command that has the State Cache
+       *     Invalidate bit set."
+       *
+       * We add the CS stall bit to this same PIPE_CONTROL.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if ((GFX_VERx10 == 75)) {
+      /* From the PIPE_CONTROL page itself:
+       *
+       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
+       *     Prior to programming a PIPECONTROL command with any of the RO
+       *     cache invalidation bit set, program a PIPECONTROL flush command
+       *     with “CS stall” bit and “HDC Flush” bit set."
+       *
+       * TODO: Actually implement this.  What's an HDC Flush?
+       */
+   }
+
+   if (flags & PIPE_CONTROL_FLUSH_LLC) {
+      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
+       *
+       *    "Project: ALL
+       *     SW must always program Post-Sync Operation to "Write Immediate
+       *     Data" when Flush LLC is set."
+       *
+       * For now, we just require the caller to do it.
+       */
+      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
+   }
+
+   /* "Post-Sync Operation" workarounds -------------------------------- */
+
+   /* Project: All / Argument: Global Snapshot Count Reset [19]
+    *
+    * "This bit must not be exercised on any product.
+    *  Requires stall bit ([20] of DW1) set."
+    *
+    * We don't use this, so we just assert that it isn't used.  The
+    * PIPE_CONTROL instruction page indicates that they intended this
+    * as a debug feature and don't think it is useful in production,
+    * but it may actually be usable, should we ever want to.
+    */
+   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
+
+   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
+                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
+      /* Project: All / Arguments:
+       *
+       * - Generic Media State Clear [16]
+       * - Indirect State Pointers Disable [16]
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
+       * State Clear) says:
+       *
+       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
+       *     programmed prior to programming a PIPECONTROL command with "Media
+       *     State Clear" set in GPGPU mode of operation"
+       *
+       * We respond to both by setting the CS stall bit on this PIPE_CONTROL.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
+      /* Project: All / Argument: Store Data Index
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0'."
+       *
+       * For now, we just assert that the caller does this.  We might want to
+       * automatically add a write to the workaround BO...
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (flags & PIPE_CONTROL_SYNC_GFDT) {
+      /* Project: All / Argument: Sync GFDT
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0' or 0x2520[13] must be set."
+       *
+       * For now, we just assert that the caller does this.
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+      /* Project: SNB, IVB, HSW / Argument: TLB inv
+       *
+       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
+       *  must be set to something other than '0'."
+       *
+       * For now, we just assert that the caller does this.
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+      /* Project: IVB+ / Argument: TLB inv
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, from the PIPE_CONTROL instruction table:
+       *
+       *    "Project: SKL+
+       *     Post Sync Operation or CS stall must be set to ensure a TLB
+       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
+       *     cache to invalidate."
+       *
+       * We set the CS stall bit on this PIPE_CONTROL.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+    *
+    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+    *
+    * Note that the kernel does CS stalls between batches, so we only need
+    * to count them within a batch.  We currently naively count every 4, and
+    * don't skip the ones with only read-cache-invalidate bits set.  This
+    * may or may not be a problem...
+    *
+    * Also note that the two checks below are not exclusive: when this
+    * PIPE_CONTROL already has a CS stall, the counter is reset and then
+    * immediately pre-incremented, so it restarts at 1 rather than 0.
+    */
+   if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         batch->pipe_controls_since_last_cs_stall = 0;
+      }
+
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++batch->pipe_controls_since_last_cs_stall == 4) {
+         batch->pipe_controls_since_last_cs_stall = 0;
+         flags |= PIPE_CONTROL_CS_STALL;
+      }
+   }
+
+   /* "Stall" workarounds ----------------------------------------------
+    * These have to come after the earlier ones because we may have added
+    * some additional CS stalls above.
+    */
+
+   if (flags & PIPE_CONTROL_CS_STALL) {
+      /* Project: PRE-SKL, VLV, CHV
+       *
+       * "[All Stepping][All SKUs]:
+       *
+       *  One of the following must also be set:
+       *
+       *  - Render Target Cache Flush Enable ([12] of DW1)
+       *  - Depth Cache Flush Enable ([0] of DW1)
+       *  - Stall at Pixel Scoreboard ([1] of DW1)
+       *  - Depth Stall ([13] of DW1)
+       *  - Post-Sync Operation ([13] of DW1)
+       *  - DC Flush Enable ([5] of DW1)"
+       *
+       * If we don't already have one of those bits set, we choose to add
+       * "Stall at Pixel Scoreboard".  Some of the other bits require a
+       * CS stall as a workaround (see above), which would send us into
+       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
+       * appears to be safe, so we choose that.
+       */
+      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               PIPE_CONTROL_WRITE_IMMEDIATE |
+                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                               PIPE_CONTROL_WRITE_TIMESTAMP |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_DATA_CACHE_FLUSH;
+      if (!(flags & wa_bits))
+         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+   }
+
+   /* Emit --------------------------------------------------------------- */
+
+   /* Debug trace of the final flags (workaround bits included), followed by
+    * the immediate value in hex and the caller's reason string.
+    */
+   if (INTEL_DEBUG & DEBUG_PIPE_CONTROL) {
+      fprintf(stderr,
+              "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
+              (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
+              (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
+              (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
+              (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
+              (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
+              (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
+              (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
+              (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
+              (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
+              (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
+              (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
+              (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
+              (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
+              (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
+              (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
+              (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
+              "SnapRes" : "",
+              (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
+              "ISPDis" : "",
+              (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
+              (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
+              (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
+              imm, reason);
+   }
+
+   /* Pack the final flags into the packet.  Which fields are programmed
+    * depends on the generation this file is compiled for.
+    */
+   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
+#if GFX_VER >= 7
+      pc.LRIPostSyncOperation = NoLRIOperation;
+      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
+      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
+#endif
+#if GFX_VER >= 6
+      /* Always 0 here, regardless of PIPE_CONTROL_STORE_DATA_INDEX. */
+      pc.StoreDataIndex = 0;
+      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
+      pc.GlobalSnapshotCountReset =
+         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
+      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
+      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
+      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
+      pc.RenderTargetCacheFlushEnable =
+         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
+      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+      pc.StateCacheInvalidationEnable =
+         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
+      pc.ConstantCacheInvalidationEnable =
+         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+#else
+      /* Before Gen6 the render target flush maps to WriteCacheFlush. */
+      pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
+#endif
+      pc.PostSyncOperation = flags_to_post_sync_op(flags);
+      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
+      pc.InstructionCacheInvalidateEnable =
+         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
+      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
+#if GFX_VER >= 5 || GFX_VERx10 == 45
+      pc.IndirectStatePointersDisable =
+         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
+#endif
+#if GFX_VER >= 6
+      pc.TextureCacheInvalidationEnable =
+         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+#elif GFX_VER == 5 || GFX_VERx10 == 45
+      /* Same flag bit, but the field is named TextureCacheFlushEnable here. */
+      pc.TextureCacheFlushEnable =
+         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+#endif
+      pc.Address = ggtt_bo(bo, offset);
+      /* Below Gen7 the destination address type is set explicitly, and only
+       * when a BO was supplied.
+       */
+      if (GFX_VER < 7 && bo)
+         pc.DestinationAddressType = DAT_GGTT;
+      pc.ImmediateData = imm;
+   }
+}
+
+#if GFX_VER == 6
+/**
+ * Split the URB between the VS and (optionally) the GS and emit 3DSTATE_URB.
+ *
+ * The entry counts are derived from ice->urb.size, clamped to the device
+ * maxima, rounded down to a multiple of 4 and stored in ice->urb.  The
+ * entry sizes vs_size / gs_size are asserted to be at most 5 and are
+ * programmed minus one.
+ */
+void
+genX(upload_urb)(struct crocus_batch *batch,
+                 unsigned vs_size,
+                 bool gs_present,
+                 unsigned gs_size)
+{
+   const struct intel_device_info *devinfo = &batch->screen->devinfo;
+   struct crocus_context *ice = batch->ice;
+   int total_urb_size = ice->urb.size * 1024; /* in bytes */
+   int nr_vs_entries;
+   int nr_gs_entries = 0;
+
+   /* Work out how many entries fit: the VS gets the whole URB unless a GS
+    * is present, in which case each stage gets half.
+    */
+   if (!gs_present) {
+      nr_vs_entries = total_urb_size / (vs_size * 128);
+   } else {
+      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
+      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
+   }
+
+   /* The hardware caps the number of entries per stage. */
+   if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX]) {
+      nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
+   }
+   if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY]) {
+      nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
+   }
+
+   /* Both counts must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
+   ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
+   ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
+
+   assert(ice->urb.nr_vs_entries >=
+          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
+   assert(ice->urb.nr_vs_entries % 4 == 0);
+   assert(ice->urb.nr_gs_entries % 4 == 0);
+   assert(vs_size <= 5);
+   assert(gs_size <= 5);
+
+   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb_state) {
+      urb_state.VSURBEntryAllocationSize = vs_size - 1;
+      urb_state.VSNumberofURBEntries = ice->urb.nr_vs_entries;
+
+      urb_state.GSURBEntryAllocationSize = gs_size - 1;
+      urb_state.GSNumberofURBEntries = ice->urb.nr_gs_entries;
+   }
+
+   /* The PRM, Volume 2 part 1, section 1.4.7 says:
+    *
+    *   Because of a urb corruption caused by allocating a previous gsunit's
+    *   urb entry to vsunit software is required to send a "GS NULL
+    *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
+    *   a dummy DRAW call before any case where VS will be taking over GS URB
+    *   space.
+    *
+    * What exactly that means is unclear ("URB fence" is not a command that
+    * exists on Gen6), so a full pipeline flush is used as the workaround
+    * whenever the GS goes from present to absent.
+    */
+   const bool vs_takes_over_gs_space = ice->urb.gs_present && !gs_present;
+   if (vs_takes_over_gs_space) {
+      crocus_emit_mi_flush(batch);
+   }
+   ice->urb.gs_present = gs_present;
+}
+#endif
+
+/* No-op: the body is empty, so neither `ice` nor `batch` is touched.
+ * NOTE(review): presumably a hook for resetting generation-specific state
+ * after the hardware context is lost -- confirm where it is referenced.
+ */
+static void
+crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
+{
+}
+
+/**
+ * Emit an MI_REPORT_PERF_COUNT command tagged with `report_id` whose memory
+ * address is `bo` + `offset_in_bytes`.
+ *
+ * The command only exists in builds with GFX_VER >= 7; on older
+ * generations this function compiles to an empty body.
+ */
+static void
+crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
+                                 struct crocus_bo *bo,
+                                 uint32_t offset_in_bytes,
+                                 uint32_t report_id)
+{
+#if GFX_VER >= 7
+   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
+      rpc.ReportID = report_id;
+      rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
+   }
+#endif
+}
+
+/**
+ * From the PRM, Volume 2a:
+ *
+ *    "Indirect State Pointers Disable
+ *
+ *    At the completion of the post-sync operation associated with this pipe
+ *    control packet, the indirect state pointers in the hardware are
+ *    considered invalid; the indirect pointers are not saved in the context.
+ *    If any new indirect state commands are executed in the command stream
+ *    while the pipe control is pending, the new indirect state commands are
+ *    preserved.
+ *
+ *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
+ *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
+ *    commands are only considered as Indirect State Pointers. Once ISP is
+ *    issued in a context, SW must initialize by programming push constant
+ *    commands for all the shaders (at least to zero length) before attempting
+ *    any rendering operation for the same context."
+ *
+ * 3DSTATE_CONSTANT_* packets are restored during a context restore,
+ * even though they point to a BO that has been already unreferenced at
+ * the end of the previous batch buffer. This has been fine so far since
+ * we are protected by the scratch page (every address not covered by
+ * a BO should be pointing to the scratch page). But on CNL, it is
+ * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
+ * instruction.
+ *
+ * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
+ * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
+ * context restore, so the mentioned hang doesn't happen. However,
+ * software must program push constant commands for all stages prior to
+ * rendering anything, so we flag them as dirty.
+ *
+ * Finally, we also stall at the pixel scoreboard to make sure the constants
+ * have been loaded into the EUs before disabling the push constants, so
+ * that doing so doesn't hang a previous 3DPRIMITIVE.
+ */
+#if GFX_VER >= 7
+/* Emit a scoreboard stall + CS stall, then a PIPE_CONTROL with "Indirect
+ * State Pointers Disable" + CS stall, and flag the push constants of the
+ * VS, TCS, TES, GS and FS stages dirty.
+ */
+static void
+gen7_emit_isp_disable(struct crocus_batch *batch)
+{
+   const uint32_t stall_flags = PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                                PIPE_CONTROL_CS_STALL;
+   const uint32_t isp_disable_flags =
+      PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
+      PIPE_CONTROL_CS_STALL;
+
+   /* The stall must be emitted before the ISP disable itself. */
+   crocus_emit_raw_pipe_control(batch, "isp disable", stall_flags,
+                                NULL, 0, 0);
+   crocus_emit_raw_pipe_control(batch, "isp disable", isp_disable_flags,
+                                NULL, 0, 0);
+
+   struct crocus_context *ice = batch->ice;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
+}
+#endif
+
+#if GFX_VER >= 7
+/* Commands emitted at the end of a batch.
+ *
+ * On Haswell render batches this first emits an MI flush, re-emits
+ * 3DSTATE_CC_STATE_POINTERS with the current cc_offset and issues a render
+ * target flush with CS stall.  In every case the indirect state pointers
+ * are then disabled.
+ */
+static void
+crocus_state_finish_batch(struct crocus_batch *batch)
+{
+#if GFX_VERx10 == 75
+   const bool is_render_batch = batch->name == CROCUS_BATCH_RENDER;
+
+   if (is_render_batch) {
+      crocus_emit_mi_flush(batch);
+
+      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), cc_ptrs) {
+         cc_ptrs.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
+      }
+
+      crocus_emit_pipe_control_flush(batch, "hsw wa",
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_CS_STALL);
+   }
+#endif
+
+   gen7_emit_isp_disable(batch);
+}
+#endif
+
+/* Flag the state that has to be re-emitted in a new batch.
+ *
+ * On GEN4/5 anything living in the state batch that points at something
+ * else in the state batch must be emitted again, because the old state
+ * batch is no longer available.
+ */
+static void
+crocus_batch_reset_dirty(struct crocus_batch *batch)
+{
+   struct crocus_context *ice = batch->ice;
+
+   /* Generation-independent global state. */
+   ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
+                       CROCUS_DIRTY_COLOR_CALC_STATE |
+                       CROCUS_DIRTY_VERTEX_ELEMENTS |
+                       CROCUS_DIRTY_VERTEX_BUFFERS |
+                       CROCUS_DIRTY_CC_VIEWPORT |
+                       CROCUS_DIRTY_SF_CL_VIEWPORT;
+
+   /* Binding tables and sampler states of every stage. */
+   ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS |
+                             CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
+
+   /* Push constants of every stage. */
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_TES |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_GS |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_FS |
+                             CROCUS_STAGE_DIRTY_CONSTANTS_CS;
+
+   /* Shader programs for the VS, GS and CS stages. */
+   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS |
+                             CROCUS_STAGE_DIRTY_GS |
+                             CROCUS_STAGE_DIRTY_CS;
+
+#if GFX_VER >= 6
+   /* Blend, scissor and depth/stencil state. */
+   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE |
+                       CROCUS_DIRTY_GEN6_SCISSOR_RECT |
+                       CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
+#endif
+#if GFX_VER <= 5
+   /* dirty the SF state on gen4/5, plus CURBE, clip and WM */
+   ice->state.dirty |= CROCUS_DIRTY_RASTER |
+                       CROCUS_DIRTY_GEN4_CURBE |
+                       CROCUS_DIRTY_CLIP |
+                       CROCUS_DIRTY_WM;
+#endif
+#if GFX_VER >= 7
+   /* Streamout dirty */
+   ice->state.dirty |= CROCUS_DIRTY_STREAMOUT |
+                       CROCUS_DIRTY_SO_DECL_LIST |
+                       CROCUS_DIRTY_GEN7_SO_BUFFERS;
+#endif
+}
+
+#if GFX_VERx10 == 75
+/* Return the gallium rasterizer state embedded in the context's currently
+ * bound rasterizer CSO (ice->state.cso_rast, which is dereferenced
+ * unconditionally).
+ */
+struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
+{
+   struct pipe_rasterizer_state *rast = &ice->state.cso_rast->cso;
+
+   return rast;
+}
+#endif
+
+#if GFX_VER >= 6
+/* Store strides[i] * sizeof(uint32_t) as the stride of every bound stream
+ * output target; unbound (NULL) slots are skipped.
+ * NOTE(review): presumably `strides` is in dwords and so->stride in bytes --
+ * confirm against the callers.
+ */
+static void update_so_strides(struct crocus_context *ice,
+                              uint16_t *strides)
+{
+   for (int buf = 0; buf < PIPE_MAX_SO_BUFFERS; buf++) {
+      struct crocus_stream_output_target *target =
+         (void *)ice->state.so_target[buf];
+
+      if (!target)
+         continue;
+
+      target->stride = strides[buf] * sizeof(uint32_t);
+   }
+}
+#endif
+
+/* Switch no-op mode on or off for the context's batches.
+ *
+ * For each batch where crocus_batch_prepare_noop() returns true, all of the
+ * corresponding state (render or compute) is flagged dirty.  The compute
+ * batch is only considered when the context has more than one batch.
+ */
+static void
+crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
+{
+   struct crocus_context *ice = (struct crocus_context *) ctx;
+   const bool render_changed =
+      crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable);
+
+   if (render_changed) {
+      ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
+      ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
+   }
+
+   if (ice->batch_count != 1) {
+      const bool compute_changed =
+         crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE],
+                                   enable);
+
+      if (compute_changed) {
+         ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
+         ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
+      }
+   }
+}
+
+void
+genX(init_screen_state)(struct crocus_screen *screen)
+{ /* Fill screen->vtbl for this GFX_VERx10; hooks inside #if blocks are not assigned on other versions. */
+   assert(screen->devinfo.verx10 == GFX_VERx10); /* screen must match the compiled-in version */
+   screen->vtbl.destroy_state = crocus_destroy_state;
+   screen->vtbl.init_render_context = crocus_init_render_context;
+   screen->vtbl.upload_render_state = crocus_upload_render_state;
+#if GFX_VER == 7 /* compute context/state hooks are only set when GFX_VER is 7 */
+   screen->vtbl.init_compute_context = crocus_init_compute_context;
+   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
+#endif
+   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
+   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
+   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
+#if GFX_VERx10 == 75 /* load_register_reg/imm and store_data_imm hooks: version 75 only */
+   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
+   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
+   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
+   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
+   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
+   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
+#endif
+#if GFX_VER >= 7 /* load_register_mem, copy_mem_mem and create_so_decl_list hooks */
+   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
+   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
+   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
+   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
+#endif
+   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
+#if GFX_VER >= 6 /* store_register_mem hooks */
+   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
+   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
+#endif
+   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
+   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
+   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
+   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
+   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
+   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
+   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
+#if GFX_VER >= 7 /* finish_batch hook */
+   screen->vtbl.finish_batch = crocus_state_finish_batch;
+#endif
+#if GFX_VER <= 5 /* URB fence hooks; upload_urb_fence is called from blorp_emit_pipeline() */
+   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
+   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
+#endif
+   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
+   screen->vtbl.translate_prim_type = translate_prim_type;
+#if GFX_VER >= 6 /* stream-output stride/offset hooks */
+   screen->vtbl.update_so_strides = update_so_strides;
+   screen->vtbl.get_so_offset = crocus_get_so_offset;
+#endif
+   /* Finally call genX(init_blt)() on the same screen. */
+   genX(init_blt)(screen);
+}
+
+void
+genX(init_state)(struct crocus_context *ice)
+{ /* Install the state-related pipe_context entry points and set initial ice->state defaults. */
+   struct pipe_context *ctx = &ice->ctx;
+   /* create/bind/delete, set_* and draw/compute callbacks. */
+   ctx->create_blend_state = crocus_create_blend_state;
+   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
+   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
+   ctx->create_sampler_state = crocus_create_sampler_state;
+   ctx->create_sampler_view = crocus_create_sampler_view;
+   ctx->create_surface = crocus_create_surface;
+   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
+   ctx->bind_blend_state = crocus_bind_blend_state;
+   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
+   ctx->bind_sampler_states = crocus_bind_sampler_states;
+   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
+   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
+   ctx->delete_blend_state = crocus_delete_state; /* all five delete hooks share crocus_delete_state */
+   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
+   ctx->delete_rasterizer_state = crocus_delete_state;
+   ctx->delete_sampler_state = crocus_delete_state;
+   ctx->delete_vertex_elements_state = crocus_delete_state;
+   ctx->set_blend_color = crocus_set_blend_color;
+   ctx->set_clip_state = crocus_set_clip_state;
+   ctx->set_constant_buffer = crocus_set_constant_buffer;
+   ctx->set_shader_buffers = crocus_set_shader_buffers;
+   ctx->set_shader_images = crocus_set_shader_images;
+   ctx->set_sampler_views = crocus_set_sampler_views;
+   ctx->set_tess_state = crocus_set_tess_state;
+   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
+   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
+   ctx->set_sample_mask = crocus_set_sample_mask;
+   ctx->set_scissor_states = crocus_set_scissor_states;
+   ctx->set_stencil_ref = crocus_set_stencil_ref;
+   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
+   ctx->set_viewport_states = crocus_set_viewport_states;
+   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
+   ctx->surface_destroy = crocus_surface_destroy;
+   ctx->draw_vbo = crocus_draw_vbo;
+   ctx->launch_grid = crocus_launch_grid;
+
+   ctx->set_frontend_noop = crocus_set_frontend_noop;
+
+#if GFX_VER >= 6 /* stream-output target hooks are only installed for GFX_VER >= 6 */
+   ctx->create_stream_output_target = crocus_create_stream_output_target;
+   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
+   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
+#endif
+   /* Start with every bit set in both dirty masks. */
+   ice->state.dirty = ~0ull;
+   ice->state.stage_dirty = ~0ull;
+
+   ice->state.statistics_counters_enabled = true;
+   /* Initial values for tracked state. */
+   ice->state.sample_mask = 0xff;
+   ice->state.num_viewports = 1;
+   ice->state.prim_mode = PIPE_PRIM_MAX;
+   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state)); /* NOTE(review): result is not checked for NULL */
+   ice->draw.derived_params.drawid = -1;
+
+   /* Default all scissor rectangles to be empty regions (min > max on both axes). */
+   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
+      ice->state.scissors[i] = (struct pipe_scissor_state) {
+         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
+      };
+   }
+}
diff --git a/src/gallium/drivers/crocus/crocus_todo.txt b/src/gallium/drivers/crocus/crocus_todo.txt
new file mode 100644
index 00000000000..1a6d3c9a710
--- /dev/null
+++ b/src/gallium/drivers/crocus/crocus_todo.txt
@@ -0,0 +1,16 @@
+Quick TODO list from what I can see:
+
+General:
+Re-emit SURFACE_STATE_BASE_ADDRESS at the top of every batch
+
+Gen4:
+rgb32 issue
+
+Gen5:
+rgb32 issue
+
+Gen6:
+vec4 push constants
+
+Gen7:
+
diff --git a/src/gallium/drivers/crocus/driinfo_crocus.h b/src/gallium/drivers/crocus/driinfo_crocus.h
new file mode 100644
index 00000000000..829bf7f818c
--- /dev/null
+++ b/src/gallium/drivers/crocus/driinfo_crocus.h
@@ -0,0 +1,11 @@
+// crocus-specific driconf options
+
+DRI_CONF_SECTION_DEBUG
+   DRI_CONF_DUAL_COLOR_BLEND_BY_LOCATION(false)
+   DRI_CONF_DISABLE_THROTTLING(false)
+   DRI_CONF_ALWAYS_FLUSH_CACHE(false)
+DRI_CONF_SECTION_END
+
+DRI_CONF_SECTION_PERFORMANCE
+   DRI_CONF_OPT_E(bo_reuse, 1, 0, 1, "Buffer object reuse",)
+DRI_CONF_SECTION_END
diff --git a/src/gallium/drivers/crocus/gen4_blorp_exec.h b/src/gallium/drivers/crocus/gen4_blorp_exec.h
new file mode 100644
index 00000000000..bc19a1b39fc
--- /dev/null
+++ b/src/gallium/drivers/crocus/gen4_blorp_exec.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+static inline struct blorp_address
+dynamic_state_address(struct blorp_batch *blorp_batch, uint32_t offset)
+{ /* Wrap `offset` as a blorp_address whose buffer is the driver batch's state BO. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   return (struct blorp_address) {
+      .buffer = batch->state.bo,
+      .offset = offset,
+   };
+
+}
+
+static inline struct blorp_address
+instruction_state_address(struct blorp_batch *blorp_batch, uint32_t offset)
+{ /* Wrap `offset` as a blorp_address whose buffer is the context's shader cache BO. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   return (struct blorp_address) {
+      .buffer = batch->ice->shaders.cache_bo,
+      .offset = offset,
+   };
+}
+
+static struct blorp_address
+blorp_emit_vs_state(struct blorp_batch *blorp_batch)
+{ /* Emit a VS_STATE with Enable = false and return its address via dynamic_state_address(). */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   uint32_t offset; /* written by blorp_emit_dynamic() through &offset */
+   blorp_emit_dynamic(blorp_batch, GENX(VS_STATE), vs, 64, &offset) { /* 64: presumably the alignment — TODO confirm */
+      vs.Enable = false;
+      vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
+#if GFX_VER == 5 /* on GFX_VER 5 the entry count is programmed divided by 4 */
+      vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> 2;
+#else
+      vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries;
+#endif
+   }
+
+   return dynamic_state_address(blorp_batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_sf_state(struct blorp_batch *blorp_batch,
+                    const struct blorp_params *params)
+{ /* Emit SF_STATE from params->sf_prog_data and the context URB layout; returns its address. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+   const struct brw_sf_prog_data *prog_data = params->sf_prog_data; /* dereferenced below without a NULL check */
+
+   uint32_t offset;
+   blorp_emit_dynamic(blorp_batch, GENX(SF_STATE), sf, 64, &offset) {
+#if GFX_VER == 4 /* GFX_VER 4 takes a blorp_address in the shader cache BO; others take the raw kernel offset */
+      sf.KernelStartPointer =
+         instruction_state_address(blorp_batch, params->sf_prog_kernel);
+#else
+      sf.KernelStartPointer = params->sf_prog_kernel;
+#endif
+      sf.GRFRegisterCount = DIV_ROUND_UP(prog_data->total_grf, 16) - 1; /* count of 16-register blocks, minus one */
+      sf.VertexURBEntryReadLength = prog_data->urb_read_length;
+      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
+      sf.DispatchGRFStartRegisterForURBData = 3;
+      sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
+      sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
+
+#if GFX_VER == 5 /* thread count is capped at 48 on GFX_VER 5 and at 24 otherwise, then stored minus one */
+      sf.MaximumNumberofThreads = MIN2(48, batch->ice->urb.nr_sf_entries) - 1;
+#else
+      sf.MaximumNumberofThreads = MIN2(24, batch->ice->urb.nr_sf_entries) - 1;
+#endif
+      sf.ViewportTransformEnable = false;
+
+      sf.CullMode = CULLMODE_NONE;
+   }
+
+   return dynamic_state_address(blorp_batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_wm_state(struct blorp_batch *blorp_batch,
+                    const struct blorp_params *params)
+{ /* Emit WM_STATE for a blorp operation and return its address via dynamic_state_address(). */
+   const struct brw_wm_prog_data *prog_data = params->wm_prog_data; /* may be NULL; checked below */
+
+   uint32_t offset;
+   blorp_emit_dynamic(blorp_batch, GENX(WM_STATE), wm, 64, &offset) {
+      if (params->src.enabled) { /* sampler state is only emitted when a source surface is enabled */
+         /* Iron Lake can't do sampler prefetch */
+         wm.SamplerCount = (GFX_VER != 5);
+         wm.BindingTableEntryCount = 2;
+         uint32_t sampler = blorp_emit_sampler_state(blorp_batch);
+         wm.SamplerStatePointer = dynamic_state_address(blorp_batch, sampler);
+      }
+
+      if (prog_data) { /* without prog_data none of the dispatch/kernel fields in this branch are set */
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            prog_data->base.dispatch_grf_start_reg;
+         wm.SetupURBEntryReadLength = prog_data->num_varying_inputs * 2;
+         wm.SetupURBEntryReadOffset = 0;
+
+         wm.DepthCoefficientURBReadOffset = 1;
+         wm.PixelShaderKillsPixel = prog_data->uses_kill;
+         wm.ThreadDispatchEnable = true;
+         wm.EarlyDepthTestEnable = true;
+
+         wm._8PixelDispatchEnable = prog_data->dispatch_8;
+         wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
+#if GFX_VER == 4 /* GFX_VER 4 programs only kernel slot 0, as a blorp_address in the shader cache BO */
+         wm.KernelStartPointer0 =
+            instruction_state_address(blorp_batch, params->wm_prog_kernel);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
+#else /* other versions program slots 0-2 as wm_prog_kernel plus a per-slot program offset */
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
+         wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(prog_data, wm, 1);
+         wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(prog_data, wm, 2);
+#endif
+      }
+      /* Always set, regardless of prog_data: devinfo's max_wm_threads minus one. */
+      wm.MaximumNumberofThreads =
+         blorp_batch->blorp->compiler->devinfo->max_wm_threads - 1;
+   }
+
+   return dynamic_state_address(blorp_batch, offset);
+}
+
+static struct blorp_address
+blorp_emit_color_calc_state(struct blorp_batch *blorp_batch)
+{ /* Emit a CC viewport, then a COLOR_CALC_STATE pointing at it; returns the latter's address. */
+   uint32_t cc_viewport = blorp_emit_cc_viewport(blorp_batch);
+
+   uint32_t offset;
+   blorp_emit_dynamic(blorp_batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) { /* only the viewport pointer is set here */
+      cc.CCViewportStatePointer = dynamic_state_address(blorp_batch, cc_viewport);
+   }
+
+   return dynamic_state_address(blorp_batch, offset);
+}
+
+static void
+blorp_emit_pipeline(struct blorp_batch *blorp_batch,
+                    const struct blorp_params *params)
+{ /* Emit URB config, 3DSTATE_PIPELINED_POINTERS, the URB fence, CS_URB_STATE and CONSTANT_BUFFER, in that order. */
+   struct crocus_batch *batch = blorp_batch->driver_batch;
+
+   emit_urb_config(blorp_batch, params, NULL);
+
+   blorp_emit(blorp_batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) { /* GS and clip are disabled; each pointer comes from an emit helper */
+      pp.PointertoVSState = blorp_emit_vs_state(blorp_batch);
+      pp.GSEnable = false;
+      pp.ClipEnable = false;
+      pp.PointertoSFState = blorp_emit_sf_state(blorp_batch, params);
+      pp.PointertoWMState = blorp_emit_wm_state(blorp_batch, params);
+      pp.PointertoColorCalcState = blorp_emit_color_calc_state(blorp_batch);
+   }
+   /* upload_urb_fence is a screen vtbl hook; genX(init_screen_state) sets it only for GFX_VER <= 5. */
+   batch->screen->vtbl.upload_urb_fence(batch);
+
+   blorp_emit(blorp_batch, GENX(CS_URB_STATE), curb); /* no fields assigned — presumably left at defaults */
+   blorp_emit(blorp_batch, GENX(CONSTANT_BUFFER), curb);
+}
diff --git a/src/gallium/drivers/crocus/meson.build b/src/gallium/drivers/crocus/meson.build
new file mode 100644
index 00000000000..2bdb1f2cfb5
--- /dev/null
+++ b/src/gallium/drivers/crocus/meson.build
@@ -0,0 +1,90 @@
+# Copyright © 2017-2019 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+files_libcrocus = files(
+  'gen4_blorp_exec.h',
+  'driinfo_crocus.h',
+  'crocus_batch.c',
+  'crocus_batch.h',
+  'crocus_blit.c',
+  'crocus_bufmgr.c',
+  'crocus_bufmgr.h',
+  'crocus_clear.c',
+  'crocus_context.c',
+  'crocus_context.h',
+  'crocus_draw.c',
+  'crocus_fence.c',
+  'crocus_fence.h',
+  'crocus_fine_fence.c',
+  'crocus_fine_fence.h',
+  'crocus_formats.c',
+  'crocus_genx_macros.h',
+  'crocus_genx_protos.h',
+  'crocus_monitor.c',
+  'crocus_pipe.h',
+  'crocus_pipe_control.c',
+  'crocus_program.c',
+  'crocus_program_cache.c',
+  'crocus_resolve.c',
+  'crocus_resource.c',
+  'crocus_resource.h',
+  'crocus_screen.c',
+  'crocus_screen.h',
+  'crocus_disk_cache.c',
+)
+
+crocus_per_hw_ver_libs = []
+foreach v : ['40', '45', '50', '60', '70', '75']  # one static library per GFX_VERx10 value
+  crocus_per_hw_ver_libs += static_library(
+    'crocus_per_hw_ver@0@'.format(v),
+    ['crocus_blorp.c', 'crocus_query.c', 'crocus_state.c', 'crocus_blt.c', gen_xml_pack],  # same sources for every version
+    include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_intel],
+    c_args : [
+      no_override_init_args, c_sse2_args,
+      '-DGFX_VERx10=@0@'.format(v),  # the version these sources are compiled for
+    ],
+    gnu_symbol_visibility : 'hidden',
+    dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
+  )
+endforeach
+
+libcrocus = static_library(
+  'crocus',
+  [files_libcrocus, gen_xml_pack],
+  include_directories : [
+    inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_intel,
+    inc_gallium_drivers,
+    # inc_mesa and inc_mapi should not be necessary, but main/macros.h...
+    inc_mesa, inc_mapi
+  ],
+  c_args : [c_sse2_args],
+  cpp_args : [c_sse2_args],
+  gnu_symbol_visibility : 'hidden',
+  dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_libintel_common, idep_nir_headers],
+  link_with : [  # includes the per-hardware-version libraries collected in crocus_per_hw_ver_libs
+    crocus_per_hw_ver_libs, libintel_compiler, libintel_dev, libisl,
+    libblorp, libintel_perf
+  ],
+)
+
+driver_crocus = declare_dependency(
+  compile_args : '-DGALLIUM_CROCUS',  # enables the GALLIUM_CROCUS-guarded code in drm_helper.h and targets/dri/target.c
+  link_with : [libcrocus, libcrocuswinsys],
+)
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index 3b3bb07f1de..e64d7399ae1 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -129,6 +129,12 @@ if with_gallium_tegra
 else
   driver_tegra = declare_dependency()
 endif
+if with_gallium_crocus
+  subdir('winsys/crocus/drm')  # defines libcrocuswinsys, which drivers/crocus links against
+  subdir('drivers/crocus')
+else
+  driver_crocus = declare_dependency()  # empty stand-in so targets can list driver_crocus unconditionally
+endif
 if with_gallium_iris
   subdir('winsys/iris/drm')
   subdir('drivers/iris')
diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build
index daef41613db..cc6c805641b 100644
--- a/src/gallium/targets/d3dadapter9/meson.build
+++ b/src/gallium/targets/d3dadapter9/meson.build
@@ -64,7 +64,7 @@ libgallium_nine = shared_library(
     dep_selinux, dep_libdrm, dep_llvm, dep_thread,
     idep_xmlconfig, idep_mesautil, idep_nir,
     driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
-    driver_i915, driver_svga, driver_iris
+    driver_i915, driver_svga, driver_iris, driver_crocus
   ],
   name_prefix : '',
   version : '.'.join(nine_version),
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index 90b48bf508e..e4cc199b363 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -58,7 +58,7 @@ libgallium_dri = shared_library(
     driver_kmsro, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
     driver_tegra, driver_i915, driver_svga, driver_virgl,
     driver_swr, driver_panfrost, driver_iris, driver_lima, driver_zink, driver_d3d12,
-    driver_asahi
+    driver_asahi, driver_crocus
   ],
   # Will be deleted during installation, see install_megadrivers.py
   install : true,
@@ -98,6 +98,7 @@ foreach d : [[with_gallium_kmsro, [
              [with_gallium_panfrost, 'panfrost_dri.so'],
              [with_gallium_etnaviv, 'etnaviv_dri.so'],
              [with_gallium_tegra, 'tegra_dri.so'],
+             [with_gallium_crocus, 'crocus_dri.so'],
              [with_gallium_iris, 'iris_dri.so'],
              [with_gallium_i915, 'i915_dri.so'],
              [with_gallium_r300, 'r300_dri.so'],
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index 9df8da61803..3c7c2325f17 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -42,6 +42,10 @@ DEFINE_LOADER_DRM_ENTRYPOINT(i915)
 DEFINE_LOADER_DRM_ENTRYPOINT(iris)
 #endif
 
+#if defined(GALLIUM_CROCUS)
+DEFINE_LOADER_DRM_ENTRYPOINT(crocus)
+#endif
+
 #if defined(GALLIUM_NOUVEAU)
 DEFINE_LOADER_DRM_ENTRYPOINT(nouveau)
 #endif
diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_public.h b/src/gallium/winsys/crocus/drm/crocus_drm_public.h
new file mode 100644
index 00000000000..614543136be
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/crocus_drm_public.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef CROCUS_DRM_PUBLIC_H
+#define CROCUS_DRM_PUBLIC_H
+
+struct pipe_screen;
+struct pipe_screen_config;
+/* Create a crocus pipe_screen for the given DRM fd; defined in crocus_drm_winsys.c. */
+struct pipe_screen *
+crocus_drm_screen_create(int drm_fd, const struct pipe_screen_config *config);
+
+#endif /* CROCUS_DRM_PUBLIC_H */
diff --git a/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c
new file mode 100644
index 00000000000..ffeeba567ac
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/crocus_drm_winsys.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "util/os_file.h"
+
+#include "crocus_drm_public.h"
+#include "crocus/crocus_screen.h"
+
+/* Create the screen on the fd returned by os_dupfd_cloexec(fd); NULL when that call fails. */
+struct pipe_screen *
+crocus_drm_screen_create(int fd, const struct pipe_screen_config *config)
+{
+   const int dup_fd = os_dupfd_cloexec(fd);
+
+   return dup_fd >= 0 ? crocus_screen_create(dup_fd, config) : NULL;
+}
diff --git a/src/gallium/winsys/crocus/drm/meson.build b/src/gallium/winsys/crocus/drm/meson.build
new file mode 100644
index 00000000000..4e82fe52437
--- /dev/null
+++ b/src/gallium/winsys/crocus/drm/meson.build
@@ -0,0 +1,29 @@
+# Copyright © 2017 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+libcrocuswinsys = static_library(
+  'crocuswinsys',
+  files('crocus_drm_winsys.c'),  # provides crocus_drm_screen_create()
+  include_directories : [
+    inc_src, inc_include,
+    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
+  ],
+  gnu_symbol_visibility : 'hidden',
+)
diff --git a/src/intel/common/intel_batch_decoder.c b/src/intel/common/intel_batch_decoder.c
index 12ed17f6fef..ed5b3242215 100644
--- a/src/intel/common/intel_batch_decoder.c
+++ b/src/intel/common/intel_batch_decoder.c
@@ -829,7 +829,7 @@ decode_dynamic_state_pointers(struct intel_batch_decode_ctx *ctx,
    struct intel_field_iterator iter;
    intel_field_iterator_init(&iter, inst, p, 0, false);
    while (intel_field_iterator_next(&iter)) {
-      if (str_ends_with(iter.name, "Pointer")) {
+      if (str_ends_with(iter.name, "Pointer") || !strncmp(iter.name, "Pointer", 7)) {
          state_offset = iter.raw_value;
          break;
       }
@@ -901,6 +901,13 @@ decode_3dstate_cc_state_pointers(struct intel_batch_decode_ctx *ctx,
 }
 
 static void
+decode_3dstate_ds_state_pointers(struct intel_batch_decode_ctx *ctx,
+                                 const uint32_t *p)
+{ /* Decoder for 3DSTATE_DEPTH_STENCIL_STATE_POINTERS: hands off to the shared dynamic-state helper. */
+   decode_dynamic_state_pointers(ctx, "DEPTH_STENCIL_STATE", p, 1); /* final argument 1: presumably the entry count — TODO confirm */
+}
+
+static void
 decode_3dstate_scissor_state_pointers(struct intel_batch_decode_ctx *ctx,
                                       const uint32_t *p)
 {
@@ -1208,6 +1215,7 @@ struct custom_decoder {
    { "3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP", decode_3dstate_viewport_state_pointers_sf_clip },
    { "3DSTATE_BLEND_STATE_POINTERS", decode_3dstate_blend_state_pointers },
    { "3DSTATE_CC_STATE_POINTERS", decode_3dstate_cc_state_pointers },
+   { "3DSTATE_DEPTH_STENCIL_STATE_POINTERS", decode_3dstate_ds_state_pointers },
    { "3DSTATE_SCISSOR_STATE_POINTERS", decode_3dstate_scissor_state_pointers },
    { "3DSTATE_SLICE_TABLE_STATE_POINTERS", decode_3dstate_slice_table_state_pointers },
    { "MI_LOAD_REGISTER_IMM", decode_load_register_imm },
diff --git a/src/loader/pci_id_driver_map.h b/src/loader/pci_id_driver_map.h
index 4129389d889..1cece69feaf 100644
--- a/src/loader/pci_id_driver_map.h
+++ b/src/loader/pci_id_driver_map.h
@@ -76,6 +76,7 @@ static const struct {
    { 0x8086, "i915", i915_chip_ids, ARRAY_SIZE(i915_chip_ids) },
    { 0x8086, "i965", i965_chip_ids, ARRAY_SIZE(i965_chip_ids) },
    { 0x8086, "iris", NULL, -1, is_kernel_i915 },
+   { 0x8086, "crocus", NULL, -1, is_kernel_i915 },
    { 0x1002, "radeon", r100_chip_ids, ARRAY_SIZE(r100_chip_ids) },
    { 0x1002, "r200", r200_chip_ids, ARRAY_SIZE(r200_chip_ids) },
    { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) },
author	Dave Airlie <airlied@gmail.com>	2021-06-01 13:14:51 +1000
committer	Dave Airlie <airlied@gmail.com>	2021-06-14 06:34:05 +1000
commit	f3630548f1da904ec6c63b43ece7e68afdb8867e (patch)
tree	05cfc909591aba9d8bf4bdeb9ba32ce8db2c58f4 /src
parent	8da92b5c0a358e30be557cae3303a4027b24db1c (diff)